In [29]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, make_scorer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier 
from sklearn.neighbors import KNeighborsClassifier
# Single seed reused for the seeded estimators and for the CV splitter.
RAND = 1234

In [2]:
# Directory holding the pre-binarised CSVs: two levels above the working directory.
data_cat = Path.cwd().parent.parent / 'datasets'
if data_cat.is_dir():
    # Sorted so that datasets[0], datasets[1], ... are stable between runs;
    # the order produced by glob depends on the file system.
    datasets = sorted(data_cat.glob('*_bin.csv'))
else:
    # Always define `datasets` so later cells do not hit a NameError,
    # and say why the list is empty.
    datasets = []
    print(f"Dataset directory not found: {data_cat}")
print(*(path.name for path in datasets), sep='\n')

bike_bin.csv
cancer_bin.csv
car_bin.csv


In [103]:
# Display names of the compared classifiers; the training cells pick one by index.
Models = [
    "DecisionTreeClassifier",
    "RandomForestClassifier",
    "XGBClassifier",
    "KNeighborsClassifier",
]
# Result rows, one per (model, dataset) run; the training cells append to it.
data = list()

# Dataset 1

In [104]:
# Load dataset 1 by file name rather than by its position in the glob result:
# the cells below hard-code this dataset's target column, so the file must not
# depend on directory listing order or on extra '*_bin.csv' files appearing.
df = pd.read_csv(data_cat / "bike_bin.csv", index_col=0)
df

Unnamed: 0,Income,Age,Gender_Female,Gender_Male,Children_0,Children_1,Children_2,Children_3,Children_4,Children_5,...,Commute Distance_1-2 Miles,Commute Distance_10+ Miles,Commute Distance_2-5 Miles,Commute Distance_5-10 Miles,Region_Europe,Region_North America,Region_Pacific,Marital Status_Single,Home Owner_Yes,Purchased Bike_Yes
0,40000,42,1,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,30000,43,0,1,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
2,80000,60,0,1,0,0,0,0,0,1,...,0,0,1,0,1,0,0,0,0,0
3,70000,41,0,1,1,0,0,0,0,0,...,0,0,0,1,0,0,1,1,1,1
4,30000,36,0,1,1,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,60000,54,0,1,0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,1,1
996,70000,35,0,1,0,0,0,0,1,0,...,0,0,1,0,0,1,0,1,1,1
997,60000,38,0,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,1
998,100000,38,0,1,0,0,0,1,0,0,...,1,0,0,0,0,1,0,1,0,0


In [105]:
# Target is the bike-purchase flag; every remaining column is a feature.
y = df["Purchased Bike_Yes"]
X = df.drop(columns="Purchased Bike_Yes")
data_NO = 1  # dataset number stored in each result row

In [120]:
def train(X, y, model0, params, spliter, scores=None, refit="f1"):
    """Grid-search ``model0`` over ``params`` and summarise the best candidate.

    Parameters
    ----------
    X, y
        Features and target, passed unchanged to ``GridSearchCV.fit``.
    model0
        Estimator to tune.
    params
        Parameter grid forwarded to ``GridSearchCV``.
    spliter
        Cross-validation splitter forwarded as ``cv``.
    scores
        Metric names (list/tuple), a single metric name, or a dict mapping a
        short name to a scorer. Defaults to ``["accuracy", "f1"]``.
    refit
        Name of the metric used to pick the best candidate; must be one of the
        names in ``scores``. Defaults to ``"f1"``, the previously hard-coded value.

    Returns
    -------
    list
        ``[best_params, score_1, score_2, ...]`` where each score is the mean
        cross-validated test score of the best candidate, one per metric, in
        the iteration order of ``scores``.

    Raises
    ------
    ValueError
        If ``refit`` is not among the metric names in ``scores``.
    """
    # Identity test: `== None` would go through a custom or element-wise
    # __eq__ (e.g. for array-like metric collections).
    if scores is None:
        scores = ["accuracy", "f1"]
    elif isinstance(scores, str):
        # A bare string would otherwise be iterated character by character
        # when the per-metric results are collected below.
        scores = [scores]

    if refit not in scores:
        raise ValueError(
            f"refit={refit!r} is not one of the requested metrics: {list(scores)!r}"
        )

    clf = GridSearchCV(model0, params, scoring=scores, refit=refit, cv=spliter)
    clf.fit(X, y)

    best = clf.best_index_
    # With several metrics, cv_results_ stores one "mean_test_<name>" array each.
    return [clf.best_params_] + [
        clf.cv_results_["mean_test_" + name][best] for name in scores
    ]

# Shuffled 5-fold splitter, seeded with RAND and passed to every grid search below.
spliter = KFold(
    n_splits=5,
    shuffle=True,
    random_state=RAND,
)

In [119]:
# NOTE(review): leftover scratch cell, not part of the experiment — presumably a
# quick check of dict key iteration; consider deleting it from the final notebook.
dict(a=1).keys()

dict_keys(['a'])

In [107]:
# Decision tree: grid over tree depth and minimum samples per split.
model1 = DecisionTreeClassifier(random_state=RAND)
params = {
    "max_depth": [10, 15, 20],
    "min_samples_split": [2, 4, 6],
}
# Row layout: [model name, dataset number, best params, accuracy, f1].
res = [Models[0], data_NO, *train(X, y, model1, params, spliter)]
data.append(res)
res

['DecisionTreeClassifier',
 1,
 {'max_depth': 20, 'min_samples_split': 2},
 0.674,
 0.6651908778402652]

In [108]:
# Random forest: grid over forest size, tree depth and minimum samples per split.
model2 = RandomForestClassifier(random_state=RAND)
params = {
    "n_estimators": [50, 75, 100, 125, 150],
    "max_depth": [2, 5, 7, 10],
    "min_samples_split": [2, 5, 7, 10],
}
# Row layout: [model name, dataset number, best params, accuracy, f1].
res = [Models[1], data_NO, *train(X, y, model2, params, spliter)]
data.append(res)
res

['RandomForestClassifier',
 1,
 {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 150},
 0.716,
 0.7068055926534533]

In [109]:
# XGBoost: grid over number of boosting rounds, tree depth and learning rate.
model3 = XGBClassifier(random_state=RAND)
params = {
    "n_estimators": [3, 5, 7, 9, 11],
    "max_depth": [2, 3, 4, 5],
    # NOTE(review): 1.1 is above XGBoost's documented learning-rate range [0, 1]
    # — confirm this grid value is intended.
    "learning_rate": [0.5, 0.7, 0.9, 1.1],
}
# Row layout: [model name, dataset number, best params, accuracy, f1].
res = [Models[2], data_NO, *train(X, y, model3, params, spliter)]
data.append(res)
res

['XGBClassifier',
 1,
 {'learning_rate': 1.1, 'max_depth': 5, 'n_estimators': 11},
 0.693,
 0.6833924076638707]

In [110]:
# k-nearest neighbours: only the neighbourhood size is tuned.
model4 = KNeighborsClassifier()
params = {"n_neighbors": [3, 5, 7, 10]}
# Row layout: [model name, dataset number, best params, accuracy, f1].
res = [Models[3], data_NO, *train(X, y, model4, params, spliter)]
data.append(res)
res

['KNeighborsClassifier',
 1,
 {'n_neighbors': 3},
 0.6770000000000002,
 0.6660383348871118]

# Dataset 2

In [111]:
# Load dataset 2 by file name rather than by its position in the glob result:
# the cells below hard-code this dataset's target column ("Level"), so the file
# must not depend on directory listing order.
df = pd.read_csv(data_cat / "cancer_bin.csv", index_col="index")
df

Unnamed: 0_level_0,Gender_1,Gender_2,Air Pollution_1,Air Pollution_2,Air Pollution_3,Air Pollution_4,Air Pollution_5,Air Pollution_6,Air Pollution_7,Air Pollution_8,...,Dry Cough_7,Snoring_1,Snoring_2,Snoring_3,Snoring_4,Snoring_5,Snoring_6,Snoring_7,Age,Level
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,33,0
1,1,0,0,0,1,0,0,0,0,0,...,1,0,1,0,0,0,0,0,17,1
2,1,0,0,0,0,1,0,0,0,0,...,1,0,1,0,0,0,0,0,35,2
3,1,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,1,0,0,37,2
4,1,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,46,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,44,2
996,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,37,2
997,0,1,0,0,0,1,0,0,0,0,...,1,0,1,0,0,0,0,0,25,2
998,0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,18,2


In [112]:
# Target is the "Level" column; every remaining column is a feature.
y = df["Level"]
X = df.drop(columns="Level")
data_NO = 2  # dataset number stored in each result row

In [123]:
# Decision tree: grid over tree depth and minimum samples per split.
model1 = DecisionTreeClassifier(random_state=RAND)
params = {
    "max_depth": [10, 15, 20],
    "min_samples_split": [2, 4, 6],
}

# Explicit scorers: F1 is macro-averaged here (the target preview shows more
# than two class labels), and the best candidate is still chosen on "f1".
res = [
    Models[0],
    data_NO,
    *train(
        X, y, model1, params, spliter,
        scores={
            "acc": make_scorer(accuracy_score),
            "f1": make_scorer(f1_score, average="macro"),
        },
    ),
]
data.append(res)
res

['DecisionTreeClassifier',
 2,
 {'max_depth': 10, 'min_samples_split': 2},
 1.0,
 1.0]

In [124]:
# Random forest: grid over forest size, tree depth and minimum samples per split.
model2 = RandomForestClassifier(random_state=RAND)
params = {
    "n_estimators": [50, 75, 100, 125, 150],
    "max_depth": [2, 5, 7, 10],
    "min_samples_split": [2, 5, 7, 10],
}
# Explicit scorers: accuracy plus macro-averaged F1.
res = [
    Models[1],
    data_NO,
    *train(
        X, y, model2, params, spliter,
        scores={
            "acc": make_scorer(accuracy_score),
            "f1": make_scorer(f1_score, average="macro"),
        },
    ),
]
data.append(res)
res

['RandomForestClassifier',
 2,
 {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 150},
 1.0,
 1.0]

In [125]:
# XGBoost: grid over number of boosting rounds, tree depth and learning rate.
model3 = XGBClassifier(random_state=RAND)
params = {
    "n_estimators": [3, 5, 7, 9, 11],
    "max_depth": [2, 3, 4, 5],
    # NOTE(review): 1.1 is above XGBoost's documented learning-rate range [0, 1]
    # — confirm this grid value is intended.
    "learning_rate": [0.5, 0.7, 0.9, 1.1],
}
# Explicit scorers: accuracy plus macro-averaged F1.
res = [
    Models[2],
    data_NO,
    *train(
        X, y, model3, params, spliter,
        scores={
            "acc": make_scorer(accuracy_score),
            "f1": make_scorer(f1_score, average="macro"),
        },
    ),
]
data.append(res)
res

['XGBClassifier',
 2,
 {'learning_rate': 0.5, 'max_depth': 2, 'n_estimators': 11},
 1.0,
 1.0]

In [126]:
# k-nearest neighbours: only the neighbourhood size is tuned.
model4 = KNeighborsClassifier()
params = {"n_neighbors": [3, 5, 7, 10]}
# Explicit scorers: accuracy plus macro-averaged F1.
res = [
    Models[3],
    data_NO,
    *train(
        X, y, model4, params, spliter,
        scores={
            "acc": make_scorer(accuracy_score),
            "f1": make_scorer(f1_score, average="macro"),
        },
    ),
]
data.append(res)
res

['KNeighborsClassifier',
 2,
 {'n_neighbors': 5},
 0.9949999999999999,
 0.9947282515657812]

# Dataset 3

In [139]:
# Load dataset 3 by file name rather than by its position in the glob result:
# the cells below hard-code this dataset's target column ("Car_Acceptability"),
# so the file must not depend on directory listing order.
df = pd.read_csv(data_cat / "car_bin.csv", index_col=0)
df

Unnamed: 0,Buying_Price_high,Buying_Price_low,Buying_Price_med,Buying_Price_vhigh,Maintenance_Price_high,Maintenance_Price_low,Maintenance_Price_med,Maintenance_Price_vhigh,No_of_Doors_2,No_of_Doors_3,...,Person_Capacity_2,Person_Capacity_4,Person_Capacity_more,Size_of_Luggage_big,Size_of_Luggage_med,Size_of_Luggage_small,Safety_high,Safety_low,Safety_med,Car_Acceptability
0,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,0,1,0,1,0,0
1,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,0,1,0,0,1,0
2,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,0,1,1,0,0,0
3,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,1,0,0,1,0,0
4,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1723,0,1,0,0,0,1,0,0,0,0,...,0,0,1,0,1,0,0,0,1,2
1724,0,1,0,0,0,1,0,0,0,0,...,0,0,1,0,1,0,1,0,0,3
1725,0,1,0,0,0,1,0,0,0,0,...,0,0,1,1,0,0,0,1,0,0
1726,0,1,0,0,0,1,0,0,0,0,...,0,0,1,1,0,0,0,0,1,2


In [140]:
# Target is the "Car_Acceptability" column; every remaining column is a feature.
y = df["Car_Acceptability"]
X = df.drop(columns="Car_Acceptability")
data_NO = 3  # dataset number stored in each result row

In [141]:
# Decision tree: grid over tree depth and minimum samples per split.
model1 = DecisionTreeClassifier(random_state=RAND)
params = {
    "max_depth": [10, 15, 20],
    "min_samples_split": [2, 4, 6],
}

# Explicit scorers: F1 is macro-averaged here (the target preview shows more
# than two class labels), and the best candidate is still chosen on "f1".
res = [
    Models[0],
    data_NO,
    *train(
        X, y, model1, params, spliter,
        scores={
            "acc": make_scorer(accuracy_score),
            "f1": make_scorer(f1_score, average="macro"),
        },
    ),
]
data.append(res)
res

['DecisionTreeClassifier',
 3,
 {'max_depth': 15, 'min_samples_split': 2},
 0.9641182876769708,
 0.9299887737126207]

In [142]:
# Random forest: grid over forest size, tree depth and minimum samples per split.
model2 = RandomForestClassifier(random_state=RAND)
params = {
    "n_estimators": [50, 75, 100, 125, 150],
    "max_depth": [2, 5, 7, 10],
    "min_samples_split": [2, 5, 7, 10],
}
# Explicit scorers: accuracy plus macro-averaged F1.
res = [
    Models[1],
    data_NO,
    *train(
        X, y, model2, params, spliter,
        scores={
            "acc": make_scorer(accuracy_score),
            "f1": make_scorer(f1_score, average="macro"),
        },
    ),
]
data.append(res)
res

['RandomForestClassifier',
 3,
 {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100},
 0.9513814191170311,
 0.8649133577499739]

In [143]:
# XGBoost: grid over number of boosting rounds, tree depth and learning rate.
model3 = XGBClassifier(random_state=RAND)
params = {
    "n_estimators": [3, 5, 7, 9, 11],
    "max_depth": [2, 3, 4, 5],
    # NOTE(review): 1.1 is above XGBoost's documented learning-rate range [0, 1]
    # — confirm this grid value is intended.
    "learning_rate": [0.5, 0.7, 0.9, 1.1],
}
# Explicit scorers: accuracy plus macro-averaged F1.
res = [
    Models[2],
    data_NO,
    *train(
        X, y, model3, params, spliter,
        scores={
            "acc": make_scorer(accuracy_score),
            "f1": make_scorer(f1_score, average="macro"),
        },
    ),
]
data.append(res)
res

['XGBClassifier',
 3,
 {'learning_rate': 0.9, 'max_depth': 5, 'n_estimators': 11},
 0.9855290273938175,
 0.9672695868185501]

In [144]:
# k-nearest neighbours: only the neighbourhood size is tuned.
model4 = KNeighborsClassifier()
params = {"n_neighbors": [3, 5, 7, 10]}
# Explicit scorers: accuracy plus macro-averaged F1.
res = [
    Models[3],
    data_NO,
    *train(
        X, y, model4, params, spliter,
        scores={
            "acc": make_scorer(accuracy_score),
            "f1": make_scorer(f1_score, average="macro"),
        },
    ),
]
data.append(res)
res

['KNeighborsClassifier',
 3,
 {'n_neighbors': 10},
 0.9120382005529027,
 0.7393579317138732]

In [145]:
# Collect every result row appended above into one summary table.
report3 = pd.DataFrame(
    data=data,
    columns=[
        "model",
        "dataset",
        "params",
        "accuracy",
        "f1_score",
    ],
)
report3

Unnamed: 0,model,dataset,params,accuracy,f1_score
0,DecisionTreeClassifier,1,"{'max_depth': 20, 'min_samples_split': 2}",0.674,0.665191
1,RandomForestClassifier,1,"{'max_depth': 10, 'min_samples_split': 2, 'n_e...",0.716,0.706806
2,XGBClassifier,1,"{'learning_rate': 1.1, 'max_depth': 5, 'n_esti...",0.693,0.683392
3,KNeighborsClassifier,1,{'n_neighbors': 3},0.677,0.666038
4,DecisionTreeClassifier,2,"{'max_depth': 10, 'min_samples_split': 2}",1.0,1.0
5,RandomForestClassifier,2,"{'max_depth': 5, 'min_samples_split': 2, 'n_es...",1.0,1.0
6,XGBClassifier,2,"{'learning_rate': 0.5, 'max_depth': 2, 'n_esti...",1.0,1.0
7,KNeighborsClassifier,2,{'n_neighbors': 5},0.995,0.994728
8,DecisionTreeClassifier,3,"{'max_depth': 15, 'min_samples_split': 2}",0.964118,0.929989
9,RandomForestClassifier,3,"{'max_depth': 10, 'min_samples_split': 2, 'n_e...",0.951381,0.864913


In [146]:
# Persist the summary next to the notebook; the row index is written as the
# first column (pandas' default, spelled out here).
report3.to_csv("report.csv", index=True)

In [None]:
# (Intentionally empty: a stray bare name `S` here raised NameError on Restart & Run All.)