In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, make_scorer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier 
from sklearn.neighbors import KNeighborsClassifier
# Seed passed as `random_state` to the splitter and the seeded models below.
RAND = 1234

In [2]:
# Directory holding the binarised CSV files, two levels above the working directory.
data_cat = Path.cwd().parent.parent / 'datasets'

# Always define `datasets` (empty when the directory is missing) and sort it:
# glob order is filesystem-dependent, while later cells pick files by position.
datasets = sorted(data_cat.glob('*_bin.csv')) if data_cat.is_dir() else []

if datasets:
    print(*(path.name for path in datasets), sep='\n')
else:
    print(f"No '*_bin.csv' files found in {data_cat}")

bank_bin.csv
bike_bin.csv
car_bin.csv


In [3]:
# Display names of the compared classifiers, indexed by the model cells below.
Models = [
    "DecisionTreeClassifier",
    "RandomForestClassifier",
    "XGBClassifier",
    "KNeighborsClassifier",
]
# Accumulates one result row per (model, dataset) run.
data = list()

# Dataset 1 Bank

In [7]:
# Load the first discovered dataset and cast every column to int.
bank_raw = pd.read_csv(datasets[0], index_col=0).astype(int)

# Balance the classes: keep the first 2500 rows of class 0, then the first 2500 of class 1.
neg_idx = bank_raw.index[(bank_raw['y'] == 0).to_numpy()][:2500]
pos_idx = bank_raw.index[(bank_raw['y'] == 1).to_numpy()][:2500]
better_idx = neg_idx.append(pos_idx)

df = bank_raw.loc[better_idx].reset_index(drop=True)
df

Unnamed: 0,big_age,high_educ,has_job,default,housing,loan,cell_cont,before_con,influence_ccon,long_dur,poutcome,y
0,0,1,1,0,1,1,1,1,1,1,0,0
1,0,1,1,0,0,1,1,0,1,0,0,0
2,1,1,0,0,1,1,0,0,1,1,0,0
3,0,1,1,0,1,1,0,0,1,1,0,0
4,0,0,1,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1,0,0,0,0,1,0,0,0,0,0,1
4996,1,0,0,0,1,1,0,0,0,1,0,1
4997,0,0,1,0,0,1,1,0,0,1,0,1
4998,0,1,1,0,1,0,1,0,1,1,0,1


In [9]:
# Split the frame into features and the "y" label column; tag results as dataset 1.
target = "y"
y = df[target]
X = df.drop(columns=target)
data_NO = 1

In [10]:
def train(X,y,model0,params,spliter, scores=None):
    """Grid-search ``model0`` over ``params`` and report the best candidate's CV scores.

    Parameters
    ----------
    X, y :
        Features and labels, passed unchanged to ``GridSearchCV.fit``.
    model0 :
        Estimator to tune.
    params :
        Parameter grid handed to ``GridSearchCV``.
    spliter :
        Cross-validation splitter, passed as ``cv``.
    scores : None, str, sequence of str, or dict of name -> scorer
        Metrics to evaluate. ``None`` means ``["accuracy", "f1"]``. A single
        string is treated as a one-element list.

    Returns
    -------
    list
        ``[best_params, score_1, score_2, ...]`` where each score is the mean
        test score of the selected candidate, in the iteration order of ``scores``.

    Raises
    ------
    ValueError
        If ``scores`` is empty.
    """
    if scores is None:
        scores = ["accuracy", "f1"]
    elif isinstance(scores, str):
        # Iterating a bare string would yield characters, not metric names.
        scores = [scores]

    metric_names = list(scores)
    if not metric_names:
        raise ValueError("scores must contain at least one metric")

    # Select the best candidate by "f1" when available (the original behaviour);
    # otherwise fall back to the first metric instead of failing on a missing scorer.
    refit_metric = "f1" if "f1" in metric_names else metric_names[0]

    clf = GridSearchCV(model0, params, scoring=scores, refit=refit_metric, cv=spliter)
    clf.fit(X, y)

    ind = clf.best_index_
    res = [clf.best_params_]
    res.extend(clf.cv_results_["mean_test_" + name][ind] for name in metric_names)
    return res

# Shared 5-fold shuffled splitter, seeded with RAND and reused by every grid search.
spliter = KFold(
    n_splits=5,
    shuffle=True,
    random_state=RAND,
)

In [12]:
# Grid-search a decision tree on the current X/y and record the row returned by train().
model1 = DecisionTreeClassifier(random_state=RAND)
params = {
    "max_depth": [10, 15, 20],
    "min_samples_split": [2, 4, 6],
}
res = [Models[0], data_NO, *train(X, y, model1, params, spliter)]
data.append(res)
data[-1]

['DecisionTreeClassifier',
 1,
 {'max_depth': 15, 'min_samples_split': 4},
 0.7145999999999999,
 0.7308827353532278]

In [13]:
# Grid-search a random forest on the current X/y and record the row returned by train().
model2 = RandomForestClassifier(random_state=RAND)
params = {
    "n_estimators": [50, 75, 100, 125, 150],
    "max_depth": [2, 5, 7, 10],
    "min_samples_split": [2, 5, 7, 10],
}
res = [Models[1], data_NO, *train(X, y, model2, params, spliter)]
data.append(res)
data[-1]

['RandomForestClassifier',
 1,
 {'max_depth': 2, 'min_samples_split': 2, 'n_estimators': 75},
 0.7303999999999999,
 0.7479312717385442]

In [14]:
# Grid-search XGBoost on the current X/y and record the row returned by train().
model3 = XGBClassifier(random_state=RAND)
params = {
    "n_estimators": [3, 5, 7, 9, 11],
    "max_depth": [2, 3, 4, 5],
    "learning_rate": [0.5, 0.7, 0.9, 1.1],
}
res = [Models[2], data_NO, *train(X, y, model3, params, spliter)]
data.append(res)
data[-1]

['XGBClassifier',
 1,
 {'learning_rate': 1.1, 'max_depth': 3, 'n_estimators': 3},
 0.7304,
 0.7505538501008477]

In [15]:
# Grid-search k-nearest neighbours on the current X/y and record the row returned by train().
model4 = KNeighborsClassifier()
params = {"n_neighbors": [3, 5, 7, 10]}
res = [Models[3], data_NO, *train(X, y, model4, params, spliter)]
data.append(res)
data[-1]

['KNeighborsClassifier',
 1,
 {'n_neighbors': 7},
 0.6207999999999999,
 0.4348901341278134]

# Dataset 2 Bike buyers

In [17]:
# Load the second discovered dataset, using its first column as the index.
bike_path = datasets[1]
df = pd.read_csv(bike_path, index_col=0)
df

Unnamed: 0,Income_10000-60000,Income_60000-170000,Cars_0,Cars_1-4,Commute Distance_0-1 Miles,Commute Distance_1-2 Miles,Commute Distance_10+ Miles,Commute Distance_2-5 Miles,Commute Distance_5-10 Miles,Gender_Male,Purchased
0,1,0,1,0,1,0,0,0,0,0,0
1,1,0,0,1,1,0,0,0,0,1,0
2,0,1,0,1,0,0,0,1,0,1,0
3,0,1,0,1,0,0,0,0,1,1,1
4,1,0,1,0,1,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
995,0,1,0,1,0,0,0,1,0,1,1
996,0,1,1,0,0,0,0,1,0,1,1
997,0,1,1,0,1,0,0,0,0,1,1
998,0,1,0,1,0,1,0,0,0,1,0


In [18]:
# Split the frame into features and the "Purchased" label column; tag results as dataset 2.
target = "Purchased"
y = df[target]
X = df.drop(columns=target)
data_NO = 2

In [19]:
# Grid-search a decision tree on the current X/y with explicit scorer objects,
# then record the row returned by train().
model1 = DecisionTreeClassifier(random_state=RAND)
params = {
    "max_depth": [10, 15, 20],
    "min_samples_split": [2, 4, 6],
}
scorers = {
    "acc": make_scorer(accuracy_score),
    "f1": make_scorer(f1_score),
}
res = [Models[0], data_NO, *train(X, y, model1, params, spliter, scores=scorers)]
data.append(res)
data[-1]

['DecisionTreeClassifier',
 2,
 {'max_depth': 10, 'min_samples_split': 2},
 0.574,
 0.5124674151658835]

In [20]:
# Grid-search a random forest on the current X/y with explicit scorer objects,
# then record the row returned by train().
model2 = RandomForestClassifier(random_state=RAND)
params = {
    "n_estimators": [50, 75, 100, 125, 150],
    "max_depth": [2, 5, 7, 10],
    "min_samples_split": [2, 5, 7, 10],
}
scorers = {
    "acc": make_scorer(accuracy_score),
    "f1": make_scorer(f1_score),
}
res = [Models[1], data_NO, *train(X, y, model2, params, spliter, scores=scorers)]
data.append(res)
data[-1]

['RandomForestClassifier',
 2,
 {'max_depth': 5, 'min_samples_split': 7, 'n_estimators': 150},
 0.595,
 0.5525349609370924]

In [21]:
# Grid-search XGBoost on the current X/y with explicit scorer objects,
# then record the row returned by train().
model3 = XGBClassifier(random_state=RAND)
params = {
    "n_estimators": [3, 5, 7, 9, 11],
    "max_depth": [2, 3, 4, 5],
    "learning_rate": [0.5, 0.7, 0.9, 1.1],
}
scorers = {
    "acc": make_scorer(accuracy_score),
    "f1": make_scorer(f1_score),
}
res = [Models[2], data_NO, *train(X, y, model3, params, spliter, scores=scorers)]
data.append(res)
data[-1]

['XGBClassifier',
 2,
 {'learning_rate': 0.7, 'max_depth': 3, 'n_estimators': 7},
 0.5970000000000001,
 0.5541893494637283]

In [22]:
# Grid-search k-nearest neighbours on the current X/y with explicit scorer objects,
# then record the row returned by train().
model4 = KNeighborsClassifier()
params = {"n_neighbors": [3, 5, 7, 10]}
scorers = {
    "acc": make_scorer(accuracy_score),
    "f1": make_scorer(f1_score),
}
res = [Models[3], data_NO, *train(X, y, model4, params, spliter, scores=scorers)]
data.append(res)
data[-1]

['KNeighborsClassifier',
 2,
 {'n_neighbors': 3},
 0.49700000000000005,
 0.49932882270459206]

# Dataset 3 Car acceptability

In [23]:
# Load the third discovered dataset, using its first column as the index.
car_path = datasets[2]
df = pd.read_csv(car_path, index_col=0)
df

Unnamed: 0,Buying_Price_high,Buying_Price_low,Buying_Price_med,Maintenance_Price_high,Maintenance_Price_low,Maintenance_Price_med,Person_Capacity_2,Person_Capacity_4,Person_Capacity_more,Safety_high,Safety_low,Safety_med,Car_Acceptability
0,1,0,0,1,0,0,1,0,0,0,1,0,0
1,1,0,0,1,0,0,1,0,0,0,0,1,0
2,1,0,0,1,0,0,1,0,0,1,0,0,0
3,1,0,0,1,0,0,1,0,0,0,1,0,0
4,1,0,0,1,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
967,0,1,0,0,1,0,0,0,1,0,0,1,1
968,0,1,0,0,1,0,0,0,1,1,0,0,1
969,0,1,0,0,1,0,0,0,1,0,1,0,0
970,0,1,0,0,1,0,0,0,1,0,0,1,1


In [24]:
# Split the frame into features and the "Car_Acceptability" label column; tag results as dataset 3.
target = "Car_Acceptability"
y = df[target]
X = df.drop(columns=target)
data_NO = 3

In [25]:
# Grid-search a decision tree on the current X/y with explicit scorer objects,
# then record the row returned by train().
model1 = DecisionTreeClassifier(random_state=RAND)
params = {
    "max_depth": [10, 15, 20],
    "min_samples_split": [2, 4, 6],
}
scorers = {
    "acc": make_scorer(accuracy_score),
    "f1": make_scorer(f1_score),
}
res = [Models[0], data_NO, *train(X, y, model1, params, spliter, scores=scorers)]
data.append(res)
data[-1]

['DecisionTreeClassifier',
 3,
 {'max_depth': 10, 'min_samples_split': 2},
 0.9177055247158341,
 0.8953814734056674]

In [26]:
# Grid-search a random forest on the current X/y with explicit scorer objects,
# then record the row returned by train().
model2 = RandomForestClassifier(random_state=RAND)
params = {
    "n_estimators": [50, 75, 100, 125, 150],
    "max_depth": [2, 5, 7, 10],
    "min_samples_split": [2, 5, 7, 10],
}
scorers = {
    "acc": make_scorer(accuracy_score),
    "f1": make_scorer(f1_score),
}
res = [Models[1], data_NO, *train(X, y, model2, params, spliter, scores=scorers)]
data.append(res)
data[-1]

['RandomForestClassifier',
 3,
 {'max_depth': 2, 'min_samples_split': 2, 'n_estimators': 50},
 0.9403172085646313,
 0.9277842462172312]

In [27]:
# Grid-search XGBoost on the current X/y with explicit scorer objects,
# then record the row returned by train().
model3 = XGBClassifier(random_state=RAND)
params = {
    "n_estimators": [3, 5, 7, 9, 11],
    "max_depth": [2, 3, 4, 5],
    "learning_rate": [0.5, 0.7, 0.9, 1.1],
}
scorers = {
    "acc": make_scorer(accuracy_score),
    "f1": make_scorer(f1_score),
}
res = [Models[2], data_NO, *train(X, y, model3, params, spliter, scores=scorers)]
data.append(res)
data[-1]

['XGBClassifier',
 3,
 {'learning_rate': 0.9, 'max_depth': 3, 'n_estimators': 3},
 0.9423632038065028,
 0.9289157644314987]

In [28]:
# Grid-search k-nearest neighbours on the current X/y with explicit scorer objects,
# then record the row returned by train().
model4 = KNeighborsClassifier()
params = {"n_neighbors": [3, 5, 7, 10]}
scorers = {
    "acc": make_scorer(accuracy_score),
    "f1": make_scorer(f1_score),
}
res = [Models[3], data_NO, *train(X, y, model4, params, spliter, scores=scorers)]
data.append(res)
data[-1]

['KNeighborsClassifier',
 3,
 {'n_neighbors': 3},
 0.9259265133491938,
 0.9007443070654626]

In [29]:
# Assemble every collected result row into one summary table.
report_columns = ["model", "dataset", "params", "accuracy", "f1_score"]
report3 = pd.DataFrame(data=data, columns=report_columns)
report3

Unnamed: 0,model,dataset,params,accuracy,f1_score
0,DecisionTreeClassifier,1,"{'max_depth': 15, 'min_samples_split': 4}",0.7146,0.730883
1,RandomForestClassifier,1,"{'max_depth': 2, 'min_samples_split': 2, 'n_es...",0.7304,0.747931
2,XGBClassifier,1,"{'learning_rate': 1.1, 'max_depth': 3, 'n_esti...",0.7304,0.750554
3,KNeighborsClassifier,1,{'n_neighbors': 7},0.6208,0.43489
4,DecisionTreeClassifier,2,"{'max_depth': 10, 'min_samples_split': 2}",0.574,0.512467
5,RandomForestClassifier,2,"{'max_depth': 5, 'min_samples_split': 7, 'n_es...",0.595,0.552535
6,XGBClassifier,2,"{'learning_rate': 0.7, 'max_depth': 3, 'n_esti...",0.597,0.554189
7,KNeighborsClassifier,2,{'n_neighbors': 3},0.497,0.499329
8,DecisionTreeClassifier,3,"{'max_depth': 10, 'min_samples_split': 2}",0.917706,0.895381
9,RandomForestClassifier,3,"{'max_depth': 2, 'min_samples_split': 2, 'n_es...",0.940317,0.927784


In [30]:
# Persist the summary table next to the notebook; the index is written as the first column.
report3.to_csv(path_or_buf="report.csv", index=True)

In [None]:
# (intentionally empty: a stray undefined name here raised NameError when the cell was run)