In [26]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier 
from sklearn.neighbors import KNeighborsClassifier
RAND=1234

In [4]:
data_cat = Path.cwd().parent.parent / 'datasets'
if Path.exists(data_cat):
    datasets = list(data_cat.glob('*_bin.csv'))
    print(*list(map(lambda x: x.name, datasets)), sep='\n')

bike_bin.csv
cancer_bin.csv
car_bin.csv


# Dataset 1

In [5]:
df = pd.read_csv(datasets[0], index_col=0)
df

Unnamed: 0,Income,Age,Gender_Female,Gender_Male,Children_0,Children_1,Children_2,Children_3,Children_4,Children_5,...,Commute Distance_1-2 Miles,Commute Distance_10+ Miles,Commute Distance_2-5 Miles,Commute Distance_5-10 Miles,Region_Europe,Region_North America,Region_Pacific,Marital Status_Single,Home Owner_Yes,Purchased Bike_Yes
0,40000,42,1,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,30000,43,0,1,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
2,80000,60,0,1,0,0,0,0,0,1,...,0,0,1,0,1,0,0,0,0,0
3,70000,41,0,1,1,0,0,0,0,0,...,0,0,0,1,0,0,1,1,1,1
4,30000,36,0,1,1,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,60000,54,0,1,0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,1,1
996,70000,35,0,1,0,0,0,0,1,0,...,0,0,1,0,0,1,0,1,1,1
997,60000,38,0,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,1
998,100000,38,0,1,0,0,0,1,0,0,...,1,0,0,0,0,1,0,1,0,0


In [6]:
X = df.drop("Purchased Bike_Yes", axis=1)
y = df["Purchased Bike_Yes"]

In [7]:
def train(X,y,model,spliter):
    res_acc = []
    res_f1 = []
    for i, (train_index, test_index) in enumerate(spliter.split(X)):
        X0,y0 = X.loc[train_index], y[train_index]
        model.fit(X0,y0)
        X_t,y_t = X.loc[test_index], y[test_index]
        print(f'fold: {i}')
        y_pred = model.predict(X_t)
        acc, f1 = accuracy_score(y_t, y_pred), f1_score(y_t, y_pred)
        res_acc.append(acc)
        res_f1.append(f1)
        print(f'\t accuracy: {acc}')
        print(f'\t f1_score: {f1}')
    return np.array(res_acc).mean(), np.array(res_f1).mean()

In [10]:
spliter = KFold(n_splits=5, shuffle=True, random_state=RAND)
model1 = DecisionTreeClassifier(max_depth=15,min_samples_split=2, random_state=RAND)
train(X,y,model1, spliter)

fold: 0
	 accuracy: 0.655
	 f1_score: 0.6387434554973821
fold: 1
	 accuracy: 0.7
	 f1_score: 0.6511627906976745
fold: 2
	 accuracy: 0.655
	 f1_score: 0.6187845303867403
fold: 3
	 accuracy: 0.685
	 f1_score: 0.6926829268292682
fold: 4
	 accuracy: 0.67
	 f1_score: 0.6796116504854369


(0.6729999999999999, 0.6561970707793005)

In [9]:
model2 = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=10, random_state=RAND)
train(X,y,model2, spliter)

fold: 0
	 accuracy: 0.715
	 f1_score: 0.6850828729281768
fold: 1
	 accuracy: 0.67
	 f1_score: 0.6292134831460674
fold: 2
	 accuracy: 0.715
	 f1_score: 0.6951871657754011
fold: 3
	 accuracy: 0.685
	 f1_score: 0.6865671641791045
fold: 4
	 accuracy: 0.695
	 f1_score: 0.6903553299492386


(0.696, 0.6772812031955977)

In [33]:
model3 = XGBClassifier(n_estimators=9, max_depth=2, learning_rate=0.9)
train(X,y,model3, spliter)

fold: 0
	 accuracy: 0.675
	 f1_score: 0.6486486486486486
fold: 1
	 accuracy: 0.67
	 f1_score: 0.625
fold: 2
	 accuracy: 0.635
	 f1_score: 0.6331658291457287
fold: 3
	 accuracy: 0.67
	 f1_score: 0.6732673267326732
fold: 4
	 accuracy: 0.67
	 f1_score: 0.6732673267326732


(0.664, 0.6506698262519448)

In [32]:
model4 = KNeighborsClassifier(n_neighbors=5)
train(X,y,model4, spliter)

fold: 0
	 accuracy: 0.66
	 f1_score: 0.6344086021505376
fold: 1
	 accuracy: 0.675
	 f1_score: 0.6285714285714287
fold: 2
	 accuracy: 0.69
	 f1_score: 0.6836734693877551
fold: 3
	 accuracy: 0.66
	 f1_score: 0.6494845360824743
fold: 4
	 accuracy: 0.665
	 f1_score: 0.6666666666666666


(0.67, 0.6525609405717725)