In [1]:
from ucimlrepo import fetch_ucirepo 

from sklearn.model_selection import train_test_split
from normalizer import Normalizer
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [2]:
# Fetch dataset id 848 through ucimlrepo.
mushroom = fetch_ucirepo(id=848)

# data (as pandas dataframes): feature table and target table
X, y = mushroom.data.features, mushroom.data.targets

# `type` entry of the dataset's variables metadata
attributes_type = mushroom.variables.type

In [3]:
import importlib
import decision_tree
importlib.reload(decision_tree)  # Ricarica il modulo


# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=88,
    shuffle=True,
)

# Feature normalization is left disabled in this experiment: the normalizer
# is only used for its label-fixing helper.
normalizer = Normalizer(n_bins=4, normalization='minmax')
y_train = normalizer._fix_label(y_train)

# Decision tree classifier
dt = decision_tree.Decision_tree(
    splitting_criteria='gini',
    max_depth=30,
    min_samples_split=20,
    min_impurity_decrease=0.001,
)

# The value returned by fit() is reported as the training error.
training_error = dt.fit(X_train, y_train)
print(f'training error: {training_error}')

#----------PREDICTION
y_test = normalizer._fix_label(y_test)
prediction = dt.predict(X_test)

# Score the predictions against the fixed test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
precision = precision_score(y_true=y_test, y_pred=prediction, average='binary')
recall = recall_score(y_true=y_test, y_pred=prediction, average='binary')
f1 = f1_score(y_true=y_test, y_pred=prediction, average='binary')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")



training error: 0.0027428103571794085
Accuracy: 0.9956607172097592
Precision: 0.9966789667896679
Recall: 0.9935626264484091
F1-score: 0.9951183568204844


# Stratified K-Fold Cross Validation (SCV)

In [4]:
import importlib
import decision_tree
importlib.reload(decision_tree)  # Ricarica il modulo
from sklearn.model_selection import (train_test_split, KFold, StratifiedKFold, LeaveOneOut)

from ucimlrepo import fetch_ucirepo 

from normalizer import Normalizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


##-------------Load dataset
mushroom = fetch_ucirepo(id=848)

attributes_type = mushroom.variables.type
# data (as pandas dataframes)
X_tot, y_tot = mushroom.data.features, mushroom.data.targets

# 15% of the rows are set aside as a final test set; the folds below are
# built from the remaining rows only.
X, X_final_test, y, y_final_test = train_test_split(
    X_tot,
    y_tot,
    test_size=0.15,
    random_state=88,
    shuffle=True,
)
print('-- Dataset loaded --')
##-------------

# The normalizer is only used for its label-fixing helper.
normalizer = Normalizer(n_bins=4, normalization='minmax')

y = normalizer._fix_label(y)
y_final_test = normalizer._fix_label(y_final_test)

model = decision_tree.Decision_tree(
    splitting_criteria='gini',
    max_depth=20,
    min_samples_split=20,
    min_impurity_decrease=0.001,
)

N_SPLIT = 5

# Stratified folds without shuffling, so the fold composition is deterministic.
skf = StratifiedKFold(n_splits=N_SPLIT, shuffle=False)
skf_accuracies = []
count = 1
for train_index, test_index in skf.split(X, y):
    # split() yields positional indices, hence .iloc
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    training_error = model.fit(X_train, y_train)
    print(f'training error ({count}/{N_SPLIT}): {training_error}')

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print(f"    Accuracy ({count}/{N_SPLIT}): {accuracy}")

    skf_accuracies.append(accuracy)
    count = count + 1

print(f"Stratified K-Fold Cross-Validation Accuracies: {skf_accuracies}")


-- Dataset loaded --
training error (1/5): 0.006526031883639166
    Accuracy (1/5): 0.99171643228665
training error (2/5): 0.010764340413235082
    Accuracy (2/5): 0.987093045655943
training error (3/5): 0.010186389250108365
    Accuracy (3/5): 0.9897900211905221
training error (4/5): 0.007922556409083247
    Accuracy (4/5): 0.9914266448319045
training error (5/5): 0.008933946588966214
    Accuracy (5/5): 0.9902706868317118
Stratified K-Fold Cross-Validation Accuracies: [0.99171643228665, 0.987093045655943, 0.9897900211905221, 0.9914266448319045, 0.9902706868317118]


# K-Fold Cross Validation

In [7]:
import importlib
import decision_tree
importlib.reload(decision_tree)  # Ricarica il modulo
from sklearn.model_selection import (train_test_split, KFold, StratifiedKFold, KFold,LeaveOneOut)

from ucimlrepo import fetch_ucirepo 

from normalizer import Normalizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


##-------------Load dataset
mushroom = fetch_ucirepo(id=848)

attributes_type = mushroom.variables.type
# data (as pandas dataframes)
X = mushroom.data.features
y = mushroom.data.targets

print('-- Dataset loaded --')
##-------------

# The normalizer is only used for its label-fixing helper.
normalizer = Normalizer(n_bins=4, normalization='minmax')

model = decision_tree.Decision_tree(
    splitting_criteria='gini',
    max_depth=20,
    min_samples_split=20,
    min_impurity_decrease=0.001,
)

N_SPLIT = 10

# Plain (non-stratified) K-Fold; shuffling is seeded so the folds are repeatable.
kf = KFold(n_splits=N_SPLIT, shuffle=True, random_state=42)
kf_accuracies = []
for count, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    # KFold.split yields *positional* indices, so rows must be selected with
    # .iloc; label-based .loc only matches when the index is the default
    # 0..n-1 range.
    X_train = X.iloc[train_index]
    y_train = normalizer._fix_label(y.iloc[train_index])
    X_test = X.iloc[test_index]
    y_test = normalizer._fix_label(y.iloc[test_index])

    training_error = model.fit(X_train, y_train)
    print(f'training error ({count}/{N_SPLIT}): {training_error}')

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    kf_accuracies.append(accuracy)

# This cell uses KFold, not StratifiedKFold, and the label now says so.
print(f"K-Fold Cross-Validation Accuracies: {kf_accuracies}")


-- Dataset loaded --
training error (1/10): 0.010134274589716531
training error (2/10): 0.008988028092136386
training error (3/10): 0.008824278592482079
training error (4/10): 0.009752192423856483
training error (5/10): 0.013718569193260798
training error (6/10): 0.013554819693606493
training error (7/10): 0.00967941486845457
training error (8/10): 0.006950256540882792
training error (9/10): 0.007132200429387577
training error (10/10): 0.008423848770991393
Stratified K-Fold Cross-Validation Accuracies: [0.9893564761748813, 0.9909939413787457, 0.9906664483379728, 0.9888652366137219, 0.985917799246766, 0.9857540527263796, 0.9890289831341084, 0.9927951531029966, 0.992958899623383, 0.9916475597772683]


In [24]:
import importlib
import decision_tree
importlib.reload(decision_tree)  # Ricarica il modulo
from sklearn.model_selection import (train_test_split, KFold, StratifiedKFold, KFold,LeaveOneOut)

from ucimlrepo import fetch_ucirepo 

from normalizer import Normalizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


##-------------Load dataset
mushroom = fetch_ucirepo(id=848)

attributes_type = mushroom.variables.type
# data (as pandas dataframes)
X = mushroom.data.features
y = mushroom.data.targets

print('-- Dataset loaded --')
##-------------
# Candidate values explored by the grid search below.
param_grid = {
    'max_depth': [10, 15, 20],
    'min_samples_split': [10, 20, 30]
}

# Placeholders; both are overwritten by the first evaluated combination.
best_max_depth = 10
best_min_samples_split = 10
# Highest accuracy seen so far (None until the first combination is scored).
best_score = None

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=88, shuffle=True)
# The normalizer is only used for its label-fixing helper.
normalizer = Normalizer(n_bins=4, normalization='minmax')
y_train = normalizer._fix_label(y_train)
y_test = normalizer._fix_label(y_test)

# Accuracy of every combination, in evaluation order.
accuracies = []

# NOTE(review): combinations are ranked by accuracy on X_test, so the winning
# accuracy is an optimistic estimate; consider ranking on a validation split
# or with cross-validation instead.
for max_depth in param_grid['max_depth']:
    for min_samples_split in param_grid['min_samples_split']:
        model = decision_tree.Decision_tree(
            splitting_criteria='gini',
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_impurity_decrease=0.001,
        )

        t_e = model.fit(X_train, y_train)
        print(f'Train error: {t_e} -> max_dept= {max_depth} | min_samp = {min_samples_split}')

        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)

        # Track the running best instead of rescanning `accuracies` with max()
        # on every iteration; the strict '>' keeps the first combination on ties.
        if best_score is None or accuracy > best_score:
            best_score = accuracy
            best_max_depth = max_depth
            best_min_samples_split = min_samples_split


print(accuracies)
print(best_max_depth, best_min_samples_split)




-- Dataset loaded --
Train error: 0.16516221471701975 -> max_dept= 10 | min_samp = 10
Train error: 0.16516221471701975 -> max_dept= 10 | min_samp = 20
Train error: 0.16518268345102854 -> max_dept= 10 | min_samp = 30
Train error: 0.05929792242349811 -> max_dept= 15 | min_samp = 10
Train error: 0.05929792242349811 -> max_dept= 15 | min_samp = 20
Train error: 0.05946167229556852 -> max_dept= 15 | min_samp = 30
Train error: 0.0068774946269573225 -> max_dept= 20 | min_samp = 10
Train error: 0.0068774946269573225 -> max_dept= 20 | min_samp = 20
Train error: 0.007041244499027735 -> max_dept= 20 | min_samp = 30
[0.8357622400523989, 0.8357622400523989, 0.8357622400523989, 0.9355657442279351, 0.9355657442279351, 0.9354019977075487, 0.9904208285573931, 0.9904208285573931, 0.9902570820370067]
20 10


# Nested

In [8]:
import importlib
import decision_tree
importlib.reload(decision_tree)  # Ricarica il modulo
from sklearn.model_selection import (train_test_split, KFold, StratifiedKFold, KFold,LeaveOneOut)
import numpy as np
from ucimlrepo import fetch_ucirepo 

from normalizer import Normalizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import pickle

# The normalizer is only used for its label-fixing helper.
normalizer = Normalizer(n_bins=4, normalization='minmax')

##-------------Load dataset
mushroom = fetch_ucirepo(id=848)

# data (as pandas dataframes)
X_tot = mushroom.data.features
y_tot = mushroom.data.targets

# 15% of the rows are set aside as a final test set; the grid search below
# only sees X / y.
X, X_final_test, y, y_final_test = train_test_split(X_tot, y_tot, test_size=0.15, random_state=88, shuffle=True)

y = normalizer._fix_label(y)
y_final_test = normalizer._fix_label(y_final_test)

print('-- Dataset loaded --')
##-------------
# Candidate values; every combination is cross-validated.
param_grid = {
    'max_depth': [20, 25, 30],
    'min_samples_split': [10, 20],
    'criterion': ['gini', 'entropy', 'sqrt_split'],
    'min_impurity_decrease': [0.001, 0.005]
}

N_FOLDS = 5

# (max_depth, min_samples_split, criterion, min_impurity_decrease)
#     -> {'accuracy', 'precision', 'recall', 'f1'}, each the mean over the folds
metrics = {}

# Without shuffling the splitter yields the same folds every time, so one
# instance is shared by all parameter combinations.
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=False)

for max_depth in param_grid['max_depth']:
    for min_samples_split in param_grid['min_samples_split']:
        for criterion in param_grid['criterion']:
            for min_impurity_decrease in param_grid['min_impurity_decrease']:
                model = decision_tree.Decision_tree(
                    splitting_criteria=criterion,
                    max_depth=max_depth,
                    min_samples_split=min_samples_split,
                    min_impurity_decrease=min_impurity_decrease,
                )

                # Per-fold scores of the current combination.
                skf_accuracies = []
                skf_precisions = []
                skf_recalls = []
                skf_f1 = []
                print(f'-- TRAINING ::: max_dept= {max_depth} | min_samp = {min_samples_split} | criterion = {criterion} | min_impurity = {min_impurity_decrease} ::: --')
                for count, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
                    # split() yields positional indices, hence .iloc
                    X_train = X.iloc[train_index]
                    y_train = y.iloc[train_index]
                    X_test = X.iloc[test_index]
                    y_test = y.iloc[test_index]
                    training_error = model.fit(X_train, y_train)

                    print(f'    ({count}/{N_FOLDS}): training error: {training_error}')

                    y_pred = model.predict(X_test)

                    accuracy = accuracy_score(y_test, y_pred)
                    precision = precision_score(y_test, y_pred)
                    recall = recall_score(y_test, y_pred)
                    f1 = f1_score(y_test, y_pred)

                    print(f'     ({count}/{N_FOLDS}): accuracy: {accuracy}')

                    skf_accuracies.append(accuracy)
                    skf_precisions.append(precision)
                    skf_recalls.append(recall)
                    skf_f1.append(f1)

                metrics[(max_depth, min_samples_split, criterion, min_impurity_decrease)] = {
                    'accuracy': np.mean(skf_accuracies),
                    'precision': np.mean(skf_precisions),
                    'recall': np.mean(skf_recalls),
                    'f1': np.mean(skf_f1)
                }

                # Checkpoint after every combination: the grid takes long to
                # run, and dumping only at the very end loses all finished
                # results if the cell is interrupted.
                with open('metrics.pkl', 'wb') as f:  # binary mode, as pickle requires
                    pickle.dump(metrics, f)  # serialize the metrics dict






-- Dataset loaded --
-- TRAINING ::: max_dept= 20 | min_samp = 10 | criterion = gini | min_impurity = 0.001 ::: --
    (1/5): training error: 0.006453787988248326
     (1/5): accuracy: 0.99171643228665
    (2/5): training error: 0.01059577132398979
     (2/5): accuracy: 0.9871893662107494


KeyboardInterrupt: 

In [5]:
def best_parameter(metrics):
    """Find, for each metric, the parameter combination with the highest score.

    Args:
        metrics: mapping from a parameter combination (the dict key) to a dict
            holding the keys 'accuracy', 'precision', 'recall' and 'f1'.

    Returns:
        An 8-tuple: the best combination for accuracy, precision, recall and
        f1 (in that order), followed by the corresponding best accuracy,
        precision, recall and f1 values. For an empty `metrics` the
        combinations are None and the values are 0.
    """
    metric_names = ('accuracy', 'precision', 'recall', 'f1')
    best_value = {name: 0 for name in metric_names}
    best_combo = {name: None for name in metric_names}

    for combo, values in metrics.items():
        for name in metric_names:
            # The first combination is always accepted, so a combination is
            # returned even when every score is 0 (previously the result
            # variables were left unbound in that case). After that, the
            # strict '>' keeps the earliest combination on ties.
            if best_combo[name] is None or values[name] > best_value[name]:
                best_value[name] = values[name]
                best_combo[name] = combo

    return (
        best_combo['accuracy'],
        best_combo['precision'],
        best_combo['recall'],
        best_combo['f1'],
        best_value['accuracy'],
        best_value['precision'],
        best_value['recall'],
        best_value['f1'],
    )


# `metrics` must already exist in the kernel; it is the dict of per-combination
# scores built by the grid-search cell.
(
    best_combo_accuracy,
    best_combo_precision,
    best_combo_recall,
    best_combo_f1,
    best_accuracy,
    best_precision,
    best_recall,
    best_f1,
) = best_parameter(metrics)

# Report the winning combination and its score, one line per metric.
print(f'Best parameters for accuracy: {best_combo_accuracy} with accuracy: {best_accuracy}')
print(f'Best parameters for precision: {best_combo_precision} with precision: {best_precision}')
print(f'Best parameters for recall: {best_combo_recall} with recall: {best_recall}')
print(f'Best parameters for f1: {best_combo_f1} with f1: {best_f1}')

Best parameters for accuracy: (30, 10, 'entropy', 0.005) with accuracy: 0.577069249058559
Best parameters for precision: (30, 10, 'entropy', 0.005) with precision: 0.5305435356425227
Best parameters for recall: (10, 10, 'entropy', 0.001) with recall: 0.5901204443856491
Best parameters for f1: (10, 10, 'entropy', 0.005) with f1: 0.5255871225929042
