In [183]:
import pandas as pd
from sklearn import tree
import pydotplus
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import matplotlib.image as pltimg
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix,f1_score
import collections
from IPython.display import Image
import os
import time

In [184]:
training_df = pd.read_csv(os.path.join(os.path.abspath(""), '../datasets/merged/training_dataset.csv'))
val_df = pd.read_csv(os.path.join(os.path.abspath(""), '../datasets/merged/validate_dataset.csv')) 
training_df.drop('Unnamed: 0',axis=1,inplace=True)
num_rows_with_nan = training_df.isna().any(axis=1).sum()

print(f"Number of rows with at least one NaN: {num_rows_with_nan}")
training_df.head()


Number of rows with at least one NaN: 0


Unnamed: 0,Date_ID,Month,NumberOfDaysInMonth,Quarter,Year,LeapYear,Minimum Temperature,Maximum Temperature,Barley,Canola,...,Quebec,Saskatchewan,Territories,Yukon,Atlantic_Region,British Columbia_Region,Canada_Region,Prairies_Region,Territories_Region,Total_Value
0,88,5,31,2,2008,1,3,29,False,True,...,True,False,False,False,False,False,True,False,False,1
1,32,9,30,3,2003,0,5,30,False,False,...,False,False,False,False,False,False,False,True,False,1
2,35,12,31,4,2003,0,5,34,False,False,...,False,False,False,False,False,True,False,False,False,1
3,94,11,30,4,2008,1,3,29,False,True,...,False,True,False,False,False,False,False,True,False,2
4,62,3,31,1,2006,0,5,34,False,False,...,False,True,False,False,False,False,False,True,False,1


In [185]:
X_train = training_df.drop('Total_Value',axis=1)
X_train.head()

Unnamed: 0,Date_ID,Month,NumberOfDaysInMonth,Quarter,Year,LeapYear,Minimum Temperature,Maximum Temperature,Barley,Canola,...,Prince Edward Island,Quebec,Saskatchewan,Territories,Yukon,Atlantic_Region,British Columbia_Region,Canada_Region,Prairies_Region,Territories_Region
0,88,5,31,2,2008,1,3,29,False,True,...,False,True,False,False,False,False,False,True,False,False
1,32,9,30,3,2003,0,5,30,False,False,...,False,False,False,False,False,False,False,False,True,False
2,35,12,31,4,2003,0,5,34,False,False,...,False,False,False,False,False,False,True,False,False,False
3,94,11,30,4,2008,1,3,29,False,True,...,False,False,True,False,False,False,False,False,True,False
4,62,3,31,1,2006,0,5,34,False,False,...,False,False,True,False,False,False,False,False,True,False


In [186]:
y_train = training_df['Total_Value']
y_train.head()

0    1
1    1
2    1
3    2
4    1
Name: Total_Value, dtype: int64

In [187]:
X_val = val_df.drop(['Unnamed: 0','Total_Value'],axis=1)
y_val = val_df['Total_Value']

In [188]:
X_val.head()

Unnamed: 0,Date_ID,Month,NumberOfDaysInMonth,Quarter,Year,LeapYear,Minimum Temperature,Maximum Temperature,Barley,Canola,...,Prince Edward Island,Quebec,Saskatchewan,Territories,Yukon,Atlantic_Region,British Columbia_Region,Canada_Region,Prairies_Region,Territories_Region
0,105,10,31,4,2009,0,5,28,True,False,...,False,False,False,False,False,False,False,True,False,False
1,85,2,29,1,2008,1,5,28,True,False,...,False,False,False,False,False,False,False,False,True,False
2,241,2,28,1,2021,0,5,28,False,False,...,False,False,False,False,False,False,False,True,False,False
3,186,7,31,3,2016,1,5,30,False,False,...,False,False,False,False,False,False,False,False,True,False
4,215,12,31,4,2018,0,5,34,False,False,...,False,False,False,False,False,False,False,False,True,False


In [189]:
y_val.head()

0    1
1    1
2    1
3    1
4    1
Name: Total_Value, dtype: int64

In [190]:
criteria = ["gini","entropy"]
splitters = ["best","random"]
classifiers_scores = []
#print("SPLIT")
for i in range(5,50,5):
    for criterion in criteria:
        for splitter in splitters:
            start = time.time()
            classifier = DecisionTreeClassifier(criterion=criterion,splitter=splitter,max_depth=i)
            classifier.fit(X_train, y_train)
            end= time.time()
            training_time = end - start  
            
            y_pred = classifier.predict(X_val)
            score = f1_score(y_val, y_pred, average='macro')
            classifiers_scores.append((criterion, splitter, i, "max_depth", score,training_time))

            print("Max Depth",str(i),criterion,splitter)
            print(confusion_matrix(y_val, y_pred))
            print(classification_report(y_val, y_pred))

for i in range(5,50,5):
    for criterion in criteria:
        for splitter in splitters:
            start = time.time()
            classifier = DecisionTreeClassifier(criterion=criterion,splitter=splitter,min_samples_split=i)
            classifier.fit(X_train, y_train)
            end= time.time()
            training_time = end - start  
            y_pred = classifier.predict(X_val)
            score = f1_score(y_val, y_pred, average='macro')
            classifiers_scores.append((criterion, splitter, i, "min_samples_split", score,training_time))
            

            print("Min Samples",str(i),criterion,splitter)
            print(confusion_matrix(y_val, y_pred))
            print(classification_report(y_val, y_pred))

Max Depth 5 gini best
[[1062    4    1    0    0    0    0    0]
 [  28   28   16    0    0    0    0    0]
 [   0    3   26    0    0    0    0    0]
 [   1    0   12    0    0    0    0    0]
 [   0    0    7    0    0    0    0    0]
 [   0    0    5    0    0    0    0    0]
 [   0    0    1    0    0    0    0    0]
 [   0    0    1    0    0    0    0    0]]
              precision    recall  f1-score   support

           1       0.97      1.00      0.98      1067
           2       0.80      0.39      0.52        72
           3       0.38      0.90      0.53        29
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00         7
           6       0.00      0.00      0.00         5
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1

    accuracy                           0.93      1195
   macro avg       0.27      0.29      0.25      1195
weighted avg       0.93      0.93    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Max Depth 5 entropy random
[[1056   10    1    0    0    0    0    0]
 [  26   38    5    3    0    0    0    0]
 [   0   11   13    5    0    0    0    0]
 [   0    7    2    4    0    0    0    0]
 [   0    1    1    5    0    0    0    0]
 [   0    1    0    4    0    0    0    0]
 [   0    0    0    1    0    0    0    0]
 [   0    0    0    1    0    0    0    0]]
              precision    recall  f1-score   support

           1       0.98      0.99      0.98      1067
           2       0.56      0.53      0.54        72
           3       0.59      0.45      0.51        29
           4       0.17      0.31      0.22        13
           5       0.00      0.00      0.00         7
           6       0.00      0.00      0.00         5
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1

    accuracy                           0.93      1195
   macro avg       0.29      0.28      0.28      1195
weighted avg       0.92      0.9

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Max Depth 10 entropy best
[[1057   10    0    0    0    0    0    0]
 [  14   45   12    1    0    0    0    0]
 [   0   10   17    2    0    0    0    0]
 [   0    2    5    6    0    0    0    0]
 [   0    0    5    2    0    0    0    0]
 [   0    0    2    3    0    0    0    0]
 [   0    0    0    1    0    0    0    0]
 [   0    0    0    1    0    0    0    0]]
              precision    recall  f1-score   support

           1       0.99      0.99      0.99      1067
           2       0.67      0.62      0.65        72
           3       0.41      0.59      0.49        29
           4       0.38      0.46      0.41        13
           5       0.00      0.00      0.00         7
           6       0.00      0.00      0.00         5
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1

    accuracy                           0.94      1195
   macro avg       0.31      0.33      0.32      1195
weighted avg       0.94      0.94

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Max Depth 15 gini random
[[1050   14    2    1    0    0    0    0    0    0]
 [  12   43   15    0    0    0    2    0    0    0]
 [   1   10   10    8    0    0    0    0    0    0]
 [   0    1    8    1    2    1    0    0    0    0]
 [   0    1    1    3    1    0    1    0    0    0]
 [   0    0    0    2    2    0    1    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    1    0]
 [   0    0    0    0    0    0    0    0    0    1]
 [   0    0    0    0    0    0    0    0    0    0]]
              precision    recall  f1-score   support

           1       0.99      0.98      0.99      1067
           2       0.62      0.60      0.61        72
           3       0.28      0.34      0.31        29
           4       0.07      0.08      0.07        13
           5       0.20      0.14      0.17         7
           6       0.00      0.00      0.00         5
           7       0.00      0.00      0.00         0
           

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

              precision    recall  f1-score   support

           1       0.98      0.99      0.98      1067
           2       0.60      0.56      0.58        72
           3       0.33      0.34      0.34        29
           4       0.25      0.31      0.28        13
           5       0.38      0.43      0.40         7
           6       0.25      0.20      0.22         5
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1

    accuracy                           0.93      1195
   macro avg       0.35      0.35      0.35      1195
weighted avg       0.93      0.93      0.93      1195

Max Depth 20 gini best
[[1049   14    4    0    0    0    0    0    0]
 [  18   41   12    1    0    0    0    0    0]
 [   1   12    8    7    0    1    0    0    0]
 [   0    1    2    8    2    0    0    0    0]
 [   0    1    3    0    2    0    1    0    0]
 [   0    0    1    1    1    2    0    0    0]
 [   0    0    0    0    0    0    0  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Max Depth 25 gini random
[[1046   19    0    1    1    0    0    0    0]
 [  21   41    8    1    0    0    0    0    1]
 [   1    8   11    6    2    1    0    0    0]
 [   0    1    6    5    1    0    0    0    0]
 [   0    1    2    1    3    0    0    0    0]
 [   0    0    0    1    2    2    0    0    0]
 [   0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    1    0    0]
 [   0    0    0    0    0    0    0    0    1]]
              precision    recall  f1-score   support

           1       0.98      0.98      0.98      1067
           2       0.59      0.57      0.58        72
           3       0.41      0.38      0.39        29
           4       0.33      0.38      0.36        13
           5       0.33      0.43      0.38         7
           6       0.67      0.40      0.50         5
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         1
           9       0.50      1.00      0.67         1



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Max Depth 30 gini best
[[1046   17    4    0    0    0    0    0]
 [  16   41   14    1    0    0    0    0]
 [   2   12    8    7    0    0    0    0]
 [   0    2    1    8    2    0    0    0]
 [   0    0    3    1    2    0    0    1]
 [   0    0    1    0    2    2    0    0]
 [   0    0    0    0    0    0    0    1]
 [   0    0    0    0    0    0    1    0]]
              precision    recall  f1-score   support

           1       0.98      0.98      0.98      1067
           2       0.57      0.57      0.57        72
           3       0.26      0.28      0.27        29
           4       0.47      0.62      0.53        13
           5       0.33      0.29      0.31         7
           6       1.00      0.40      0.57         5
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1

    accuracy                           0.93      1195
   macro avg       0.45      0.39      0.40      1195
weighted avg       0.93      0.93   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Max Depth 30 entropy random
[[   0    0    0    0    0    0    0    0    0    0]
 [   1 1048   17    0    1    0    0    0    0    0]
 [   0   19   39   13    0    0    0    0    0    1]
 [   0    0   11   11    5    0    2    0    0    0]
 [   0    0    2    6    5    0    0    0    0    0]
 [   0    0    0    3    1    3    0    0    0    0]
 [   0    0    0    0    1    1    2    0    0    1]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    1]
 [   0    0    0    0    0    0    0    1    0    0]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.98      0.98      0.98      1067
           2       0.57      0.54      0.55        72
           3       0.33      0.38      0.35        29
           4       0.38      0.38      0.38        13
           5       0.75      0.43      0.55         7
           6       0.50      0.40      0.44         5
        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Max Depth 35 entropy best
[[1052   15    0    0    0    0    0    0    0]
 [  15   38   17    2    0    0    0    0    0]
 [   0   12    8    6    3    0    0    0    0]
 [   0    2    3    8    0    0    0    0    0]
 [   0    1    3    1    1    0    1    0    0]
 [   0    0    1    0    1    3    0    0    0]
 [   0    0    0    0    0    0    0    0    0]
 [   0    0    1    0    0    0    0    0    0]
 [   0    0    0    1    0    0    0    0    0]]
              precision    recall  f1-score   support

           1       0.99      0.99      0.99      1067
           2       0.56      0.53      0.54        72
           3       0.24      0.28      0.26        29
           4       0.44      0.62      0.52        13
           5       0.20      0.14      0.17         7
           6       1.00      0.60      0.75         5
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Max Depth 40 gini random
[[1048   17    2    0    0    0    0    0    0    0]
 [  18   34   17    2    0    0    1    0    0    0]
 [   1   10   11    5    2    0    0    0    0    0]
 [   1    0    4    6    1    1    0    0    0    0]
 [   0    0    2    3    0    0    1    0    1    0]
 [   0    0    0    2    1    1    0    0    0    1]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    1    0    0]
 [   0    0    0    0    0    0    0    0    1    0]
 [   0    0    0    0    0    0    0    0    0    0]]
              precision    recall  f1-score   support

           1       0.98      0.98      0.98      1067
           2       0.56      0.47      0.51        72
           3       0.31      0.38      0.34        29
           4       0.33      0.46      0.39        13
           5       0.00      0.00      0.00         7
           6       0.50      0.20      0.29         5
           7       0.00      0.00      0.00         0
           

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Max Depth 45 gini best
[[1049   15    3    0    0    0    0    0    0]
 [  17   38   16    1    0    0    0    0    0]
 [   2   12    9    5    0    1    0    0    0]
 [   0    2    1    8    2    0    0    0    0]
 [   0    1    3    0    2    0    1    0    0]
 [   0    0    1    1    1    2    0    0    0]
 [   0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    1    0    0]
 [   0    0    0    0    0    0    0    1    0]]
              precision    recall  f1-score   support

           1       0.98      0.98      0.98      1067
           2       0.56      0.53      0.54        72
           3       0.27      0.31      0.29        29
           4       0.53      0.62      0.57        13
           5       0.40      0.29      0.33         7
           6       0.67      0.40      0.50         5
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1

  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Max Depth 45 entropy random
[[   0    0    0    0    0    0    0    0    0    0]
 [   2 1048   13    3    0    1    0    0    0    0]
 [   0   17   46    8    0    0    1    0    0    0]
 [   0    0   10    9    4    3    2    1    0    0]
 [   0    0    4    3    5    0    0    1    0    0]
 [   0    1    0    2    2    2    0    0    0    0]
 [   0    0    0    0    0    1    1    1    0    2]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    1    0    0]
 [   0    0    0    0    0    0    0    0    0    1]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.98      0.98      0.98      1067
           2       0.63      0.64      0.63        72
           3       0.36      0.31      0.33        29
           4       0.45      0.38      0.42        13
           5       0.29      0.29      0.29         7
           6       0.25      0.20      0.22         5
        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Min Samples 5 entropy best
[[   0    0    0    0    0    0    0    0    0    0]
 [   1 1049   17    0    0    0    0    0    0    0]
 [   0   14   40   17    1    0    0    0    0    0]
 [   0    1   11   12    3    1    1    0    0    0]
 [   0    0    0    5    7    1    0    0    0    0]
 [   0    0    0    3    1    2    0    1    0    0]
 [   0    0    0    1    0    2    2    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    1    0    0]
 [   0    0    0    0    0    0    0    1    0    0]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.99      0.98      0.98      1067
           2       0.59      0.56      0.57        72
           3       0.32      0.41      0.36        29
           4       0.58      0.54      0.56        13
           5       0.33      0.29      0.31         7
           6       0.67      0.40      0.50         5
         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Min Samples 10 gini random
[[1057   10    0    0    0    0    0    0    0]
 [  22   37   11    2    0    0    0    0    0]
 [   1   12    6   10    0    0    0    0    0]
 [   0    5    3    4    0    0    1    0    0]
 [   0    0    3    2    1    1    0    0    0]
 [   0    0    1    1    2    0    1    0    0]
 [   0    0    0    0    0    0    0    0    0]
 [   0    0    0    1    0    0    0    0    0]
 [   0    0    0    1    0    0    0    0    0]]
              precision    recall  f1-score   support

           1       0.98      0.99      0.98      1067
           2       0.58      0.51      0.54        72
           3       0.25      0.21      0.23        29
           4       0.19      0.31      0.24        13
           5       0.33      0.14      0.20         7
           6       0.00      0.00      0.00         5
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.98      0.99      0.99      1067
           2       0.61      0.60      0.61        72
           3       0.32      0.38      0.35        29
           4       0.00      0.00      0.00        13
           5       0.40      0.29      0.33         7
           6       0.33      0.40      0.36         5
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1

    accuracy                           0.93      1195
   macro avg       0.27      0.26      0.26      1195
weighted avg       0.93      0.93      0.93      1195

Min Samples 15 gini best
[[1052   14    1    0    0    0    0    0    0]
 [  16   44   11    0    1    0    0    0    0]
 [   0   14    9    4    1    1    0    0    0]
 [   0    1    3    9    0    0    0    0    0]
 [   0    0    4    2   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Min Samples 20 gini random
[[1055   10    1    0    1    0    0    0]
 [  22   42    5    3    0    0    0    0]
 [   0   13   11    5    0    0    0    0]
 [   0    3    5    3    0    2    0    0]
 [   0    1    3    1    1    1    0    0]
 [   0    0    1    1    1    2    0    0]
 [   0    0    0    0    0    1    0    0]
 [   0    0    0    0    0    1    0    0]]
              precision    recall  f1-score   support

           1       0.98      0.99      0.98      1067
           2       0.61      0.58      0.60        72
           3       0.42      0.38      0.40        29
           4       0.23      0.23      0.23        13
           5       0.33      0.14      0.20         7
           6       0.29      0.40      0.33         5
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1

    accuracy                           0.93      1195
   macro avg       0.36      0.34      0.34      1195
weighted avg       0.93      0.9

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Min Samples 25 gini best
[[1057   10    0    0    0    0    0    0]
 [  22   37   12    1    0    0    0    0]
 [   0   13    8    8    0    0    0    0]
 [   0    1    1   11    0    0    0    0]
 [   0    0    2    4    0    1    0    0]
 [   0    0    1    2    0    2    0    0]
 [   0    0    0    0    0    1    0    0]
 [   0    0    0    0    0    1    0    0]]
              precision    recall  f1-score   support

           1       0.98      0.99      0.99      1067
           2       0.61      0.51      0.56        72
           3       0.33      0.28      0.30        29
           4       0.42      0.85      0.56        13
           5       0.00      0.00      0.00         7
           6       0.40      0.40      0.40         5
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1

    accuracy                           0.93      1195
   macro avg       0.34      0.38      0.35      1195
weighted avg       0.93      0.93 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.99      0.99      0.99      1067
           2       0.65      0.65      0.65        72
           3       0.44      0.38      0.41        29
           4       0.38      0.69      0.49        13
           5       0.00      0.00      0.00         7
           6       0.33      0.20      0.25         5
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1

    accuracy                           0.94      1195
   macro avg       0.31      0.32      0.31      1195
weighted avg       0.94      0.94      0.94      1195

Min Samples 25 entropy random
[[1053   14    0    0    0    0    0    0]
 [  21   32   18    0    0    1    0    0]
 [   0   11   10    7    1    0    0    0]
 [   0    6    0    7    0    0    0    0]
 [   0    1    1    3    1    1    0    0]
 [   0    0    1    1    1    2    0    0]
 [   0 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Min Samples 35 gini best
[[1057   10    0    0    0    0    0    0]
 [  21   38   12    1    0    0    0    0]
 [   0   13   10    6    0    0    0    0]
 [   0    1    2   10    0    0    0    0]
 [   0    0    1    4    1    1    0    0]
 [   0    0    1    1    1    2    0    0]
 [   0    0    0    0    0    1    0    0]
 [   0    0    0    0    0    1    0    0]]
              precision    recall  f1-score   support

           1       0.98      0.99      0.99      1067
           2       0.61      0.53      0.57        72
           3       0.38      0.34      0.36        29
           4       0.45      0.77      0.57        13
           5       0.50      0.14      0.22         7
           6       0.40      0.40      0.40         5
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1

    accuracy                           0.94      1195
   macro avg       0.42      0.40      0.39      1195
weighted avg       0.93      0.94 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

              precision    recall  f1-score   support

           1       0.99      0.99      0.99      1067
           2       0.71      0.76      0.74        72
           3       0.52      0.41      0.46        29
           4       0.38      0.77      0.51        13
           5       0.00      0.00      0.00         7
           6       0.33      0.20      0.25         5
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1

    accuracy                           0.95      1195
   macro avg       0.37      0.39      0.37      1195
weighted avg       0.94      0.95      0.95      1195

Min Samples 35 entropy random
[[1058    9    0    0    0    0    0    0]
 [  18   49    3    2    0    0    0    0]
 [   0    8   20    0    1    0    0    0]
 [   0    2    8    3    0    0    0    0]
 [   0    0    4    3    0    0    0    0]
 [   0    0    2    1    1    1    0    0]
 [   0    0    0    1    0    0    0    0]
 [   0    0    0  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Min Samples 45 gini random
[[1055   12    0    0    0    0    0    0]
 [  19   49    2    2    0    0    0    0]
 [   0   12   14    2    0    1    0    0]
 [   0    3    7    2    0    1    0    0]
 [   0    0    2    4    0    1    0    0]
 [   0    0    1    1    1    2    0    0]
 [   0    0    0    1    0    0    0    0]
 [   0    0    0    0    0    1    0    0]]
              precision    recall  f1-score   support

           1       0.98      0.99      0.99      1067
           2       0.64      0.68      0.66        72
           3       0.54      0.48      0.51        29
           4       0.17      0.15      0.16        13
           5       0.00      0.00      0.00         7
           6       0.33      0.40      0.36         5
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1

    accuracy                           0.94      1195
   macro avg       0.33      0.34      0.34      1195
weighted avg       0.93      0.9

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [191]:

classifiers_scores.sort(key=lambda x: x[4], reverse=True)
results = pd.DataFrame(classifiers_scores,columns=['criterion','splitter', 'max_depth / min_samples','technique', 'score','training_time'])

print(classifiers_scores[0])

# Print the best classifier
print("Best classifier configuration:")
print("Criterion: {}, Splitter: {}, Value: {}, Parameter: {}, F1 Score: {}".format(*classifiers_scores[0]))
results

('entropy', 'best', 45, 'max_depth', 0.4621582429403033, 0.09668827056884766)
Best classifier configuration:
Criterion: entropy, Splitter: best, Value: 45, Parameter: max_depth, F1 Score: 0.4621582429403033


Unnamed: 0,criterion,splitter,max_depth / min_samples,technique,score,training_time
0,entropy,best,45,max_depth,0.462158,0.096688
1,entropy,random,35,max_depth,0.437309,0.035627
2,entropy,best,15,max_depth,0.431227,0.097068
3,gini,random,5,min_samples_split,0.430645,0.034418
4,gini,random,25,max_depth,0.427666,0.042227
...,...,...,...,...,...,...
67,gini,random,5,max_depth,0.261585,0.028427
68,gini,best,5,max_depth,0.254778,0.077224
69,gini,random,10,min_samples_split,0.243384,0.041437
70,gini,random,15,max_depth,0.214163,0.032619


In [192]:
test_df = pd.read_csv(os.path.join(os.path.abspath(""), '../datasets/merged/test_dataset.csv')) 
X_test = val_df.drop(['Unnamed: 0','Total_Value'],axis=1)
y_test = val_df['Total_Value']
features = X_train.columns.tolist()
type = 'max_depth'
if classifiers_scores[0][3] == type:
    dtree = DecisionTreeClassifier(criterion=classifiers_scores[0][0],splitter=classifiers_scores[0][1],max_depth=classifiers_scores[0][2])
else:
    dtree = DecisionTreeClassifier(criterion=classifiers_scores[0][0],splitter=classifiers_scores[0][1],min_samples_split=classifiers_scores[0][2])
dtree.fit(X_train, y_train)

end= time.time()
training_time = end - start       
y_pred = classifier.predict(X_test)
score = f1_score(y_test, y_pred, average='macro')
print(classifiers_scores[0][0],classifiers_scores[0][1],classifiers_scores[0][2],classifiers_scores[0][3],score,training_time)

data = tree.export_graphviz(dtree, out_file=None, feature_names=features, node_ids=True)
graph = pydotplus.graph_from_dot_data(data)
graph.set_size('"100,100!"')


entropy best 45 0.30230813939414597 0.28783178329467773
