In [0]:
import tqdm
import pandas as pd
from collections import Counter
from itertools import cycle
import graphviz
from sklearn import datasets, metrics, tree
from sklearn.cluster import KMeans, MiniBatchKMeans, Birch, DBSCAN
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import LabelEncoder

def draw_tree(name, data, features, class_names):
  dot_data = tree.export_graphviz(data, out_file=None) 
  graph = graphviz.Source(dot_data) 
  graph.render(name)
  dot_data = tree.export_graphviz(data, out_file=None, 
                      feature_names=features,  
                      class_names=class_names,  
                      filled=True, rounded=True,  
                      special_characters=True)  
  graph = graphviz.Source(dot_data)  
  return graph

In [3]:
bank_df = pd.read_csv('/content/drive/My Drive/bank-additional-full.csv', sep=";")
categorical = ["job", "marital", "education", "default", "housing", "loan", "contact", "month", "day_of_week",
               "poutcome", "y"]
bank_df_target = bank_df["y"]
for i in bank_df.columns:
  if i in categorical:
    bank_df[i] = pd.Categorical(bank_df[i]).codes
X_bank_train, X_bank_test = train_test_split(bank_df, test_size=0.3333)
Y_bank_train, Y_bank_test = X_bank_train['y'], X_bank_test['y']
X_bank_test = X_bank_test.drop('y', axis = 1)
X_bank_train = X_bank_train.drop('y', axis = 1)
print(bank_df_target)
print(bank_df["y"])
bank_df.head()

0         no
1         no
2         no
3         no
4         no
        ... 
41183    yes
41184     no
41185     no
41186    yes
41187     no
Name: y, Length: 41188, dtype: object
0        0
1        0
2        0
3        0
4        0
        ..
41183    1
41184    0
41185    0
41186    1
41187    0
Name: y, Length: 41188, dtype: int8


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,3,1,0,0,0,0,1,6,1,261,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
1,57,7,1,3,1,0,0,1,6,1,149,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
2,37,7,1,3,0,2,0,1,6,1,226,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
3,40,0,1,1,0,0,0,1,6,1,151,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
4,56,7,1,3,0,0,2,1,6,1,307,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0


In [0]:
def checkClassif(X_train, Y_train, X_test, Y_test, samples=None, max_depth=None):
  def printMax(matrices_gini, matrices_entropy, f1_gini, f1_entropy):
    v=list(f1_gini.values())
    k=list(f1_gini.keys())
    f1_gini_max = k[v.index(max(v))]
    v=list(f1_entropy.values())
    k=list(f1_entropy.keys())
    f1_entropy_max = k[v.index(max(v))]
    print(f'Min_samples gini: {f1_gini_max}')
    print(f'Min_samples entropy: {f1_entropy_max}')
    print('Confusion matrix for gini')
    print(matrices_gini[f1_gini_max])
    print('Confusion matrix for entropy')
    print(matrices_entropy[f1_entropy_max])
    print('F1 score gini')
    print(f1_gini[f1_gini_max])  
    print('F1 score entropy')
    print(f1_entropy[f1_entropy_max])
  matrices_gini = {}
  matrices_entropy = {}
  f1_gini = {}
  f1_entropy = {}
  if samples:
    for i in range(1, samples+1, 2):
      if max_depth:
        bank_tree_gini = tree.DecisionTreeClassifier(criterion='gini', min_samples_leaf=i, max_depth=max_depth)
        bank_tree_entropy = tree.DecisionTreeClassifier(criterion='entropy', min_samples_leaf=i, max_depth=max_depth)
      else:
        bank_tree_gini = tree.DecisionTreeClassifier(criterion='gini', min_samples_leaf=i)
        bank_tree_entropy = tree.DecisionTreeClassifier(criterion='entropy', min_samples_leaf=i)
      bank_tree_gini.fit(X_train, Y_train)
      bank_tree_entropy.fit(X_train, Y_train)
      predicts_gini = bank_tree_gini.predict(X_bank_test)
      predicts_entropy = bank_tree_entropy.predict(X_test)
      matrices_gini[i] = metrics.confusion_matrix(predicts_gini, Y_test)
      matrices_entropy[i] = metrics.confusion_matrix(predicts_entropy, Y_test)
      f1_gini[i] = metrics.f1_score(predicts_gini, Y_test)
      f1_entropy[i] = metrics.f1_score(predicts_entropy, Y_test)
    printMax(matrices_gini, matrices_entropy, f1_gini, f1_entropy)
  else:
    if max_depth:
      bank_tree_gini = tree.DecisionTreeClassifier(criterion='gini', max_depth=max_depth)
      bank_tree_entropy = tree.DecisionTreeClassifier(criterion='entropy', max_depth=max_depth)
    else:
      bank_tree_gini = tree.DecisionTreeClassifier(criterion='gini')
      bank_tree_entropy = tree.DecisionTreeClassifier(criterion='entropy')
    bank_tree_gini.fit(X_train, Y_train)
    bank_tree_entropy.fit(X_train, Y_train)
    predicts_gini = bank_tree_gini.predict(X_bank_test)
    predicts_entropy = bank_tree_entropy.predict(X_test)
    print('Confusion matrix for gini')
    print(metrics.confusion_matrix(predicts_gini, Y_test))

    print('Confusion matrix for entropy')
    print(metrics.confusion_matrix(predicts_entropy, Y_test))
    print('F1 score gini')
    print(metrics.f1_score(predicts_gini, Y_test))  
    print('F1 score entropy')
    print(metrics.f1_score(predicts_entropy, Y_test))

In [5]:
checkClassif(X_bank_train, Y_bank_train, X_bank_test, Y_bank_test)

Confusion matrix for gini
[[11404   716]
 [  780   828]]
Confusion matrix for entropy
[[11403   733]
 [  781   811]]
F1 score gini
0.5253807106598986
F1 score entropy
0.5172193877551021


In [8]:
for i in tqdm.tqdm(range(2, 20)):
  checkClassif(X_bank_train, Y_bank_train, X_bank_test, Y_bank_test, samples=150, max_depth=i)

  6%|▌         | 1/18 [00:06<01:44,  6.14s/it]

[1, 1, 0.49101338432122366, 0.49453448925744437]


 11%|█         | 2/18 [00:13<01:45,  6.60s/it]

[1, 1, 0.5857329842931936, 0.5875888817065288]


 17%|█▋        | 3/18 [00:23<01:50,  7.39s/it]

[11, 13, 0.5440414507772021, 0.5164319248826291]


 22%|██▏       | 4/18 [00:33<01:57,  8.40s/it]

[113, 125, 0.5750360750360749, 0.5774597495527727]


 28%|██▊       | 5/18 [00:46<02:04,  9.58s/it]

[41, 39, 0.5942778352292313, 0.592032967032967]


 33%|███▎      | 6/18 [00:59<02:08, 10.72s/it]

[59, 59, 0.592205984690327, 0.5867132867132867]


 39%|███▉      | 7/18 [01:13<02:09, 11.79s/it]

[59, 51, 0.6109238031018206, 0.5978817902289034]


 44%|████▍     | 8/18 [01:28<02:07, 12.77s/it]

[61, 41, 0.6109238031018206, 0.5941011235955055]


 50%|█████     | 9/18 [01:44<02:02, 13.63s/it]

[61, 41, 0.6079836233367452, 0.5953716690042076]


 56%|█████▌    | 10/18 [02:00<01:54, 14.33s/it]

[61, 39, 0.6079836233367452, 0.5945385413066021]


 61%|██████    | 11/18 [02:16<01:43, 14.82s/it]

[61, 39, 0.6079836233367452, 0.5945385413066021]


 67%|██████▋   | 12/18 [02:32<01:31, 15.20s/it]

[61, 39, 0.6079836233367452, 0.5945385413066021]


 72%|███████▏  | 13/18 [02:48<01:17, 15.52s/it]

[61, 39, 0.6079836233367452, 0.5945385413066021]


 78%|███████▊  | 14/18 [03:05<01:03, 15.79s/it]

[61, 39, 0.6079836233367452, 0.5945385413066021]


 83%|████████▎ | 15/18 [03:21<00:47, 15.94s/it]

[61, 39, 0.6079836233367452, 0.5945385413066021]


 89%|████████▉ | 16/18 [03:37<00:32, 16.03s/it]

[61, 39, 0.6079836233367452, 0.5945385413066021]


 94%|█████████▍| 17/18 [03:53<00:16, 16.09s/it]

[61, 39, 0.6079836233367452, 0.5945385413066021]


100%|██████████| 18/18 [04:10<00:00, 16.11s/it]

[61, 39, 0.6079836233367452, 0.5945385413066021]



