In [0]:
import tqdm
import pandas as pd
from collections import Counter
from itertools import cycle
import graphviz
from sklearn import datasets, metrics, tree
from sklearn.cluster import KMeans, MiniBatchKMeans, Birch, DBSCAN
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score

def draw_tree(name, data, features, class_names):
  dot_data = tree.export_graphviz(data, out_file=None) 
  graph = graphviz.Source(dot_data) 
  graph.render(name)
  dot_data = tree.export_graphviz(data, out_file=None, 
                      feature_names=features,  
                      class_names=class_names,  
                      filled=True, rounded=True,  
                      special_characters=True)  
  graph = graphviz.Source(dot_data)  
  return graph

In [105]:
bank_df = pd.read_csv('/content/drive/My Drive/bank-additional-full.csv', sep=";")
categorical = ["job", "marital", "education", "default", "housing", "loan", "contact", "month", "day_of_week",
               "poutcome", "y"]
bank_df_target = bank_df["y"]
for i in bank_df.columns:
  if i in categorical:
    bank_df[i] = pd.Categorical(bank_df[i]).codes
X_bank_train, X_bank_test = train_test_split(bank_df, test_size=0.3333)
Y_bank_train, Y_bank_test = X_bank_train['y'], X_bank_test['y']
X_bank_test = X_bank_test.drop('y', axis = 1)
X_bank_train = X_bank_train.drop('y', axis = 1)
print(bank_df_target)
print(bank_df["y"])
bank_df.head()

0         no
1         no
2         no
3         no
4         no
        ... 
41183    yes
41184     no
41185     no
41186    yes
41187     no
Name: y, Length: 41188, dtype: object
0        0
1        0
2        0
3        0
4        0
        ..
41183    1
41184    0
41185    0
41186    1
41187    0
Name: y, Length: 41188, dtype: int8


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,3,1,0,0,0,0,1,6,1,261,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
1,57,7,1,3,1,0,0,1,6,1,149,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
2,37,7,1,3,0,2,0,1,6,1,226,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
3,40,0,1,1,0,0,0,1,6,1,151,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
4,56,7,1,3,0,0,2,1,6,1,307,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0


In [0]:
def checkClassif(X_train, Y_train, X_test, Y_test, criterion, samples=None, max_depth=None):
  def printMax(f1_cv):
    v=list(f1_cv.values())
    k=list(f1_cv.keys())
    f1_cv_max = k[v.index(max(v))]
    x = f1_cv_max.split(";")
    print('\n'+'-'*8)
    print('Values for best cv score:')
    print(f'Min_samples_leaf={x[0]}')
    print(f'Max_depth={x[1]}') 
    print('F1 cv score for train:')
    print(f1_cv[f1_cv_max][0])
    print('-'*8)
    return [int(x[0]), int(x[1])]
  matrices = {}
  depth = {}
  f1_cv = {}
  if samples or max_depth:
    for i in range(50, samples+1, 2):
      for j in range(5, max_depth+1, 1):
        bank_tree = tree.DecisionTreeClassifier(criterion=criterion, min_samples_leaf=i, max_depth=j)
        f1_cv[str(i)+";"+str(j)] = [np.mean(cross_val_score(bank_tree, X_train, Y_train, cv=5, scoring="f1")), j]

    params = printMax(f1_cv)
    bank_tree = tree.DecisionTreeClassifier(criterion=criterion, min_samples_leaf=params[0], max_depth=params[1])
    bank_tree.fit(X_train, Y_train)
    predicts = bank_tree.predict(X_bank_test)
    print('Confusion matrix')
    print(metrics.confusion_matrix(predicts, Y_test))  
    print('F1 score')
    print(metrics.f1_score(predicts, Y_test))
  else:
      bank_tree = tree.DecisionTreeClassifier(criterion=criterion)
      bank_tree.fit(X_train, Y_train)
      predicts = bank_tree.predict(X_test)
      print('Confusion matrix')
      print(metrics.confusion_matrix(predicts, Y_test))  
      print('F1 score')
      print(metrics.f1_score(predicts, Y_test))
      print(f'Cross validation f1 score: {np.mean(cross_val_score(bank_tree, X_train, Y_train, cv=5, scoring="f1"))}')

In [157]:
checkClassif(X_bank_train, Y_bank_train, X_bank_test, Y_bank_test, 'gini')

Confusion matrix
[[11337   722]
 [  866   803]]
F1 score
0.5028177833437696
Cross validation f1 score: 0.5291726746454721


In [158]:
checkClassif(X_bank_train, Y_bank_train, X_bank_test, Y_bank_test, 'entropy')

Confusion matrix
[[11394   729]
 [  809   796]]
F1 score
0.5086261980830671
Cross validation f1 score: 0.5320713926638355


In [159]:
checkClassif(X_bank_train, Y_bank_train, X_bank_test, Y_bank_test, 'gini', samples=120, max_depth=15)


--------
Values for best cv score:
Min_samples_leaf=52
Max_depth=6
F1 cv score for train:
0.5921872912230148
--------
Confusion matrix
[[11674   657]
 [  529   868]]
F1 score
0.5941136208076659


In [160]:
checkClassif(X_bank_train, Y_bank_train, X_bank_test, Y_bank_test, 'entropy', samples=120, max_depth=15)


--------
Values for best cv score:
Min_samples_leaf=50
Max_depth=6
F1 cv score for train:
0.5953954041461059
--------
Confusion matrix
[[11643   641]
 [  560   884]]
F1 score
0.595486695857191
