Decision Tree

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV


In [2]:
col_names = ['pregnants', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
dframe = pd.read_csv('../dataset/Pima.csv', header = None, names = col_names)

In [3]:
data = pd.read_csv('../dataset/Maternal Health Risk Data Set.csv')

In [4]:
le = LabelEncoder()

In [5]:
data['CodeLevel'] = le.fit_transform(data.RiskLevel)

In [6]:
X = data.drop(['CodeLevel', 'RiskLevel'], axis=1 )
y = data.CodeLevel

In [7]:
X = dframe.drop(['label'], axis = 1)
y = dframe.label

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [9]:
Model = DecisionTreeClassifier()
Model_tree = Model.fit(X_train, y_train)

In [10]:
confusion_matrix(y_test , Model.predict(X_test))

array([[81, 18],
       [26, 29]], dtype=int64)

In [11]:
params_grid = {'criterion': ['gini', 'entropy'], 'max_depth': np.arange(4, 40)}

In [12]:
Model_Grid = GridSearchCV(Model, params_grid, cv=5)

In [13]:
Model_Grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': array([ 4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
       38, 39])})

In [14]:
Model_Grid.best_params_

{'criterion': 'entropy', 'max_depth': 4}

In [15]:
Model_Grid.best_score_

0.7345061975209916

In [16]:
confusion_matrix(y_test , Model_Grid.predict(X_test))

array([[88, 11],
       [20, 35]], dtype=int64)

In [17]:
def best_param_for_Decision_Tree(number_cv=2):
    params_grid = {'criterion': ['gini', 'entropy'], 'max_depth': np.arange(4, 40)}
    # grid sarch
    list_best_param = []
    list_best_scores = []
    for i in range(2, number_cv+1):   
        df_grid = GridSearchCV(estimator=Model, param_grid= params_grid, cv= i)
        df_grid.fit(X_train, y_train)
        list_best_param.append(df_grid.best_params_)
        list_best_scores.append(df_grid.best_score_)
# crate a data frame with the cv and the bst score
    print(pd.DataFrame(list_best_scores, index = range(2, number_cv+1), 
            columns = ['Best score']).sort_index())
    best_score_index = 0
# looks for the params with the best score
    for i in list_best_scores:
        if i > i+1:
            best_score_index = i.index
# best score
    # best = list_best_scores.sort()
    # print(best)
    # print(list_best_scores)
    print('best score: ',list_best_scores[number_cv-2])
# give the bst params
    print(list_best_param[best_score_index])

In [18]:
best_param_for_Decision_Tree(number_cv=20)

KeyboardInterrupt: 