In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import Counter

In [18]:
X, y = make_classification(
    n_features=10,
    n_samples=1000,
    n_informative=8,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    random_state=42
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [19]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(criterion='gini', max_depth=10)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.83      0.82       130
           1       0.81      0.80      0.81       120

    accuracy                           0.82       250
   macro avg       0.82      0.82      0.82       250
weighted avg       0.82      0.82      0.82       250



In [20]:
from sklearn.model_selection import cross_val_score

cross_val_score(DecisionTreeClassifier(criterion='gini', max_depth=5), X, y, cv=5)

array([0.78 , 0.785, 0.745, 0.805, 0.775])

In [21]:
from sklearn.model_selection import cross_val_score

cross_val_score(DecisionTreeClassifier(criterion='gini', max_depth=10), X, y, cv=5)

array([0.815, 0.74 , 0.795, 0.765, 0.815])

In [22]:
from sklearn.model_selection import cross_val_score

cross_val_score(DecisionTreeClassifier(criterion='entropy', max_depth=5), X, y, cv=5)

array([0.765, 0.78 , 0.755, 0.815, 0.78 ])

In [23]:
from sklearn.model_selection import cross_val_score

cross_val_score(DecisionTreeClassifier(criterion='entropy', max_depth=10), X, y, cv=5)

array([0.765, 0.79 , 0.825, 0.775, 0.795])

In [24]:
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(
    DecisionTreeClassifier(),
    {
        'criterion': ['gini', 'entropy'],
        'max_depth': [5, 10, 15]
    },
    cv=5,
    return_train_score=False
)

clf.fit(X, y)

clf.cv_results_

{'mean_fit_time': array([0.00243979, 0.00324712, 0.0034297 , 0.00288253, 0.00408311,
        0.00440989]),
 'std_fit_time': array([2.35527602e-04, 2.35921303e-04, 2.50106949e-04, 6.13712579e-05,
        1.38506659e-04, 1.61766301e-04]),
 'mean_score_time': array([0.00018954, 0.00015836, 0.00015635, 0.00015378, 0.00015249,
        0.00016212]),
 'std_score_time': array([3.38628250e-05, 5.47304376e-06, 5.52678745e-06, 4.78978119e-06,
        6.19814942e-06, 7.67989137e-06]),
 'param_criterion': masked_array(data=['gini', 'gini', 'gini', 'entropy', 'entropy',
                    'entropy'],
              mask=[False, False, False, False, False, False],
        fill_value=np.str_('?'),
             dtype=object),
 'param_max_depth': masked_array(data=[5, 10, 15, 5, 10, 15],
              mask=[False, False, False, False, False, False],
        fill_value=999999),
 'params': [{'criterion': 'gini', 'max_depth': 5},
  {'criterion': 'gini', 'max_depth': 10},
  {'criterion': 'gini', 'max_depth'

In [25]:
df = pd.DataFrame(clf.cv_results_)

df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.00244,0.000236,0.00019,3.4e-05,gini,5,"{'criterion': 'gini', 'max_depth': 5}",0.775,0.815,0.75,0.805,0.77,0.783,0.023791,5
1,0.003247,0.000236,0.000158,5e-06,gini,10,"{'criterion': 'gini', 'max_depth': 10}",0.795,0.74,0.8,0.79,0.805,0.786,0.023537,4
2,0.00343,0.00025,0.000156,6e-06,gini,15,"{'criterion': 'gini', 'max_depth': 15}",0.8,0.72,0.815,0.815,0.82,0.794,0.037603,3
3,0.002883,6.1e-05,0.000154,5e-06,entropy,5,"{'criterion': 'entropy', 'max_depth': 5}",0.765,0.78,0.755,0.815,0.775,0.778,0.020396,6
4,0.004083,0.000139,0.000152,6e-06,entropy,10,"{'criterion': 'entropy', 'max_depth': 10}",0.78,0.795,0.825,0.785,0.79,0.795,0.015811,2
5,0.00441,0.000162,0.000162,8e-06,entropy,15,"{'criterion': 'entropy', 'max_depth': 15}",0.77,0.79,0.85,0.805,0.875,0.818,0.038807,1


In [26]:
df[['param_criterion', 'param_max_depth', 'mean_test_score']]

Unnamed: 0,param_criterion,param_max_depth,mean_test_score
0,gini,5,0.783
1,gini,10,0.786
2,gini,15,0.794
3,entropy,5,0.778
4,entropy,10,0.795
5,entropy,15,0.818


In [27]:
clf.best_params_

{'criterion': 'entropy', 'max_depth': 15}

In [28]:
clf.best_estimator_

In [29]:
from sklearn import svm

model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear']
        }
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini', 'entropy'],
            'max_depth': [5, 10, 15]
        }
    }
}

scores = []

for key, val in model_params.items():
    clf = GridSearchCV(val['model'], val['params'], cv=5, return_train_score=False)
    clf.fit(X, y)
    scores.append({
        'model': key,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })


df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.926,"{'C': 1, 'kernel': 'rbf'}"
1,decision_tree,0.816,"{'criterion': 'entropy', 'max_depth': 15}"
