In [2]:
import pandas as pd
from python_scripts import csv_processing as dp
from python_scripts import evaluation

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

In [3]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(
    "../data/3year.csv",
)

In [4]:
# Shared decision-tree estimator reused by every experiment below.
# random_state is fixed so repeated fits on the same data give the same tree.
dt = DecisionTreeClassifier(
    criterion="entropy",
    splitter="best",
    random_state=0,
)

### Without SMOTE

In [5]:
# Train/test split for the "Without SMOTE" experiments.
# NOTE(review): dp.get_train_test is a project helper not visible here —
# presumably a plain split with no resampling; confirm.
(X_train, X_test, y_train, y_test) = dp.get_train_test(df)

In [6]:
# Baseline: fit the default tree on the training split, then report metrics
# on the held-out test split. fit() returns the estimator, so the calls chain.
y_pred = dt.fit(X_train, y_train).predict(X_test)
evaluation.print_res(y_test, y_pred)

Confusion Matrix:
 [[2867  132]
 [ 122   30]]
Accuracy: 0.919
Precision Score: 0.185
Recall Score: 0.197
F1 Score: 0.191
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96      2999
           1       0.19      0.20      0.19       152

    accuracy                           0.92      3151
   macro avg       0.57      0.58      0.57      3151
weighted avg       0.92      0.92      0.92      3151



##### Grid Search

In [7]:
# Hyperparameter search for the tree on the current training split.
# 4 * 3 * 3 * 5 = 180 candidates, each scored with 5-fold cross-validation
# (900 fits, plus one final refit of the winner).
param_grid = {
    "max_depth": [1, 10, 100, 1000],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 5, 10],
    "max_leaf_nodes": [None, 10, 100, 1000, 10000],
}

# scoring="f1" is scikit-learn's built-in binary F1 scorer — the same metric
# as make_scorer(f1_score) without the wrapper. n_jobs=-1 runs the independent
# fits on all available cores; the selected model is unaffected because `dt`
# was created with a fixed random_state.
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring="f1", n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)

# best_estimator_ is the winning configuration refit on the full training split.
best_dt_model = grid_search.best_estimator_

# Evaluate the tuned model on the held-out test split.
y_pred = best_dt_model.predict(X_test)
evaluation.print_res(y_test, y_pred)

Best parameters: {'max_depth': 100, 'max_leaf_nodes': None, 'min_samples_leaf': 5, 'min_samples_split': 2}
Confusion Matrix:
 [[2912   87]
 [ 125   27]]
Accuracy: 0.933
Precision Score: 0.237
Recall Score: 0.178
F1 Score: 0.203
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.97      0.96      2999
           1       0.24      0.18      0.20       152

    accuracy                           0.93      3151
   macro avg       0.60      0.57      0.58      3151
weighted avg       0.92      0.93      0.93      3151



### With SMOTE

In [8]:
# Split used for the SMOTE experiments. This rebinds X_train / X_test /
# y_train / y_test, so any cell run after this point sees this split rather
# than the earlier one.
# NOTE(review): dp.pre_process is a project helper not visible here —
# presumably it oversamples the training portion with SMOTE; confirm.
(X_train, X_test, y_train, y_test) = dp.pre_process(df)

In [9]:
# Refit the same default tree on the pre-processed training split and report
# metrics on its test split. fit() returns the estimator, so the calls chain.
y_pred = dt.fit(X_train, y_train).predict(X_test)
evaluation.print_res(y_test, y_pred)

Confusion Matrix:
 [[2675  324]
 [ 100   52]]
Accuracy: 0.865
Precision Score: 0.138
Recall Score: 0.342
F1 Score: 0.197
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.89      0.93      2999
           1       0.14      0.34      0.20       152

    accuracy                           0.87      3151
   macro avg       0.55      0.62      0.56      3151
weighted avg       0.92      0.87      0.89      3151



##### Grid Search Round 1

In [10]:
# Round 1: coarse search over tree-size parameters on the pre-processed
# training split. 4 * 3 * 3 * 5 = 180 candidates, 5-fold CV each.
# NOTE(review): if dp.pre_process has already oversampled X_train, the CV
# folds below share synthetic points derived from one another's rows, which
# can inflate the CV F1 and favour deep, unpruned trees. Consider resampling
# inside each fold instead — TODO confirm what pre_process does.
param_grid = {
    "max_depth": [1, 10, 100, 1000],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 5, 10],
    "max_leaf_nodes": [None, 10, 100, 1000, 10000],
}

# scoring="f1" is the built-in binary F1 scorer (same metric as
# make_scorer(f1_score)); n_jobs=-1 runs the independent fits on all cores.
# The selected model is unaffected because `dt` has a fixed random_state.
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring="f1", n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)

# best_estimator_ is the winning configuration refit on the full training split.
best_dt_model = grid_search.best_estimator_
y_pred = best_dt_model.predict(X_test)
evaluation.print_res(y_test, y_pred)

Best parameters: {'max_depth': 100, 'max_leaf_nodes': 1000, 'min_samples_leaf': 1, 'min_samples_split': 2}
Confusion Matrix:
 [[2659  340]
 [  98   54]]
Accuracy: 0.861
Precision Score: 0.137
Recall Score: 0.355
F1 Score: 0.198
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.89      0.92      2999
           1       0.14      0.36      0.20       152

    accuracy                           0.86      3151
   macro avg       0.55      0.62      0.56      3151
weighted avg       0.92      0.86      0.89      3151



##### Grid Search Round 2

In [27]:
# Round 2: refine max_depth between 20 and 100 while holding the other three
# parameters at a single value each (5 candidates, 5-fold CV each).
# NOTE(review): Round 1 reported max_leaf_nodes=1000 as best, but this grid
# pins max_leaf_nodes to None — confirm that is intentional.
# NOTE(review): the saved output of this cell shows a test support of
# 3017/134 while the other cells show 2999/152, and its execution count is
# out of sequence, so it appears to have been run against a different split.
# Restart the kernel and run all cells before comparing with Round 1.
param_grid = {
    "max_depth": [20, 40, 60, 80, 100],
    "min_samples_split": [2],
    "min_samples_leaf": [1],
    "max_leaf_nodes": [None],
}

# scoring="f1" is the built-in binary F1 scorer (same metric as
# make_scorer(f1_score)); n_jobs=-1 runs the independent fits on all cores.
# The selected model is unaffected because `dt` has a fixed random_state.
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring="f1", n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)

# best_estimator_ is the winning configuration refit on the full training split.
best_dt_model = grid_search.best_estimator_
y_pred = best_dt_model.predict(X_test)
evaluation.print_res(y_test, y_pred)

Best parameters: {'max_depth': 40, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Confusion Matrix:
 [[2613  404]
 [  89   45]]
Accuracy: 0.844
Precision Score: 0.1
Recall Score: 0.336
F1 Score: 0.154
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.87      0.91      3017
           1       0.10      0.34      0.15       134

    accuracy                           0.84      3151
   macro avg       0.53      0.60      0.53      3151
weighted avg       0.93      0.84      0.88      3151

