In [10]:
import pandas as pd
import csv_processing as dp
import evaluation

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

In [2]:
# Load the dataset into a DataFrame; the split helpers further down take `df` as input.
df = pd.read_csv(filepath_or_buffer="../data/3year.csv")

In [3]:
# Shared decision-tree estimator: entropy criterion, best-split strategy, and a
# fixed random_state so repeated fits are reproducible.
dt = DecisionTreeClassifier(
    criterion="entropy",
    splitter="best",
    random_state=0,
)

### Without SMOTE

In [4]:
# Split `df` into train/test features (X_*) and labels (y_*) via the project helper.
# NOTE(review): the split ratio and any cleaning happen inside
# csv_processing.get_train_test, which is not visible from this notebook.
X_train, X_test, y_train, y_test = dp.get_train_test(df)

In [5]:
# Baseline: fit the shared tree on the training split and score it on the
# held-out test split (fit() returns the fitted estimator, so it can be chained).
y_pred = dt.fit(X_train, y_train).predict(X_test)
evaluation.print_res(y_test, y_pred)

Confusion Matrix:
 [[1919   84]
 [  77   21]]
Accuracy: 0.923
Precision Score: 0.2
Recall Score: 0.214
F1 Score: 0.207
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96      2003
           1       0.20      0.21      0.21        98

    accuracy                           0.92      2101
   macro avg       0.58      0.59      0.58      2101
weighted avg       0.93      0.92      0.92      2101



##### Grid Search

In [21]:
# Hyper-parameter grid for the decision tree: 4 * 3 * 3 * 5 = 180 combinations,
# i.e. 900 fits with 5-fold cross-validation.
param_grid = {
    "max_depth": [1, 10, 100, 1000],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 5, 10],
    "max_leaf_nodes": [None, 10, 100, 1000, 10000],
}

# Cross-validated search scored on F1 rather than accuracy.
# n_jobs=-1 runs the fits in parallel on all available cores; the selected
# parameters do not change because `dt` is built with a fixed random_state and
# an integer `cv` yields deterministic folds.
grid_search = GridSearchCV(
    dt,
    param_grid,
    cv=5,
    scoring=make_scorer(f1_score),
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)

# With the default refit=True, best_estimator_ has already been refit on the
# full training split using the best parameters.
best_dt_model = grid_search.best_estimator_

# Evaluate the tuned model on the held-out test split.
y_pred = best_dt_model.predict(X_test)
evaluation.print_res(y_test, y_pred)

Best parameters: {'max_depth': 100, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 10}
Confusion Matrix:
 [[1929   74]
 [  77   21]]
Accuracy: 0.928
Precision Score: 0.221
Recall Score: 0.214
F1 Score: 0.218
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96      2003
           1       0.22      0.21      0.22        98

    accuracy                           0.93      2101
   macro avg       0.59      0.59      0.59      2101
weighted avg       0.93      0.93      0.93      2101



### With SMOTE

In [25]:
# Rebuild the train/test split through the project's pre-processing helper.
# This rebinds X_train / X_test / y_train / y_test, so every cell below works on
# this split instead of the one produced by dp.get_train_test.
# NOTE(review): the section heading suggests SMOTE oversampling happens inside
# csv_processing.pre_process — confirm it is applied to the training portion only.
X_train, X_test, y_train, y_test = dp.pre_process(df)

In [21]:
# Baseline on the pre-processed split: refit the shared tree on the current
# training data and report metrics on the current test data.
y_pred = dt.fit(X_train, y_train).predict(X_test)
evaluation.print_res(y_test, y_pred)

Confusion Matrix:
 [[2613  404]
 [  89   45]]
Accuracy: 0.844
Precision Score: 0.1
Recall Score: 0.336
F1 Score: 0.154
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.87      0.91      3017
           1       0.10      0.34      0.15       134

    accuracy                           0.84      3151
   macro avg       0.53      0.60      0.53      3151
weighted avg       0.93      0.84      0.88      3151



##### Grid Search Round 1

In [26]:
# Round 1: coarse grid over tree size and leaf/split constraints
# (4 * 3 * 3 * 5 = 180 combinations, 900 fits with 5-fold cross-validation).
param_grid = {
    "max_depth": [1, 10, 100, 1000],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 5, 10],
    "max_leaf_nodes": [None, 10, 100, 1000, 10000],
}

# NOTE(review): if X_train was oversampled before this point (the section
# heading suggests SMOTE), the validation folds contain synthetic samples and
# the cross-validated F1 will be optimistic; resampling inside each CV fold
# (e.g. an imbalanced-learn Pipeline) would avoid that — confirm and revisit.
#
# n_jobs=-1 runs the fits in parallel on all available cores; the selected
# parameters do not change because `dt` is built with a fixed random_state and
# an integer `cv` yields deterministic folds.
grid_search = GridSearchCV(
    dt,
    param_grid,
    cv=5,
    scoring=make_scorer(f1_score),
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)

# With the default refit=True, best_estimator_ has already been refit on the
# full training split using the best parameters.
best_dt_model = grid_search.best_estimator_
y_pred = best_dt_model.predict(X_test)
evaluation.print_res(y_test, y_pred)

Best parameters: {'max_depth': 100, 'max_leaf_nodes': 1000, 'min_samples_leaf': 1, 'min_samples_split': 2}
Confusion Matrix:
 [[2625  392]
 [  91   43]]
Accuracy: 0.847
Precision Score: 0.099
Recall Score: 0.321
F1 Score: 0.151
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.87      0.92      3017
           1       0.10      0.32      0.15       134

    accuracy                           0.85      3151
   macro avg       0.53      0.60      0.53      3151
weighted avg       0.93      0.85      0.88      3151



##### Grid Search Round 2

In [27]:
# Round 2: hold the leaf/split settings fixed and sweep only max_depth
# from 20 to 100 in steps of 20.
param_grid = {
    "max_depth": list(range(20, 101, 20)),
    "min_samples_split": [2],
    "min_samples_leaf": [1],
    "max_leaf_nodes": [None],
}

# 5-fold search scored on F1; fit() returns the fitted search object.
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring=make_scorer(f1_score),
    cv=5,
).fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)

# Evaluate the refit best model on the held-out test split.
best_dt_model = grid_search.best_estimator_
y_pred = best_dt_model.predict(X_test)
evaluation.print_res(y_test, y_pred)

Best parameters: {'max_depth': 40, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Confusion Matrix:
 [[2613  404]
 [  89   45]]
Accuracy: 0.844
Precision Score: 0.1
Recall Score: 0.336
F1 Score: 0.154
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.87      0.91      3017
           1       0.10      0.34      0.15       134

    accuracy                           0.84      3151
   macro avg       0.53      0.60      0.53      3151
weighted avg       0.93      0.84      0.88      3151



In [24]:
# NOTE(review): this import sits mid-notebook while the other imports live in
# the first cell; consider moving it there so dependencies are visible up front.
import xgboost as xgb

# XGBoost classifier created with the library's default hyper-parameters
# (no arguments are passed).
xgb_classifier = xgb.XGBClassifier()