In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, f1_score, plot_confusion_matrix, 
                             precision_score, recall_score, classification_report)
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Read the pre-split train/test features (X_*) and labels (y_*) from data/.
X_train, y_train, X_test, y_test = map(
    pd.read_csv,
    ('data/X_train.csv', 'data/y_train.csv', 'data/X_test.csv', 'data/y_test.csv'),
)

In [3]:
def score_retreiver(true_results, predictions, average='weighted'):
    """Return precision, recall, accuracy and F1 for a set of predictions, scaled by 100.

    Parameters
    ----------
    true_results : array-like
        Ground-truth labels.
    predictions : array-like
        Predicted labels, aligned with ``true_results``.
    average : str, default='weighted'
        Averaging strategy forwarded to ``precision_score``, ``recall_score``
        and ``f1_score``. The default reproduces the previously hard-coded
        behaviour, so existing two-argument calls are unaffected.

    Returns
    -------
    tuple
        ``(precision, recall, accuracy, f1)``, each multiplied by 100.
    """
    # The three averaged metrics share one call shape; accuracy takes no
    # ``average`` argument and is computed separately.
    precision, recall, f1 = (
        metric(true_results, predictions, average=average) * 100
        for metric in (precision_score, recall_score, f1_score)
    )
    accuracy = accuracy_score(true_results, predictions) * 100
    return precision, recall, accuracy, f1

### Decision Tree

In [4]:
# Baseline: default-parameter decision tree (seeded), summarised as the mean
# of its 5-fold cross-validation scores on the training split.
dt_classifier_default_params = DecisionTreeClassifier(random_state=42)
dt_classifier_baseline_score = cross_val_score(
    dt_classifier_default_params,
    X_train,
    y_train,
    cv=5,
).mean()
dt_classifier_baseline_score

0.8025289265286826

In [5]:
# Fit the baseline tree on the training split and predict the test split.
dt_classifier_default_params.fit(X_train, y_train)
dt_classifier_baseline_predictions = dt_classifier_default_params.predict(X_test)

# Score once and reuse the tuple: calling score_retreiver separately for each
# printed line would recompute all four metrics every time.
dt_baseline_test_scores = score_retreiver(y_test, dt_classifier_baseline_predictions)

print('Test Scores')
# Labels follow the tuple order returned by score_retreiver.
for metric_label, metric_value in zip(('Precision', 'Recall', 'Accuracy', 'F1'),
                                      dt_baseline_test_scores):
    print(f'Decision Tree Baseline {metric_label}: {metric_value}')

Test Scores
Decision Tree Baseline Precision: 71.4164849726491
Decision Tree Baseline Recall: 80.19397106765746
Decision Tree Baseline Accuracy: 80.19397106765746
Decision Tree Baseline F1: 73.1753385047404


In [6]:
# Per-class precision / recall / F1 for the baseline tree's test predictions.
dt_baseline_report = classification_report(y_test, dt_classifier_baseline_predictions)
print(dt_baseline_report)

              precision    recall  f1-score   support

           0       0.24      0.04      0.06      6791
           1       0.31      0.03      0.06     23863
           2       0.81      0.98      0.89    128545

    accuracy                           0.80    159199
   macro avg       0.46      0.35      0.34    159199
weighted avg       0.71      0.80      0.73    159199



In [7]:
# Cost-complexity pruning path for the default-parameter tree on the training split.
path = dt_classifier_default_params.cost_complexity_pruning_path(X_train, y_train)
# Keep every alpha but the last; per the original note, the last one would
# prune the tree down to only the root node.
ccp_alphas = path['ccp_alphas'][:-1]

In [8]:
# Hyper-parameter grid searched for the decision tree.
dt_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 2, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5, 10],
    # Every 500th pruning alpha in ascending order. The strided slice selects
    # the same elements as indexing with list(range(0, len(ccp_alphas), 500)).
    'ccp_alpha': np.sort(ccp_alphas)[::500],
}

In [10]:
# 5-fold grid search over dt_param_grid with a seeded decision tree, fitted on
# the training split; the object returned by fit() is kept as dt_grid_search.
dt_grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    dt_param_grid,
    cv=5,
).fit(X_train, y_train)

In [None]:
# Display the parameter combination reported by the fitted grid search.
dt_grid_search.best_params_

In [None]:
# Rebuild the tree from the grid search's best parameters. random_state=42
# matches the estimator handed to GridSearchCV; without it this tree is not
# the configuration that was searched and its results vary between runs.
# best_params_ holds exactly the keys of the searched grid, so unpacking it
# stays in sync if the grid changes.
dt_classifier_best_params = DecisionTreeClassifier(
    random_state=42,
    **dt_grid_search.best_params_,
)

In [None]:
# Fit the tuned tree on the training split.
dt_classifier_best_params.fit(X_train, y_train)

# Predict on X_test, the split that y_test belongs to. The earlier
# X_test_mini is never defined in this notebook and would not line up with
# the full y_test used for scoring below.
dt_best_parameters_predictions = dt_classifier_best_params.predict(X_test)

# Score once and reuse the tuple instead of recomputing every metric per line.
dt_tuned_test_scores = score_retreiver(y_test, dt_best_parameters_predictions)

print('Test Scores')
# Labels follow the tuple order returned by score_retreiver.
for metric_label, metric_value in zip(('Precision', 'Recall', 'Accuracy', 'F1'),
                                      dt_tuned_test_scores):
    print(f'Decision Tree Tuned {metric_label}: {metric_value}')

In [None]:
# Classification report for the tuned tree's test predictions.
dt_tuned_report = classification_report(y_test, dt_best_parameters_predictions)
print(dt_tuned_report)

In [None]:
# Tree plot is left disabled; uncomment the lines below to render the tuned tree.
# plt.figure(figsize=(12, 8))
# plot_tree(dt_classifier_best_params)
# plt.show()

### Random Forest 

In [None]:
# Baseline: default-parameter random forest (seeded), summarised as the mean
# of its 5-fold cross-validation scores on the training split.
rf_classifier_default_params = RandomForestClassifier(random_state=42)
rf_classifier_baseline_score = cross_val_score(
    rf_classifier_default_params,
    X_train,
    y_train,
    cv=5,
).mean()
rf_classifier_baseline_score

In [None]:
rf_classifier_default_params.fit(X_train, y_train)
rf_classifier_baseline_predictions = rf_classifier_default_params.predict(X_test)