In [1]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, classification_report, f1_score, 
                             precision_score, recall_score)
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns 

In [2]:
## Load the train/test feature and target splits that the attached notebooks
## already split and preprocessed. Files are read in the order listed.
X_train, y_train, X_test, y_test = map(
    pd.read_csv,
    ('data/X_train.csv', 'data/y_train.csv', 'data/X_test.csv', 'data/y_test.csv'),
)

In [3]:
## Oversample the training split with SMOTE; the test split is left untouched.
## A fixed random_state makes the synthetic samples, and therefore every score
## computed below, reproducible under Restart & Run All.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [4]:
def score_retreiver(true_results, predictions):
    """Return weighted precision, recall, accuracy and F1 as percentages.

    Parameters
    ----------
    true_results
        Ground-truth labels, forwarded unchanged to the sklearn metric functions.
    predictions
        Predicted labels, forwarded unchanged to the sklearn metric functions.

    Returns
    -------
    tuple
        ``(precision, recall, accuracy, f1)``, each multiplied by 100.
        Precision, recall and F1 use ``average='weighted'``; accuracy is the
        plain ``accuracy_score``.
    """
    return (
        precision_score(true_results, predictions, average='weighted') * 100,
        recall_score(true_results, predictions, average='weighted') * 100,
        accuracy_score(true_results, predictions) * 100,
        f1_score(true_results, predictions, average='weighted') * 100,
    )

In [5]:
## Empty comparison table; later cells add one row of test-set metrics per model.
score_columns = ['Model', 'Precision', 'Recall', 'Accuracy', 'F1_Score']
algorithm_scores = pd.DataFrame(columns=score_columns)

### Decision Tree

In [6]:
## Baseline: decision tree with default hyperparameters, scored as the mean of a
## 5-fold cross-validation on the training data.
## NOTE(review): X_train has already been SMOTE-resampled at this point, so the
## validation folds contain synthetic samples; this score is probably not
## directly comparable with the test-set scores below — confirm.
dt_classifier_default_params = DecisionTreeClassifier(random_state=42)
dt_baseline_cv_scores = cross_val_score(dt_classifier_default_params, X_train,
                                        y_train.values.ravel(), cv=5)
dt_classifier_baseline_score = np.mean(dt_baseline_cv_scores)
dt_classifier_baseline_score

0.47212554417731323

In [7]:
## Fit the baseline tree on the full training set and predict the test set.
dt_classifier_default_params.fit(X_train, y_train.values.ravel())
dt_classifier_baseline_predictions = dt_classifier_default_params.predict(X_test)
## Score once and unpack, instead of recomputing all four metrics for each printed line.
(dt_baseline_precision, dt_baseline_recall,
 dt_baseline_accuracy, dt_baseline_f1) = score_retreiver(y_test, dt_classifier_baseline_predictions)
print('Test Scores')
print(f'Decision Tree Baseline Precision: {dt_baseline_precision}')
print(f'Decision Tree Baseline Recall: {dt_baseline_recall}')
print(f'Decision Tree Baseline Accuracy: {dt_baseline_accuracy}')
print(f'Decision Tree Baseline F1: {dt_baseline_f1}')

Test Scores
Decision Tree Baseline Precision: 71.33890215543977
Decision Tree Baseline Recall: 58.286798283908816
Decision Tree Baseline Accuracy: 58.286798283908816
Decision Tree Baseline F1: 63.56492187427085


In [8]:
## Add the baseline decision tree's test metrics to the comparison table.
## DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the row
## is added with pd.concat. The metrics are computed once and indexed.
dt_baseline_scores = score_retreiver(y_test, dt_classifier_baseline_predictions)
dt_baseline_row = pd.DataFrame([{'Model': 'Decision Tree Baseline',
                                 'Precision': dt_baseline_scores[0],
                                 'Recall': dt_baseline_scores[1],
                                 'Accuracy': dt_baseline_scores[2],
                                 'F1_Score': dt_baseline_scores[3]}])
algorithm_scores = pd.concat([algorithm_scores, dt_baseline_row], ignore_index=True)

In [9]:
## Per-class report for the baseline decision tree on the test set.
dt_baseline_report = classification_report(y_test, dt_classifier_baseline_predictions)
print(dt_baseline_report)

              precision    recall  f1-score   support

           0       0.07      0.36      0.12      6791
           1       0.21      0.21      0.21     23863
           2       0.84      0.66      0.74    128545

    accuracy                           0.58    159199
   macro avg       0.37      0.41      0.36    159199
weighted avg       0.71      0.58      0.64    159199



In [10]:
## Compute the cost-complexity pruning path of the baseline tree on the training data.
## The target is flattened with .values.ravel(), matching the other fits in this notebook.
path = dt_classifier_default_params.cost_complexity_pruning_path(X_train, y_train.values.ravel())
## Keep all alphas except the first and the last; the last would return only the root node.
## NOTE(review): the first entry is presumably alpha = 0 (no pruning) — confirm that
## excluding it from the search is intended.
ccp_alphas = path.ccp_alphas[1:-1]

In [11]:
## Search space for the decision tree: 2 * 6 * 3 * 4 = 144 combinations for each
## candidate alpha. The alphas are sorted and then every 500th value is kept.
dt_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 2, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5, 10],
    'ccp_alpha': np.sort(ccp_alphas)[::500],
}

In [None]:
## Exhaustive 5-fold grid search over dt_param_grid on the training data.
## NOTE(review): no n_jobs is passed, and this cell has no recorded execution
## count — the search may be expensive at this grid size; confirm runtime.
dt_grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=dt_param_grid,
    cv=5,
).fit(X_train, y_train.values.ravel())

In [None]:
## Display the hyperparameter combination selected by the decision-tree grid search.
dt_grid_search.best_params_

In [None]:
## Build a fresh, unfitted tree configured with the grid search's selected values.
dt_best = dt_grid_search.best_params_
dt_classifier_best_params = DecisionTreeClassifier(
    random_state=42,
    criterion=dt_best['criterion'],
    max_depth=dt_best['max_depth'],
    min_samples_split=dt_best['min_samples_split'],
    min_samples_leaf=dt_best['min_samples_leaf'],
    ccp_alpha=dt_best['ccp_alpha'],
)

In [None]:
## Fit the tuned tree on the full training set and predict the test set.
## The target is flattened with .values.ravel(), matching the other fits in this notebook.
dt_classifier_best_params.fit(X_train, y_train.values.ravel())
dt_best_parameters_predictions = dt_classifier_best_params.predict(X_test)
## Score once and unpack, instead of recomputing all four metrics for each printed line.
(dt_tuned_precision, dt_tuned_recall,
 dt_tuned_accuracy, dt_tuned_f1) = score_retreiver(y_test, dt_best_parameters_predictions)
print('Test Scores')
print(f'Decision Tree Tuned Precision: {dt_tuned_precision}')
print(f'Decision Tree Tuned Recall: {dt_tuned_recall}')
print(f'Decision Tree Tuned Accuracy: {dt_tuned_accuracy}')
print(f'Decision Tree Tuned F1: {dt_tuned_f1}')

In [None]:
## Add the tuned decision tree's test metrics to the comparison table.
## DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the row
## is added with pd.concat. The metrics are computed once and indexed.
dt_tuned_scores = score_retreiver(y_test, dt_best_parameters_predictions)
dt_tuned_row = pd.DataFrame([{'Model': 'Decision Tree Tuned',
                              'Precision': dt_tuned_scores[0],
                              'Recall': dt_tuned_scores[1],
                              'Accuracy': dt_tuned_scores[2],
                              'F1_Score': dt_tuned_scores[3]}])
algorithm_scores = pd.concat([algorithm_scores, dt_tuned_row], ignore_index=True)

In [None]:
## Per-class report for the tuned decision tree on the test set.
dt_tuned_report = classification_report(y_test, dt_best_parameters_predictions)
print(dt_tuned_report)

### Random Forest 

In [None]:
## Baseline: random forest with default hyperparameters, scored as the mean of a
## 5-fold cross-validation on the training data.
rf_classifier_default_params = RandomForestClassifier(random_state=42)
rf_baseline_cv_scores = cross_val_score(rf_classifier_default_params, X_train,
                                        y_train.values.ravel(), cv=5)
rf_classifier_baseline_score = np.mean(rf_baseline_cv_scores)
rf_classifier_baseline_score

In [None]:
## Fit the baseline forest on the full training set and predict the test set.
rf_classifier_default_params.fit(X_train, y_train.values.ravel())
rf_classifier_baseline_predictions = rf_classifier_default_params.predict(X_test)
## Score once and unpack, instead of recomputing all four metrics for each printed line.
(rf_baseline_precision, rf_baseline_recall,
 rf_baseline_accuracy, rf_baseline_f1) = score_retreiver(y_test, rf_classifier_baseline_predictions)
print('Test Scores')
print(f'Random Forest Baseline Precision: {rf_baseline_precision}')
print(f'Random Forest Baseline Recall: {rf_baseline_recall}')
print(f'Random Forest Baseline Accuracy: {rf_baseline_accuracy}')
print(f'Random Forest Baseline F1: {rf_baseline_f1}')

In [None]:
## Add the baseline random forest's test metrics to the comparison table.
## DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the row
## is added with pd.concat. The metrics are computed once and indexed.
rf_baseline_scores = score_retreiver(y_test, rf_classifier_baseline_predictions)
rf_baseline_row = pd.DataFrame([{'Model': 'Random Forest Baseline',
                                 'Precision': rf_baseline_scores[0],
                                 'Recall': rf_baseline_scores[1],
                                 'Accuracy': rf_baseline_scores[2],
                                 'F1_Score': rf_baseline_scores[3]}])
algorithm_scores = pd.concat([algorithm_scores, rf_baseline_row], ignore_index=True)

In [None]:
## Per-class report for the baseline random forest on the test set.
rf_baseline_report = classification_report(y_test, rf_classifier_baseline_predictions)
print(rf_baseline_report)

In [None]:
## Search space for the random forest: 2 * 6 * 3 * 4 * 4 = 576 combinations.
rf_param_grid = {
    'criterion': ['entropy', 'gini'],
    'max_depth': [None, 2, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5, 10],
    'n_estimators': [10, 20, 50, 100]
}

In [None]:
## Exhaustive 5-fold grid search over rf_param_grid on the training data.
## NOTE(review): no n_jobs is passed, and this cell has no recorded execution
## count — the search may be expensive at this grid size; confirm runtime.
rf_grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_param_grid,
    cv=5,
).fit(X_train, y_train.values.ravel())

In [None]:
## Display the hyperparameter combination selected by the random-forest grid search.
rf_grid_search.best_params_

In [None]:
## Build a fresh, unfitted forest configured with the grid search's selected values.
rf_best = rf_grid_search.best_params_
rf_classifier_best_params = RandomForestClassifier(
    random_state=42,
    n_estimators=rf_best['n_estimators'],
    criterion=rf_best['criterion'],
    max_depth=rf_best['max_depth'],
    min_samples_split=rf_best['min_samples_split'],
    min_samples_leaf=rf_best['min_samples_leaf'],
)

In [None]:
## Fit the tuned forest on the full training set and predict the test set.
## y_train is a single-column DataFrame; flattening it with .values.ravel()
## matches the other fits and avoids sklearn's column-vector DataConversionWarning.
rf_classifier_best_params.fit(X_train, y_train.values.ravel())
rf_best_parameters_predictions = rf_classifier_best_params.predict(X_test)
## Score once and unpack, instead of recomputing all four metrics for each printed line.
(rf_tuned_precision, rf_tuned_recall,
 rf_tuned_accuracy, rf_tuned_f1) = score_retreiver(y_test, rf_best_parameters_predictions)
print('Test Scores')
print(f'Random Forest Tuned Precision: {rf_tuned_precision}')
print(f'Random Forest Tuned Recall: {rf_tuned_recall}')
print(f'Random Forest Tuned Accuracy: {rf_tuned_accuracy}')
print(f'Random Forest Tuned F1: {rf_tuned_f1}')

In [None]:
## Add the tuned random forest's test metrics to the comparison table.
## DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the row
## is added with pd.concat. The metrics are computed once and indexed.
rf_tuned_scores = score_retreiver(y_test, rf_best_parameters_predictions)
rf_tuned_row = pd.DataFrame([{'Model': 'Random Forest Tuned',
                              'Precision': rf_tuned_scores[0],
                              'Recall': rf_tuned_scores[1],
                              'Accuracy': rf_tuned_scores[2],
                              'F1_Score': rf_tuned_scores[3]}])
algorithm_scores = pd.concat([algorithm_scores, rf_tuned_row], ignore_index=True)

In [None]:
## Per-class report for the tuned random forest on the test set.
rf_tuned_report = classification_report(y_test, rf_best_parameters_predictions)
print(rf_tuned_report)

In [None]:
## Display the comparison table of test-set metrics accumulated so far.
algorithm_scores

In [None]:
## Pair each training column with its importance in the tuned random forest.
rf_feature_importance_dict = {
    column: importance
    for column, importance in zip(X_train.columns,
                                  rf_classifier_best_params.feature_importances_)
}

In [None]:
## Collapse every column whose name contains one of these substrings into a
## single entry keyed by that substring, holding the summed importance.
## NOTE(review): presumably these are one-hot encoded columns from preprocessing;
## matching is by substring, so verify no unrelated column contains these names.
dummied_categories = ['steward', 'guards']
for name in dummied_categories:
    matching_keys = [key for key in rf_feature_importance_dict if name in key]
    # The matching entries are popped (and summed) before the combined key is written.
    rf_feature_importance_dict[name] = sum(
        [rf_feature_importance_dict.pop(key) for key in matching_keys]
    )

In [None]:
## Sort the collapsed importances from largest to smallest and draw them as a
## bar plot with one bar per feature name.
importances = pd.Series(rf_feature_importance_dict).sort_values(ascending=False)
sns.barplot(x=importances, y=importances.index);

### eXtreme Gradient Boosting

In [None]:
## Baseline: XGBoost classifier with default hyperparameters, scored as the mean
## of a 5-fold cross-validation on the training data.
xgb_classifier_default_params = XGBClassifier(random_state=42, use_label_encoder=False)
xgb_baseline_cv_scores = cross_val_score(xgb_classifier_default_params, X_train,
                                         y_train.values.ravel(), cv=5)
xgb_classifier_baseline_score = np.mean(xgb_baseline_cv_scores)
xgb_classifier_baseline_score

In [None]:
## Fit the baseline XGBoost model on the full training set and predict the test set.
xgb_classifier_default_params.fit(X_train, y_train.values.ravel())
xgb_classifier_baseline_predictions = xgb_classifier_default_params.predict(X_test)
## Score once and unpack, instead of recomputing all four metrics for each printed line.
(xgb_baseline_precision, xgb_baseline_recall,
 xgb_baseline_accuracy, xgb_baseline_f1) = score_retreiver(y_test, xgb_classifier_baseline_predictions)
print('Test Scores')
print(f'eXtreme Gradient Boost Baseline Precision: {xgb_baseline_precision}')
print(f'eXtreme Gradient Boost Baseline Recall: {xgb_baseline_recall}')
print(f'eXtreme Gradient Boost Baseline Accuracy: {xgb_baseline_accuracy}')
print(f'eXtreme Gradient Boost Baseline F1: {xgb_baseline_f1}')

In [None]:
## Add the baseline XGBoost model's test metrics to the comparison table.
## DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the row
## is added with pd.concat. The metrics are computed once and indexed.
xgb_baseline_scores = score_retreiver(y_test, xgb_classifier_baseline_predictions)
xgb_baseline_row = pd.DataFrame([{'Model': 'XGBoost Baseline',
                                  'Precision': xgb_baseline_scores[0],
                                  'Recall': xgb_baseline_scores[1],
                                  'Accuracy': xgb_baseline_scores[2],
                                  'F1_Score': xgb_baseline_scores[3]}])
algorithm_scores = pd.concat([algorithm_scores, xgb_baseline_row], ignore_index=True)

In [None]:
## Per-class report for the baseline XGBoost model on the test set.
xgb_baseline_report = classification_report(y_test, xgb_classifier_baseline_predictions)
print(xgb_baseline_report)

In [None]:
## Search space for XGBoost: 2 * 1 * 2 * 2 * 1 = 8 combinations.
## max_depth and n_estimators are each held at a single value.
xgb_param_grid = {
    'learning_rate': [0.1, 0.2],
    'max_depth': [6],
    'min_child_weight': [1, 2],
    'subsample': [0.5, 0.7],
    'n_estimators': [100],
}

In [None]:
## Exhaustive 5-fold grid search over xgb_param_grid on the training data,
## ranking candidates by accuracy.
xgb_grid_search = GridSearchCV(
    estimator=XGBClassifier(random_state=42, use_label_encoder=False),
    param_grid=xgb_param_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train.values.ravel())