# ERROR ANALYSIS

Understanding where our model fails to predict correctly is often the key to building more accurate models.

## IMPORTS

In [None]:
import os, joblib
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## UTILS

In [None]:
def feature_engineering(dataframe):
    """Select the model's feature columns by position.

    Returns a ``(dataframe, features)`` pair: the input frame, untouched, and
    the labels of its columns at positions 1 through 100 (slice ``[1:101]``).
    """
    start, stop = 1, 101
    feature_columns = dataframe.columns[start:stop]
    return dataframe, feature_columns

In [None]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    Draw a confusion matrix as an annotated heat map and show it.

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix
                  (rows are plotted as true labels, columns as predictions)

    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low'];
                  pass None to keep the default numeric ticks

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues
                  (defaults to 'Blues' when None)

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions (each row divided by its sum;
                  a row that sums to zero is shown as all zeros)

    Returns
    -------
    None; the figure is displayed with plt.show().

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    # Imported locally so the helper works even where the notebook's import
    # cell does not pull in matplotlib.
    import itertools

    import matplotlib.pyplot as plt
    import numpy as np

    cm = np.asarray(cm)

    # Accuracy always comes from the raw counts, before any normalisation.
    total = cm.sum()
    accuracy = np.trace(cm) / float(total) if total else float('nan')
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    # Normalise BEFORE drawing, so the colours, the printed numbers and the
    # text-colour threshold all refer to the same matrix.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        row_totals[row_totals == 0] = 1  # empty rows become 0.0 instead of NaN
        cm = cm.astype('float') / row_totals

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells darker than the threshold get white text so it stays readable.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    # Lay out after the labels exist so they are included in the fitted area.
    plt.tight_layout()
    plt.show()

## CONFIG

In [None]:
# Directory holding the competition CSVs. Set the TPS_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset (or empty)
# the original local directory is used, so existing runs are unaffected.
_DEFAULT_DATA_DIR = "D:/Documents/GitHub/ml-pipeline/data/TPS-AUG2021"
DATA_DIR = (os.environ.get("TPS_DATA_DIR") or _DEFAULT_DATA_DIR).rstrip("/\\")

config = {
    "TRAIN_PATH" : DATA_DIR + "/train.csv",
    "TEST_PATH" : DATA_DIR + "/test.csv",
    "TARGET_VAR" : "loss"
}

In [None]:
# Hyper-parameters forwarded to xgb.XGBClassifier(**params).
params = {
    # The model is an XGBClassifier whose scores are thresholded at 0.5 and
    # evaluated with ROC AUC, so it needs a probabilistic binary objective
    # rather than the squared-error regression objective.
    # NOTE(review): assumes the target column is binary (0/1) - confirm.
    "objective": "binary:logistic",
    "n_estimators" : 10000,
    "max_depth": 10,
    "learning_rate": 0.006,
    "colsample_bytree": 0.5,
    "subsample": 0.6,
    "reg_alpha" : 0.006221417528979453,
    "reg_lambda": 3.178956727410822e-07,
    "min_child_weight": 123,
    "n_jobs": 2,
    "seed": 95,
    # GPU training/prediction settings; these require a CUDA-enabled build.
    'tree_method': "gpu_hist",
    "gpu_id": 0,
    'predictor': 'gpu_predictor'
}

## LOADING DATA

In [None]:
# Read the train and test CSVs from the locations given in `config`.
df_train, df_test = (
    pd.read_csv(config[path_key]) for path_key in ("TRAIN_PATH", "TEST_PATH")
)

In [None]:
# Run the same feature selection on both frames; each call returns the frame
# together with its feature column labels.
(df_train, features_train), (df_test, features_test) = (
    feature_engineering(df_train),
    feature_engineering(df_test),
)

In [None]:
# Hold out 20% of the rows for validation (fixed seed for reproducibility).
# df_train is deliberately NOT overwritten with its feature-only view:
# reassigning it would drop the target column whenever the target is not among
# the features, and re-running this cell would then fail.
target_train = df_train[config["TARGET_VAR"]].values
train_x, valid_x, train_y, valid_y = train_test_split(
    df_train[features_train], target_train, test_size=0.2, random_state=95
)

## TRAINING A MODEL

### OR LOAD A TRAINED MODEL..

In [None]:
model = xgb.XGBClassifier(**params)
model.fit(
    train_x,
    train_y,
    eval_set=[(valid_x, valid_y)],
    early_stopping_rounds=200,
    verbose=1000
)
# predict() returns a single label per row, which cannot be indexed with
# [:, 1]; predict_proba() returns one column per class, and column 1 is the
# positive-class score used for the metric and the error analysis below.
valid_oof = model.predict_proba(valid_x)
temp_test = model.predict_proba(df_test[features_test])
valid_auc = roc_auc_score(valid_y, valid_oof[:, 1])

# The metric computed above is ROC AUC, so label it as such.
print('ROC AUC score %.6f' % valid_auc)

## ERROR ANALYSIS

In [None]:
detection_threshold = 0.5
# Positive-class scores at or above the threshold become 1, the rest 0.
# astype(int) does the bool -> int conversion directly, with no helper lambda.
valid_pred = (valid_oof[:, 1] >= detection_threshold).astype(int)

### CONFUSION MATRIX

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support

# Confusion matrix of true validation labels against thresholded predictions,
# plotted as raw counts.
cm = confusion_matrix(y_true=valid_y, y_pred=valid_pred)
plot_confusion_matrix(cm, ['0', '1'], title="Confusion Matrix", normalize=False)

In [None]:
# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.options.display.max_columns = None

In [None]:
# Work on an explicit copy: pd.DataFrame(valid_x) does not copy a DataFrame
# input by default, so adding columns to it could leak into valid_x (the frame
# the model was validated on) or trigger chained-assignment warnings.
valid = valid_x.copy()
valid[config["TARGET_VAR"]] = valid_y
valid["preds"] = valid_oof[:, 1]      # positive-class score
valid["preds_int"] = valid_pred       # thresholded 0/1 prediction
valid.head()

In [None]:
# Per-row absolute gap between the true target and the predicted score.
valid["error"] = (valid[config["TARGET_VAR"]] - valid["preds"]).abs()
valid.head()

Let's take a look at some of the biggest errors the model made

In [None]:
# Validation rows ordered from the largest error to the smallest.
sorted_desc = valid.sort_values("error", ascending=False)

LOOKING AT FALSE POSITIVES

In [None]:
# The 20 largest-error rows whose true target is 0.
sorted_desc.loc[sorted_desc[config["TARGET_VAR"]].eq(0)].head(20)

NOW FALSE NEGATIVES

In [None]:
# The 20 largest-error rows whose true target is 1.
sorted_desc.loc[sorted_desc[config["TARGET_VAR"]].eq(1)].head(20)

Conversely, let's look at cases where the model is correct and very confident about it.

In [None]:
# Validation rows ordered from the smallest error to the largest.
sorted_asc = valid.sort_values("error", ascending=True)

TRUE NEGATIVES

In [None]:
# The 20 smallest-error rows whose true target is 0.
sorted_asc.loc[sorted_asc[config["TARGET_VAR"]].eq(0)].head(20)

TRUE POSITIVES

In [None]:
# The 20 smallest-error rows whose true target is 1.
sorted_asc.loc[sorted_asc[config["TARGET_VAR"]].eq(1)].head(20)

Analyzing how the model's errors vary with the value of a single feature (here `cat16`): one confusion matrix per feature value.

In [None]:
# Confusion matrix restricted to validation rows where cat16 == 0.
# labels=[0, 1] keeps the matrix 2x2 even if this subset contains a single
# class, so it always lines up with the two tick labels; the title names the
# subset so the per-value plots can be told apart.
cat16_mask = valid["cat16"] == 0
cm = confusion_matrix(valid.loc[cat16_mask, config["TARGET_VAR"]], valid.loc[cat16_mask, "preds_int"], labels=[0, 1])
plot_confusion_matrix(cm = cm, normalize = False, target_names = ['0', '1'], title = "Confusion Matrix (cat16 == 0)")

In [None]:
# Confusion matrix restricted to validation rows where cat16 == 1.
# labels=[0, 1] keeps the matrix 2x2 even if this subset contains a single
# class, so it always lines up with the two tick labels; the title names the
# subset so the per-value plots can be told apart.
cat16_mask = valid["cat16"] == 1
cm = confusion_matrix(valid.loc[cat16_mask, config["TARGET_VAR"]], valid.loc[cat16_mask, "preds_int"], labels=[0, 1])
plot_confusion_matrix(cm = cm, normalize = False, target_names = ['0', '1'], title = "Confusion Matrix (cat16 == 1)")

In [None]:
# Confusion matrix restricted to validation rows where cat16 == 2.
# labels=[0, 1] keeps the matrix 2x2 even if this subset contains a single
# class, so it always lines up with the two tick labels; the title names the
# subset so the per-value plots can be told apart.
cat16_mask = valid["cat16"] == 2
cm = confusion_matrix(valid.loc[cat16_mask, config["TARGET_VAR"]], valid.loc[cat16_mask, "preds_int"], labels=[0, 1])
plot_confusion_matrix(cm = cm, normalize = False, target_names = ['0', '1'], title = "Confusion Matrix (cat16 == 2)")

In [None]:
# Confusion matrix restricted to validation rows where cat16 == 3.
# labels=[0, 1] keeps the matrix 2x2 even if this subset contains a single
# class, so it always lines up with the two tick labels; the title names the
# subset so the per-value plots can be told apart.
cat16_mask = valid["cat16"] == 3
cm = confusion_matrix(valid.loc[cat16_mask, config["TARGET_VAR"]], valid.loc[cat16_mask, "preds_int"], labels=[0, 1])
plot_confusion_matrix(cm = cm, normalize = False, target_names = ['0', '1'], title = "Confusion Matrix (cat16 == 3)")