Customized confusion matrix plot

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    The matrix is drawn on the current pyplot figure with one text
    annotation per cell. Rows are labelled as true labels and columns as
    predicted labels.

    Parameters
    ----------
    cm : numpy.ndarray
        2-D confusion matrix. When ``normalize`` is False the cells are
        formatted with ``'d'``, so integer counts are expected.
    classes : sequence
        Tick labels for both axes, one per row/column of ``cm``.
    normalize : bool, default False
        If True, every row is divided by its sum before printing/plotting.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.

    Returns
    -------
    None
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class with no samples has a row total of 0; dividing by 1
        # instead keeps that row at 0.0 rather than filling it with NaN
        # (which would also turn the colour threshold below into NaN).
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

----

Create a basic logistic regression model and plot some metrics

In [1]:
def create_basic_lr(X_train, y_train, X_test, y_test, model_name):
    """
    Fit a default ``LogisticRegression`` and report basic test-set metrics.

    Prints accuracy, F1-score and ROC AUC on the test set, then plots the
    (non-normalized) confusion matrix with ``plot_confusion_matrix``.

    Parameters
    ----------
    X_train, y_train
        Training features and binary target.
    X_test, y_test
        Held-out features and binary target used for the metrics.
    model_name
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    LogisticRegression
        The fitted model.
    """
    basic_lr = LogisticRegression()
    basic_lr.fit(X_train, y_train)

    # Predict once and reuse the result for every label-based metric.
    y_pred = basic_lr.predict(X_test)
    # ROC AUC ranks samples by score, so it needs the positive-class
    # probability; hard 0/1 predictions reduce the curve to a single point.
    y_score = basic_lr.predict_proba(X_test)[:, 1]

    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('F1-Score: ', f1_score(y_test, y_pred))
    print('ROC AUC: ', roc_auc_score(y_test, y_score))

    cm = confusion_matrix(y_test, y_pred)
    plot_confusion_matrix(cm, ['Not Survived', 'Survived'],
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Oranges)
    return basic_lr

------

Plot proba dist (on test set) from binary clf

In [2]:
def plot_proba_dist_binary_clf(clf_bin, X_test, y_test):
    """
    Plot the predicted positive-class probabilities of a binary classifier.

    Two histograms are overlaid on a new figure: samples whose true target
    is 1 (green) and samples whose true target is 0 (blue).

    Parameters
    ----------
    clf_bin
        Fitted classifier exposing ``predict_proba``; column 1 is used as
        the positive-class probability.
    X_test
        Test features passed to ``predict_proba``.
    y_test
        True test labels, expected to take the values 0 and 1.
    """
    p_test = clf_bin.predict_proba(X_test)[:, 1]
    # The frame must be read under the same name it is bound to; the
    # original bound `df_a` but read `df_p`, raising NameError.
    df_p = pd.DataFrame({"target": y_test, "pred_probas": p_test})

    plt.figure(figsize=(12, 6))
    _ = sns.distplot(df_p.loc[df_p["target"] == 1, "pred_probas"], bins=20, kde=False, color="g")
    _ = sns.distplot(df_p.loc[df_p["target"] == 0, "pred_probas"], bins=20, kde=False, color="b")

-----

Same for multiclass classifier (OVR: One VS Rest)

In [None]:
def plot_proba_dist_multiclass(clf_multi, X_test, y_test):
    """
    Plot, for every class, the distribution of its predicted probability.

    Takes a previously trained multiclass classifier that outputs
    probabilities and draws one subplot per class. In subplot ``i`` the
    probability assigned to class ``i`` is shown as a normalized histogram,
    split by the *predicted* label: samples predicted as class ``i`` in
    green, samples predicted as any other class in blue (one histogram per
    other class). This is most meaningful when the classifier was trained
    with One-vs-Rest logic.

    Parameters
    ----------
    clf_multi
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    X_test
        Test features.
    y_test
        Currently unused (the split is by predicted label); kept for
        interface compatibility.
    """
    pred = clf_multi.predict(X_test)
    pred_proba = clf_multi.predict_proba(X_test)
    n_classes = pred_proba.shape[1]

    # Column i of predict_proba belongs to classes_[i] (scikit-learn
    # convention), which is not necessarily the integer i. Fall back to
    # 0..n-1 for estimators that do not expose classes_.
    class_labels = getattr(clf_multi, "classes_", range(n_classes))

    dict_proba = {"Proba_" + str(i): pred_proba[:, i] for i in range(n_classes)}
    dict_proba["Label"] = pred
    df_a = pd.DataFrame(data=dict_proba)

    # squeeze=False always returns a 2-D array of axes, so indexing also
    # works when there is a single subplot.
    fig, axs = plt.subplots(n_classes, 1, figsize=(12, 6 * n_classes), squeeze=False)
    for i in range(n_classes):
        proba_col = "Proba_" + str(i)
        for j, label in enumerate(class_labels):
            subset = df_a.loc[df_a["Label"] == label, proba_col]
            if subset.empty:
                # No sample was predicted as this class: nothing to draw.
                continue
            _ = sns.distplot(subset, bins=20, kde=False,
                             color="g" if i == j else "b",
                             ax=axs[i][0], norm_hist=True)

----

Train a series of classifiers and compute their F1-Scores (or any other classification metric from those available in sklearn)
Then plot these metrics on a horizontal barplot

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve

# Stratified 10-fold splitter shared by every model so scores are comparable.
kfold = StratifiedKFold(n_splits=10)

random_state = 0

# Each display name is stored next to its estimator so the labels on the
# plot cannot drift out of sync with the order of the models.
named_classifiers = [
    ("SVC", SVC(random_state=random_state)),
    ("DecisionTree", DecisionTreeClassifier(random_state=random_state)),
    ("AdaBoost", AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),
                                    random_state=random_state, learning_rate=0.1)),
    ("RandomForest", RandomForestClassifier(random_state=random_state)),
    ("ExtraTrees", ExtraTreesClassifier(random_state=random_state)),
    ("GradientBoosting", GradientBoostingClassifier(random_state=random_state)),
    ("MultipleLayerPerceptron", MLPClassifier(random_state=random_state)),
    ("KNeighboors", KNeighborsClassifier()),
    ("LogisticRegression", LogisticRegression(random_state=random_state)),
    ("LinearDiscriminantAnalysis", LinearDiscriminantAnalysis()),
]
algorithm_names = [name for name, clf in named_classifiers]
classifiers = [clf for name, clf in named_classifiers]

# One array of per-fold F1 scores per classifier.
cv_results = [
    cross_val_score(classifier, X_train, y=y_train, scoring="f1", cv=kfold, n_jobs=-1)
    for classifier in classifiers
]

cv_means = [cv_result.mean() for cv_result in cv_results]
cv_std = [cv_result.std() for cv_result in cv_results]

cv_res = pd.DataFrame({"CrossValMeans": cv_means,
                       "CrossValerrors": cv_std,
                       "Algorithm": algorithm_names})

# x and y are passed by keyword: newer seaborn releases no longer accept
# them positionally. xerr draws the fold standard deviation as error bars.
g = sns.barplot(x="CrossValMeans", y="Algorithm", data=cv_res,
                palette="Set3", orient="h", xerr=cv_std)
g.set_xlabel("Mean F1-Score")
# Assigning the result keeps the notebook from echoing the Text object.
g = g.set_title("Cross validation scores")

----

Hyper-parameter tuning using Grid-Search CV for a Random Forest

In [None]:
# Stratified 10-fold splitter used to score every parameter combination.
kfold = StratifiedKFold(n_splits=10)

# Random forest hyper-parameter tuning
RFC = RandomForestClassifier()

## Candidate values explored by the exhaustive grid search
rf_param_grid = dict(
    max_depth=[None],
    max_features=[1, 3, 10, 20],
    min_samples_split=[2, 3, 10, 15],
    min_samples_leaf=[1, 3, 10],
    bootstrap=[False],
    n_estimators=[100, 200, 300, 400],
    criterion=["gini", "entropy"],
)

# Score each combination by F1 over the folds, using all available cores.
gsRFC = GridSearchCV(
    RFC,
    param_grid=rf_param_grid,
    cv=kfold,
    scoring="f1",
    n_jobs=-1,
    verbose=1,
)
gsRFC.fit(X_train_imp_eng, y_train)

# Estimator refit with the best parameter combination found.
RFC_best = gsRFC.best_estimator_

# Best score (left as the last expression so the notebook displays it)
gsRFC.best_score_

-----

Plot feature-importance rankings for different (pre-trained) tree-based algorithms

In [None]:
# 2x2 grid of bar plots, one per fitted tree-based model, sharing the x axis.
nrows = ncols = 2
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, sharex="all", figsize=(15, 15))

names_classifiers = [
    ("AdaBoosting", ada_best),
    ("ExtraTrees", ExtC_best),
    ("RandomForest", RFC_best),
    ("GradientBoosting", GBC_best),
]

for nclassifier, (name, classifier) in enumerate(names_classifiers):
    # Fill the grid row by row.
    row, col = divmod(nclassifier, ncols)
    importances = classifier.feature_importances_
    # Positions of the (at most) 20 most important features, largest first.
    indices = np.argsort(importances)[::-1][:20]
    g = sns.barplot(y=X_train_imp_eng.columns[indices], x=importances[indices],
                    orient='h', ax=axes[row][col])
    g.set_xlabel("Relative importance", fontsize=12)
    g.set_ylabel("Features", fontsize=12)
    g.tick_params(labelsize=9)
    g.set_title(name + " feature importance")