In [16]:
import warnings
from sklearn.utils import compute_sample_weight


warnings.simplefilter('ignore')

In [17]:
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix of `y_pred` against `y_true`.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels; both are forwarded to
        `sklearn.metrics.confusion_matrix`.
    classes : sequence of str
        Display names, indexed by label value. Any sequence is accepted
        (list, tuple, ndarray); it is converted to an array before being
        indexed with the labels found in the data.
        Assumes the labels are integer positions into `classes` -- TODO confirm
        for callers that pass non-integer labels.
    normalize : bool, default False
        When True each row (true label) is divided by its row total. Rows
        whose total is zero are left as all zeros instead of producing NaN.
    title : str, optional
        Figure title; a default depending on `normalize` is used when falsy.
    cmap : matplotlib colormap, default plt.cm.Blues
        Colormap handed to `imshow`.

    Returns
    -------
    matplotlib Axes
        The axes the matrix was drawn on.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Resolve the label set once and hand it to confusion_matrix explicitly so
    # that matrix rows/columns and tick labels are guaranteed to line up.
    labels = unique_labels(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    # Only use the labels that appear in the data. np.asarray lets callers
    # pass a plain list, which cannot be indexed with an array directly.
    classes = np.asarray(classes)[labels]
    if normalize:
        # A label that only occurs in y_pred has an all-zero row; guard the
        # division so that such rows stay 0.0 rather than becoming NaN.
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax


In [19]:
from pathlib import Path

import pandas as pd


# Location of the raw CSV, relative to the notebook's working directory.
csc_prediction_data_file = Path('./data_files/CSCFull.csv')

# Strings that read_csv should parse as boolean True / False.
csc_prediction_true_values = ['Yes', 'Pass']
csc_prediction_false_values = ['No', 'Fail']
# Columns loaded as pandas categoricals.
csc_prediction_data_types = {
    'Application Year': 'category',
    'Quintile': 'category',
    'EngBin': 'category',
    'AveSciBin': 'category',
    'ALQLBin': 'category',
    'Province': 'category',
}
# A '*' in the CSCBin column is read as a missing value.
csc_prediction_na_values = {
    'CSCBin': '*',
}

# Feature columns used as model inputs (despite the name, these are column
# labels, not rows).
csc_prediction_data_rows = [
    'FinAid',
    'Application Year',
    'EngBin',
    'AveSciBin',
    'ALQLBin',
    'Province',
    'Quintile',
    'AdMathAttempt',
]

# Read, clean and cast in a single expression so re-running the cell always
# starts from the file on disk.
csc_prediction_data = (
    pd.read_csv(
        filepath_or_buffer=csc_prediction_data_file,
        dtype=csc_prediction_data_types,
        true_values=csc_prediction_true_values,
        false_values=csc_prediction_false_values,
        na_values=csc_prediction_na_values,
        keep_default_na=False,
    )
    # Rows whose CSCBin was '*' hold NaN here. Casting NaN to bool yields
    # True, which would silently count a missing outcome as a positive one,
    # so those rows are removed before the cast.
    .dropna(subset=['CSCBin'])
    .astype({'CSCBin': 'bool'})
)
csc_prediction_data.dtypes

FinAid                  bool
Application Year    category
EngBin              category
AveSciBin           category
ALQLBin             category
Province            category
Quintile            category
CSCBin                  bool
AdMathAttempt           bool
dtype: object

In [20]:
from collections import defaultdict

from autosklearn.classification import AutoSklearnClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
from sklearn.preprocessing import LabelEncoder


# One LabelEncoder per column, created on first access and kept so that the
# encoded values can be mapped back with inverse_transform later.
label_encoded_cols = defaultdict(LabelEncoder)


def _encode_column(column):
    """Fit the encoder registered under the column's name and return the encoded values."""
    encoder = label_encoded_cols[column.name]
    return encoder.fit_transform(column)


training_data = csc_prediction_data.apply(_encode_column)

# Model inputs and target taken from the encoded frame.
X = training_data.loc[:, csc_prediction_data_rows]
y = training_data.loc[:, 'CSCBin']

# Splitter that holds out one whole group per fold; the groups are supplied
# at split() time.
leave_one_out = LeaveOneGroupOut()

# Auto-sklearn

In [21]:
# Class balance of the target: number of rows for each CSCBin value.
csc_prediction_data.groupby('CSCBin').size()

CSCBin
False     91
True     692
dtype: int64

In [22]:
%%time

auto_sklearn_metrics = {
    'accuracy': [],
    'f1': [],
    'mcc': [],
}

for train_index, test_index in leave_one_out.split(X, y, X['Application Year']):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    automl = AutoSklearnClassifier(
#         time_left_for_this_task=1200,
        n_jobs=4,
    )
    automl.fit(X_train, y_train)
    
    y_hat = automl.predict(X_test)
    
    auto_sklearn_metrics['accuracy'].append(accuracy_score(y_test, y_hat))
    auto_sklearn_metrics['f1'].append(f1_score(y_test, y_hat))
    auto_sklearn_metrics['mcc'].append(matthews_corrcoef(y_test, y_hat))
    # 
    # np.set_printoptions(precision=2)
    # 
    # # Plot non-normalized confusion matrix
    # plot_confusion_matrix(y_test, y_hat, classes=['Pass', 'Fail'],
    #                       title='Confusion matrix, without normalization')
    # 
    # # Plot normalized confusion matrix
    # plot_confusion_matrix(y_test, y_hat, classes=['Pass', 'Fail'], normalize=True,
    #                       title='Normalized confusion matrix')
    # 
    # plt.show()

print(auto_sklearn_metrics)

{'accuracy': [0.8536585365853658, 0.8076923076923077, 0.8695652173913043, 0.8928571428571429, 0.9333333333333333, 0.9247311827956989, 0.8021978021978022, 0.8297872340425532, 0.8604651162790697, 0.8968253968253969], 'f1': [0.9210526315789475, 0.8936170212765957, 0.9302325581395349, 0.9433962264150944, 0.9655172413793104, 0.9608938547486032, 0.8888888888888891, 0.9058823529411765, 0.9249999999999999, 0.9446808510638297], 'mcc': [0.0, -0.06406221326384731, 0.0, -0.04222003309207491, 0.0, 0.0, 0.06940930684335787, 0.018415951966208682, 0.0, 0.2362159324052733]}
CPU times: user 4h 7min 3s, sys: 6min 44s, total: 4h 13min 48s
Wall time: 10h 1min 3s


# Auto-sklearn - Sample Weights

In [23]:
# 'balanced' per-sample weights computed over the full target vector y;
# the first five are shown as a quick sanity check.
sample_weights = compute_sample_weight(class_weight='balanced', y=y)
sample_weights[:5]

array([0.56575145, 0.56575145, 0.56575145, 0.56575145, 0.56575145])

In [24]:
%%time

auto_sklearn_metrics = {
    'accuracy': [],
    'f1': [],
    'mcc': [],
}


for train_index, test_index in leave_one_out.split(X, y, X['Application Year']):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    test_weights = sample_weights[test_index]
    
    automl = AutoSklearnClassifier(
#         time_left_for_this_task=1200,
        n_jobs=4,
    )
    automl.fit(X_train, y_train)
    
    y_hat = automl.predict(X_test)
    
    auto_sklearn_metrics['accuracy'].append(
        accuracy_score(y_test, y_hat, sample_weight=test_weights),
    )
    auto_sklearn_metrics['f1'].append(f1_score(y_test, y_hat, sample_weight=test_weights))
    auto_sklearn_metrics['mcc'].append(
        matthews_corrcoef(y_test, y_hat, sample_weight=test_weights),
    )
    # 
    # np.set_printoptions(precision=2)
    # 
    # # Plot non-normalized confusion matrix
    # plot_confusion_matrix(y_test, y_hat, classes=['Pass', 'Fail'],
    #                       title='Confusion matrix, without normalization')
    # 
    # # Plot normalized confusion matrix
    # plot_confusion_matrix(y_test, y_hat, classes=['Pass', 'Fail'], normalize=True,
    #                       title='Normalized confusion matrix')
    # 
    # plt.show()

print(auto_sklearn_metrics)



JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [25]:
# from tpot import TPOTClassifier


# tpot_metrics = {
#     'accuracy': [],
#     'f1': [],
#     'mcc': [],
# }

# for train_index, test_index in leave_one_out.split(X, y, X['Application Year']):
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y.iloc[train_index], y.iloc[test_index]

#     tpot = TPOTClassifier(
#         generations=5, 
#         population_size=20, 
#         max_time_mins=20, 
#         n_jobs=4,
#         verbosity=2,
#     )
#     tpot.fit(X_train, y_train)
   
#     y_hat = tpot.predict(X_test)
    
#     application_year_label_id = X_train['Application Year'].iloc[0]
#     application_year_label = label_encoded_cols['Application Year'].inverse_transform(
#         [application_year_label_id],
#     )[0]
    
#     tpot.export(f'tpot_csc_pipeline_{application_year_label}.py')
    
#     tpot_metrics['accuracy'].append(accuracy_score(y_test, y_hat))
#     tpot_metrics['f1'].append(f1_score(y_test, y_hat))
    
# print(tpot_metrics)