In [5]:
from pathlib import Path
import pandas as pd
from tqdm import tqdm


csc_prediction_data_file = Path('../data_files/CSCFull.csv')

csc_prediction_true_values = ['Yes', 'Pass']
csc_prediction_false_values = ['No', 'Fail']
csc_prediction_data_types = {
    'ALQLBin': 'category',
    'Application Year': 'category',
    'AveSciBin': 'category',
    'EngBin': 'category',
    'Quintile': 'category',
    'Province': 'category',
}
csc_prediction_na_values = {
    'CSCBin': ['*'],
    'ALQLBin': ['*'],
    'AveSciBin': ['*'],
    'EngBin': ['*'],
    'Quintile': ['*'],
    'Province': ['*'],
}

csc_prediction_data_rows = [
    'AdMathAttempt',
    'ALQLBin',
    'Application Year',
    'AveSciBin',
    'EngBin',
    'FinAid',
    'Province',
    'Quintile',
]

csc_prediction_data = pd.read_csv(
    filepath_or_buffer=csc_prediction_data_file,
    dtype=csc_prediction_data_types,
    usecols=csc_prediction_data_rows + ['CSCBin'],
    true_values=csc_prediction_true_values,
    false_values=csc_prediction_false_values,
    na_values=csc_prediction_na_values,
    keep_default_na=False,
)
csc_prediction_data = csc_prediction_data.astype({'CSCBin': 'bool'})
csc_prediction_data.dtypes


FinAid                  bool
Application Year    category
EngBin              category
AveSciBin           category
ALQLBin             category
Province            category
Quintile            category
CSCBin                  bool
AdMathAttempt           bool
dtype: object

In [11]:
from collections import defaultdict

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
from sklearn.preprocessing import LabelEncoder


label_encoded_cols = defaultdict(LabelEncoder)

training_data = csc_prediction_data.apply(
    lambda x: label_encoded_cols[x.name].fit_transform(x)
)

X = training_data[csc_prediction_data_rows]
y = training_data['CSCBin']

leave_one_out = LeaveOneGroupOut()


In [12]:
auto_sklearn_metrics = {
    'accuracy': [],
    'f1': [],
    'mcc': [],
}

for train_index, test_index in tqdm(leave_one_out.split(X, y, X['Application Year'])):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    random_forest = RandomForestClassifier(n_estimators=10, n_jobs=-1)
    random_forest.fit(X_train, y_train)
    
    y_hat = random_forest.predict(X_test)
    
    auto_sklearn_metrics['accuracy'].append(accuracy_score(y_test, y_hat))
    auto_sklearn_metrics['f1'].append(f1_score(y_test, y_hat))
    auto_sklearn_metrics['mcc'].append(matthews_corrcoef(y_test, y_hat))
    
print(auto_sklearn_metrics)


10it [00:02,  4.59it/s]

{'accuracy': [0.8292682926829268, 0.75, 0.855072463768116, 0.9285714285714286, 0.8666666666666667, 0.9032258064516129, 0.7802197802197802, 0.7446808510638298, 0.8255813953488372, 0.8809523809523809], 'f1': [0.9014084507042254, 0.8505747126436781, 0.9206349206349207, 0.9622641509433962, 0.9285714285714286, 0.9491525423728814, 0.875, 0.8518518518518517, 0.9044585987261146, 0.9350649350649352], 'mcc': [0.26745913993429266, 0.08670081183498068, 0.1284457725980754, 0.43064433753916403, -0.07142857142857142, -0.04229549344378136, 0.008157868529511848, -0.0515386141661764, -0.07655901625358509, 0.2326771136615827]}



