In [1]:
from pathlib import Path
import pandas as pd
from tqdm import tqdm


# Location of the raw CSV, relative to the notebook's working directory.
csc_prediction_data_file = Path('../data_files/CSCFull.csv')

# Cell strings that read_csv should parse as boolean True / False.
csc_prediction_true_values = ['Yes', 'Pass']
csc_prediction_false_values = ['No', 'Fail']

# Columns that are read with the pandas 'category' dtype.
csc_prediction_data_types = dict.fromkeys(
    (
        'Application Year',
        'Quintile',
        'EngBin',
        'AveSciBin',
        'ALQLBin',
        'Province',
    ),
    'category',
)

# Per-column missing-value marker: '*' in CSCBin is treated as NA.
csc_prediction_na_values = dict(CSCBin='*')

# Names of the columns selected as model inputs. Despite the `_rows` suffix
# these are column labels; the order here is the feature order of X.
csc_prediction_data_rows = [
    'FinAid',
    'Application Year',
    'EngBin',
    'AveSciBin',
    'ALQLBin',
    'Province',
    'Quintile',
    'AdMathAttempt',
]

# Read the raw CSV. Default NA parsing is switched off, so the only value
# treated as missing is the per-column marker given in csc_prediction_na_values.
csc_prediction_data = pd.read_csv(
    filepath_or_buffer=csc_prediction_data_file,
    dtype=csc_prediction_data_types,
    true_values=csc_prediction_true_values,
    false_values=csc_prediction_false_values,
    na_values=csc_prediction_na_values,
    keep_default_na=False,
)

# CSCBin is the prediction target. Rows whose CSCBin was the NA marker come back
# as NaN, and NaN is truthy: casting straight to bool would silently relabel
# every unknown outcome as True. Drop those rows before the cast, and reset the
# index so positional and label-based indexing stay aligned afterwards.
csc_prediction_data = (
    csc_prediction_data
    .dropna(subset=['CSCBin'])
    .reset_index(drop=True)
    .astype({'CSCBin': 'bool'})
)
csc_prediction_data.dtypes


In [None]:
from collections import defaultdict

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
from sklearn.preprocessing import LabelEncoder


# One LabelEncoder per column: created on first lookup and kept, fitted,
# under the column's name.
label_encoded_cols = defaultdict(LabelEncoder)


def _encode_column(column):
    """Fit the encoder registered for this column and return its integer codes."""
    encoder = label_encoded_cols[column.name]
    return encoder.fit_transform(column)


# Every column of the loaded frame is encoded, the CSCBin target included.
training_data = csc_prediction_data.apply(_encode_column)

# Cross-validation splitter, constructed with its defaults.
leave_one_out = LeaveOneGroupOut()

# Target vector and feature matrix taken from the encoded frame.
y = training_data['CSCBin']
X = training_data[csc_prediction_data_rows]


In [None]:
# Fixed seed for the classifier. scikit-learn permutes the candidate features at
# every split, so without a seed the fitted trees (and therefore the metrics
# below) can differ from one run of this cell to the next.
RANDOM_SEED = 0

# Per-fold scores; each list gets one entry per held-out group.
auto_sklearn_metrics = {
    'accuracy': [],
    'f1': [],
    'mcc': [],
}

# Each fold holds out every row of one application year.
# NOTE(review): 'Application Year' is also one of the feature columns in X, so
# the held-out year is a value the tree never saw in training - confirm intended.
groups = X['Application Year']

# split() returns a generator with no length, so tell tqdm how many folds to expect.
n_folds = leave_one_out.get_n_splits(groups=groups)

for train_index, test_index in tqdm(leave_one_out.split(X, y, groups), total=n_folds):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # A fresh tree per fold so nothing carries over between folds.
    decision_tree = DecisionTreeClassifier(random_state=RANDOM_SEED)
    decision_tree.fit(X_train, y_train)

    y_hat = decision_tree.predict(X_test)

    auto_sklearn_metrics['accuracy'].append(accuracy_score(y_test, y_hat))
    auto_sklearn_metrics['f1'].append(f1_score(y_test, y_hat))
    auto_sklearn_metrics['mcc'].append(matthews_corrcoef(y_test, y_hat))

print(auto_sklearn_metrics)
