In [3]:
import warnings
warnings.simplefilter('ignore')

import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pylab as pl
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge, ARDRegression, Lasso, Ridge
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [5]:
df = pd.read_csv('../datasets/SEHIR/processed_course_clustering_dataset.csv')
df.head()

Unnamed: 0,Course Code,Course Title,Student Number,Department Code,Course Level,Letter Grade,Status,GPA,Standing,Completed Credits,...,C rate,C- rate,D+ rate,D rate,D- rate,F rate,Mean GPA - Students taken,Mean Grade - Students taken,STDEV GPA - Students taken,STDEV Grade - Students taken
0,UNI 111,Critical Reading & Writing in Turkish I,240,SOC,Undergraduate,F,Unsuccessful,2.62,Freshman,18,...,0.054422,0.040816,0.047619,0.020408,0.020408,0.088435,2.467279,2.680952,0.864273,1.193667
1,UNI 107,World Civilizations& Global Encounters I,338,PSY,Undergraduate,A,Successful,3.68,Freshman,18,...,0.090909,0.136364,0.045455,0.0,0.045455,0.0,2.893182,2.986364,0.821569,1.098612
2,UNI 105,Understanding Society and Culture I,338,PSY,Undergraduate,A,Successful,3.68,Freshman,18,...,0.038462,0.0,0.0,0.0,0.0,0.038462,2.713846,3.211538,0.95208,0.937369
3,UNI 103,Understanding Science and Technology,338,PSY,Undergraduate,A,Successful,3.68,Freshman,18,...,0.081967,0.032787,0.0,0.021858,0.027322,0.065574,2.483224,2.813115,0.977342,1.117219
4,UNI 105,Understanding Society and Culture I,240,SOC,Undergraduate,A,Successful,2.62,Freshman,18,...,0.038462,0.0,0.0,0.0,0.0,0.038462,2.713846,3.211538,0.95208,0.937369


In [58]:
df.drop([df.columns[0], df.columns[1], df.columns[2]], inplace=True, axis=1)   # dropping course details

In [60]:
# applying one-hot encoding on categorical features
df = pd.concat([df, pd.get_dummies(df['Course Year'], prefix='Course Year'), pd.get_dummies(df['Subject'], prefix='Subject'), pd.get_dummies(df['Department Code'], prefix='Department Code'), pd.get_dummies(df['Course Level'], prefix='Course Level'), pd.get_dummies(df['Standing'], prefix='Standing'), pd.get_dummies(df['Status'], prefix='Status')], axis=1)
df.drop(['Course Year', 'Subject', 'Department Code', 'Course Level', 'Status', 'Standing'], axis=1, inplace=True)

In [62]:
columns = df.columns

In [64]:
le = LabelEncoder()
le.fit(['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D', 'D-', 'F'])

In [66]:
def standardize(X_train, X_test):
    X_train_cols = X_train.columns
    X_test_cols = X_test.columns
    sc = StandardScaler()
    fitted_sc = sc.fit(X_train)
    X_train_std = pd.DataFrame(fitted_sc.transform(X_train), columns=X_train_cols)
    X_test_std = pd.DataFrame(fitted_sc.transform(X_test), columns=X_test_cols)
    return X_train_std, X_test_std

In [68]:
def get_train_data(df, train_sem, columns):
    dataFrame = pd.DataFrame(columns=columns)
    
    # extracting instances from the dataset which should be in training data
    for sem in train_sem:
        dataFrame = pd.concat([dataFrame, df[df.iloc[:, 7] == sem]], ignore_index=True)
    
    X_train = dataFrame.drop('Semester', axis=1)
    y_train = le.transform(X_train.pop('Letter Grade'))
    return X_train, y_train

In [70]:
def fit_cluster(n_clusters, X_train, y_train, reg_model, cluster_model):
    reg_models = {}   # {cluster_label: fitted regression model (object), ...}
    clusters_dataset = {}  # {cluster_label: {'X': dataset (pd.DataFrame), 'y': true target values (list)}, ...}
    cluster_features = X_train[['Course Credit'] + list(X_train.columns[12:69])]
    
    # fitting a clustering model based on GPA, Completed Credits and Departments
    fitted_cluster_model = cluster_model(n_clusters=n_clusters).fit(cluster_features)
    print(['Course Credit'] + list(X_train.columns[12:69]))
    cluster_labels = fitted_cluster_model.labels_
    
    # splitting the main dataset into sub-dataFrames based on their cluster label
    for i in range(len(cluster_labels)):
        clusters_dataset.setdefault(cluster_labels[i], {})
        clusters_dataset[cluster_labels[i]].setdefault('X', pd.DataFrame(columns=X_train.columns))
        clusters_dataset[cluster_labels[i]].setdefault('y', [])

        df_row = list(X_train.iloc[i, :])   # getting the corresponding row from main dataset
        cluster_data = clusters_dataset[cluster_labels[i]]   # getting the corresponding cluster data structure
        cluster_data['X'].loc[len(cluster_data['X'])] = df_row
        cluster_data['y'].append(y_train[i])
    
    # fitting a regression model to each clustering and storing the fitted models
    for cluster_label in clusters_dataset:
        reg_models.setdefault(cluster_label, 0)
        fitted_reg_model = reg_model.fit(clusters_dataset[cluster_label]['X'], clusters_dataset[cluster_label]['y'])
        reg_models[cluster_label] = fitted_reg_model
        
    return reg_models, fitted_cluster_model

In [72]:
def cluster_test_data(cluster_model, X_test, y_test):
    test_dfs = {}   # splitting the test dataset into sub-dataFrames based on their predicted cluster label
    cluster_features = X_test[['Course Credit'] + list(X_test.columns[12:69])]
    
    # predicting the cluster labels of test data using a cluster model fitted on the whole dataset so far
    predicted_clusters = cluster_model.predict(cluster_features)
    
    # splitting the test dataset based on their cluster label
    for i in range(len(predicted_clusters)):
        test_dfs.setdefault(predicted_clusters[i], {})
        test_dfs[predicted_clusters[i]].setdefault('X', pd.DataFrame(columns=X_test.columns))
        test_dfs[predicted_clusters[i]].setdefault('y', [])
        
        df_row = list(X_test.iloc[i, :])
        cluster_data = test_dfs[predicted_clusters[i]]
        cluster_data['X'].loc[len(cluster_data['X'])] = df_row
        cluster_data['y'].append(y_test[i])
        
    return test_dfs

In [74]:
def get_error_score(df, columns, reg_model, cluster_model):
    error_scores = {}   # storing error scores in a dict with shape: 
                        # {num_clusters (k=2,3,...,7): 
                          # {num_training_semesters (N=1,2,...,7): 
                              # {cluster_label: [RMSE, MAE], ...}, 
                          # ...}, 
                        #...}

    sorted_semesters = sorted(set(df.iloc[:, 7]))   # sorting semesters in a time series manner
    for num_clusters in range(10, 31, 5):
        error_scores.setdefault(str(num_clusters), {})
        for sem_idx in range(1, len(sorted_semesters)):
            error_scores[str(num_clusters)].setdefault(str(sem_idx), {'y_true': [], 'y_pred': []})
            
            # preparing the training data from the beginning of the dataset so far
            training_sem = sorted_semesters[:sem_idx]
            test_sem = sorted_semesters[sem_idx]
            X_train, y_train = get_train_data(df, training_sem, columns)
            
            # preparing the test dataframe
            X_test = df[df.iloc[:, 7] == test_sem]
            X_test.drop('Semester', axis=1, inplace=True)
            X_test.index = range(len(X_test))
            y_test = le.transform(X_test.pop('Letter Grade'))
            
            # standardizing the dataset for faster optimization
            X_train, X_test = standardize(X_train, X_test)
            
            # getting the cluster model fitted on training data and each clusters' regression model in a dict
            reg_models, fitted_cluster_model = fit_cluster(num_clusters, X_train, y_train, reg_model, cluster_model)
            
            # getting the clustered test data
            test_dfs = cluster_test_data(fitted_cluster_model, X_test, y_test)
            
            # for each cluster label in test dataFrames:
                # 1. retrieve the regression model fitted earlier on the same cluster label
                # 2. predict the test data from the same cluster label with the retrieved regression model
                
            for cluster_label in test_dfs:
                fitted_reg_model = reg_models[cluster_label]
                y_true = test_dfs[cluster_label]['y']
                y_pred = fitted_reg_model.predict(test_dfs[cluster_label]['X'])
                y_pred = list(y_pred)
                                
                error_scores[str(num_clusters)][str(sem_idx)]['y_true'] += y_true
                error_scores[str(num_clusters)][str(sem_idx)]['y_pred'] += y_pred
                
    return error_scores

In [76]:
def plot_error_scores(scores, title):
    best_scores = {}
    for n_clusters in scores:
        best_scores.setdefault(n_clusters, [100, 100, -100, -100])   # [rmse_train, rmse_test, r2_train, r2_test]
        for n_training_sem in scores[n_clusters]:
            clust_scores = scores[n_clusters][n_training_sem].values()
            for s in clust_scores:
                if s['train'][0] < best_scores[n_clusters][0]:
                    best_scores[n_clusters][0] = s['train'][0]
                elif s['train'][1] > best_scores[n_clusters][2]:
                    best_scores[n_clusters][2] = s['train'][1]
                elif s['test'][0] < best_scores[n_clusters][1]:
                    best_scores[n_clusters][1] = s['test'][0]
                elif s['test'][1] > best_scores[n_clusters][3]:
                    best_scores[n_clusters][3] = s['test'][1]
        
    fig, ax = plt.subplots(figsize=(15,5))
    x = np.arange(2,8)
    ax.plot(x, [best_scores[k][0] for k in sorted(best_scores)], marker='o', label='RMSE train', linestyle='--', linewidth=3, mew=3)
    ax.plot(x, [best_scores[k][1] for k in sorted(best_scores)], marker='x', label='RMSE test', linestyle='-', linewidth=3, mew=3)
    ax.plot(x, [best_scores[k][2] for k in sorted(best_scores)], marker='v', label='R\u00b2 train', linestyle='dotted', linewidth=3, mew=3)
    ax.plot(x, [best_scores[k][3] for k in sorted(best_scores)], marker='d', label='R\u00b2 test', linestyle='dashdot', linewidth=3, mew=3)
    
    ax.set_title(title, fontfamily='serif', fontsize=20)
    ax.set_yticklabels([round(i,1) for i in ax.get_yticks()], fontfamily='serif', fontsize=20)
    ax.set_xticks(x)
    ax.set_xticklabels(['k={}'.format(i) for i in range(2, 8)], fontfamily='serif', fontsize=20)
    ax.set_xlabel('Number of Clusters (k)', fontsize=20, fontfamily='serif')
    ax.set_ylabel('Error', fontsize=20, fontfamily='serif')
    ax.grid(True)
    ax.legend(prop={'size': 20, 'family': 'serif'}, loc='center', bbox_to_anchor=(0.5, -0.25), ncol=4)
    
#     plt.savefig('sample.svg', bbox='tight')

In [78]:
with open('../hyperparameters/tuned_hyperparams (course based).json') as fr:
    tuned_hyperparams = json.load(fr)

In [80]:
model_errors = {}

In [82]:
with open('../results/clustering_course_based_regression_results (with KMeans).json') as fr:
    results = json.load(fr)

In [84]:
hps = tuned_hyperparams['SVR']
errors = get_error_score(df, columns, SVR(C=hps['C'], kernel=hps['kernel'], epsilon=hps['epsilon']), KMeans)
model_errors['SVR'] = errors

['Course Credit', 'A- rate', 'B+ rate', 'B rate', 'B- rate', 'C+ rate', 'C rate', 'C- rate', 'D+ rate', 'D rate', 'D- rate', 'F rate', 'Mean GPA - Students taken', 'Mean Grade - Students taken', 'STDEV GPA - Students taken', 'STDEV Grade - Students taken', 'Course Year_1', 'Course Year_2', 'Course Year_3', 'Course Year_4', 'Course Year_5', 'Course Year_6', 'Subject_ARAB', 'Subject_ARM', 'Subject_BGM', 'Subject_CS', 'Subject_CTV', 'Subject_CULT', 'Subject_ECE', 'Subject_ECON', 'Subject_EE', 'Subject_EECS', 'Subject_ENGR', 'Subject_FRE', 'Subject_GER', 'Subject_HIST', 'Subject_HUK', 'Subject_IE', 'Subject_ISE', 'Subject_ISS', 'Subject_ITM', 'Subject_LAW', 'Subject_LIFE', 'Subject_LING', 'Subject_LIT', 'Subject_MATH', 'Subject_MGT', 'Subject_MTS', 'Subject_PERS', 'Subject_PHIL', 'Subject_PHYS', 'Subject_POLS', 'Subject_PSY', 'Subject_SOC', 'Subject_SPA', 'Subject_UNI', 'Department Code_BGM/NT', 'Department Code_BGM/T']


KeyboardInterrupt: 

In [None]:
plot_error_scores(results['SVR'], 'SupportVectorRegressor')

In [None]:
hps = tuned_hyperparams['BayesianRidge']
errors = get_error_score(df, columns, BayesianRidge(lambda_1=hps['lambda_1'], lambda_2=hps['lambda_2'],
                                                    alpha_1=hps['alpha_1'], alpha_2=hps['alpha_2']), KMeans)
model_errors['BayesianRidge'] = errors

In [None]:
plot_error_scores(results['BayesianRidge'], 'BayesianRidge')

In [None]:
hps = tuned_hyperparams['Lasso']
errors = get_error_score(df, columns, Lasso(alpha=hps['alpha']), KMeans)
model_errors['Lasso'] = errors

In [None]:
plot_error_scores(results['Lasso'], 'Lasso')

In [None]:
hps = tuned_hyperparams['Ridge']
errors = get_error_score(df, columns, Ridge(alpha=hps['alpha']), KMeans)
model_errors['Ridge'] = errors

In [None]:
plot_error_scores(results['Ridge'], 'Ridge')

In [None]:
hps = tuned_hyperparams['BaggingRegressor']
errors = get_error_score(df, columns, BaggingRegressor(n_estimators=hps['n_estimators']), KMeans)
model_errors['BaggingRegressor'] = errors

In [None]:
plot_error_scores(results['BaggingRegressor'], 'BaggingRegressor')

In [None]:
hps = tuned_hyperparams['AdaBoostRegressor']
scores = get_error_score(df, columns, AdaBoostRegressor(n_estimators=hps['n_estimators'], learning_rate=hps['learning_rate']), KMeans)
model_errors['AdaBoostRegressor'] = errors

In [None]:
plot_error_scores(results['AdaBoostRegressor'], 'AdaBoostRegressor')

In [None]:
hps = tuned_hyperparams['RandomForestRegressor']
errors = get_error_score(df, columns, RandomForestRegressor(n_estimators=hps['n_estimators']), KMeans)
model_errors['RandomForestRegressor'] = errors

In [None]:
plot_error_scores(results['RandomForestRegressor'], 'RandomForestRegressor')

In [None]:
hps = tuned_hyperparams['GradientBoostingRegressor']
errors = get_error_score(df, columns, GradientBoostingRegressor(learning_rate=hps['learning_rate'], loss=hps['loss'],
                                                                n_estimators=hps['n_estimators'], max_depth=hps['max_depth']), KMeans)
model_errors['GradientBoostingRegressor'] = errors

In [None]:
plot_error_scores(results['GradientBoostingRegressor'], 'GradientBoostingRegressor')

In [None]:
with open('clustering_course_based_regression_results (with KMeans).json', 'w') as fw:
    json.dump(model_errors, fw)