In [None]:
# Only use one thread: cap the thread-count variables before the numerical
# libraries are imported in the next cell (shell equivalent: `export <VAR>=1`).
import os
os.environ.update({
    "OMP_NUM_THREADS": "1",
    "OPENBLAS_NUM_THREADS": "1",
    "MKL_NUM_THREADS": "1",
    "VECLIB_MAXIMUM_THREADS": "1",
    "NUMEXPR_NUM_THREADS": "1",
})

# Do not use GPU: an empty CUDA_VISIBLE_DEVICES exposes no CUDA device
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # see issue #152
    "CUDA_VISIBLE_DEVICES": "",
})

In [None]:
import pandas as pd
import numpy as np

import inspect
import random
import pickle
import math
import textwrap
import time
import warnings

from scipy.stats import pearsonr, mode
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer
from sklearn.linear_model import ElasticNet, ElasticNetCV, LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler
from sklearn.metrics import mean_squared_error, roc_auc_score, f1_score, accuracy_score, mean_absolute_error, log_loss
from sklearn.model_selection import cross_val_score, RepeatedKFold, KFold
from sklearn.svm import SVR
from tqdm import tqdm

import xgboost as xgb

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasRegressor

import tensorflow as tf

from utils import *

In [None]:
# Feature-set switches: keep the individual survey variables / the importance
# items as columns of the study data, or drop them before the split.
USE_INDIV_SURVEY_VARS = True
USE_IMPORTANCE_ITEMS = False

df = pd.read_csv('./research-data/processed/lak22-courseload-final-studydata.csv')

IMPORTANCE_ITEMS = ['tl_importance', 'me_importance', 'ps_importance', 'combined_importance']

ADDITIONAL_INDIV_VARS = [
    'course_name_number', 'is_stem_course', 'is_stem_student', 'course_student_stem_match',
     'n_satisfied_prereqs_2021_Spring', 'n_satisfied_prereqs_all_past_semesters',
    'percent_satisfied_prereqs_2021_Spring', 'percent_satisfied_prereqs_all_past_semesters',
    'is_non_letter_grade_course', 'student_gpa', 'student_gpa_major', 
    'tl_importance', 'me_importance', 'ps_importance', 'combined_importance', 
    'tl_manage', 'me_manage', 'ps_manage', 'cl_combined_manage'
]

# All removals use errors='ignore' so that the two switches can be combined
# freely: the importance items are also part of ADDITIONAL_INDIV_VARS, and
# deleting them twice would otherwise raise a KeyError.
if not USE_IMPORTANCE_ITEMS:
    df = df.drop(columns=IMPORTANCE_ITEMS, errors='ignore')

if not USE_INDIV_SURVEY_VARS:
    # 'course_name_number' is listed in ADDITIONAL_INDIV_VARS but must survive,
    # because it becomes the index a few lines below.
    df = df.drop(
        columns=[var for var in ADDITIONAL_INDIV_VARS if var != 'course_name_number'],
        errors='ignore',
    )

# Remove string section information
df = df.drop(
    columns=['section_num', 'secondary_section_number', 'all_section_numbers'],
    errors='ignore',
)

# Remove labels that are not needed
df = df.drop(
    columns=['tl2', 'tl_sensitivity', 'me_sensitivity', 'ps_sensitivity', 'cl_sensitivity',
             'tl1_smoothed_lmm', 'me_smoothed_lmm', 'ps_smoothed_lmm', 'cl_smoothed_lmm',
             'tl1_smoothed_student_average', 'me_smoothed_student_average',
             'ps_smoothed_student_average', 'cl_smoothed_student_average'],
    errors='ignore',
)

# Index by course and get dummies for the remaining string variable
df = df.set_index('course_name_number')
df = pd.get_dummies(df, columns=['class_type']) # upper, lower division, grad

# Train (CV) and holdout; fixed random_state keeps the split reproducible
train, test = train_test_split(df, test_size=0.15, random_state=12345, shuffle=True)

In [None]:
# Load the previously fitted models.
# NOTE: pickle.load can execute arbitrary code -- only open files you trust.
MODELS_PATH = './workload-ml/models/model-results-25-control variables.p'
with open(MODELS_PATH, 'rb') as models_file:
    MODELS = pickle.load(models_file)

In [None]:
# Model whose predictions are exported for each label (label -> model key)
CHOOSEN_MODELS = dict(
    tl1='linreg',
    me='xgb',
    ps='xgb',
    cl_combined='ensemble',
)

In [None]:
def get_extrapolations(f = '../research-data/processed/course-features-2021 Spring.csv'):
    """Predict the course load labels for one course-features file and export them.

    Every model key is applied to the file via ``apply_model``, an ``ensemble``
    entry is added as the element-wise mean of the non-random models, and the
    predictions of the models selected in ``CHOOSEN_MODELS`` are written to
    ``../research-data/processed/predicted-course-loads-{ref}.csv``.

    Relies on the notebook-level names ``train``, ``MODELS``, ``CHOOSEN_MODELS``
    and ``apply_model`` (not defined in this notebook; presumably provided by
    ``from utils import *``).

    Parameters
    ----------
    f : str
        Path of the course-features CSV. The text between the last ``-`` and
        the following ``.`` (e.g. ``2021 Spring``) becomes ``{ref}`` in the
        output file name.

    Returns
    -------
    None
        The result is the CSV file written to disk.
    """
    labels = ['tl1', 'me', 'ps', 'cl_combined']
    ensemble_members = ['linreg', 'rf', 'xgb', 'enet', 'svm', 'nn']

    # Read file
    X_all = pd.read_csv(f)

    # Keep secondary section numbers based on which the prediction was made
    secondary_sections = list(X_all['secondary_section_number'].values)

    # Input transformation for models: same index/dummies as the training data,
    # then discard every column the training data does not have
    X_all = X_all.set_index('course_name_number')
    X_all = pd.get_dummies(X_all, columns=['class_type'])
    X_all = X_all.drop(columns=list(set(X_all.columns) - set(train.columns)))

    # Rename for model input.
    # NOTE(review): the '*_current_semester' columns only survive the drop above
    # if `train` contains them as well -- otherwise these lines raise a KeyError.
    X_all['n_satisfied_prereqs_2021_Spring'] = X_all['n_satisfied_prereqs_current_semester']
    X_all['percent_satisfied_prereqs_2021_Spring'] = X_all['percent_satisfied_prereqs_current_semester']

    # Initialize labels with a placeholder value (overwritten with predictions below)
    for label in labels:
        X_all[label] = 1

    extrapolations = {label: dict() for label in labels}

    # Silence warnings only while the models run instead of mutating the
    # process-wide warning filters for the rest of the session.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        for target in labels:
            for model in ['random'] + ensemble_members:
                extrapolations[target][model] = apply_model(MODELS, train.copy(), X_all.copy(),
                                                            target=target, model_ref=model,
                                                            imputing_strategy='control variables')

    # Add ensemble: element-wise mean over the member models' first result
    # element; the second element is taken over from 'linreg' unchanged.
    for target in labels:
        member_predictions = [extrapolations[target][model][0] for model in ensemble_members]
        extrapolations[target]['ensemble'] = (list(map(np.mean, zip(*member_predictions))),
                                              extrapolations[target]['linreg'][1])

    for label, chosen_model in CHOOSEN_MODELS.items():
        X_all[label] = extrapolations[label][chosen_model][0]

    # Export predictions
    ref = f.split('-')[-1].split('.')[0]
    outf = f'../research-data/processed/predicted-course-loads-{ref}.csv'
    # Explicit copy: adding a column to a bare column selection of X_all is a
    # chained assignment (SettingWithCopyWarning).
    outdf = X_all[labels].copy()
    outdf['secondary_sections'] = secondary_sections
    outdf.to_csv(outf, index=True)

    return

In [None]:
# Course-feature files for 2017 Spring through 2021 Spring, in chronological order
fs = [
    f'../research-data/processed/course-features-{year} {term}.csv'
    for year in range(2017, 2022)
    for term in ('Spring', 'Fall')
    if (year, term) != (2021, 'Fall')  # the series ends with 2021 Spring
]

In [None]:
# Export the predicted course loads for every semester file listed in `fs`
for features_file in tqdm(fs):
    get_extrapolations(features_file)