In [1]:
# Importing the Usual
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the non-major course records and discard the columns that are not
# used anywhere below (entry term, duplicate semester field, graduation
# time and degree code).
gg = pd.read_csv('NonMathMajors.csv').drop(
    columns=['ENTRY_CCYY', 'SEM_CCYY.1', 'GRAD_TIME', 'DEG_CD']
)
gg

Unnamed: 0,STUDENT,GRAD,104,105,140,143,150,151,160,165,166,207,265,266,267
0,0,1,0,0,0,0,0,0,0,0,0,0,-1,0,0
1,3,1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,4,1,0,0,0,0,0,0,0,1,1,1,0,0,0
3,5,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,7,1,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9681,13059,1,0,0,1,0,1,0,0,0,0,0,0,0,0
9682,13060,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9683,13061,1,0,0,0,0,1,0,0,0,0,0,0,0,0
9684,13062,1,0,0,0,0,-1,0,0,0,0,0,0,0,0


In [3]:
# Removing students that have not taken any of the listed courses.
#
# Every column after the first two (STUDENT, GRAD) is treated as a course
# column. A row is kept when at least one of its course entries is non-zero.
# The test is done entry by entry instead of on the row sum because course
# entries can be -1 as well as 1: a row holding one 1 and one -1 sums to 0
# and would be dropped by a `sum != 0` filter even though it has course
# records.
# NOTE(review): this assumes -1 marks an attempted course rather than a
# missing one -- confirm against the data dictionary.
course_names = gg.columns.tolist()[2:]
took_any_course = gg[course_names].ne(0).any(axis=1)
# .copy() so later cells can add columns to `gg` without operating on a view.
gg = gg[took_any_course].copy()
gg

Unnamed: 0,STUDENT,GRAD,104,105,140,143,150,151,160,165,166,207,265,266,267
0,0,1,0,0,0,0,0,0,0,0,0,0,-1,0,0
1,3,1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,4,1,0,0,0,0,0,0,0,1,1,1,0,0,0
3,5,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,7,1,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9680,13057,1,0,0,0,0,1,0,0,0,0,0,0,0,0
9681,13059,1,0,0,1,0,1,0,0,0,0,0,0,0,0
9683,13061,1,0,0,0,0,1,0,0,0,0,0,0,0,0
9684,13062,1,0,0,0,0,-1,0,0,0,0,0,0,0,0


In [4]:
# Combined outcome for courses 150 and 151, which are typically taken as a
# sequence. The tests are evaluated in order and the first match wins:
#    1 -> both columns equal 1
#    0 -> at least one of the two columns equals 0
#   -1 -> 151 equals -1 (and neither column is 0)
# NOTE(review): a row with 150 == -1 and 151 == 1 matches none of the tests
# and receives np.select's default of 0 -- confirm that this is intended.
conditions = [
    gg['150'].eq(1) & gg['151'].eq(1),
    gg['150'].eq(0) | gg['151'].eq(0),
    gg['151'].eq(-1),
]
values = [1, 0, -1]

gg['soc_calc'] = np.select(conditions, values, default=0)
gg

Unnamed: 0,STUDENT,GRAD,104,105,140,143,150,151,160,165,166,207,265,266,267,soc_calc
0,0,1,0,0,0,0,0,0,0,0,0,0,-1,0,0,0
1,3,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,4,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0
3,5,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,7,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9680,13057,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
9681,13059,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
9683,13061,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
9684,13062,1,0,0,0,0,-1,0,0,0,0,0,0,0,0,0


In [5]:
# Combined outcome for courses 140 and 143, the "prep for calculus" courses.
# The tests are evaluated in order and the first match wins:
#    1 -> both columns equal 1
#    0 -> at least one of the two columns equals 0
#   -1 -> 143 equals -1 (and neither column is 0)
# NOTE(review): a row with 140 == -1 and 143 == 1 matches none of the tests
# and receives np.select's default of 0 -- confirm that this is intended.
conditions = [
    gg['140'].eq(1) & gg['143'].eq(1),
    gg['143'].eq(0) | gg['140'].eq(0),
    gg['143'].eq(-1),
]
values = [1, 0, -1]

gg['alg_pcalc'] = np.select(conditions, values, default=0)
gg

Unnamed: 0,STUDENT,GRAD,104,105,140,143,150,151,160,165,166,207,265,266,267,soc_calc,alg_pcalc
0,0,1,0,0,0,0,0,0,0,0,0,0,-1,0,0,0,0
1,3,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,4,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0
3,5,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,7,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9680,13057,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9681,13059,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
9683,13061,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9684,13062,1,0,0,0,0,-1,0,0,0,0,0,0,0,0,0,0


In [6]:
# Combined outcome for courses 143 and 165 (precalculus and calculus 1).
# The tests are evaluated in order and the first match wins:
#    1 -> both columns equal 1
#    0 -> at least one of the two columns equals 0
#   -1 -> 165 equals -1 (and neither column is 0)
# NOTE(review): a row with 143 == -1 and 165 == 1 matches none of the tests
# and receives np.select's default of 0 -- confirm that this is intended.
conditions = [
    gg['143'].eq(1) & gg['165'].eq(1),
    gg['143'].eq(0) | gg['165'].eq(0),
    gg['165'].eq(-1),
]
values = [1, 0, -1]

gg['p_calc1'] = np.select(conditions, values, default=0)
gg

Unnamed: 0,STUDENT,GRAD,104,105,140,143,150,151,160,165,166,207,265,266,267,soc_calc,alg_pcalc,p_calc1
0,0,1,0,0,0,0,0,0,0,0,0,0,-1,0,0,0,0,0
1,3,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,4,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0
3,5,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,7,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9680,13057,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
9681,13059,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
9683,13061,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
9684,13062,1,0,0,0,0,-1,0,0,0,0,0,0,0,0,0,0,0


In [7]:
# Combined outcome for courses 165 and 166 (calculus 1 and calculus 2; some
# majors do not require calculus 3).
# The tests are evaluated in order and the first match wins:
#    1 -> both columns equal 1
#    0 -> at least one of the two columns equals 0
#   -1 -> 166 equals -1 (and neither column is 0)
# NOTE(review): a row with 165 == -1 and 166 == 1 matches none of the tests
# and receives np.select's default of 0 -- confirm that this is intended.
conditions = [
    gg['165'].eq(1) & gg['166'].eq(1),
    gg['165'].eq(0) | gg['166'].eq(0),
    gg['166'].eq(-1),
]
values = [1, 0, -1]

gg['calc_12'] = np.select(conditions, values, default=0)
gg

Unnamed: 0,STUDENT,GRAD,104,105,140,143,150,151,160,165,166,207,265,266,267,soc_calc,alg_pcalc,p_calc1,calc_12
0,0,1,0,0,0,0,0,0,0,0,0,0,-1,0,0,0,0,0,0
1,3,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,4,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,1
3,5,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,7,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9680,13057,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
9681,13059,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
9683,13061,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
9684,13062,1,0,0,0,0,-1,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
# Combined outcome for courses 165, 166 and 265 (the full calculus sequence).
# The tests are evaluated in order and the first match wins:
#    1 -> all three columns equal 1
#    0 -> at least one of the three columns equals 0
#   -1 -> 265 equals -1 (and none of the three columns is 0)
# NOTE(review): a row with no 0 entries whose 265 value is 1 but whose 165 or
# 166 value is -1 matches none of the tests and receives np.select's default
# of 0 -- confirm that this is intended.
conditions = [
    gg['165'].eq(1) & gg['166'].eq(1) & gg['265'].eq(1),
    gg['165'].eq(0) | gg['166'].eq(0) | gg['265'].eq(0),
    gg['265'].eq(-1),
]
values = [1, 0, -1]

gg['calc_123'] = np.select(conditions, values, default=0)
gg

Unnamed: 0,STUDENT,GRAD,104,105,140,143,150,151,160,165,166,207,265,266,267,soc_calc,alg_pcalc,p_calc1,calc_12,calc_123
0,0,1,0,0,0,0,0,0,0,0,0,0,-1,0,0,0,0,0,0,0
1,3,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,4,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,1,0
3,5,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,7,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9680,13057,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
9681,13059,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
9683,13061,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
9684,13062,1,0,0,0,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
# Combined outcome for courses 165 and 207 (calculus and linear algebra; only
# 165 is a prerequisite for 207).
# The tests are evaluated in order and the first match wins:
#    1 -> both columns equal 1
#    0 -> at least one of the two columns equals 0
#   -1 -> 207 equals -1 (and neither column is 0)
# NOTE(review): a row with 165 == -1 and 207 == 1 matches none of the tests
# and receives np.select's default of 0 -- confirm that this is intended.
conditions = [
    gg['165'].eq(1) & gg['207'].eq(1),
    gg['165'].eq(0) | gg['207'].eq(0),
    gg['207'].eq(-1),
]
values = [1, 0, -1]

gg['calc_lin'] = np.select(conditions, values, default=0)
gg

Unnamed: 0,STUDENT,GRAD,104,105,140,143,150,151,160,165,...,207,265,266,267,soc_calc,alg_pcalc,p_calc1,calc_12,calc_123,calc_lin
0,0,1,0,0,0,0,0,0,0,0,...,0,-1,0,0,0,0,0,0,0,0
1,3,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,1,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,1,0,1
3,5,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,7,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9680,13057,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9681,13059,1,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9683,13061,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9684,13062,1,0,0,0,0,-1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Combined outcome for courses 166 and 266 (calculus prerequisite and
# differential equations, version 1).
# The tests are evaluated in order and the first match wins:
#    1 -> both columns equal 1
#    0 -> at least one of the two columns equals 0
#   -1 -> 266 equals -1 (and neither column is 0)
# NOTE(review): a row with 166 == -1 and 266 == 1 matches none of the tests
# and receives np.select's default of 0 -- confirm that this is intended.
conditions = [
    gg['166'].eq(1) & gg['266'].eq(1),
    gg['166'].eq(0) | gg['266'].eq(0),
    gg['266'].eq(-1),
]
values = [1, 0, -1]

gg['calc_diff1'] = np.select(conditions, values, default=0)
gg

Unnamed: 0,STUDENT,GRAD,104,105,140,143,150,151,160,165,...,265,266,267,soc_calc,alg_pcalc,p_calc1,calc_12,calc_123,calc_lin,calc_diff1
0,0,1,0,0,0,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
1,3,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,1,0
3,5,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,7,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9680,13057,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9681,13059,1,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9683,13061,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9684,13062,1,0,0,0,0,-1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Combined outcome for courses 166 and 267 (calculus prerequisite and
# differential equations, version 2).
# The tests are evaluated in order and the first match wins:
#    1 -> both columns equal 1
#    0 -> at least one of the two columns equals 0
#   -1 -> 267 equals -1 (and neither column is 0)
# NOTE(review): a row with 166 == -1 and 267 == 1 matches none of the tests
# and receives np.select's default of 0 -- confirm that this is intended.
conditions = [
    gg['166'].eq(1) & gg['267'].eq(1),
    gg['166'].eq(0) | gg['267'].eq(0),
    gg['267'].eq(-1),
]
values = [1, 0, -1]

gg['calc_diff2'] = np.select(conditions, values, default=0)
gg

Unnamed: 0,STUDENT,GRAD,104,105,140,143,150,151,160,165,...,266,267,soc_calc,alg_pcalc,p_calc1,calc_12,calc_123,calc_lin,calc_diff1,calc_diff2
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,1,0,0
3,5,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,7,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9680,13057,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9681,13059,1,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9683,13061,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9684,13062,1,0,0,0,0,-1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Train/test split.
# Target: the GRAD column. Features: every remaining column except STUDENT.
# 25% of the rows are held out, shuffled with a fixed seed and stratified on
# the target.
y = gg['GRAD']
X = gg.drop(columns=['STUDENT', 'GRAD'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [None]:
# Registry of the candidate models, keyed by a short name. Entries are added
# in a fixed order (linear, then quadratic, then more flexible boundaries)
# because later cells iterate over the dict in insertion order.
classifiers = {}

# Putting linear decision boundary classifiers first
classifiers['lda'] = LinearDiscriminantAnalysis()
classifiers['log_reg'] = LogisticRegression(penalty=None, max_iter=100000)
classifiers['svc_linear'] = LinearSVC(dual='auto')

# Quadratic boundaries
classifiers['qda'] = QuadraticDiscriminantAnalysis()
classifiers['lda_poly'] = Pipeline([
    ('scale', StandardScaler()),
    ('poly', PolynomialFeatures(2)),
    ('lda', LinearDiscriminantAnalysis()),
])
classifiers['log_reg_poly'] = Pipeline([
    ('scale', StandardScaler()),
    ('poly', PolynomialFeatures(2)),
    ('log_reg', LogisticRegression(penalty=None, max_iter=100000)),
])
classifiers['gnb'] = GaussianNB()

# Complex boundaries
classifiers['knn'] = Pipeline([
    ('scale', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])
classifiers['svc_rbf'] = Pipeline([
    ('scale', StandardScaler()),
    ('svc', SVC(kernel='rbf')),
])

In [None]:
# Fit every classifier on the training split and collect, where the final
# estimator exposes them, its feature importances or linear coefficients.
#
# importances[name]       -> 1-D array of importances / coefficients
# importance_labels[name] -> labels of the columns those numbers refer to
#
# The labels are tracked per model because a pipeline's final estimator does
# not necessarily see the raw columns of X: after PolynomialFeatures(2) it
# sees the expanded polynomial terms. Pairing those coefficients with
# X.columns would silently truncate the list (zip stops at the shorter input)
# and attach the wrong names to the values that remain.
importances = {}
importance_labels = {}

for model_name, model in classifiers.items():
    model.fit(X_train, y_train)

    is_pipeline = isinstance(model, Pipeline)
    # For pipelines, inspect the final step; otherwise the model itself.
    final_model = model[-1] if is_pipeline else model

    if hasattr(final_model, 'feature_importances_'):
        scores = final_model.feature_importances_
    elif hasattr(final_model, 'coef_'):
        scores = final_model.coef_[0]  # For linear models
    else:
        # Nothing to report for this estimator (no importances/coefficients).
        continue

    if is_pipeline:
        # Names of the columns produced by the transformer steps, i.e. the
        # inputs of the final estimator.
        labels = list(model[:-1].get_feature_names_out())
    else:
        labels = list(X_train.columns)

    if len(labels) != len(scores):
        raise ValueError(
            f"{model_name}: {len(scores)} coefficients but {len(labels)} feature labels"
        )

    importances[model_name] = scores
    importance_labels[model_name] = labels

# Print feature importances or coefficients ordered by absolute value.
# Coefficients of pipelines that start with a StandardScaler are expressed in
# standardized units, so magnitudes are not directly comparable with those of
# the models fitted on the raw columns.
for model_name, importance in importances.items():
    print(f"\nFeature importances for {model_name}:")
    sorted_importances = sorted(
        zip(importance_labels[model_name], importance),
        key=lambda pair: abs(pair[1]),
        reverse=True,
    )
    for feature_name, value in sorted_importances:
        print(f"{feature_name}: {value}")

In [None]:
# Test-set accuracy for each classifier, keyed by model name. The models are
# only used for prediction here; this cell does not fit them.
accs = {}
for model_name, model in classifiers.items():
    test_predictions = model.predict(X_test)
    accs[model_name] = accuracy_score(y_test, test_predictions)
print("Accuracies:", accs)

In [None]:
# Baseline check: for each course / course-combination column, predict
# GRAD = 1 exactly when the column equals 1 (and 0 otherwise), then score that
# guess against the held-out labels.

for column in gg.columns[2:]:
    just_guess = (X_test[column] == 1).astype(int)
    print('Accuracy from just guessing', column, accuracy_score(y_test, just_guess))