In [9]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

In [10]:
# Load the per-student grade table; the path is resolved relative to the
# notebook's working directory.
grades = pd.read_csv(filepath_or_buffer='NonMathMajors.csv')

In [11]:
# Feature matrix: every column except the identifier / date / outcome columns
# listed below. GRAD is the prediction target, so it is excluded from X.
# NOTE(review): GRAD_TIME and DEG_CD are presumably removed because they are
# only known once the outcome is known — confirm against the data dictionary.
X = grades.drop(
    columns=[
        'STUDENT',
        'ENTRY_CCYY',
        'SEM_CCYY.1',
        'GRAD',
        'GRAD_TIME',
        'DEG_CD',
    ]
)

# Target vector.
y = grades['GRAD']

In [12]:
# Hold out 25% of the rows for testing. The split is shuffled with a fixed
# seed for reproducibility and stratified on y so both partitions keep the
# same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    shuffle=True,
    random_state=42,
)


In [13]:
# Candidate models keyed by a short name, grouped by the shape of decision
# boundary they can express. Insertion order is the order used by the
# fitting / cross-validation loops that iterate over this dict.
#
# The degree-2 pipelines use include_bias=False: the downstream estimators
# fit their own intercept, so the constant column PolynomialFeatures adds by
# default would be an exact duplicate of it. With penalty=None that makes the
# logistic-regression solution non-identifiable, and for LDA it introduces a
# zero-variance (collinear) column.
classifiers = {
    # Putting linear decision boundary classifiers first
    'lda' : LinearDiscriminantAnalysis(),
    'log_reg' : LogisticRegression(penalty=None, max_iter= 100000),
    'svc_linear' : LinearSVC(dual = 'auto'),

    # Quadratic boundaries
    'qda' : QuadraticDiscriminantAnalysis(),
    'lda_poly' : Pipeline([
        ('scale', StandardScaler()),
        ('poly', PolynomialFeatures(2, include_bias=False)),
        ('lda', LinearDiscriminantAnalysis()),
    ]),
    'log_reg_poly' : Pipeline([
        ('scale', StandardScaler()),
        ('poly', PolynomialFeatures(2, include_bias=False)),
        ('log_reg', LogisticRegression(penalty=None, max_iter= 100000)),
    ]),
    'gnb' : GaussianNB(),

    # Complex boundaries
    # Distance- and kernel-based models are scaled first so that no single
    # feature dominates the distance computation.
    'knn' : Pipeline([('scale', StandardScaler()),('knn', KNeighborsClassifier())]),
    'svc_rbf' : Pipeline([('scale', StandardScaler()),('svc',SVC(kernel= 'rbf'))])
}

In [14]:
# Fit every candidate model on the training split and, for the models that
# expose them, collect the fitted importances / coefficients together with
# the names of the features those values actually refer to.
#
# importances[model_name]              -> 1-D array of importances / coefficients
# importance_feature_names[model_name] -> matching list of feature names
importances = {}
importance_feature_names = {}

for model_name, model in classifiers.items():
    model.fit(X_train, y_train)

    is_pipeline = hasattr(model, 'named_steps')
    if is_pipeline:
        # For pipelines, the estimator is the final step
        final_model = model.steps[-1][1]
    else:
        final_model = model

    if hasattr(final_model, 'feature_importances_'):
        values = final_model.feature_importances_
    elif hasattr(final_model, 'coef_'):
        # First row of coef_ only. For a two-class target that is the single
        # coefficient vector; with more classes it would be the first class
        # only. NOTE(review): GRAD looks binary — confirm.
        values = final_model.coef_[0]  # For linear models
    else:
        # Model exposes neither attribute (e.g. QDA, GaussianNB, kNN, RBF SVC).
        continue

    if is_pipeline:
        # The final estimator sees the *transformed* features (e.g. the
        # polynomial expansion), not the raw columns, so take the names from
        # the fitted transformer steps. Zipping against the raw column names
        # would silently truncate and mislabel the coefficients.
        names = list(model[:-1].get_feature_names_out())
    else:
        names = list(X_train.columns)

    if len(names) != len(values):
        raise ValueError(
            f"{model_name}: {len(values)} importances but {len(names)} feature names"
        )

    importances[model_name] = values
    importance_feature_names[model_name] = names


# Print feature importances or coefficients ordered by absolute value
for model_name, importance in importances.items():
    print(f"\nFeature importances for {model_name}:")
    sorted_importances = sorted(
        zip(importance_feature_names[model_name], importance),
        key=lambda pair: abs(pair[1]),
        reverse=True,
    )
    for feature_name, value in sorted_importances:
        print(f"{feature_name}: {value}")


Feature importances for lda:
151: -1.0058811224309507
160: 0.49390334847216977
266: 0.33923499340194685
104: 0.3226622722942135
166: -0.29737105459588287
150: 0.2802778091506886
267: -0.23139245795134047
140: -0.08724608433932464
105: -0.07985312344527754
207: -0.06171367092665122
165: 0.04110866968772742
143: 0.030189241194664655
265: 0.021317446941191025

Feature importances for log_reg:
151: -0.8417418342875841
160: 0.5818568919570145
104: 0.34576842978229605
266: 0.33759313145373093
150: 0.29161152904353715
166: -0.2800151667946509
267: -0.19534476191133926
140: -0.09866178212525177
105: -0.08011805103018194
207: -0.061722009238288945
165: 0.03557441710603366
143: 0.025441094302739112
265: 0.011639187968201925

Feature importances for svc_linear:
151: -0.21285853090675755
160: 0.10506933477277822
266: 0.0721715333002807
104: 0.06875325183467838
166: -0.06327508775966345
150: 0.05968676211110577
267: -0.04920675073418531
140: -0.018501058120744544
105: -0.0169283773680735
207: -0.0

In [15]:
# 5-fold cross-validated accuracy on the training split for every candidate
# model. cross_val_score fits clones of each estimator, so the objects stored
# in `classifiers` are not modified by this loop.
#
# cv_results[model_name] -> array of the 5 per-fold accuracy scores
cv_results = {}
for model_name, model in classifiers.items():
    scores = cross_val_score(model,
                             X_train,
                             y_train,
                             cv=5,
                             scoring='accuracy')
    cv_results[model_name] = scores
    # The metric is accuracy (see `scoring` above), so label it as such.
    # NOTE(review): compare these numbers against the majority-class rate of
    # y_train before reading them as model skill.
    print(f"{model_name}: Mean accuracy = {scores.mean():.4f}, Std = {scores.std():.4f}")

lda: Mean -logloss = 0.8779, Std = 0.0003
log_reg: Mean -logloss = 0.8779, Std = 0.0003
svc_linear: Mean -logloss = 0.8779, Std = 0.0003
qda: Mean -logloss = 0.8183, Std = 0.0261
lda_poly: Mean -logloss = 0.8731, Std = 0.0024
log_reg_poly: Mean -logloss = 0.8771, Std = 0.0009
gnb: Mean -logloss = 0.7971, Std = 0.0285
knn: Mean -logloss = 0.8556, Std = 0.0101
svc_rbf: Mean -logloss = 0.8780, Std = 0.0005
