In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: select seaborn's "whitegrid" style.
sns.set_style("whitegrid")

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the dataset, then separate the target column from the predictors.
# 'STUDENT' and 'MAJ' are dropped together with the target 'GRAD', so none of
# the three is used as a predictor.
df = pd.read_csv('lee_final_dataset_11_17.csv')
y = df['GRAD']
X = df.drop(columns=['STUDENT', 'MAJ', 'GRAD'])

# 75/25 train/test split, stratified on the target; the fixed seed makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [3]:
# Upper bound on how many coefficients are printed per model. The degree-2
# pipelines expose one coefficient per expanded polynomial term, which is far
# more than the number of input columns.
MAX_FEATURES_SHOWN = 30

# Candidate classifiers, grouped by the kind of decision boundary they can fit.
classifiers = {
    # Putting linear decision boundary classifiers first
    'lda': LinearDiscriminantAnalysis(),
    'log_reg': LogisticRegression(penalty=None, max_iter=100000),
    'svc_linear': LinearSVC(dual='auto'),

    # Quadratic boundaries
    'qda': QuadraticDiscriminantAnalysis(),
    'lda_poly': Pipeline([
        ('scale', StandardScaler()),
        ('poly', PolynomialFeatures(2)),
        ('lda', LinearDiscriminantAnalysis()),
    ]),
    'log_reg_poly': Pipeline([
        ('scale', StandardScaler()),
        ('poly', PolynomialFeatures(2)),
        ('log_reg', LogisticRegression(penalty=None, max_iter=100000)),
    ]),
    'gnb': GaussianNB(),

    # Complex boundaries
    'knn': Pipeline([('scale', StandardScaler()), ('knn', KNeighborsClassifier())]),
    'svc_rbf': Pipeline([('scale', StandardScaler()), ('svc', SVC(kernel='rbf'))]),
}

# model name -> coefficient / importance vector of the final estimator
importances = {}
# model name -> labels for that vector, in the same order
importance_feature_names = {}

for model_name, model in classifiers.items():
    model.fit(X_train, y_train)

    is_pipeline = hasattr(model, 'named_steps')
    # For pipelines, the coefficients live on the last step.
    final_model = model[-1] if is_pipeline else model

    if hasattr(final_model, 'feature_importances_'):
        values = final_model.feature_importances_
    elif hasattr(final_model, 'coef_'):
        # First row of coef_ only.
        # NOTE(review): presumably GRAD is binary so coef_ has a single row;
        # confirm, otherwise this reports just one class's coefficients.
        values = final_model.coef_[0]
    else:
        # No per-feature weights exposed by this estimator; nothing to report.
        continue

    if is_pipeline:
        # The final estimator sees the *transformed* features (for the poly
        # pipelines: bias, originals, squares and pairwise products), so the
        # labels must come from the fitted preprocessing steps rather than
        # from the raw column names.
        feature_names = list(model[:-1].get_feature_names_out())
    else:
        feature_names = list(X.columns)

    # zip() would silently truncate on a mismatch and mislabel the values.
    if len(feature_names) != len(values):
        raise ValueError(
            f"{model_name}: {len(values)} coefficients but "
            f"{len(feature_names)} feature names"
        )

    importances[model_name] = values
    importance_feature_names[model_name] = feature_names

# Print feature importances or coefficients ordered by absolute value.
# NOTE(review): 'lda', 'log_reg' and 'svc_linear' are fitted without a scaling
# step, so their coefficient magnitudes depend on each column's units and are
# not directly comparable as "importance" unless the columns share a scale.
for model_name, importance in importances.items():
    print(f"\nFeature importances for {model_name}:")
    ranked = sorted(
        zip(importance_feature_names[model_name], importance),
        key=lambda pair: abs(pair[1]),
        reverse=True,
    )
    for feature_name, value in ranked[:MAX_FEATURES_SHOWN]:
        print(f"{feature_name}: {value}")


Feature importances for lda:
160: 0.2139149646317243
104: 0.17179653542647036
304: 0.14012113433684725
140: 0.13527241526523082
266: 0.1291440188619155
150: 0.10444378060901896
207: 0.08666631613823239
105: 0.0806184634793855
373: 0.07596503982786074
435: 0.06932469745131165
414: 0.0681800264008768
317: 0.06806937784760178
350: 0.06581599220492798
385: 0.05670086389654027
314: 0.05443025556156666
143: -0.04268539704346301
302: 0.04059635178403889
265: 0.03044159427838767
436: -0.02977412702521382
301: 0.029391804598413895
267: -0.02852414759233579
165: 0.026977500623554203
201: -0.02250949307721884
151: 0.021172088952536167
166: -0.0199013186509885
415: 0.007707709677979346
145: -0.006414386635651497
365: -0.0023973371956744283

Feature importances for log_reg:
160: 0.22468264415518652
304: 0.19609735991595037
104: 0.1793230188336787
140: 0.14189663474306036
266: 0.1359031692678961
150: 0.10441579678485682
373: 0.10428780748710324
435: 0.09308642855226142
350: 0.09180668158299887
207:

In [4]:
# Test-set accuracy for every classifier.
# The models in `classifiers` are already fitted on (X_train, y_train) by the
# training loop in the cell that defines them, so they are not fitted a second
# time here; refitting on the same data only repeats the training cost.
accs = {
    model_name: accuracy_score(y_test, model.predict(X_test))
    for model_name, model in classifiers.items()
}

accs

{'lda': 0.6006219458018658,
 'log_reg': 0.6055086628165259,
 'svc_linear': 0.6006219458018658,
 'qda': 0.5575299866725899,
 'lda_poly': 0.6130608618391826,
 'log_reg_poly': 0.6117281208351844,
 'gnb': 0.5548645046645935,
 'knn': 0.6077298978231896,
 'svc_rbf': 0.6219458018658374}