In [13]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

In [14]:
# Load the raw data set. The path is relative, so the CSV must sit in the
# kernel's current working directory.
grades = pd.read_csv('NonMathMajors.csv')

In [15]:
# Target vector: the GRAD column.
y = grades['GRAD']

# Feature matrix: every remaining column except the ones listed here.
# NOTE(review): the excluded columns look like identifiers / outcome-derived
# fields (GRAD itself is the target) -- confirm against the data dictionary.
X = grades.drop(
    columns=['STUDENT', 'ENTRY_CCYY', 'SEM_CCYY.1', 'GRAD', 'GRAD_TIME', 'DEG_CD']
)

In [16]:
# Hold out 25% of the rows for testing. The split is shuffled with a fixed
# seed (reproducible) and stratified on y so both parts keep the same class
# proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    shuffle=True,
    stratify=y,
)


In [17]:
# Candidate models keyed by a short name, grouped by the kind of decision
# boundary they can learn. Dict insertion order is the order in which the
# models are later fitted and reported.
classifiers = {
    # --- Linear decision boundaries ---
    'lda': LinearDiscriminantAnalysis(),
    'log_reg': LogisticRegression(penalty=None, max_iter=100000),
    'svc_linear': LinearSVC(dual='auto'),

    # --- Quadratic boundaries ---
    'qda': QuadraticDiscriminantAnalysis(),
    'lda_poly': Pipeline(steps=[
        ('scale', StandardScaler()),
        ('poly', PolynomialFeatures(degree=2)),
        ('lda', LinearDiscriminantAnalysis()),
    ]),
    'log_reg_poly': Pipeline(steps=[
        ('scale', StandardScaler()),
        ('poly', PolynomialFeatures(degree=2)),
        ('log_reg', LogisticRegression(penalty=None, max_iter=100000)),
    ]),
    'gnb': GaussianNB(),

    # --- Complex boundaries ---
    'knn': Pipeline(steps=[
        ('scale', StandardScaler()),
        ('knn', KNeighborsClassifier()),
    ]),
    'svc_rbf': Pipeline(steps=[
        ('scale', StandardScaler()),
        ('svc', SVC(kernel='rbf')),
    ]),
}

In [20]:
# Fit every classifier on the training split, then report
#   (a) test-set accuracy for all of them, and
#   (b) coefficients / feature importances for the models whose final
#       estimator exposes `feature_importances_` or `coef_`.
#
# model name -> 1-D array of importances / coefficients
importances = {}
# model name -> feature names aligned element-by-element with `importances`.
# A pipeline's final estimator sees the *transformed* features (e.g. the
# PolynomialFeatures expansion: bias, linear, squared and interaction terms),
# so labelling its coefficients with the raw X columns would silently truncate
# the vector and attach the wrong names.
importance_feature_names = {}

for model_name, model in classifiers.items():
    model.fit(X_train, y_train)

    is_pipeline = hasattr(model, 'named_steps')
    # For pipelines, inspect the final step (the estimator itself).
    final_model = model.steps[-1][1] if is_pipeline else model

    if hasattr(final_model, 'feature_importances_'):
        importance = final_model.feature_importances_
    elif hasattr(final_model, 'coef_'):
        # Row 0 of the coefficient matrix.
        # NOTE(review): assumes a binary target so there is only one row -- confirm.
        importance = final_model.coef_[0]
    else:
        # Model exposes neither attribute; nothing to report for it.
        continue

    if is_pipeline:
        # Ask the fitted preprocessing steps (everything before the final
        # estimator) for the names of the features they output.
        names = model[:-1].get_feature_names_out(X_train.columns)
    else:
        names = X_train.columns

    importances[model_name] = importance
    importance_feature_names[model_name] = [str(name) for name in names]

accs = {model_name: accuracy_score(y_test, model.predict(X_test)) for model_name, model in classifiers.items()}

# Print accuracies
print("Accuracies:", accs)

# Print feature importances or coefficients ordered by absolute value.
# NOTE(review): models fitted without a scaling step have coefficients that
# depend on each feature's units, so |coefficient| is only a rough ranking.
for model_name, importance in importances.items():
    print(f"\nFeature importances for {model_name}:")
    names = importance_feature_names[model_name]
    # Guard against any future mismatch instead of letting zip() truncate.
    assert len(names) == len(importance), (
        f"{model_name}: {len(names)} feature names for {len(importance)} values"
    )
    sorted_importances = sorted(zip(names, importance), key=lambda pair: abs(pair[1]), reverse=True)
    for feature_name, value in sorted_importances:
        print(f"{feature_name}: {value}")

Accuracies: {'lda': 0.8777869529314616, 'log_reg': 0.8777869529314616, 'svc_linear': 0.8777869529314616, 'qda': 0.8352601156069365, 'lda_poly': 0.8757225433526011, 'log_reg_poly': 0.8773740710156895, 'gnb': 0.8129644921552436, 'knn': 0.8447563996696945, 'svc_rbf': 0.8769611890999174}

Feature importances for lda:
151: -1.0058811224310062
160: 0.4939033484722013
266: 0.3392349934019342
104: 0.3226622722942299
166: -0.2973710545958796
150: 0.28027780915069817
267: -0.23139245795134325
140: -0.08724608433932891
105: -0.07985312344528141
207: -0.06171367092664991
165: 0.04110866968772837
143: 0.030189241194664287
265: 0.02131744694119313

Feature importances for log_reg:
151: -0.8417418342875841
160: 0.5818568919570142
104: 0.3457684297822979
266: 0.3375931314537306
150: 0.29161152904353616
166: -0.2800151667946509
267: -0.19534476191134004
140: -0.0986617821252528
105: -0.08011805103018248
207: -0.06172200923828945
165: 0.03557441710603357
143: 0.025441094302742273
265: 0.0116391879682007

In [6]:
# Fit each classifier on the training split and record its accuracy on the
# held-out test split, keyed by model name.
accs = {}
for model_name, model in classifiers.items():
    model.fit(X_train, y_train)
    accs[model_name] = accuracy_score(y_test, model.predict(X_test))

# Last expression of the cell: display the accuracy table.
accs

{'lda': 0.8777869529314616,
 'log_reg': 0.8777869529314616,
 'svc_linear': 0.8777869529314616,
 'qda': 0.8352601156069365,
 'lda_poly': 0.8757225433526011,
 'log_reg_poly': 0.8773740710156895,
 'gnb': 0.8129644921552436,
 'knn': 0.8447563996696945,
 'svc_rbf': 0.8769611890999174}