In [1]:
#import data handling libraries
import pandas as pd
import numpy as np

#import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

#import machine learning libraries
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import SelectKBest, f_classif

In [2]:
# Load the math-majors records from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='math_dataset.csv')

# Preview the first five rows as a quick check that the columns parsed.
df.head(n=5)

Unnamed: 0,STUDENT,GRADUATE,165,166,201,265,266,267,301,302,...,314,317,350,365,373,385,414,415,435,436
0,51,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0
1,75,1,1.5,3.0,4.0,4.0,0.0,5.5,6.5,0.0,...,0.0,5.5,0.0,9.0,0.0,9.0,8.0,0.0,8.0,9.0
2,86,1,0.0,0.0,1.5,0.0,0.0,0.0,3.0,0.0,...,4.0,0.5,0.0,0.0,0.0,0.5,4.0,0.0,3.0,1.5
3,94,1,0.0,0.0,0.5,0.0,0.0,0.0,2.0,3.0,...,0.0,0.5,0.0,0.5,2.0,0.0,1.0,0.0,0.0,0.0
4,127,1,0.0,0.0,0.5,0.0,0.0,0.0,3.0,0.0,...,0.0,0.5,0.0,0.0,0.0,1.5,1.5,4.0,3.0,0.0


In [3]:
# Headline numbers for the cohort: row count and the mean of GRADUATE.
student_count = df.shape[0]
graduation_rate = df['GRADUATE'].mean()

print('Total Number of Students:', student_count)
print('Graduation Rate:', graduation_rate)

Total Number of Students: 472
Graduation Rate: 0.847457627118644


In [5]:
# Total enrollment for each course
# Course columns are every column other than the student id and the target.
# Selecting them by name (the same two names the train/test cell drops) avoids
# the positional slice [2:21], which would silently pick the wrong columns if
# the CSV ever gains, loses, or reorders a column.
NON_COURSE_COLUMNS = ['STUDENT', 'GRADUATE']
courses = [col for col in df.columns if col not in NON_COURSE_COLUMNS]

for x in courses:
    # Every non-zero entry in a course column is counted as one enrollment.
    print('Total enrollment for', x , 'is', np.count_nonzero(df[x]) )

Total enrollment for 165 is 100
Total enrollment for 166 is 176
Total enrollment for 201 is 333
Total enrollment for 265 is 260
Total enrollment for 266 is 81
Total enrollment for 267 is 202
Total enrollment for 301 is 320
Total enrollment for 302 is 65
Total enrollment for 304 is 89
Total enrollment for 314 is 103
Total enrollment for 317 is 352
Total enrollment for 350 is 79
Total enrollment for 365 is 64
Total enrollment for 373 is 86
Total enrollment for 385 is 67
Total enrollment for 414 is 350
Total enrollment for 415 is 79
Total enrollment for 435 is 191
Total enrollment for 436 is 183


In [6]:
# Per-course success rate: entries greater than zero divided by entries that
# are non-zero.
# NOTE(review): presumably zero means "never enrolled" and a positive value
# means a successful attempt -- confirm against the data dictionary.
for course in courses:
    course_values = df[course]
    enrolled_count = np.count_nonzero(course_values)
    positive_count = course_values.gt(0).sum()
    print('Success rate for', course, 'is', positive_count / enrolled_count)

Success rate for 165 is 0.95
Success rate for 166 is 0.9431818181818182
Success rate for 201 is 0.9579579579579579
Success rate for 265 is 0.9538461538461539
Success rate for 266 is 0.9629629629629629
Success rate for 267 is 0.9405940594059405
Success rate for 301 is 0.853125
Success rate for 302 is 0.8769230769230769
Success rate for 304 is 0.9325842696629213
Success rate for 314 is 0.8737864077669902
Success rate for 317 is 0.8948863636363636
Success rate for 350 is 0.9493670886075949
Success rate for 365 is 0.953125
Success rate for 373 is 0.9186046511627907
Success rate for 385 is 0.9552238805970149
Success rate for 414 is 0.8428571428571429
Success rate for 415 is 0.9367088607594937
Success rate for 435 is 0.9057591623036649
Success rate for 436 is 0.9453551912568307


In [7]:
# Split predictors from the target, then hold out a quarter of the rows.
# Predictors are every column except the student id and the target itself.
X = df.drop(columns=['STUDENT', 'GRADUATE'])
y = df['GRADUATE']

# Stratifying on y keeps the class proportions the same in both partitions;
# the fixed random_state makes the shuffle repeatable.
split_options = dict(test_size=0.25, random_state=42, shuffle=True, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [8]:
# Candidate classifiers keyed by a short name, grouped by the kind of decision
# boundary each one can learn.
classifiers = {
    # Putting linear decision boundary classifiers first
    'lda' : LinearDiscriminantAnalysis(),
    'log_reg' : LogisticRegression(penalty=None, max_iter= 100000),
    'svc_linear' : LinearSVC(dual = 'auto'),

    # Quadratic boundaries
    'qda' : QuadraticDiscriminantAnalysis(),
    'lda_poly' : Pipeline([('scale', StandardScaler()),('poly',PolynomialFeatures(2)),('lda', LinearDiscriminantAnalysis())]),
    'log_reg_poly' : Pipeline([('scale', StandardScaler()),('poly',PolynomialFeatures(2)),('log_reg', LogisticRegression(penalty=None, max_iter= 100000))]),
    'gnb' : GaussianNB(),

    # Complex boundaries
    'knn' : Pipeline([('scale', StandardScaler()),('knn', KNeighborsClassifier())]),
    'svc_rbf' : Pipeline([('scale', StandardScaler()),('svc',SVC(kernel= 'rbf'))])
}

# Upper bound on the number of terms printed per model. The raw-feature models
# have fewer terms than this, so they are still printed in full; the polynomial
# pipelines have far more terms and only their largest ones are shown.
MAX_FEATURES_SHOWN = 20

# model name -> importance / coefficient vector of the fitted final estimator
importances = {}
# model name -> names of the features that vector refers to (same order)
importance_feature_names = {}

for model_name, model in classifiers.items():
    model.fit(X_train, y_train)

    if isinstance(model, Pipeline):
        # The final estimator sees the output of the preceding transformers,
        # so its coefficients must be labelled with the transformed feature
        # names (e.g. the polynomial terms), not the raw column names.
        final_model = model[-1]
        feature_names = model[:-1].get_feature_names_out(X_train.columns)
    else:
        final_model = model
        feature_names = X_train.columns

    if hasattr(final_model, 'feature_importances_'):
        values = final_model.feature_importances_
    elif hasattr(final_model, 'coef_'):
        values = final_model.coef_[0]  # For linear models
    else:
        # Models exposing neither attribute are fitted but not reported.
        continue

    # zip() truncates silently, so fail loudly if the labels and the values
    # do not line up instead of printing mislabelled numbers.
    if len(values) != len(feature_names):
        raise ValueError(
            f"{model_name}: {len(values)} coefficients but "
            f"{len(feature_names)} feature names"
        )

    importances[model_name] = values
    importance_feature_names[model_name] = list(feature_names)

# Print feature importances or coefficients ordered by absolute value
for model_name, importance in importances.items():
    print(f"\nFeature importances for {model_name}:")
    ranked = sorted(
        zip(importance_feature_names[model_name], importance),
        key=lambda pair: abs(pair[1]),
        reverse=True,
    )
    for feature_name, value in ranked[:MAX_FEATURES_SHOWN]:
        print(f"{feature_name}: {value}")
    hidden_count = len(ranked) - MAX_FEATURES_SHOWN
    if hidden_count > 0:
        print(f"... ({hidden_count} smaller-magnitude terms not shown)")


Feature importances for lda:
266: 0.23278397147462782
267: 0.23256187190048838
385: 0.2049167902219841
304: 0.20483146954413328
373: 0.20381721913030998
435: 0.19548502061972364
317: 0.16995004326976781
166: -0.15594594313309157
201: -0.13909633304959698
415: 0.11719067434806499
165: -0.10393989079659291
414: 0.09742914549272325
302: 0.09559395604640789
365: 0.07020751096312995
350: -0.05967593302764393
436: 0.04423954841521614
301: 0.040572099861574874
265: -0.034696537406879675
314: 0.007929723666072

Feature importances for log_reg:
385: 16.895918441590137
304: 0.6502300635548561
373: 0.3979769242649966
302: 0.38169385366253505
435: 0.2826181967907263
266: 0.262791758990796
365: 0.24141545189222352
317: 0.22956801175130784
415: 0.16442946821407967
267: 0.1166624912193127
350: -0.10346704475192951
414: 0.09888276402980312
201: -0.09141409609621047
301: 0.06662520274770442
436: 0.056736394654530106
314: -0.04794686271779804
165: -0.04285931378911827
265: 0.018869796043679395
166: -0.

In [9]:
# 5-fold cross-validation of every candidate on the training partition.
# cv_results maps each model name to the array of per-fold scores.
cv_results = {}
for model_name, model in classifiers.items():
    scores = cross_val_score(model,
                             X_train,
                             y_train,
                             cv=5,
                             scoring='accuracy')
    cv_results[model_name] = scores
    # The scorer above is accuracy (higher is better), so the printed label
    # says "accuracy"; it previously read "-logloss", which did not match.
    print(f"{model_name}: Mean accuracy = {scores.mean():.4f}, Std = {scores.std():.4f}")

lda: Mean -logloss = 0.8418, Std = 0.0210
log_reg: Mean -logloss = 0.8869, Std = 0.0241
svc_linear: Mean -logloss = 0.8643, Std = 0.0196
qda: Mean -logloss = 0.9123, Std = 0.0356
lda_poly: Mean -logloss = 0.7685, Std = 0.0444




log_reg_poly: Mean -logloss = 0.9067, Std = 0.0148
gnb: Mean -logloss = 0.7197, Std = 0.1832
knn: Mean -logloss = 0.8927, Std = 0.0190
svc_rbf: Mean -logloss = 0.9096, Std = 0.0329
