In [1]:
#import data handling libraries
import pandas as pd
import numpy as np

#import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

#import machine learning libraries
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import SelectKBest, f_classif

In [3]:
# Read in the math majors data set
df = pd.read_csv('lee_math_dataset.csv')
df.head()

Unnamed: 0,STUDENT,165,166,201,265,266,267,301,302,304,...,317,350,365,373,385,414,415,435,436,GRADUATE
0,51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,1
1,75,2.0,3.0,4.0,4.0,0.0,5.0,6.0,0.0,0.0,...,5.0,0.0,8.0,0.0,8.0,7.0,0.0,7.0,8.0,1
2,86,0.0,0.0,2.0,0.0,0.0,0.0,3.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,4.0,0.0,3.0,2.0,1
3,94,0.0,0.0,1.0,0.0,0.0,0.0,2.0,3.0,0.0,...,1.0,0.0,1.0,2.0,0.0,2.0,0.0,0.0,0.0,1
4,128,0.0,0.0,2.0,1.0,0.0,3.0,5.0,0.0,7.0,...,3.0,4.0,0.0,0.0,0.0,8.0,0.0,7.0,0.0,1


In [4]:
# Get some basic summary stats
print('Total Number of Students:', len(df))
print('Graduation Rate:', df['GRADUATE'].mean() )

Total Number of Students: 453
Graduation Rate: 0.6114790286975718


In [7]:
# Total enrollment for each course
courses = df.columns.tolist()[1:20]
for x in courses:
    print('Total enrollment for', x , 'is', np.count_nonzero(df[x]) )

Total enrollment for 165 is 102
Total enrollment for 166 is 175
Total enrollment for 201 is 323
Total enrollment for 265 is 250
Total enrollment for 266 is 76
Total enrollment for 267 is 194
Total enrollment for 301 is 282
Total enrollment for 302 is 56
Total enrollment for 304 is 74
Total enrollment for 314 is 91
Total enrollment for 317 is 327
Total enrollment for 350 is 68
Total enrollment for 365 is 53
Total enrollment for 373 is 74
Total enrollment for 385 is 56
Total enrollment for 414 is 299
Total enrollment for 415 is 64
Total enrollment for 435 is 170
Total enrollment for 436 is 164


In [8]:
# Success Rate for each course
for x in courses:
    print('Success rate for', x , 'is', df[x].gt(0).sum()/np.count_nonzero(df[x]))

Success rate for 165 is 0.9509803921568627
Success rate for 166 is 0.9371428571428572
Success rate for 201 is 0.9504643962848297
Success rate for 265 is 0.948
Success rate for 266 is 0.9605263157894737
Success rate for 267 is 0.9484536082474226
Success rate for 301 is 0.8617021276595744
Success rate for 302 is 0.9107142857142857
Success rate for 304 is 0.9594594594594594
Success rate for 314 is 0.8901098901098901
Success rate for 317 is 0.8990825688073395
Success rate for 350 is 0.9558823529411765
Success rate for 365 is 0.9622641509433962
Success rate for 373 is 0.9324324324324325
Success rate for 385 is 0.9642857142857143
Success rate for 414 is 0.862876254180602
Success rate for 415 is 0.921875
Success rate for 435 is 0.9294117647058824
Success rate for 436 is 0.9634146341463414


In [10]:
# Create train-test split
X = df.drop(['STUDENT','GRADUATE'], axis=1)
y = df['GRADUATE']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25, 
                                                    random_state=42,
                                                    shuffle=True,
                                                    stratify=y)

In [11]:
classifiers = {
    # Putting linear decision boundary classifiers first
    'lda' : LinearDiscriminantAnalysis(),
    'log_reg' : LogisticRegression(penalty=None, max_iter= 100000),
     'svc_linear' : LinearSVC(dual = 'auto'),

    # Quadratic boundaries
    'qda' : QuadraticDiscriminantAnalysis(),
    'lda_poly' : Pipeline([('scale', StandardScaler()),('poly',PolynomialFeatures(2)),('lda', LinearDiscriminantAnalysis())]),
    'log_reg_poly' : Pipeline([('scale', StandardScaler()),('poly',PolynomialFeatures(2)),('log_reg', LogisticRegression(penalty=None, max_iter= 100000))]),
    'gnb' : GaussianNB(),

    # Complex boundaries
    'knn' : Pipeline([('scale', StandardScaler()),('knn', KNeighborsClassifier())]),   
    'svc_rbf' : Pipeline([('scale', StandardScaler()),('svc',SVC(kernel= 'rbf'))])
}
importances = {}

for model_name, model in classifiers.items():
    model.fit(X_train, y_train)
    if hasattr(model, 'named_steps'):
        # For pipelines, get the final step
        final_model = model.named_steps[list(model.named_steps.keys())[-1]]
    else:
        final_model = model
    
    if hasattr(final_model, 'feature_importances_'):
        importances[model_name] = final_model.feature_importances_
    elif hasattr(final_model, 'coef_'):
        importances[model_name] = final_model.coef_[0]  # For linear models
        
# Print feature importances or coefficients ordered by absolute value
for model_name, importance in importances.items():
    print(f"\nFeature importances for {model_name}:")
    sorted_importances = sorted(zip(X.columns, importance), key=lambda x: abs(x[1]), reverse=True)
    for feature_name, value in sorted_importances:
        print(f"{feature_name}: {value}")


Feature importances for lda:
304: 0.29480903463057434
373: 0.19857899745733695
166: -0.17591901600926874
201: -0.17441130077460523
302: 0.13346870535714114
267: 0.1074490981675274
317: 0.0885300002426222
414: 0.08537006440453646
435: 0.08462325750613356
385: 0.06903843637349816
436: 0.06901881859338377
415: 0.06348876437789831
266: 0.05263261493942331
314: 0.04638814970008905
265: -0.04258564927674052
350: 0.0424487906004509
165: -0.02415811430653786
301: 0.01801503805113098
365: -0.008387256955811185

Feature importances for log_reg:
304: 0.5313305218752841
373: 0.24997778062293594
302: 0.21629448127823378
166: -0.1821276519466776
201: -0.17300594479211837
435: 0.10316461472353484
385: 0.0906470114961204
415: 0.0871929281117465
267: 0.08497473313758966
436: 0.08011148411975524
317: 0.07795188176543578
414: 0.07558692345574804
265: -0.05281680957061115
314: 0.052477396774416964
350: 0.048398510075103006
266: 0.039158136780787486
365: 0.03632478567343711
165: 0.016889828377409394
301: 

In [12]:
cv_results = {}
for model_name, model in classifiers.items():
    scores = cross_val_score(model, 
                             X_train, 
                             y_train, 
                             cv=5,
                             scoring='accuracy')
    cv_results[model_name] = scores
    print(f"{model_name}: Mean -logloss = {scores.mean():.4f}, Std = {scores.std():.4f}")

# Print the cross-validation results
for model_name, scores in cv_results.items():
    print(f"{model_name}: Mean -logloss = {scores.mean():.4f}, Std = {scores.std():.4f}")

lda: Mean -logloss = 0.7581, Std = 0.0514
log_reg: Mean -logloss = 0.7493, Std = 0.0549
svc_linear: Mean -logloss = 0.7552, Std = 0.0496
qda: Mean -logloss = 0.7491, Std = 0.0646
lda_poly: Mean -logloss = 0.6873, Std = 0.0521
log_reg_poly: Mean -logloss = 0.7787, Std = 0.0467
gnb: Mean -logloss = 0.7019, Std = 0.0463
knn: Mean -logloss = 0.7877, Std = 0.0340
svc_rbf: Mean -logloss = 0.8023, Std = 0.0392
lda: Mean -logloss = 0.7581, Std = 0.0514
log_reg: Mean -logloss = 0.7493, Std = 0.0549
svc_linear: Mean -logloss = 0.7552, Std = 0.0496
qda: Mean -logloss = 0.7491, Std = 0.0646
lda_poly: Mean -logloss = 0.6873, Std = 0.0521
log_reg_poly: Mean -logloss = 0.7787, Std = 0.0467
gnb: Mean -logloss = 0.7019, Std = 0.0463
knn: Mean -logloss = 0.7877, Std = 0.0340
svc_rbf: Mean -logloss = 0.8023, Std = 0.0392
