In [1]:
#import data handling libraries
import pandas as pd
import numpy as np

#import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

#import machine learning libraries
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import SelectKBest, f_classif

In [2]:
# Load the complete dataset from CSV into a DataFrame and preview the first rows.
DATA_PATH = 'lee_full_dataset.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,STUDENT,MAJOR,104,105,140,143,145,150,151,160,...,317,350,365,373,385,414,415,435,436,GRADUATE
0,0,Chemisty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,3,Liberal Arts and Sciences,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,4,Computer Science,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,5,Interior Design,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,7,Psychology,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [3]:
# Headline numbers for the dataset: row count, distinct values in MAJOR,
# and the mean of the GRADUATE column (reported as the graduation rate).
n_students = len(df)
n_majors = df['MAJOR'].nunique()
grad_rate = df['GRADUATE'].mean()

print('Total Number of Students:', n_students)
print('Number of Majors:', n_majors)
print('Graduation Rate:', grad_rate)

Total Number of Students: 9353
Number of Majors: 119
Graduation Rate: 0.5127766492034641


In [4]:
# Count rows per major (value_counts sorts by descending frequency) and
# display the 25 most common.
maj_val = df['MAJOR'].value_counts()
maj_val.head(25)

MAJOR
Psychology                        1424
Computer Science                  1054
Biology                            481
Mathematics                        453
Liberal Arts and Sciences          420
Chemisty                           296
Statistics                         260
Biochemistry                       249
Open Option                        235
English                            219
Management Information Systems     213
Criminal Justice                   202
History                            196
Comunication Studies               186
Physics                            185
Meteorology                        161
Economics                          158
Environmental Science              144
Political Science                  134
Kinesiology and Health             125
Genetics                           122
Marketing                          104
Criminal Justice Studies            98
Business                            96
Music                               94
Name: count, dtype:

In [5]:
# Per-course enrollment: the number of nonzero entries in each course column.
# The course columns are taken positionally (columns 2 through 29 of df).
# NOTE(review): presumably a zero entry means "never enrolled" -- confirm
# against the dataset description.
courses = df.columns.tolist()[2:30]
for course in courses:
    n_enrolled = np.count_nonzero(df[course])
    print('Total enrollment for', course, 'is', n_enrolled)

Total enrollment for 104 is 1171
Total enrollment for 105 is 524
Total enrollment for 140 is 1299
Total enrollment for 143 is 1546
Total enrollment for 145 is 87
Total enrollment for 150 is 1079
Total enrollment for 151 is 98
Total enrollment for 160 is 347
Total enrollment for 165 is 2634
Total enrollment for 166 is 2226
Total enrollment for 201 is 703
Total enrollment for 207 is 874
Total enrollment for 265 is 1628
Total enrollment for 266 is 402
Total enrollment for 267 is 643
Total enrollment for 301 is 433
Total enrollment for 302 is 76
Total enrollment for 304 is 118
Total enrollment for 314 is 163
Total enrollment for 317 is 635
Total enrollment for 350 is 113
Total enrollment for 365 is 91
Total enrollment for 373 is 114
Total enrollment for 385 is 219
Total enrollment for 414 is 413
Total enrollment for 415 is 89
Total enrollment for 435 is 190
Total enrollment for 436 is 177


In [6]:
# Per-course success rate: the fraction of nonzero entries in the course
# column that are strictly positive.
# NOTE(review): presumably nonzero == enrolled and positive == passed --
# confirm the encoding of unsuccessful attempts against the dataset description.
for course in courses:
    grades = df[course]
    n_positive = grades.gt(0).sum()
    n_nonzero = np.count_nonzero(grades)
    print('Success rate for', course, 'is', n_positive / n_nonzero)

Success rate for 104 is 0.8531169940222032
Success rate for 105 is 0.816793893129771
Success rate for 140 is 0.8498845265588915
Success rate for 143 is 0.8111254851228978
Success rate for 145 is 0.7701149425287356
Success rate for 150 is 0.8331788693234476
Success rate for 151 is 0.826530612244898
Success rate for 160 is 0.8501440922190202
Success rate for 165 is 0.8523158694001519
Success rate for 166 is 0.8463611859838275
Success rate for 201 is 0.9345661450924608
Success rate for 207 is 0.8501144164759725
Success rate for 265 is 0.8660933660933661
Success rate for 266 is 0.9104477611940298
Success rate for 267 is 0.895800933125972
Success rate for 301 is 0.8729792147806005
Success rate for 302 is 0.9210526315789473
Success rate for 304 is 0.9576271186440678
Success rate for 314 is 0.9141104294478528
Success rate for 317 is 0.9007874015748032
Success rate for 350 is 0.9380530973451328
Success rate for 365 is 0.967032967032967
Success rate for 373 is 0.9385964912280702
Success rate fo

In [7]:
# Build the feature matrix and target, then hold out 25% of the rows as a
# test set. The split is shuffled with a fixed seed and stratified on the
# target so both partitions keep the same class balance.
non_feature_cols = ['STUDENT', 'MAJOR', 'GRADUATE']
X = df.drop(columns=non_feature_cols)
y = df['GRADUATE']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [8]:
# Candidate classifiers, grouped by the kind of decision boundary they can learn.
classifiers = {
    # Putting linear decision boundary classifiers first
    'lda' : LinearDiscriminantAnalysis(),
    'log_reg' : LogisticRegression(penalty=None, max_iter= 100000),
    'svc_linear' : LinearSVC(dual = 'auto'),

    # Quadratic boundaries
    'qda' : QuadraticDiscriminantAnalysis(),
    'lda_poly' : Pipeline([('scale', StandardScaler()),('poly',PolynomialFeatures(2)),('lda', LinearDiscriminantAnalysis())]),
    'log_reg_poly' : Pipeline([('scale', StandardScaler()),('poly',PolynomialFeatures(2)),('log_reg', LogisticRegression(penalty=None, max_iter= 100000))]),
    'gnb' : GaussianNB(),

    # Complex boundaries
    'knn' : Pipeline([('scale', StandardScaler()),('knn', KNeighborsClassifier())]),
    'svc_rbf' : Pipeline([('scale', StandardScaler()),('svc',SVC(kernel= 'rbf'))])
}

# model name -> 1-D array of feature importances or (first-row) coefficients.
importances = {}
# model name -> names of the features those values refer to. For a pipeline
# these are the names of the columns the FINAL estimator was fitted on, which
# differ from the raw columns once PolynomialFeatures has expanded them.
importance_feature_names = {}

for model_name, model in classifiers.items():
    model.fit(X_train, y_train)

    if hasattr(model, 'named_steps'):
        # For pipelines, inspect the final step and ask the preceding
        # transformers what the columns they emit are called.
        final_model = model.steps[-1][1]
        feature_names = model[:-1].get_feature_names_out(X.columns)
    else:
        final_model = model
        feature_names = X.columns

    if hasattr(final_model, 'feature_importances_'):
        values = final_model.feature_importances_
    elif hasattr(final_model, 'coef_'):
        values = final_model.coef_[0]  # For linear models
    else:
        # Models without importances/coefficients (e.g. QDA, GNB, KNN, RBF SVC)
        # are fitted but not reported.
        continue

    # Guard against silently pairing values with the wrong names: zip() would
    # otherwise truncate to the shorter sequence without any error.
    assert len(feature_names) == len(values), (
        f"{model_name}: {len(values)} values but {len(feature_names)} feature names"
    )
    importances[model_name] = values
    importance_feature_names[model_name] = list(feature_names)

# Print feature importances or coefficients ordered by absolute value
for model_name, importance in importances.items():
    print(f"\nFeature importances for {model_name}:")
    named_values = zip(importance_feature_names[model_name], importance)
    sorted_importances = sorted(named_values, key=lambda pair: abs(pair[1]), reverse=True)
    for feature_name, value in sorted_importances:
        print(f"{feature_name}: {value}")


Feature importances for lda:
160: 0.22797166331827395
104: 0.17451267608440163
304: 0.14874930032038952
140: 0.14867632263808128
266: 0.11679489880793657
150: 0.1023238875604928
414: 0.09755342156682675
435: 0.08725676953779374
207: 0.08496605362648374
373: 0.0791451091282421
105: 0.07726782506993511
385: 0.07233873179630504
302: 0.0671856045443982
317: 0.061355721350489056
143: -0.05172650594308262
350: 0.04273363166305006
201: -0.04220473651934754
314: 0.03970949813640744
436: -0.031453334977110614
151: 0.025763038661751746
415: 0.02226939174097745
265: 0.02137312441309826
145: -0.02086539073087955
166: -0.010185280843562253
301: 0.009967389269456183
365: -0.006888295192371208
267: -0.005518766732736198
165: -0.0033297969044547484

Feature importances for log_reg:
160: 0.24468549802374842
304: 0.23611000074694732
104: 0.18305614445091364
140: 0.15896958947704576
435: 0.12584034790053963
266: 0.12270738045269654
414: 0.11610079292829915
373: 0.112833378860537
302: 0.10547201013102492

In [9]:
# Cross Validation
# 5-fold cross-validation of every candidate classifier on the training split.
# The metric is classification accuracy (scoring='accuracy'), so the printed
# label says "accuracy"; it previously claimed "-logloss", which is not what
# is being computed here.
cv_results = {}  # model name -> array of per-fold accuracy scores
for model_name, model in classifiers.items():
    scores = cross_val_score(model,
                             X_train,
                             y_train,
                             cv=5,
                             scoring='accuracy')
    cv_results[model_name] = scores
    print(f"{model_name}: Mean accuracy = {scores.mean():.4f}, Std = {scores.std():.4f}")


lda: Mean -logloss = 0.6158, Std = 0.0105
log_reg: Mean -logloss = 0.6155, Std = 0.0096
svc_linear: Mean -logloss = 0.6159, Std = 0.0107
qda: Mean -logloss = 0.5465, Std = 0.0138
lda_poly: Mean -logloss = 0.6317, Std = 0.0193
log_reg_poly: Mean -logloss = 0.6259, Std = 0.0180
gnb: Mean -logloss = 0.5394, Std = 0.0137
knn: Mean -logloss = 0.6052, Std = 0.0143
svc_rbf: Mean -logloss = 0.6263, Std = 0.0123
