In [10]:
#import data handling libraries
import pandas as pd
import numpy as np

#import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

#import machine learning libraries
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import SelectKBest, f_classif

In [11]:
# Load the complete dataset from the CSV file in the working directory,
# then preview its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='lee_full_dataset.csv')
df.head(n=5)

Unnamed: 0,STUDENT,MAJOR,104,105,140,143,145,150,151,160,...,317,350,365,373,385,414,415,435,436,GRADUATE
0,0,Chemisty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,3,Liberal Arts and Sciences,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,4,Computer Science,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,5,Interior Design,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,7,Psychology,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [12]:
# Headline figures for the dataset: row count, number of distinct MAJOR
# values, and the mean of the GRADUATE column.
summary_stats = {
    'Total Number of Students:': df.shape[0],
    'Number of Majors:': df['MAJOR'].nunique(),
    'Graduation Rate:': df['GRADUATE'].mean(),
}
for stat_label, stat_value in summary_stats.items():
    print(stat_label, stat_value)

Total Number of Students: 9374
Number of Majors: 119
Graduation Rate: 0.5138681459355665


In [13]:
# Frequency of each MAJOR value, most common first; show the 25 largest.
maj_val = df['MAJOR'].value_counts(sort=True, ascending=False)
maj_val.head(25)

MAJOR
Psychology                        1434
Computer Science                  1054
Biology                            481
Mathematics                        454
Liberal Arts and Sciences          426
Chemisty                           297
Statistics                         261
Biochemistry                       249
Open Option                        235
English                            219
Management Information Systems     213
Criminal Justice                   202
History                            198
Comunication Studies               186
Physics                            185
Meteorology                        161
Economics                          158
Environmental Science              144
Political Science                  134
Kinesiology and Health             125
Genetics                           122
Marketing                          104
Criminal Justice Studies            98
Business                            96
Music                               94
Name: count, dtype:

In [14]:
# Course columns are taken by position: columns 2 through 29 of the frame.
courses = list(df.columns[2:30])

# For every course, report how many rows hold a nonzero entry.
for course_id in courses:
    course_values = df[course_id]
    nonzero_rows = np.count_nonzero(course_values)
    print('Total enrollment for', course_id, 'is', nonzero_rows)

Total enrollment for 104 is 1177
Total enrollment for 105 is 526
Total enrollment for 140 is 1303
Total enrollment for 143 is 1547
Total enrollment for 145 is 87
Total enrollment for 150 is 1084
Total enrollment for 151 is 99
Total enrollment for 160 is 347
Total enrollment for 165 is 2634
Total enrollment for 166 is 2226
Total enrollment for 201 is 703
Total enrollment for 207 is 875
Total enrollment for 265 is 1629
Total enrollment for 266 is 402
Total enrollment for 267 is 643
Total enrollment for 301 is 434
Total enrollment for 302 is 76
Total enrollment for 304 is 119
Total enrollment for 314 is 163
Total enrollment for 317 is 636
Total enrollment for 350 is 113
Total enrollment for 365 is 92
Total enrollment for 373 is 115
Total enrollment for 385 is 219
Total enrollment for 414 is 414
Total enrollment for 415 is 90
Total enrollment for 435 is 190
Total enrollment for 436 is 177


In [15]:
# For every course, report the share of nonzero entries that are strictly
# positive (positive count divided by nonzero count).
for course_id in courses:
    course_values = df[course_id]
    positive_rows = course_values.gt(0).sum()
    nonzero_rows = np.count_nonzero(course_values)
    print('Success rate for', course_id, 'is', positive_rows / nonzero_rows)

Success rate for 104 is 0.8530161427357689
Success rate for 105 is 0.8174904942965779
Success rate for 140 is 0.8503453568687643
Success rate for 143 is 0.8112475759534583
Success rate for 145 is 0.7701149425287356
Success rate for 150 is 0.8339483394833949
Success rate for 151 is 0.8282828282828283
Success rate for 160 is 0.8501440922190202
Success rate for 165 is 0.8523158694001519
Success rate for 166 is 0.8463611859838275
Success rate for 201 is 0.9345661450924608
Success rate for 207 is 0.8502857142857143
Success rate for 265 is 0.8661755678330264
Success rate for 266 is 0.9104477611940298
Success rate for 267 is 0.895800933125972
Success rate for 301 is 0.8732718894009217
Success rate for 302 is 0.9210526315789473
Success rate for 304 is 0.957983193277311
Success rate for 314 is 0.9141104294478528
Success rate for 317 is 0.9009433962264151
Success rate for 350 is 0.9380530973451328
Success rate for 365 is 0.9565217391304348
Success rate for 373 is 0.9391304347826087
Success rate 

In [16]:
# Features: every column except the identifier, the major and the target.
# Target: the GRADUATE column.
y = df['GRADUATE']
X = df.drop(columns=['STUDENT', 'MAJOR', 'GRADUATE'])

# Hold out 25% of the rows for testing. The split is shuffled with a fixed
# seed and stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    shuffle=True,
    random_state=42,
    test_size=0.25,
)

In [17]:
# Candidate models, keyed by a short name and grouped by the kind of decision
# boundary they can represent.
classifiers = {
    # Putting linear decision boundary classifiers first
    'lda' : LinearDiscriminantAnalysis(),
    'log_reg' : LogisticRegression(penalty=None, max_iter= 100000),
    'svc_linear' : LinearSVC(dual = 'auto'),

    # Quadratic boundaries
    'qda' : QuadraticDiscriminantAnalysis(),
    'lda_poly' : Pipeline([('scale', StandardScaler()),('poly',PolynomialFeatures(2)),('lda', LinearDiscriminantAnalysis())]),
    'log_reg_poly' : Pipeline([('scale', StandardScaler()),('poly',PolynomialFeatures(2)),('log_reg', LogisticRegression(penalty=None, max_iter= 100000))]),
    'gnb' : GaussianNB(),

    # Complex boundaries
    'knn' : Pipeline([('scale', StandardScaler()),('knn', KNeighborsClassifier())]),
    'svc_rbf' : Pipeline([('scale', StandardScaler()),('svc',SVC(kernel= 'rbf'))])
}

# importances[name]              -> 1-D array of importances / coefficients
# importance_feature_names[name] -> labels for that array, in the same order
# Only models whose final estimator exposes `feature_importances_` or `coef_`
# get an entry in either dict.
importances = {}
importance_feature_names = {}

for model_name, model in classifiers.items():
    model.fit(X_train, y_train)

    if hasattr(model, 'named_steps'):
        # For pipelines, the final step is the estimator that holds the weights.
        final_model = model.steps[-1][1]
        if len(model.steps) > 1:
            # The final estimator is fitted on the output of the preceding
            # transformers (e.g. the polynomial expansion), so its weights
            # must be labelled with the transformed feature names rather
            # than the raw column names.
            feature_names = model[:-1].get_feature_names_out(X_train.columns)
        else:
            feature_names = X_train.columns
    else:
        final_model = model
        feature_names = X_train.columns

    if hasattr(final_model, 'feature_importances_'):
        values = final_model.feature_importances_
    elif hasattr(final_model, 'coef_'):
        values = final_model.coef_[0]  # For linear models
    else:
        # Nothing to report for this model.
        continue

    # Fail loudly instead of letting zip() silently drop or mislabel entries.
    if len(values) != len(feature_names):
        raise ValueError(
            f"{model_name}: got {len(values)} values for "
            f"{len(feature_names)} feature names"
        )

    importances[model_name] = values
    importance_feature_names[model_name] = list(feature_names)

# Print feature importances or coefficients ordered by absolute value
for model_name, importance in importances.items():
    print(f"\nFeature importances for {model_name}:")
    labelled = zip(importance_feature_names[model_name], importance)
    sorted_importances = sorted(labelled, key=lambda pair: abs(pair[1]), reverse=True)
    for feature_name, value in sorted_importances:
        print(f"{feature_name}: {value}")


Feature importances for lda:
160: 0.19668494504926642
104: 0.17354961084234893
304: 0.1665037818276473
140: 0.1445857186745379
266: 0.1116586419483708
151: 0.0944955856083807
150: 0.0917605405237262
385: 0.09098954628629068
105: 0.09063973889608037
414: 0.0893947323804049
207: 0.08241399661152932
435: 0.077450389210422
302: 0.07343251557318274
373: 0.07276311867627754
317: 0.06256166443730399
143: -0.0562839853707329
350: 0.049506071197634366
201: -0.04494256074958322
145: -0.03361326713071107
436: -0.031233005508180763
314: 0.02700562615436856
265: 0.02266607585425074
267: -0.022493525704424693
166: -0.016844360574515563
301: 0.015352707756538178
165: 0.012318738438474438
365: -0.008417704490276623
415: -0.001352458993603174

Feature importances for log_reg:
304: 0.2616300939527771
160: 0.20527227280739918
104: 0.18177487360443292
140: 0.1533592408900054
266: 0.11514768042587754
302: 0.11121559680942003
435: 0.10806317694369703
414: 0.10739485219029746
385: 0.10363659217618405
373: 0

In [18]:
# Cross Validation
# 5-fold cross-validation of every candidate model on the training split.
# cv_results[name] holds the array of per-fold scores.
# The metric name is kept in one variable so that the printed label always
# matches what was actually computed.
cv_scoring = 'accuracy'
cv_results = {}
for model_name, model in classifiers.items():
    scores = cross_val_score(model,
                             X_train,
                             y_train,
                             cv=5,
                             scoring=cv_scoring)
    cv_results[model_name] = scores
    print(f"{model_name}: Mean {cv_scoring} = {scores.mean():.4f}, Std = {scores.std():.4f}")


lda: Mean -logloss = 0.6175, Std = 0.0079
log_reg: Mean -logloss = 0.6166, Std = 0.0075
svc_linear: Mean -logloss = 0.6166, Std = 0.0092
qda: Mean -logloss = 0.5498, Std = 0.0104
lda_poly: Mean -logloss = 0.6266, Std = 0.0044
log_reg_poly: Mean -logloss = 0.6233, Std = 0.0071
gnb: Mean -logloss = 0.5413, Std = 0.0088
knn: Mean -logloss = 0.6196, Std = 0.0105
svc_rbf: Mean -logloss = 0.6370, Std = 0.0063
