In [1]:
#import data handling libraries
import pandas as pd
import numpy as np

#import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

#import machine learning libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import SelectKBest, f_classif

In [2]:
# Load the complete dataset from the CSV file that sits next to the notebook,
# then preview the first five rows to sanity-check the column layout.
df = pd.read_csv(filepath_or_buffer='full_dataset.csv')
df.head(n=5)

Unnamed: 0,STUDENT,GRADUATE,MAJOR,104,105,140,143,145,150,151,...,314,317,350,365,373,385,414,415,435,436
0,0,1,Chemisty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,1,Computer Science,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7,1,Psychology,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9,1,Physics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,8.0,0.0,0.0,0.0,5.5,0.0,0.0,0.0,0.0
4,10,0,Liberal Arts and Sciences,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Headline figures for the dataset: row count, number of distinct majors,
# and the mean of the GRADUATE column (reported as the graduation rate).
summary_stats = [
    ('Total Number of Students:', len(df)),
    ('Number of Majors:', df['MAJOR'].nunique()),
    ('Graduation Rate:', df['GRADUATE'].mean()),
]
for stat_label, stat_value in summary_stats:
    print(stat_label, stat_value)

Total Number of Students: 9014
Number of Majors: 115
Graduation Rate: 0.7883292655868649


In [4]:
# Count how many rows list each major (value_counts orders the result from
# most to least frequent) and display the 25 most common ones.
maj_val = df['MAJOR'].value_counts()
maj_val.head(25)

MAJOR
Psychology                        1462
Computer Science                   975
Mathematics                        472
Biology                            467
Liberal Arts and Sciences          360
Chemisty                           287
Statistics                         272
Biochemistry                       236
English                            216
Management Information Systems     201
Criminal Justice                   197
History                            190
Comunication Studies               186
Open Option                        184
Physics                            170
Economics                          164
Meteorology                        150
Environmental Science              143
Political Science                  130
Genetics                           122
Kinesiology and Health             113
Marketing                          101
Interdisciplinary Studies           99
Criminal Justice Studies            98
Music                               90
Name: count, dtype:

In [5]:
# Course columns are selected by position: columns 3 through 30 of the frame.
# NOTE(review): this hard-coded slice assumes the first three columns are the
# non-course fields and that exactly 28 course columns follow - confirm if the
# CSV layout ever changes.
courses = list(df.columns[3:31])

# A student is counted as enrolled in a course when their entry is nonzero.
for course in courses:
    n_enrolled = np.count_nonzero(df[course])
    print('Total enrollment for', course, 'is', n_enrolled)

Total enrollment for 104 is 1209
Total enrollment for 105 is 523
Total enrollment for 140 is 1299
Total enrollment for 143 is 1372
Total enrollment for 145 is 82
Total enrollment for 150 is 1063
Total enrollment for 151 is 114
Total enrollment for 160 is 354
Total enrollment for 165 is 2424
Total enrollment for 166 is 2094
Total enrollment for 201 is 708
Total enrollment for 207 is 877
Total enrollment for 265 is 1588
Total enrollment for 266 is 410
Total enrollment for 267 is 637
Total enrollment for 301 is 472
Total enrollment for 302 is 86
Total enrollment for 304 is 136
Total enrollment for 314 is 177
Total enrollment for 317 is 663
Total enrollment for 350 is 127
Total enrollment for 365 is 105
Total enrollment for 373 is 131
Total enrollment for 385 is 227
Total enrollment for 414 is 481
Total enrollment for 415 is 107
Total enrollment for 435 is 215
Total enrollment for 436 is 196


In [6]:
# Success rate per course: the share of nonzero entries that are strictly
# positive.
# NOTE(review): presumably a nonzero entry that is not positive marks an
# unsuccessful attempt - confirm against the dataset's encoding.
for course in courses:
    course_values = df[course]
    n_successful = (course_values > 0).sum()
    n_enrolled = np.count_nonzero(course_values)
    print('Success rate for', course, 'is', n_successful / n_enrolled)

Success rate for 104 is 0.8535980148883374
Success rate for 105 is 0.8260038240917782
Success rate for 140 is 0.850654349499615
Success rate for 143 is 0.8177842565597667
Success rate for 145 is 0.7804878048780488
Success rate for 150 is 0.8325493885230479
Success rate for 151 is 0.8421052631578947
Success rate for 160 is 0.8418079096045198
Success rate for 165 is 0.8543729372937293
Success rate for 166 is 0.8514804202483286
Success rate for 201 is 0.940677966101695
Success rate for 207 is 0.8597491448118586
Success rate for 265 is 0.8677581863979849
Success rate for 266 is 0.9073170731707317
Success rate for 267 is 0.8932496075353218
Success rate for 301 is 0.8707627118644068
Success rate for 302 is 0.8837209302325582
Success rate for 304 is 0.9264705882352942
Success rate for 314 is 0.8983050847457628
Success rate for 317 is 0.8959276018099548
Success rate for 350 is 0.9291338582677166
Success rate for 365 is 0.9523809523809523
Success rate for 373 is 0.9312977099236641
Success rate 

In [7]:
# Build the feature matrix and target, then hold out a test set.
# Features are every column except the student id, the major, and the target.
target_column = 'GRADUATE'
non_feature_columns = ['STUDENT', 'MAJOR', target_column]

X = df.drop(columns=non_feature_columns)
y = df[target_column]

# 25% of the rows go to the test set; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [9]:
# Candidate classifiers keyed by a short name. Models that expand or rescale
# the inputs are wrapped in a Pipeline so the preprocessing is fit together
# with the estimator.
classifiers = {
    # Putting linear decision boundary classifiers first
    'lda' : LinearDiscriminantAnalysis(),
    'log_reg' : LogisticRegression(penalty=None, max_iter=100000),
    'svc_linear' : LinearSVC(dual='auto'),

    # Quadratic boundaries
    'qda' : QuadraticDiscriminantAnalysis(),
    'lda_poly' : Pipeline([('scale', StandardScaler()),
                           ('poly', PolynomialFeatures(2)),
                           ('lda', LinearDiscriminantAnalysis())]),
    'log_reg_poly' : Pipeline([('scale', StandardScaler()),
                               ('poly', PolynomialFeatures(2)),
                               ('log_reg', LogisticRegression(penalty=None, max_iter=100000))]),
    'gnb' : GaussianNB(),

    # Complex boundaries
    'knn' : Pipeline([('scale', StandardScaler()), ('knn', KNeighborsClassifier())]),
    'svc_rbf' : Pipeline([('scale', StandardScaler()), ('svc', SVC(kernel='rbf'))]),
}

# model name -> 1-D array of feature_importances_ or the first row of coef_.
importances = {}
# model name -> names of the features the final estimator was actually fit on.
# For pipelines these are the transformer outputs (e.g. the polynomial terms),
# which can differ in number and meaning from the raw columns of X.
importance_feature_names = {}

for model_name, model in classifiers.items():
    model.fit(X_train, y_train)

    if isinstance(model, Pipeline):
        # The last step is the estimator; everything before it transforms the
        # inputs, so ask those steps what the estimator's input columns are.
        final_model = model[-1]
        feature_names = model[:-1].get_feature_names_out()
    else:
        final_model = model
        feature_names = X_train.columns

    if hasattr(final_model, 'feature_importances_'):
        importances[model_name] = final_model.feature_importances_
    elif hasattr(final_model, 'coef_'):
        importances[model_name] = final_model.coef_[0]  # For linear models
    else:
        # Estimator exposes neither attribute (e.g. QDA, GaussianNB, KNN,
        # RBF-kernel SVC): nothing to report for it.
        continue

    importance_feature_names[model_name] = list(feature_names)

# Print feature importances or coefficients ordered by absolute value
for model_name, importance in importances.items():
    feature_names = importance_feature_names[model_name]
    # zip() would silently truncate on a length mismatch and attach values to
    # the wrong names, so fail loudly instead.
    if len(feature_names) != len(importance):
        raise ValueError(
            f"{model_name}: {len(importance)} importance values but "
            f"{len(feature_names)} feature names"
        )

    print(f"\nFeature importances for {model_name}:")
    sorted_importances = sorted(
        zip(feature_names, importance),
        key=lambda pair: abs(pair[1]),
        reverse=True,
    )
    for feature_name, value in sorted_importances:
        print(f"{feature_name}: {value}")


Feature importances for lda:
145: -0.29166652524359304
104: 0.14129147576307471
160: 0.13637320150088017
105: 0.12071535374360685
150: 0.11418176733015645
385: 0.11270002473162337
151: 0.10903043014542368
266: 0.09294479262692026
140: 0.09160473332834779
317: 0.08326021052108545
207: 0.0810868950989915
304: 0.07742773497486231
314: 0.06796805299242234
435: 0.06299868470101025
415: 0.05848663368699742
373: 0.04872174938370264
436: 0.04129279168277617
143: -0.03941272108678133
302: 0.03590284247679015
165: 0.02331370221292569
301: 0.020882898771798684
414: 0.018813961329100425
350: -0.01863742498824377
201: -0.01785793173519084
166: -0.009886819851891984
265: -0.008976095580730942
365: -0.00825141648276761
267: -0.0009325317051656029

Feature importances for log_reg:
304: 0.2453109899274401
145: -0.22546934342234823
385: 0.2032154502607883
314: 0.2025080090582656
435: 0.16226354185669498
104: 0.1569271265557381
415: 0.14868449471568618
160: 0.14724562887549922
151: 0.14687284638285406
3

In [10]:
# Cross Validation
# 5-fold cross-validation of every candidate on the training split only, so
# the held-out test split is not used for model comparison.
# The metric name is kept in one variable and reused in the printed label so
# the report always states the metric that was actually computed.
cv_scoring = 'accuracy'

# model name -> array with one score per fold.
cv_results = {}
for model_name, model in classifiers.items():
    scores = cross_val_score(model,
                             X_train,
                             y_train,
                             cv=5,
                             scoring=cv_scoring)
    cv_results[model_name] = scores
    print(f"{model_name}: Mean {cv_scoring} = {scores.mean():.4f}, Std = {scores.std():.4f}")


lda: Mean -logloss = 0.7845, Std = 0.0023
log_reg: Mean -logloss = 0.7818, Std = 0.0041
svc_linear: Mean -logloss = 0.7862, Std = 0.0011
qda: Mean -logloss = 0.3541, Std = 0.0204
lda_poly: Mean -logloss = 0.7746, Std = 0.0029
log_reg_poly: Mean -logloss = 0.7741, Std = 0.0018
gnb: Mean -logloss = 0.3632, Std = 0.0165
knn: Mean -logloss = 0.7722, Std = 0.0067
svc_rbf: Mean -logloss = 0.7871, Std = 0.0024
