In [11]:
#import data handling libraries
import pandas as pd
import numpy as np

#import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

#import machine learning libraries
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import SelectKBest, f_classif

In [12]:
# Load the general math courses data set from its CSV export
df = pd.read_csv(filepath_or_buffer='lee_gen_dataset.csv')
# Show the first five rows as the cell output (five is head's default)
df.head(n=5)

Unnamed: 0,STUDENT,MAJOR,104,105,140,143,145,150,151,160,165,166,207,265,266,267,GRADUATE
0,0,Chemisty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,0.0,1
1,3,Liberal Arts and Sciences,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
2,4,Computer Science,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,4.0,0.0,0.0,0.0,1
3,5,Interior Design,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,7,Psychology,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [13]:
# Basic summary statistics: row count, distinct majors, and the mean of the
# GRADUATE column (reported as the graduation rate)
summary_stats = {
    'Total Number of Students:': len(df),
    'Number of Majors:': df['MAJOR'].nunique(),
    'Graduation Rate:': df['GRADUATE'].mean(),
}
for stat_label, stat_value in summary_stats.items():
    print(stat_label, stat_value)

Total Number of Students: 9144
Number of Majors: 119
Graduation Rate: 0.5110454943132109


In [14]:
# Count students per major (value_counts sorts most frequent first)
maj_val = df['MAJOR'].value_counts()
# Display the 25 most common majors
maj_val.head(25)

MAJOR
Psychology                        1434
Computer Science                  1032
Biology                            479
Liberal Arts and Sciences          412
Mathematics                        336
Chemisty                           294
Biochemistry                       249
Statistics                         243
Open Option                        235
English                            219
Management Information Systems     213
Criminal Justice                   202
History                            198
Comunication Studies               186
Physics                            164
Meteorology                        157
Economics                          148
Environmental Science              144
Political Science                  134
Kinesiology and Health             125
Genetics                           121
Marketing                          104
Criminal Justice Studies            98
Business                            96
Music                               93
Name: count, dtype:

In [15]:
# Course columns are selected by position (columns 2 through 15 of the frame);
# enrollment for a course is the number of non-zero entries in its column
courses = list(df.columns[2:16])
for course in courses:
    enrolled = np.count_nonzero(df[course])
    print('Total enrollment for', course, 'is', enrolled)

Total enrollment for 104 is 1177
Total enrollment for 105 is 526
Total enrollment for 140 is 1303
Total enrollment for 143 is 1547
Total enrollment for 145 is 87
Total enrollment for 150 is 1084
Total enrollment for 151 is 99
Total enrollment for 160 is 347
Total enrollment for 165 is 2634
Total enrollment for 166 is 2226
Total enrollment for 207 is 875
Total enrollment for 265 is 1629
Total enrollment for 266 is 402
Total enrollment for 267 is 643


In [16]:
# Success rate per course: strictly positive entries divided by non-zero entries
for course in courses:
    n_positive = (df[course] > 0).sum()
    n_nonzero = np.count_nonzero(df[course])
    print('Success rate for', course, 'is', n_positive / n_nonzero)

Success rate for 104 is 0.8530161427357689
Success rate for 105 is 0.8174904942965779
Success rate for 140 is 0.8503453568687643
Success rate for 143 is 0.8112475759534583
Success rate for 145 is 0.7701149425287356
Success rate for 150 is 0.8339483394833949
Success rate for 151 is 0.8282828282828283
Success rate for 160 is 0.8501440922190202
Success rate for 165 is 0.8523158694001519
Success rate for 166 is 0.8463611859838275
Success rate for 207 is 0.8502857142857143
Success rate for 265 is 0.8661755678330264
Success rate for 266 is 0.9104477611940298
Success rate for 267 is 0.895800933125972


In [17]:
# Separate the target from the predictors: everything except the identifier,
# the major, and the target itself is used as a feature
y = df['GRADUATE']
X = df.drop(columns=['STUDENT', 'MAJOR', 'GRADUATE'])

# 75/25 split, shuffled with a fixed seed and stratified on the target so both
# partitions keep the same class balance
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, shuffle=True, stratify=y
)

In [18]:
# Candidate classifiers, grouped by the kind of decision boundary they can learn
classifiers = {
    # Putting linear decision boundary classifiers first
    'lda' : LinearDiscriminantAnalysis(),
    'log_reg' : LogisticRegression(penalty=None, max_iter= 100000),
     'svc_linear' : LinearSVC(dual = 'auto'),

    # Quadratic boundaries
    'qda' : QuadraticDiscriminantAnalysis(),
    'lda_poly' : Pipeline([('scale', StandardScaler()),('poly',PolynomialFeatures(2)),('lda', LinearDiscriminantAnalysis())]),
    'log_reg_poly' : Pipeline([('scale', StandardScaler()),('poly',PolynomialFeatures(2)),('log_reg', LogisticRegression(penalty=None, max_iter= 100000))]),
    'gnb' : GaussianNB(),

    # Complex boundaries
    'knn' : Pipeline([('scale', StandardScaler()),('knn', KNeighborsClassifier())]),   
    'svc_rbf' : Pipeline([('scale', StandardScaler()),('svc',SVC(kernel= 'rbf'))])
}

# model name -> 1-D array of importances / coefficients. Only models whose final
# estimator exposes `feature_importances_` or `coef_` get an entry.
importances = {}
# model name -> names of the features the final estimator was actually fitted on.
# For pipelines this is the output of the preceding transformers (e.g. the
# polynomial terms), NOT the raw columns of X.
importance_feature_names = {}

for model_name, model in classifiers.items():
    model.fit(X_train, y_train)
    if hasattr(model, 'named_steps'):
        # For pipelines, inspect the final step and ask the transformer steps
        # before it which feature names they produce.
        final_model = model[-1]
        seen_features = model[:-1].get_feature_names_out(X_train.columns)
    else:
        final_model = model
        seen_features = X_train.columns

    if hasattr(final_model, 'feature_importances_'):
        importances[model_name] = final_model.feature_importances_
    elif hasattr(final_model, 'coef_'):
        # First row of coef_ (the only row for a binary linear model)
        importances[model_name] = final_model.coef_[0]
    else:
        # Nothing to report for this model (e.g. QDA, GaussianNB, KNN, RBF SVC)
        continue
    importance_feature_names[model_name] = list(seen_features)

# Print feature importances or coefficients ordered by absolute value.
# Coefficients of pipelines that start with StandardScaler refer to the
# transformed features, not to the raw columns.
for model_name, importance in importances.items():
    print(f"\nFeature importances for {model_name}:")
    names = importance_feature_names[model_name]
    if len(names) != len(importance):
        # Never pair values with the wrong label: fall back to positional names
        names = [f"feature_{i}" for i in range(len(importance))]
    ranked = sorted(zip(names, importance), key=lambda pair: abs(pair[1]), reverse=True)
    for feature_name, value in ranked:
        print(f"{feature_name}: {value}")


Feature importances for lda:
160: 0.18728999819930592
266: 0.18113863886068943
104: 0.17842256471067233
140: 0.11610088332837531
105: 0.09644918582249726
150: 0.09375484725635014
145: -0.07813462569612281
207: 0.07325672095845182
267: 0.06453576516487226
143: -0.03504932516576837
265: 0.03222327026317216
166: -0.012299948003859296
151: -0.011362391442944066
165: -0.00020588687219289052

Feature importances for log_reg:
160: 0.1973797762670131
104: 0.18899264821101533
266: 0.18843306753937203
140: 0.12225454633250536
105: 0.09682009976979859
150: 0.0939540789931498
145: -0.079075581520165
207: 0.07265405611962245
267: 0.06371928446772211
143: -0.03394707284977092
265: 0.03261394795288073
166: -0.011879390861404495
151: -0.009771955343388386
165: 0.0006794694620200828

Feature importances for svc_linear:
160: 0.09020084317413549
266: 0.08727109489637677
104: 0.08599810750483755
140: 0.055965595246721206
105: 0.04653154901937378
150: 0.04515634605809703
145: -0.03761043618391857
207: 0.0

In [19]:
# Cross Validation: 5-fold CV of every candidate classifier on the training set.
# cv_results maps model name -> array of per-fold scores.
cv_scoring = 'accuracy'
cv_results = {}
for model_name, model in classifiers.items():
    scores = cross_val_score(model, 
                             X_train, 
                             y_train, 
                             cv=5,
                             scoring=cv_scoring)
    cv_results[model_name] = scores
    # The label is built from the scoring name so it cannot drift from the metric
    # actually computed (it previously claimed "-logloss" while scoring accuracy).
    print(f"{model_name}: Mean {cv_scoring} = {scores.mean():.4f}, Std = {scores.std():.4f}")

lda: Mean -logloss = 0.6060, Std = 0.0102
log_reg: Mean -logloss = 0.6048, Std = 0.0109
svc_linear: Mean -logloss = 0.6060, Std = 0.0102
qda: Mean -logloss = 0.5900, Std = 0.0058
lda_poly: Mean -logloss = 0.6145, Std = 0.0226
log_reg_poly: Mean -logloss = 0.6142, Std = 0.0182
gnb: Mean -logloss = 0.5846, Std = 0.0060
knn: Mean -logloss = 0.5910, Std = 0.0131
svc_rbf: Mean -logloss = 0.6290, Std = 0.0193
