In [1]:
#import data handling libraries
import pandas as pd
import numpy as np

#import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

#import machine learning libraries
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import SelectKBest, f_classif

In [2]:
# Load the general math courses data set from CSV and preview its first rows
df = pd.read_csv(filepath_or_buffer='gen_dataset.csv')
df.head(n=5)

Unnamed: 0,STUDENT,GRADUATE,MAJOR,104,105,140,143,145,150,151,160,165,166,207,265,266,267
0,0,1,Chemisty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,0.0
1,4,1,Computer Science,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,1.5,4.0,0.0,0.0,0.0
2,7,1,Psychology,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9,1,Physics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,1.5,0.0,3.0,0.0,4.0
4,10,0,Liberal Arts and Sciences,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0


In [3]:
# Headline numbers: row count, distinct majors, and mean of the GRADUATE flag
summary_stats = [
    ('Total Number of Students:', len(df)),
    ('Number of Majors:', df['MAJOR'].nunique()),
    ('Graduation Rate:', df['GRADUATE'].mean()),
]
for label, value in summary_stats:
    print(label, value)

Total Number of Students: 8779
Number of Majors: 115
Graduation Rate: 0.7866499601321335


In [4]:
# Majors ranked by number of students; show the 25 most common
maj_val = df['MAJOR'].value_counts()
maj_val.head(25)

MAJOR
Psychology                        1462
Computer Science                   954
Biology                            465
Liberal Arts and Sciences          348
Mathematics                        346
Chemisty                           283
Statistics                         254
Biochemistry                       236
English                            216
Management Information Systems     201
Criminal Justice                   197
History                            190
Comunication Studies               186
Open Option                        184
Economics                          155
Physics                            149
Meteorology                        146
Environmental Science              143
Political Science                  130
Genetics                           121
Kinesiology and Health             113
Marketing                          101
Interdisciplinary Studies           99
Criminal Justice Studies            98
Music                               90
Name: count, dtype:

In [7]:
# Enrollment per course: every nonzero entry in a course column is counted
courses = df.columns[3:17].tolist()
for course in courses:
    enrolled = np.count_nonzero(df[course])
    print('Total enrollment for', course, 'is', enrolled)

Total enrollment for 104 is 1209
Total enrollment for 105 is 523
Total enrollment for 140 is 1299
Total enrollment for 143 is 1372
Total enrollment for 145 is 82
Total enrollment for 150 is 1063
Total enrollment for 151 is 114
Total enrollment for 160 is 354
Total enrollment for 165 is 2424
Total enrollment for 166 is 2094
Total enrollment for 207 is 877
Total enrollment for 265 is 1588
Total enrollment for 266 is 410
Total enrollment for 267 is 637


In [8]:
# Success rate per course: share of nonzero entries that are strictly positive
for course in courses:
    grades = df[course]
    passed = grades.gt(0).sum()
    enrolled = np.count_nonzero(grades)
    print('Success rate for', course, 'is', passed / enrolled)

Success rate for 104 is 0.8535980148883374
Success rate for 105 is 0.8260038240917782
Success rate for 140 is 0.850654349499615
Success rate for 143 is 0.8177842565597667
Success rate for 145 is 0.7804878048780488
Success rate for 150 is 0.8325493885230479
Success rate for 151 is 0.8421052631578947
Success rate for 160 is 0.8418079096045198
Success rate for 165 is 0.8543729372937293
Success rate for 166 is 0.8514804202483286
Success rate for 207 is 0.8597491448118586
Success rate for 265 is 0.8677581863979849
Success rate for 266 is 0.9073170731707317
Success rate for 267 is 0.8932496075353218


In [9]:
# Build the modelling matrices: the target is the GRADUATE column, the
# features are every column left after removing the id, major and target
y = df['GRADUATE']
X = df.drop(columns=['STUDENT', 'MAJOR', 'GRADUATE'])

# Shuffled 75/25 split with a fixed seed, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, shuffle=True, stratify=y
)

In [10]:
# Candidate classifiers, grouped by the shape of their decision boundary.
classifiers = {
    # Putting linear decision boundary classifiers first
    'lda' : LinearDiscriminantAnalysis(),
    'log_reg' : LogisticRegression(penalty=None, max_iter= 100000),
    'svc_linear' : LinearSVC(dual = 'auto'),

    # Quadratic boundaries
    'qda' : QuadraticDiscriminantAnalysis(),
    'lda_poly' : Pipeline([('scale', StandardScaler()),('poly',PolynomialFeatures(2)),('lda', LinearDiscriminantAnalysis())]),
    'log_reg_poly' : Pipeline([('scale', StandardScaler()),('poly',PolynomialFeatures(2)),('log_reg', LogisticRegression(penalty=None, max_iter= 100000))]),
    'gnb' : GaussianNB(),

    # Complex boundaries
    'knn' : Pipeline([('scale', StandardScaler()),('knn', KNeighborsClassifier())]),
    'svc_rbf' : Pipeline([('scale', StandardScaler()),('svc',SVC(kernel= 'rbf'))])
}

# model name -> importance/coefficient vector of the fitted final estimator
importances = {}
# model name -> names of the columns that vector refers to (same length)
feature_names = {}

for model_name, model in classifiers.items():
    model.fit(X_train, y_train)

    # For pipelines the weights live on the last step, not on the pipeline
    is_pipeline = hasattr(model, 'named_steps')
    final_model = model[-1] if is_pipeline else model

    if hasattr(final_model, 'feature_importances_'):
        weights = final_model.feature_importances_
    elif hasattr(final_model, 'coef_'):
        weights = final_model.coef_[0]  # For linear models
    else:
        # Nothing to report for this model (no importances or coefficients)
        continue

    if is_pipeline:
        # The final estimator sees the *transformed* columns (e.g. the bias,
        # squares and interaction terms added by PolynomialFeatures), so the
        # names must come from the transformer steps. Zipping against the raw
        # column names would silently truncate and mislabel the coefficients.
        names = model[:-1].get_feature_names_out(X_train.columns)
    else:
        names = X_train.columns

    if len(names) != len(weights):
        raise ValueError(
            f"{model_name}: {len(names)} feature names for {len(weights)} weights"
        )

    importances[model_name] = weights
    feature_names[model_name] = list(names)

# Print feature importances or coefficients ordered by absolute value
for model_name, importance in importances.items():
    print(f"\nFeature importances for {model_name}:")
    ranked = sorted(zip(feature_names[model_name], importance),
                    key=lambda pair: abs(pair[1]),
                    reverse=True)
    for feature_name, value in ranked:
        print(f"{feature_name}: {value}")


Feature importances for lda:
104: 0.13002294799046238
266: 0.1283742340139836
150: 0.12671178664532234
160: 0.12391346626601993
105: 0.11741761539190534
267: 0.10307584814659536
140: 0.07611775885675542
145: -0.06494817309510709
207: 0.06480723417456802
143: -0.04710788038178403
151: 0.03989738168424871
165: 0.02819298586378857
265: 0.02021927955833471
166: -0.0161823677755165

Feature importances for log_reg:
266: 0.14318468375144652
104: 0.14316511558879844
150: 0.13523470083822606
160: 0.13385545259692636
105: 0.12292822390962674
267: 0.10671228005055532
140: 0.08605964733084145
145: -0.06597229425322335
207: 0.06582085862559109
143: -0.04673261859252703
151: 0.04628247128644626
165: 0.024694058811546144
265: 0.021417864308388386
166: -0.015268945884050046

Feature importances for svc_linear:
104: 0.04321019888367075
266: 0.04277978038638225
150: 0.04258479564425209
160: 0.040629774031052876
105: 0.03904826297493625
267: 0.03395936966640303
140: 0.02604889505715685
145: -0.02154967

In [11]:
# Cross Validation
# 5-fold cross-validation of every candidate on the training split only.
# The metric name is kept in one variable and reused in the printed label so
# the two cannot disagree (the label previously said "-logloss" while the
# scores were accuracies).
# NOTE(review): the graduation rate printed earlier is about 0.787, so compare
# these accuracies against the always-predict-graduate baseline.
cv_metric = 'accuracy'
cv_results = {}
for model_name, model in classifiers.items():
    scores = cross_val_score(model,
                             X_train,
                             y_train,
                             cv=5,
                             scoring=cv_metric)
    cv_results[model_name] = scores
    print(f"{model_name}: Mean {cv_metric} = {scores.mean():.4f}, Std = {scores.std():.4f}")

lda: Mean -logloss = 0.7824, Std = 0.0019
log_reg: Mean -logloss = 0.7814, Std = 0.0025
svc_linear: Mean -logloss = 0.7860, Std = 0.0010
qda: Mean -logloss = 0.6535, Std = 0.0589
lda_poly: Mean -logloss = 0.7810, Std = 0.0057
log_reg_poly: Mean -logloss = 0.7807, Std = 0.0044
gnb: Mean -logloss = 0.6548, Std = 0.0615
knn: Mean -logloss = 0.7778, Std = 0.0031
svc_rbf: Mean -logloss = 0.7865, Std = 0.0036
