In [1]:
#import data handling libraries
import pandas as pd
import numpy as np

#import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

#import machine learning libraries
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import SelectKBest, f_classif

In [2]:
# Load the major math courses data set from the CSV file next to the notebook.
# NOTE(review): the file appears to carry a saved index column (it shows up as
# "Unnamed: 0" in the preview) - confirm it is dropped before modelling.
df = pd.read_csv(filepath_or_buffer='maj_dataset.csv')
# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,STUDENT,GRADUATE,MAJOR,201,301,302,304,314,317,350,365,373,385,414,415,435,436
0,9,1,Physics,6.5,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,5.5,0.0,0.0,0.0,0.0
1,27,1,Computer Science,0.0,0.0,0.0,0.0,0.0,6.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,33,1,Nutritional Science,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,51,1,Mathematics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0
4,75,1,Mathematics,4.0,6.5,0.0,0.0,0.0,5.5,0.0,9.0,0.0,9.0,8.0,0.0,8.0,9.0


In [3]:
# Basic summary statistics: row count, number of distinct majors, and the
# mean of the GRADUATE column (reported as the graduation rate).
summary_stats = [
    ('Total Number of Students:', len(df)),
    ('Number of Majors:', df['MAJOR'].nunique()),
    ('Graduation Rate:', df['GRADUATE'].mean()),
]
for stat_label, stat_value in summary_stats:
    print(stat_label, stat_value)

Total Number of Students: 1040
Number of Majors: 53
Graduation Rate: 0.8769230769230769


In [4]:
# Tally how many rows fall under each major, then display the 25 most common.
maj_val = df['MAJOR'].value_counts()
maj_val.head(25)

MAJOR
Mathematics                                 447
Physics                                     125
Computer Science                            122
Statistics                                  113
Economics                                    39
Liberal Arts and Sciences                    35
Meteorology                                  21
Chemisty                                     21
Biochemistry                                  9
Psychology                                    9
Data Science                                  7
Political Science                             5
Finance                                       5
Biology                                       5
World Languages and Cultures                  5
Mechanical Engineering                        4
Chemical Engineering                          4
Sociology                                     4
Philosophy                                    4
Materials Engineering                         3
Music                             

In [6]:
# Total enrollment for each course.
# The course columns are taken by position (columns 3 through 16); a non-zero
# entry in a course column is counted as one enrollment.
# NOTE(review): the positional slice depends on the column layout produced by
# earlier cells - confirm it still selects exactly the course columns.
courses = df.columns[3:17].tolist()
for course in courses:
    enrolled = np.count_nonzero(df[course])
    print('Total enrollment for', course, 'is', enrolled)

Total enrollment for 201 is 708
Total enrollment for 301 is 472
Total enrollment for 302 is 86
Total enrollment for 304 is 136
Total enrollment for 314 is 177
Total enrollment for 317 is 663
Total enrollment for 350 is 127
Total enrollment for 365 is 105
Total enrollment for 373 is 131
Total enrollment for 385 is 227
Total enrollment for 414 is 481
Total enrollment for 415 is 107
Total enrollment for 435 is 215
Total enrollment for 436 is 196


In [7]:
# Success rate for each course: the share of non-zero entries (enrollments)
# whose value is strictly positive.
# NOTE(review): presumably non-positive, non-zero values encode unsuccessful
# attempts - confirm against the data dictionary.
for course in courses:
    course_values = df[course]
    succeeded = course_values.gt(0).sum()
    attempted = np.count_nonzero(course_values)
    print('Success rate for', course, 'is', succeeded / attempted)

Success rate for 201 is 0.940677966101695
Success rate for 301 is 0.8707627118644068
Success rate for 302 is 0.8837209302325582
Success rate for 304 is 0.9264705882352942
Success rate for 314 is 0.8983050847457628
Success rate for 317 is 0.8959276018099548
Success rate for 350 is 0.9291338582677166
Success rate for 365 is 0.9523809523809523
Success rate for 373 is 0.9312977099236641
Success rate for 385 is 0.933920704845815
Success rate for 414 is 0.8544698544698545
Success rate for 415 is 0.9252336448598131
Success rate for 435 is 0.9069767441860465
Success rate for 436 is 0.9438775510204082


In [8]:
# Build the feature matrix and target, then create the train-test split.
# Features: every column except the student id, the major label and the target.
X = df.drop(columns=['STUDENT', 'MAJOR', 'GRADUATE'])
y = df['GRADUATE']

# Hold out 25% of the rows for testing. The split is shuffled with a fixed
# seed and stratified on y so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [9]:
# Candidate classifiers, grouped by the kind of decision boundary they can learn.
classifiers = {
    # Putting linear decision boundary classifiers first
    'lda' : LinearDiscriminantAnalysis(),
    'log_reg' : LogisticRegression(penalty=None, max_iter= 100000),
    'svc_linear' : LinearSVC(dual = 'auto'),

    # Quadratic boundaries
    'qda' : QuadraticDiscriminantAnalysis(),
    'lda_poly' : Pipeline([('scale', StandardScaler()),('poly',PolynomialFeatures(2)),('lda', LinearDiscriminantAnalysis())]),
    'log_reg_poly' : Pipeline([('scale', StandardScaler()),('poly',PolynomialFeatures(2)),('log_reg', LogisticRegression(penalty=None, max_iter= 100000))]),
    'gnb' : GaussianNB(),

    # Complex boundaries
    'knn' : Pipeline([('scale', StandardScaler()),('knn', KNeighborsClassifier())]),
    'svc_rbf' : Pipeline([('scale', StandardScaler()),('svc',SVC(kernel= 'rbf'))])
}

# model name -> 1-D array of feature importances / coefficients of the fitted
# final estimator (only for models that expose them).
importances = {}
# model name -> names of the features the final estimator was actually fitted
# on. For pipelines this is the output of the preprocessing steps (e.g. the
# polynomial expansion), NOT the raw course columns.
importance_feature_names = {}

for model_name, model in classifiers.items():
    model.fit(X_train, y_train)

    is_pipeline = hasattr(model, 'named_steps')
    # For pipelines, the weights live on the final step.
    final_model = model[-1] if is_pipeline else model

    if hasattr(final_model, 'feature_importances_'):
        weights = final_model.feature_importances_
    elif hasattr(final_model, 'coef_'):
        weights = final_model.coef_[0]  # For linear models
    else:
        # No per-feature weights are exposed by this estimator; skip it.
        continue

    if is_pipeline and len(model.steps) > 1:
        # Ask the fitted preprocessing steps which columns they emit so each
        # weight is paired with the feature it really belongs to.
        feature_names = list(model[:-1].get_feature_names_out())
    else:
        feature_names = list(X_train.columns)

    # Guard against silently mislabelled or truncated output from zip().
    assert len(feature_names) == len(weights), (
        f"{model_name}: {len(weights)} weights but {len(feature_names)} feature names"
    )

    importances[model_name] = weights
    importance_feature_names[model_name] = feature_names

# Print feature importances or coefficients ordered by absolute value
for model_name, importance in importances.items():
    print(f"\nFeature importances for {model_name}:")
    ranked = sorted(
        zip(importance_feature_names[model_name], importance),
        key=lambda pair: abs(pair[1]),
        reverse=True,
    )
    for feature_name, value in ranked:
        print(f"{feature_name}: {value}")


Feature importances for lda:
317: 0.14628605906254882
385: 0.13976211872272887
201: -0.1396471474874959
304: 0.12552390219545356
373: 0.1013337766627766
435: 0.08695755763181177
302: 0.06966393475547701
436: 0.06655725179681059
414: 0.05644848205612395
415: 0.051627135771779784
314: 0.03913204218081851
301: 0.03191376932715583
365: 0.010895276414271932
350: -0.007596673269740272

Feature importances for log_reg:
304: 0.2335971375352764
373: 0.17236736539895064
385: 0.16141451837855267
302: 0.14600540288045086
317: 0.12500452724867073
435: 0.11661139829887177
415: 0.11112298004052715
436: 0.11107561622247315
201: -0.07865470849243718
365: 0.07156184101135903
414: 0.06919743995147568
314: 0.052297220835496
301: 0.04180182312116122
350: -0.008025989674205447

Feature importances for svc_linear:
304: 0.052635610828104355
385: 0.04164782568064212
373: 0.034986510325278886
317: 0.03485974215994851
302: 0.03163718452963182
201: -0.028448249060731366
435: 0.027182460886515734
415: 0.022852313

In [10]:
# 5-fold cross-validated accuracy on the training split for every candidate
# model. cv_results maps model name -> array of per-fold accuracy scores.
# NOTE(review): the classes are imbalanced (the summary cell reports a
# graduation rate of about 0.877), so a model that always predicts the majority
# class already reaches roughly that accuracy. Consider also reporting a metric
# such as 'balanced_accuracy' before ranking models.
cv_results = {}
for model_name, model in classifiers.items():
    scores = cross_val_score(
        model,
        X_train,
        y_train,
        cv=5,
        scoring='accuracy',
    )
    cv_results[model_name] = scores
    # The label must match the `scoring` argument above: these are accuracies,
    # not (negative) log-loss values.
    print(f"{model_name}: Mean accuracy = {scores.mean():.4f}, Std = {scores.std():.4f}")

lda: Mean -logloss = 0.8692, Std = 0.0087
log_reg: Mean -logloss = 0.8628, Std = 0.0065
svc_linear: Mean -logloss = 0.8692, Std = 0.0087
qda: Mean -logloss = 0.6846, Std = 0.0357
lda_poly: Mean -logloss = 0.8449, Std = 0.0169
log_reg_poly: Mean -logloss = 0.8436, Std = 0.0119
gnb: Mean -logloss = 0.6423, Std = 0.0318
knn: Mean -logloss = 0.8449, Std = 0.0248
svc_rbf: Mean -logloss = 0.8769, Std = 0.0026
