In [1]:
# D Larue, 10/29/24
# Which courses predict graduation

# Data Processing
import pandas as pd
import matplotlib.pyplot as plt
import warnings

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression


In [21]:
# Load the cleaned data set
grades = pd.read_csv('newdata.csv')

# Split the data into features (X) and target (y).
# The listed columns are excluded from the predictors; DEG_CD is the target.
X = grades.drop(['STUDENT','ENTRY_CCYY','SEM_CCYY.1','DEG_CD','GRAD_TIME'], axis=1)
y = grades['DEG_CD']

# Hold out 20% of the rows as a test set. Stratifying on y keeps the class
# proportions the same in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42,
                                                    shuffle=True,
                                                    stratify=y)

# Feature names are taken from X itself rather than from a positional slice
# of grades.columns, so they always line up with the columns the models are
# fitted on (and therefore with feature_importances_ / coef_), regardless of
# where the dropped columns sit in the CSV.
fn = X.columns.tolist()
# Display names for the two classes.
# NOTE(review): presumably ordered to match the sorted DEG_CD values - confirm
# before passing to plot_tree(class_names=...).
cn=['not graduated','graduated']

In [23]:
# Baseline random forest: bootstrapped trees, each drawn from 70% of the
# training rows, with class weights recomputed for every bootstrap sample.
rf = RandomForestClassifier(
    bootstrap=True,
    max_samples=0.7,
    class_weight='balanced_subsample',
    random_state=42,
)
rf.fit(X_train, y_train)

# Score the fitted forest on the held-out rows
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy RandomForestClassifier:", accuracy)

Accuracy RandomForestClassifier: 0.6232106339468303


In [27]:
# Single-split decision tree (a "stump") as a simple baseline.
# random_state is fixed because scikit-learn randomly permutes the candidate
# features at each split, so ties between equally good splits would otherwise
# be broken differently from run to run. This also matches the settings used
# for the decision tree fitted on the recoded data further down.
clf = DecisionTreeClassifier(max_depth = 1, 
                             random_state = 42, 
                             criterion = 'gini') # alternative criterion: 'entropy'
clf.fit(X_train, y_train)

# Score the fitted tree on the held-out rows
y_pred=clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy DecisionTreeClassifier:", accuracy)


Accuracy DecisionTreeClassifier: 0.7341513292433538


In [None]:
# NOTE(review): bare literal in an unexecuted cell. It equals the decision
# tree accuracy printed above - presumably a pasted scratch value; confirm
# nothing relies on it before deleting this cell.
0.7341513292433538

In [7]:
# Rank the features by the importance the fitted random forest assigns them
importances = rf.feature_importances_

# One row per feature name, ordered from most to least important
feature_importance = (
    pd.DataFrame({'feature': fn, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

# Header line followed by the ranked table
print("Feature Importances:", feature_importance, sep="\n")


Feature Importances:
                feature  importance
5   INTRO TO PROBABILTY    0.087241
12    PREP FOR CALCULUS    0.082955
2            CALCULUS I    0.078074
1           CALCULUS II    0.069869
3   MATRCES&LINEAR ALGB    0.066392
0          CALCULUS III    0.063532
4       COLLEGE ALGEBRA    0.052776
11  DISC MATH BUS&SOC S    0.040773
8   THRY LINEAR ALGEBRA    0.039834
15    ELEM DIFF EQUATNS    0.038798
25   SURVEY OF CALCULUS    0.036778
9       INTRO TO PROOFS    0.031743
7    DIFF EQ & TRANSFMS    0.031600
13  INTRO TO MATH IDEAS    0.027616
20           ANALYSIS I    0.026790
30   BUS & SOC SCI CALC    0.020506
19   ABSTRACT ALGEBRA I    0.020056
14  ORIENTATION IN MATH    0.019808
24    UNDERGRAD SEMINAR    0.017793
6   INTR PART DIFF EQUA    0.017405
10  HIGH SCHOOL ALGEBRA    0.017388
37  LIFE SCI CALC&MDL I    0.015063
17  INTR THY PROBAB&S I    0.013509
40   APPLD TRIGONOMETRY    0.009527
18  INTRO PROBAB&MATRIC    0.009165
26         GRAPH THEORY    0.008123
21     

In [8]:
# Logistic regression baseline with class weights balanced against the
# class frequencies in the training target
lr = LogisticRegression(
    class_weight='balanced',
    random_state=42,
)
lr.fit(X_train, y_train)

# Score the fitted model on the held-out rows
y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy LogisticRegression:", accuracy)

Accuracy LogisticRegression: 0.6114519427402862


In [9]:
# Coefficients of the fitted logistic regression (first row of coef_)
coef = lr.coef_[0]

# Pair each feature name with the magnitude of its coefficient and rank the
# features from largest to smallest magnitude
lr_importance = (
    pd.DataFrame({'feature': fn, 'coefficient': abs(coef)})
    .sort_values(by='coefficient', ascending=False)
)

# Header line followed by the ranked table
print(
    "Logistic Regression Feature Importances (by coefficient magnitude):",
    lr_importance,
    sep="\n",
)


Logistic Regression Feature Importances (by coefficient magnitude):
                feature  coefficient
39  TEACH SEC SCHL MATH     1.180257
30   BUS & SOC SCI CALC     0.885704
24    UNDERGRAD SEMINAR     0.750702
37  LIFE SCI CALC&MDL I     0.743017
18  INTRO PROBAB&MATRIC     0.714024
36  NUMRC MTHDS DIFF EQ     0.618842
25   SURVEY OF CALCULUS     0.613351
5   INTRO TO PROBABILTY     0.609754
13  INTRO TO MATH IDEAS     0.488674
8   THRY LINEAR ALGEBRA     0.471956
15    ELEM DIFF EQUATNS     0.449634
38          ANALYSIS II     0.422136
17  INTR THY PROBAB&S I     0.372700
11  DISC MATH BUS&SOC S     0.371436
3   MATRCES&LINEAR ALGB     0.360283
35     DISCOVERING MATH     0.304468
14  ORIENTATION IN MATH     0.220100
4       COLLEGE ALGEBRA     0.214009
9       INTRO TO PROOFS     0.182393
42             TOPOLOGY     0.175226
20           ANALYSIS I     0.170897
32    INDEPENDENT STUDY     0.162395
16  INTR SCIENTF CMPTNG     0.157137
6   INTR PART DIFF EQUA     0.156901
33     

In [12]:
# Build a fresh feature matrix and target from the full data set
y_copy = grades['DEG_CD']
X_copy = grades.drop(
    columns=['STUDENT','ENTRY_CCYY','SEM_CCYY.1','DEG_CD','GRAD_TIME']
)

# Recode every feature column: 1 becomes 0, while 2 and 3 both become 1
X_copy = X_copy.replace({1: 0, 2: 1, 3: 1})

# Repeat the stratified 80/20 split on the recoded data.
# This rebinds X_train / X_test / y_train / y_test, so any cell run after
# this one sees the recoded features.
X_train, X_test, y_train, y_test = train_test_split(
    X_copy,
    y_copy,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y_copy,
)

In [14]:
# Refit the random forest with the same settings as the baseline, using
# whatever X_train / y_train currently hold, and report hold-out accuracy
rf = RandomForestClassifier(
    bootstrap=True,
    max_samples=0.7,
    class_weight='balanced_subsample',
    random_state=42,
)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy RandomForestClassifier (modified):", accuracy)

Accuracy RandomForestClassifier (modified): 0.6119631901840491


In [16]:
# Refit a single-split decision tree on whatever X_train / y_train currently
# hold, and report hold-out accuracy
clf = DecisionTreeClassifier(
    criterion='gini',  # alternative criterion: 'entropy'
    max_depth=1,
    random_state=42,
)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy DecisionTreeClassifier (modified):", accuracy)

Accuracy DecisionTreeClassifier (modified): 0.7341513292433538


In [19]:
# Refit the class-balanced logistic regression on whatever X_train / y_train
# currently hold, and report hold-out accuracy
lr = LogisticRegression(random_state=42, 
                        class_weight='balanced')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# The label carries "(modified)" like the sibling random forest and decision
# tree cells, so this result is distinguishable from the baseline logistic
# regression output.
print("Accuracy LogisticRegression (modified):", accuracy)

Accuracy LogisticRegression: 0.6053169734151329


In [20]:
# Feature rankings from the refitted random forest and logistic regression

# Random forest: rank features by importance, largest first
importances = rf.feature_importances_
rf_feature_importance = (
    pd.DataFrame({'feature': fn, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

# Logistic regression: rank features by coefficient magnitude, largest first
coef = lr.coef_[0]
lr_importance = (
    pd.DataFrame({'feature': fn, 'coefficient': abs(coef)})
    .sort_values(by='coefficient', ascending=False)
)

# Print each heading followed by its ranked table
print(
    "Random Forest Feature Importances (modified):",
    rf_feature_importance,
    "Logistic Regression Feature Importances (modified):",
    lr_importance,
    sep="\n",
)


Random Forest Feature Importances (modified):
                feature  importance
5   INTRO TO PROBABILTY    0.118500
3   MATRCES&LINEAR ALGB    0.063982
2            CALCULUS I    0.056811
11  DISC MATH BUS&SOC S    0.049982
4       COLLEGE ALGEBRA    0.047125
1           CALCULUS II    0.046047
0          CALCULUS III    0.045690
25   SURVEY OF CALCULUS    0.042232
8   THRY LINEAR ALGEBRA    0.042075
13  INTRO TO MATH IDEAS    0.039568
12    PREP FOR CALCULUS    0.039258
15    ELEM DIFF EQUATNS    0.039196
9       INTRO TO PROOFS    0.031360
24    UNDERGRAD SEMINAR    0.030138
7    DIFF EQ & TRANSFMS    0.028522
20           ANALYSIS I    0.027979
30   BUS & SOC SCI CALC    0.024930
14  ORIENTATION IN MATH    0.023921
6   INTR PART DIFF EQUA    0.021295
19   ABSTRACT ALGEBRA I    0.020862
17  INTR THY PROBAB&S I    0.020090
37  LIFE SCI CALC&MDL I    0.019489
10  HIGH SCHOOL ALGEBRA    0.015508
18  INTRO PROBAB&MATRIC    0.013279
26         GRAPH THEORY    0.009278
23          GEOMET