In [1]:
# code by Peter Solis
# import dependencies
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
# read in data
df = pd.read_csv('../01 - Data Crunching/dataset.csv')

In [3]:
# translating our target into categorical values for these models to work with
# https://www.codementor.io/@agarrahul01/multiclass-classification-using-random-forest-on-scikit-learn-library-hkk4lwawu was referenced for how to make a tree / forest model with these in mind
target_names = pd.factorize(df['Target'])[1]
df['Target'] = pd.factorize(df['Target'])[0]
for i in range(len(target_names)):
    print(f'{i} - {target_names[i]}')

0 - Dropout
1 - Graduate
2 - Enrolled


In [4]:
# split based on data type
categorical_df = df.iloc[:,[0,1,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18]].copy()
numerical_df = df.iloc[:,[2,17,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33]].copy()
y = df[['Target']].copy()

In [5]:
# make non-binary categorical columns clearly categorical - you may want to also bin these in some cases
# more info on what each category means can be found here:
# https://www.mdpi.com/2306-5729/7/11/146
def categorize(cell):
    return f'cat_{cell}'
col_to_adjust = ['Marital status',
                 'Application mode',
                 'Course',
                 'Previous qualification',
                 'Nacionality',
                 "Mother's qualification",
                 "Father's qualification",
                 "Mother's occupation",
                 "Father's occupation"]
for col in col_to_adjust:
    categorical_df[col] = categorical_df[col].apply(categorize)
categorical_df.head()

Unnamed: 0,Marital status,Application mode,Course,Daytime/evening attendance,Previous qualification,Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,International
0,cat_1,cat_8,cat_2,1,cat_1,cat_1,cat_13,cat_10,cat_6,cat_10,1,0,0,1,1,0,0
1,cat_1,cat_6,cat_11,1,cat_1,cat_1,cat_1,cat_3,cat_4,cat_4,1,0,0,0,1,0,0
2,cat_1,cat_1,cat_5,1,cat_1,cat_1,cat_22,cat_27,cat_10,cat_10,1,0,0,0,1,0,0
3,cat_1,cat_8,cat_15,1,cat_1,cat_1,cat_23,cat_27,cat_6,cat_4,1,0,0,1,0,0,0
4,cat_2,cat_12,cat_3,0,cat_1,cat_1,cat_22,cat_28,cat_10,cat_10,0,0,0,1,0,0,0


In [6]:
# finish pre-processing categorical data
# I referenced the tables of most important variables found in the cell above, and decided to exclude some categorical data that didn't play a huge role and would greatly increase the number of columns
# important categorical data with tons of values
to_drop = ["Application mode",
           "Nacionality",
           "Mother's qualification",
           "Father's qualification"]
cat_df_2 = pd.get_dummies(categorical_df.drop(columns = to_drop, axis = 1))
# try using PCA to deal with the huge number of values, we'll give it twice the number of columns of the actual values
from sklearn.decomposition import PCA
pca = PCA(n_components = 10)
pca_cat_df = pd.DataFrame(pca.fit_transform(cat_df_2), columns = ['PCA1','PCA2','PCA3','PCA4','PCA5','PCA6','PCA7','PCA8','PCA9','PCA10'])

In [7]:
# these are the categorical columns that stay as one column
others = ['Daytime/evening attendance',
          'Displaced',
          'Educational special needs',
          'Debtor',
          'Tuition fees up to date',
          'Gender',
          'Scholarship holder',
          'International']
parsed_cat_df = pd.concat([pca_cat_df, categorical_df[others]], axis = 1)
parsed_cat_df

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,Daytime/evening attendance,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,International
0,-0.162155,0.028003,-0.872018,-0.003433,-0.282242,-0.504850,-0.167726,-0.306465,0.556036,-0.052341,1,1,0,0,1,1,0,0
1,-0.230441,-0.508810,-0.888226,-0.126296,-0.399005,0.016365,-0.087263,-0.038581,-0.104354,0.018103,1,1,0,0,0,1,0,0
2,0.442845,0.679985,-1.137401,-0.197740,-0.015074,0.027241,-0.076635,-0.162938,0.024400,0.336801,1,1,0,0,0,1,0,0
3,-0.720098,-0.190386,-0.003015,-0.299154,-0.633856,-0.212593,-0.346403,-0.191316,0.113821,-0.146014,1,1,0,0,1,0,0,0
4,1.459907,0.612459,0.796117,-0.487269,-0.179858,-0.687227,0.330812,-0.473650,-0.068596,0.049842,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,0.111364,-0.748511,-0.315097,0.578126,-0.167337,-0.398868,-0.376538,-0.099547,0.040162,0.014352,1,0,0,0,1,1,0,0
4420,0.261958,0.938418,-0.512705,-0.692237,-0.074575,0.536392,-0.633831,-0.258722,-0.224359,0.821964,1,1,0,1,0,0,0,1
4421,-0.288204,1.448425,0.096071,0.118628,0.315241,-0.259958,0.162107,0.441754,0.364938,0.167924,1,1,0,0,1,0,1,0
4422,-0.756697,0.173565,0.243676,0.278891,0.065781,0.158652,0.131793,-0.392761,0.089649,-0.129306,1,1,0,0,1,0,1,0


In [8]:
# finish preprocessing by scaling and doing PCA
scaled_num_df = pd.DataFrame(StandardScaler().fit_transform(numerical_df), columns = numerical_df.columns)
final_df = pd.concat([parsed_cat_df, scaled_num_df], axis = 1)
final_df.head()

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
0,-0.162155,0.028003,-0.872018,-0.003433,-0.282242,-0.50485,-0.167726,-0.306465,0.556036,-0.052341,...,-0.199273,-0.282442,-2.838337,-2.04263,-1.471527,-1.963489,-0.199441,-0.287638,0.124386,0.765761
1,-0.230441,-0.50881,-0.888226,-0.126296,-0.399005,0.016365,-0.087263,-0.038581,-0.104354,0.018103,...,-0.199273,-0.282442,-0.105726,-0.522682,0.518904,0.659562,-0.199441,0.876222,-1.105222,0.347199
2,0.442845,0.679985,-1.137401,-0.19774,-0.015074,0.027241,-0.076635,-0.162938,0.0244,0.336801,...,-0.199273,-0.282442,-0.105726,-2.04263,-1.471527,-1.963489,-0.199441,-0.287638,0.124386,0.765761
3,-0.720098,-0.190386,-0.003015,-0.299154,-0.633856,-0.212593,-0.346403,-0.191316,0.113821,-0.146014,...,-0.199273,-0.282442,-0.105726,0.490616,0.187165,0.41645,-0.199441,-0.813253,-1.466871,-1.375511
4,1.459907,0.612459,0.796117,-0.487269,-0.179858,-0.687227,0.330812,-0.47365,-0.068596,0.049842,...,-0.199273,-0.282442,-0.105726,-0.522682,0.518904,0.531608,-0.199441,0.876222,-1.105222,0.347199


In [9]:
# just testing w/ PCA again after all variables in
pca2 = PCA(n_components = 10)
pca2_df = pd.DataFrame(pca2.fit_transform(final_df))
sum(pca2.explained_variance_ratio_)

0.8241338598630407

In [10]:
# split data
X_train, X_test, y_train, y_test = train_test_split(final_df, y, random_state = 0)
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(pca2_df, y, random_state = 0)

In [11]:
# first model, tree
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)
tree_predict = tree_model.predict(X_test)

In [12]:
# results
print(classification_report(y_test, tree_predict, target_names = target_names))
confusion_matrix(y_test, tree_predict)

              precision    recall  f1-score   support

     Dropout       0.68      0.71      0.70       353
    Graduate       0.80      0.74      0.77       560
    Enrolled       0.31      0.35      0.33       193

    accuracy                           0.67      1106
   macro avg       0.60      0.60      0.60      1106
weighted avg       0.68      0.67      0.67      1106



array([[252,  36,  65],
       [ 58, 417,  85],
       [ 59,  66,  68]], dtype=int64)

In [13]:
# second model, tree w/ fully PCA data
tree_model_pca = DecisionTreeClassifier()
tree_model_pca.fit(X_train_pca, y_train_pca)
tree_predict_pca = tree_model_pca.predict(X_test_pca)

In [14]:
# results
print(classification_report(y_test_pca, tree_predict_pca, target_names = target_names))
confusion_matrix(y_test_pca, tree_predict_pca)

              precision    recall  f1-score   support

     Dropout       0.67      0.65      0.66       353
    Graduate       0.79      0.76      0.78       560
    Enrolled       0.32      0.36      0.34       193

    accuracy                           0.66      1106
   macro avg       0.59      0.59      0.59      1106
weighted avg       0.67      0.66      0.66      1106



array([[231,  50,  72],
       [ 53, 427,  80],
       [ 61,  62,  70]], dtype=int64)

In [15]:
# third model, forest
forest_model = RandomForestClassifier()
forest_model.fit(X_train, y_train['Target'].ravel())
forest_predict = forest_model.predict(X_test)

In [16]:
# results
print(classification_report(y_test, forest_predict, target_names = target_names))
confusion_matrix(y_test, forest_predict)

              precision    recall  f1-score   support

     Dropout       0.80      0.76      0.78       353
    Graduate       0.81      0.94      0.87       560
    Enrolled       0.55      0.34      0.42       193

    accuracy                           0.78      1106
   macro avg       0.72      0.68      0.69      1106
weighted avg       0.76      0.78      0.76      1106



array([[269,  49,  35],
       [ 16, 526,  18],
       [ 51,  77,  65]], dtype=int64)

In [17]:
# fourth model, forest w/ fully PCA data
forest_model_pca = RandomForestClassifier()
forest_model_pca.fit(X_train_pca, y_train_pca['Target'].ravel())
forest_predict_pca = forest_model_pca.predict(X_test_pca)

In [18]:
# results
print(classification_report(y_test_pca, forest_predict_pca, target_names = target_names))
confusion_matrix(y_test_pca, forest_predict_pca)

              precision    recall  f1-score   support

     Dropout       0.75      0.74      0.74       353
    Graduate       0.80      0.92      0.86       560
    Enrolled       0.49      0.29      0.36       193

    accuracy                           0.75      1106
   macro avg       0.68      0.65      0.65      1106
weighted avg       0.73      0.75      0.73      1106



array([[260,  59,  34],
       [ 20, 516,  24],
       [ 66,  71,  56]], dtype=int64)

In [19]:
# fifth model, forest with more estimators
forest_model_250 = RandomForestClassifier(n_estimators = 250)
forest_model_250.fit(X_train, y_train['Target'].ravel())
forest_predict_250 = forest_model_250.predict(X_test)

In [20]:
# results
print(classification_report(y_test, forest_predict_250, target_names = target_names))
confusion_matrix(y_test, forest_predict_250)

              precision    recall  f1-score   support

     Dropout       0.81      0.78      0.79       353
    Graduate       0.82      0.95      0.88       560
    Enrolled       0.59      0.33      0.42       193

    accuracy                           0.79      1106
   macro avg       0.74      0.69      0.70      1106
weighted avg       0.77      0.79      0.77      1106



array([[277,  43,  33],
       [ 15, 533,  12],
       [ 52,  77,  64]], dtype=int64)

In [21]:
# sixth model, Support Vector Machine
sv_model = SVC(kernel = 'linear')
sv_model.fit(X_train, y_train['Target'].ravel())
sv_predict = sv_model.predict(X_test)

In [22]:
# results
print(classification_report(y_test, sv_predict, target_names = target_names))
confusion_matrix(y_test, sv_predict)

              precision    recall  f1-score   support

     Dropout       0.82      0.71      0.76       353
    Graduate       0.80      0.94      0.87       560
    Enrolled       0.53      0.39      0.45       193

    accuracy                           0.77      1106
   macro avg       0.72      0.68      0.69      1106
weighted avg       0.76      0.77      0.76      1106



array([[250,  51,  52],
       [ 17, 527,  16],
       [ 38,  79,  76]], dtype=int64)

In [23]:
# seventh model, Support Vector Machine w/ fully PCA data
sv_model_pca = SVC(kernel = 'linear')
sv_model_pca.fit(X_train_pca, y_train_pca['Target'].ravel())
sv_predict_pca = sv_model_pca.predict(X_test_pca)

In [24]:
# results
print(classification_report(y_test_pca, sv_predict_pca, target_names = target_names))
confusion_matrix(y_test_pca, sv_predict_pca)

              precision    recall  f1-score   support

     Dropout       0.77      0.72      0.74       353
    Graduate       0.78      0.92      0.84       560
    Enrolled       0.52      0.32      0.39       193

    accuracy                           0.75      1106
   macro avg       0.69      0.65      0.66      1106
weighted avg       0.73      0.75      0.73      1106



array([[253,  62,  38],
       [ 29, 513,  18],
       [ 48,  84,  61]], dtype=int64)

In [28]:
# list of level of importance
sorted(zip(forest_model_250.feature_importances_, final_df.columns), reverse = True)

[(0.13165467910969564, 'Curricular units 2nd sem (approved)'),
 (0.09204203202250096, 'Curricular units 2nd sem (grade)'),
 (0.08332466227098662, 'Curricular units 1st sem (approved)'),
 (0.05966356353077985, 'Curricular units 1st sem (grade)'),
 (0.04391077088975062, 'PCA10'),
 (0.03727185835294234, 'PCA3'),
 (0.03544122742922339, 'PCA1'),
 (0.03479011881119552, 'Curricular units 2nd sem (evaluations)'),
 (0.03455234235014338, 'Tuition fees up to date'),
 (0.0336352838280952, 'PCA2'),
 (0.03337919063472985, 'PCA8'),
 (0.03234096279054048, 'Age at enrollment'),
 (0.031993836016008255, 'PCA4'),
 (0.031932012833623194, 'PCA6'),
 (0.031759296408460126, 'PCA7'),
 (0.030787883850150088, 'PCA9'),
 (0.030008314154021467, 'PCA5'),
 (0.029715182505264146, 'Curricular units 1st sem (evaluations)'),
 (0.020779321104547648, 'Curricular units 2nd sem (enrolled)'),
 (0.01986582988797949, 'GDP'),
 (0.019127803477052712, 'Curricular units 1st sem (enrolled)'),
 (0.019093349667788147, 'Unemployment rat

In [25]:
'''
Model performance in order:
1 - Random Forest w/ 250 predictors - 79%
2 - Random Forest w/ 100 predictors - 78%
3 - Support Vector Machine - 77%
4 - Random Forest w/ fully PCA data - 75%
5 - Support Vector Machine w/ fully PCA data - 75%
6 - Decision Tree - 67%
7 - Decision Tree w/ fully PCA data - 66%

Conclusion:
Using fully PCA data did hurt the outcome, but not quite as much as I expected. Random Forest seems to be the best model by far, and a few tests with different numbers of predictors found it wasn't worth using more than 200-250 or so. Interesting things to try would include:
    - Only focus on variables the original dataset users found most important in their models
    - Use all the original data dummied out, rather than using PCA to reduce the number of columns for huge categorical stuff
    - Don't drop any columns
    - Find and try other supervised learning models we didn't learn in class
'''

'\nModel performance in order:\n1 - \n'