In [51]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from hyperopt import hp,fmin,tpe,STATUS_OK,Trials, space_eval

In [2]:
# Load the data
df1 = pd.read_csv("../data/01-raw/test_scores.csv")
df2 = pd.read_csv("../data/02-processed/normalized_data.csv")
df = pd.merge(df2, df1[["SUBJID", "State"]], how='left', on="SUBJID")
df.dropna(inplace=True)

# Clean the data
feature_cols = ["SAT", "GPA", "income"]
outcome_col = ["accepted"]

# Get dummy variables for school
df["school"] = df["school"].astype(np.int64)
dummy_school = pd.get_dummies(df["school"], prefix="school")
dummy_state = pd.get_dummies(df["State"], prefix="", prefix_sep="")
dummy_race = pd.get_dummies(df["race"], prefix="", prefix_sep="")
dummy_gender = pd.get_dummies(df["gender"], prefix="", prefix_sep="")
col_labels = feature_cols + list(dummy_school.columns) + list(dummy_state.columns) + list(dummy_race.columns) + list(dummy_gender.columns)
X = pd.concat([df[feature_cols], dummy_school, dummy_state, dummy_race, dummy_gender], axis=1).copy()

X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(
    X,
    df[outcome_col],
    random_state=17)

In [3]:
# Convert dataframes to np arrays
X_train = X_train_df.copy().to_numpy()
X_test  = X_test_df.copy().to_numpy()
y_train = y_train_df.copy().astype(np.int8).to_numpy().flatten()
y_test = y_test_df.copy().astype(np.int8).to_numpy().flatten()

In [40]:
rf = RandomForestClassifier(n_estimators=1_000,
                            n_jobs=-1,
                            random_state=17)
rf.fit(X_train, y_train)

In [41]:
y_pred = rf.predict(X_test)

In [44]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.47      0.36      0.41      6710
           1       0.75      0.83      0.79     15456

    accuracy                           0.69     22166
   macro avg       0.61      0.59      0.60     22166
weighted avg       0.67      0.69      0.67     22166



## Let's try with the imputed data from Chris

In [27]:
df = pd.read_csv("../data/final_frame_non_imputed.csv")
X = df.drop(["CHOICE", "ACCPT1ST", "Unnamed: 0"], axis=1)
Y = df["ACCPT1ST"].replace({1:0, 2:1})
X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(
    X,
    Y,
    random_state=17)
# Convert dataframes to np arrays
X_train = X_train_df.copy().to_numpy()
X_test  = X_test_df.copy().to_numpy()
y_train = y_train_df.copy().astype(np.int8).to_numpy().flatten()
y_test = y_test_df.copy().astype(np.int8).to_numpy().flatten()

Unnamed: 0,ACERECODE,OBEREGION,STRAT,STUDWGT,REASON05,REASON06,REASON08,REASON07,REASON10,REASON11,...,ParentalIncome,Areyourparentsalive?divorced?,Fatherseducation,Motherseducation,Firstgenerationstatusbasedonparent(s)withlessthansomecollege,Fatherscareeraggregated,Motherscareeraggregated,Yourfathersreligiouspreference,Yourmothersreligiouspreference,Areyouaveteran?
80025,2162,5.0,5,3.007467,3.0,2.0,3.0,3.0,3.0,2.0,...,11,0,1,2,0,7,1,3,3,0
13315,2887,8.0,6,2.185718,3.0,3.0,2.0,2.0,3.0,3.0,...,3,0,7,1,0,1,19,4,3,0
27470,935,3.0,1,16.472996,3.0,3.0,3.0,3.0,3.0,3.0,...,5,0,2,2,0,7,14,4,4,0
9728,2985,8.0,2,7.504052,3.0,2.0,3.0,3.0,3.0,3.0,...,5,0,1,1,0,7,7,3,2,0
68371,1265,3.0,23,4.343277,3.0,1.0,2.0,3.0,2.0,1.0,...,7,0,2,1,0,12,6,3,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25631,857,3.0,5,7.677224,3.0,3.0,3.0,3.0,3.0,3.0,...,12,1,2,2,0,15,4,3,1,0
42297,235,1.0,14,4.343651,2.0,2.0,2.0,2.0,3.0,3.0,...,3,0,2,2,0,1,10,0,3,0
34959,2050,5.0,2,11.984230,3.0,3.0,3.0,3.0,2.0,3.0,...,3,0,2,2,0,1,15,1,3,0
64753,918,3.0,3,7.381224,3.0,3.0,2.0,2.0,2.0,1.0,...,1,0,2,1,0,7,14,2,2,0


In [29]:
rf = RandomForestClassifier(n_jobs=-1)
rf.fit(X_train, y_train)

In [30]:
y_pred = rf.predict(X_test)

In [31]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.36      0.46      5648
           1       0.83      0.94      0.88     18998

    accuracy                           0.81     24646
   macro avg       0.74      0.65      0.67     24646
weighted avg       0.79      0.81      0.79     24646



## Bayesian Optimization

In [57]:
space = dict(
    # n_estimators = hp.choice("n_estimators", [10, 50, 100, 500, 1200]),
    n_estimators = hp.randint("n_estimators", 2, 100),
    criterion = hp.choice("criterion", ["entropy", "gini", "log_loss"]), 
    max_depth = hp.randint("max_depth", 10, 5000),
    min_samples_split = hp.uniform("min_samples_split", 0, 1),
    min_samples_leaf = hp.uniform("min_samples_leaf", 0, 1),
    max_features = hp.choice("max_features", ["sqrt", "log2", None])
)

def objective(space):
    model = RandomForestClassifier(
        n_estimators=space["n_estimators"],
        criterion=space["criterion"],
        max_depth=space["max_depth"],
        min_samples_split=space["min_samples_split"],
        min_samples_leaf=space["min_samples_leaf"],
        max_features=space["max_features"],
        n_jobs=-1
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # f1 = f1_score(y_test, y_pred, average="micro")
    # return {"loss": -f1, "status": STATUS_OK}
    accuracy = accuracy_score(y_test, y_pred)
    return {"loss": -accuracy, "status": STATUS_OK}

In [37]:
best_params = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,  # Tree of Parzen Estimators (Bayesian approach)
    max_evals=200
)

100%|██████████| 100/100 [05:06<00:00,  3.06s/trial, best loss: -0.7151042136605612]


In [54]:
space_eval(space, best_params)

{'criterion': 'gini',
 'max_depth': 2020,
 'max_features': None,
 'min_samples_leaf': 0.020216234142791834,
 'min_samples_split': 0.48097154920332075,
 'n_estimators': 10}

In [55]:
rf = RandomForestClassifier(**space_eval(space, best_params))
rf.fit(X_train, y_train)

In [56]:
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.57      0.25      0.34      6710
           1       0.74      0.92      0.82     15456

    accuracy                           0.72     22166
   macro avg       0.65      0.58      0.58     22166
weighted avg       0.69      0.72      0.67     22166

