In [1]:
import pickle

# The star import stays ahead of the explicit sklearn import so that the
# explicitly imported StandardScaler is the binding that wins.
from sensitive_info import *
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the preprocessed dataset from disk and preview its first two rows.
preprocessed_csv = "./data/CS7_preprocessed.csv"
df = pd.read_csv(preprocessed_csv)
df.head(2)

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x41,x42,x43,x44,x45,x46,x47,x48,x49,y
0,-0.166563,-3.961588,4.621113,2.481908,-1.800135,0.804684,6.718751,-14.789997,-1.040673,-4.20495,...,-1.497117,5.414063,-2.325655,1.674827,-0.264332,60.781427,-7.689696,0.151589,-8.040166,0
1,-0.149894,-0.585676,27.839856,4.152333,6.426802,-2.426943,40.477058,-6.725709,0.896421,0.330165,...,36.29279,4.490915,0.762561,6.526662,1.007927,15.805696,-4.896678,-0.320283,16.719974,0


In [None]:
# Pass the frame through convert_strings_to_category and rebind `df` to the result.
# NOTE(review): presumably this casts string columns to pandas `category` dtype;
# the helper is not defined in this notebook - confirm against its source.
# Because `df` is overwritten, re-running this cell feeds it its own output.
df = convert_strings_to_category(df)
# Print the column / dtype / non-null summary so the conversion can be checked.
df.info()

In [4]:
# Partition the full frame into train / validation / test frames.
# The same seed (7742) is used throughout the notebook for reproducibility.
split_options = {
    "df": df,
    "target_column": "y",
    "test_size": 0.1,
    "val_size": 0.1,
    "random_state": 7742,
}
train_df, val_df, test_df = create_train_val_test_sets(**split_options)

Train Shape: (128000, 51)
Val Shape: (16000, 51)
Test Shape: (16000, 51)


In [5]:
# For each partition, separate the predictor columns from the target column
# (the target is converted to a NumPy array, the predictors stay a DataFrame).
target_col = "y"

X_train, y_train = train_df.drop(columns=target_col), train_df[target_col].to_numpy()
X_val, y_val = val_df.drop(columns=target_col), val_df[target_col].to_numpy()
X_test, y_test = test_df.drop(columns=target_col), test_df[target_col].to_numpy()

# Subset to features selected by DAI
X_train_dai, X_val_dai, X_test_dai = subset_to_dai_selected_features(
    X_train=X_train,
    X_val=X_val,
    X_test=X_test,
)

DAI Train Shape: (128000, 17)
DAI Val Shape: (16000, 17)
DAI Test Shape: (16000, 17)


In [6]:
# Preview the first two rows of the DAI-selected training features.
X_train_dai.head(2)

Unnamed: 0,x48,x23,x27,x20,x28,x46,x49,x37,x42,x12,x32,x7,x2,x38,x41,x6,x40
133862,-4.81828,-4.67667,10.35292,2.029764,-30.102992,32.780607,-14.191745,-72.62,0.582534,-10.735783,0.01,-15.072493,0.419611,-24.254114,-26.823132,0.610082,3.60433
57143,3.102613,1.519396,-12.122562,-4.19857,-16.416524,55.582887,34.763313,1918.51,6.188595,9.566781,0.0,2.656567,-0.427442,23.56976,26.06629,-0.621469,-20.189674


In [7]:
# Cost-based scorer. With greater_is_better=False scikit-learn negates the
# metric, so the values reported by the search are negative and "higher" still
# means "better" during model selection.
dollars_lost_scorer = make_scorer(
    score_func=average_dollars_scorer_sklearn,
    greater_is_better=False,
)

# All metrics tracked for every grid-search candidate, keyed by the name that
# appears in cv_results_ (e.g. "mean_test_<key>").
scorers_dict = {
    "accuracy": make_scorer(accuracy_score),
    "f025_score": make_scorer(fbeta_score, beta=0.25),
    "f05_score": make_scorer(fbeta_score, beta=0.5),
    "avg_dollars_lost_per_prediction": dollars_lost_scorer,
}

In [9]:
%%time 

mlp_pipe = Pipeline(steps=[("impute", SimpleImputer(strategy="mean")), 
                           ("scale", StandardScaler()),
                           ("model", MLPClassifier(random_state=7742, 
                                                   verbose=True, 
                                                   warm_start=True, 
                                                   early_stopping=True,
                                                   max_iter=100_000))])

mlp_grid = {"model__hidden_layer_sizes":[(200, 100,), (300, 200, 100,), 
                                         (400, 300, 200, 100), (512, 256, 128, 64,)], 
            "model__alpha":np.logspace(-7,-2, 20).tolist(), 
            "model__learning_rate_init":np.logspace(-5,-0.3, 40).tolist() + [0.01]}

cv_splitter = StratifiedKFold(n_splits=5, 
                              shuffle=True, 
                              random_state=7742)

gs_mlp = GridSearchCV(estimator=mlp_pipe, 
                  param_grid=mlp_grid, 
                  refit="avg_dollars_lost_per_prediction",
                  scoring=scorers_dict, 
                  cv=cv_splitter,
                  n_jobs=32, 
                  error_score="raise", 
                  return_train_score=True, 
                  verbose=2)

# gs_mlp.fit(X=X_train_dai, y=y_train)


# save_name = get_gs_save_name(model_name="MLP_SC")
# save_path=f"./models/{save_name}"
# with open(save_path, 'wb') as file:
#     pickle.dump(gs_mlp, file)

# PATH=f"./models/MLP_20221124_1303.pkl"
# gs_mlp = load_gs_from_pickle(pickle_filepath=PATH)

gs_df = gs_to_clean_df(gs_mlp.cv_results_, 
                       sort_metric="mean_test_avg_dollars_lost_per_prediction", 
                       sort_ascending=False)
gs_df.loc[:,[c for c in gs_df.columns if "std" not in c]].head()

Fitting 5 folds for each of 3280 candidates, totalling 16400 fits
Iteration 1, loss = 0.20068362
Validation score: 0.953281
Iteration 2, loss = 0.13670806
Validation score: 0.960781
Iteration 3, loss = 0.11808104
Validation score: 0.968125
Iteration 4, loss = 0.10666698
Validation score: 0.970781
Iteration 5, loss = 0.10078183
Validation score: 0.972969
Iteration 6, loss = 0.09594607
Validation score: 0.969844
Iteration 7, loss = 0.09242462
Validation score: 0.973281
Iteration 8, loss = 0.08824939
Validation score: 0.975313
Iteration 9, loss = 0.08782944
Validation score: 0.975469
Iteration 10, loss = 0.08558967
Validation score: 0.972969
Iteration 11, loss = 0.08358750
Validation score: 0.975859
Iteration 12, loss = 0.08323828
Validation score: 0.977187
Iteration 13, loss = 0.08189735
Validation score: 0.973125
Iteration 14, loss = 0.08094009
Validation score: 0.977656
Iteration 15, loss = 0.08057543
Validation score: 0.976016
Iteration 16, loss = 0.07900717
Validation score: 0.977109

Unnamed: 0,alpha,hidden_layer_sizes,learning_rate_init,params,mean_test_accuracy,rank_test_accuracy,mean_train_accuracy,mean_test_f025_score,rank_test_f025_score,mean_train_f025_score,mean_test_f05_score,rank_test_f05_score,mean_train_f05_score,mean_test_avg_dollars_lost_per_prediction,rank_test_avg_dollars_lost_per_prediction,mean_train_avg_dollars_lost_per_prediction
3260,0.01,"(512, 256, 128, 64)",0.003394,"{'model__alpha': 0.01, 'model__hidden_layer_si...",0.978914,3,0.983143,0.976775,2,0.981537,0.975765,1,0.980701,-1.148594,1,-0.91543
190,0.0,"(200, 100)",0.013594,"{'model__alpha': 1.8329807108324375e-07, 'mode...",0.977586,280,0.982436,0.977003,1,0.982342,0.975363,2,0.980947,-1.156406,2,-0.894414
1787,4.3e-05,"(512, 256, 128, 64)",0.007804,"{'model__alpha': 4.281332398719396e-05, 'model...",0.977414,388,0.982721,0.976766,3,0.982554,0.975134,8,0.981209,-1.167344,3,-0.882461
1545,2.3e-05,"(300, 200, 100)",0.023679,"{'model__alpha': 2.3357214690901213e-05, 'mode...",0.97825,44,0.981723,0.976488,4,0.980273,0.975297,4,0.979267,-1.1675,4,-0.980859
3097,0.005456,"(512, 256, 128, 64)",0.00448,"{'model__alpha': 0.005455594781168515, 'model_...",0.978977,1,0.983002,0.976096,13,0.980879,0.975337,3,0.980201,-1.173594,5,-0.941992


In [35]:
# Ascending sort puts the lowest mean CV accuracy first, so this shows the two
# worst-performing grid candidates (useful for spotting diverged configurations).
gs_df.sort_values(by="mean_test_accuracy").head(2)

Unnamed: 0,alpha,hidden_layer_sizes,learning_rate_init,params,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,mean_train_accuracy,std_train_accuracy,mean_test_f025_score,...,mean_test_f05_score,std_test_f05_score,rank_test_f05_score,mean_train_f05_score,std_train_f05_score,mean_test_avg_dollars_lost_per_prediction,std_test_avg_dollars_lost_per_prediction,rank_test_avg_dollars_lost_per_prediction,mean_train_avg_dollars_lost_per_prediction,std_train_avg_dollars_lost_per_prediction
611,1e-06,"(400, 300, 200, 100)",0.287723,"{'model__alpha': 6.158482110660267e-07, 'model...",0.598711,0.000146,3280,0.598701,0.000151,0.003264,...,0.00097,0.00092,3219,0.000776,0.000669,-8.036406,0.015282,3156,-8.036133,0.019175
2087,0.000144,"(400, 300, 200, 100)",0.287723,"{'model__alpha': 0.0001438449888287663, 'model...",0.598742,6.2e-05,3277,0.598756,1.5e-05,0.000659,...,0.000194,0.000238,3243,0.00034,0.000561,-8.028281,0.00606,3148,-8.027852,0.005125


In [10]:
# Score the grid search's selected model on the training and validation
# partitions; the helper returns one prediction frame per partition.
MODEL_NAME = "Gridsearch MLP (Scaled)"

refit_inputs = dict(
    gs=gs_mlp,
    X_train=X_train_dai,
    y_train=y_train,
    X_val=X_val_dai,
    y_val=y_val,
    model_name=MODEL_NAME,
)
mlpsc_train_pred_df, mlpsc_val_pred_df = get_val_performance_after_all_train_refit(**refit_inputs)


accuracy: Train=0.9840859375, Validation=0.9788125
f025_score: Train=0.9825588223779509, Validation=0.9806444405135414
f05_score: Train=0.9817747248018651, Validation=0.9783054915211421
avg_dollars_lost_per_prediction: Train=0.86453125, Validation=1.00375


In [14]:
# Summary statistics of the mean cross-validated cost metric over all grid
# candidates. Values are negative because the scorer is registered with
# greater_is_better=False, so the maximum is the best candidate.
gs_df["mean_test_avg_dollars_lost_per_prediction"].describe()

count    3280.000000
mean       -2.592662
std         2.359923
min       -17.220313
25%        -2.106523
50%        -1.645234
75%        -1.342813
max        -1.148594
Name: mean_test_avg_dollars_lost_per_prediction, dtype: float64

In [1]:
# gs_mlp.best_params_

In [None]:
%%time 

mlp_pipe = Pipeline(steps=[("impute", SimpleImputer(strategy="mean")), 
                           ("scale", StandardScaler()),
                           ("model", MLPClassifier(random_state=7742, 
                                                   verbose=True, 
                                                   warm_start=True, 
                                                   early_stopping=True,
                                                   max_iter=100_000))])

mlp_grid = {"model__hidden_layer_sizes":[(512, 256, 128, 64,), (1024, 512, 256, 128, 64,)], 
            "model__alpha":np.logspace(-2.1,-0.2, 20).tolist(), # Increased
            "model__learning_rate_init":np.logspace(-3,-2, 20).tolist() + [gs_mlp.best_params_['model__learning_rate_init']]}

cv_splitter = StratifiedKFold(n_splits=5, 
                              shuffle=True, 
                              random_state=7742)

gs_mlp2 = GridSearchCV(estimator=mlp_pipe, 
                  param_grid=mlp_grid, 
                  refit="avg_dollars_lost_per_prediction",
                  scoring=scorers_dict, 
                  cv=cv_splitter,
                  n_jobs=32, 
                  error_score="raise", 
                  return_train_score=True, 
                  verbose=2)

# gs_mlp2.fit(X=X_train_dai, y=y_train)


# save_name = get_gs_save_name(model_name="MLP_SC_Tune2")
# save_path=f"./models/{save_name}"
# with open(save_path, 'wb') as file:
#     pickle.dump(gs_mlp2, file)

# PATH=f"./models/MLP_20221124_1303.pkl"
# gs_mlp = load_gs_from_pickle(pickle_filepath=PATH)

gs_df2 = gs_to_clean_df(gs_mlp2.cv_results_, 
                       sort_metric="mean_test_avg_dollars_lost_per_prediction", 
                       sort_ascending=False)
gs_df2.loc[:,[c for c in gs_df2.columns if "std" not in c]].head()

Fitting 5 folds for each of 840 candidates, totalling 4200 fits
