In [1]:
from sensitive_info import *
# Render up to 500 columns when displaying wide DataFrames.
# Use the exact registered option key ("display.max_columns"): a dotted
# spelling such as "display.max.columns" is not a real key and only resolves
# through pandas' regex-based option matching.
pd.set_option("display.max_columns", 500)

In [2]:
# Read the preprocessed CS7 dataset from disk and preview its first two rows.
df = pd.read_csv(filepath_or_buffer="./data/CS7_preprocessed.csv")
df.head(n=2)

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x41,x42,x43,x44,x45,x46,x47,x48,x49,y
0,-0.166563,-3.961588,4.621113,2.481908,-1.800135,0.804684,6.718751,-14.789997,-1.040673,-4.20495,...,-1.497117,5.414063,-2.325655,1.674827,-0.264332,60.781427,-7.689696,0.151589,-8.040166,0
1,-0.149894,-0.585676,27.839856,4.152333,6.426802,-2.426943,40.477058,-6.725709,0.896421,0.330165,...,36.29279,4.490915,0.762561,6.526662,1.007927,15.805696,-4.896678,-0.320283,16.719974,0


In [None]:
# Convert every object-dtype column to pandas' categorical dtype, then print
# the frame summary so the resulting dtypes can be checked.
object_columns = df.select_dtypes(include=[object]).columns.tolist()

# Reassign through astype() instead of `df.loc[:, cols] = ...`: setting values
# through .loc writes into the existing columns and can keep their original
# object dtype, which would silently discard the conversion. An empty mapping
# (no object columns) leaves every dtype unchanged.
df = df.astype({column: "category" for column in object_columns})
df.info()

In [4]:
# Partition the full frame into train / validation / test sets, holding out
# 10% for validation and 10% for test. The helper comes from the star import
# at the top of the notebook.
train_df, val_df, test_df = create_train_val_test_sets(
    df=df,
    target_column="y",
    test_size=0.1,
    val_size=0.1,
    random_state=7742,
)

Train Shape: (128000, 51)
Val Shape: (16000, 51)
Test Shape: (16000, 51)


In [5]:
# Separate the feature columns from the target column "y" for each partition;
# targets are converted to NumPy arrays.
X_train, y_train = train_df.drop(columns=["y"]), train_df["y"].to_numpy()
X_val, y_val = val_df.drop(columns=["y"]), val_df["y"].to_numpy()
X_test, y_test = test_df.drop(columns=["y"]), test_df["y"].to_numpy()

# Keep only the feature columns selected by DAI.
X_train_dai, X_val_dai, X_test_dai = subset_to_dai_selected_features(
    X_train=X_train,
    X_val=X_val,
    X_test=X_test,
)

DAI Train Shape: (128000, 17)
DAI Val Shape: (16000, 17)
DAI Test Shape: (16000, 17)


In [6]:
# Metrics evaluated for every grid-search candidate. Each key becomes part of
# the corresponding cv_results_ column names (e.g. "mean_test_<key>").
scorers_dict = {
    "accuracy": "accuracy",
    # F-beta scores with beta < 1.
    "f025_score": make_scorer(fbeta_score, beta=0.25),
    "f05_score": make_scorer(fbeta_score, beta=0.5),
    # Registered with greater_is_better=False, so this metric shows up negated
    # in the search results.
    "avg_dollars_lost_per_prediction": make_scorer(
        score_func=average_dollars_scorer_sklearn,
        greater_is_better=False,
    ),
    # Passed as a bare callable rather than through make_scorer.
    # NOTE(review): presumably reports the early-stopping best iteration of the
    # fitted LightGBM step — confirm against the helper's definition.
    "best_iter_early_stopping": get_lgbm_best_iter_sklearn,
}

In [7]:
%%time

# Hyper-parameter grid for the pipeline step named "model" (hence the
# "model__" prefix on every key). Single-element lists pin a value;
# multi-element lists are the dimensions actually searched.
parameter_grid = {
    "model__boosting_type": ["gbdt"],
    "model__colsample_bytree": [0.9, 1.0],
    "model__bagging_seed": [7742],
    "model__feature_fraction_seed": [7743],
    "model__importance_type": ["gain"],
    "model__learning_rate": [0.005, 0.0055, 0.006, 0.0065, 0.007, 0.0075, 0.008, 0.0085],
    "model__max_bin": [510, 1020],
    "model__max_delta_step": [0.0],
    "model__max_depth": [-1, 30],
    "model__num_leaves": [1024, 2048],
    "model__min_child_samples": [20],
    "model__min_child_weight": [0.001],
    "model__min_data_in_bin": [1],
    "model__min_split_gain": [0.0],
    "model__n_estimators": [7_500],
    "model__objective": ["binary"],
    "model__reg_alpha": [0.0],
    "model__reg_lambda": [1.0, 2.0, 5.0],
    "model__scale_pos_weight": [1.0],
    "model__seed": [534401655],
    "model__subsample": [0.7, 0.8],
    "model__subsample_freq": [1],
}

# Five shuffled, stratified folds with a fixed seed.
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=7742)

# Mean-impute missing values, then fit the LightGBM classifier.
model_pipe = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="mean")),
        ("model", LGBMClassifier(n_jobs=32, random_state=7742)),
    ]
)

# Multi-metric search; the final refit selects on the dollars-lost metric.
gs_no_mono = GridSearchCV(
    estimator=model_pipe,
    param_grid=parameter_grid,
    refit="avg_dollars_lost_per_prediction",
    scoring=scorers_dict,
    cv=cv_splitter,
    n_jobs=32,
    error_score="raise",
    return_train_score=True,
    verbose=2,
)

# The search is not re-run here. The fit and pickling steps below are kept
# (commented out) as a record of how the saved search was produced.
#
# gs_no_mono.fit(X=X_train_dai,
#        y=y_train,
#        model__callbacks=[early_stopping(stopping_rounds=5,
#                                         first_metric_only=True,
#                                         verbose=True)],
#        model__eval_metric=[average_dollars_scorer_lgbm,
#                            "binary"],
#        model__eval_set=[(X_val_dai, y_val)])
#
# save_name = get_gs_save_name(model_name="LGBM_GS_No_Mono")
# save_path=f"./models/{save_name}"
# with open(save_path, 'wb') as file:
#     pickle.dump(gs_no_mono, file)

# Replace the unfitted search object with the previously fitted one from disk.
# NOTE(review): presumably this unpickles the file — only load trusted pickles.
PATH = f"./models/LGBM_GS_No_Mono_20221124_0625.pkl"
gs_no_mono = load_gs_from_pickle(pickle_filepath=PATH)

# Tabulate the CV results sorted on the dollars-lost metric, then show the
# first rows without the per-metric standard-deviation columns.
gs_df = gs_to_clean_df(
    gs_no_mono.cv_results_,
    sort_metric="mean_test_avg_dollars_lost_per_prediction",
    sort_ascending=False,
)
gs_df.loc[:, [col for col in gs_df.columns if "std" not in col]].head()

CPU times: total: 21.8 s
Wall time: 1.17 s


Unnamed: 0,bagging_seed,boosting_type,colsample_bytree,feature_fraction_seed,importance_type,learning_rate,max_bin,max_delta_step,max_depth,min_child_samples,...,mean_train_f025_score,mean_test_f05_score,rank_test_f05_score,mean_train_f05_score,mean_test_avg_dollars_lost_per_prediction,rank_test_avg_dollars_lost_per_prediction,mean_train_avg_dollars_lost_per_prediction,mean_test_best_iter_early_stopping,rank_test_best_iter_early_stopping,mean_train_best_iter_early_stopping
406,7742,gbdt,1.0,7743,gain,0.005,510,0.0,30,20,...,0.998465,0.947904,1,0.99836,-2.463281,1,-0.077734,3091.8,6,3091.8
385,7742,gbdt,1.0,7743,gain,0.005,510,0.0,-1,20,...,0.999819,0.947794,2,0.999816,-2.465156,2,-0.008828,2024.4,275,2024.4
452,7742,gbdt,1.0,7743,gain,0.0055,510,0.0,30,20,...,0.999458,0.947734,8,0.999418,-2.468281,3,-0.027578,2189.0,181,2189.0
584,7742,gbdt,1.0,7743,gain,0.007,510,0.0,-1,20,...,0.99938,0.947744,5,0.999342,-2.469375,4,-0.03125,1687.0,493,1687.0
588,7742,gbdt,1.0,7743,gain,0.007,510,0.0,30,20,...,0.999729,0.94774,6,0.999709,-2.469531,5,-0.013789,1561.6,565,1561.6


In [15]:
# Three candidates with the highest mean cross-validated accuracy.
(
    gs_df
    .sort_values(by="mean_test_accuracy", ascending=False)
    .head(3)
)

Unnamed: 0,bagging_seed,boosting_type,colsample_bytree,feature_fraction_seed,importance_type,learning_rate,max_bin,max_delta_step,max_depth,min_child_samples,min_child_weight,min_data_in_bin,n_estimators,num_leaves,objective,reg_alpha,reg_lambda,scale_pos_weight,seed,subsample,subsample_freq,params,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,mean_train_accuracy,std_train_accuracy,mean_test_f025_score,std_test_f025_score,rank_test_f025_score,mean_train_f025_score,std_train_f025_score,mean_test_f05_score,std_test_f05_score,rank_test_f05_score,mean_train_f05_score,std_train_f05_score,mean_test_avg_dollars_lost_per_prediction,std_test_avg_dollars_lost_per_prediction,rank_test_avg_dollars_lost_per_prediction,mean_train_avg_dollars_lost_per_prediction,std_train_avg_dollars_lost_per_prediction,mean_test_best_iter_early_stopping,std_test_best_iter_early_stopping,rank_test_best_iter_early_stopping,mean_train_best_iter_early_stopping,std_train_best_iter_early_stopping
406,7742,gbdt,1.0,7743,gain,0.005,510,0.0,30,20,0.001,1,7500,2048,binary,0.0,5.0,1.0,534401655,0.7,1,"{'model__bagging_seed': 7742, 'model__boosting...",0.955617,0.00163,1,0.998504,0.000234,0.949589,0.000969,2,0.998465,0.000238,0.947904,0.001274,1,0.99836,0.000254,-2.463281,0.051704,1,-0.077734,0.012023,3091.8,120.79139,6,3091.8,120.79139
388,7742,gbdt,1.0,7743,gain,0.005,510,0.0,-1,20,0.001,1,7500,1024,binary,0.0,5.0,1.0,534401655,0.7,1,"{'model__bagging_seed': 7742, 'model__boosting...",0.9555,0.001452,2,0.998137,0.000246,0.949403,0.000806,21,0.998097,0.000274,0.947731,0.001114,9,0.997963,0.000284,-2.471875,0.044335,13,-0.096484,0.013546,2945.2,111.377556,22,2945.2,111.377556
626,7742,gbdt,1.0,7743,gain,0.0075,510,0.0,-1,20,0.001,1,7500,1024,binary,0.0,2.0,1.0,534401655,0.7,1,"{'model__bagging_seed': 7742, 'model__boosting...",0.9555,0.001404,2,0.999188,0.000142,0.949435,0.000897,13,0.999138,0.000189,0.947752,0.001067,4,0.999089,0.000184,-2.470625,0.044892,8,-0.043281,0.008891,1598.4,38.160713,540,1598.4,38.160713


In [8]:
# Best mean CV score for the refit metric ("avg_dollars_lost_per_prediction").
# The value is negative because that scorer is registered with
# greater_is_better=False.
gs_no_mono.best_score_

-2.4632812500000005

In [9]:
# Best iteration recorded on the "model" step of the search's best estimator.
gs_no_mono.best_estimator_.named_steps['model'].best_iteration_

2994

In [11]:
# Report train vs. validation metrics for the best model of the first search
# and keep the two prediction frames the helper returns.
MODEL_NAME = "Gridsearch No Mono"
dai_train_pred_df, dai_val_pred_df = get_val_performance_after_all_train_refit(
    gs=gs_no_mono,
    X_train=X_train_dai,
    y_train=y_train,
    X_val=X_val_dai,
    y_val=y_val,
    model_name=MODEL_NAME,
)

Best Iteration: 2994
accuracy: Train=0.9982578125, Validation=0.95275
f025_score: Train=0.9982654453322939, Validation=0.9483284375437124
f05_score: Train=0.998125384861216, Validation=0.9458305663599278
avg_dollars_lost_per_prediction: Train=0.08859375, Validation=2.54


In [None]:
# Parameter setting chosen by the search for the refit metric.
gs_no_mono.best_params_

In [13]:
%%time

# Follow-up search over regularisation strength. Only learning_rate (two
# values) and reg_lambda (5.0 through 105.0 in steps of 1.0) vary, giving
# 2 x 101 = 202 candidates; every other setting is pinned to a single value.
parameter_grid = {
    "model__boosting_type": ["gbdt"],
    "model__colsample_bytree": [1.0],
    "model__bagging_seed": [7742],
    "model__feature_fraction_seed": [7743],
    "model__importance_type": ["gain"],
    "model__learning_rate": [0.001, 0.005],
    "model__max_bin": [510],
    "model__max_delta_step": [0.0],
    "model__max_depth": [30],
    "model__num_leaves": [2048],
    "model__min_child_samples": [20],
    "model__min_child_weight": [0.001],
    "model__min_data_in_bin": [1],
    "model__min_split_gain": [0.0],
    "model__n_estimators": [7_500],
    "model__objective": ["binary"],
    "model__reg_alpha": [0.0],
    "model__reg_lambda": [5.0 + num for num in range(101)],
    "model__scale_pos_weight": [1.0],
    "model__seed": [534401655],
    "model__subsample": [0.7],
    "model__subsample_freq": [1],
}

# Five shuffled, stratified folds with a fixed seed.
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=7742)

# Mean-impute missing values, then fit the LightGBM classifier.
model_pipe = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="mean")),
        ("model", LGBMClassifier(n_jobs=32, random_state=7742)),
    ]
)

# Multi-metric search; the final refit selects on the dollars-lost metric.
gs_no_mono_reg = GridSearchCV(
    estimator=model_pipe,
    param_grid=parameter_grid,
    refit="avg_dollars_lost_per_prediction",
    scoring=scorers_dict,
    cv=cv_splitter,
    n_jobs=32,
    error_score="raise",
    return_train_score=True,
    verbose=2,
)

# The search is not re-run here. The fit and pickling steps below are kept
# (commented out) as a record of how the saved search was produced.
#
# gs_no_mono_reg.fit(X=X_train_dai,
#        y=y_train,
#        model__callbacks=[early_stopping(stopping_rounds=5,
#                                         first_metric_only=True,
#                                         verbose=True)],
#        model__eval_metric=[average_dollars_scorer_lgbm,
#                            "binary"],
#        model__eval_set=[(X_val_dai, y_val)])
#
# save_name = get_gs_save_name(model_name="LGBM_GS_No_Mono_Reg")
# save_path=f"./models/{save_name}"
# with open(save_path, 'wb') as file:
#     pickle.dump(gs_no_mono_reg, file)

# Load the previously fitted search and bind it to gs_no_mono_reg, the name
# this cell and the later cells read. Binding it only to another name would
# leave gs_no_mono_reg as the unfitted object built above, which has no
# cv_results_ on a fresh kernel.
# NOTE(review): presumably this unpickles the file — only load trusted pickles.
PATH = "./models/LGBM_GS_No_Mono_Reg_20221126_1049.pkl"
gs_no_mono_reg = load_gs_from_pickle(pickle_filepath=PATH)
g_reg = gs_no_mono_reg  # alias kept for any code that still refers to g_reg

# Tabulate the CV results sorted on the dollars-lost metric, then show the
# first rows without the per-metric standard-deviation columns.
gs_df = gs_to_clean_df(
    gs_no_mono_reg.cv_results_,
    sort_metric="mean_test_avg_dollars_lost_per_prediction",
    sort_ascending=False,
)
gs_df.loc[:, [col for col in gs_df.columns if "std" not in col]].head()

Fitting 5 folds for each of 202 candidates, totalling 1010 fits
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[2994]	valid_0's binary_logloss: 0.130106	valid_0's avg_dollars_lost_per_prediction: 2.54125
Evaluated only: binary_logloss
CPU times: total: 1h 12min 59s
Wall time: 1d 1h 19min 30s


Unnamed: 0,bagging_seed,boosting_type,colsample_bytree,feature_fraction_seed,importance_type,learning_rate,max_bin,max_delta_step,max_depth,min_child_samples,...,mean_train_f025_score,mean_test_f05_score,rank_test_f05_score,mean_train_f05_score,mean_test_avg_dollars_lost_per_prediction,rank_test_avg_dollars_lost_per_prediction,mean_train_avg_dollars_lost_per_prediction,mean_test_best_iter_early_stopping,rank_test_best_iter_early_stopping,mean_train_best_iter_early_stopping
101,7742,gbdt,1.0,7743,gain,0.005,510,0.0,30,20,...,0.998465,0.947904,1,0.99836,-2.463281,1,-0.077734,3091.8,202,3091.8
104,7742,gbdt,1.0,7743,gain,0.005,510,0.0,30,20,...,0.997895,0.947591,2,0.997742,-2.479375,2,-0.106875,3606.2,198,3606.2
103,7742,gbdt,1.0,7743,gain,0.005,510,0.0,30,20,...,0.998036,0.947539,4,0.997896,-2.481094,3,-0.099609,3428.4,200,3428.4
102,7742,gbdt,1.0,7743,gain,0.005,510,0.0,30,20,...,0.998272,0.947553,3,0.998146,-2.482656,4,-0.087773,3270.0,201,3270.0
105,7742,gbdt,1.0,7743,gain,0.005,510,0.0,30,20,...,0.997327,0.947335,5,0.997156,-2.492188,5,-0.134844,3585.6,199,3585.6


In [16]:
# Rebuild the results table for the regularisation search and preview the
# first rows without the standard-deviation columns.
gs_df = gs_to_clean_df(
    gs_no_mono_reg.cv_results_,
    sort_metric="mean_test_avg_dollars_lost_per_prediction",
    sort_ascending=False,
)
gs_df.loc[:, [col for col in gs_df.columns if "std" not in col]].head()

Unnamed: 0,bagging_seed,boosting_type,colsample_bytree,feature_fraction_seed,importance_type,learning_rate,max_bin,max_delta_step,max_depth,min_child_samples,min_child_weight,min_data_in_bin,n_estimators,num_leaves,objective,reg_alpha,reg_lambda,scale_pos_weight,seed,subsample,subsample_freq,params,mean_test_accuracy,rank_test_accuracy,mean_train_accuracy,mean_test_f025_score,rank_test_f025_score,mean_train_f025_score,mean_test_f05_score,rank_test_f05_score,mean_train_f05_score,mean_test_avg_dollars_lost_per_prediction,rank_test_avg_dollars_lost_per_prediction,mean_train_avg_dollars_lost_per_prediction,mean_test_best_iter_early_stopping,rank_test_best_iter_early_stopping,mean_train_best_iter_early_stopping
101,7742,gbdt,1.0,7743,gain,0.005,510,0.0,30,20,0.001,1,7500,2048,binary,0.0,5.0,1.0,534401655,0.7,1,"{'model__bagging_seed': 7742, 'model__boosting...",0.955617,1,0.998504,0.949589,1,0.998465,0.947904,1,0.99836,-2.463281,1,-0.077734,3091.8,202,3091.8
104,7742,gbdt,1.0,7743,gain,0.005,510,0.0,30,20,0.001,1,7500,2048,binary,0.0,8.0,1.0,534401655,0.7,1,"{'model__bagging_seed': 7742, 'model__boosting...",0.955438,3,0.99793,0.949231,2,0.997895,0.947591,2,0.997742,-2.479375,2,-0.106875,3606.2,198,3606.2
103,7742,gbdt,1.0,7743,gain,0.005,510,0.0,30,20,0.001,1,7500,2048,binary,0.0,7.0,1.0,534401655,0.7,1,"{'model__bagging_seed': 7742, 'model__boosting...",0.955352,4,0.998074,0.949208,3,0.998036,0.947539,4,0.997896,-2.481094,3,-0.099609,3428.4,200,3428.4
102,7742,gbdt,1.0,7743,gain,0.005,510,0.0,30,20,0.001,1,7500,2048,binary,0.0,6.0,1.0,534401655,0.7,1,"{'model__bagging_seed': 7742, 'model__boosting...",0.955492,2,0.998299,0.949137,4,0.998272,0.947553,3,0.998146,-2.482656,4,-0.087773,3270.0,201,3270.0
105,7742,gbdt,1.0,7743,gain,0.005,510,0.0,30,20,0.001,1,7500,2048,binary,0.0,9.0,1.0,534401655,0.7,1,"{'model__bagging_seed': 7742, 'model__boosting...",0.955266,6,0.99743,0.948953,5,0.997327,0.947335,5,0.997156,-2.492188,5,-0.134844,3585.6,199,3585.6


In [17]:
# Report train vs. validation metrics for the best model of the
# regularisation search and keep the two prediction frames the helper returns.
MODEL_NAME = "Gridsearch No Mono Reg"
dai_train_reg_pred_df, dai_val_reg_pred_df = get_val_performance_after_all_train_refit(
    gs=gs_no_mono_reg,
    X_train=X_train_dai,
    y_train=y_train,
    X_val=X_val_dai,
    y_val=y_val,
    model_name=MODEL_NAME,
)

Best Iteration: 2994
accuracy: Train=0.9982578125, Validation=0.95275
f025_score: Train=0.9982654453322939, Validation=0.9483284375437124
f05_score: Train=0.998125384861216, Validation=0.9458305663599278
avg_dollars_lost_per_prediction: Train=0.08859375, Validation=2.54


In [25]:
# Compact view of the results: reg_lambda next to the mean train/test metrics,
# leaving out the std, rank and F0.25 columns.
metric_cols = [
    name
    for name in gs_df.columns
    if ("train" in name or "test" in name)
    and not any(tag in name for tag in ("std", "rank", "f025"))
]
c = ["reg_lambda", *metric_cols]
gs_df.loc[:, c].head(20)

Unnamed: 0,reg_lambda,mean_test_accuracy,mean_train_accuracy,mean_test_f05_score,mean_train_f05_score,mean_test_avg_dollars_lost_per_prediction,mean_train_avg_dollars_lost_per_prediction,mean_test_best_iter_early_stopping,mean_train_best_iter_early_stopping
101,5.0,0.955617,0.998504,0.947904,0.99836,-2.463281,-0.077734,3091.8,3091.8
104,8.0,0.955438,0.99793,0.947591,0.997742,-2.479375,-0.106875,3606.2,3606.2
103,7.0,0.955352,0.998074,0.947539,0.997896,-2.481094,-0.099609,3428.4,3428.4
102,6.0,0.955492,0.998299,0.947553,0.998146,-2.482656,-0.087773,3270.0,3270.0
105,9.0,0.955266,0.99743,0.947335,0.997156,-2.492188,-0.134844,3585.6,3585.6
106,10.0,0.955242,0.997502,0.947311,0.997257,-2.493281,-0.129961,3828.0,3828.0
108,12.0,0.955297,0.99751,0.947283,0.997261,-2.495938,-0.129805,4286.6,4286.6
107,11.0,0.955031,0.997168,0.94702,0.996862,-2.5075,-0.148828,3856.2,3856.2
109,13.0,0.955125,0.997215,0.947006,0.996932,-2.51,-0.145391,4320.4,4320.4
112,16.0,0.955117,0.997096,0.94697,0.996759,-2.512031,-0.153867,4844.2,4844.2


### For single models

In [37]:
# Single-candidate "grid": every list holds exactly one value, so the search
# cross-validates one configuration (5 fits) and then refits it.
grid = {
    "model__boosting_type": ["gbdt"],
    "model__colsample_bytree": [0.8],
    "model__bagging_seed": [7742],
    "model__feature_fraction_seed": [7743],
    "model__importance_type": ["gain"],
    "model__learning_rate": [0.01],
    "model__max_bin": [63],
    "model__max_delta_step": [0.0],
    "model__max_depth": [30],
    "model__num_leaves": [2048],
    "model__min_child_samples": [20],
    "model__min_child_weight": [0.001],
    "model__min_data_in_bin": [1],
    "model__min_split_gain": [0.0],
    "model__n_estimators": [7_500],
    "model__objective": ["binary"],
    "model__reg_alpha": [0.0],
    "model__reg_lambda": [5.0],
    "model__scale_pos_weight": [1.0],
    "model__seed": [534401655],
    "model__subsample": [0.7],
    "model__subsample_freq": [1],
}

# Five shuffled, stratified folds with a fixed seed.
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=7742)

# Mean-impute missing values, then fit the LightGBM classifier.
model_pipe = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="mean")),
        ("model", LGBMClassifier(n_jobs=32, random_state=7742)),
    ]
)

# Multi-metric search; the final refit selects on the dollars-lost metric.
g_test = GridSearchCV(
    estimator=model_pipe,
    param_grid=grid,
    refit="avg_dollars_lost_per_prediction",
    scoring=scorers_dict,
    cv=cv_splitter,
    n_jobs=32,
    error_score="raise",
    return_train_score=True,
    verbose=2,
)

# The fit must actually run in this cell: there is no saved pickle to fall
# back on for g_test, and the following cells read g_test.cv_results_ and its
# refit estimator, which only exist on a fitted search.
# Early stopping monitors the validation split passed through eval_set.
# NOTE(review): the recorded training log reports "Evaluated only:
# binary_logloss", i.e. with first_metric_only=True the stopping criterion is
# log-loss rather than the dollars-lost metric — confirm this is intended.
g_test.fit(
    X=X_train_dai,
    y=y_train,
    model__callbacks=[
        early_stopping(stopping_rounds=5, first_metric_only=True, verbose=True)
    ],
    model__eval_metric=[average_dollars_scorer_lgbm, "binary"],
    model__eval_set=[(X_val_dai, y_val)],
);  # trailing semicolon keeps the fitted estimator's repr out of the cell output

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1711]	valid_0's binary_logloss: 0.129392	valid_0's avg_dollars_lost_per_prediction: 2.4625
Evaluated only: binary_logloss


In [38]:
# Report train vs. validation metrics for the single-configuration search.
# NOTE(review): the results are stored under the dai_*_reg_pred_df names, which
# replaces whatever those variables held before — confirm this is intended.
MODEL_NAME = "Gridsearch Test"
dai_train_reg_pred_df, dai_val_reg_pred_df = get_val_performance_after_all_train_refit(
    gs=g_test,
    X_train=X_train_dai,
    y_train=y_train,
    X_val=X_val_dai,
    y_val=y_val,
    model_name=MODEL_NAME,
)

Best Iteration: 1711
accuracy: Train=0.9983125, Validation=0.9539375
f025_score: Train=0.9983079565021014, Validation=0.9500219243005215
f05_score: Train=0.9981761354336355, Validation=0.9474549998418271
avg_dollars_lost_per_prediction: Train=0.08625, Validation=2.46125


In [30]:
# Tabulate the CV results of the single-configuration search and show
# reg_lambda next to the mean train/test metrics (no std, rank or F0.25
# columns).
g_df = gs_to_clean_df(
    g_test.cv_results_,
    sort_metric="mean_test_avg_dollars_lost_per_prediction",
    sort_ascending=False,
)
metric_cols = [
    name
    for name in g_df.columns
    if ("train" in name or "test" in name)
    and not any(tag in name for tag in ("std", "rank", "f025"))
]
c = ["reg_lambda", *metric_cols]
g_df.loc[:, c].head(20)

Unnamed: 0,reg_lambda,mean_test_accuracy,mean_train_accuracy,mean_test_f05_score,mean_train_f05_score,mean_test_avg_dollars_lost_per_prediction,mean_train_avg_dollars_lost_per_prediction,mean_test_best_iter_early_stopping,mean_train_best_iter_early_stopping
0,5.0,0.955383,0.998021,0.947457,0.99785,-2.486719,-0.101758,3277.2,3277.2
