In [1]:
from sensitive_info import *
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv("./CS7_preprocessed.csv")
df.head(2)

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x41,x42,x43,x44,x45,x46,x47,x48,x49,y
0,-0.166563,-3.961588,4.621113,2.481908,-1.800135,0.804684,6.718751,-14.789997,-1.040673,-4.20495,...,-1.497117,5.414063,-2.325655,1.674827,-0.264332,60.781427,-7.689696,0.151589,-8.040166,0
1,-0.149894,-0.585676,27.839856,4.152333,6.426802,-2.426943,40.477058,-6.725709,0.896421,0.330165,...,36.29279,4.490915,0.762561,6.526662,1.007927,15.805696,-4.896678,-0.320283,16.719974,0


In [3]:
df = convert_strings_to_category(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160000 entries, 0 to 159999
Data columns (total 51 columns):
 #   Column  Non-Null Count   Dtype   
---  ------  --------------   -----   
 0   x0      159974 non-null  float64 
 1   x1      159975 non-null  float64 
 2   x2      159962 non-null  float64 
 3   x3      159963 non-null  float64 
 4   x4      159974 non-null  float64 
 5   x5      159963 non-null  float64 
 6   x6      159974 non-null  float64 
 7   x7      159973 non-null  float64 
 8   x8      159979 non-null  float64 
 9   x9      159970 non-null  float64 
 10  x10     159957 non-null  float64 
 11  x11     159970 non-null  float64 
 12  x12     159964 non-null  float64 
 13  x13     159969 non-null  float64 
 14  x14     159966 non-null  float64 
 15  x15     159965 non-null  float64 
 16  x16     159974 non-null  float64 
 17  x17     159973 non-null  float64 
 18  x18     159960 non-null  float64 
 19  x19     159965 non-null  float64 
 20  x20     159962 non-null  f

In [4]:
train_df, val_df, test_df = create_train_val_test_sets(df=df, 
                                                       target_column="y", 
                                                       test_size=0.1, 
                                                       val_size=0.1, 
                                                       random_state=7742)

Train Shape: (128000, 51)
Val Shape: (16000, 51)
Test Shape: (16000, 51)


In [5]:
X_train = train_df.drop(columns="y")
y_train = train_df.loc[:,"y"].to_numpy()

X_val = val_df.drop(columns="y")
y_val = val_df.loc[:,"y"].to_numpy()

X_test = test_df.drop(columns="y")
y_test = test_df.loc[:,"y"].to_numpy()

# Subset to features selected by DAI
X_train_dai, X_val_dai, X_test_dai = subset_to_dai_selected_features(X_train=X_train, 
                                                                     X_val=X_val, 
                                                                     X_test=X_test)

DAI Train Shape: (128000, 17)
DAI Val Shape: (16000, 17)
DAI Test Shape: (16000, 17)


In [6]:
X_train_dai.columns

Index(['x48', 'x23', 'x27', 'x20', 'x28', 'x46', 'x49', 'x37', 'x42', 'x12',
       'x32', 'x7', 'x2', 'x38', 'x41', 'x6', 'x40'],
      dtype='object')

In [7]:
X_train_dai.shape

(128000, 17)

In [8]:
scorers_dict = {"accuracy":make_scorer(accuracy_score), 
                "f025_score":make_scorer(fbeta_score, beta=0.25),
                "f05_score":make_scorer(fbeta_score, beta=0.5), 
                "avg_dollars_lost_per_prediction":make_scorer(score_func=average_dollars_scorer_sklearn, 
                                                              greater_is_better=False)}

scorers_dict_lgbm = scorers_dict.copy()
scorers_dict_lgbm["best_iter_early_stopping"]=get_lgbm_best_iter_sklearn

## Model: MLP with Features from DAI

In [9]:
%%time 

mlp_pipe = Pipeline(steps=[("impute", SimpleImputer(strategy="mean")),
                           ("mlp", MLPClassifier(random_state=7742, 
                                                 early_stopping=True,
                                                 max_iter=10_000))])

mlp_pipe.fit(X_train_dai, y_train)

evaluate_baseline_model(X_train=X_train_dai, 
                        y_train=y_train, 
                        X_val=X_val_dai, 
                        y_val=y_val, 
                        model_pipe=mlp_pipe, 
                        model_name="MLP using features from DAI", 
                        scorers_dict=scorers_dict)

Training accuracy: 0.9453125
Validation accuracy: 0.940125

Training f025_score: 0.9366652834414027
Validation f025_score: 0.9348460535082534

Training f05_score: 0.9349798188813446
Validation f05_score: 0.9314894293435064

Training avg_dollars_lost_per_prediction: -3.07875
Validation avg_dollars_lost_per_prediction: -3.1925

CPU times: total: 30.3 s
Wall time: 30.3 s


## Model: MLP with Features from DAI, scaled inputs

In [10]:
%%time 

mlp_pipe = Pipeline(steps=[("impute", SimpleImputer(strategy="mean")),
                           ("scaler", StandardScaler()),
                           ("mlp", MLPClassifier(random_state=7742, 
                                                 early_stopping=True,
                                                 max_iter=10_000))])

mlp_pipe.fit(X_train_dai, y_train)

evaluate_baseline_model(X_train=X_train_dai, 
                        y_train=y_train, 
                        X_val=X_val_dai, 
                        y_val=y_val, 
                        model_pipe=mlp_pipe, 
                        model_name="MLP using features from DAI, scaled inputs", 
                        scorers_dict=scorers_dict)

Training accuracy: 0.96765625
Validation accuracy: 0.96025

Training f025_score: 0.9624604892759075
Validation f025_score: 0.9550579493973448

Training f05_score: 0.9615294255844006
Validation f05_score: 0.9534891034309254

Training avg_dollars_lost_per_prediction: -1.83125
Validation avg_dollars_lost_per_prediction: -2.2

CPU times: total: 25.8 s
Wall time: 25.7 s


## Model: MLP with All Features

In [11]:
categorical_pipe = Pipeline(steps=[("categorical_imputer", SimpleImputer(strategy="most_frequent")), 
                                   ("encode", OneHotEncoder())])


preprocess = ColumnTransformer(transformers=[("numeric_imputer", SimpleImputer(), X_train.select_dtypes(include=[float, np.number, int]).columns.tolist()), 
                                             ("cat_pipe", categorical_pipe, X_train.select_dtypes(include=["category"]).columns.tolist())], 
                               remainder="passthrough")

mlp_all_pipe = Pipeline(steps=[("preprocess", preprocess), 
                               ('model',  MLPClassifier(random_state=7742, 
                                                        early_stopping=True,
                                                        max_iter=10_000))])

mlp_all_pipe.fit(X_train, y_train)

evaluate_baseline_model(X_train=X_train, 
                        y_train=y_train, 
                        X_val=X_val, 
                        y_val=y_val, 
                        model_pipe=mlp_all_pipe, 
                        model_name="MLP using All Features", 
                        scorers_dict=scorers_dict)

Training accuracy: 0.93040625
Validation accuracy: 0.9190625

Training f025_score: 0.9144974962714092
Validation f025_score: 0.8996136870406225

Training f05_score: 0.9140589002475653
Validation f05_score: 0.8994354159518388

Training avg_dollars_lost_per_prediction: -4.12125
Validation avg_dollars_lost_per_prediction: -4.83375



## Model: MLP with All Features, scaled inputs

In [12]:
categorical_pipe = Pipeline(steps=[("categorical_imputer", SimpleImputer(strategy="most_frequent")), 
                                   ("encode", OneHotEncoder())])

numeric_pipe = Pipeline(steps=[("numeric_imputer", SimpleImputer(strategy="most_frequent")), 
                               ("scale", StandardScaler())])

preprocess = ColumnTransformer(transformers=[("numeric_pipe", numeric_pipe, X_train.select_dtypes(include=[float, np.number, int]).columns.tolist()), 
                                             ("cat_pipe", categorical_pipe, X_train.select_dtypes(include=["category"]).columns.tolist())], 
                               remainder="passthrough")

mlp_all_pipe = Pipeline(steps=[("preprocess", preprocess), 
                               ('model',  MLPClassifier(random_state=7742, 
                                                        early_stopping=True,
                                                        max_iter=10_000))])

mlp_all_pipe.fit(X_train, y_train)

evaluate_baseline_model(X_train=X_train, 
                        y_train=y_train, 
                        X_val=X_val, 
                        y_val=y_val, 
                        model_pipe=mlp_all_pipe, 
                        model_name="MLP using All Features, scaled inputs", 
                        scorers_dict=scorers_dict)

Training accuracy: 0.957625
Validation accuracy: 0.9400625

Training f025_score: 0.9515537654942536
Validation f025_score: 0.9314613329858623

Training f05_score: 0.9500601117371114
Validation f05_score: 0.9292817679558012

Training avg_dollars_lost_per_prediction: -2.365625
Validation avg_dollars_lost_per_prediction: -3.33375



## Model: KNN with Features from DAI

In [13]:
knn_pipe = Pipeline(steps=[("impute", SimpleImputer(strategy="mean")),
                           ("scaler", StandardScaler()),
                           ("knn", KNeighborsClassifier(n_jobs=5))])

knn_pipe.fit(X_train_dai, y_train)

evaluate_baseline_model(X_train=X_train_dai, 
                        y_train=y_train, 
                        X_val=X_val_dai, 
                        y_val=y_val, 
                        model_pipe=knn_pipe, 
                        model_name="KNN (5 Neighbors) using features from DAI", 
                        scorers_dict=scorers_dict)

Training accuracy: 0.9679453125
Validation accuracy: 0.94975

Training f025_score: 0.9614819752008182
Validation f025_score: 0.9401311965457112

Training f05_score: 0.961002241609585
Validation f05_score: 0.9391745792986744

Training avg_dollars_lost_per_prediction: -1.86671875
Validation avg_dollars_lost_per_prediction: -2.9



## Model: KNN with All Features

In [14]:
categorical_pipe = Pipeline(steps=[("numeric_imputer", SimpleImputer(strategy="most_frequent")), 
                                   ("encode", OneHotEncoder())])

numeric_pipe = Pipeline(steps=[("numeric_imputer", SimpleImputer(strategy="most_frequent")), 
                               ("scale", StandardScaler())])

preprocess = ColumnTransformer(transformers=[("numeric_pipe", numeric_pipe, X_train.select_dtypes(include=[float, np.number, int]).columns.tolist()), 
                                             ("cat_pipe", categorical_pipe, X_train.select_dtypes(include=["category"]).columns.tolist())], 
                               remainder="passthrough")

knn_all_pipe = Pipeline(steps=[("preprocess", preprocess),
                               ("knn", KNeighborsClassifier(n_jobs=5))])

knn_all_pipe.fit(X_train, y_train)

evaluate_baseline_model(X_train=X_train, 
                        y_train=y_train, 
                        X_val=X_val, 
                        y_val=y_val, 
                        model_pipe=knn_all_pipe, 
                        model_name="KNN (5 Neighbors) using all features", 
                        scorers_dict=scorers_dict)

Training accuracy: 0.891796875
Validation accuracy: 0.81875

Training f025_score: 0.8882430158772919
Validation f025_score: 0.7976198852157482

Training f05_score: 0.8791278978729015
Validation f05_score: 0.7862027149930947

Training avg_dollars_lost_per_prediction: -5.3896875
Validation avg_dollars_lost_per_prediction: -9.37



## Model: LogisticRegression with Features from DAI

In [15]:
logreg_pipe = Pipeline(steps=[("impute", SimpleImputer(strategy="mean")),
                           ("logreg", LogisticRegression(max_iter=20_000))])

logreg_pipe.fit(X_train_dai, y_train)

evaluate_baseline_model(X_train=X_train_dai, 
                        y_train=y_train, 
                        X_val=X_val_dai, 
                        y_val=y_val, 
                        model_pipe=logreg_pipe, 
                        model_name="Logistic Regression using features from DAI", 
                        scorers_dict=scorers_dict)

Training accuracy: 0.70209375
Validation accuracy: 0.7046875

Training f025_score: 0.654640265977953
Validation f025_score: 0.6608560274570394

Training f05_score: 0.6295790800610386
Validation f05_score: 0.6335781015724117

Training avg_dollars_lost_per_prediction: -14.298125
Validation avg_dollars_lost_per_prediction: -13.91625



## Model: LogisticRegression with All Features

In [16]:
categorical_pipe = Pipeline(steps=[("numeric_imputer", SimpleImputer(strategy="most_frequent")), 
                                   ("encode", OneHotEncoder())])

preprocess = ColumnTransformer(transformers=[("numeric_imputer", SimpleImputer(), X_train.select_dtypes(include=[float, np.number, int]).columns.tolist()), 
                                             ("cat_pipe", categorical_pipe, X_train.select_dtypes(include=["category"]).columns.tolist())], 
                               remainder="passthrough")

logreg_all_pipe = Pipeline(steps=[("preprocess", preprocess),
                                  ("logreg", LogisticRegression(max_iter=20_000))])


logreg_all_pipe.fit(X_train, y_train)

evaluate_baseline_model(X_train=X_train, 
                        y_train=y_train, 
                        X_val=X_val, 
                        y_val=y_val, 
                        model_pipe=logreg_all_pipe, 
                        model_name="Logistic Regression using All Features", 
                        scorers_dict=scorers_dict)

Training accuracy: 0.7020859375
Validation accuracy: 0.7051875

Training f025_score: 0.6552519264103946
Validation f025_score: 0.6625500231561198

Training f05_score: 0.6295860972847449
Validation f05_score: 0.6343990412494684

Training avg_dollars_lost_per_prediction: -14.23078125
Validation avg_dollars_lost_per_prediction: -13.79625



## Model: Best DAI Hyperparameters

In [17]:
EARLY_STOPPING_ROUNDS = 5 
best_params = {"boosting_type":"gbdt",
               "colsample_bytree":0.8,
               "bagging_seed":7742,
               "feature_fraction_seed":7743,
               "importance_type":"gain",
               "learning_rate":0.05,
               "max_bin":63,
               "max_delta_step":0.0,
               "max_depth":10,
               "num_leaves":1024,
               "min_child_samples":20,
               "min_child_weight":0.001,
               "min_data_in_bin":1,
               "min_split_gain":0.0,
               "n_estimators":715,
               "objective":"binary",
               "reg_alpha":0.0,
               "reg_lambda":0.0,
               "scale_pos_weight":1.0,
               "seed":534401655,
               "subsample":0.7,
               "subsample_freq":1}

best_param_grid = {key:[value] for key, value in best_params.items()}

In [18]:
DAI_MONOTIC_CONSTRAINTS = [-1, 1, -1, 0, 0, 1, 0, 0, 0, -1, -1, 0, 0, 0, 0, 1, 1] 
dai_mono_column_sorted = sorted([int(col[1:]) for col in X_train_dai.columns])
dai_mono_columns = [f"x{col_num}" for col_num in dai_mono_column_sorted] 
X_train_dai_mono = X_train_dai.loc[:, dai_mono_columns]
X_val_dai_mono = X_val_dai.loc[:, dai_mono_columns]

best_param_grid_mono = best_param_grid.copy()
best_param_grid_mono['monotone_constraints'] = [DAI_MONOTIC_CONSTRAINTS]
best_param_grid_mono['monotone_constraints_method'] = ["intermediate"]
best_param_grid_mono['monotonicity_constraints'] = [True]

In [19]:
%%time

MODEL_NAME = "Best DAI Exactly"

cv_splitter = StratifiedKFold(n_splits=5, 
                              shuffle=True, 
                              random_state=7742)

gs_dai_exact = GridSearchCV(estimator=LGBMClassifier(n_jobs=32, random_state=7742), 
                   param_grid=best_param_grid_mono, 
                   refit="avg_dollars_lost_per_prediction",
                   scoring=scorers_dict, 
                   cv=cv_splitter,
                   n_jobs=32, 
                   error_score="raise", 
                   return_train_score=True)

gs_dai_exact.fit(X=X_train_dai_mono, 
        y=y_train, 
        callbacks=[early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, 
                                  first_metric_only=True, 
                                  verbose=True)], 
        eval_metric=[average_dollars_scorer_lgbm, 
                     fpt5_scorer_lgbm, 
                     "binary", 
                     "accuracy"], 
        eval_set=[(X_val_dai_mono, y_val)])

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[634]	valid_0's binary_logloss: 0.166714	valid_0's avg_dollars_lost_per_prediction: 3.4275	valid_0's f05_score: 0.92683
Evaluated only: binary_logloss
CPU times: total: 3min 7s
Wall time: 1min 41s


In [20]:
gs_dai_exact_df = display_cross_validation_scores(gs=gs_dai_exact, model_name=MODEL_NAME)
gs_dai_exact_df

Unnamed: 0,Best DAI Exactly
mean_test_f025_score,0.930679
mean_train_f025_score,0.990459
mean_test_f025_score,0.930679
mean_train_f025_score,0.990459
mean_test_avg_dollars_lost_per_prediction,-3.380781
mean_train_avg_dollars_lost_per_prediction,-0.486484
mean_test_accuracy,0.938023
mean_train_accuracy,0.990363


In [21]:
dai_train_exact_pred_df, dai_val_exact_pred_df = get_val_performance_after_all_train_refit(gs=gs_dai_exact, 
                                                                                       X_train=X_train_dai_mono, 
                                                                                       y_train=y_train, 
                                                                                       X_val=X_val_dai_mono, 
                                                                                       y_val=y_val, 
                                                                                       model_name=MODEL_NAME)

Best Iteration: 634
accuracy: Train=0.9862578125, Validation=0.937125
f025_score: Train=0.9855810555764235, Validation=0.9297011644764533
f05_score: Train=0.9846960742103408, Validation=0.9268300415306089
avg_dollars_lost_per_prediction: Train=0.72296875, Validation=3.4275


## Best DAI (Monotone removed)

After this point, all models will not used monotone constraints

In [22]:
%%time

MODEL_NAME = "Best DAI (DAI params and features)"

cv_splitter = StratifiedKFold(n_splits=5, 
                              shuffle=True, 
                              random_state=7742)

gs_dai = GridSearchCV(estimator=LGBMClassifier(n_jobs=32, random_state=7742), 
                   param_grid=best_param_grid, 
                   refit="avg_dollars_lost_per_prediction",
                   scoring=scorers_dict, 
                   cv=cv_splitter,
                   n_jobs=32, 
                   error_score="raise", 
                   return_train_score=True)

gs_dai.fit(X=X_train_dai, 
        y=y_train, 
        callbacks=[early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, 
                                  first_metric_only=True, 
                                  verbose=True)], 
        eval_metric=[average_dollars_scorer_lgbm, 
                     fpt5_scorer_lgbm, 
                     "binary", 
                     "accuracy"], 
        eval_set=[(X_val_dai, y_val)])

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[415]	valid_0's binary_logloss: 0.132426	valid_0's avg_dollars_lost_per_prediction: 2.57125	valid_0's f05_score: 0.945452
Evaluated only: binary_logloss
CPU times: total: 2min 28s
Wall time: 1min 16s


In [23]:
gs_dai_df = display_cross_validation_scores(gs=gs_dai, model_name=MODEL_NAME)
gs_dai_df

Unnamed: 0,Best DAI (DAI params and features)
mean_test_f025_score,0.947297
mean_train_f025_score,0.997257
mean_test_f025_score,0.947297
mean_train_f025_score,0.997257
mean_test_avg_dollars_lost_per_prediction,-2.57125
mean_train_avg_dollars_lost_per_prediction,-0.135937
mean_test_accuracy,0.953937
mean_train_accuracy,0.997531


In [24]:
dai_train_pred_df, dai_val_pred_df = get_val_performance_after_all_train_refit(gs=gs_dai, 
                                                                                       X_train=X_train_dai, 
                                                                                       y_train=y_train, 
                                                                                       X_val=X_val_dai, 
                                                                                       y_val=y_val, 
                                                                                       model_name=MODEL_NAME)

Best Iteration: 415
accuracy: Train=0.995765625, Validation=0.9531875
f025_score: Train=0.9952675383172299, Validation=0.9474549966079012
f05_score: Train=0.9950922684712358, Validation=0.9454516790162384
avg_dollars_lost_per_prediction: Train=0.2340625, Validation=2.57125


## Model: Best DAI Hyperparameters, With Imputer

In [25]:
%%time

MODEL_NAME = "Best DAI with Imputer"

cv_splitter = StratifiedKFold(n_splits=5, 
                              shuffle=True, 
                              random_state=7742)

dai_imp_pipe = Pipeline(steps=[("impute", SimpleImputer()), 
                          ('model', LGBMClassifier(n_jobs=32, random_state=7742))])

best_param_grid_imp = {f"model__{key}":value for key, value in best_param_grid.items()}
best_param_grid_imp['impute__strategy'] = ["mean"]

gs_dai_imp = GridSearchCV(estimator=dai_imp_pipe, 
                          param_grid=best_param_grid_imp, 
                          refit="avg_dollars_lost_per_prediction",
                          scoring=scorers_dict, 
                          cv=cv_splitter,
                          n_jobs=32, 
                          error_score="raise", 
                          return_train_score=True)

gs_dai_imp.fit(X=X_train_dai, 
               y=y_train, 
               model__callbacks=[early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, 
                                                first_metric_only=True, 
                                                verbose=True)], 
               model__eval_metric=[average_dollars_scorer_lgbm, 
                                   fpt5_scorer_lgbm, 
                                   "binary", 
                                   "accuracy"], 
               model__eval_set=[(X_val_dai, y_val)])

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[450]	valid_0's binary_logloss: 0.132653	valid_0's avg_dollars_lost_per_prediction: 2.4725	valid_0's f05_score: 0.947468
Evaluated only: binary_logloss
CPU times: total: 2min 35s
Wall time: 1min 18s


In [26]:
gs_dai_imp_df = display_cross_validation_scores(gs=gs_dai_imp, model_name=MODEL_NAME)
gs_dai_imp_df

Unnamed: 0,Best DAI with Imputer
mean_test_f025_score,0.947315
mean_train_f025_score,0.997648
mean_test_f025_score,0.947315
mean_train_f025_score,0.997648
mean_test_avg_dollars_lost_per_prediction,-2.571406
mean_train_avg_dollars_lost_per_prediction,-0.117813
mean_test_accuracy,0.953836
mean_train_accuracy,0.997797


In [27]:
dai_imp_train_pred_df, dai_imp_val_pred_df = get_val_performance_after_all_train_refit(gs=gs_dai_imp, 
                                                                                       X_train=X_train_dai, 
                                                                                       y_train=y_train, 
                                                                                       X_val=X_val_dai, 
                                                                                       y_val=y_val, 
                                                                                       model_name=MODEL_NAME)

Best Iteration: 450
accuracy: Train=0.9978046875, Validation=0.954625
f025_score: Train=0.9976147083433627, Validation=0.9495828799419659
f05_score: Train=0.9975022016475338, Validation=0.9474680548982489
avg_dollars_lost_per_prediction: Train=0.11890625, Validation=2.4725


## Model: LightGBM Default Hyperparams, Features from DAI

In [28]:
%%time

MODEL_NAME = "LGBM Default Params, DAI Features"

cv_splitter = StratifiedKFold(n_splits=5, 
                              shuffle=True, 
                              random_state=7742)

gs_def_dai = GridSearchCV(estimator=LGBMClassifier(n_jobs=32, random_state=7742), 
                   param_grid={}, 
                   refit="avg_dollars_lost_per_prediction",
                   scoring=scorers_dict, 
                   cv=cv_splitter,
                   n_jobs=32, 
                   error_score="raise", 
                   return_train_score=True)

gs_def_dai.fit(X=X_train_dai, 
        y=y_train, 
        callbacks=[early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, 
                                  first_metric_only=True, 
                                  verbose=True)], 
        eval_metric=[average_dollars_scorer_lgbm, 
                     fpt5_scorer_lgbm, 
                     "binary", 
                     "accuracy"], 
        eval_set=[(X_val_dai, y_val)])

Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.247955	valid_0's avg_dollars_lost_per_prediction: 4.88375	valid_0's f05_score: 0.894011
Evaluated only: binary_logloss
CPU times: total: 16.6 s
Wall time: 9.3 s


In [29]:
gs_def_dai_df = display_cross_validation_scores(gs=gs_def_dai, model_name=MODEL_NAME)
gs_def_dai_df

Unnamed: 0,"LGBM Default Params, DAI Features"
mean_test_f025_score,0.90027
mean_train_f025_score,0.911802
mean_test_f025_score,0.90027
mean_train_f025_score,0.911802
mean_test_avg_dollars_lost_per_prediction,-4.82875
mean_train_avg_dollars_lost_per_prediction,-4.291758
mean_test_accuracy,0.908844
mean_train_accuracy,0.918373


In [30]:
def_dai_train_pred_df, def_dai_val_pred_df = get_val_performance_after_all_train_refit(gs=gs_def_dai, 
                                                                                       X_train=X_train_dai, 
                                                                                       y_train=y_train, 
                                                                                       X_val=X_val_dai, 
                                                                                       y_val=y_val, 
                                                                                       model_name=MODEL_NAME)

Best Iteration: 100
accuracy: Train=0.9186875, Validation=0.9083125
f025_score: Train=0.9114215826860678, Validation=0.8990501183786405
f05_score: Train=0.9067022086824067, Validation=0.8940109943099623
avg_dollars_lost_per_prediction: Train=4.306875, Validation=4.88375


## Model: LightGBM Default Hyperparams, Features from DAI, With Imputer

In [31]:
%%time

MODEL_NAME = "LGBM Default Params, DAI Features, Imputer"

cv_splitter = StratifiedKFold(n_splits=5, 
                              shuffle=True, 
                              random_state=7742)

dai_imp_pipe = Pipeline(steps=[("impute", SimpleImputer()), 
                          ('model', LGBMClassifier(n_jobs=32, random_state=7742))])

gs_def_dai_imp = GridSearchCV(estimator=dai_imp_pipe, 
                   param_grid={"impute__strategy":["mean"]}, 
                   refit="avg_dollars_lost_per_prediction",
                   scoring=scorers_dict, 
                   cv=cv_splitter,
                   n_jobs=32, 
                   error_score="raise", 
                   return_train_score=True)

gs_def_dai_imp.fit(X=X_train_dai, 
        y=y_train, 
        model__callbacks=[early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, 
                                  first_metric_only=True, 
                                  verbose=True)], 
        model__eval_metric=[average_dollars_scorer_lgbm, 
                     fpt5_scorer_lgbm, 
                     "binary", 
                     "accuracy"], 
        model__eval_set=[(X_val_dai, y_val)])

Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.247983	valid_0's avg_dollars_lost_per_prediction: 4.70375	valid_0's f05_score: 0.8972
Evaluated only: binary_logloss
CPU times: total: 15.6 s
Wall time: 9.26 s


In [32]:
gs_def_dai_imp_df = display_cross_validation_scores(gs=gs_def_dai_imp, model_name=MODEL_NAME)
gs_def_dai_imp_df

Unnamed: 0,"LGBM Default Params, DAI Features, Imputer"
mean_test_f025_score,0.901498
mean_train_f025_score,0.913827
mean_test_f025_score,0.901498
mean_train_f025_score,0.913827
mean_test_avg_dollars_lost_per_prediction,-4.772188
mean_train_avg_dollars_lost_per_prediction,-4.199336
mean_test_accuracy,0.909797
mean_train_accuracy,0.919572


In [33]:
def_dai_imp_train_pred_df, def_dai_imp_val_pred_df = get_val_performance_after_all_train_refit(gs=gs_def_dai_imp, 
                                                                                       X_train=X_train_dai, 
                                                                                       y_train=y_train, 
                                                                                       X_val=X_val_dai, 
                                                                                       y_val=y_val, 
                                                                                       model_name=MODEL_NAME)

Best Iteration: 100
accuracy: Train=0.918578125, Validation=0.9098125
f025_score: Train=0.9131488499162775, Validation=0.9030259572637316
f05_score: Train=0.9077315064523143, Validation=0.8971998837321966
avg_dollars_lost_per_prediction: Train=4.2328125, Validation=4.70375


## Model: LightGBM DAI Hyperparams, All Features

In [34]:
best_param_grid_all = {f"model__{key}":value for key, value in best_param_grid.items()}

In [None]:
%%time

MODEL_NAME = "LGBM DAI Params, All Features"

dai_all_pipe = Pipeline(steps=[("to_dataframe", FunctionTransformer(func=to_dataframe_and_dtypes)), 
                          ('model', LGBMClassifier(n_jobs=32, 
                                                   random_state=7742))])


cv_splitter = StratifiedKFold(n_splits=5, 
                              shuffle=True, 
                              random_state=7742)

gs_dai_all = GridSearchCV(estimator=dai_all_pipe, 
                   param_grid=best_param_grid_all, 
                   refit="avg_dollars_lost_per_prediction",
                   scoring=scorers_dict, 
                   cv=cv_splitter,
                   n_jobs=32, 
                   error_score="raise", 
                   return_train_score=True)

gs_dai_all.fit(X=X_train, 
        y=y_train, 
        model__callbacks=[early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, 
                                  first_metric_only=True, 
                                  verbose=True)], 
        model__eval_metric=[average_dollars_scorer_lgbm, 
                     fpt5_scorer_lgbm, 
                     "binary", 
                     "accuracy"], 
        model__eval_set=[(X_val, y_val)])

In [36]:
gs_dai_all_df = display_cross_validation_scores(gs=gs_dai_all, model_name=MODEL_NAME)
gs_dai_all_df

Unnamed: 0,"LGBM DAI Params, All Features"
mean_test_f025_score,0.934334
mean_train_f025_score,0.992197
mean_test_f025_score,0.934334
mean_train_f025_score,0.992197
mean_test_avg_dollars_lost_per_prediction,-3.208594
mean_train_avg_dollars_lost_per_prediction,-0.391719
mean_test_accuracy,0.940852
mean_train_accuracy,0.992547


In [37]:
gs_dai_all_train_pred_df, gs_dai_all_val_pred_df = get_val_performance_after_all_train_refit(gs=gs_dai_all, 
                                                                                       X_train=X_train, 
                                                                                       y_train=y_train, 
                                                                                       X_val=X_val, 
                                                                                       y_val=y_val, 
                                                                                       model_name=MODEL_NAME)

Best Iteration: 227
accuracy: Train=0.982484375, Validation=0.9375
f025_score: Train=0.9813235460348293, Validation=0.9319772631183351
f05_score: Train=0.9802869208891258, Validation=0.928460045189829
avg_dollars_lost_per_prediction: Train=0.9315625, Validation=3.33


## Model: LightGBM DAI Hyperparams, All Features, With Imputer

In [None]:
%%time

MODEL_NAME = "LGBM DAI Params, All Features, Imputer"

all_columns = X_train.columns
categorical_columns = X_train.select_dtypes(include=["category"]).columns.tolist()
numeric_columns = [col for col in all_columns if col not in categorical_columns]
categorical_column_indicies = [index for index, col in enumerate(all_columns) if col in categorical_columns]
categorical_column_indicies


categorical_pipe = Pipeline(steps=[("categorical_imputer", SimpleImputer(strategy="most_frequent"))])

numeric_pipe = Pipeline(steps=[("numeric_imputer", SimpleImputer(strategy="mean"))])

preprocess = ColumnTransformer(transformers=[("numeric_pipe", numeric_pipe, numeric_columns), 
                                             ("cat_pipe", categorical_pipe, categorical_columns)], 
                               remainder="passthrough")

preprocess_to_df = Pipeline(steps=[("imputers", preprocess), 
                                   ("to_dataframe", FunctionTransformer(func=to_dataframe_and_dtypes))])

dai_all_imp_pipe = Pipeline(steps=[("preprocess", preprocess_to_df), 
                          ('model', LGBMClassifier(n_jobs=32, 
                                                   random_state=7742))])


cv_splitter = StratifiedKFold(n_splits=5, 
                              shuffle=True, 
                              random_state=7742)

gs_dai_all_imp = GridSearchCV(estimator=dai_all_imp_pipe, 
                   param_grid=best_param_grid_all, 
                   refit="avg_dollars_lost_per_prediction",
                   scoring=scorers_dict, 
                   cv=cv_splitter,
                   n_jobs=32, 
                   error_score="raise", 
                   return_train_score=True)

gs_dai_all_imp.fit(X=X_train, 
        y=y_train, 
        model__callbacks=[early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, 
                                  first_metric_only=True, 
                                  verbose=True)], 
        model__eval_metric=[average_dollars_scorer_lgbm, 
                     fpt5_scorer_lgbm, 
                     "binary", 
                     "accuracy"], 
        model__eval_set=[(X_val, y_val)])

In [39]:
X_cat_pp = preprocess_to_df.fit_transform(X_train)

In [40]:
pd.DataFrame(X_cat_pp)[47].value_counts()

asia       111114
euorpe      13296
america      3590
Name: 47, dtype: int64

In [41]:
pd.DataFrame(X_cat_pp)[48].value_counts()

July         36387
June         33090
August       23492
May          17602
September     8665
April         5449
October       1935
March          971
November       270
February       113
December        18
January          8
Name: 48, dtype: int64

In [42]:
gs_dai_all_imp_df = display_cross_validation_scores(gs=gs_dai_all_imp, model_name=MODEL_NAME)
gs_dai_all_imp_df

Unnamed: 0,"LGBM DAI Params, All Features, Imputer"
mean_test_f025_score,0.921096
mean_train_f025_score,0.942293
mean_test_f025_score,0.921096
mean_train_f025_score,0.942293
mean_test_avg_dollars_lost_per_prediction,-3.969375
mean_train_avg_dollars_lost_per_prediction,-3.018867
mean_test_accuracy,0.905625
mean_train_accuracy,0.924736


In [43]:
gs_dai_all_imp_train_pred_df, gs_dai_all_imp_val_pred_df = get_val_performance_after_all_train_refit(gs=gs_dai_all_imp, 
                                                                                       X_train=X_train, 
                                                                                       y_train=y_train, 
                                                                                       X_val=X_val, 
                                                                                       y_val=y_val, 
                                                                                       model_name=MODEL_NAME)

Best Iteration: 38
accuracy: Train=0.93540625, Validation=0.91475
f025_score: Train=0.9426038332067248, Validation=0.919970912807209
f05_score: Train=0.93426680952342, Validation=0.9100975045447034
avg_dollars_lost_per_prediction: Train=2.918125, Validation=3.965


## Model: LightGBM Default Hyperparams, All Features

In [None]:
%%time

MODEL_NAME = "LGBM Default Params, All Features"

all_pipe = Pipeline(steps=[("to_dataframe", FunctionTransformer(func=to_dataframe_and_dtypes)), 
                          ('model', LGBMClassifier(n_jobs=32, 
                                                   random_state=7742))])


cv_splitter = StratifiedKFold(n_splits=5, 
                              shuffle=True, 
                              random_state=7742)

gs_def_all= GridSearchCV(estimator=all_pipe, 
                   param_grid={}, 
                   refit="avg_dollars_lost_per_prediction",
                   scoring=scorers_dict, 
                   cv=cv_splitter,
                   n_jobs=32, 
                   error_score="raise", 
                   return_train_score=True)

gs_def_all.fit(X=X_train, 
        y=y_train, 
        model__callbacks=[early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, 
                                  first_metric_only=True, 
                                  verbose=True)], 
        model__eval_metric=[average_dollars_scorer_lgbm, 
                     fpt5_scorer_lgbm, 
                     "binary", 
                     "accuracy"], 
        model__eval_set=[(X_val, y_val)])

In [45]:
gs_def_all_df = display_cross_validation_scores(gs=gs_def_all, model_name=MODEL_NAME)
gs_def_all_df

Unnamed: 0,"LGBM Default Params, All Features"
mean_test_f025_score,0.901484
mean_train_f025_score,0.912123
mean_test_f025_score,0.901484
mean_train_f025_score,0.912123
mean_test_avg_dollars_lost_per_prediction,-4.773438
mean_train_avg_dollars_lost_per_prediction,-4.276055
mean_test_accuracy,0.909484
mean_train_accuracy,0.918822


In [46]:
def_all_train_pred_df, def_all_val_pred_df = get_val_performance_after_all_train_refit(gs=gs_def_all, 
                                                                                       X_train=X_train, 
                                                                                       y_train=y_train, 
                                                                                       X_val=X_val, 
                                                                                       y_val=y_val, 
                                                                                       model_name=MODEL_NAME)

Best Iteration: 100
accuracy: Train=0.917453125, Validation=0.906
f025_score: Train=0.9102324091540022, Validation=0.8967826758000439
f05_score: Train=0.9053705977227317, Validation=0.8914828530027371
avg_dollars_lost_per_prediction: Train=4.3634375, Validation=4.99


## Model: LightGBM Default Hyperparams, All Features, Imputer

In [None]:
%%time

MODEL_NAME = "LGBM Default Params, All Features, Imputer"

all_columns = X_train.columns
categorical_columns = X_train.select_dtypes(include=["category"]).columns.tolist()
numeric_columns = [col for col in all_columns if col not in categorical_columns]
categorical_column_indicies = [index for index, col in enumerate(all_columns) if col in categorical_columns]
categorical_column_indicies


categorical_pipe = Pipeline(steps=[("categorical_imputer", SimpleImputer(strategy="most_frequent"))])

numeric_pipe = Pipeline(steps=[("numeric_imputer", SimpleImputer(strategy="mean"))])

preprocess = ColumnTransformer(transformers=[("numeric_pipe", numeric_pipe, numeric_columns), 
                                             ("cat_pipe", categorical_pipe, categorical_columns)], 
                               remainder="passthrough")

preprocess_to_df = Pipeline(steps=[("imputers", preprocess), 
                                   ("to_dataframe", FunctionTransformer(func=to_dataframe_and_dtypes))])

all_imp_pipe = Pipeline(steps=[("preprocess", preprocess_to_df), 
                          ('model', LGBMClassifier(n_jobs=32, 
                                                   random_state=7742))])


cv_splitter = StratifiedKFold(n_splits=5, 
                              shuffle=True, 
                              random_state=7742)

gs_def_all_imp = GridSearchCV(estimator=all_imp_pipe, 
                   param_grid={}, 
                   refit="avg_dollars_lost_per_prediction",
                   scoring=scorers_dict, 
                   cv=cv_splitter,
                   n_jobs=32, 
                   error_score="raise", 
                   return_train_score=True)

gs_def_all_imp.fit(X=X_train, 
        y=y_train, 
        model__callbacks=[early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, 
                                  first_metric_only=True, 
                                  verbose=True)], 
        model__eval_metric=[average_dollars_scorer_lgbm, 
                     fpt5_scorer_lgbm, 
                     "binary", 
                     "accuracy"], 
        model__eval_set=[(X_val, y_val)])

In [48]:
gs_def_all_imp_df = display_cross_validation_scores(gs=gs_def_all_imp, model_name=MODEL_NAME)
gs_def_all_imp_df

Unnamed: 0,"LGBM Default Params, All Features, Imputer"
mean_test_f025_score,0.84483
mean_train_f025_score,0.852053
mean_test_f025_score,0.84483
mean_train_f025_score,0.852053
mean_test_avg_dollars_lost_per_prediction,-7.106094
mean_train_avg_dollars_lost_per_prediction,-6.827695
mean_test_accuracy,0.828914
mean_train_accuracy,0.834529


In [49]:
def_all_imp_train_pred_df, def_all_imp_val_pred_df = get_val_performance_after_all_train_refit(gs=gs_def_all_imp, 
                                                                                       X_train=X_train, 
                                                                                       y_train=y_train, 
                                                                                       X_val=X_val, 
                                                                                       y_val=y_val, 
                                                                                       model_name=MODEL_NAME)

Best Iteration: 11
accuracy: Train=0.8255390625, Validation=0.8226875
f025_score: Train=0.8481855811253066, Validation=0.8476228366819175
f05_score: Train=0.816234913854209, Validation=0.8138202375950189
avg_dollars_lost_per_prediction: Train=6.93859375, Validation=6.94125
