### 1st models

 - Drop every row with a missing value
 - Split dataset
 - Train model
 - Report model
 - Profit?

In [None]:
%reset
from utils import *

In [None]:
data = preproccess_data_simple(df).dropna()

In [None]:
X = data.drop('BAD', axis=1)
y = data['BAD']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RNG)

In [None]:
# Fit everything listed in `models_to_fit` on the training split.
models = fit_models(X_train, y_train, models_to_fit)

In [None]:
# Uncomment to store the freshly fitted models (presumably written as a pickle file).
# save_to_pickle(models, 'pickles/models_1st_models.pickle')

In [None]:
# Rebinds `models` to the stored models, replacing the result of the fit cell above.
# NOTE(review): presumably pickle-based - only load files from a trusted source.
models = load_models('pickles/models_1st_models.pickle')

In [None]:
# Report every model in `models` against the held-out test split.
for model in models:
    report_GS(model, X_test, y_test)

### 2nd Models
  - Preprocess data
    - Add variable `is_data_good` indicating whether there was missing data in the given row
    - Fill NaNs in the `Reason` column with the value `Other`
    - Fill NaNs in the `Job, Derog, Delinq` columns with their modes
  - Fill remaining NaNs with the means of the given columns (computed on the training split)

In [None]:
%reset
from utils import *

In [None]:
data = preproccess_data_simple(df)

In [None]:
X = data.drop('BAD', axis=1)
y = data['BAD']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RNG)

In [None]:
X_test = X_test.fillna(X_train.mean())
X_train = X_train.fillna(X_train.mean())

In [None]:
# Fit everything listed in `models_to_fit` on the mean-imputed training split.
models = fit_models(X_train, y_train, models_to_fit)

In [None]:
# Uncomment to store the freshly fitted models (presumably written as a pickle file).
# save_to_pickle(models, 'pickles/models_2nd_models.pickle')

In [None]:
# Rebinds `models` to the stored models, replacing the result of the fit cell above.
# NOTE(review): presumably pickle-based - only load files from a trusted source.
models = load_models('pickles/models_2nd_models.pickle')

In [None]:
# Report every model in `models` against the held-out test split.
for model in models:
    report_GS(model, X_test, y_test)

### 3rd Models
 - Same dataset as with 2nd models
 - Use plain SMOTENC to over-sample the minority class of `BAD`

In [None]:
%reset
from utils import *

In [None]:
data = preproccess_data_simple(df)

In [None]:
X = data.drop('BAD', axis=1)
y = data['BAD']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RNG)

In [None]:
X_test = X_test.fillna(X_train.mean())
X_train = X_train.fillna(X_train.mean())

In [None]:
sm = SMOTENC(
    categorical_features=range(10, 18),
    sampling_strategy='minority',
    random_state=RNG
)

In [None]:
X_train, y_train  = sm.fit_resample(X_train, y_train) # type: ignore

In [None]:
# Fit everything listed in `models_to_fit` on the over-sampled training split.
models = fit_models(X_train, y_train, models_to_fit) # type: ignore

In [None]:
# Uncomment to store the freshly fitted models (presumably written as a pickle file).
# save_to_pickle(models, 'pickles/models_3rd_models.pickle')

In [None]:
# Rebinds `models` to the stored models, replacing the result of the fit cell above.
# NOTE(review): presumably pickle-based - only load files from a trusted source.
models = load_models('pickles/models_3rd_models.pickle')

In [None]:
# Report every model in `models` against the held-out (not resampled) test split.
for model in models:
    report_GS(model, X_test, y_test)

### 4th Models
 - Same dataset as with 2nd models
 - Grid search (GS) over SMOTENC hyper-parameters (number of neighbours and sampling strategy)

In [None]:
%reset
from utils import *

In [None]:
data = preproccess_data_simple(df)

In [None]:
X = data.drop('BAD', axis=1)
y = data['BAD']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RNG)

In [None]:
X_test = X_test.fillna(X_train.mean())
X_train = X_train.fillna(X_train.mean())

In [None]:
smote_k_neighbors = [5, 7, 10, 15]
smote_strategies = [0.4, 0.6, 0.8, 1.0]

In [None]:
task_args = [
    (
        X_train,
        y_train,
        k_n, 
        strat
    )
    for k_n in smote_k_neighbors
    for strat in smote_strategies
]

In [None]:
# Note: this cell will produce A LOT of warnings, due to n_jobs being set to -1; however, this doesn't
# affect the models themselves. To avoid the warnings, set the n_jobs param of the models to None.

# Run get_models_smote once per entry of task_args in a pool of 16 worker
# processes (task_args currently holds 4 x 4 = 16 entries, i.e. one worker per task).
# imap yields the results in task order. The iterator has to be drained inside
# the `with` block, because leaving the block shuts the pool down.
with Pool(16) as pool:
    models = list(pool.imap(get_models_smote, task_args))

In [None]:
# Uncomment to store all grid-search results in a single file.
# save_to_pickle(models, 'pickles/models_4th_models.pickle')

In [None]:
# Rebinds `models` to the stored results, replacing the output of the pool cell above.
# NOTE(review): presumably pickle-based - only load files from a trusted source.
models = load_models('pickles/models_4th_models.pickle')

In [None]:
# Save the models in parts (one file per grid point), to prevent GitHub from complaining

# for i in range(len(smote_k_neighbors)*len(smote_strategies)):
#     save_to_pickle(models[i], f'pickles/models_4th_models_part{i}.pickle')

In [None]:
# Load them in parts as well

# models = []
# for i in range(len(smote_k_neighbors)*len(smote_strategies)):
#     models.append(
#         load_models(f'pickles/models_4th_models_part{i}.pickle')
#     )

In [None]:
# Each entry of `models` unpacks into (hyper_params, fitted_models, exec_time).
# The inner list gets its own name so that `models` is not rebound while it is
# being iterated; otherwise `models` would end up pointing at the last inner
# list and re-running this cell would fail.
for hyper_params, fitted_models, exec_time in models:
    print("=================================================================")
    print("MODEL: ", hyper_params)
    print("TIME: ", exec_time)
    # Report every model fitted for this hyper-parameter combination against
    # the held-out test split.
    for model in fitted_models:
        report_GS(model, X_test, y_test)