In [5]:
import json
import os
import sys
import warnings

import numpy as np
import pandas as pd
from datetime import datetime
from pprint import pprint
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer as Imputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' filter also hides scikit-learn warnings
# (e.g. convergence warnings) that could be relevant when judging the models
# fitted later in this notebook -- consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [6]:
# Put ../src on the import path so modules stored there can be imported below.
# The membership check keeps the cell idempotent: re-running it does not pile
# up duplicate entries on sys.path.
SRC_DIR = os.path.join('..', 'src')
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [7]:
import importlib
import model
# Re-execute the already-imported module so that the names pulled in below come
# from its current source (presumably to pick up edits made while the kernel
# is running -- the module is expected to live in ../src, added to sys.path above).
importlib.reload(model)

# Helper functions taken from the freshly reloaded `model` module.
from model import get_model_params, timer, measure_prediction_time, apply_ml_model, save_model_parameters, save_model_metrics

# set model parameters and capture data

In [8]:
# Metric name handed to the model-selection helper.
# ('neg_mean_squared_error' is the alternative previously used here.)
scoring = 'f1'

# Data folders, all resolved relative to the notebook's working directory.
data_root = os.path.join('..', 'data')
inputs = os.path.join(data_root, '03_processed')
models_reports = os.path.join(data_root, '04_models')
model_outputs = os.path.join(data_root, '05_model_output')
reports = os.path.join(data_root, '06_reporting')

# Read the three training tables in a fixed order, each indexed by its 'id' column.
X_train, X_train_onehot, y_train = (
    pd.read_csv(os.path.join(inputs, csv_name), index_col='id')
    for csv_name in ('X_train.csv', 'X_train_onehot.csv', 'y_train.csv')
)

data_list = [X_train, X_train_onehot, y_train]

# Quick sanity check: one (rows, columns) tuple per table.
for df in data_list:
    print(df.shape)

(4930, 20)
(4930, 26)
(4930, 1)


In [9]:
# Preview the first rows of the one-hot encoded training features.
X_train_onehot.head()

Unnamed: 0_level_0,gender_male,seniorcitizen,partner,dependents,tenure,phoneservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,...,dummy_multiplelines_yes,dummy_internetservice_fiber optic,dummy_internetservice_no,dummy_streamingtv_no internet service,dummy_streamingtv_yes,dummy_contract_one year,dummy_contract_two year,dummy_paymentmethod_credit card (automatic),dummy_paymentmethod_electronic check,dummy_paymentmethod_mailed check
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7590-vhveg,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
5575-gnvde,1.0,0.0,0.0,0.0,34.0,1.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,1,0,0,0,1
3668-qpybk,1.0,0.0,0.0,0.0,2.0,1.0,1.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
7795-cfocw,1.0,0.0,0.0,0.0,45.0,0.0,1.0,0.0,1.0,1.0,...,0,0,0,0,0,1,0,0,0,0
9237-hqitu,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0


# Machine Learning

convergence warning: https://stackoverflow.com/questions/20681864/lasso-on-sklearn-does-not-converge

In [10]:
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LogisticRegression

In [11]:
# Results of every experiment in this notebook, keyed by experiment name.
ml_dict = dict()

# Binary-target setup: logistic regression with an L2 penalty, searched over
# five values of C spaced logarithmically between 1e-5 and 1e1.
# (For a regression target the earlier draft used ElasticNet instead, with a
# grid over alpha = np.linspace(0.2, 1, 5) and l1_ratio = np.linspace(0, 1, 5),
# random_state 42, and do_transform_label = 'log'.)
c_space = np.logspace(-5, 1, num=5)

# Keys carry the 'model__' prefix used for the hyperparameter grid.
parameters = {
    'model__C': c_space,
    'model__penalty': ['l2'],
    'model__random_state': [42],
}

ml_model = LogisticRegression()

# No transformation of the label is requested for this setup.
do_transform_label = None

# key = 'standard'

## test with different preprocessing steps
There are two different feature sets: on `X_train_onehot` I applied one-hot encoding, while on `X_train` I applied ordinal encoding. The former is aimed at linear regression models, and the latter is generally used for tree models.

With the `columns` parameter, I can choose which column groups to use. For instance, I might exclude the collinear variables identified by the VIF function applied in notebook 5. That is useful for linear regression models.

```python
treat_collinearity = False, do_build_polynomals=False, do_treat_skewness=False
```

In [12]:
# Baseline experiment: all columns of the one-hot encoded table, with a median
# imputer and a standard scaler passed to the helper; polynomial features and
# skewness treatment are switched off.
model_type = 'reg'
columns = X_train_onehot.columns
run_stats = ml_dict[model_type] = {}

preprocessing = dict(
    do_build_polynomals=False,
    do_treat_skewness=False,
    imputation=Imputer(strategy='median'),
    scaler=StandardScaler(),
)

clf, run_stats['train_time'], run_stats['prediction_time'] = apply_ml_model(
    X_train_onehot, y_train, columns, ml_model, parameters, scoring, **preprocessing
)
run_stats['best_params'], run_stats['best_score'] = get_model_params(clf, scoring)
pprint(ml_dict)

# Hand the fitted object and the collected metrics to the project's save helpers.
save_model_parameters(models_reports, model_type, clf)
save_model_metrics(model_outputs, model_type, ml_dict)

test type: False
{'reg': {'best_params': {'model__C': 10.0,
                         'model__penalty': 'l2',
                         'model__random_state': 42},
         'best_score': 0.5921888532265143,
         'prediction_time': 0.0004001,
         'train_time': 1.41481}}


```python
treat_collinearity = True, do_build_polynomals=False, do_treat_skewness=False,
```

In [13]:
# Second experiment; its results are stored under the 'reg_nocol' key of ml_dict.
model_type = 'reg_nocol'
ml_dict[model_type] = {}

# NOTE(review): the reduced column list below is commented out, so this run
# still passes `columns` (every column of X_train_onehot, assigned in the 'reg'
# cell) and the call is otherwise identical to the 'reg' experiment. The
# recorded output shows the same best_params and best_score for both runs,
# which is consistent with no columns having been removed.
# TODO: rebuild `columns_nocol` (the source `dfs_dict` is not defined in the
# visible part of this notebook) and pass it instead of `columns`.
# columns_nocol = dfs_dict['X_train_oh_nocol'].columns.to_list()

clf, ml_dict[model_type]['train_time'], ml_dict[model_type]['prediction_time'] = apply_ml_model(
    X_train_onehot, y_train, columns, ml_model, parameters, scoring,
    do_build_polynomals=False, 
    do_treat_skewness=False,
    imputation=Imputer(strategy='median'), scaler=StandardScaler(),
    )
ml_dict[model_type]['best_params'], ml_dict[model_type]['best_score']  = get_model_params(clf, scoring)
# Prints the whole results dictionary, i.e. this run together with earlier ones.
pprint(ml_dict)

save_model_parameters(models_reports, model_type, clf)
save_model_metrics(model_outputs, model_type, ml_dict)

test type: False
{'reg': {'best_params': {'model__C': 10.0,
                         'model__penalty': 'l2',
                         'model__random_state': 42},
         'best_score': 0.5921888532265143,
         'prediction_time': 0.0004001,
         'train_time': 1.41481},
 'reg_nocol': {'best_params': {'model__C': 10.0,
                               'model__penalty': 'l2',
                               'model__random_state': 42},
               'best_score': 0.5921888532265143,
               'prediction_time': 0.0001999,
               'train_time': 1.148982}}


I might use the alternative encoding just to demonstrate the impact on the score.