# Model Selection
We will be focusing on XGBoost, since it achieved the best score in the eda steps.
Possibly the best solution would involve a combination of some neural network, with XGBoost 
and/or LightGBM. More details on the steps and decisions can be found in the data_eda.ipynb notebook

## Imports 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import xgboost as xgb
import notebook_utils as nu 
import numpy as np
import joblib

random_state = 42

## Loads Data and Separates into Train and Test sets

In [2]:
data = pd.read_csv('./data/train.csv')

In [3]:
data.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [4]:
x_cols = [x for x in data.columns if (x not in ['ID_code', 'target'])]

X = data[x_cols]
y = data['target']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=random_state)

### Oversample train data

In [6]:
y_train.head()

38762     0
76883     0
2018      0
133899    0
170373    1
Name: target, dtype: int64

In [7]:
X_train.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
38762,12.6508,-1.4518,4.9398,9.4263,11.3132,-3.2654,5.304,20.0105,-0.5506,7.4902,...,0.9493,10.1718,1.8558,12.2295,22.8003,0.1811,1.0184,8.9432,10.7889,-15.156
76883,8.8434,-5.0525,5.8053,2.554,9.7131,1.8681,5.308,14.4369,4.9028,6.9503,...,5.287,10.8789,2.6776,-5.2744,19.9168,-0.5264,-3.4889,9.4107,10.8898,6.696
2018,5.8905,3.834,7.7051,8.8097,12.6723,0.0678,4.1572,14.7936,-0.2814,9.8768,...,4.1241,4.4696,5.0724,1.5313,17.1286,0.912,0.8431,8.8373,16.0513,-11.9798
133899,8.2786,5.8034,13.2013,8.9956,13.5608,-10.0445,5.1545,16.3067,-3.5246,10.012,...,0.7748,14.6195,3.061,-5.9456,15.4801,0.2584,-1.0892,7.4688,16.2685,-10.2297
170373,7.3882,6.3704,9.9413,7.9744,11.6259,-8.782,4.4755,17.7317,3.7531,8.8027,...,9.1395,5.5115,3.3208,0.3378,19.2325,-2.2717,7.2787,9.0987,15.9157,-17.2046


In [8]:
df0_indexes = y_train.index[y_train == 0].tolist()
df1_indexes = y_train.index[y_train == 1].tolist()

X_train_0 = X_train.loc[df0_indexes]
y_train_0 = y_train.loc[df0_indexes]

nr_df_0 = X_train_0.shape[0]

X_train_1 = X_train.loc[df1_indexes]
y_train_1 = y_train.loc[df1_indexes]

n_samples = nr_df_0
X_train_1, y_train_1 = resample(X_train_1, y_train_1, replace=True, n_samples=n_samples,random_state=random_state)

X_train = pd.concat([X_train_0, X_train_1])
y_train = pd.concat([y_train_0, y_train_1])

In [9]:
X_train.describe()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
count,323870.0,323870.0,323870.0,323870.0,323870.0,323870.0,323870.0,323870.0,323870.0,323870.0,...,323870.0,323870.0,323870.0,323870.0,323870.0,323870.0,323870.0,323870.0,323870.0,323870.0
mean,10.892665,-1.365485,10.910729,6.825571,11.097798,-4.748479,5.48813,16.542181,0.365201,7.501028,...,3.575732,7.623189,1.842769,3.236723,17.89859,-0.093978,2.473966,8.864975,15.659997,-2.963305
std,3.150621,4.13683,2.740539,2.060147,1.638567,7.986068,0.896579,3.420468,3.333813,1.256785,...,4.688064,3.113859,1.484306,4.024613,3.165741,1.455688,5.496634,0.93568,3.123425,10.446711
min,0.4528,-15.0434,2.1171,-0.0402,5.0748,-32.5626,2.3473,5.3497,-10.5055,3.9705,...,-14.0933,-2.6917,-3.8145,-11.7834,8.6944,-5.261,-14.2096,5.9606,6.2993,-38.8528
25%,8.5595,-4.533075,8.8264,5.27195,9.8895,-10.944625,4.8192,13.9216,-2.2272,6.5517,...,0.1672,5.2705,0.7887,0.4978,15.529,-1.150975,-1.765575,8.2135,13.5739,-10.79095
50%,10.7268,-1.3386,10.792,6.86455,11.126,-4.5136,5.4777,16.4638,0.47965,7.5716,...,3.5571,7.5434,1.7909,3.301,17.8588,-0.135,2.6009,8.8464,15.7195,-2.4664
75%,13.0141,1.6462,12.731,8.3529,12.2911,1.2232,6.1005,19.0825,3.0124,8.524375,...,6.8434,9.7183,2.8685,6.1409,20.3184,0.8929,6.728,9.5629,17.942875,5.2492
max,20.315,10.3768,19.353,13.1883,16.6714,17.2516,8.3556,27.6918,10.1513,11.1506,...,18.4409,16.6846,8.4024,18.2818,27.9288,4.2729,18.3215,12.0004,26.0791,28.5007


In [10]:
y_train.describe()

count    323870.000000
mean          0.500000
std           0.500001
min           0.000000
25%           0.000000
50%           0.500000
75%           1.000000
max           1.000000
Name: target, dtype: float64

## 1) Train Single XGBoost Instance

### 1.1) Hyperparameters:

In [11]:
hyperparameters = {
    'learning_rate': 0.3, # default
    'max_depth': 6, # default
    'min_child_weight': 1, # default
#     'subsample': 0.75,
#     'colsample_bytree': 0.75,
    'random_state': random_state
}

### 1.3) Train:

In [12]:
xgb_model = xgb.XGBClassifier(learning_rate=hyperparameters['learning_rate'],
                              n_jobs=-1,
                              max_depth=hyperparameters['max_depth'],
                              min_child_weight=hyperparameters['min_child_weight'],
#                               subsample=hyperparameters['subsample'],
#                               colsample_bytree=hyperparameters['colsample_bytree'],
                              objective='binary:logistic',
                              random_state=random_state,
                              verbosity=1
                              )
xgb_model.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.3, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=-1, num_parallel_tree=1,
              objective='binary:logistic', random_state=42, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=1)

In [13]:
print(xgb_model.get_xgb_params())
xgb.XGBClassifier(params = xgb_model.get_xgb_params())

{'objective': 'binary:logistic', 'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1, 'gamma': 0, 'gpu_id': -1, 'interaction_constraints': '', 'learning_rate': 0.3, 'max_delta_step': 0, 'max_depth': 6, 'min_child_weight': 1, 'monotone_constraints': '()', 'n_jobs': -1, 'num_parallel_tree': 1, 'random_state': 42, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'subsample': 1, 'tree_method': 'exact', 'validate_parameters': 1, 'verbosity': 1}


XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None...
                      'n_jobs': -1, 'num_parallel_tree': 1,
                      'objective': 'binary:logistic', 'random_state': 42,
                      'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1,
                      'subsample': 1, 'tree_method': 'exact',
                      'validate_parameters': 1, 'verbosity': 1},
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              use_label_encoder=True, validate_parameters=None, verbosity=None)

### 1.4) Evaluate the Results:

In [18]:
y_pred = xgb_model.predict(X_test)
y_true = np.array(y_test)

scores = nu.get_scores(y_true, y_pred)
nu.print_scores(scores)

accuracy score: 0.86595
gini score: 0.5079081009615076
f1 score: 0.48193236714975846
precision score: 0.3968809675366009
recall score: 0.6133792424987703


### 1.4) Save Model

In [22]:
joblib.dump(xgb_model, "xgb_model.joblib.dat")

['xgb_model.joblib.dat']

### 1.5) Tests if model was correctly saved

In [18]:
xgb_model = pickle.load(open("xgb_model.pickle.dat", "rb"))

In [19]:
y_pred = xgb_model.predict(X_test)
y_true = np.array(y_test)

scores = nu.get_scores(y_true, y_pred)
nu.print_scores(scores)

accuracy score: 0.86595
gini score: 0.5079081009615076
f1 score: 0.48193236714975846
precision score: 0.3968809675366009
recall score: 0.6133792424987703


## 2) First Hyperparameter Selection/Tuning
TODO: make GridSearchCV

In [13]:
'learning_rate': [0.01, 0.03, 0.1],
        'max_depth': [3, 5, 7, 10, 15],
        'min_child_weight': [1, 3, 5, 10],
        'subsample': [0.3, 0.5, 0.75, 1],
        'colsample_bytree': [0.3, 0.5, 0.75, 1],
        'n_estimators' : [50, 100, 200, 500, 1000],
        'objective': ['reg:squarederror']

SyntaxError: invalid syntax (<ipython-input-13-326acb4f6f3b>, line 1)

## 3) Second Hyperparameter Selection
TODO
The second iteration is made based on the first