In [12]:
import pandas as pd


In [32]:
bomba_data = pd.read_csv('../data/processed/Data_train.csv')
bomba_predict = pd.read_csv('../data/processed/Data_test.csv')

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import RobustScaler

In [34]:
bomba_data.head().T

Unnamed: 0,0,1,2,3,4
Unnamed: 0,0.0,1.0,2.0,3.0,4.0
Basal Rate (U/h),1.95,1.6,1.95,2.7,2.2
BWZ Estimate (U),9.5,6.6,5.5,4.4,11.2
BWZ Carb Ratio (U/Ex),1.0,1.9,1.0,1.1,1.5
BWZ Insulin Sensitivity (mg/dL/U),45.0,45.0,45.0,45.0,45.0
BWZ Carb Input (exchanges),9.5,3.5,5.5,4.0,7.5
BWZ BG Input (mg/dL),84.0,111.0,99.0,97.0,75.0
BWZ Correction Estimate (U),0.0,0.0,0.0,0.0,0.0
BWZ Food Estimate (U),9.5,6.6,5.5,4.4,11.2
BWZ Active Insulin (U),0.0,0.0,0.0,1.2,0.0


In [35]:
bomba_data.columns.tolist()

['Unnamed: 0',
 'Basal Rate (U/h)',
 'BWZ Estimate (U)',
 'BWZ Carb Ratio (U/Ex)',
 'BWZ Insulin Sensitivity (mg/dL/U)',
 'BWZ Carb Input (exchanges)',
 'BWZ BG Input (mg/dL)',
 'BWZ Correction Estimate (U)',
 'BWZ Food Estimate (U)',
 'BWZ Active Insulin (U)',
 'BWZ Unabsorbed Insulin Total (U)',
 'day_of_month',
 'day_of_week',
 'month_of_year',
 'hour']

In [36]:
# Food estimate lo quitamos de la lista por correlación = 1 con BWZ Estimate (U)

NUM_FEATS = ['Basal Rate (U/h)','BWZ Carb Ratio (U/Ex)','BWZ Insulin Sensitivity (mg/dL/U)',
             'BWZ Carb Input (exchanges)', 'BWZ BG Input (mg/dL)','BWZ Correction Estimate (U)',
             'BWZ Active Insulin (U)','BWZ Unabsorbed Insulin Total (U)']
CAT_FEATS = ['day_of_month', 'day_of_week', 'month_of_year', 'hour']
FEATS = NUM_FEATS + CAT_FEATS
TARGET = 'BWZ Estimate (U)'
FEATS

['Basal Rate (U/h)',
 'BWZ Carb Ratio (U/Ex)',
 'BWZ Insulin Sensitivity (mg/dL/U)',
 'BWZ Carb Input (exchanges)',
 'BWZ BG Input (mg/dL)',
 'BWZ Correction Estimate (U)',
 'BWZ Active Insulin (U)',
 'BWZ Unabsorbed Insulin Total (U)',
 'day_of_month',
 'day_of_week',
 'month_of_year',
 'hour']

In [37]:
numeric_transformer = \
Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), 
                ('scaler', StandardScaler())])

In [38]:
# Probar strategy con mean y median
categorical_transformer = \
Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [39]:
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, NUM_FEATS)])



In [40]:
pd.DataFrame(data=preprocessor.fit_transform(bomba_data)).head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,4.112078,0.811676,-2.058106,1.586585,-1.180318,-0.190357,-0.410317,-0.097593
1,2.753731,6.019288,-2.058106,-0.808604,0.661402,-0.190357,-0.410317,-0.097593
2,4.112078,0.811676,-2.058106,-0.010208,-0.157141,-0.190357,-0.410317,-0.097593
3,7.022822,1.3903,-2.058106,-0.609005,-0.293564,-0.190357,1.086021,-0.097593
4,5.082326,3.704794,-2.058106,0.788188,-1.794225,-0.190357,-0.410317,-0.097593


In [41]:
preprocessor

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('num',
                                 Pipeline(memory=None,
                                          steps=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='median',
                                                                verbose=0)),
                                                 ('scaler',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                           

In [49]:
from sklearn.model_selection import train_test_split

In [50]:
bomba_train, bomba_test = train_test_split(bomba_data)

In [51]:
print(bomba_train.shape)
print(bomba_test.shape)

(2625, 15)
(875, 15)


In [52]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

model= Pipeline(steps=[('preprocessor', preprocessor),
                       ('regressor', RandomForestRegressor())])

In [53]:
model.fit(bomba_train[FEATS], bomba_train[TARGET]);

In [54]:
from sklearn.metrics import mean_squared_error

In [55]:
y_test = model.predict(bomba_test[FEATS])
y_train = model.predict(bomba_train[FEATS])

In [56]:
print(f"test error: {mean_squared_error(y_pred=y_test, y_true=bomba_test[TARGET], squared=False)}")
print(f"train error: {mean_squared_error(y_pred=y_train, y_true=bomba_train[TARGET], squared=False)}")

test error: 0.1660909079820014
train error: 0.03679526367036169


In [59]:
from sklearn.model_selection import cross_val_score

In [61]:
scores = cross_val_score(model, 
                         bomba_data[FEATS], 
                         bomba_data[TARGET], 
                         scoring='neg_root_mean_squared_error', 
                         cv=6, n_jobs=-1)

In [62]:
import numpy as np
np.mean(-scores)

0.27280975178126593

In [63]:
print(-scores)


[1.22315996 0.0309368  0.1073748  0.06329453 0.10575029 0.10634213]


In [64]:
from sklearn.model_selection import RandomizedSearchCV

In [66]:
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median','most_frequent','constant'],
    'regressor__n_estimators': [8,16,64],
    'regressor__max_depth': [4,8],
    'regressor__max_features': ['auto', 'sqrt']
}

grid_search = RandomizedSearchCV(model, 
                                 param_grid, 
                                 cv=5, 
                                 verbose=10, 
                                 scoring='neg_root_mean_squared_error', 
                                 n_jobs=-1,
                                 n_iter=32)

grid_search.fit(bomba_data[FEATS], bomba_data[TARGET])

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:   11.8s finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('preprocessor',
                                              ColumnTransformer(n_jobs=None,
                                                                remainder='drop',
                                                                sparse_threshold=0.3,
                                                                transformer_weights=None,
                                                                transformers=[('num',
                                                                               Pipeline(memory=None,
                                                                                        steps=[('imputer',
                                                                                                SimpleImputer(add_indicator=False,
                                                                                     

In [67]:
grid_search.best_params_

{'regressor__n_estimators': 8,
 'regressor__max_features': 'auto',
 'regressor__max_depth': 8,
 'preprocessor__num__imputer__strategy': 'mean'}

In [68]:
grid_search.best_score_

-0.31941209035189716