# Model building

In [None]:
import pickle
from pathlib import Path

from scipy.stats import uniform, randint
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV

## 1. Asset loading

### 1.1. Feature information

In [None]:
def _unpickle(path):
    """Return the object stored in the pickle file at *path*."""
    return pickle.loads(Path(path).read_bytes())


# Feature metadata saved under ../data. Later cells treat input_features and
# output_features as dictionaries keyed by model name.
biometric_features = _unpickle('../data/biometric_features.pkl')
input_features = _unpickle('../data/input_features.pkl')
output_features = _unpickle('../data/output_features.pkl')
categorical_features = _unpickle('../data/categorical_features.pkl')

### 1.2. Datasets

In [None]:
# Training split: used for cross-validation and hyperparameter search.
with open('../data/processed/train.pkl', 'rb') as input_file:
    train_df=pickle.load(input_file)

# Held-out test split. It gets its own name so that it does not overwrite
# train_df (previously both files were loaded into train_df, which silently
# replaced the training data with the test data).
with open('../data/processed/test.pkl', 'rb') as input_file:
    test_df=pickle.load(input_file)

## 2. Model Building

We need to build two models - one to predict time and the other to predict calories. We will again use a dictionary to keep things organized.

### 2.1. Model dictionary definition

In [None]:
# One separate, untuned regressor instance per prediction target, each
# created with early stopping enabled.
models = {
    model_key: HistGradientBoostingRegressor(early_stopping=True)
    for model_key in ('calorie_model', 'time_model')
}

### 2.2. Naive model cross validation

In [None]:
# Cross-validation RMSE scores of the untuned models, keyed by model name.
cross_val_results={}

for model_name, model in models.items():

    # The feature names for each model are the biometric features plus the
    # 'extra' feature stored under the model's key in the input features
    # dictionary. The label name is stored under the same key in the output
    # features dictionary.
    extra_features=input_features[model_name]

    # NOTE(review): the 'extra' entry may be a single column name or a
    # collection of names - both are accepted here. TODO confirm against the
    # contents of input_features.pkl.
    if isinstance(extra_features, str):
        extra_features=[extra_features]

    feature_names=list(biometric_features) + list(extra_features)

    # NOTE(review): assumes the label entry is a single column name - confirm.
    label_name=output_features[model_name]

    # Score on the training data only. scikit-learn reports this metric as a
    # negated RMSE (so that higher is better); flip the sign before storing.
    scores=cross_val_score(
        model,
        train_df[feature_names],
        train_df[label_name],
        scoring='neg_root_mean_squared_error'
    )

    cross_val_results[model_name]=-scores

    print(f'{model_name}: cross-validation RMSE {(-scores).mean():.3f} +/- {scores.std():.3f}')

## 3. Model optimization

In [None]:
# Search space sampled by RandomizedSearchCV. scipy's uniform(loc, scale)
# samples from [loc, loc + scale]; randint(low, high) excludes high.
# NOTE(review): every range except max_iter is a starting point, not a tuned
# choice - adjust as needed.
hyperparameters={
    'max_iter': randint(10, 10000),
    'learning_rate': uniform(0.01, 0.29),
    'max_depth': randint(2, 16),
    'min_samples_leaf': randint(5, 100),
    'l2_regularization': uniform(0.0, 1.0)
}

# Winning hyperparameters for each model, keyed by model name.
optimized_hyperparameters={}

# Iterate over a snapshot of the items because the dictionary's values are
# replaced inside the loop.
for model_name, model in list(models.items()):

    # Same feature/label selection rule as for the naive models: biometric
    # features plus the model's 'extra' input feature, and the label stored
    # under the same key in the output features dictionary.
    extra_features=input_features[model_name]

    # NOTE(review): the 'extra' entry may be a single column name or a
    # collection of names - both are accepted here. TODO confirm.
    if isinstance(extra_features, str):
        extra_features=[extra_features]

    feature_names=list(biometric_features) + list(extra_features)
    label_name=output_features[model_name]

    # HistGradientBoostingRegressor is fast on this dataset, so a few hundred
    # random draws are affordable. Increase n_iter for a more thorough search.
    search=RandomizedSearchCV(
        model,
        hyperparameters,
        n_iter=200,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        random_state=0
    )

    search.fit(train_df[feature_names], train_df[label_name])

    # Replace the naive model with the best one (refit on all of the training
    # data by RandomizedSearchCV) and record the hyperparameters that won.
    models[model_name]=search.best_estimator_
    optimized_hyperparameters[model_name]=search.best_params_

    print(f'{model_name}: best cross-validation RMSE {-search.best_score_:.3f}')

## 4. Model evaluation

In [None]:
# Your code here... Make predictions on the test set with the optimized time and calorie models.
# Then evaluate those predictions. Plotting predicted vs. true values and/or fit residuals is
# a good idea; you will probably also want to look at the RMSE between predictions and labels.

## 5. Save assets

In [None]:
# Create the output directory; no error is raised if it already exists.
output_dir = Path('../models')
output_dir.mkdir(exist_ok=True)

# Each asset is pickled to its own file under the models directory.
assets_to_save = (
    ('../models/optimized_models.pkl', models),
    ('../models/optimized_hyperparameters.pkl', optimized_hyperparameters),
)

for asset_path, asset in assets_to_save:
    with open(asset_path, 'wb') as output_file:
        pickle.dump(asset, output_file)