# Data preparation

In [None]:
from pathlib import Path
import pickle
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

## 1. Load raw data

In [None]:
# Read the raw workout tracker CSV into a DataFrame; every later step works on data_df
raw_data_path = '../data/raw/workout_fitness_tracker_data.csv'
data_df = pd.read_csv(raw_data_path)

## 2. Select features

In [None]:
# Print the column names, dtypes and non-null counts so we can see which
# columns are available as candidate features before selecting any.
data_df.info()

Given some self-reported biometrics from the user, we will build two models that do the following, depending on user input:

1. Take workout duration and predict calorie burn for each workout type (i.e., 'I am going to work out for X minutes, how many calories will I burn?').
2. Take calorie burn and predict duration for each workout type (i.e., 'I want to burn X calories, how long do I need to work out for?').

Both models will share a set of input 'biometric' features. These must be things the user is likely to know about themselves. People probably know their weight and age, but probably don't know their VO2 max.

The models will differ in one of their input features and their output:

1. `calorie_model` needs to take workout duration + biometrics as input features and predict calories burned.
2. `time_model` needs to take calories burned + biometrics as input features and predict workout duration. 

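We will use a list for the shared biometric features and dictionaries keyed by model name for the model-specific input and output features, to keep everything organized: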

In [None]:
# Self-reported inputs shared by both models (order is preserved when pickled).
biometric_features = [
    'Age', 'Gender', 'Height (cm)', 'Weight (kg)',
    'Workout Type', 'Sleep Hours', 'Water Intake (liters)',
    'Resting Heart Rate (bpm)', 'Workout Intensity', 'Body Fat (%)',
    'Mood Before Workout',
]

# The one extra input column each model receives on top of the biometrics.
input_features = dict(
    calorie_model='Workout Duration (mins)',
    time_model='Calories Burned',
)

# Each model predicts the column the other model takes as input, so the
# targets are the input mapping with the two columns swapped.
output_features = dict(
    calorie_model=input_features['time_model'],
    time_model=input_features['calorie_model'],
)

## 3. Encode categorical features

In [None]:
# Columns to run through the categorical encoder. Workout duration is a numeric
# model input/target (see input_features / output_features) and its column name
# ends in '(mins)', so it does not belong in this list; 'Workout Type' is the
# remaining selected feature that is presumably text-valued.
# NOTE(review): confirm the dtypes of these columns against data_df.info() above.
categorical_features = ['Gender', 'Workout Intensity', 'Mood Before Workout', 'Workout Type']

# Your code here... I recommend sklearn's OrdinalEncoder to start with because it will let us keep the 
# number/names of features constant. We can experiment with improving encoding later if we have time.

## 4. Train-test split

In [None]:
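# Your code here.... This cell must define `train_df` and `test_df`: the
# 'Save assets' cell below pickles both and raises a NameError if they are missing.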

## 5. Save assets

In [None]:
def _dump_pickle(obj, path):
    """Pickle obj to path, overwriting any existing file."""
    with open(path, 'wb') as output_file:
        pickle.dump(obj, output_file)


# Feature information: one pickle per feature definition, written next to the data
_dump_pickle(biometric_features, '../data/biometric_features.pkl')
_dump_pickle(input_features, '../data/input_features.pkl')
_dump_pickle(output_features, '../data/output_features.pkl')
_dump_pickle(categorical_features, '../data/categorical_features.pkl')

# Data: make sure the output directory exists before writing into it
Path('../data/processed').mkdir(exist_ok=True)

_dump_pickle(data_df, '../data/processed/all.pkl')
_dump_pickle(train_df, '../data/processed/train.pkl')
_dump_pickle(test_df, '../data/processed/test.pkl')