# Data preparation

In [1]:
from pathlib import Path
import pickle
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

## 1. Load raw data

In [2]:
# Read the raw workout tracker CSV into a DataFrame.
data_df = pd.read_csv(
    '../data/raw/workout_fitness_tracker_data.csv',
)

## 2. Select features

In [3]:
# Summarize column names, non-null counts and dtypes before choosing features.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   User ID                   10000 non-null  int64  
 1   Age                       10000 non-null  int64  
 2   Gender                    10000 non-null  object 
 3   Height (cm)               10000 non-null  int64  
 4   Weight (kg)               10000 non-null  int64  
 5   Workout Type              10000 non-null  object 
 6   Workout Duration (mins)   10000 non-null  int64  
 7   Calories Burned           10000 non-null  int64  
 8   Heart Rate (bpm)          10000 non-null  int64  
 9   Steps Taken               10000 non-null  int64  
 10  Distance (km)             10000 non-null  float64
 11  Workout Intensity         10000 non-null  object 
 12  Sleep Hours               10000 non-null  float64
 13  Water Intake (liters)     10000 non-null  float64
 14  Daily C

Given some self-reported biometrics from the user, we will build two models that do the following, depending on user input:

1. Take workout duration and predict calorie burn for each workout type (i.e. 'I am going to work out for X minutes, how many calories will I burn?').
2. Take calorie burn and predict duration for each workout type (i.e., 'I want to burn X calories, how long do I need to work out for?').

Both models will share a set of input 'biometric' features. These must be things the user is likely to know about themselves. People probably know their weight and age, but probably don't know their VO2 max.

The models will differ in one of their input features and their output:

1. `calorie_model` needs to take workout duration + biometrics as input features and predict calories burned.
2. `time_model` needs to take calories burned + biometrics as input features and predict workout duration. 

We will use dictionaries, keyed by model name, to keep each model's input and output features organized.

In [4]:
# Self-reported inputs shared by both models.
biometric_features = [
    'Age',
    'Gender',
    'Height (cm)',
    'Weight (kg)',
    'Workout Type',
    'Sleep Hours',
    'Mood Before Workout',
]

# Per-model inputs: the model-specific feature first, then the shared biometrics.
input_features = {
    'calorie_model': ['Workout Duration (mins)', *biometric_features],
    'time_model': ['Calories Burned', *biometric_features],
}

# Per-model prediction target.
output_features = {
    'calorie_model': 'Calories Burned',
    'time_model': 'Workout Duration (mins)',
}

## 3. Encode categorical features

In [5]:
# OrdinalEncoder is used as a starting point because it keeps the number and
# names of the feature columns unchanged; other encodings can be tried later.
categorical_features=['Gender', 'Workout Type', 'Workout Intensity', 'Mood Before Workout']

# Build one category list per categorical column, derived from
# `categorical_features` itself so the two can never drift out of sync or
# out of order. `unique()` returns values in order of first appearance, so
# that order determines each category's integer code.
categories_list = [
    data_df[feature].unique().tolist() for feature in categorical_features
]

# Use the explicit category lists rather than the encoder's default ordering.
encoder = OrdinalEncoder(categories=categories_list)

# Encode into a copy so the raw dataframe stays untouched.
df_encoded = data_df.copy()
df_encoded[categorical_features] = encoder.fit_transform(data_df[categorical_features])

print(df_encoded.head())  # Check encoded values

   User ID  Age  Gender  Height (cm)  Weight (kg)  Workout Type  \
0        1   39     0.0          175           99           0.0   
1        2   36     1.0          157          112           1.0   
2        3   25     2.0          180           66           2.0   
3        4   56     0.0          154           89           0.0   
4        5   53     1.0          194           59           3.0   

   Workout Duration (mins)  Calories Burned  Heart Rate (bpm)  Steps Taken  \
0                       79              384               112         8850   
1                       73              612               168         2821   
2                       27              540               133        18898   
3                       39              672               118        14102   
4                       56              410               170        16518   

   Distance (km)  Workout Intensity  Sleep Hours  Water Intake (liters)  \
0          14.44                0.0          8.2     

## 4. Train-test split

In [6]:
# Helper that wraps sklearn's train_test_split with this notebook's defaults.
def split_data(df, test_size=0.2, random_state=42):
    """Partition ``df`` into a training frame and a held-out test frame.

    Args:
        df: Dataset to split.
        test_size: Forwarded to ``train_test_split`` (default 0.2).
        random_state: Seed forwarded to ``train_test_split`` so the split
            is repeatable (default 42).

    Returns:
        A ``(train_df, test_df)`` tuple.
    """
    train_part, test_part = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
    )
    return train_part, test_part

# Split the encoded data using split_data's default ratio and seed.
train_df, test_df = split_data(df=df_encoded)

# Sanity-check the size of each partition.
print(
    f"Train Data Shape: {train_df.shape}",
    f"Test Data Shape: {test_df.shape}",
    sep="\n",
)

Train Data Shape: (8000, 20)
Test Data Shape: (2000, 20)


## 5. Save assets

In [7]:
# Persist the feature definitions and datasets as pickles so they can be
# loaded outside this notebook.
data_dir = Path('../data')
processed_dir = data_dir / 'processed'

# parents=True so the directory tree is created even if ../data is missing.
processed_dir.mkdir(parents=True, exist_ok=True)

assets = {
    # Feature information
    data_dir / 'biometric_features.pkl': biometric_features,
    data_dir / 'input_features.pkl': input_features,
    data_dir / 'output_features.pkl': output_features,
    data_dir / 'categorical_features.pkl': categorical_features,
    # Data: save the *encoded* frame as `all.pkl` so it matches train/test,
    # which are splits of `df_encoded` (previously the raw `data_df` was saved).
    processed_dir / 'all.pkl': df_encoded,
    processed_dir / 'train.pkl': train_df,
    processed_dir / 'test.pkl': test_df,
}

# NOTE(review): the fitted `encoder` is not persisted here; anything that has
# to encode new raw input the same way would need it. Confirm whether it
# should be saved as well.
for asset_path, asset in assets.items():
    with open(asset_path, 'wb') as output_file:
        pickle.dump(asset, output_file)