In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

# Background
---
This project is a practice of an Life Style Dataset and in this notebook I'll do an exploratory analysis of the data and answer the following questions:
1. Which lifestyle factors show the strongest correlation with overall health or happiness?
2. How do age and gender influence health-related behaviors and outcomes?
3. Are there any significant differences in stress levels among people with different activity levels or diets?
4. Can we detect clusters of individuals with similar lifestyle patterns?
5. What are the top 3 actionable recommendations you can give based on the data insights?

Also I would like to include the following KPIs here and in a Power BI file on the "Dashboards" folder:
1. Average Sleep Hours per Person per Week
2. Average Exercise Frequency (per week)
3. Average BMI or Health Score grouped by Lifestyle Type
4. Percentage of Respondents with Healthy Habits 
5. Happiness-to-Stress Ratio

# Initial SetUp and Data cleaning
Load and inspect data to know the headers, data size, data types and how many nulls has the Dataset

---
## Conclusions
1. There is any null or empty field
2. The data appears to be integrated and clasified even on categorical fields such as Workout_type or Gender
3. The data info is based on people between 18 and 59 years old, and 75% of the people are under the 49 years old
4. The most important categorical fields are: Gender, Workout_Type, meal_name, meal_type, diet_type, cooking_method, Name of Exersice, Difficulty Level, Burns_calories_Bin

In [4]:
#Loading the dataset
df_raw = pd.read_csv('../data/raw/Final_data.csv')

In [6]:
#Looking for the count of the dataset
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 54 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Age                             20000 non-null  float64
 1   Gender                          20000 non-null  object 
 2   Weight (kg)                     20000 non-null  float64
 3   Height (m)                      20000 non-null  float64
 4   Max_BPM                         20000 non-null  float64
 5   Avg_BPM                         20000 non-null  float64
 6   Resting_BPM                     20000 non-null  float64
 7   Session_Duration (hours)        20000 non-null  float64
 8   Calories_Burned                 20000 non-null  float64
 9   Workout_Type                    20000 non-null  object 
 10  Fat_Percentage                  20000 non-null  float64
 11  Water_Intake (liters)           20000 non-null  float64
 12  Workout_Frequency (days/week)   

In [7]:
#Look for the dataset stadistic info
df_raw.describe()

Unnamed: 0,Age,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Fat_Percentage,Water_Intake (liters),...,BMI_calc,cal_from_macros,pct_carbs,protein_per_kg,pct_HRR,pct_maxHR,cal_balance,lean_mass_kg,expected_burn,Burns Calories (per 30 min)_bc
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,...,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,38.851453,73.898832,1.723093,179.889702,143.704306,62.195813,1.259446,1280.1096,26.101313,2.627485,...,24.921651,1998.297076,0.499983,1.460142,0.699005,0.802305,744.308699,53.786384,866.352318,8.631802e+19
std,12.11458,21.17301,0.127033,11.510805,14.267688,7.289351,0.341336,502.228982,4.99616,0.604724,...,6.701144,440.848408,0.001455,0.518946,0.14488,0.096613,720.946619,12.49874,250.317069,3.197579e+19
min,18.0,39.18,1.49,159.31,119.07,49.49,0.49,323.11,11.333134,1.46,...,12.037907,1105.57,0.492434,0.516706,0.371344,0.599789,-1266.22,30.946261,219.8528,2.491905e+16
25%,28.17,58.16,1.62,170.0575,131.22,55.96,1.05,910.8,22.387807,2.17,...,20.094975,1661.0225,0.499054,1.076294,0.583656,0.727676,261.4325,44.587037,714.09825,6.441978e+19
50%,39.865,70.0,1.71,180.14,142.99,62.2,1.27,1231.45,25.822504,2.61,...,24.119097,1943.13,0.499981,1.38226,0.686284,0.794834,691.19,51.204908,868.7214,8.371578e+19
75%,49.63,86.1,1.8,189.425,156.06,68.09,1.46,1553.1125,29.676026,3.12,...,28.56262,2271.95,0.50091,1.750495,0.798196,0.869211,1176.29,61.939016,1012.5327,1.100442e+20
max,59.67,130.77,2.01,199.64,169.84,74.5,2.02,2890.82,35.0,3.73,...,50.229544,3699.54,0.507889,3.916881,1.073939,1.047032,3075.58,90.117371,1477.1088,1.756614e+20


In [11]:
#Looking all the categorical fields identified by the "object" dtype
df_raw.select_dtypes(include=['object'])

Unnamed: 0,Gender,Workout_Type,meal_name,meal_type,diet_type,cooking_method,Name of Exercise,Benefit,Target Muscle Group,Equipment Needed,Difficulty Level,Body Part,Type of Muscle,Workout,Burns_Calories_Bin
0,Male,Strength,Other,Lunch,Vegan,Grilled,Decline Push-ups,Improves shoulder health and posture,"Shoulders, Triceps",Cable Machine,Advanced,Legs,Lats,Dumbbell flyes,Medium
1,Female,HIIT,Other,Lunch,Vegetarian,Fried,Bear Crawls,Strengthens lower abs,"Back, Core, Shoulders",Step or Box,Intermediate,Chest,Lats,Lateral raises,High
2,Female,Cardio,Other,Breakfast,Paleo,Boiled,Dips,Builds chest strength,"Quadriceps, Glutes",Step or Box,Intermediate,Arms,Grip Strength,Standing calf raises,High
3,Female,HIIT,Other,Lunch,Paleo,Fried,Mountain Climbers,Improves coordination and cardiovascular health,"Biceps, Forearms",Parallel Bars or Chair,Advanced,Shoulders,Upper,Incline dumbbell flyes,High
4,Male,Strength,Other,Breakfast,Vegan,Baked,Bicep Curls,Targets obliques and improves core rotation,"Chest, Triceps",Wall,Advanced,Abs,Wrist Flexors,Military press,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,Female,Strength,Other,Breakfast,Low-Carb,Boiled,Frog Jumps,Improves cardiovascular fitness,"Quadriceps, Hamstrings, Glutes",Box or Platform,Advanced,Arms,Anterior,Triceps pushdowns,Medium
19996,Female,Strength,Other,Snack,Vegetarian,Boiled,Pull-ups,Strengthens back and legs,"Core, Shoulders, Hips",Dumbbells or Barbell,Intermediate,Arms,Lats,Seated calf raises,Low
19997,Male,Strength,Other,Snack,Keto,Grilled,Dips,Builds shoulder width,"Upper Back, Rear Deltoids",Barbell,Beginner,Forearms,Wrist Extensors,Russian twists,High
19998,Male,Yoga,Other,Snack,Paleo,Roasted,Plyo Squats,Builds calf muscles,"Rear Deltoids, Upper Back",Pull-up Bar,Intermediate,Shoulders,Anterior,Leg curls,Low


In [37]:
#I look for the "meal name" unique values to know if it is worth to consider for categorical values encoding 
df_raw['meal_name'].unique()

array(['Other'], dtype=object)

In [40]:
df_raw['Name of Exercise'].nunique()

55

In [41]:
# Encoding categorical Values 
# Nominal: Gender, Workout_type, meal_type, diet_type, cooking_method, Name of Exercise
# Ordinal: Difficulty Level, Burns_Calories_Bin

nominal_cols = ['Gender', 'Workout_Type', 'meal_type', 'diet_type', 'cooking_method']
ordinal_difficulty = ['Difficulty Level']
ordinal_burns = ['Burns_Calories_Bin']

order_difficulty = [['Beginner', 'Intermediate', 'Advanced']]
order_burns_cal = [['Low', 'Medium', 'High', 'Very High']]

preprocessor = ColumnTransformer(
    transformers=[
        ('nominal', OneHotEncoder(drop='first'), nominal_cols),
        ('ordinal_difficulty', OrdinalEncoder(categories=order_difficulty), ordinal_difficulty),
        ('ordinal_burns', OrdinalEncoder(categories=order_burns_cal), ordinal_burns)
    ],
    remainder='passthrough'
)

df_encoded = pd.DataFrame(
    preprocessor.fit_transform(df_raw),
    columns=preprocessor.get_feature_names_out()
)
df_encoded.columns

Index(['nominal__Gender_Male', 'nominal__Workout_Type_HIIT',
       'nominal__Workout_Type_Strength', 'nominal__Workout_Type_Yoga',
       'nominal__meal_type_Dinner', 'nominal__meal_type_Lunch',
       'nominal__meal_type_Snack', 'nominal__diet_type_Keto',
       'nominal__diet_type_Low-Carb', 'nominal__diet_type_Paleo',
       'nominal__diet_type_Vegan', 'nominal__diet_type_Vegetarian',
       'nominal__cooking_method_Boiled', 'nominal__cooking_method_Fried',
       'nominal__cooking_method_Grilled', 'nominal__cooking_method_Raw',
       'nominal__cooking_method_Roasted', 'nominal__cooking_method_Steamed',
       'ordinal_difficulty__Difficulty Level',
       'ordinal_burns__Burns_Calories_Bin', 'remainder__Age',
       'remainder__Weight (kg)', 'remainder__Height (m)', 'remainder__Max_BPM',
       'remainder__Avg_BPM', 'remainder__Resting_BPM',
       'remainder__Session_Duration (hours)', 'remainder__Calories_Burned',
       'remainder__Fat_Percentage', 'remainder__Water_Intake (li

In [36]:
df_encoded

Unnamed: 0,nominal__Gender_Male,nominal__Workout_Type_HIIT,nominal__Workout_Type_Strength,nominal__Workout_Type_Yoga,nominal__meal_type_Dinner,nominal__meal_type_Lunch,nominal__meal_type_Snack,nominal__diet_type_Keto,nominal__diet_type_Low-Carb,nominal__diet_type_Paleo,...,remainder__BMI_calc,remainder__cal_from_macros,remainder__pct_carbs,remainder__protein_per_kg,remainder__pct_HRR,remainder__pct_maxHR,remainder__cal_balance,remainder__lean_mass_kg,remainder__expected_burn,remainder__Burns Calories (per 30 min)_bc
0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,24.870447,2139.59,0.500432,1.624789,0.741237,0.835985,725.1,47.777394,685.16,72604254381420232704.0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,23.479709,1711.65,0.50085,1.514093,0.551247,0.73427,-232.91,40.809803,978.6184,102050570044360998912.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,21.148123,1965.92,0.50061,1.663445,0.574534,0.708124,805.74,44.63558,654.5266,107960708814109687808.0
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,32.449827,1627.28,0.499533,0.862017,0.744155,0.81115,1206.21,63.007432,773.63,89879211568990568448.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,14.831372,2659.23,0.500581,2.538153,0.668405,0.789751,303.6,43.347504,711.4176,52646848199189766144.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,27.232687,1882.8,0.5,0.954837,0.623413,0.743876,1865.39,71.269345,533.2558,79244019024727015424.0
19996,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,25.199462,1205.47,0.496968,0.68906,0.56381,0.683964,173.87,65.049689,1310.6016,57084744375619584000.0
19997,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,16.565671,1947.79,0.50028,2.105844,0.946701,0.966818,-43.8,35.420708,957.9568,91012849874417762304.0
19998,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,16.880049,1921.51,0.49994,2.190745,0.509756,0.676243,346.25,35.88926,928.4004,52464364387870105600.0
