In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# Read the raw exercise CSV (path is relative to the notebook's working directory)
raw_csv_path = '../data/raw/exercise_dataset.csv'
df = pd.read_csv(raw_csv_path)

In [2]:
# --- Step 1: Clean column names ---
# Normalise the headers: trim surrounding whitespace, turn spaces and hyphens
# into underscores, then lower-case everything.
# regex=False keeps both replacements literal regardless of which default
# the installed pandas version uses for `regex`.
df.columns = (
    df.columns.str.strip()
    .str.replace(' ', '_', regex=False)
    .str.replace('-', '_', regex=False)
    .str.lower()
)

In [3]:
# --- Step 2: Drop unnecessary columns ---
# Remove the column named 'd' when the dataset has one; otherwise df is left as is.
unwanted = [name for name in ['d'] if name in df.columns]
if unwanted:
    df = df.drop(columns=unwanted)

In [5]:
# --- Step 3: Handle missing values ---
# Treat blank strings and the textual markers 'NA', 'N/A', 'na' as missing.
df = df.replace(['', 'NA', 'N/A', 'na'], np.nan)

# Fill missing values in numeric columns with the column mean.
# NOTE: a numeric column that is entirely missing has a NaN mean and stays NaN.
for col in df.select_dtypes(include=[np.number]).columns:
    df[col] = df[col].fillna(df[col].mean())

# Fill missing values in non-numeric columns with the most frequent value.
for col in df.select_dtypes(exclude=[np.number]).columns:
    modes = df[col].mode()
    # mode() returns an empty Series when every value is missing; indexing it
    # with [0] would raise, so such a column is left unfilled instead.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])


In [6]:
# --- Step 4: Convert data types ---
# Cast these four columns to int, one after another in this order.
for int_col in ('age', 'duration', 'heart_rate', 'exercise_intensity'):
    df[int_col] = df[int_col].astype(int)

In [7]:
# --- Step 5: Add useful derived features ---
# Calorie burn rate per minute, rounded to 2 decimals.
# A duration of 0 would make the division produce inf (or NaN for 0/0) and
# write it to the output; mask zero durations so the rate is NaN instead.
nonzero_duration = df['duration'].replace(0, np.nan)
df['calories_per_min'] = (df['calories_burn'] / nonzero_duration).round(2)

# Weight difference (dream_weight - actual_weight), rounded to 2 decimals.
# Positive means the dream weight is above the actual weight.
df['weight_diff'] = (df['dream_weight'] - df['actual_weight']).round(2)

In [8]:
# --- Step 6: Handle categorical consistency ---
# Trim surrounding whitespace before title-casing so that variants such as
# 'male ' and 'Male' collapse into the same category.
for cat_col in ('weather_conditions', 'gender'):
    df[cat_col] = df[cat_col].str.strip().str.title()

In [9]:
# --- Step 7: Save processed dataset ---
# Single definition of the output location, used for both the write and the message.
output_path = '../data/processed/cleaned_exercise_data.csv'

# to_csv does not create missing directories, so make sure the target folder exists.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

print(f"✅ Cleaned dataset saved successfully at '{output_path}'")
print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
# Last expression of the cell: show the first rows as the cell output.
df.head()

✅ Cleaned dataset saved successfully at '../data/processed/cleaned_exercise_data.csv'
Rows: 3864, Columns: 14


Unnamed: 0,id,exercise,calories_burn,dream_weight,actual_weight,age,gender,duration,heart_rate,bmi,weather_conditions,exercise_intensity,calories_per_min,weight_diff
0,1,Exercise 2,286.959851,91.892531,96.301115,45,Male,37,170,29.426275,Rainy,5,7.76,-4.41
1,2,Exercise 7,343.453036,64.165097,61.104668,25,Male,43,142,21.286346,Rainy,5,7.99,3.06
2,3,Exercise 4,261.223465,70.846224,71.766724,20,Male,20,148,27.899592,Cloudy,4,13.06,-0.92
3,4,Exercise 5,127.183858,79.477008,82.984456,33,Male,39,170,33.729552,Sunny,10,3.26,-3.51
4,5,Exercise 10,416.318374,89.960226,85.643174,29,Female,34,118,23.286113,Cloudy,3,12.24,4.32
