# SmartFit AI: Intelligent Fitness & Nutrition Analysis

In [1]:
import pandas as pd

In [2]:
# Load the raw dataset into `df`; the preprocessing cell further down reads from it.
# NOTE(review): the "/content/" prefix looks like a Google Colab path — adjust when running elsewhere.
df = pd.read_csv("/content/Final_data.csv")

In [8]:
"""
01_data_cleaning_and_feature_engineering.py

Purpose:
 - Clean the dataset that is already loaded as `df`
 - Normalize column names, cast types, impute missing values, cap outliers,
   and engineer derived features
 - Return a cleaned DataFrame ready for EDA or modeling
"""

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# -----------------------------
# Cleaning and preprocessing
# -----------------------------

def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with lowercase, underscore-separated column names.

    Each label is stripped of surrounding whitespace and lowercased; spaces
    become underscores, parentheses are removed, and ``%`` is spelled ``pct``.
    The input frame is not modified.
    """
    # One translation table performs all four character substitutions in a single pass.
    substitutions = str.maketrans({" ": "_", "(": None, ")": None, "%": "pct"})
    renamed = df.copy()
    renamed.columns = [
        label.strip().lower().translate(substitutions) for label in renamed.columns
    ]
    return renamed


def basic_type_casting(df: pd.DataFrame) -> pd.DataFrame:
    """Convert object-dtype columns to numeric or datetime where the data allows.

    For every object-dtype column:

    1. If the column holds at least one string (or is entirely null), commas
       are removed from the string values and the column is passed to
       ``pd.to_numeric``. When every value converts, the numeric column is
       kept and the datetime step is skipped.
    2. Otherwise, if the column name contains ``date``, ``time``, ``year`` or
       ``month`` (case-insensitive), the column is parsed with
       ``pd.to_datetime(errors="coerce")``. The parsed column is kept only
       when at least one value parsed (or the column was entirely null), so a
       free-text column such as ``workout_time`` is not wiped out to ``NaT``.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A copy of ``df`` with the convertible columns cast.
    """
    date_hints = ("date", "time", "year", "month")

    def strip_commas(value):
        # Only strings carry thousands separators; leave every other value untouched.
        return value.replace(",", "") if isinstance(value, str) else value

    result = df.copy()
    for col in result.columns:
        series = result[col]
        if series.dtype != object:
            continue

        # Attempt numeric conversion only when the column actually holds text
        # (or nothing at all), so boolean/other non-text object columns are
        # left as they are.
        holds_text = any(isinstance(value, str) for value in series)
        if holds_text or series.isna().all():
            try:
                # ``Series.str.replace`` would turn the non-string entries of a
                # mixed column into NaN; mapping element-wise keeps them.
                result[col] = pd.to_numeric(series.map(strip_commas), errors="raise")
                continue
            except (ValueError, TypeError, OverflowError):
                pass  # not a numeric column; fall through to the datetime attempt

        # Only try datetime if the column name hints it is a date or time.
        if not any(hint in str(col).lower() for hint in date_hints):
            continue
        try:
            parsed = pd.to_datetime(series, errors="coerce")
        except (ValueError, TypeError, OverflowError):
            continue
        # ``errors="coerce"`` maps every unparseable value to NaT; if nothing
        # parsed at all, the column is not a date column, so keep the original.
        if parsed.notna().any() or series.isna().all():
            result[col] = parsed
    return result



def handle_missing_values(df: pd.DataFrame, strategy_num="median", strategy_cat="most_frequent") -> pd.DataFrame:
    """Fill missing numeric and categorical values with ``SimpleImputer``.

    Numeric columns are imputed with ``strategy_num`` and object/category
    columns with ``strategy_cat``. Columns of any other dtype (e.g. datetime)
    are left untouched.

    Columns that contain no observed value at all are skipped (unless the
    strategy is ``"constant"``): ``SimpleImputer`` discards such columns from
    its output, which would make the assignment back into the frame fail with
    a shape mismatch. A frame without rows or columns is returned as a copy.

    Args:
        df: Input frame; it is not modified.
        strategy_num: ``SimpleImputer`` strategy for numeric columns.
        strategy_cat: ``SimpleImputer`` strategy for object/category columns.

    Returns:
        A copy of ``df`` with the missing values filled.
    """
    df = df.copy()
    if df.empty:
        # Nothing to fit an imputer on.
        return df

    column_groups = (
        (df.select_dtypes(include=[np.number]).columns, strategy_num),
        (df.select_dtypes(include=["object", "category"]).columns, strategy_cat),
    )
    for cols, strategy in column_groups:
        targets = list(cols)
        if strategy != "constant":
            # No statistic can be computed for an all-missing column and the
            # imputer would drop it, so leave such columns as they are.
            targets = [c for c in targets if df[c].notna().any()]
        if targets:
            df[targets] = SimpleImputer(strategy=strategy).fit_transform(df[targets])

    return df


def cap_outliers_iqr(df: pd.DataFrame, factor=1.5) -> pd.DataFrame:
    """Cap outliers in every numeric column using the IQR rule.

    Values below ``Q1 - factor * IQR`` or above ``Q3 + factor * IQR`` are
    clipped to those bounds. Columns whose IQR is zero or undefined (constant
    columns, sparse 0/1 flags, all-NaN columns) are left unchanged: with a
    zero IQR both bounds collapse onto the same value and clipping would
    flatten the whole column to a constant.

    Args:
        df: Input frame; it is not modified.
        factor: Multiplier applied to the IQR to obtain the clipping bounds.

    Returns:
        A copy of ``df`` with the numeric columns clipped.
    """
    capped = df.copy()
    for col in capped.select_dtypes(include=[np.number]).columns:
        q1, q3 = capped[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        # ``not iqr > 0`` is also true when the IQR is NaN.
        if not iqr > 0:
            continue
        capped[col] = capped[col].clip(q1 - factor * iqr, q3 + factor * iqr)
    return capped


# -----------------------------
# Feature engineering
# -----------------------------

def _safe_ratio(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
    """Divide element-wise, yielding 0 where the denominator is 0 or the result is NaN."""
    return (numerator / denominator.replace(0, np.nan)).fillna(0)


def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Add derived features for whichever source columns are present.

    Features (each is added only when its source columns exist):

    - ``calorie_balance``: ``calories - calories_burned``
    - ``workout_efficiency``: ``calories_burned / session_duration_hours``
    - ``pct_carbs`` / ``pct_proteins`` / ``pct_fats``: share of each macro in
      ``carbs + proteins + fats``
    - ``protein_per_kg``: ``proteins / weight_kg``
    - ``bmi``: ``weight_kg / height_m ** 2``, only when no ``bmi`` column exists

    The ratio features are 0 where the denominator is 0. ``bmi`` is NaN where
    ``height_m`` is 0, instead of the ``inf`` a plain division would produce.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A copy of ``df`` with the derived columns appended.
    """
    df = df.copy()
    available = set(df.columns)

    # calorie balance
    if {"calories", "calories_burned"} <= available:
        df["calorie_balance"] = df["calories"] - df["calories_burned"]

    # workout efficiency
    if {"calories_burned", "session_duration_hours"} <= available:
        df["workout_efficiency"] = _safe_ratio(df["calories_burned"], df["session_duration_hours"])

    # macro percentages
    macros = ["carbs", "proteins", "fats"]
    if set(macros) <= available:
        total = df[macros].sum(axis=1)
        for macro in macros:
            df[f"pct_{macro}"] = _safe_ratio(df[macro], total)

    # protein per kg
    if {"proteins", "weight_kg"} <= available:
        df["protein_per_kg"] = _safe_ratio(df["proteins"], df["weight_kg"])

    # bmi
    if "bmi" not in available and {"weight_kg", "height_m"} <= available:
        # A zero height would give inf; mark those rows as missing instead.
        height = df["height_m"].replace(0, np.nan)
        df["bmi"] = df["weight_kg"] / (height ** 2)

    return df


# -----------------------------
# Master cleaning pipeline
# -----------------------------

def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Run the complete cleaning and feature-engineering pipeline on ``df``.

    The stages run in a fixed order, each receiving the previous stage's
    output: column-name normalization, type casting, missing-value
    imputation, IQR outlier capping, and finally feature engineering.
    """
    stages = (
        clean_column_names,
        basic_type_casting,
        handle_missing_values,
        cap_outliers_iqr,
        feature_engineering,
    )
    result = df
    for stage in stages:
        result = stage(result)
    return result


In [9]:
# Run the full cleaning + feature-engineering pipeline on the `df` loaded earlier.
# The first pipeline step works on a copy, so `df` itself keeps the raw data.
df_cleaned = preprocess_data(df)


In [11]:
# Write the cleaned data to "cleaned_file.csv" (relative path); index=False leaves the row index out of the file.
df_cleaned.to_csv("cleaned_file.csv", index = False)