# Data preprocessing

## Setup

In [None]:
!pip install -r ../requirements.txt

In [None]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import tqdm # DO NOT REMOVE

%matplotlib inline

In [None]:
# Locations used by the notebook, all derived from the current working directory:
# the project root is one level up, the dataset lives in its "data" folder.
PROJECT_PATH = Path.cwd() / '..'
DATA_PATH = PROJECT_PATH.joinpath("data")
CSV_PATH = DATA_PATH.joinpath("gym_members_exercise_tracking.csv")

In [None]:
# Read the CSV at CSV_PATH into the working DataFrame.
df = pd.read_csv(filepath_or_buffer=CSV_PATH)

## Column name normalization

In [None]:
def normalize_name(name: str) -> str:
    """
    Convert a column name to lowercase with underscores instead of spaces,
    dropping everything from the first "(" onwards (e.g. a unit suffix).

    Args:
        name (str): column name to normalize

    Returns:
        str: normalized name
    """

    # Keep only the text before the first opening parenthesis.
    base, _, _ = name.lower().partition("(")
    # Trim surrounding whitespace, then turn each remaining space into "_".
    words = base.strip().split(" ")
    return "_".join(words)


# Run every column header through normalize_name, then show the new headers.
normalized_columns = df.columns.map(normalize_name)
df.columns = normalized_columns
df.columns

## Data Cleaning

## Outlier Handling

### Water_Intake and Fat_Percentage

In [None]:
# Scatter plot of water intake against fat percentage, coloured by gender,
# drawn from the unfiltered data.
# The column names were rewritten by normalize_name earlier in the notebook,
# so the normalized names ("water_intake", "fat_percentage", "gender") are used.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x="water_intake", y="fat_percentage", hue="gender", palette="deep", s=50)
plt.title("Water Intake vs Fat Percentage")
plt.xlabel("Water Intake (liters)")
plt.ylabel("Fat Percentage")
plt.show()

In [None]:
# Drop the rows this notebook treats as outliers:
#   - Gender is Male and Fat Percentage <= 15
#   - Gender is Female and Fat Percentage <= 20
# The column names were rewritten by normalize_name earlier in the notebook, so
# the lookups use "gender" / "fat_percentage"; the cell values ("Male",
# "Female") were not changed by that step.
is_low_fat_male = (df["gender"] == "Male") & (df["fat_percentage"] <= 15)
is_low_fat_female = (df["gender"] == "Female") & (df["fat_percentage"] <= 20)
df_filtered = df[~(is_low_fat_male | is_low_fat_female)]

In [None]:
# Same scatter plot as before, drawn from df_filtered to show the effect of
# the outlier removal.
# The column names were rewritten by normalize_name earlier in the notebook,
# so the normalized names ("water_intake", "fat_percentage", "gender") are used.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df_filtered, x="water_intake", y="fat_percentage", hue="gender", palette="deep", s=50)
plt.title("Water Intake vs Fat Percentage")
plt.xlabel("Water Intake (liters)")
plt.ylabel("Fat Percentage")
plt.show()

## Data Transformation

In [None]:
# Scaling and normalization
from sklearn.preprocessing import StandardScaler

_scaler = StandardScaler()

# StandardScaler only accepts numeric input, and df still holds string columns
# (e.g. the "Male"/"Female" gender labels), so scale the numeric columns only.
# NOTE(review): this scales df, as the original cell did; switch to df_filtered
# if the outlier removal above is meant to apply here - confirm.
numeric_df = df.select_dtypes(include="number")

# Rebuild a DataFrame around the scaled array, reusing the column names and the
# row index of the data that was actually scaled.
scaled_df = pd.DataFrame(
    _scaler.fit_transform(numeric_df),
    columns=numeric_df.columns,
    index=numeric_df.index,
)

# Preview the first rows of the standardized data
scaled_df.head(5)