In [1]:
import pandas as pd

# Load the gym dataset from a CSV path that is relative to the notebook's
# working directory, then preview the first five rows.
df = pd.read_csv('../data/external/gym.csv')
df.head()

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI
0,56,Male,88.3,1.71,180,157,60,1.69,1313.0,Yoga,12.6,3.5,4,3,30.2
1,46,Female,74.9,1.53,179,151,66,1.3,883.0,HIIT,33.9,2.1,4,2,32.0
2,32,Female,68.1,1.66,167,122,54,1.11,677.0,Cardio,33.4,2.3,4,2,24.71
3,25,Male,53.2,1.7,190,164,56,0.59,532.0,Strength,28.8,2.1,3,1,18.41
4,38,Male,46.1,1.79,188,158,68,0.64,556.0,Strength,29.2,2.8,3,1,14.39


In [6]:
import pandas as pd

def get_skewed_columns(df, threshold=0.5):
    """
    Report which numeric columns of a DataFrame are skewed.

    Non-numeric columns are ignored. Skewness is computed per numeric column,
    ordered from the largest signed value to the smallest, and only entries
    whose absolute value is strictly greater than `threshold` are kept.

    Parameters:
    - df: pandas DataFrame to inspect
    - threshold: float (default=0.5), absolute skewness a column must exceed
      to be reported as skewed

    Returns:
    - pandas Series indexed by column name holding the skewness of each
      skewed column (empty when no column exceeds the threshold)
    """
    skew_by_column = df.select_dtypes(include=['number']).skew()
    ranked = skew_by_column.sort_values(ascending=False)
    # Columns whose skewness is NaN compare as False here and are dropped.
    exceeds_threshold = ranked.abs() > threshold
    return ranked[exceeds_threshold]


In [7]:
# Measure skewness on the numeric columns of `df` using the function's default
# threshold (0.5) and display the columns whose absolute skewness exceeds it.
# `df` is expected to hold the full dataset here, not a single-row frame.
skewed = get_skewed_columns(df)
skewed

Weight (kg)       0.772384
BMI               0.763648
Fat_Percentage   -0.635225
dtype: float64

In [5]:
import pandas as pd
import cloudpickle

# Step 1: Create the sample DataFrame (a single row).
# It is bound to `sample_df` rather than `df` so that the dataset loaded with
# read_csv earlier in the notebook is not replaced by this one-row frame;
# otherwise re-running the skewness cell would silently operate on one row.
# Weight (kg) is deliberately extreme (300.3). In the recorded output it comes
# back as 128.0625 while every other column is unchanged -- presumably the
# pipeline caps outliers in that column; confirm against the pipeline definition.
data = {
    "Age": [28],
    "Gender": ["Female"],
    "Weight (kg)": [300.3],
    "Height (m)": [1.68],
    "Max_BPM": [205],
    "Avg_BPM": [172],
    "Resting_BPM": [73],
    "Session_Duration (hours)": [1.65],
    "Calories_Burned": [1425.0],
    "Workout_Type": ["Cardio"],
    "Fat_Percentage": [18.2],
    "Water_Intake (liters)": [3.1],
    "Workout_Frequency (days/week)": [4],
    "Experience_Level": [2],
    "BMI": [22.78]
}
sample_df = pd.DataFrame(data)

# Step 2: Load the preprocessing pipeline
# NOTE(review): unpickling can execute arbitrary code; only load pipeline
# files that come from a trusted source.
with open("../models/pipelines/preprocessing_gym/preprocessing.pkl", "rb") as f:
    preprocessing_pipeline = cloudpickle.load(f)

# Step 3: Transform the sample
transformed_df = preprocessing_pipeline.transform(sample_df)

# Step 4: Print the transformed data
print(transformed_df)


   Age  Gender  Weight (kg)  Height (m)  Max_BPM  Avg_BPM  Resting_BPM  \
0   28  Female     128.0625        1.68      205      172           73   

   Session_Duration (hours)  Calories_Burned Workout_Type  Fat_Percentage  \
0                      1.65           1425.0       Cardio            18.2   

   Water_Intake (liters)  Workout_Frequency (days/week)  Experience_Level  \
0                    3.1                              4                 2   

     BMI  
0  22.78  
