In [1]:
import pandas as pd

# Read the gym dataset from its relative location and preview the first rows.
gym_csv_path = '../data/external/gym.csv'
df = pd.read_csv(gym_csv_path)
df.head()

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI
0,56,Male,88.3,1.71,180,157,60,1.69,1313.0,Yoga,12.6,3.5,4,3,30.2
1,46,Female,74.9,1.53,179,151,66,1.3,883.0,HIIT,33.9,2.1,4,2,32.0
2,32,Female,68.1,1.66,167,122,54,1.11,677.0,Cardio,33.4,2.3,4,2,24.71
3,25,Male,53.2,1.7,190,164,56,0.59,532.0,Strength,28.8,2.1,3,1,18.41
4,38,Male,46.1,1.79,188,158,68,0.64,556.0,Strength,29.2,2.8,3,1,14.39


In [6]:
import pandas as pd

def get_skewed_columns(df, threshold=0.5):
    """
    Select the numeric columns whose skewness magnitude exceeds a cutoff.

    Parameters:
    - df: pandas DataFrame; only columns with a numeric dtype are examined
    - threshold: float (default=0.5), a column is kept when the absolute
      value of its skewness is strictly greater than this number

    Returns:
    - pandas Series indexed by column name that holds the skewness of each
      selected column, ordered from the largest to the smallest signed value
    """
    # Skewness is computed per column on the numeric subset only.
    skew_by_column = df.select_dtypes(include=['number']).skew()
    ranked = skew_by_column.sort_values(ascending=False)

    # The comparison uses the magnitude, so strong negative skew is kept too.
    exceeds_cutoff = ranked.abs() > threshold
    return ranked[exceeds_cutoff]


In [7]:
# Collect the numeric columns of df whose absolute skewness exceeds the
# function's default threshold.
skewed = get_skewed_columns(df)
# Bare last expression so the notebook renders the resulting Series.
skewed

Weight (kg)       0.772384
BMI               0.763648
Fat_Percentage   -0.635225
dtype: float64

In [8]:
import pandas as pd
import cloudpickle
import numpy as np

# Step 1: Describe one sample record, then wrap each value in a list so the
# DataFrame constructor receives one single-row column per field.
sample_record = {
    "Age": 28,
    "Gender": "Female",
    "Weight (kg)": 300.3,
    "Height (m)": 1.68,
    "Max_BPM": 205,
    "Avg_BPM": 172,
    "Resting_BPM": 73,
    "Session_Duration (hours)": 1.65,
    "Calories_Burned": 1425.0,
    "Workout_Type": "Cardio",
    "Fat_Percentage": 18.2,
    "Water_Intake (liters)": 3.1,
    "Workout_Frequency (days/week)": 4,
    "Experience_Level": 2,
    "BMI": 22.78,
}
data = {column: [value] for column, value in sample_record.items()}
df = pd.DataFrame(data)

# Step 2: Load the preprocessing pipeline and transformation pipeline separately
#
# SECURITY NOTE: cloudpickle.load executes code stored in the file. Only load
# pipeline artifacts that you produced yourself or otherwise trust.
PIPELINE_DIR = "../models/pipelines/preprocessing_gym"
WORKOUT_TYPES = ["Strength", "Yoga", "HIIT", "Cardio"]


def _load_pipeline(path):
    """Open ``path`` in binary mode and return the object cloudpickle loads from it."""
    with open(path, "rb") as f:
        return cloudpickle.load(f)


def _with_manual_dummies(frame):
    """
    Return a copy of ``frame`` with hand-built 0/1 indicator columns added.

    ``Gender_Male`` is added when a ``Gender`` column exists, and one
    ``Workout_Type_<name>`` column per entry of ``WORKOUT_TYPES`` is added
    when a ``Workout_Type`` column exists. The input frame is not modified.
    """
    encoded = frame.copy()

    if "Gender" in encoded.columns:
        encoded["Gender_Male"] = np.where(encoded["Gender"] == "Male", 1, 0)

    if "Workout_Type" in encoded.columns:
        for wtype in WORKOUT_TYPES:
            encoded[f"Workout_Type_{wtype}"] = np.where(encoded["Workout_Type"] == wtype, 1, 0)

    return encoded


def preprocess_new_data(new_data, pipeline_path):
    """
    Preprocess new data using a saved processor pipeline.

    The data is copied and given manual indicator columns first. If the loaded
    object exposes a 'transformation' entry in ``named_steps``, only that step
    is applied; when that raises, the error is printed and the full pipeline
    is tried instead.

    Parameters:
    -----------
    new_data : pd.DataFrame
        New data to preprocess (left unmodified)
    pipeline_path : str
        Path to the saved processor.pkl file

    Returns:
    --------
    pd.DataFrame or whatever the pipeline's ``transform`` returns
        ndarray results of the 'transformation' step are wrapped in a
        DataFrame; any other result is returned as-is.

    Raises:
    -------
    Exception
        Re-raises whatever the full pipeline's ``transform`` raises.
    """
    processor = _load_pipeline(pipeline_path)

    # Work on an encoded copy so the caller's frame stays untouched
    data_copy = _with_manual_dummies(new_data)

    # Try just the transformation part of the pipeline when it exists
    if hasattr(processor, 'named_steps') and 'transformation' in processor.named_steps:
        transformation = processor.named_steps['transformation']
        try:
            transformed_data = transformation.transform(data_copy)
            if isinstance(transformed_data, np.ndarray):
                # Convert back to DataFrame with feature names if possible
                if hasattr(transformation, 'get_feature_names_out'):
                    feature_names = transformation.get_feature_names_out()
                    return pd.DataFrame(transformed_data, columns=feature_names)
                # Otherwise fall back to generic feature names
                return pd.DataFrame(
                    transformed_data,
                    columns=[f"feature_{i}" for i in range(transformed_data.shape[1])],
                )
            return transformed_data
        except Exception as trans_err:
            print(f"Error applying transformation: {str(trans_err)}")

    # Fallback approach: try the full pipeline
    try:
        return processor.transform(data_copy)
    except Exception as pipeline_err:
        print(f"Error applying processor pipeline: {str(pipeline_err)}")
        raise


try:
    # First load the preprocessing pipeline and apply it
    preprocessing_pipeline = _load_pipeline(f"{PIPELINE_DIR}/preprocessing.pkl")
    preprocessed_df = preprocessing_pipeline.transform(df)

    # If preprocessing worked, load the transformation pipeline and apply it
    transformation_pipeline = _load_pipeline(f"{PIPELINE_DIR}/transformation.pkl")
    transformed_df = transformation_pipeline.transform(preprocessed_df)

    print("Successfully applied both pipelines separately!")
    print(transformed_df)

# Each handler below binds its own name: a nested `except ... as e` would
# unbind the outer handler's `e` when the inner handler finishes.
except Exception as separate_err:
    print(f"Error applying pipelines separately: {str(separate_err)}")
    print("Trying alternative approach...")

    # Alternative approach: Inspect the processor pipeline
    processor_pipeline = _load_pipeline(f"{PIPELINE_DIR}/processor.pkl")

    # Print pipeline steps for debugging
    print("Processor pipeline steps:")
    for name, step in processor_pipeline.named_steps.items():
        print(f"- {name}: {type(step).__name__}")
        if hasattr(step, "transformers"):
            print("  Transformers:")
            for transformer_name, transformer, columns in step.transformers:
                print(f"    - {transformer_name}: {type(transformer).__name__} (columns: {columns})")

    # These are the columns mentioned in the error
    required_columns = ['Workout_Type_Strength', 'Workout_Type_Yoga', 'Gender_Male', 'Workout_Type_HIIT']

    try:
        # Encode on a copy so the notebook-level `df` keeps its raw columns
        encoded_df = _with_manual_dummies(df)

        # Report any column named in the error that manual encoding did not create
        absent_columns = [col for col in required_columns if col not in encoded_df.columns]
        if absent_columns:
            print(f"Columns still missing after manual encoding: {absent_columns}")

        # Now try to apply the processor pipeline
        transformed_df = processor_pipeline.transform(encoded_df)
        print("\nSuccessfully applied processor pipeline with manual column creation!")
        print(transformed_df)

    except Exception as manual_err:
        print(f"\nError with manual approach: {str(manual_err)}")
        print("\nCreating a proper implementation...")

        # Apply our custom function
        try:
            result = preprocess_new_data(df, f"{PIPELINE_DIR}/processor.pkl")
            print("\nSuccessfully applied custom preprocessing function!")
            print(result)
        except Exception as final_err:
            print(f"\nFinal error: {str(final_err)}")
            print("\nRecommendation: You need to create a new custom preprocessing function that handles the specific encoding of your data.")

Error applying pipelines separately: columns are missing: {'Workout_Type_Strength', 'Workout_Type_Yoga', 'Gender_Male', 'Workout_Type_HIIT'}
Trying alternative approach...
Processor pipeline steps:
- preprocessing: PreprocessingPipeline
- transformation: Pipeline

Successfully applied processor pipeline with manual column creation!
[[ 1.99146084 -1.15173459 -0.87477173 -0.34986956  2.17707035  1.97814263
   1.46190501  1.17024991  1.96400236  0.76560624  0.7420429   0.2611364
  22.78        0.          0.          0.          0.        ]]


In [16]:
import pandas as pd
import cloudpickle

# Step 1: Describe one sample record, then wrap each value in a list so the
# DataFrame constructor receives one single-row column per field.
sample_record = {
    "Age": 28,
    "Gender": "Female",
    "Weight (kg)": 64.3,
    "Height (m)": 1.68,
    "Max_BPM": 205,
    "Avg_BPM": 172,
    "Resting_BPM": 73,
    "Session_Duration (hours)": 1.65,
    "Calories_Burned": 1425.0,
    "Workout_Type": "Cardio",
    "Fat_Percentage": 18.2,
    "Water_Intake (liters)": 3.1,
    "Workout_Frequency (days/week)": 4,
    "Experience_Level": 2,
    "BMI": 22.78,
}
data = {column: [value] for column, value in sample_record.items()}
df = pd.DataFrame(data)

# Step 2: Load the saved processor pipeline from disk
processor_path = "../models/pipelines/preprocessing_gym/processor.pkl"
with open(processor_path, "rb") as f:
    preprocessing_pipeline = cloudpickle.load(f)

# Step 3: Run the sample row through the loaded pipeline
preprocessed_data = preprocessing_pipeline.transform(df)

# Step 4: Print the final transformed data
print(preprocessed_data)


[✓] 2025-05-16 02:03:49 | Data Preprocessing | INFO  | Transforming data with preprocessing pipeline
[✓] 2025-05-16 02:03:49 | Data Preprocessing | INFO  | No duplicate rows found
[✓] 2025-05-16 02:03:49 | Data Preprocessing | INFO  | Transformed skewed data in Weight (kg)
[✓] 2025-05-16 02:03:49 | Data Preprocessing | INFO  | Transformed skewed data in Fat_Percentage
[✓] 2025-05-16 02:03:49 | Data Preprocessing | INFO  | Scaled numerical features in Age
[✓] 2025-05-16 02:03:49 | Data Preprocessing | INFO  | Scaled numerical features in Weight (kg)
[✓] 2025-05-16 02:03:49 | Data Preprocessing | INFO  | Scaled numerical features in Height (m)
[✓] 2025-05-16 02:03:49 | Data Preprocessing | INFO  | Scaled numerical features in Max_BPM
[✓] 2025-05-16 02:03:49 | Data Preprocessing | INFO  | Scaled numerical features in Avg_BPM
[✓] 2025-05-16 02:03:49 | Data Preprocessing | INFO  | Scaled numerical features in Resting_BPM
[✓] 2025-05-16 02:03:49 | Data Preprocessing | INFO  | Scaled numerica



            Age  Weight (kg)  Height (m)  Max_BPM   Avg_BPM  Resting_BPM  \
index                                                                      
0     -0.874772    -3.503716    -0.34987  2.17707  1.978143     1.461905   

       Session_Duration (hours)  Calories_Burned  Fat_Percentage  \
index                                                              
0                       1.17025         1.942245       -4.192956   

       Water_Intake (liters)  ...  \
index                         ...   
0                   0.765606  ...   

       Weight (kg) - Workout_Frequency (days/week)  \
index                                                
0                                        -4.245759   

       Weight (kg) - Workout_Type_HIIT  Weight (kg) - Workout_Type_Strength  \
index                                                                         
0                            -3.503716                            -3.503716   

       Weight (kg) - Workout_Type_Yoga  \
index      