In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Входной файл после препроцессинга (без скейлинга)
DATA_PATH = "data/preprocessed_without_scaling.csv"

# Выходные файлы с фичами (после скейлинга)
WORKOUT_OUT = "data/workout_features_scaled.csv"
MEAL_OUT    = "data/meal_features_scaled.csv"



In [2]:
df = pd.read_csv(DATA_PATH)
print(df.shape)
df.head()


(19823, 36)


Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,...,Difficulty Level,Body Part,Type of Muscle,Workout,sodium_g,cholesterol_g,Carbs,Proteins,Fats,meal_name
0,35,Male,65.27,1.62,188.58,157.65,69.05,1.0,1080.9,Strength,...,3,Legs,Lats,Dumbbell flyes,1.72994,0.28505,267.68,106.05,71.63,Grilled Vegan Lunch
1,23,Female,56.41,1.55,179.43,131.75,73.18,1.37,1809.91,HIIT,...,2,Chest,Lats,Lateral raises,0.69308,0.30061,214.32,85.41,56.97,Fried Vegetarian Lunch
2,33,Female,58.98,1.67,175.04,123.95,54.96,0.91,802.26,Cardio,...,2,Arms,Grip Strength,Standing calf raises,2.14248,0.21542,246.04,98.11,65.48,Boiled Paleo Breakfast
3,39,Female,93.78,1.7,191.21,155.1,50.07,1.1,1450.79,HIIT,...,3,Shoulders,Upper,Incline dumbbell flyes,0.1232,0.0097,203.22,80.84,54.56,Fried Paleo Lunch
4,45,Male,52.42,1.88,193.58,152.88,70.84,1.08,1166.4,Strength,...,3,Abs,Wrist Flexors,Military press,1.93511,0.11689,332.79,133.05,88.43,Baked Vegan Breakfast


In [3]:
df.columns

Index(['Age', 'Gender', 'Weight (kg)', 'Height (m)', 'Max_BPM', 'Avg_BPM',
       'Resting_BPM', 'Session_Duration (hours)', 'Calories_Burned',
       'Workout_Type', 'Water_Intake (liters)',
       'Workout_Frequency (days/week)', 'Experience_Level', 'BMI',
       'Daily meals frequency', 'Calories', 'sugar_g', 'serving_size_g',
       'rating', 'Name of Exercise', 'Sets', 'Reps', 'Benefit',
       'Burns Calories (per 30 min)', 'Target Muscle Group',
       'Equipment Needed', 'Difficulty Level', 'Body Part', 'Type of Muscle',
       'Workout', 'sodium_g', 'cholesterol_g', 'Carbs', 'Proteins', 'Fats',
       'meal_name'],
      dtype='object')

BMR, PAL, TDEE, CaloriesPerDay, BMI

In [4]:
def add_user_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Добавляем базовые 'user-level' фичи:
    BMR, PAL, TDEE, BMI (если вдруг не было) и примерную CaloriesPerDay.
    """
    out = df.copy()

    weight = out["Weight (kg)"]
    height_cm = out["Height (m)"] * 100
    age = out["Age"]

    # пол: считаем, что 'male'/'female'
    is_male = out["Gender"].str.lower().str.startswith("m")

    bmr_male   = 10 * weight + 6.25 * height_cm - 5 * age + 5
    bmr_female = 10 * weight + 6.25 * height_cm - 5 * age - 161

    out["BMR"] = np.where(is_male, bmr_male, bmr_female)

    # PAL = 1.2 + 0.175 * Workout_Frequency (days/week)
    out["PAL"] = 1.2 + 0.175 * out["Workout_Frequency (days/week)"]

    # TDEE = BMR * PAL
    out["TDEE"] = out["BMR"] * out["PAL"]

    # Если вдруг BMI не был посчитан
    if "BMI" not in out.columns:
        out["BMI"] = out["Weight (kg)"] / (out["Height (m)"] ** 2)

    # Примерная CaloriesPerDay (если нет отдельного столбца):
    # предполагаем: калории_на_приём * кол-во приёмов
    if "CaloriesPerDay" not in out.columns:
        out["CaloriesPerDay"] = out["Calories"] * out["Daily meals frequency"]

    return out


df = add_user_features(df)
df.head()



Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,...,sodium_g,cholesterol_g,Carbs,Proteins,Fats,meal_name,BMR,PAL,TDEE,CaloriesPerDay
0,35,Male,65.27,1.62,188.58,157.65,69.05,1.0,1080.9,Strength,...,1.72994,0.28505,267.68,106.05,71.63,Grilled Vegan Lunch,1495.2,1.9,2840.88,5418.0
1,23,Female,56.41,1.55,179.43,131.75,73.18,1.37,1809.91,HIIT,...,0.69308,0.30061,214.32,85.41,56.97,Fried Vegetarian Lunch,1256.85,1.9,2388.015,4731.0
2,33,Female,58.98,1.67,175.04,123.95,54.96,0.91,802.26,Cardio,...,2.14248,0.21542,246.04,98.11,65.48,Boiled Paleo Breakfast,1307.55,1.725,2255.52375,3216.0
3,39,Female,93.78,1.7,191.21,155.1,50.07,1.1,1450.79,HIIT,...,0.1232,0.0097,203.22,80.84,54.56,Fried Paleo Lunch,1644.3,1.9,3124.17,7971.0
4,45,Male,52.42,1.88,193.58,152.88,70.84,1.08,1166.4,Strength,...,1.93511,0.11689,332.79,133.05,88.43,Baked Vegan Breakfast,1479.2,1.9,2810.48,4410.0


Workout feature engineering (E, I, S, D, R)

In [5]:
def add_workout_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Строим фичи для тренировок:
    E - Energy Consumption
    I - Intensity
    S - Power component
    D - Duration (нормированная)
    R - Risk (1 - penalties)
    """
    out = df.copy()

    # --- Difficulty & Experience в числа ---
    level_map = {
        "Beginner": 1,
        "Intermediate": 2,
        "Advanced": 3
    }
    out["difficulty_num"]  = out["Difficulty Level"].map(level_map)
    out["experience_num"]  = out["Experience_Level"].map(level_map)

    # На случай, если где-то NaN (не распознались)
    out["difficulty_num"] = out["difficulty_num"].fillna(out["difficulty_num"].median())
    out["experience_num"] = out["experience_num"].fillna(out["experience_num"].median())

    # --- E: Energy consumption ---
    # E_raw
    cals = out["Calories_Burned"]
    out["E_raw"] = (cals - cals.min()) / (cals.max() - cals.min())

    # E_eff
    burn30 = out["Burns Calories (per 30 min)"]
    out["E_eff"] = (burn30 - burn30.min()) / (burn30.max() - burn30.min())

    out["E"] = 0.5 * out["E_raw"] + 0.5 * out["E_eff"]

    # --- I: Intensity (HRR) ---
    out["pct_HRR"] = (out["Avg_BPM"] - out["Resting_BPM"]) / (out["Max_BPM"] - out["Resting_BPM"])
    hrr = out["pct_HRR"]
    out["I"] = (hrr - hrr.min()) / (hrr.max() - hrr.min())

    # --- S: Power component ---
    out["workload"] = out["Sets"] * out["Reps"]
    out["S"] = out["workload"] / out["difficulty_num"]

    # --- D: Duration (нормируем длину тренировки) ---
    out["Duration_min"] = out["Session_Duration (hours)"] * 60
    dur = out["Duration_min"]
    out["D"] = (dur - dur.min()) / (dur.max() - dur.min())

    # --- R: Risk = 1 - (0.4*pen_age + 0.3*pen_bmi + 0.2*pen_hrr + 0.1*pen_skill) ---

    # 1) возраст
    age = out["Age"]
    out["pen_age"] = (age - age.min()) / (age.max() - age.min())

    # 2) BMI
    bmi = out["BMI"]
    out["pen_bmi"] = (bmi - bmi.min()) / (bmi.max() - bmi.min())

    # 3) HRR (верхняя часть распределения)
    hrr90 = hrr.quantile(0.9)
    hrr_max = hrr.max()
    out["pen_hrr"] = np.where(
        hrr > hrr90,
        (hrr - hrr90) / (hrr_max - hrr90),
        0.0
    )

    # 4) Skill penalty (сложность > опыт)
    out["pen_skill"] = np.maximum(0, out["difficulty_num"] - out["experience_num"])

    out["R"] = 1 - (
        0.4 * out["pen_age"]
      + 0.3 * out["pen_bmi"]
      + 0.2 * out["pen_hrr"]
      + 0.1 * out["pen_skill"]
    )

    return out


df = add_workout_features(df)
df[["E", "I", "S", "D", "R"]].head()


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Unnamed: 0,E,I,S,D,R
0,0.570217,0.526466,,0.333333,
1,0.740993,0.256055,,0.575163,
2,0.54966,0.289199,,0.27451,
3,0.660172,0.53062,,0.398693,
4,0.560736,0.422805,,0.385621,


Meal feature engineering (C, P, MacroMatch, ED, F)

In [6]:
def add_meal_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Строим фичи для еды:
    C  - Calorie Fit (насколько блюдо попадает в целевые калории)
    P  - Proteins per meal
    MacroMatch - насколько БЖУ близки к целевым долям
    ED - Energy Density (kcal/gram)
    F  - Food Safety (по сахару, натрию, холестерину)
    """
    out = df.copy()

    # Удобно переименовать столбец
    out = out.rename(columns={"Daily meals frequency": "Meal_freq"})

    # --- C: Calorie Fit ---
    # Целевая калорийность одного приёма пищи
    out["Meal_target"] = out["CaloriesPerDay"] / out["Meal_freq"]

    out["C"] = 1 - (out["Calories"] - out["Meal_target"]).abs() / out["Meal_target"]

    # --- P: Proteins per meal ---
    out["P"] = out["Proteins"] / out["Meal_freq"]

    # --- MacroMatch ---
    # пересчитываем БЖУ в калории
    out["cal_from_protein"] = out["Proteins"] * 4
    out["cal_from_carbs"]   = out["Carbs"]    * 4
    out["cal_from_fats"]    = out["Fats"]     * 9

    total_macro_cal = (
        out["cal_from_protein"] +
        out["cal_from_carbs"] +
        out["cal_from_fats"]
    ).replace(0, np.nan)

    out["pct_p"] = out["cal_from_protein"] / total_macro_cal
    out["pct_c"] = out["cal_from_carbs"]   / total_macro_cal
    out["pct_f"] = out["cal_from_fats"]    / total_macro_cal

    # Целевые доли (для weight-loss, например)
    target_p, target_c, target_f = 0.30, 0.40, 0.30

    out["MacroMatch"] = 1 - (1 / 3) * (
        (out["pct_p"] - target_p).abs() +
        (out["pct_c"] - target_c).abs() +
        (out["pct_f"] - target_f).abs()
    )

    # --- ED: Energy Density ---
    out["ED"] = out["Calories"] / out["serving_size_g"]

    # --- F: Food Safety ---
    sugar_90 = out["sugar_g"].quantile(0.9)
    sodium_90 = out["sodium_g"].quantile(0.9)
    chol_90   = out["cholesterol_g"].quantile(0.9)

    out["F"] = 1 - 0.5 * (
        (out["sugar_g"]        / sugar_90).clip(upper=1) +
        (out["sodium_g"]       / sodium_90).clip(upper=1) +
        (out["cholesterol_g"]  / chol_90).clip(upper=1)
    )

    return out


df = add_meal_features(df)
df[["C", "P", "MacroMatch", "ED", "F"]].head()


Unnamed: 0,C,P,MacroMatch,ED,F
0,1.0,35.35,0.932175,14.991284,-0.240064
1,1.0,28.47,0.932767,14.448007,0.207901
2,1.0,49.055,0.932926,4.025737,-0.355138
3,1.0,26.946667,0.932475,8.453438,0.849143
4,1.0,44.35,0.932946,14.815561,0.088658


We divide it into workout features and meal features.

In [7]:
# Минимальный набор для workout-модели:
workout_features = df[[
    "Name of Exercise",
    "Workout_Type",
    "Target Muscle Group",
    "Body Part",
    "Type of Muscle",
    "Difficulty Level",
    "E", "I", "S", "D", "R"
]].copy()

# Минимальный набор для meal-модели:
meal_features = df[[
    "meal_name",
    "Calories",
    "Proteins",
    "Carbs",
    "Fats",
    "serving_size_g",
    "sodium_g",
    "cholesterol_g",
    "C", "P", "MacroMatch", "ED", "F"
]].copy()


Scaling (StandardScaler)

Workout Scaling

In [8]:
workout_num_cols = ["E", "I", "S", "D", "R"]

scaler_workout = StandardScaler()
workout_scaled = scaler_workout.fit_transform(workout_features[workout_num_cols])

workout_scaled_df = pd.DataFrame(
    workout_scaled,
    columns=[col + "_scaled" for col in workout_num_cols],
    index=workout_features.index
)

workout_final = pd.concat([workout_features, workout_scaled_df], axis=1)
workout_final.to_csv(WORKOUT_OUT, index=False)
print("Saved workout features to:", WORKOUT_OUT)


Saved workout features to: data/workout_features_scaled.csv


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


Meal Scaling

In [9]:
meal_num_cols = ["Calories", "Proteins", "Carbs", "Fats",
                 "serving_size_g", "sodium_g", "cholesterol_g",
                 "C", "P", "MacroMatch", "ED", "F"]

scaler_meal = StandardScaler()
meal_scaled = scaler_meal.fit_transform(meal_features[meal_num_cols])

meal_scaled_df = pd.DataFrame(
    meal_scaled,
    columns=[col + "_scaled" for col in meal_num_cols],
    index=meal_features.index
)

meal_final = pd.concat([meal_features, meal_scaled_df], axis=1)
meal_final.to_csv(MEAL_OUT, index=False)
print("Saved meal features to:", MEAL_OUT)

Saved meal features to: data/meal_features_scaled.csv
