In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from pytorch_tabnet.tab_model import TabNetRegressor

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C

In [2]:
import os
# Print the path of every file found under the input directory (recursively).
for dirname, _, filenames in os.walk('./Input/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./Input/test.csv
./Input/train.csv


In [3]:
# Load the training data; the bare last expression previews the first rows.
df_train = pd.read_csv("./Input/train.csv")
df_train.head()

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,750000,male,45,177.0,81.0,7.0,87.0,39.8
1,750001,male,26,200.0,97.0,20.0,101.0,40.5
2,750002,female,29,188.0,85.0,16.0,102.0,40.4
3,750003,female,39,172.0,73.0,20.0,107.0,40.6
4,750004,female,30,173.0,67.0,16.0,94.0,40.5


In [None]:
# Load the test data; the bare last expression previews the first rows.
df_test = pd.read_csv("./Input/test.csv")
df_test.head()

In [None]:
# ============================
# Feature Engineering Modules
# ============================
def add_basic_features(df):
    """Return a copy of ``df`` extended with arithmetic and group-statistic features.

    The input must provide the columns ``Sex``, ``Age``, ``Height``,
    ``Weight``, ``Duration``, ``Heart_Rate`` and ``Body_Temp``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. It is not modified; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        New frame with the same rows, in the same order and with the same
        index as ``df``, plus the derived columns. ``Sex`` is replaced by a
        numeric code (female -> 1, male -> 2, anything else -> 0).

    Notes
    -----
    The ``*_maxdiff`` / ``*_mindiff`` columns and the per-group means are
    computed from the frame that is passed in, so their values depend on which
    rows that frame contains.
    """
    # Work on a copy so the caller's frame is never partially mutated.
    df = df.copy()

    # Ratio / transform features built from single columns.
    df['IMC'] = df['Weight'] / ((df['Height'] / 100) ** 2)
    df['BSA'] = 0.007184 * (df['Height'] ** 0.725) * (df['Weight'] ** 0.425)
    df['FCmax'] = 220 - df['Age']
    # replace(0, nan) turns a division by zero into a missing value instead of inf.
    df['FCArea'] = df['Heart_Rate'] / df['FCmax'].replace(0, np.nan)
    df['Log_Weight'] = np.log1p(df['Weight'])
    df['Sqrt_Height'] = np.sqrt(df['Height'])
    df['Age_squared'] = df['Age'] ** 2
    df['Weight_cubed'] = df['Weight'] ** 3
    df['Heart_Rate_per_Age'] = df['Heart_Rate'] / df['Age'].replace(0, np.nan)
    df['Temp_Heart_Interaction'] = df['Body_Temp'] * df['Heart_Rate']
    df['IMC_x_HeartRate'] = df['IMC'] * df['Heart_Rate']
    df['HeartRate_Temp_Age'] = df['Heart_Rate'] * df['Body_Temp'] / df['Age'].replace(0, np.nan)
    df['Sex'] = df['Sex'].map({'female': 1, 'male': 2}).fillna(0)

    # Pairwise product / sum / difference / ratio features.
    for f1 in ['Body_Temp', 'Age', 'Height']:
        for f2 in ['Weight', 'Duration', 'Heart_Rate']:
            df[f'{f1}_x_{f2}'] = df[f1] * df[f2]
            df[f'{f1}_+_{f2}'] = df[f1] + df[f2]
            df[f'{f1}_-_{f2}'] = df[f1] - df[f2]
            # The small epsilon keeps the ratio finite when f2 is zero.
            df[f'{f1}_by_{f2}'] = df[f1] / (df[f2] + 1e-5)

    for f1 in ['Heart_Rate', 'Body_Temp']:
        df[f'sin_{f1}'] = np.sin(df[f1])

    # Distance of every row from the column maximum / minimum of this frame.
    for f1 in ['Duration', 'Heart_Rate', 'Body_Temp', 'Weight', 'Height']:
        max_val = df[f1].max()
        min_val = df[f1].min()
        df[f'{f1}_maxdiff'] = max_val - df[f1]
        df[f'{f1}_mindiff'] = df[f1] - min_val

    # Per-group means broadcast back to the rows. transform('mean') yields the
    # same values as a groupby + left merge but keeps the original index and
    # avoids twelve full-frame merges.
    for f1 in ['Age', 'Height', 'Sex']:
        for f2 in ['Duration', 'Heart_Rate', 'Body_Temp', 'Weight']:
            mean_col = f'{f2}_{f1}_mean'
            df[mean_col] = df.groupby(f1)[f2].transform('mean')
            df[f'diff{f1}mean_grp{f2}'] = df[f2] - df[mean_col]
            df[f'add{f2}mean_grp{f1}'] = df[f2] + df[mean_col]

    for f1 in ['Age', 'Body_Temp']:
        df[f'{f1}_log'] = np.log1p(df[f1])

    # 'Age_squared' already exists (created above), so Age is not repeated here.
    for f1 in ['Duration', 'Heart_Rate', 'Body_Temp', 'Weight', 'Height']:
        df[f'{f1}_squared'] = df[f1] ** 2

    # Three-way interactions with the exercise duration.
    for f1 in ['Heart_Rate', 'Body_Temp']:
        for f2 in ['Weight', 'Age', 'Height']:
            df[f'dur_{f1}_x_{f2}'] = df['Duration'] * df[f1] * df[f2]
            df[f'dur_by_{f1}_{f2}'] = df['Duration'] / ((df[f1] * df[f2]) + 1e-5)
    return df

def add_metabolic_features(df):
    """Add energy-expenditure style features to ``df`` in place and return it.

    Reads ``Sex`` (rows equal to 1 use the first set of coefficients, all
    other rows the second), ``Weight``, ``Height``, ``Age``, ``Heart_Rate``,
    ``Duration`` and ``BSA``; writes ``TMB``, ``Burned_Calories``,
    ``Burned_Calories_per_min``, ``TMB_per_kg``, ``TMB_x_BSA`` and
    ``Activity``.
    """
    weight, height, age = df['Weight'], df['Height'], df['Age']
    heart_rate, minutes = df['Heart_Rate'], df['Duration']
    is_code_one = df['Sex'] == 1

    # Both TMB variants share the same linear part and differ only in the offset.
    tmb_linear = 10 * weight + 6.25 * height - 5 * age
    df['TMB'] = np.where(is_code_one, tmb_linear - 161, tmb_linear + 5)

    # Heart-rate based estimates, one coefficient set per Sex branch.
    burned_code_one = ((-20.4022 + (0.4472 * heart_rate) -
                        (0.1263 * weight) + (0.074 * age)) / 4.184) * minutes
    burned_other = ((-55.0969 + (0.6309 * heart_rate) +
                     (0.1988 * weight) + (0.2017 * age)) / 4.184) * minutes
    df['Burned_Calories'] = np.where(is_code_one, burned_code_one, burned_other)

    # Zero denominators become NaN so the ratios never produce inf.
    df['Burned_Calories_per_min'] = df['Burned_Calories'] / minutes.replace(0, np.nan)
    df['TMB_per_kg'] = df['TMB'] / weight.replace(0, np.nan)
    df['TMB_x_BSA'] = df['TMB'] * df['BSA']
    # TMB scaled by Duration / 1440 (1440 = minutes per day).
    tmb_over_duration = (df['TMB'] * (minutes / 1440)).replace(0, np.nan)
    df['Activity'] = df['Burned_Calories'] / tmb_over_duration
    return df

def add_combined_features(df):
    """Add ``AgeSex``: a 1-based integer code for each (Age, Sex) combination.

    The code is produced by a ``LabelEncoder`` fitted on the string
    concatenation of ``Age`` and ``Sex`` for the frame passed in. ``df`` is
    modified in place and returned.
    """
    # NOTE(review): the encoder is fitted per call, so codes are only comparable
    # between two frames if both contain the same set of combinations.
    age_sex_key = df['Age'].astype(str) + df['Sex'].astype(str)
    df['AgeSex'] = LabelEncoder().fit_transform(age_sex_key) + 1
    return df

def add_binned_features(df):
    """Add integer bin codes for Age, Duration, IMC and FCArea; return ``df``.

    ``Age`` and ``IMC`` use fixed edges (right-closed intervals; values outside
    the edges become NaN). ``Duration`` and ``FCArea`` are split into quartiles
    of the frame passed in, dropping duplicate edges. ``df`` is modified in
    place.
    """
    age_edges = [0, 20, 30, 40, 50, 60, 70, 80]
    imc_edges = [0, 18.5, 24.9, 29.9, 34.9, 39.9, 60]

    def quartile_codes(values):
        # Quartile index per row, tolerating repeated quantile edges.
        return pd.qcut(values, q=4, labels=False, duplicates='drop')

    df['Age_group'] = pd.cut(df['Age'], bins=age_edges, labels=False)
    df['Duration_group'] = quartile_codes(df['Duration'])
    df['IMC_category'] = pd.cut(df['IMC'], bins=imc_edges, labels=False)
    df['FCArea_binned'] = quartile_codes(df['FCArea'])
    return df

def feature_engineering(df):
    """Run the full feature pipeline on a copy of ``df`` and return the result.

    The steps run in a fixed order (basic -> metabolic -> combined -> binned)
    because later steps read columns created by earlier ones. Afterwards every
    +/-inf is turned into NaN and every NaN is filled with 0.
    """
    pipeline = (
        add_basic_features,
        add_metabolic_features,
        add_combined_features,
        add_binned_features,
    )
    engineered = df.copy()
    for step in pipeline:
        engineered = step(engineered)

    # Neutralise infinities first so they are caught by the fill below.
    engineered = engineered.replace([np.inf, -np.inf], np.nan)
    return engineered.fillna(0)

# ============================
# Target-based Features (Leakage Control)
# ============================
def compute_target_based_features(train_idx, valid_idx, data=None):
    """Target-encode the validation rows using statistics of the training rows only.

    Parameters
    ----------
    train_idx : array-like of int
        Positional indices of the rows whose ``Calories`` values are used to
        compute the group means.
    valid_idx : array-like of int
        Positional indices of the rows to encode. Passing ``train_idx`` here
        encodes the training rows with their own statistics.
    data : pandas.DataFrame, optional
        Frame providing ``Age``, ``Duration``, ``Sex`` and ``Calories``.
        Defaults to the notebook-level ``df_train``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Calories_by_AgeGroup``, ``Calories_by_Duration`` and
        ``Sex_encoded`` for the ``valid_idx`` rows. A group never seen in the
        training rows yields NaN.
    """
    if data is None:
        data = df_train
    train_temp = data.iloc[train_idx].copy()
    valid_temp = data.iloc[valid_idx].copy()

    # Age group & Calories_by_AgeGroup (fixed edges, identical for both parts)
    age_bins = [0, 20, 30, 40, 50, 60, 70, 80]
    train_temp['Age_group'] = pd.cut(train_temp['Age'], bins=age_bins, labels=False)
    valid_temp['Age_group'] = pd.cut(valid_temp['Age'], bins=age_bins, labels=False)
    age_means = train_temp.groupby('Age_group')['Calories'].mean()
    valid_temp['Calories_by_AgeGroup'] = valid_temp['Age_group'].map(age_means)

    # Duration group & Calories_by_Duration.
    # The quartile edges come from the training rows and are reused for the
    # validation rows; binning the validation rows with their own quantiles
    # would attach training means to differently defined groups.
    _, duration_edges = pd.qcut(train_temp['Duration'], q=4, labels=False,
                                retbins=True, duplicates='drop')
    duration_edges = np.array(duration_edges, dtype=float)
    if len(duration_edges) < 2:
        # Degenerate case (no usable quantile edges): a single catch-all group.
        duration_edges = np.array([-np.inf, np.inf])
    else:
        # Open the outer bins so values outside the training range still get a group.
        duration_edges[0], duration_edges[-1] = -np.inf, np.inf
    train_temp['Duration_group'] = pd.cut(train_temp['Duration'], bins=duration_edges, labels=False)
    valid_temp['Duration_group'] = pd.cut(valid_temp['Duration'], bins=duration_edges, labels=False)
    duration_means = train_temp.groupby('Duration_group')['Calories'].mean()
    valid_temp['Calories_by_Duration'] = valid_temp['Duration_group'].map(duration_means)

    # Sex_encoded: per-sex mean, taken through log1p and back through expm1
    sex_log_means = np.log1p(train_temp.groupby('Sex')['Calories'].mean())
    valid_temp['Sex_encoded_log'] = valid_temp['Sex'].map(sex_log_means)
    valid_temp['Sex_encoded'] = np.expm1(valid_temp['Sex_encoded_log'])

    return valid_temp[['Calories_by_AgeGroup', 'Calories_by_Duration', 'Sex_encoded']]

# ============================
# Model Training
# ============================
def _fit_predict_log(model, model_name, X_train, y_train, X_valid, y_valid):
    """Fit ``model`` on one fold and return its log-space predictions for ``X_valid``."""
    if model_name == 'CatBoost':
        model.fit(X_train, y_train,
                  eval_set=(X_valid, y_valid),
                  early_stopping_rounds=10,
                  verbose=False)
        return model.predict(X_valid)

    if model_name == 'LightGBM':
        model.fit(X_train, y_train,
                  eval_set=[(X_valid, y_valid)],
                  eval_metric='rmse')
        return model.predict(X_valid)

    if model_name == 'XGBoost':
        model.fit(X_train, y_train,
                  eval_set=[(X_valid, y_valid)])
        return model.predict(X_valid)

    if model_name == 'TabNet':
        # This branch feeds numpy arrays and a 2-D target.
        model.fit(X_train.values, y_train.values.reshape(-1, 1),
                  eval_set=[(X_valid.values, y_valid.values.reshape(-1, 1))],
                  eval_metric=["rmse"], max_epochs=50, patience=5)
        return model.predict(X_valid.values).squeeze()

    # Any other estimator: plain fit / predict on numpy arrays.
    model.fit(X_train.values, y_train.values)
    return model.predict(X_valid.values)


def train_predict_model(model, model_name):
    """Cross-validate ``model`` and collect its out-of-fold predictions.

    Uses the notebook-level ``kf``, ``X_full``, ``y_full`` (log1p target) and
    ``df_train``. For every fold the target-based features are appended to the
    training and validation matrices before fitting.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` / ``predict``; it is re-fitted on every fold.
    model_name : str
        Selects the fit call ('CatBoost', 'LightGBM', 'XGBoost', 'TabNet';
        anything else uses a plain fit on numpy arrays) and labels the printout.

    Returns
    -------
    tuple
        ``(oof_pred, mean_score)``: out-of-fold predictions on the original
        target scale and the mean per-fold RMSLE.
    """
    oof_pred = np.zeros(len(df_train))
    fold_scores = []

    for train_idx, valid_idx in kf.split(X_full):
        X_train = X_full.iloc[train_idx].copy()
        X_valid = X_full.iloc[valid_idx].copy()
        y_train = y_full.iloc[train_idx]
        y_valid = y_full.iloc[valid_idx]

        # Target-based features: both parts are encoded with statistics of the
        # training part of the fold.
        valid_encoded = compute_target_based_features(train_idx, valid_idx)
        train_encoded = compute_target_based_features(train_idx, train_idx)
        for col in valid_encoded.columns:
            X_valid[col] = valid_encoded[col].values
            X_train[col] = train_encoded[col].values

        y_pred_log = _fit_predict_log(model, model_name, X_train, y_train, X_valid, y_valid)

        # Back to the original scale; negative predictions are clipped to 0
        # before the log-error metric is evaluated.
        y_pred = np.maximum(0, np.expm1(y_pred_log))
        y_true = np.expm1(y_valid)

        fold_scores.append(np.sqrt(mean_squared_log_error(y_true, y_pred)))
        oof_pred[valid_idx] = y_pred

    mean_score = np.mean(fold_scores)
    print(f"{model_name} RMSLE CV mean: {mean_score:.5f}")
    return oof_pred, mean_score

In [5]:
# ============================
# Apply Feature Engineering
# ============================
# NOTE: this cell overwrites df_train / df_test with their engineered versions,
# so it is not idempotent; re-run the loading cells before re-running it.
df_train = feature_engineering(df_train)
df_test = feature_engineering(df_test)

# 'id' is a row identifier, not a predictor, so it is excluded from the model
# matrix together with the target. errors='ignore' keeps the cell working if
# the column is absent.
X_full = df_train.drop(columns=['Calories', 'id'], errors='ignore')
# Models are trained on log1p(target); predictions are mapped back with expm1.
y_full = np.log1p(df_train['Calories'])

n_splits = 5
# Kernel for the optional Gaussian-process model.
kernel = C(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2))
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

In [None]:
# ============================
# Model List & Execution
# ============================
# (estimator, display name) pairs; the name also selects the fit call used
# during cross-validation.
models_to_run = [
    (CatBoostRegressor(random_state=42, verbose=0), 'CatBoost'),
    (XGBRegressor(random_state=42, verbosity=0), 'XGBoost'),
    (LGBMRegressor(random_state=42, verbose=-1), 'LightGBM'),
    # Optional models, currently disabled:
    #(TabNetRegressor(device_name='cpu'), 'TabNet'), 
    #(GaussianProcessRegressor(kernel=kernel, alpha=1e-10, normalize_y=True), 'Gaussian')
]

# Cross-validate every model once, then split the results into the
# out-of-fold predictions and the mean CV scores, keyed by model name.
cv_results = {name: train_predict_model(mdl, name) for mdl, name in models_to_run}
oof_dict = {name: oof for name, (oof, _score) in cv_results.items()}
scores_dict = {name: score for name, (_oof, score) in cv_results.items()}

# ============================
# Hill Climbing Blending
# ============================
# Greedy forward selection of an equal-weight blend: models are visited from
# best to worst CV score and kept only if the running average improves.
base_score = np.inf
best_blend = None
best_models = []
current_preds = np.zeros(len(df_train))
y_true_full = np.expm1(y_full)  # target on the original scale

ranked_names = sorted(oof_dict, key=lambda model_name: scores_dict[model_name])
for name in ranked_names:
    n_kept = len(best_models)
    # Running mean of the kept models plus the candidate.
    candidate_preds = (current_preds * n_kept + oof_dict[name]) / (n_kept + 1)
    candidate_score = np.sqrt(
        mean_squared_log_error(y_true_full, np.maximum(0, candidate_preds))
    )
    if not (candidate_score < base_score):
        continue
    base_score = candidate_score
    best_models.append(name)
    current_preds = candidate_preds
    print(f"Ajout de {name} améliore le score à {candidate_score:.5f}")

print(f"Modèles retenus dans le blend : {best_models}")
print(f"Score OOF blend final : {base_score:.5f}")

# ============================
# Final Test Prediction
# ============================
# Global target statistics, computed on the full training set.
age_bins = [0, 20, 30, 40, 50, 60, 70, 80]
train_age_group = pd.cut(df_train['Age'], bins=age_bins, labels=False)
train_duration_group, duration_edges = pd.qcut(df_train['Duration'], q=4, labels=False, retbins=True)
# Open the outer quartile bins so test durations outside the training range
# still fall into the first / last group.
duration_edges = np.array(duration_edges, dtype=float)
duration_edges[0], duration_edges[-1] = -np.inf, np.inf

age_means_global = df_train.groupby(train_age_group)['Calories'].mean()
duration_means_global = df_train.groupby(train_duration_group)['Calories'].mean()
sex_log_means_global = np.log1p(df_train.groupby('Sex')['Calories'].mean())
sex_means_global = np.expm1(sex_log_means_global)

# Encode the test rows with the TRAIN bin edges and the TRAIN means.
df_test['Calories_by_AgeGroup'] = pd.cut(df_test['Age'], bins=age_bins, labels=False).map(age_means_global)
df_test['Calories_by_Duration'] = pd.cut(df_test['Duration'], bins=duration_edges, labels=False).map(duration_means_global)
df_test['Sex_encoded'] = df_test['Sex'].map(sex_means_global)

# The models were cross-validated with the three target-based columns appended
# to X_full, so the final fit needs the same columns in the same order.
X_train_final = X_full.copy()
X_train_final['Calories_by_AgeGroup'] = train_age_group.map(age_means_global).values
X_train_final['Calories_by_Duration'] = train_duration_group.map(duration_means_global).values
X_train_final['Sex_encoded'] = df_train['Sex'].map(sex_means_global).values

# Select the test columns by name so train and test matrices always line up.
X_test = df_test[X_train_final.columns]

if not best_models:
    raise RuntimeError("No model was selected for the blend; cannot build test predictions.")

test_preds = np.zeros(len(df_test))
for name in best_models:
    mdl = next(m for m, n in models_to_run if n == name)
    mdl.fit(X_train_final.values, y_full.values)

    # Predictions are in log1p space: map back and clip negatives to 0.
    y_test_pred_log = mdl.predict(X_test.values)
    y_test_pred = np.maximum(0, np.expm1(y_test_pred_log))
    test_preds += y_test_pred

# Equal-weight average of the selected models.
test_preds /= len(best_models)



In [None]:
# Refit every selected model on the full training set and average the test
# predictions.
# The training matrix is X_full plus the three target-based columns, encoded
# with the global statistics. It is identical for every model, so it is built
# once outside the loop.
X_train_final = X_full.copy()
X_train_final['Calories_by_AgeGroup'] = pd.cut(df_train['Age'], bins=[0,20,30,40,50,60,70,80], labels=False).map(age_means_global).values
X_train_final['Calories_by_Duration'] = pd.qcut(df_train['Duration'], q=4, labels=False).map(duration_means_global).values
X_train_final['Sex_encoded'] = df_train['Sex'].map(sex_means_global).values
y_train_final = y_full  # log1p(Calories)

# Select the test columns by name so both matrices share the same layout.
X_test = df_test[X_train_final.columns]

if not best_models:
    raise RuntimeError("No model was selected for the blend; cannot build test predictions.")

test_preds = np.zeros(len(df_test))
for name in best_models:
    mdl = next(m for m, n in models_to_run if n == name)

    # Fit on the matrices built above.
    mdl.fit(X_train_final.values, y_train_final.values)

    # Predictions are in log1p space: map back and clip negatives to 0.
    y_test_pred_log = mdl.predict(X_test.values)
    y_test_pred = np.expm1(y_test_pred_log)
    y_test_pred = np.maximum(0, y_test_pred)
    test_preds += y_test_pred

# Equal-weight average of the selected models.
test_preds /= len(best_models)

In [None]:
# Write the blended predictions to disk.
submission_path = './Output/submission4.csv'
# to_csv does not create missing directories.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)

submission = pd.DataFrame({
    'id': df_test['id'],
    'Calories': test_preds
})
submission.to_csv(submission_path, index=False)
# Report the file that was actually written.
print(f"Fichier {submission_path} généré.")