#### Library

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures, LabelEncoder, PowerTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from scipy import stats
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import mean_squared_log_error
import warnings as w
w.filterwarnings('ignore')

#### Dataset

In [59]:
train = pd.read_csv('D:\Fahmi\AI-data\ML\Predict-calorie-expenditures\calorie_expenditure_regression\input\\train.csv')
test = pd.read_csv('D:\Fahmi\AI-data\ML\Predict-calorie-expenditures\calorie_expenditure_regression\input\\test.csv')

#### Feature engineering

In [60]:
# Weight_per_Age
train['Weight_per_Age'] = train['Weight'] / (train['Age'] + 1)
test['Weight_per_Age'] = test['Weight'] / (test['Age'] + 1)

# HeartRate per Weight
train['HeartRate_per_kg'] = train['Heart_Rate'] / train['Weight']
test['HeartRate_per_kg'] = test['Heart_Rate'] / test['Weight']
# Duration Per Age
train['Duration_per_age'] = train['Duration'] / (train['Age'] + 1)
test['Duration_per_age'] = test['Duration'] / (test['Age'] + 1 )

# Duration * Heart Rate
train['Duration_heart_rate']=train['Duration']*train['Heart_Rate']
test['Duration_heart_rate']=test['Duration']*test['Heart_Rate']

# Intensity
train['Duration_per_weight']=train['Duration']/train['Weight']
test['Duration_per_weight']=test['Duration']/test['Weight']

# All Durations add and multi
train['duration_sum']=train['Duration_per_weight']+train['Duration_heart_rate']+train['Duration_per_age']
test['duration_sum']=test['Duration_per_weight']+test['Duration_heart_rate']+test['Duration_per_age']

train['duration_multi']=train['Duration_per_weight']*train['Duration_heart_rate']*train['Duration_per_age']
test['duration_multi']=test['Duration_per_weight']*test['Duration_heart_rate']*test['Duration_per_age']

# Converting Height in Meters
train['Height']=train['Height']/100
test['Height']=test['Height']/100

# Creating new column 'BMI'
train['BMI']=train['Weight']/(train['Height'] ** 2)
train['BMI']=train['BMI'].round(2)
test['BMI']=test['Weight']/(test['Height'] ** 2)
test['BMI']=test['BMI'].round(2)

# Mapping Genders
map={'male':0,'female':1}
train['Sex']=train['Sex'].map(map)
test['Sex']=test['Sex'].map(map)

num_cols = train.select_dtypes(include=['int64', 'float64']).columns.drop(['id', 'Calories']).to_list()

#### Experiment

In [None]:
def remove_outliers_by_zscore(df, cols, threshold=3):
    z_scores = np.abs((df[cols] - df[cols].mean()) / df[cols].std())
    return df[(z_scores < threshold).all(axis=1)]

train = remove_outliers_by_zscore(train, num_cols)

In [39]:
def scale_features(cols, df, test_df=None):
    scaler = StandardScaler()
    df[cols] = scaler.fit_transform(df[cols])
    if test_df is not None:
        test_df[cols] = scaler.transform(test_df[cols])
    return df, test_df

train, test = scale_features(num_cols, train, test)

In [44]:
def log_transform_skewed_features(cols, df, test_df=None):
    for col in cols:
        df[col] = np.log1p(df[col])
        if test_df is not None:
            test_df[col] = np.log1p(test_df[col])
    return df, test_df

train, test = log_transform_skewed_features(num_cols, train, test)

In [4]:
def create_polynomial_features(df, cols, degree=2):
    pf = PolynomialFeatures(degree, include_bias=False)
    poly_features = pf.fit_transform(df[cols])
    poly_df = pd.DataFrame(poly_features, columns=pf.get_feature_names_out(cols), index=df.index)
    df = df.drop(columns=cols).join(poly_df)
    return df

train = create_polynomial_features(train, num_cols)
test = create_polynomial_features(test, num_cols)

In [None]:
def create_interaction_features(df, col_pairs):
    for col1, col2 in col_pairs:
        df[f'{col1}_x_{col2}'] = df[col1] * df[col2]
        df[f'{col1}_plus_{col2}'] = df[col1] + df[col2]
    return df

In [17]:
def create_binned_features(df, cols, bins=5, labels=None):
    for col in cols:
        if labels is None:
            labels = [f'{col}_bin_{i}' for i in range(bins)]
        min_val = df[col].min()
        max_val = df[col].max()
        bin_edges = np.linspace(min_val, max_val, bins + 1)
        df[f'{col}_bin'] = pd.cut(df[col], bins=bin_edges, labels=labels, include_lowest=True)
    return df

train = create_binned_features(train, num_cols)
test = create_binned_features(test, num_cols)

In [50]:
def create_statistical_group_features(df, group_col, target_col):
    group_mean = df.groupby(group_col)[target_col].transform('mean')
    group_sum = df.groupby(group_col)[target_col].transform('sum')
    df[f'{group_col}_target_mean'] = group_mean
    df[f'{group_col}_target_sum'] = group_sum
    return df

train = create_statistical_group_features(train, num_cols, 'Calories')

def add_statistical_group_features_from_train(train_df, test_df, group_col, target_col):
    group_stats = train_df.groupby(group_col)[target_col].agg(['mean', 'sum']).reset_index()
    if isinstance(group_col, list):
        group_stats.columns = group_col + ['target_mean', 'target_sum']
        test_df = test_df.merge(group_stats, on=group_col, how='left')
        test_df = test_df.rename(columns={
            'target_mean': f"{'_'.join(group_col)}_target_mean",
            'target_sum': f"{'_'.join(group_col)}_target_sum"
        })
    else:
        group_stats.columns = [group_col, 'target_mean', 'target_sum']
        test_df = test_df.merge(group_stats, on=group_col, how='left')
        test_df = test_df.rename(columns={
            'target_mean': f"{group_col}_target_mean",
            'target_sum': f"{group_col}_target_sum"
        })
    return test_df

test = add_statistical_group_features_from_train(train, test, num_cols, 'Calories')

In [56]:
def create_rank_features(cols, df, test_df=None):
    for col in cols:
        df[f'{col}_rank'] = df[col].rank(method='average')
        if test_df is not None:
            test_df[f'{col}_rank'] = test_df[col].rank(method='average')
    return df, test_df

train, test = create_rank_features(num_cols, train, test)

In [61]:
def create_pca_features(df, cols, n_components=2):
    pca = PCA(n_components=n_components, random_state=42)
    pca_features = pca.fit_transform(df[cols])
    for i in range(n_components):
        df[f'pca_{i+1}'] = pca_features[:, i]
    return df

train = create_pca_features(train, num_cols, n_components=2)
test = create_pca_features(test, num_cols, n_components=2)

In [None]:
def select_features_by_correlation(df, threshold=0.9):
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    return df.drop(columns=to_drop)

#### Predict

In [62]:
def clean_feature_names(df):
    df.columns = [
        col.replace('[', 'l_')
           .replace(']', '_r')
           .replace('<', 'lt_')
           .replace('>', 'gt_')
           .replace(' ', '_')
           .replace("'", '')
           .replace(',', '_')
           for col in df.columns
    ]
    return df

X = train.drop(columns=['id', 'Calories'])
y = train['Calories']

train = clean_feature_names(train)
test = clean_feature_names(test)
X = clean_feature_names(X)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = clean_feature_names(X_train)
X_val = clean_feature_names(X_val)

xgb_model = XGBRegressor(
    tree_method = 'hist',
    objective = 'reg:squarederror',
    random_state = 42,
    verbosity = 0,
    n_jobs = -1,
    n_estimators = 1000,
    learning_rate = 0.009456490533119234,
    max_depth = 10,
    reg_lambda = 1.7434697249357467,
    reg_alpha = 0.6488287215083401,
    subsample = 0.9494258651834557,
    colsample_bytree = 0.6332097368693999,
    min_child_weight = 6,
    gamma = 0.3841072597639574,
    enable_categorical = True
)

xgb_model.fit(X_train, y_train)



#### Scoring

In [63]:
preds = xgb_model.predict(X_val)
preds_clip = np.maximum(0, preds)
rmsle = np.sqrt(mean_squared_log_error(y_val, preds_clip))
print(f'RMSLE : {rmsle:.5f}')

RMSLE : 0.06015


In [None]:
def evaluate_with_log_transform(y_true, y_pred):
    y_true_log = np.log1p(y_true)
    y_pred_log = np.log1p(np.maximum(0, y_pred))
    return np.sqrt(mean_squared_log_error(y_true_log, y_pred_log))
rmsle_log = evaluate_with_log_transform(y_val, preds_clip)
print(f'RMSLE : {rmsle_log:.5f}')

RMSLE: 0.01759


In [None]:
def rmsle_cv_stratified(X, y, model, n_splits=5):
    y_quantile = pd.qcut(y, q=n_splits, labels=False, duplicates='drop')
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    rmsle_scores = []
    for train_idx, val_idx in skf.split(X, y_quantile):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        preds_clip = np.maximum(0, preds)
        rmsle_scores.append(np.sqrt(mean_squared_log_error(y_val, preds_clip)))
    return np.mean(rmsle_scores), np.std(rmsle_scores)

rmsle, rmsle_std = rmsle_cv_stratified(X, y, xgb_model)
print(f'RMSLE : {rmsle:.5f}, STD : {rmsle_std:.5f}')

RMSLE : 0.060798786897051836, STD : 0.0004128974657424499


In [None]:
def evaluate_with_boxcox(y_true, y_pred):
    pt = PowerTransformer(method='box-cox')
    y_true_bc = pt.fit_transform(y_true.values.reshape(-1, 1)).flatten()
    y_pred_bc = pt.transform(np.maximum(0, y_pred).reshape(-1, 1)).flatten()
    return np.sqrt(mean_squared_log_error(y_true_bc, y_pred_bc))

In [None]:
def evaluate_with_quantile_transform(y_true, y_pred):
    qt = QuantileTransformer(output_distribution='normal', random_state=42)
    y_true_qt = qt.fit_transform(y_true.values.reshape(-1, 1)).flatten()
    y_pred_qt = qt.transform(np.maximum(0, y_pred).reshape(-1, 1)).flatten()
    return np.sqrt(mean_squared_log_error(y_true_qt, y_pred_qt))

#### Save submission

In [53]:
submission = pd.read_csv("D:\Fahmi\AI-data\ML\Predict-calorie-expenditures\calorie_expenditure_regression\input\sample_submission.csv")
X_test = test.drop(columns=['id'])
X_test.columns = X.columns

if list(X_test.columns) != list(X.columns):
    print("Feature names do not match between X_test and X!")
    print("X_test columns:", list(X_test.columns))
    print("X columns:", list(X.columns))
else:
    preds = xgb_model.predict(X_test)
    preds_clip = np.maximum(0, preds)
    submission['Calories'] = preds_clip
    submission.to_csv('submission3.csv', index=False)
    print("Submission saved successfully.")

preds = xgb_model.predict(X_test)
preds_clip = np.maximum(0, preds)
# assert len(preds) == len(submission), "Jumlah prediksi dan ID tidak cocok!"
submission['Calories'] = preds_clip
submission.to_csv('submission3.csv', index=False)



Submission saved successfully.
