#### Library

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures, LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from scipy import stats
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
import warnings as w
w.filterwarnings('ignore')

#### Dataset

In [20]:
# Location of the competition CSV files.
# Raw string literals (r'...') keep every backslash of the Windows path literal.
# The previous plain strings relied on invalid escape sequences such as '\F' and
# '\i' being passed through unchanged, which Python 3.12+ reports as a
# SyntaxWarning. The resulting path values are identical to the originals.
DATA_DIR = r'D:\Fahmi\AI-data\ML\Predict-calorie-expenditures\calorie_expenditure_regression\input'

train = pd.read_csv(DATA_DIR + r'\train.csv')
test = pd.read_csv(DATA_DIR + r'\test.csv')

#### Feature engineering

In [21]:
# Integer codes used for the 'Sex' column (named so the builtin `map` is not shadowed).
SEX_CODES = {'male': 0, 'female': 1}


def add_features(df):
    """Return a copy of ``df`` with the engineered features added.

    The same transformation is applied to the train and the test frame, so it
    lives in one place instead of being written out twice.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns 'Sex', 'Age', 'Height', 'Weight',
        'Duration' and 'Heart_Rate'.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the ratio/interaction
        columns appended, 'Height' divided by 100, 'BMI' added and 'Sex'
        replaced by its integer code.

    Raises
    ------
    ValueError
        If 'Sex' holds a non-null value that is not a key of ``SEX_CODES``.
        This is also what happens when the function is applied to a frame that
        was already transformed, so an accidental second run fails loudly
        instead of silently producing an all-NaN 'Sex' column.
    """
    out = df.copy()

    # Validate the categorical mapping first, before any column is changed.
    sex_codes = out['Sex'].map(SEX_CODES)
    unmapped = out['Sex'].notna() & sex_codes.isna()
    if unmapped.any():
        bad_values = sorted(out.loc[unmapped, 'Sex'].astype(str).unique())
        raise ValueError(
            f"Unexpected values in 'Sex': {bad_values}. "
            "Expected only 'male'/'female'; if this cell was already run, "
            "reload the data before running it again."
        )

    # Ratio features. The '+ 1' in the age denominators presumably guards
    # against a division by zero for Age == 0 -- TODO confirm.
    out['Weight_per_Age'] = out['Weight'] / (out['Age'] + 1)
    out['HeartRate_per_kg'] = out['Heart_Rate'] / out['Weight']
    out['Duration_per_age'] = out['Duration'] / (out['Age'] + 1)

    # Interaction between workout length and heart rate
    out['Duration_heart_rate'] = out['Duration'] * out['Heart_Rate']

    # Workout length relative to body weight
    out['Duration_per_weight'] = out['Duration'] / out['Weight']

    # Sum and product of the three duration-based features above
    duration_parts = ['Duration_per_weight', 'Duration_heart_rate', 'Duration_per_age']
    out['duration_sum'] = out[duration_parts].sum(axis=1, skipna=False)
    out['duration_multi'] = out[duration_parts].prod(axis=1, skipna=False)

    # Height is rescaled by 1/100 (centimetres to metres, assuming the raw
    # column is in centimetres) and then used for the BMI.
    out['Height'] = out['Height'] / 100
    out['BMI'] = (out['Weight'] / (out['Height'] ** 2)).round(2)

    # Encode gender as 0/1
    out['Sex'] = sex_codes
    return out


# Transform both frames before rebinding either name, so a failure on one of
# them leaves `train` and `test` untouched.
train_features = add_features(train)
test_features = add_features(test)
train, test = train_features, test_features

num_cols = train.select_dtypes(include=['int64', 'float64']).columns.drop(['id', 'Calories']).to_list()
cat_cols = train.select_dtypes(include=['object', 'category']).columns.to_list()

In [None]:
# Disabled preprocessing experiments: every line in this cell is commented out,
# so nothing here runs. The helpers are z-score outlier removal, percentile
# clipping, log1p transform of skewed columns and a variance-threshold filter.
#
# NOTE(review), should any of these be re-enabled:
#   * remove_outliers_by_zscore filters rows; calling it on `test` would drop
#     test rows, while the submission cell asserts that the number of
#     predictions equals the length of the sample submission.
#   * clip_target_values is applied to `num_cols`, which excludes 'Calories',
#     so despite its name it clips feature columns, not the target.
#   * each helper is called separately on `train` and `test`, so bounds,
#     skewness and variances are estimated per frame and may differ between
#     the two -- confirm this is intended.

# def remove_outliers_by_zscore(df, columns, threshold=3):
#     df_clean = df.copy()
#     for col in columns:
#         z_score = np.abs(stats.zscore(df_clean[col]))
#         df_clean = df_clean[z_score < threshold]
#     return df_clean

# train = remove_outliers_by_zscore(train, num_cols)
# test = remove_outliers_by_zscore(test, num_cols)

# def clip_target_values(df, cols, lower_percentile=1, upper_percentile=99):
#     df_clipped = df.copy()
#     for col in cols:
#         lower_bound = df_clipped[col].quantile(lower_percentile / 100)
#         upper_bound = df_clipped[col].quantile(upper_percentile / 100)
#         df_clipped[col] = df_clipped[col].clip(lower=lower_bound, upper=upper_bound)
#     return df_clipped

# train = clip_target_values(train, num_cols)
# test = clip_target_values(test, num_cols)

# def log_transform_skewed_features(df, cols, skewness_threshold=0.75):
#     df_transformed = df.copy()
#     for col in cols:
#         skewness = df_transformed[col].skew()
#         if skewness > skewness_threshold:
#             df_transformed[col] = np.log1p(df_transformed[col])
#     return df_transformed

# train = log_transform_skewed_features(train, num_cols)
# test = log_transform_skewed_features(test, num_cols)

# def remove_low_variance_features(df, cols, threshold=0.01):
#     df_clean = df.copy()
#     selector = VarianceThreshold(threshold)
#     df_clean[cols] = selector.fit_transform(df_clean[cols])
#     return df_clean

# train = remove_low_variance_features(train, num_cols)
# test = remove_low_variance_features(test, num_cols)

#### Predict

In [None]:
# Target vector and feature matrix ('id' carries no signal, 'Calories' is the label)
y = train['Calories']
X = train.drop(columns=['id', 'Calories'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# XGBoost settings gathered in one place: runtime options first, then the
# boosting schedule, tree shape and regularisation terms.
# NOTE(review): the objective is squared error on the raw target while the
# scoring cell reports RMSLE -- fitting on log1p(Calories) may be worth testing.
xgb_params = {
    'tree_method': 'hist',
    'objective': 'reg:squarederror',
    'random_state': 42,
    'verbosity': 0,
    'n_jobs': -1,
    'n_estimators': 1000,
    'learning_rate': 0.009456490533119234,
    'max_depth': 10,
    'min_child_weight': 6,
    'gamma': 0.3841072597639574,
    'subsample': 0.9494258651834557,
    'colsample_bytree': 0.6332097368693999,
    'reg_lambda': 1.7434697249357467,
    'reg_alpha': 0.6488287215083401,
}
xgb_model = XGBRegressor(**xgb_params)

# Fit on the training split only; X_val / y_val stay untouched for scoring
xgb_model.fit(X_train, y_train)

#### Scoring

In [None]:
# Predict the hold-out split; negative predictions are floored at zero because
# the log-based metric below is not defined for negative values.
preds = xgb_model.predict(X_val)
preds_clip = np.clip(preds, 0, None)

# RMSLE is the square root of the mean squared logarithmic error
msle = mean_squared_log_error(y_val, preds_clip)
rmsle = np.sqrt(msle)
print(f'RMSLE : {rmsle:.5f}')

RMSLE : 1.02452


#### Save submission

In [None]:
# Raw string literal: the Windows path contains backslashes (e.g. '\s', '\i')
# that must not be interpreted as escape sequences. The path value is unchanged.
submission = pd.read_csv(r"D:\Fahmi\AI-data\ML\Predict-calorie-expenditures\calorie_expenditure_regression\input\sample_submission.csv")
X_test = test.drop(columns=['id'])

# Predict the test set and floor negative outputs at zero, as in the scoring cell
preds = xgb_model.predict(X_test)
preds_clip = np.maximum(0, preds)

# The message reads "Number of predictions and IDs do not match!"
assert len(preds) == len(submission), "Jumlah prediksi dan ID tidak cocok!"

# Predictions are attached by position, so equal length alone is not enough:
# the sample submission must list the ids in the same order as the test frame.
if 'id' in submission.columns:
    ids_aligned = (submission['id'].to_numpy() == test['id'].to_numpy()).all()
    assert ids_aligned, "Row order of sample_submission.csv does not match test.csv ids"

submission['Calories'] = preds_clip
submission.to_csv('submission3.csv', index=False)