In [None]:
import pandas as pd 
import numpy as np 
import time
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import warnings
warnings.simplefilter('ignore')

In [None]:
# Load the three CSV inputs: training rows, test rows, and the submission template
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Names of the numerical columns that take part in the pairwise cross terms
numerical_features = [
    "Age",
    "Height",
    "Weight",
    "Duration",
    "Heart_Rate",
    "Body_Temp",
]

# Function to add cross terms i.e. feature1 x feature2 (only for numerical)
def add_feature_cross_terms(df, numerical_features):
    """Return a copy of ``df`` extended with pairwise product columns.

    For every unordered pair of names in ``numerical_features`` (taken in list
    order, earlier name first) a column ``"<first>_x_<second>"`` holding the
    element-wise product of the two columns is appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, never modified.
    numerical_features : sequence of str
        Column names to combine. Each must be a column of ``df``.

    Returns
    -------
    pandas.DataFrame
        The copy with the additional cross-term columns.
    """
    crossed = df.copy()

    for position, left in enumerate(numerical_features):
        # Pair each feature only with the ones after it, so every pair appears once
        for right in numerical_features[position + 1:]:
            crossed[f"{left}_x_{right}"] = crossed[left] * crossed[right]

    return crossed

# Expand both frames with the same pairwise-product columns
train = train.pipe(add_feature_cross_terms, numerical_features)
test = test.pipe(add_feature_cross_terms, numerical_features)

In [4]:
# Sub-frame of train containing only its numeric-dtype columns.
# NOTE(review): not referenced by any other cell visible here — confirm it is
# still needed before keeping or removing this cell.
num_features = train.select_dtypes(include='number')

In [None]:
# Encode non-numerical features
# The label mapping is learned from train only and then applied to both frames.
le = LabelEncoder().fit(train["Sex"])
for frame in (train, test):
    # Integer codes first, then category dtype (the model is built with
    # enable_categorical=True further down).
    frame["Sex"] = le.transform(frame["Sex"])
    frame["Sex"] = frame["Sex"].astype("category")

# Model inputs: everything except the row id (and the target, for train).
X = train.drop(["id", "Calories"], axis=1)
# The target is modelled on the log1p scale.
y = np.log1p(train["Calories"])
X_test = test.drop("id", axis=1)

In [None]:
FOLDS = 50
FEATURES = X.columns.tolist()

# Training device. A default-constructed XGBRegressor leaves `device` unset, so
# probing its params can never report "cuda"; the device is therefore chosen
# explicitly here. Change to "cuda" to train on a GPU.
DEVICE = "cpu"

# Hyperparameters shared by the model of every fold.
XGB_PARAMS = dict(
    device=DEVICE,
    max_depth=10,
    colsample_bytree=0.75,
    subsample=0.9,
    n_estimators=2000,
    learning_rate=0.02,
    gamma=0.01,
    max_delta_step=2,
    early_stopping_rounds=100,
    eval_metric="rmse",
    enable_categorical=True,
)

# KFold setup
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Arrays to store predictions. Both live on the log1p scale of the target:
# `oof` holds each training row's out-of-fold prediction, `pred` accumulates the
# test predictions of all folds and is averaged after the loop.
oof = np.zeros(len(train))
pred = np.zeros(len(test))

# The test matrix is the same for every fold, so it is prepared once.
x_test = X_test.copy()

# Start CV loop
for i, (train_idx, valid_idx) in enumerate(kf.split(X, y)):
    print(f"\n{'#'*10} Fold {i+1} {'#'*10}")

    # Positional selection with an index array already yields new objects,
    # so no extra .copy() is needed.
    x_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    x_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

    start = time.time()

    # Train model
    # NOTE(review): the validation fold drives early stopping and is also the
    # fold scored below, so the reported CV RMSE may be slightly optimistic.
    model = XGBRegressor(**XGB_PARAMS)
    model.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=100
    )

    # Predict OOF and test
    oof[valid_idx] = model.predict(x_valid)
    pred += model.predict(x_test)

    rmse = np.sqrt(mean_squared_error(y_valid, oof[valid_idx]))
    print(f"Fold {i+1} RMSE: {rmse:.4f}")
    # The timer brackets fitting and prediction only.
    print(f"Training & prediction time: {time.time() - start:.1f} sec")

# Average test predictions
pred /= FOLDS

# Final RMSE over all out-of-fold predictions (computed on the log1p target)
full_rmse = np.sqrt(mean_squared_error(y, oof))
print(f"\nFinal CV RMSE: {full_rmse:.4f}")


########## Fold 1 ##########
[0]	validation_0-rmse:0.95093
[100]	validation_0-rmse:0.14291
[200]	validation_0-rmse:0.06515
[300]	validation_0-rmse:0.06292
[400]	validation_0-rmse:0.06290
[416]	validation_0-rmse:0.06290
Fold 1 RMSE: 0.0629
Feature engineering & training time: 18.8 sec

########## Fold 2 ##########
[0]	validation_0-rmse:0.94321
[100]	validation_0-rmse:0.14189
[200]	validation_0-rmse:0.06355
[300]	validation_0-rmse:0.06105
[400]	validation_0-rmse:0.06094
[500]	validation_0-rmse:0.06096
[541]	validation_0-rmse:0.06095
Fold 2 RMSE: 0.0609
Feature engineering & training time: 22.3 sec

########## Fold 3 ##########
[0]	validation_0-rmse:0.95574
[100]	validation_0-rmse:0.14377
[200]	validation_0-rmse:0.06204
[300]	validation_0-rmse:0.05893
[400]	validation_0-rmse:0.05881
[500]	validation_0-rmse:0.05881
[524]	validation_0-rmse:0.05881
Fold 3 RMSE: 0.0588
Feature engineering & training time: 23.0 sec

########## Fold 4 ##########
[0]	validation_0-rmse:0.93257
[100]	validation_0

In [None]:
# Analysis & Generate submission
# `pred` was accumulated on the log1p scale of the target; expm1 maps it back.
y_preds = np.expm1(pred)
print('predict mean :', y_preds.mean())
print('predict median :', np.median(y_preds))

# Bounds applied to the final predictions.
# NOTE(review): presumably the observed Calories range in train — confirm.
lower_bound, upper_bound = 1, 314
y_preds = y_preds.clip(lower_bound, upper_bound)
print('predict mean after clip:', y_preds.mean())
print('predict median after clip:', np.median(y_preds))

# Write the predictions into the submission template and save it
submission["Calories"] = y_preds
submission.to_csv("xbgv2_submission_1.csv", index=False)
submission.head()

predict mean : 88.15055037742071
predict median : 76.41292912359289
predict mean after clip: 88.15055037742071
predict median after clip: 76.41292912359289


Unnamed: 0,id,Calories
0,750000,27.433538
1,750001,107.891647
2,750002,87.805718
3,750003,125.802524
4,750004,75.816528
