In [7]:
# attempt to setup and run the model as close to the author's description as possible
# this time use different train test splits

import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load the data
main_data = pd.read_csv("./data/train.csv")

# 'critical_temp' is the target; every remaining column is used as a feature
X = main_data.drop('critical_temp', axis=1)
y = main_data['critical_temp']

# Train/test ratios to evaluate, as (label, held-out fraction) pairs.
# The held-out fraction used to be hard-coded and edited by hand between runs,
# so only the last ratio could be reproduced from the code.
# NOTE(review): 0.33 is assumed for the "66-33" split -- confirm.
# 95-05 is deliberately last: the names left behind after the loop (X_train,
# xgb_model, xgb_test, ...) then describe the same split as before.
SPLITS = [
    ("66-33", 0.33),
    ("70-30", 0.30),
    ("80-20", 0.20),
    ("90-10", 0.10),
    ("95-05", 0.05),
]

# One record per split, so the scores can be tabulated or plotted afterwards
split_results = []

for split_label, test_size in SPLITS:
    # Split the data into training and testing sets (same seed for every ratio)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    # Create a baseline XGBoost model with the parameters specified in the paper.
    # A fresh estimator is built for every split so no fitted state carries over.
    xgb_model = XGBRegressor(
        n_estimators=374,         # Tree size: 374
        max_depth=16,             # Maximum depth: 16
        learning_rate=0.02,       # Learning rate (η): 0.02
        min_child_weight=1,       # Minimum child weight: 1
        colsample_bytree=0.5,     # Column subsampling: 0.50
        random_state=42,
        objective='reg:squarederror'
    )

    xgb_model.fit(X_train, y_train)

    # Generate predictions on the held-out rows only
    y_pred_xgb_test = xgb_model.predict(X_test)

    # Evaluate Model: RMSE is the square root of the mean squared error
    xgb_test = np.sqrt(mean_squared_error(y_test, y_pred_xgb_test))
    xgb_r2_test = r2_score(y_test, y_pred_xgb_test)

    split_results.append({
        "split": split_label,
        "test_size": test_size,
        "rmse": xgb_test,
        "r2": xgb_r2_test,
    })

    print(f"{split_label} split: Test RMSE: {xgb_test:.4f}, Test R²: {xgb_r2_test:.4f}")



Test RMSE: 8.7622, Test R²: 0.9306


Runs (test-set RMSE and R² for each train-test split):

66-33 split:

Test RMSE: 9.4656, Test R²: 0.9230

70-30 split:

Test RMSE: 9.4038, Test R²: 0.9238

80-20 split:

Test RMSE: 8.8457, Test R²: 0.9320

90-10 split:

Test RMSE: 8.5532, Test R²: 0.9362

95-05 split:

Test RMSE: 8.7622, Test R²: 0.9306


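Note: RMSE falls from 9.4656 (66-33) to 8.5532 (90-10) as the training share grows, then rises to 8.7622 at 95-05; R² is likewise highest at 90-10 (0.9362). Each ratio was evaluated on a single split, so small differences between neighbouring ratios should be read with caution.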


