In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

# ---------------------------------------------------------------------------
# Data: read the training table and separate features from the target.
# ---------------------------------------------------------------------------
main_data = pd.read_csv("./data/train.csv")

# "critical_temp" is the regression target; every other column is a feature.
y = main_data["critical_temp"]
X = main_data.drop(columns="critical_temp")

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ---------------------------------------------------------------------------
# Model: XGBoost regressor using the hyperparameters specified in the paper.
# ---------------------------------------------------------------------------
# Alternative baseline configuration (common default-style values), kept for
# reference; swap it in below to evaluate the baseline instead:
#   XGBRegressor(n_estimators=100, max_depth=6, learning_rate=0.1,
#                random_state=42)
paper_params = {
    "n_estimators": 374,        # Tree size: 374
    "max_depth": 16,            # Maximum depth: 16
    "learning_rate": 0.02,      # Learning rate (η): 0.02
    "min_child_weight": 1,      # Minimum child weight: 1
    "colsample_bytree": 0.5,    # Column subsampling: 0.50
    "random_state": 42,
    "objective": "reg:squarederror",
}
xgb_model = XGBRegressor(**paper_params)

# Train on the training split, then predict the held-out rows.
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

# ---------------------------------------------------------------------------
# Evaluation: RMSE and R² on the held-out test split.
# ---------------------------------------------------------------------------
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(
    "Author's parameters XGBoost Performance:",
    "RMSE: {:.4f}".format(rmse),
    "R²: {:.4f}".format(r2),
    sep="\n",
)

# Optional 5-fold cross-validation on the training split only. The scorer
# yields negated RMSE values, so the mean is sign-flipped to report a
# positive RMSE.
cv_scores = cross_val_score(
    xgb_model,
    X_train,
    y_train,
    cv=5,
    scoring="neg_root_mean_squared_error",
)
cv_rmse = -cv_scores.mean()
print("Cross-validated RMSE: {:.4f}".format(cv_rmse))


Results:

Baseline XGBoost Performance:

RMSE: 9.7176
R²: 0.9180
Cross-validated RMSE: 10.2925


Author's parameters XGBoost Performance:

RMSE: 8.8457
R²: 0.9320
Cross-validated RMSE: 9.6232
