# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.utils import shuffle

# One seed for every randomised step (row shuffle, split, model) so reruns match.
SEED = 42

# Read the training table from disk.
main_data = pd.read_csv("./data/train.csv")

# Separate the regression target column from the remaining feature columns.
target = "critical_temp"
y = main_data[target]
X = main_data.drop(columns=[target])

# Permute the rows of X and y together before splitting.
# NOTE(review): train_test_split also shuffles by default, so this extra pass
# only affects which rows end up in each split - confirm it is intended.
X, y = shuffle(X, y, random_state=SEED)

# Hold out 20% of the rows as a test set; the other 80% is used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

# Baseline XGBoost regressor configured with the parameters specified in the paper.
paper_params = {
    "n_estimators": 374,              # Tree size: 374
    "max_depth": 16,                  # Maximum depth: 16
    "learning_rate": 0.02,            # Learning rate (eta): 0.02
    "min_child_weight": 1,            # Minimum child weight: 1
    "colsample_bytree": 0.5,          # Column subsampling: 0.50
    "random_state": SEED,
    "objective": "reg:squarederror",  # default: reg:squarederror
}
xgb_model = XGBRegressor(**paper_params)


# Learning Curve Analysis
def plot_learning_curve(model, X, y, title, *, cv=5, train_sizes=None, n_jobs=-1):
    """Plot mean training and validation RMSE against training-set size.

    Runs scikit-learn's ``learning_curve`` on ``model`` with the
    ``neg_root_mean_squared_error`` scorer, averages the scores over the
    cross-validation folds, and shows both curves in a new matplotlib figure.

    Args:
        model: Unfitted estimator passed straight to ``learning_curve``.
        X: Feature data handed to ``learning_curve``.
        y: Target values handed to ``learning_curve``.
        title: Text used as the figure title.
        cv: Cross-validation setting forwarded to ``learning_curve``
            (default 5, the value previously hard-coded).
        train_sizes: Training-set sizes forwarded to ``learning_curve``.
            ``None`` (default) uses ten evenly spaced fractions from 0.1 to
            1.0, the grid previously hard-coded.
        n_jobs: Parallelism setting forwarded to ``learning_curve``
            (default -1, the value previously hard-coded).
            NOTE(review): the model may also run multi-threaded on its own;
            lower this if the machine ends up oversubscribed.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Build the default grid here rather than in the signature so the array
    # is not shared between calls.
    if train_sizes is None:
        train_sizes = np.linspace(0.1, 1.0, 10)

    sizes, train_scores, valid_scores = learning_curve(
        model,
        X,
        y,
        cv=cv,
        scoring="neg_root_mean_squared_error",
        n_jobs=n_jobs,
        train_sizes=train_sizes,
    )

    # The scorer reports negated RMSE, so flip the sign after averaging the
    # per-fold scores (axis 1) for each training size.
    train_mean = -np.mean(train_scores, axis=1)
    valid_mean = -np.mean(valid_scores, axis=1)

    plt.figure(figsize=(8, 6))
    plt.plot(sizes, train_mean, label="Training RMSE")
    plt.plot(sizes, valid_mean, label="Validation RMSE")
    plt.xlabel("Training Size")
    plt.ylabel("RMSE")
    plt.title(title)
    plt.legend()
    plt.show()

# Draw the learning curve for the baseline model, using the training split only.
plot_learning_curve(
    model=xgb_model,
    X=X_train,
    y=y_train,
    title="Learning Curve - Author's original model",
)

