In [None]:
# # EXPERIMENTING WITH LAGGING -- THIS IS FOR FEATURE ENGINEERING
# # Create a lagged version of the El Niño dataset, shifting by 3 months (90 days)
# df_nino_lagged = nino_indices_df.with_columns((pl.col('time') + pl.duration(days=90)).alias('time_lagged'))

In [None]:
# df_nino_lagged = df_nino_lagged.with_columns(
#     pl.col('time_lagged').dt.date().alias('time_lagged')
# )

In [None]:
# Write `df_nino_lagged` to the intermediate-data folder as a Parquet file.
# NOTE(review): the only assignment to `df_nino_lagged` visible in this part of
# the notebook is commented out -- confirm it is defined in an earlier cell,
# otherwise this cell raises NameError on a fresh kernel.
output_file_path = (
    '/workspace/soil-ml-modeling-pipeline/ml-modeling-pipeline'
    '/data/02_intermediate/preprocessed_nino_data.parquet'
)
df_nino_lagged.write_parquet(output_file_path)

In [None]:
# Read the preprocessed Niño Parquet file from the intermediate-data folder
# into a polars DataFrame.
# NOTE(review): this cell used to be labelled as NASA meteorological data, but
# the file name says Niño data -- confirm the provenance.
df_nino_lagged = pl.read_parquet(
    "/workspace/soil-ml-modeling-pipeline/ml-modeling-pipeline"
    "/data/02_intermediate/preprocessed_nino_data.parquet"
)

In [None]:
# pandas is imported here because this cell runs before the notebook's main
# import cell; without it a Restart & Run All fails with NameError on `pd`.
import pandas as pd

# Load the preprocessed primary dataset used for modelling.
# NOTE(review): this absolute path is rooted at /teamspace/studios/..., while the
# Niño cells use /workspace/... -- confirm both locations exist in the same
# environment (a shared, configurable data directory would avoid the mismatch).
df = pd.read_parquet("/teamspace/studios/this_studio/ml-drought-forecasting/ml-modeling-pipeline/data/03_primary/preprocessed_data.parquet")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
import shap
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Prepare the data: 'eddi_06mn' is the regression target; 'time', 'lat' and
# 'lon' are dropped, and every remaining column is used as a predictor.
feature_columns = df.columns.drop(['eddi_06mn', 'time', 'lat', 'lon'])
X = df[feature_columns].copy()  # copy so the downcast below never writes into df
y = df['eddi_06mn']

# Downcast 64-bit numeric columns to the smallest float dtype that can hold them
for col in X.select_dtypes(include=['float64', 'int64']).columns:
    X[col] = pd.to_numeric(X[col], downcast='float')

# Hold out 20% of the rows as the final test set.
# NOTE(review): this is a random row-level split; if rows that are close in
# 'time'/'lat'/'lon' are strongly correlated, a time- or region-based split
# would give a stricter estimate -- confirm which is intended.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Carve a validation set out of the training portion (0.125 * 80% = 10% of all
# rows). Early stopping is driven by this set so that the test set is never
# used to choose the number of boosting rounds; otherwise the reported test
# metrics would be optimistically biased.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.125, random_state=42
)

# Fit the scaler on the training rows only, then reuse it for every split
scaler = StandardScaler().fit(X_train)


def _scale_to_frame(frame):
    """Apply the fitted scaler and restore the frame's column names and index."""
    return pd.DataFrame(
        scaler.transform(frame), columns=frame.columns, index=frame.index
    )


X_train_scaled = _scale_to_frame(X_train)
X_val_scaled = _scale_to_frame(X_val)
X_test_scaled = _scale_to_frame(X_test)

# Create DMatrix objects for XGBoost (DataFrames keep the feature names)
dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
dval = xgb.DMatrix(X_val_scaled, label=y_val)
dtest = xgb.DMatrix(X_test_scaled, label=y_test)

# Set up parameters for XGBoost with updated GPU settings
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'tree_method': 'hist',  # Use 'hist' instead of 'gpu_hist'
    'device': 'cuda',       # Specify GPU device
    'max_depth': 6,
    'eta': 0.1,
    'subsample': 0.8,
    'random_state': 42
}

# Train the model. Early stopping watches the LAST entry of `evals`, so the
# validation set must come last.
evals = [(dtrain, 'train'), (dval, 'eval')]
xgb_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=1000,
    early_stopping_rounds=20,
    evals=evals,
    verbose_eval=50
)

# Evaluate on the untouched test set. The returned booster also contains the
# rounds trained after the best one, so score with the best iteration
# explicitly rather than relying on the installed XGBoost version's default.
y_pred = xgb_model.predict(
    dtest, iteration_range=(0, xgb_model.best_iteration + 1)
)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'MSE: {mse}, R^2: {r2}')

# Tabulate gain-based importances, strongest feature first, and show the top 10
importance_dict = xgb_model.get_score(importance_type='gain')
importances = pd.DataFrame(
    list(importance_dict.items()), columns=['Feature', 'Importance']
).sort_values(by='Importance', ascending=False)
print(importances.head(10))

# Bar chart of feature importances, limited to 50 features.
# NOTE(review): plot_importance defaults to importance_type='weight', so this
# ranking can differ from the gain-based table printed above -- confirm which
# measure is intended.
plt.rcParams.update({'figure.figsize': [10, 7]})
ax = xgb.plot_importance(xgb_model, max_num_features=50)
ax.set_title('XGBoost Feature Importances (GPU Accelerated)')
plt.show()

# Compute SHAP values on a reproducible random sample of the scaled test rows
explainer = shap.TreeExplainer(xgb_model)

# Cap the sample at the number of available rows: DataFrame.sample raises
# ValueError when n exceeds the population and replace=False (the default).
shap_sample_size = min(1000, len(X_test_scaled))
X_test_sample = X_test_scaled.sample(n=shap_sample_size, random_state=42)
shap_values = explainer.shap_values(X_test_sample)

# SHAP summary plot
shap.summary_plot(shap_values, X_test_sample, feature_names=X.columns)
