In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Read each base model's saved predictions into its own DataFrame
xgboost_df, svr_df, rf_df, catboost_df, lightgbm_df, lstm_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../base_predictions/xgboost_preds.csv",
        "../base_predictions/svr_preds.csv",
        "../base_predictions/rf_preds.csv",
        "../base_predictions/catboost_preds.csv",
        "../base_predictions/lightgbm_preds.csv",
        "../base_predictions/lstm_preds.csv",
    )
)

# Show the columns of every frame: the files do not all share the same schema
for df in (xgboost_df, svr_df, rf_df, catboost_df, lightgbm_df, lstm_df):
    print(df.columns)

Index(['geocode', 'week', 'xgboost_pred', 'actual'], dtype='object')
Index(['geocode', 'svr_pred', 'actual'], dtype='object')
Index(['geocode', 'week', 'rf_pred', 'actual'], dtype='object')
Index(['geocode', 'catboost_pred', 'actual'], dtype='object')
Index(['geocode', 'week', 'lightgbm_predict', 'actual'], dtype='object')
Index(['geocode', 'lstm_pred', 'actual'], dtype='object')


In [3]:
# Give every prediction column the display name of the model that produced it
xgboost_df, svr_df, rf_df, catboost_df, lightgbm_df, lstm_df = (
    frame.rename(columns={old_name: new_name})
    for frame, old_name, new_name in (
        (xgboost_df, 'xgboost_pred', 'XGBoost'),
        (svr_df, 'svr_pred', 'SVR'),
        (rf_df, 'rf_pred', 'RF'),
        (catboost_df, 'catboost_pred', 'CatBoost'),
        (lightgbm_df, 'lightgbm_predict', 'LightGBM'),
        (lstm_df, 'lstm_pred', 'LSTM'),
    )
)

# Store 'actual' as rounded integers in every frame
for df in (xgboost_df, svr_df, rf_df, catboost_df, lightgbm_df, lstm_df):
    rounded_actual = df['actual'].round()
    df['actual'] = rounded_actual.astype(int)

In [4]:
# Seed the ensemble table with the key columns, the target and the XGBoost predictions
ensemble_df = xgboost_df.loc[:, ['geocode', 'week', 'actual', 'XGBoost']].copy()

# Define a function to safely add predictions from another dataframe
def add_prediction_column(base_df, new_df, new_col_name):
    """Attach one model's predictions to ``base_df`` as column ``new_col_name``.

    Rows are paired per geocode by position: the k-th row of a geocode in
    ``new_df`` goes with the k-th row of the same geocode in ``base_df``.
    The ``actual`` values of both frames are compared as a guard against
    mis-ordered inputs.

    Each geocode's predictions are written to the row positions that geocode
    occupies in ``base_df``, so the result is correct even when ``base_df`` is
    not sorted by geocode or its geocode rows are not contiguous.

    Parameters
    ----------
    base_df : pandas.DataFrame
        Frame with at least ``geocode`` and ``actual`` columns. It is modified
        in place (the new column is added to it).
    new_df : pandas.DataFrame
        Frame with ``geocode``, ``actual`` and ``new_col_name`` columns.
    new_col_name : str
        Name of the prediction column to copy from ``new_df``.

    Returns
    -------
    pandas.DataFrame
        ``base_df`` itself, with ``new_col_name`` added.

    Raises
    ------
    KeyError
        If a geocode of ``base_df`` does not occur in ``new_df``.
    AssertionError
        If a geocode has a different number of rows in the two frames, or its
        ``actual`` values differ between them.
    ValueError
        If some rows of ``base_df`` have a missing geocode and therefore
        cannot be matched.
    """
    # Group a positionally indexed view: each group's index then holds the row
    # numbers of that geocode inside base_df, whatever base_df's own index is.
    base_view = base_df[['geocode', 'actual']].reset_index(drop=True)
    grouped_new = new_df.groupby('geocode')

    aligned_preds = np.empty(len(base_df), dtype=new_df[new_col_name].to_numpy().dtype)
    rows_filled = 0

    for geocode, base_group in base_view.groupby('geocode'):
        try:
            new_group = grouped_new.get_group(geocode)
        except KeyError:
            raise KeyError(
                f"geocode {geocode} is missing from the {new_col_name} predictions"
            ) from None

        # Explicit raise (not `assert`) so the check survives `python -O`
        if len(base_group) != len(new_group):
            raise AssertionError(f"Row count mismatch for geocode {geocode}")

        base_actuals = base_group['actual'].values
        new_actuals = new_group['actual'].values

        if not np.array_equal(base_actuals, new_actuals):
            mismatches = np.where(base_actuals != new_actuals)[0]
            print(f"\nActual value mismatch for geocode {geocode}")

            for idx in mismatches:
                print(f"Row {idx}: base actual = {base_actuals[idx]}, new actual = {new_actuals[idx]}")
            raise AssertionError(f"Actual mismatch for geocode {geocode} — {len(mismatches)} mismatches")

        # Scatter this geocode's predictions onto its own rows of base_df
        aligned_preds[base_group.index.to_numpy()] = new_group[new_col_name].to_numpy()
        rows_filled += len(base_group)

    # groupby skips rows whose geocode is missing; refuse to leave them unfilled
    if rows_filled != len(base_df):
        raise ValueError(
            f"{len(base_df) - rows_filled} rows of base_df have a missing geocode "
            f"and cannot be matched"
        )

    # Add new prediction column
    base_df[new_col_name] = aligned_preds
    return base_df

# Append the remaining base models' predictions, one column per model
for model_df, model_name in (
    (svr_df, 'SVR'),
    (rf_df, 'RF'),
    (catboost_df, 'CatBoost'),
    (lightgbm_df, 'LightGBM'),
    # (lstm_df, 'LSTM'),  # LSTM is currently left out of the ensemble
):
    ensemble_df = add_prediction_column(ensemble_df, model_df, model_name)

In [5]:
# Preview the first five rows of the assembled meta-model input
print(ensemble_df.head(n=5))

   geocode    week  actual   XGBoost       SVR        RF   CatBoost  LightGBM
0  3300100  202101       0  2.759864  5.177315  1.252551   6.915961  2.092089
1  3300100  202102       0  1.598456  4.924674  2.300421   9.648564  1.762717
2  3300100  202103       1  1.342089  4.374011  0.385281   5.725471  1.910225
3  3300100  202104       3  1.342089  4.707502  0.307886   7.316536  1.703125
4  3300100  202105       0  0.896632  5.444897  0.962705  16.632501  1.884989


In [6]:
# Meta-model inputs: one column of predictions per base model
features = ['XGBoost', 'SVR', 'RF', 'CatBoost', 'LightGBM']  # optionally add 'LSTM'
X = ensemble_df.loc[:, features].values
y = ensemble_df.loc[:, 'actual'].values

# Hold out 20% of the rows, drawn at random with a fixed seed.
# NOTE(review): rows are shuffled regardless of geocode/week, so weeks of one
# geocode can land on both sides of the split — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [7]:
# Hyperparameters of the stacking meta-model, kept together for easy tuning
gbr_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 3,
    'random_state': 42,
}
meta_model = GradientBoostingRegressor(**gbr_params)
meta_model.fit(X_train, y_train)

In [8]:
# Score the meta-model on the held-out rows
y_pred = meta_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE

report_lines = [
    f"\nMeta-model (Stacking) Evaluation:",
    f"Mean Absolute Error (MAE): {mae:.4f}",
    f"Root Mean Squared Error (RMSE): {rmse:.4f}",
]
print("\n".join(report_lines))


Meta-model (Stacking) Evaluation:
Mean Absolute Error (MAE): 0.7746
Root Mean Squared Error (RMSE): 2.2370


In [9]:
# Final predictions: apply the meta-model to the base models' 2022 predictions.
# Read each base model's test-set predictions into its own DataFrame.
(
    xgboost_test_df,
    svr_test_df,
    rf_test_df,
    catboost_test_df,
    lightgbm_test_df,
    lstm_test_df,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../base_predictions/xgboost_test_preds.csv",
        "../base_predictions/svr_test_preds.csv",
        "../base_predictions/rf_test_preds.csv",
        "../base_predictions/catboost_test_preds.csv",
        "../base_predictions/lightgbm_test_preds.csv",
        "../base_predictions/lstm_test_preds.csv",
    )
)

# Show the columns of every test frame
for df in (xgboost_test_df, svr_test_df, rf_test_df, catboost_test_df, lightgbm_test_df, lstm_test_df):
    print(df.columns)

Index(['geocode', 'week', 'xgboost_pred', 'actual'], dtype='object')
Index(['geocode', 'svr_pred', 'actual'], dtype='object')
Index(['geocode', 'week', 'rf_pred', 'actual'], dtype='object')
Index(['geocode', 'catboost_pred', 'actual'], dtype='object')
Index(['geocode', 'week', 'lightgbm_predict', 'actual'], dtype='object')
Index(['geocode', 'lstm_pred', 'actual'], dtype='object')


In [10]:
# Rename each test prediction column to the display name of its model
(
    xgboost_test_df,
    svr_test_df,
    rf_test_df,
    catboost_test_df,
    lightgbm_test_df,
    lstm_test_df,
) = (
    frame.rename(columns={old_name: new_name})
    for frame, old_name, new_name in (
        (xgboost_test_df, 'xgboost_pred', 'XGBoost'),
        (svr_test_df, 'svr_pred', 'SVR'),
        (rf_test_df, 'rf_pred', 'RF'),
        (catboost_test_df, 'catboost_pred', 'CatBoost'),
        (lightgbm_test_df, 'lightgbm_predict', 'LightGBM'),
        (lstm_test_df, 'lstm_pred', 'LSTM'),
    )
)

# Store 'actual' as rounded integers wherever a test frame has that column
for df in (xgboost_test_df, svr_test_df, rf_test_df, catboost_test_df, lightgbm_test_df, lstm_test_df):
    if 'actual' not in df.columns:
        continue
    df['actual'] = df['actual'].round().astype(int)


# Seed the 2022 table with the key columns, the target and the XGBoost predictions
ensemble_test_df = xgboost_test_df.loc[:, ['geocode', 'week', 'actual', 'XGBoost']].copy()

# Append the remaining base models' predictions, one column per model
for model_df, model_name in (
    (svr_test_df, 'SVR'),
    (rf_test_df, 'RF'),
    (catboost_test_df, 'CatBoost'),
    (lightgbm_test_df, 'LightGBM'),
    # (lstm_test_df, 'LSTM'),  # LSTM is currently left out of the ensemble
):
    ensemble_test_df = add_prediction_column(ensemble_test_df, model_df, model_name)

In [11]:
# Generate ensemble predictions for 2022 from the base models' prediction columns
X_2022 = ensemble_test_df[features].values
ensemble_preds_2022 = meta_model.predict(X_2022)

# Store the meta-model output next to the base predictions
ensemble_test_df['ensemble_pred'] = ensemble_preds_2022

# Save final results to CSV. The output directory is created first: to_csv does
# not create missing directories, and nothing earlier in the notebook creates it.
ensemble_csv_path = "../ensemble_predictions/final_ensemble_predictions_2022.csv"
os.makedirs(os.path.dirname(ensemble_csv_path), exist_ok=True)
ensemble_test_df.to_csv(ensemble_csv_path, index=False)

In [12]:
# Score the 2022 ensemble predictions when ground truth is present
if 'actual' in ensemble_test_df.columns:
    actual_2022 = ensemble_test_df['actual']
    predicted_2022 = ensemble_test_df['ensemble_pred']

    mae_2022 = mean_absolute_error(actual_2022, predicted_2022)
    rmse_2022 = np.sqrt(mean_squared_error(actual_2022, predicted_2022))

    report_lines_2022 = [
        f"\n2022 Ensemble Model Evaluation:",
        f"MAE: {mae_2022:.4f}",
        f"RMSE: {rmse_2022:.4f}",
    ]
    print("\n".join(report_lines_2022))


2022 Ensemble Model Evaluation:
MAE: 3.1055
RMSE: 16.2319


In [13]:
import matplotlib.pyplot as plt
import os

# Directory that receives one PNG per geocode (created if missing)
output_dir = "../ensemble_predictions/plots_2022"
os.makedirs(output_dir, exist_ok=True)

# Base-model columns drawn alongside the actuals and the ensemble
model_cols = ['XGBoost', 'SVR', 'RF', 'CatBoost', 'LightGBM']

# One figure per geocode, in order of first appearance
unique_geocodes = ensemble_test_df['geocode'].unique()

for geocode in unique_geocodes:
    is_geo = ensemble_test_df['geocode'] == geocode
    df_geo = ensemble_test_df.loc[is_geo].sort_values(by='week')
    weeks = df_geo['week']

    fig, ax = plt.subplots(figsize=(14, 6))

    # Actual cases: bold dashed black line
    ax.plot(weeks, df_geo['actual'], label='Actual', color='black', linewidth=2.5, linestyle='--')

    # Base models: thin lines in the default colour cycle
    for model in model_cols:
        ax.plot(weeks, df_geo[model], label=model, linewidth=1.2)

    # Ensemble: bold solid blue line
    ax.plot(weeks, df_geo['ensemble_pred'], label='Ensemble', color='blue', linewidth=2.5)

    ax.set_title(f"Geocode {geocode} — 2022 Predictions vs Actual")
    ax.set_xlabel("Week")
    ax.set_ylabel("Dengue Cases")
    ax.legend()
    fig.tight_layout()

    # Write the figure to disk, then close it so figures do not accumulate
    fig.savefig(os.path.join(output_dir, f"geocode_{geocode}.png"))
    plt.close(fig)
