In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import os

In [2]:
# Load base model predictions
xgboost_df = pd.read_csv("../base_predictions/xgboost_preds_2021.csv")  
svr_df = pd.read_csv("../base_predictions/svr_preds_2021.csv")          
rf_df = pd.read_csv("../base_predictions/rf_preds_2021.csv")            
catboost_df = pd.read_csv("../base_predictions/catboost_preds_2021.csv")
lightgbm_df = pd.read_csv("../base_predictions/lightgbm_preds_2021.csv")
lstm_df = pd.read_csv("../base_predictions/lstm_preds_2021.csv")

for df in [xgboost_df, svr_df, rf_df, catboost_df, lightgbm_df, lstm_df]:
  print(df.columns)

Index(['geocode', 'week', 'xgboost_pred', 'actual'], dtype='object')
Index(['geocode', 'svr_pred', 'actual'], dtype='object')
Index(['geocode', 'week', 'rf_pred', 'actual'], dtype='object')
Index(['geocode', 'catboost_pred', 'actual'], dtype='object')
Index(['geocode', 'week', 'lightgbm_predict', 'actual'], dtype='object')
Index(['geocode', 'lstm_pred', 'actual'], dtype='object')


In [3]:
# Rename columns for consistency
xgboost_df = xgboost_df.rename(columns={'xgboost_pred': 'XGBoost'})
svr_df = svr_df.rename(columns={'svr_pred': 'SVR'})
rf_df = rf_df.rename(columns={'rf_pred': 'RF'})
catboost_df = catboost_df.rename(columns={'catboost_pred': 'CatBoost'})
lightgbm_df = lightgbm_df.rename(columns={'lightgbm_predict': 'LightGBM'})
lstm_df = lstm_df.rename(columns={'lstm_pred': 'LSTM'})

# Round actuals
for df in [xgboost_df, svr_df, rf_df, catboost_df, lightgbm_df, lstm_df]:
    df['actual'] = df['actual'].round().astype(int)

In [4]:
# Start with XGBoost
ensemble_df = xgboost_df[['geocode', 'week', 'actual', 'XGBoost']].copy()


# Define a function to safely add predictions from another dataframe
def add_prediction_column(base_df, new_df, new_col_name):
    # Group by geocode to align within each geocode
    grouped_base = base_df.groupby('geocode')
    grouped_new = new_df.groupby('geocode')

    aligned_preds = []

    for geocode, base_group in grouped_base:
        new_group = grouped_new.get_group(geocode)
        
        # Ensure matching number of rows
        assert len(base_group) == len(new_group), f"Row count mismatch for geocode {geocode}"

        base_actuals = base_group['actual'].values
        new_actuals = new_group['actual'].values
        
        if not np.array_equal(base_actuals, new_actuals):
            mismatches = np.where(base_actuals != new_actuals)[0]
            print(f"\nActual value mismatch for geocode {geocode}")
            
            for idx in mismatches:
                print(f"Row {idx}: base actual = {base_actuals[idx]}, new actual = {new_actuals[idx]}")
            raise AssertionError(f"Actual mismatch for geocode {geocode} — {len(mismatches)} mismatches")

        # Collect predictions in order
        aligned_preds.extend(new_group[new_col_name].values)

    # Add new prediction column
    base_df[new_col_name] = aligned_preds
    return base_df

def drop_first_n_rows_per_geocode(df, n=8):
    return (
        df.groupby('geocode', group_keys=False)
        .apply(lambda group: group.iloc[n:])
        .reset_index(drop=True)
    )

def drop_last_n_rows_per_geocode(df, n=4):
    return (
        df.groupby('geocode', group_keys=False)
        .apply(lambda group: group.iloc[:-n] if len(group) > n else group.iloc[0:0])
        .reset_index(drop=True)
    )


# Apply to model DataFrames
xgboost_df = drop_first_n_rows_per_geocode(xgboost_df, 4)
svr_df = drop_first_n_rows_per_geocode(svr_df, 4)
rf_df = drop_first_n_rows_per_geocode(rf_df, 4)
catboost_df = drop_first_n_rows_per_geocode(catboost_df, 4)
lightgbm_df = drop_first_n_rows_per_geocode(lightgbm_df, 4)
# lstm_df = drop_last_n_rows_per_geocode(lstm_df, 4)

# Start with XGBoost
ensemble_df = xgboost_df[['geocode', 'week', 'actual', 'XGBoost']].copy()

# Add predictions one by one
ensemble_df = add_prediction_column(ensemble_df, svr_df, 'SVR')
ensemble_df = add_prediction_column(ensemble_df, rf_df, 'RF')
ensemble_df = add_prediction_column(ensemble_df, catboost_df, 'CatBoost')
ensemble_df = add_prediction_column(ensemble_df, lightgbm_df, 'LightGBM')
ensemble_df = add_prediction_column(ensemble_df, lstm_df, 'LSTM')

  .apply(lambda group: group.iloc[n:])
  .apply(lambda group: group.iloc[n:])
  .apply(lambda group: group.iloc[n:])
  .apply(lambda group: group.iloc[n:])
  .apply(lambda group: group.iloc[n:])


In [5]:
# View the final ensemble input
print(ensemble_df.head())

   geocode    week  actual   XGBoost       SVR        RF  CatBoost  LightGBM  \
0  3300100  202105       0  1.059146  6.595941  1.578376  3.314909  1.725961   
1  3300100  202106       0  2.582202  6.060665  3.500376  4.117815  3.028438   
2  3300100  202107       1  1.794821  3.411337  1.800650  2.828655  2.178913   
3  3300100  202108       0  0.775880  5.403241  0.857398  4.072607  1.088853   
4  3300100  202109       3  0.865778  5.775082  1.282198  3.373198  1.773502   

       LSTM  
0  0.000000  
1  0.000000  
2  1.478634  
3  1.874367  
4  1.326040  


In [6]:
features = ['XGBoost', 'SVR', 'RF', 'CatBoost', 'LightGBM', 'LSTM']
X = ensemble_df[features].values
y = ensemble_df['actual'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

GradientBoostingRegressor with GridSearchCV

In [7]:
# # GridSearchCV for hyperparameter tuning
# param_grid = {
#     'n_estimators': [50, 100, 150],
#     'learning_rate': [0.05, 0.1, 0.2],
#     'max_depth': [2, 3, 4]
# }

# gbr = GradientBoostingRegressor(random_state=42)
# grid_search = GridSearchCV(gbr, param_grid, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1, verbose=1)
# grid_search.fit(X_train, y_train)

# # Best model
# meta_model = grid_search.best_estimator_
# print(f"\nBest Hyperparameters: {grid_search.best_params_}")

GradientBoostingRegressor without GridSearchCV

In [8]:
# meta_model = GradientBoostingRegressor(
#     n_estimators=100,
#     learning_rate=0.1,
#     max_depth=3,
#     random_state=42
# )
# meta_model.fit(X_train, y_train)

In [9]:
# y_pred = meta_model.predict(X_test)

# mae = mean_absolute_error(y_test, y_pred)
# rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# print(f"\nEvaluation on Test Data:")
# print(f"Mean Absolute Error (MAE): {mae:.4f}")
# print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

Other Regressors with GridSearchCV

In [10]:
meta_models = {
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'LinearRegression': LinearRegression()
}

from sklearn.model_selection import GridSearchCV

param_grids = {
    'Ridge': {'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]},
    'Lasso': {'alpha': [0.001, 0.01, 0.1, 1.0]},
    'LinearRegression': {}  # No hyperparameters to tune
}

for name, model in meta_models.items():
    print(f"\n--- Testing {name} ---")
    
    param_grid = param_grids[name]
    
    grid_search = GridSearchCV(
        model,
        param_grid,
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=-1
    )
    
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    print(f"Best Parameters for {name}: {grid_search.best_params_}")
    
    y_pred = best_model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    print(f"{name} — MAE: {mae:.4f}, RMSE: {rmse:.4f}")


meta_model = None
lowest_rmse = float('inf')

for name, model in meta_models.items():
    ...
    if rmse < lowest_rmse:
        lowest_rmse = rmse
        meta_model = best_model

print(f"\nBest Overall Model: {meta_model.__class__.__name__} with RMSE: {lowest_rmse:.4f}")




--- Testing Ridge ---
Best Parameters for Ridge: {'alpha': 100.0}
Ridge — MAE: 0.8211, RMSE: 1.9729

--- Testing Lasso ---
Best Parameters for Lasso: {'alpha': 0.1}
Lasso — MAE: 0.8229, RMSE: 1.9750

--- Testing LinearRegression ---
Best Parameters for LinearRegression: {}
LinearRegression — MAE: 0.8224, RMSE: 1.9749

Best Overall Model: LinearRegression with RMSE: 1.9749


In [11]:
# Use the Meta-Model for Final Predictions (on 2022 predictions)
# Load test set predictions from base models
xgboost_test_df = pd.read_csv("../base_predictions/xgboost_test_preds_2022.csv")  
svr_test_df = pd.read_csv("../base_predictions/svr_test_preds_2022.csv")          
rf_test_df = pd.read_csv("../base_predictions/rf_test_preds_2022.csv")            
catboost_test_df = pd.read_csv("../base_predictions/catboost_test_preds_2022.csv")
lightgbm_test_df = pd.read_csv("../base_predictions/lightgbm_test_preds_2022.csv")
lstm_test_df = pd.read_csv("../base_predictions/lstm_test_preds_2022.csv")

for df in [xgboost_test_df, svr_test_df, rf_test_df, catboost_test_df, lightgbm_test_df, lstm_test_df]:
  print(df.columns)

Index(['geocode', 'week', 'xgboost_pred', 'actual'], dtype='object')
Index(['geocode', 'svr_pred', 'actual'], dtype='object')
Index(['geocode', 'week', 'rf_pred', 'actual'], dtype='object')
Index(['geocode', 'catboost_pred', 'actual'], dtype='object')
Index(['geocode', 'week', 'lightgbm_predict', 'actual'], dtype='object')
Index(['geocode', 'lstm_pred', 'actual'], dtype='object')


In [12]:
xgboost_test_df = xgboost_test_df.rename(columns={'xgboost_pred': 'XGBoost'})
svr_test_df = svr_test_df.rename(columns={'svr_pred': 'SVR'})
rf_test_df = rf_test_df.rename(columns={'rf_pred': 'RF'})
catboost_test_df = catboost_test_df.rename(columns={'catboost_pred': 'CatBoost'})
lightgbm_test_df = lightgbm_test_df.rename(columns={'lightgbm_predict': 'LightGBM'})
lstm_test_df = lstm_test_df.rename(columns={'lstm_pred': 'LSTM'})

for df in [xgboost_test_df, svr_test_df, rf_test_df, catboost_test_df, lightgbm_test_df, lstm_test_df]:
    if 'actual' in df.columns:
        df['actual'] = df['actual'].round().astype(int)


ensemble_test_df = xgboost_test_df[['geocode', 'week', 'actual', 'XGBoost']].copy()

# Add predictions from other models
ensemble_test_df = add_prediction_column(ensemble_test_df, svr_test_df, 'SVR')
ensemble_test_df = add_prediction_column(ensemble_test_df, rf_test_df, 'RF')
ensemble_test_df = add_prediction_column(ensemble_test_df, catboost_test_df, 'CatBoost')
ensemble_test_df = add_prediction_column(ensemble_test_df, lightgbm_test_df, 'LightGBM')
ensemble_test_df = add_prediction_column(ensemble_test_df, lstm_test_df, 'LSTM')

In [13]:
# Generate Ensemble Predictions for 2022
X_2022 = ensemble_test_df[features].values
ensemble_preds_2022 = meta_model.predict(X_2022)

# Add predictions to DataFrame
ensemble_test_df['ensemble_pred'] = ensemble_preds_2022

# Save final results to CSV
ensemble_test_df.to_csv("../ensemble_predictions/final_ensemble_predictions_2022.csv", index=False)

In [14]:
if 'actual' in ensemble_test_df.columns:
    mae_2022 = mean_absolute_error(ensemble_test_df['actual'], ensemble_test_df['ensemble_pred'])
    rmse_2022 = np.sqrt(mean_squared_error(ensemble_test_df['actual'], ensemble_test_df['ensemble_pred']))
    print(f"\nEvaluation on the True Hold-out Set (2022):")
    print(f"MAE: {mae_2022:.4f}")
    print(f"RMSE: {rmse_2022:.4f}")


Evaluation on the True Hold-out Set (2022):
MAE: 2.2998
RMSE: 8.1301


In [15]:
import matplotlib.pyplot as plt
import os

# Optional: create output directory
output_dir = "../ensemble_predictions/plots_2022"
os.makedirs(output_dir, exist_ok=True)

# List of model columns to plot
model_cols = ['XGBoost', 'SVR', 'RF', 'CatBoost', 'LightGBM', 'LSTM']

# Iterate over each geocode
unique_geocodes = ensemble_test_df['geocode'].unique()

for geocode in unique_geocodes:
    df_geo = ensemble_test_df[ensemble_test_df['geocode'] == geocode].sort_values(by='week')

    plt.figure(figsize=(14, 6))

    # Actual: dashed bold black line
    plt.plot(df_geo['week'], df_geo['actual'], label='Actual', color='black', linewidth=2.5, linestyle='--')

    # Base models: default colors, thinner lines
    for model in model_cols:
        plt.plot(df_geo['week'], df_geo[model], label=model, linewidth=1.2)

    # Ensemble: bold solid blue line
    plt.plot(df_geo['week'], df_geo['ensemble_pred'], label='Ensemble', color='blue', linewidth=2.5)


    plt.title(f"Geocode {geocode} — 2022 Predictions vs Actual")
    plt.xlabel("Week")
    plt.ylabel("Dengue Cases")
    plt.legend()
    plt.tight_layout()

    # Save figure
    plt.savefig(os.path.join(output_dir, f"geocode_{geocode}.png"))
    plt.close()
