In [26]:
# ---------------------------
# CONFIG (edit these values)
# ---------------------------
import os
from pathlib import Path
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, confusion_matrix

# =========================================
# GLOBAL SETTINGS - Edit for new analyses
# =========================================
season = "MONSOON"

model_folder = "logistic_regression"  # e.g., "random_forest" or "logistic_regression"
MODEL_NAME = "Logistic_Regression_t0.891.pkl"
start_month = 6
end_month = 7
years_to_process = [2017, 2022, 2023, 2024]

# Base configuration function (will be called for each year)
def create_config(year, season, model_folder, MODEL_NAME, start_month, end_month):
    model_name = Path(MODEL_NAME).stem
    return {
        "season": season,
        "year": year,
        "forecast_excel": Path("..") / "LSTM" / "ECMWF_HRES_forecast_excel" / f"{year}_january_july.xlsx",
        "obs_excel": Path("..") / "LSTM" / "PREMONSOON-janjul201724.xlsx",
        "input_xlsx": Path("simulated_x_monsoon-120.xlsx"),
        "models_dir": Path(f"models_{season}"),
        "model_folder": model_folder,
        "model_name": model_name,
        "predictions_dir": Path("predictionsv2")/model_folder/model_name/f"{season}",
        "plots_dir": Path("plotsv2")/model_folder/model_name/f"{season}",
        "W": 120,   # unit hydrograph width (hours)
        "A": 0.05,  # shape parameter
        "MODEL_NAME": MODEL_NAME,
        "start_month": start_month,
        "end_month": end_month,
        "threshold_monsoon": 12.34,  # Flood threshold for MONSOON season (mMSL)
        "threshold_other": 10.69     # Flood threshold for other seasons (mMSL)
    }

# Create initial CONFIG
CONFIG = create_config(years_to_process[0], season, model_folder, MODEL_NAME, start_month, end_month)

# Create folders
for p in (CONFIG['models_dir'], CONFIG['predictions_dir'], CONFIG['plots_dir']):
    os.makedirs(p, exist_ok=True)

# Control flags
RUN_TRAIN = False  # set True to run training on `input_xlsx`

print("="*70)
print("CONFIGURATION LOADED")
print("="*70)
print(f"Years to process: {years_to_process}")
print(f"Season: {season}, Model: {MODEL_NAME}")
print(f"Date range: Month {start_month} to {end_month}")
print("="*70)

CONFIGURATION LOADED
Years to process: [2017, 2022, 2023, 2024]
Season: MONSOON, Model: Logistic_Regression_t0.891.pkl
Date range: Month 6 to 7


In [27]:
# ---------------------------
# Helpers (metrics, UH, IO)
# ---------------------------
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, confusion_matrix


def compute_point_metrics(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0,1]).ravel()
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall    = tp / (tp + fn) if (tp + fn) else 0.0
    f1        = 2*precision*recall/(precision+recall) if (precision+recall) else 0.0
    far       = fp / (fp + tn) if (fp + tn) else 0.0
    return precision, recall, f1, far


def best_threshold_by_f1(y_true, y_scores):
    p, r, thresholds = precision_recall_curve(y_true, y_scores)
    if thresholds.size == 0:
        return 0.5, float(p[-1]), float(r[-1]), 0.0
    p1, r1 = p[1:], r[1:]
    f1 = 2 * p1 * r1 / (p1 + r1 + 1e-12)
    idx = int(np.nanargmax(f1))
    return float(thresholds[idx]), float(p1[idx]), float(r1[idx]), float(f1[idx])


def triangular_uh(width_h: int, alpha: float = 0.5):
    width_h = int(max(2, width_h))
    alpha = float(np.clip(alpha, 0.01, 0.99))
    r_len = max(1, int(np.ceil(width_h * alpha)))
    f_len = max(1, width_h - r_len)
    up = np.linspace(0, 1, r_len + 1)[:-1]
    down = np.linspace(1, 0, f_len + 1)
    k = np.concatenate([up, down[1:]])
    return k / k.sum()


def uh_conv_trailing(r: pd.Series, width_h: int, alpha: float = 0.5) -> pd.Series:
    k = triangular_uh(width_h, alpha)
    v = np.convolve(r.fillna(0.0).values, k, mode="full")[:len(r)]
    return pd.Series(v, index=r.index)


def fit_and_save_models(X, y, models_dir, random_state=42):
    rf = RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=random_state)
    lr = LogisticRegression(class_weight='balanced', max_iter=2000, random_state=random_state)
    summaries = {}

    for model, name in [(rf, 'Random_Forest'), (lr, 'Logistic_Regression')]:
        model.fit(X, y)
        y_scores = model.predict_proba(X)[:, 1]
        th, p, r, f1 = best_threshold_by_f1(y, y_scores)
        model_path = models_dir / f"{name}_t{th:.3f}.pkl"
        joblib.dump(model, model_path)
        y_pred = (y_scores >= th).astype(int)

        # Compute metrics using predictions at chosen threshold
        precision, recall, f1_score, far = compute_point_metrics(y, y_pred)
        summaries[name] = dict(
            threshold=th,
            precision=precision,
            recall=recall,
            f1=f1_score,
            far=far,
            model_file=str(model_path)
        )
    return summaries


def create_calendar_dataframe(sum_f, end_date):
    sum_f = sum_f.copy()
    sum_f.index = pd.to_datetime(sum_f.index).normalize()
    date_leads = {}
    for init_date, row in sum_f.iterrows():
        for lead_col in sum_f.columns:
            if row[lead_col] > 0:
                lead_days = int(lead_col.split("-")[0])
                verification_date = init_date + pd.Timedelta(days=lead_days)
                if verification_date <= end_date:
                    date_leads.setdefault(verification_date.date(), []).append(lead_col)
    return date_leads

In [28]:
# ---------------------------
# Forecast processing (modular)
# ---------------------------

def process_forecasts(forecast_excel, model_path, W, A, year, end_date, end_month, season='MONSOON'):
    """Load forecast excel, apply UH, apply baseflow for MONSOON, run model predictions, return daywise aggregated flags and df_pred."""
    df = pd.read_excel(forecast_excel)
    # expect a column named 'date' or index with start times; try to drop 'date' if present
    if 'date' in df.columns:
        df = df.drop(columns=['date'])

    hours = np.arange(0, 246, 6)
    df_pred = pd.DataFrame(index=np.arange(0, 241, 1))
    df_agg_rain = pd.DataFrame()

    # Optimized baseflow constants for MONSOON
    baseflow_constants = {5: 0.00, 6: 1.14, 7: 1.91}

    # The forecast file here is assumed to have columns where each column is a lead-block
    for col_name in df.columns:
        lead_time_stamp = None
        try:
            # attempt to parse a timestamp from column name if possible
            lead_time_stamp = pd.to_datetime(col_name)
        except Exception:
            lead_time_stamp = None

        # build hourly index; if no timestamp found, just use a placeholder sequence
        if lead_time_stamp is None:
            index_s = pd.date_range(start=f"{year}-03-01", periods=len(hours), freq='6H')
        else:
            index_s = [lead_time_stamp + pd.Timedelta(hours=int(h)) for h in hours]

        df_block = pd.DataFrame({'value': df[col_name].values}, index=index_s)
        
        df_block['increment'] = df_block['value'].diff().fillna(df_block['value'])
        df_hourly = df_block.resample('h').ffill()
        df_hourly['precip'] = df_hourly['increment'] / 6
        
        if df_agg_rain.empty:
            df_agg_rain = df_hourly.copy()
        else:
            df_agg_rain = pd.concat([df_agg_rain, df_hourly])
        df_agg_rain = df_agg_rain[~df_agg_rain.index.duplicated(keep='last')]

        df2 = df_agg_rain[df_agg_rain.index < pd.to_datetime(f"{year}-{end_month+1}-10 23:00:00")]
        df2 = df2.asfreq('h').fillna(0)
        df2['x'] = uh_conv_trailing(df2['precip'].reset_index(drop=True), width_h=W, alpha=A).values
        
        # Add baseflow for MONSOON season
        if season == 'MONSOON':
            df2['month'] = df2.index.month
            df2['baseflow_const'] = df2['month'].map(baseflow_constants).fillna(0)
            df2['x_total'] = df2['x'] + df2['baseflow_const']
            feature_col = 'x_total'
        else:
            feature_col = 'x'
        
        df2 = df2.tail(241)

        model = joblib.load(model_path)
        y_scores = model.predict_proba(df2[[feature_col]])[:, 1]
        th = float(str(model_path).split('_t')[-1].replace('.pkl','')) if '_t' in str(model_path) else 0.5
        predictions = (y_scores >= th).astype(int)
        df_pred[col_name] = predictions[:len(df_pred)]

    # transpose to have rows=lead hour
    df_pred = df_pred.T
    df_pred.index.name = 'Lead Hour'

    # daywise aggregation (example: sum in day windows)
    lead_time = np.arange(24, 264, 24)
    sum_f = pd.DataFrame()
    for i, l in enumerate(lead_time):
        if i == 0:
            sum_f[f'{i+1}-day'] = df_pred.iloc[:, 0:lead_time[i]].sum(axis=1)
        else:
            sum_f[f'{i+1}-day'] = df_pred.iloc[:, lead_time[i-1]:lead_time[i]].sum(axis=1)

    return df_pred, sum_f

In [29]:
# ---------------------------
# Plotting utilities with proper water level scaling
# ---------------------------

from matplotlib.patches import Patch

def plot_persistence_vs_wl(sum_f, df_obs, year, threshold, plots_dir, config):
    """
    Plot water level (line), threshold (line) on primary axis, and persistence (scatter) on secondary axis.
    Water level axis minimum starts from actual data minimum.
    """
    # Extract date range from CONFIG
    start_month = config['start_month']
    end_month = config['end_month']
    start_date = pd.to_datetime(f"{year}-{start_month}-01")
    end_day = 31 if end_month in [1,3,5,7,8,10,12] else (30 if end_month in [4,6,9,11] else 29)
    end_date = pd.to_datetime(f"{year}-{end_month}-{end_day}")
    
    # Create full date range
    all_dates = pd.date_range(start=start_date, end=end_date, freq='D')
    
    # Calculate persistence counts
    date_leads = create_calendar_dataframe(sum_f, end_date)
    date_consistency = {d: len(l) for d, l in date_leads.items()}
    persistence_df = pd.DataFrame({
        'Date': pd.to_datetime(list(date_consistency.keys())), 
        'persistence_count': list(date_consistency.values())
    }).sort_values('Date').reset_index(drop=True)
    
    # Get daily water levels from observations
    df_obs_copy = df_obs.copy()
    df_obs_copy['Date'] = pd.to_datetime(df_obs_copy['Date Time']).dt.normalize()
    wl_daily = df_obs_copy.groupby('Date')['WL (mMSL)'].mean().reset_index()
    
    # Merge all data on the CONFIG date range
    base = pd.DataFrame({'Date': all_dates}).reset_index(drop=True)
    plot_df = base.merge(persistence_df, on='Date', how='left').reset_index(drop=True)
    plot_df = plot_df.merge(wl_daily, on='Date', how='left').reset_index(drop=True)
    plot_df['persistence_count'] = plot_df['persistence_count'].fillna(0).astype(int)
    plot_df['WL (mMSL)'] = plot_df['WL (mMSL)'].fillna(np.nan)
    
    # Filter to only non-zero persistence counts for scatter plot
    plot_df_scatter = plot_df[plot_df['persistence_count'] > 0].copy()
    
    # Color map by persistence count (only non-zero counts)
    colors_scatter = np.where(
        plot_df_scatter['persistence_count'] >= 6, '#0b3d91',
        np.where(plot_df_scatter['persistence_count'] >= 3, '#7bb8ff', '#d3d3d3')
    )
    
    # Create figure with two y-axes
    fig, ax1 = plt.subplots(figsize=(16, 7))
    ax2 = ax1.twinx()
    ax2.scatter(plot_df_scatter['Date'], plot_df_scatter['persistence_count'],
                c=colors_scatter, s=120, alpha=0.8, edgecolors='black', linewidth=0.5,
                label='Persistence Count', zorder=4, marker='D')
    # Calculate water level axis limits from actual data
    valid_wl = wl_daily['WL (mMSL)'].dropna()
    if len(valid_wl) > 0:
        wl_min = valid_wl.min()
        wl_max = valid_wl.max()
        wl_padding = (wl_max - wl_min) * 0.1  # 10% padding
        wl_min_axis = max(0, wl_min - wl_padding)
        wl_max_axis = wl_max + wl_padding
    else:
        wl_min_axis = 0
        wl_max_axis = 15
    
    # Plot water level
    ax1.plot(plot_df['Date'], plot_df['WL (mMSL)'],
             color='#1f77b4', marker='o', markersize=4, linewidth=2,
             label='Water Level (mMSL)', zorder=3)
    ax1.axhline(threshold, color='red', linestyle='--', linewidth=2,
                label=f'Threshold ({threshold} mMSL)', zorder=2)
    
    
    ax1.set_xlabel('Date', fontsize=12, fontweight='bold')
    ax1.set_ylabel('Water Level (mMSL)', fontsize=12, fontweight='bold', color='#1f77b4')
    ax1.tick_params(axis='y', labelcolor='#1f77b4')
    ax1.minorticks_on()
    ax1.grid(True, which='both', axis='both', linestyle=':', alpha=0.5)
    ax1.set_xlim(start_date, end_date)
    ax1.set_ylim(wl_min_axis, wl_max_axis)  # Set from actual minimum

In [30]:
# ---------------------------
# TRAINING (RUN_TRAIN)
# ---------------------------

print("\n" + "="*60)
print("Training models from input_xlsx...")
print("="*60)

if RUN_TRAIN:
    df_train = pd.read_excel(CONFIG['input_xlsx'])
    
    # Detect label column (case-insensitive)
    label_candidates = ['Flash Flood', 'Flash flood', 'flash flood', 'Flash_flood', 'FlashFlood', 'label', 'target', 'y', 'is_flood']
    lower_cols = {c.lower(): c for c in df_train.columns}
    label_col = None
    for cand in label_candidates:
        if cand.lower() in lower_cols:
            label_col = lower_cols[cand.lower()]
            break
    if label_col is None:
        raise ValueError(f"No label column found in {CONFIG['input_xlsx']}. Expected one of {label_candidates}.")
    
    # Require feature column 'x'
    if 'x' not in df_train.columns:
        raise ValueError("Feature column 'x' not found in training data. Add column 'x' to proceed.")
    
    # Add baseflow constant for MONSOON season
    if CONFIG['season'] == 'MONSOON':
        print("\nMONSOON season detected: Adding optimized baseflow constants to x")
        
        # Optimized baseflow constants (from correlation optimization)
        baseflow_constants = {5: 0.00, 6: 1.14, 7: 1.91}
        
        # Require month column
        if 'month' not in df_train.columns:
            raise ValueError("MONSOON season requires 'month' column in training data for baseflow adjustment.")
        
        # Apply baseflow constants
        df_train['baseflow_const'] = df_train['month'].map(baseflow_constants).fillna(0)
        df_train['x_total'] = df_train['x'] + df_train['baseflow_const']
        
        X = df_train[['x_total']].astype(float).fillna(0)
        print(f"Training with x_total (x + baseflow). Baseflow constants: {baseflow_constants}")
    else:
        X = df_train[['x']].astype(float).fillna(0)
        print("Non-MONSOON season: Training with x only (no baseflow adjustment)")
    
    y = df_train[label_col].astype(int).values
    
    # Train and save models
    summaries = fit_and_save_models(X, y, CONFIG['models_dir'])
    
    print("\nTraining complete. Saved models:")
    for name, info in summaries.items():
        print(f"- {name}: threshold={info['threshold']:.3f}, precision={info['precision']:.3f}, recall={info['recall']:.3f}, f1={info['f1']:.3f}, FAR={info['far']:.3f}")
        print(f"  file: {info['model_file']}")
    
    print("\nUpdate CONFIG['MODEL_NAME'] to the chosen model filename.")
else:
    print("RUN_TRAIN is False. Skipping training.")


Training models from input_xlsx...
RUN_TRAIN is False. Skipping training.


In [31]:
RUN_FORECAST = True  # set True to process forecast Excel and make predictions
RUN_PLOTS = True     # set True to produce calendar and persistence plots

In [32]:
# ---------------------------
# Visualization: Water level with Flood spells and Persistence Overlay
# ---------------------------

def flood_spells_persistence_plot(flood_dates, df_obs, sum_f, threshold, year, plots_dir, config):
    """
    Plot water level (line) with flood spells highlighted and persistence overlay.
    Water level axis starts from actual data minimum.
    """
    # Extract date range from CONFIG
    start_month = config['start_month']
    end_month = config['end_month']
    start_date = pd.to_datetime(f"{year}-{start_month}-01")
    end_day = 31 if end_month in [1,3,5,7,8,10,12] else (30 if end_month in [4,6,9,11] else 29)
    end_date = pd.to_datetime(f"{year}-{end_month}-{end_day}")
    
    # Get daily water levels
    df_obs_copy = df_obs.copy()
    df_obs_copy['Date'] = pd.to_datetime(df_obs_copy['Date Time']).dt.normalize()
    wl_daily = df_obs_copy.groupby('Date')['WL (mMSL)'].mean().reset_index()
    wl_daily = wl_daily.sort_values('Date').reset_index(drop=True)
    
    # Calculate persistence for all dates in range
    all_dates = pd.date_range(start=start_date, end=end_date, freq='D')
    date_leads = create_calendar_dataframe(sum_f, end_date)
    date_consistency = {d: len(l) for d, l in date_leads.items()}
    persistence_df = pd.DataFrame({
        'Date': pd.to_datetime(list(date_consistency.keys())),
        'persistence_count': list(date_consistency.values())
    }).sort_values('Date').reset_index(drop=True)
    
    # Create base dataframe with full date range
    base = pd.DataFrame({'Date': all_dates}).reset_index(drop=True)
    plot_df = base.merge(persistence_df, on='Date', how='left').reset_index(drop=True)
    plot_df = plot_df.merge(wl_daily, on='Date', how='left').reset_index(drop=True)
    plot_df['persistence_count'] = plot_df['persistence_count'].fillna(0).astype(int)
    plot_df['WL (mMSL)'] = plot_df['WL (mMSL)'].fillna(np.nan)
    
    # Filter to only non-zero persistence counts for scatter plot
    plot_df_scatter = plot_df[plot_df['persistence_count'] > 0].copy()
    
    # Create figure with two y-axes
    fig, ax1 = plt.subplots(figsize=(16, 7))
    ax2 = ax1.twinx()
    
    # Calculate water level axis limits from actual data (NOT including zero)
    valid_wl = wl_daily['WL (mMSL)'].dropna()
    if len(valid_wl) > 0:
        wl_min = valid_wl.min()
        wl_max = valid_wl.max()
        wl_padding = (wl_max - wl_min) * 0.1  # 10% padding
        wl_min_axis = wl_min - wl_padding  # Start from actual minimum (no max(0, ...) constraint)
        wl_max_axis = wl_max + wl_padding
    else:
        wl_min_axis = 0
        wl_max_axis = 15
    
    # Plot water level line (reduced marker size for cleaner look)
    ax1.plot(plot_df['Date'], plot_df['WL (mMSL)'],
             color='#1f77b4', marker='o', markersize=2, linewidth=1.5,
             label='Mean Water Level', zorder=3)
    
    # Plot threshold
    ax1.axhline(threshold, color='red', linestyle='--', linewidth=1.5,
                label=f'Threshold ({threshold} mMSL)', zorder=2)
    
    # Add shaded region for min-max water level range
    wl_range = df_obs_copy.groupby('Date')['WL (mMSL)'].agg(['min', 'max']).reset_index()
    plot_df = plot_df.merge(wl_range, on='Date', how='left')
    ax1.fill_between(plot_df['Date'], plot_df['min'], plot_df['max'], 
                     alpha=0.2, color='#1f77b4', label='Min-Max Range', zorder=1.5)
    
    # Highlight flood spells on the background with spell numbers
    for spell_idx, spell in enumerate(flood_dates, start=1):
        start = pd.to_datetime(spell[0])-pd.Timedelta(hours=48) # extend 2 days before
        end = pd.to_datetime(spell[1])
        ax1.axvspan(start, end, alpha=0.25, color='orange', zorder=2.5)
        
    
    # Plot persistence overlay (ONLY NON-ZERO counts with color coding)
    colors_scatter = np.where(
        plot_df_scatter['persistence_count'] >= 6, '#0b3d91',
        np.where(plot_df_scatter['persistence_count'] >= 3, '#7bb8ff', '#d3d3d3')
    )
    ax2.scatter(plot_df_scatter['Date'], plot_df_scatter['persistence_count'],
                c=colors_scatter, s=80, alpha=0.8, edgecolors='black', linewidth=0.5,
                label='Persistence Count', zorder=4, marker='D')
    
    # Set axis labels and formatting
    ax1.set_xlabel('Date', fontsize=11, fontweight='bold')
    ax1.set_ylabel('Water Level (mMSL)', fontsize=11, fontweight='bold', color='#1f77b4')
    ax1.tick_params(axis='y', labelcolor='#1f77b4', labelsize=9)
    ax1.tick_params(axis='x', labelsize=8)
    ax1.minorticks_on()
    ax1.grid(True, which='major', axis='both', linestyle=':', alpha=0.3)
    ax1.set_xlim(start_date, end_date)
    ax1.set_ylim(wl_min_axis, wl_max_axis)
    
    # Set secondary y-axis
    ax2.set_ylabel('Persistence Count', fontsize=11, fontweight='bold', color='black')
    ax2.tick_params(axis='y', labelcolor='black', labelsize=9)
    ax2.set_ylim(0, max(10, plot_df['persistence_count'].max() + 1))
    
    # Create comprehensive legend
    from matplotlib.patches import Patch
    lines1, labels1 = ax1.get_legend_handles_labels()
    bin_handles = [
        Patch(facecolor='#d3d3d3', edgecolor='black', label='Persistence < 3'),
        Patch(facecolor='#7bb8ff', edgecolor='black', label='Persistence 3-5'),
        Patch(facecolor='#0b3d91', edgecolor='black', label='Persistence ≥ 6'),
    ]
    all_labels = labels1 + [h.get_label() for h in bin_handles]
    all_handles = lines1 + bin_handles
    ax1.legend(all_handles, all_labels, loc='upper left', fontsize=9, framealpha=0.9)
    
    plt.title(f'Observed Water Level, Flood Spells and Forecast Persistence: {start_date.date()} to {end_date.date()}',
              fontsize=12, fontweight='bold', pad=12)
    
    # Use matplotlib date formatter instead of autofmt for better DPI handling
    import matplotlib.dates as mdates
    ax1.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
    ax1.xaxis.set_major_formatter(mdates.DateFormatter('%m-%d'))
    fig.autofmt_xdate(rotation=45, ha='right')
    plt.tight_layout()
    
    out = Path(plots_dir) / f'flood_spells_with_persistence_{year}.png'
    plt.savefig(out, dpi=100, bbox_inches='tight')
    print(f'Saved flood spells + persistence plot to: {out}')
    plt.close()  # Close figure to free memory


In [33]:
 # ---------------------------
# MAIN EXECUTION
# ---------------------------

print("\n" + "="*70)
print("MAIN EXECUTION: Processing Forecasts and Extracting Event Dates")
print("="*70)



for year in years_to_process:
    print(f"\n{'='*70}")
    print(f"Processing Year: {year}")
    print(f"{'='*70}")
    
    # Update CONFIG for this year
    CONFIG = create_config(year, season, model_folder, MODEL_NAME, start_month, end_month)
    
    # Check if files exist
    forecast_file = CONFIG['forecast_excel']
    model_file = CONFIG['models_dir'] / CONFIG['MODEL_NAME']
    obs_file = CONFIG['obs_excel']
    
    if not forecast_file.exists():
        print(f"⚠️  Forecast file not found: {forecast_file}")
        continue
    if not model_file.exists():
        print(f"⚠️  Model file not found: {model_file}")
        continue
    if not obs_file.exists():
        print(f"⚠️  Observation file not found: {obs_file}")
        continue
    
    # Calculate end date for this season
    end_day = 31 if end_month in [1,3,5,7,8,10,12] else (30 if end_month in [4,6,9,11] else 29)
    end_date = pd.to_datetime(f"{year}-{end_month}-{end_day}")
    
    # Process forecasts
    print(f"Loading forecasts from: {forecast_file}")
    df_pred, sum_f = process_forecasts(
        forecast_excel=forecast_file,
        model_path=model_file,
        W=CONFIG['W'],
        A=CONFIG['A'],
        year=year,
        end_date=end_date,
        end_month=end_month,
        season=season
    )
    
    print(f"✓ Processed {len(sum_f)} forecast lead times")
    print(f"✓ Generated predictions for {len(sum_f.columns)} lead day windows")
    
    
    # Calculate season date range for filtering
    start_date = pd.to_datetime(f"{year}-{CONFIG['start_month']}-01")
    
    # Filter sum_f to only include initialization dates within the season
    sum_f_season = sum_f.copy()
    sum_f_season.index = pd.to_datetime(sum_f_season.index)
    sum_f_season = sum_f_season[(sum_f_season.index >= start_date) & (sum_f_season.index <= end_date)]
    sum_f_season.to_csv(CONFIG['predictions_dir'] / f'sum_f_{year}.csv')
    print(f"✓ Filtered to {len(sum_f_season)} forecasts within season ({start_date.date()} to {end_date.date()})")
    # Extract predicted event dates from sum_f
    # sum_f structure:
    #   Index: initialization dates (when forecast was issued)
    #   Columns: '1-day', '2-day', '3-day', etc. (lead time windows)
    #   Values: sum of hourly predictions in each window (0 or positive integer)
    # 
    # PREDICTED DATE = INITIALIZATION DATE + LEAD DAYS (when value > 0)
    
    print(f"\nExtracting predicted flood event dates...")
    print(f"  Formula: Predicted Date = Init Date + Lead Days (if value > 0)")
    predicted_events = {}
    
    for init_date, row in sum_f_season.iterrows():
        init_datetime = pd.to_datetime(init_date)
        
        # Check each lead time column
        for lead_col in sum_f_season.columns:
            prediction_value = row[lead_col]
            
            # If non-zero, this init_date predicts a flood at init_date + lead_days
            if prediction_value > 0:
                lead_days = int(lead_col.split('-')[0])
                predicted_date = (init_datetime + pd.Timedelta(days=lead_days)).date()
                
                # Only include if predicted date is within the year and season range
                predicted_datetime = pd.to_datetime(predicted_date)
                if start_date <= predicted_datetime <= end_date:
                    # Store all sources predicting this date
                    if predicted_date not in predicted_events:
                        predicted_events[predicted_date] = []
                    predicted_events[predicted_date].append({
                        'init_date': init_datetime.date(),
                        'lead_days': lead_days,
                        'prediction_sum': int(prediction_value)
                    })
    
    # Sort predicted events by date
    predicted_events = dict(sorted(predicted_events.items()))
    
    # Load observations for plotting
    print(f"\nLoading observations from: {obs_file}")
    df_obs = pd.read_excel(obs_file)
    
    # CSV EXPORT: Lead Days & Predicted vs Actual Flood Dates
    print(f"\n{'='*70}")
    print(f"CSV EXPORT: Flood Predictions for {year}")
    print(f"{'='*70}")
    
    # Get actual flood dates from observations
    df_obs_copy = df_obs.copy()
    df_obs_copy['Date'] = pd.to_datetime(df_obs_copy['Date Time']).dt.normalize()
    wl_daily = df_obs_copy.groupby('Date')['WL (mMSL)'].mean().reset_index()
    wl_daily = wl_daily.sort_values('Date').reset_index(drop=True)
    
    threshold = CONFIG['threshold_monsoon'] if season == 'MONSOON' else CONFIG['threshold_other']
    flood_mask = wl_daily['WL (mMSL)'] > threshold
    
    # Define season date range for filtering
    start_date = pd.to_datetime(f"{year}-{CONFIG['start_month']}-01")
    
    # Collect all dates where water level exceeded threshold (within season only)
    actual_flood_dates = set()
    for idx in wl_daily.index:
        if flood_mask.loc[idx]:
            flood_date = wl_daily.loc[idx, 'Date']
            # Only include if date is within the year and season range
            if start_date <= flood_date <= end_date:
                actual_flood_dates.add(flood_date.date())
    
    # Build CSV with lead days as columns
    all_lead_days = sorted(set(src['lead_days'] for sources in predicted_events.values() for src in sources))
    csv_data = {}
    
    # For each lead day, collect all predicted dates
    for lead_day in all_lead_days:
        dates_for_lead = []
        for pred_date in sorted(predicted_events.keys()):
            sources = predicted_events[pred_date]
            for src in sources:
                if src['lead_days'] == lead_day:
                    dates_for_lead.append(str(pred_date))
                    break
        csv_data[f'lead-{lead_day}'] = dates_for_lead
    
    # Add actual flood dates column
    csv_data['Actual_Flood_Dates'] = [str(d) for d in sorted(actual_flood_dates)]
    
    # Pad all columns to same length
    max_len = max([len(v) for v in csv_data.values()])
    for key in csv_data:
        csv_data[key].extend([''] * (max_len - len(csv_data[key])))
    
    csv_df = pd.DataFrame(csv_data)
    csv_filename = CONFIG['predictions_dir'] / f'flood_predictions_{year}.csv'
    csv_df.to_csv(csv_filename, index=False)
    
    print(f"  ✓ Saved to: {csv_filename}")
    print(f"  Total rows: {len(csv_df)}")
    print(f"  Columns: {list(csv_df.columns)}")


    # Generate plots if enabled
    if RUN_PLOTS:
        print(f"\n{'='*70}")
        print(f"GENERATING PLOTS for {year}")
        print(f"{'='*70}")
        
        # Identify flood spells (consecutive True values) - reuse existing wl_daily and flood_mask
        flood_spells = []
        in_spell = False
        spell_start_idx = None
        
        for idx in wl_daily.index:
            is_flood = flood_mask.loc[idx]
            if is_flood and not in_spell:
                spell_start_idx = idx
                in_spell = True
            elif not is_flood and in_spell:
                spell_end_idx = wl_daily.index[wl_daily.index.get_loc(idx) - 1]
                spell_start_date = wl_daily.loc[spell_start_idx, 'Date']
                spell_end_date = wl_daily.loc[spell_end_idx, 'Date']
                flood_spells.append((spell_start_date, spell_end_date))
                in_spell = False
        
        if in_spell:
            spell_start_date = wl_daily.loc[spell_start_idx, 'Date']
            spell_end_date = wl_daily['Date'].iloc[-1]
            flood_spells.append((spell_start_date, spell_end_date))
        
        print(f"  Observed flood spells: {len(flood_spells)}")
        
        # Create plots
        flood_spells_persistence_plot(
            flood_dates=flood_spells,
            df_obs=df_obs,
            sum_f=sum_f,
            threshold=threshold,
            year=year,
            plots_dir=CONFIG['plots_dir'],
            config=CONFIG
        )
    
    print(f"\n{'='*70}")
    print(f"✓ Year {year} processing complete")
    print(f"{'='*70}")


MAIN EXECUTION: Processing Forecasts and Extracting Event Dates

Processing Year: 2017
Loading forecasts from: ..\LSTM\ECMWF_HRES_forecast_excel\2017_january_july.xlsx


  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] 

✓ Processed 212 forecast lead times
✓ Generated predictions for 10 lead day windows
✓ Filtered to 61 forecasts within season (2017-06-01 to 2017-07-31)

Extracting predicted flood event dates...
  Formula: Predicted Date = Init Date + Lead Days (if value > 0)

Loading observations from: ..\LSTM\PREMONSOON-janjul201724.xlsx

CSV EXPORT: Flood Predictions for 2017
  ✓ Saved to: predictionsv2\logistic_regression\Logistic_Regression_t0.891\MONSOON\flood_predictions_2017.csv
  Total rows: 9
  Columns: ['lead-1', 'lead-2', 'lead-3', 'lead-4', 'lead-5', 'lead-6', 'lead-7', 'lead-8', 'lead-9', 'lead-10', 'Actual_Flood_Dates']

GENERATING PLOTS for 2017
  Observed flood spells: 4
Saved flood spells + persistence plot to: plotsv2\logistic_regression\Logistic_Regression_t0.891\MONSOON\flood_spells_with_persistence_2017.png

✓ Year 2017 processing complete

Processing Year: 2022
Loading forecasts from: ..\LSTM\ECMWF_HRES_forecast_excel\2022_january_july.xlsx


  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] 

✓ Processed 212 forecast lead times
✓ Generated predictions for 10 lead day windows
✓ Filtered to 61 forecasts within season (2022-06-01 to 2022-07-31)

Extracting predicted flood event dates...
  Formula: Predicted Date = Init Date + Lead Days (if value > 0)

Loading observations from: ..\LSTM\PREMONSOON-janjul201724.xlsx

CSV EXPORT: Flood Predictions for 2022
  ✓ Saved to: predictionsv2\logistic_regression\Logistic_Regression_t0.891\MONSOON\flood_predictions_2022.csv
  Total rows: 7
  Columns: ['lead-1', 'lead-2', 'lead-3', 'lead-4', 'lead-5', 'lead-6', 'lead-7', 'lead-8', 'lead-9', 'lead-10', 'Actual_Flood_Dates']

GENERATING PLOTS for 2022
  Observed flood spells: 4
Saved flood spells + persistence plot to: plotsv2\logistic_regression\Logistic_Regression_t0.891\MONSOON\flood_spells_with_persistence_2022.png

✓ Year 2022 processing complete

Processing Year: 2023
Loading forecasts from: ..\LSTM\ECMWF_HRES_forecast_excel\2023_january_july.xlsx


  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] 

✓ Processed 365 forecast lead times
✓ Generated predictions for 10 lead day windows
✓ Filtered to 61 forecasts within season (2023-06-01 to 2023-07-31)

Extracting predicted flood event dates...
  Formula: Predicted Date = Init Date + Lead Days (if value > 0)

Loading observations from: ..\LSTM\PREMONSOON-janjul201724.xlsx

CSV EXPORT: Flood Predictions for 2023
  ✓ Saved to: predictionsv2\logistic_regression\Logistic_Regression_t0.891\MONSOON\flood_predictions_2023.csv
  Total rows: 3
  Columns: ['lead-3', 'lead-4', 'lead-9', 'lead-10', 'Actual_Flood_Dates']

GENERATING PLOTS for 2023
  Observed flood spells: 4
Saved flood spells + persistence plot to: plotsv2\logistic_regression\Logistic_Regression_t0.891\MONSOON\flood_spells_with_persistence_2023.png

✓ Year 2023 processing complete

Processing Year: 2024
Loading forecasts from: ..\LSTM\ECMWF_HRES_forecast_excel\2024_january_july.xlsx


  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] = predictions[:len(df_pred)]
  df_pred[col_name] 

✓ Processed 366 forecast lead times
✓ Generated predictions for 10 lead day windows
✓ Filtered to 61 forecasts within season (2024-06-01 to 2024-07-31)

Extracting predicted flood event dates...
  Formula: Predicted Date = Init Date + Lead Days (if value > 0)

Loading observations from: ..\LSTM\PREMONSOON-janjul201724.xlsx

CSV EXPORT: Flood Predictions for 2024
  ✓ Saved to: predictionsv2\logistic_regression\Logistic_Regression_t0.891\MONSOON\flood_predictions_2024.csv
  Total rows: 9
  Columns: ['lead-1', 'lead-2', 'lead-3', 'lead-4', 'lead-5', 'lead-6', 'lead-7', 'lead-8', 'lead-9', 'lead-10', 'Actual_Flood_Dates']

GENERATING PLOTS for 2024
  Observed flood spells: 4
Saved flood spells + persistence plot to: plotsv2\logistic_regression\Logistic_Regression_t0.891\MONSOON\flood_spells_with_persistence_2024.png

✓ Year 2024 processing complete
