In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, RepeatVector, TimeDistributed, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
import matplotlib.pyplot as plt
import os
from itertools import product
import json
from datetime import datetime

In [None]:
# Load the rainfall dataset into `df`, the frame the cells below group and filter.
# NOTE(review): this cell carries no execution count and reads from the filesystem
# root, while Google Drive is only mounted in a later cell - confirm the intended path.
df = pd.read_csv('/Rainfall Data.csv')

In [4]:
# Mount Google Drive at /content/drive; the import only resolves inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Count the distinct YEAR values recorded for every state (SUBDIVISION)
years_per_state = df.groupby('SUBDIVISION')['YEAR'].nunique()

# Isolate the states whose record does not contain exactly 115 distinct years
states_less_than_115_years = years_per_state.loc[years_per_state.ne(115)]

In [7]:
# Distinct-year count per SUBDIVISION
years_per_subdivision = df.groupby('SUBDIVISION')['YEAR'].nunique()
# Subdivisions without exactly 115 distinct years are excluded from the analysis
states_to_drop = years_per_subdivision.loc[years_per_subdivision.ne(115)].index.tolist()
# Keep only the rows that belong to a retained subdivision
df_filtered = df.loc[~df['SUBDIVISION'].isin(states_to_drop)]

In [8]:
# Distinct-year count per state in the filtered DataFrame, kept for verification
years_per_state_filtered = df_filtered.groupby('SUBDIVISION')['YEAR'].nunique()

# Report how many states (SUBDIVISIONs) remain in the filtered DataFrame
num_states_filtered = df_filtered['SUBDIVISION'].nunique()
print(f"\nNumber of states (SUBDIVISIONs) in the filtered dataset: {num_states_filtered}")


Number of states (SUBDIVISIONs) in the filtered dataset: 33


In [9]:
# First and last recorded YEAR for each SUBDIVISION
year_range_per_state = df_filtered.groupby('SUBDIVISION')['YEAR'].agg(['min', 'max'])

# The range is common only when all states share one start year AND one end year;
# handle the mismatch case first and fall through to the common-range report.
if year_range_per_state['min'].nunique() != 1 or year_range_per_state['max'].nunique() != 1:
    print("The year range is not the same for all states:")
    print(year_range_per_state)
else:
    common_min_year = year_range_per_state['min'].iloc[0]
    common_max_year = year_range_per_state['max'].iloc[0]
    print(f"The common year range for all states is: {common_min_year} - {common_max_year}")

The common year range for all states is: 1901 - 2015


In [10]:
# Ask TensorFlow for the GPU device name; a falsy result is reported as "no GPU"
device_name = tf.test.gpu_device_name()
print(
    f'GPU device: {device_name}'
    if device_name
    else 'No GPU found. Make sure you selected GPU in the runtime settings.'
)

GPU device: /device:GPU:0


In [11]:
# --- Configuration Parameters ---
LOOK_BACK_MONTHS = 24  # Number of past months to use as input (e.g., 2 years)
FORECAST_HORIZON_MONTHS = 60 # Number of future months to predict (5 years * 12 months)

# Reproducibility: seed Python's `random`, NumPy and TensorFlow in one call so that
# weight initialisation and batch shuffling repeat across Restart & Run All.
# NOTE(review): some GPU kernels may still be non-deterministic - confirm if exact
# bit-for-bit repeatability is required.
RANDOM_SEED = 42
tf.keras.utils.set_random_seed(RANDOM_SEED)

# Grid Search Parameters
# Every key maps to the list of candidate values; the search evaluates the full
# Cartesian product of these lists.
GRID_SEARCH_PARAMS = {
     'lstm_units': [32, 50],
     'num_lstm_layers': [2, 3],
     'learning_rate': [0.01, 0.001],
     'batch_size': [32, 64],
     'dropout_rate': [0.2, 0.3],
     'epochs': [50, 100],
     'patience': [15]}

# --- 1. Use Pre-loaded Data ---
# `df_filtered` is expected to exist already in the kernel namespace.
print("Using pre-loaded DataFrame 'df_filtered'.")
print(f"DataFrame shape: {df_filtered.shape}")
print(f"Columns: {df_filtered.columns.tolist()}")

# --- 2. Missing Value Imputation ---
month_columns = [
    'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
    'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC',
]

print(f"\nBefore imputation, missing values in monthly columns:\n{df_filtered[month_columns].isna().sum()}")

# Impute on a copy so `df_filtered` itself keeps its original gaps
df_imputed = df_filtered.copy()
imputer = KNNImputer(n_neighbors=5)
df_imputed[month_columns] = imputer.fit_transform(df_imputed[month_columns])

print(f"\nAfter imputation, missing values in monthly columns:\n{df_imputed[month_columns].isna().sum()}")

# --- Data Preparation Functions ---

def create_sequences(data, n_steps_in, n_steps_out):
    """Slice a series into sliding (input, target) window pairs.

    Each pair consists of `n_steps_in` consecutive items followed immediately
    by the next `n_steps_out` items; consecutive pairs start one step apart.

    Args:
        data: Sliceable sequence (list or array) supporting `len()`.
        n_steps_in: Length of every input window.
        n_steps_out: Length of every target window.

    Returns:
        Tuple `(X, y)` of NumPy arrays built from the input and target
        windows. Both are empty arrays when `data` is too short to hold a
        single complete pair.
    """
    window_len = n_steps_in + n_steps_out
    # Number of start positions whose full window still fits inside `data`
    n_windows = min(len(data), max(len(data) - window_len + 1, 0))

    inputs = [data[start:start + n_steps_in] for start in range(n_windows)]
    targets = [data[start + n_steps_in:start + window_len] for start in range(n_windows)]
    return np.array(inputs), np.array(targets)

def prepare_data():
    """Build train / validation / test arrays from `df_imputed` for all subdivisions.

    For every subdivision the twelve monthly columns are flattened, in YEAR
    order, into one continuous series. The final FORECAST_HORIZON_MONTHS values
    are held out as test targets; the preceding values (at most 110 years) are
    min-max scaled and cut into sliding windows, of which the first 80% become
    training samples and the rest validation samples.

    Returns:
        Tuple `(X_train, y_train, X_val, y_val, X_test_input, y_test_actual,
        scalers)`. The X/y train and validation arrays and `X_test_input` carry
        a trailing feature axis of size 1. `y_test_actual` holds unscaled
        values, one row per retained subdivision. `scalers` maps subdivision
        name to its fitted MinMaxScaler, and its insertion order matches the
        row order of `X_test_input` / `y_test_actual`.
        A tuple of seven `None`s is returned when no subdivision yields any
        training sequence.
    """
    # Upper bound on the training series length: 110 years of monthly values
    max_train_months = 110 * 12

    all_X_train, all_y_train = [], []
    all_X_val, all_y_val = [], []
    all_X_test_input, all_y_test_actual = [], []
    scalers = {}

    subdivisions = df_imputed['SUBDIVISION'].unique()

    print("\nProcessing data for each subdivision...")
    for sub_id in subdivisions:
        sub_df = df_imputed[df_imputed['SUBDIVISION'] == sub_id].sort_values('YEAR')

        # Reshape monthly data into a continuous series
        monthly_data_sub = sub_df[month_columns].values.flatten()
        total_months_sub = len(monthly_data_sub)

        # Skip if not enough data
        if total_months_sub < FORECAST_HORIZON_MONTHS:
            print(f"Skipping {sub_id}: Not enough total months ({total_months_sub}) for a {FORECAST_HORIZON_MONTHS}-month forecast horizon.")
            continue

        # Index at which the held-out forecast horizon begins
        horizon_start_idx = total_months_sub - FORECAST_HORIZON_MONTHS

        # Actual values for the prediction horizon
        actual_test_values = monthly_data_sub[horizon_start_idx:]

        # Training data: everything before the horizon, capped at max_train_months
        train_data = monthly_data_sub[:min(horizon_start_idx, max_train_months)]

        if len(train_data) < LOOK_BACK_MONTHS:
            print(f"Skipping {sub_id}: Not enough training data ({len(train_data)} months) to form a {LOOK_BACK_MONTHS}-month lookback window.")
            continue

        # Scale data (the scaler is fitted on training values only)
        scaler = MinMaxScaler(feature_range=(0, 1))
        train_scaled = scaler.fit_transform(train_data.reshape(-1, 1))

        # Create sequences for training
        X_train_sub, y_train_sub = create_sequences(train_scaled, LOOK_BACK_MONTHS, FORECAST_HORIZON_MONTHS)

        if X_train_sub.size == 0:
            print(f"Warning: Not enough data to create training sequences for {sub_id}.")
            continue

        # Chronological 80/20 split of the windows into train and validation
        val_split_idx = int(0.8 * len(X_train_sub))
        all_X_train.append(X_train_sub[:val_split_idx])
        all_y_train.append(y_train_sub[:val_split_idx])
        all_X_val.append(X_train_sub[val_split_idx:])
        all_y_val.append(y_train_sub[val_split_idx:])

        # Register the scaler only for subdivisions that also contribute a test
        # row, so the keys of `scalers` stay aligned with the test arrays.
        scalers[sub_id] = scaler

        # Test input: the LOOK_BACK_MONTHS values that immediately precede the
        # forecast horizon, so the window always ends where the horizon starts
        # (also when the training cap above truncated `train_data`).
        test_window = monthly_data_sub[horizon_start_idx - LOOK_BACK_MONTHS:horizon_start_idx]
        test_input_seq = scaler.transform(test_window.reshape(-1, 1))
        all_X_test_input.append(test_input_seq)
        all_y_test_actual.append(actual_test_values)

    if not all_X_train:
        print("\nError: No valid training sequences could be created from any subdivision.")
        return None, None, None, None, None, None, None

    # Concatenate all data
    X_train = np.vstack(all_X_train)
    y_train = np.vstack(all_y_train)
    X_val = np.vstack(all_X_val)
    y_val = np.vstack(all_y_val)
    X_test_input = np.array(all_X_test_input)
    y_test_actual = np.array(all_y_test_actual)

    # Reshape for LSTM: (samples, timesteps, 1 feature)
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
    y_train = y_train.reshape((y_train.shape[0], y_train.shape[1], 1))
    X_val = X_val.reshape((X_val.shape[0], X_val.shape[1], 1))
    y_val = y_val.reshape((y_val.shape[0], y_val.shape[1], 1))
    X_test_input = X_test_input.reshape((X_test_input.shape[0], X_test_input.shape[1], 1))

    return X_train, y_train, X_val, y_val, X_test_input, y_test_actual, scalers

def build_model(lstm_units, num_lstm_layers, learning_rate, dropout_rate):
    """Assemble and compile the encoder-decoder LSTM forecaster.

    The encoder stacks `num_lstm_layers` LSTM layers over an input of
    LOOK_BACK_MONTHS steps with one feature. The final encoder hidden state is
    repeated FORECAST_HORIZON_MONTHS times and fed to a decoder of
    `num_lstm_layers` LSTM layers, the first of which is initialised with the
    encoder states. A time-distributed Dense(1) produces one value per step.

    Args:
        lstm_units: Units in every LSTM layer.
        num_lstm_layers: Number of LSTM layers in the encoder and in the decoder.
        learning_rate: Learning rate passed to the Adam optimizer.
        dropout_rate: Dropout rate; Dropout layers are added only when it is > 0.

    Returns:
        The compiled Keras model (Adam optimizer, 'mae' loss).
    """
    # Reset Keras global state before creating a new model
    tf.keras.backend.clear_session()

    def _maybe_dropout(tensor):
        # Attach a Dropout layer only when a positive rate was requested
        return Dropout(dropout_rate)(tensor) if dropout_rate > 0 else tensor

    # ----- Encoder -----
    encoder_inputs = Input(shape=(LOOK_BACK_MONTHS, 1))
    hidden = encoder_inputs
    for _ in range(num_lstm_layers - 1):
        hidden = _maybe_dropout(LSTM(lstm_units, return_sequences=True)(hidden))

    # The last encoder layer also returns its states; drop the output, keep h and c
    encoder_output = LSTM(lstm_units, return_state=True)(hidden)
    encoder_states = encoder_output[1:]

    # ----- Decoder -----
    hidden = RepeatVector(FORECAST_HORIZON_MONTHS)(encoder_states[0])
    for depth in range(num_lstm_layers):
        # Only the first decoder layer starts from the encoder states
        state_kwargs = {'initial_state': encoder_states} if depth == 0 else {}
        hidden = LSTM(lstm_units, return_sequences=True)(hidden, **state_kwargs)
        hidden = _maybe_dropout(hidden)

    decoder_outputs = TimeDistributed(Dense(1))(hidden)

    model = Model(encoder_inputs, decoder_outputs)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mae')

    return model

def evaluate_model(model, X_test_input, y_test_actual, scalers, subdivision_names):
    """Forecast every test sequence and score the forecasts in original units.

    Args:
        model: Object exposing `predict(inputs, verbose=0)`.
        X_test_input: Scaled test inputs, one row per subdivision.
        y_test_actual: Unscaled true values, one row per subdivision.
        scalers: Mapping from subdivision name to the scaler whose
            `inverse_transform` undoes that subdivision's scaling.
        subdivision_names: Subdivision names in the same order as the rows of
            `X_test_input`.

    Returns:
        Tuple `(mae, rmse)` computed over all subdivisions and forecast steps.

    Raises:
        ValueError: If the number of names differs from the number of test rows.
    """
    X_test_input = np.asarray(X_test_input)
    if len(subdivision_names) != len(X_test_input):
        raise ValueError(
            f"Got {len(subdivision_names)} subdivision names for "
            f"{len(X_test_input)} test sequences; they must align one-to-one."
        )

    # One batched forward pass instead of a separate predict() call per subdivision
    predicted_scaled = np.asarray(model.predict(X_test_input, verbose=0))

    all_y_pred_unscaled = []
    for i, sub_id in enumerate(subdivision_names):
        # Each subdivision was scaled independently, so invert with its own scaler
        predicted_scaled_sequence = predicted_scaled[i].reshape(-1, 1)
        predicted_unscaled_sequence = scalers[sub_id].inverse_transform(predicted_scaled_sequence)
        all_y_pred_unscaled.append(predicted_unscaled_sequence.flatten())

    y_pred_final = np.array(all_y_pred_unscaled)

    # Calculate metrics on the flattened (subdivision x month) values
    y_true_flat = y_test_actual.flatten()
    y_pred_flat = y_pred_final.flatten()
    mae = mean_absolute_error(y_true_flat, y_pred_flat)
    rmse = np.sqrt(mean_squared_error(y_true_flat, y_pred_flat))

    return mae, rmse

def grid_search():
    """Train and score one model per hyperparameter combination in GRID_SEARCH_PARAMS.

    Data is prepared once via `prepare_data()`. Each combination is trained
    with early stopping on `val_loss`, then evaluated on the held-out test
    sequences. The model with the lowest validation loss is kept as the best
    model. A combination that raises is reported and skipped.

    Returns:
        Tuple `(results, best_params, best_model, scalers, subdivision_names)`.
        `results` is a list of dicts (keys: 'params', 'val_loss', 'test_mae',
        'test_rmse', 'epochs_trained') sorted by ascending validation loss; it
        is empty, and `best_params` / `best_model` are None, when every
        combination failed. When data preparation fails, all five elements are
        None, so the return value can always be unpacked into five names.
    """
    print("\n" + "="*50)
    print("STARTING GRID SEARCH FOR HYPERPARAMETER OPTIMIZATION")
    print("="*50)

    # Prepare data once
    print("Preparing data...")
    data_prep_result = prepare_data()
    if data_prep_result[0] is None:
        # Keep the same arity as the success path; a bare None would make
        # `a, b, c, d, e = grid_search()` raise TypeError in the caller.
        return None, None, None, None, None

    X_train, y_train, X_val, y_val, X_test_input, y_test_actual, scalers = data_prep_result
    subdivision_names = list(scalers.keys())

    print("Data prepared successfully:")
    print(f"  Training samples: {X_train.shape[0]}")
    print(f"  Validation samples: {X_val.shape[0]}")
    print(f"  Test subdivisions: {len(subdivision_names)}")

    # Generate all parameter combinations (Cartesian product of the value lists)
    param_names = list(GRID_SEARCH_PARAMS.keys())
    param_values = list(GRID_SEARCH_PARAMS.values())
    param_combinations = list(product(*param_values))

    print(f"\nTotal parameter combinations to test: {len(param_combinations)}")

    # Store results
    results = []
    best_score = float('inf')
    best_params = None
    best_model = None

    # Grid search loop
    for i, param_combo in enumerate(param_combinations):
        params = dict(zip(param_names, param_combo))

        print(f"\n--- Combination {i+1}/{len(param_combinations)} ---")
        print(f"Parameters: {params}")

        try:
            # Build model
            model = build_model(
                lstm_units=params['lstm_units'],
                num_lstm_layers=params['num_lstm_layers'],
                learning_rate=params['learning_rate'],
                dropout_rate=params['dropout_rate']
            )

            # Early stopping
            early_stopping = EarlyStopping(
                monitor='val_loss',
                patience=params['patience'],
                verbose=0,
                mode='min',
                restore_best_weights=True
            )

            # Train model
            history = model.fit(
                X_train, y_train,
                epochs=params['epochs'],
                batch_size=params['batch_size'],
                validation_data=(X_val, y_val),
                callbacks=[early_stopping],
                verbose=0
            )

            # Lowest validation loss reached during training
            val_loss = min(history.history['val_loss'])

            # Evaluate on test set
            test_mae, test_rmse = evaluate_model(model, X_test_input, y_test_actual, scalers, subdivision_names)

            # Store results
            result = {
                'params': params,
                'val_loss': val_loss,
                'test_mae': test_mae,
                'test_rmse': test_rmse,
                'epochs_trained': len(history.history['loss'])
            }
            results.append(result)

            print(f"Validation Loss: {val_loss:.4f}")
            print(f"Test MAE: {test_mae:.4f}")
            print(f"Test RMSE: {test_rmse:.4f}")
            print(f"Epochs trained: {result['epochs_trained']}")

            # Model selection uses validation loss only; test metrics are reported, not optimised
            if val_loss < best_score:
                best_score = val_loss
                best_params = params.copy()
                best_model = model
                print("*** NEW BEST MODEL! ***")

        except Exception as e:
            # Best-effort search: report the failing combination and move on
            print(f"Error with parameters {params}: {str(e)}")
            continue

    # Sort results by validation loss
    results.sort(key=lambda x: x['val_loss'])

    return results, best_params, best_model, scalers, subdivision_names

def save_results(results, best_params):
    """Write the grid-search outcome to a timestamped JSON file.

    The file is created in the current working directory and contains the
    best parameters, a shallow copy of every result entry, and the searched
    parameter grid (GRID_SEARCH_PARAMS).

    Args:
        results: List of per-combination result dicts.
        best_params: Parameter dict of the best combination.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"grid_search_results_{timestamp}.json"

    # Shallow-copy each entry so the serialised list is detached from `results`
    output = {
        'best_params': best_params,
        'all_results': [dict(entry) for entry in results],
        'search_params': GRID_SEARCH_PARAMS
    }

    with open(filename, 'w') as handle:
        json.dump(output, handle, indent=2)

    print(f"\nResults saved to: {filename}")

def print_top_results(results, top_n=5):
    """Print a ranked summary of the first `top_n` entries of `results`.

    Args:
        results: List of result dicts with the keys 'val_loss', 'test_mae',
            'test_rmse', 'epochs_trained' and 'params'. Entries are printed in
            the order given, so the list should already be sorted.
        top_n: Maximum number of entries to print.
    """
    banner = "="*60
    shown = min(top_n, len(results))

    print(f"\n" + banner)
    print(f"TOP {shown} PARAMETER COMBINATIONS")
    print(banner)

    for rank, entry in enumerate(results[:top_n], start=1):
        print(f"\nRank {rank}:")
        print(f"  Validation Loss: {entry['val_loss']:.4f}")
        print(f"  Test MAE: {entry['test_mae']:.4f}")
        print(f"  Test RMSE: {entry['test_rmse']:.4f}")
        print(f"  Epochs Trained: {entry['epochs_trained']}")
        print(f"  Parameters: {entry['params']}")

# --- Main Execution ---
if __name__ == "__main__":
    # Run grid search
    search_outcome = grid_search()

    # grid_search() can signal a data-preparation failure with a bare None; a
    # tuple whose first element is None is tolerated as well. Check before
    # unpacking so a failure is reported instead of raising TypeError.
    if search_outcome is None or search_outcome[0] is None:
        results = best_params = best_model = scalers = subdivision_names = None
    else:
        results, best_params, best_model, scalers, subdivision_names = search_outcome

    # Branch instead of calling exit(): inside a notebook exit() would shut
    # down the kernel and discard all state.
    if results is None:
        print("Grid search failed due to data preparation issues.")
    elif not results:
        # Every combination raised, so there is no best entry to summarise
        print("Grid search finished, but no parameter combination trained successfully.")
    else:
        # Print results
        print_top_results(results, top_n=10)

        print(f"\n" + "="*60)
        print("BEST MODEL SUMMARY")
        print("="*60)
        print(f"Best Parameters: {best_params}")
        # `results` is sorted by ascending validation loss, so index 0 is the best entry
        print(f"Best Validation Loss: {results[0]['val_loss']:.4f}")
        print(f"Best Test MAE: {results[0]['test_mae']:.4f}")
        print(f"Best Test RMSE: {results[0]['test_rmse']:.4f}")

        # Save results
        save_results(results, best_params)

        # Final evaluation with best model
        print(f"\n" + "="*60)
        print("FINAL EVALUATION WITH BEST MODEL")
        print("="*60)

        if best_model is not None:
            # grid_search() does not return the test arrays, so rebuild them here
            X_train, y_train, X_val, y_val, X_test_input, y_test_actual, _ = prepare_data()
            final_mae, final_rmse = evaluate_model(best_model, X_test_input, y_test_actual, scalers, subdivision_names)

            print(f"Final Test MAE: {final_mae:.4f}")
            print(f"Final Test RMSE: {final_rmse:.4f}")

            # Save best model
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            model_filename = f"best_lstm_model_{timestamp}.h5"
            best_model.save(model_filename)
            print(f"Best model saved as: {model_filename}")

        print("\nGrid search completed!")

Using pre-loaded DataFrame 'df_filtered'.
DataFrame shape: (3795, 19)
Columns: ['SUBDIVISION', 'YEAR', 'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC', 'ANNUAL', 'Jan-Feb', 'Mar-May', 'Jun-Sep', 'Oct-Dec']

Before imputation, missing values in monthly columns:
JAN    1
FEB    1
MAR    0
APR    0
MAY    0
JUN    0
JUL    1
AUG    0
SEP    0
OCT    0
NOV    1
DEC    1
dtype: int64

After imputation, missing values in monthly columns:
JAN    0
FEB    0
MAR    0
APR    0
MAY    0
JUN    0
JUL    0
AUG    0
SEP    0
OCT    0
NOV    0
DEC    0
dtype: int64

STARTING GRID SEARCH FOR HYPERPARAMETER OPTIMIZATION
Preparing data...

Processing data for each subdivision...
Data prepared successfully:
  Training samples: 32637
  Validation samples: 8184
  Test subdivisions: 33

Total parameter combinations to test: 64

--- Combination 1/64 ---
Parameters: {'lstm_units': 32, 'num_lstm_layers': 2, 'learning_rate': 0.01, 'batch_size': 32, 'dropout_rate': 0.2, 'epochs



Final Test MAE: 38.5509
Final Test RMSE: 70.9242
Best model saved as: best_lstm_model_20250822_192226.h5

Grid search completed!
