In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import MinMaxScaler
import random
import warnings

# --- Basic Setup ---
# Silence every warning for the rest of the process.
# NOTE(review): this is a blanket filter, so warnings raised by any imported
# library are hidden as well - narrow it if a warning ever needs to be seen.
warnings.filterwarnings('ignore')

def set_seed(seed_value=42):
    """Seed every random number generator used by this script.

    Python's ``random`` module, NumPy and PyTorch (CPU, plus all CUDA devices
    when CUDA is available) are seeded with ``seed_value``; cuDNN is switched
    to deterministic mode with benchmarking disabled.
    """
    # Host-side generators first.
    random.seed(seed_value)
    np.random.seed(seed_value)

    # PyTorch generators: CPU always, GPUs only when present.
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)

    # Give up cuDNN's algorithm benchmarking in exchange for repeatable runs.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Seed all generators before any model or DataLoader is created.
set_seed(42)
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# --- Hyperparameters ---
INPUT_LENGTH = 48  # rows in each encoder input window
PREDICTION_HORIZON = 20  # future target rows predicted per sample
INPUT_FEATURES = 4  # encoder input size; must match the number of feature columns
OUTPUT_FEATURES = 1 # We only predict the 'Rewaghat' feature
HIDDEN_SIZE = 64  # LSTM hidden size shared by encoder and decoder
NUM_LAYERS = 2  # stacked LSTM layers in both encoder and decoder
LEARNING_RATE = 0.001  # Adam learning rate
BATCH_SIZE = 64  # batch size of the train, validation and test loaders
NUM_EPOCHS = 100  # upper bound on training epochs
EARLY_STOPPING_PATIENCE = 10  # epochs without validation improvement before stopping
TEACHER_FORCING_RATIO = 0.5 # Probability of using teacher forcing

# --- Data Loading (Identical to previous script) ---
def load_and_merge_data():
    """Load the station and rainfall spreadsheets and merge them on ``Date``.

    Six Excel files are read from the current working directory: train/test
    sheets for Chatia and Rewaghat, plus one sheet each for rainfall and
    Dumariaghat. Every sheet must have a ``Date`` column in
    ``%d-%m-%Y %H:%M`` format; the Dumariaghat sheet must also have a
    ``Dumariaghat`` column.

    Returns:
        A DataFrame indexed by ``Date`` in ascending order that contains only
        the timestamps present in all four sources, or ``None`` (after
        printing an error) when any input file is missing.
    """
    try:
        old_chatia_df = pd.read_excel('Chatia_train.xlsx')
        old_rewaghat_df = pd.read_excel('Rewaghat_train.xlsx')
        new_chatia_df = pd.read_excel('Chatia_test.xlsx')
        new_rewaghat_df = pd.read_excel('Rewaghat_test.xlsx')
        rainfall_df = pd.read_excel('rainfall_data.xlsx')
        dumariaghat_df = pd.read_excel('Dumariaghat_data.xlsx')
    except FileNotFoundError as e:
        print(f'Error: {e}. Please ensure all data files are in the same directory.')
        return None

    def _clean(frame):
        """Parse ``Date`` and keep only the first row for each timestamp."""
        # Parse BEFORE de-duplicating: the same instant can be stored as text
        # in one sheet and as a datetime in another, and would not compare
        # equal in raw form.
        frame = frame.assign(Date=pd.to_datetime(frame['Date'], format='%d-%m-%Y %H:%M'))
        return frame.drop_duplicates(subset=['Date']).reset_index(drop=True)

    full_chatia_df = _clean(pd.concat([old_chatia_df, new_chatia_df], ignore_index=True))
    full_rewaghat_df = _clean(pd.concat([old_rewaghat_df, new_rewaghat_df], ignore_index=True))
    dumariaghat_df = _clean(dumariaghat_df)
    # Rainfall is de-duplicated like the other sources; a repeated timestamp
    # here would otherwise multiply rows in the inner merge below.
    rainfall_df = _clean(rainfall_df)

    # Inner joins keep only timestamps available from every source.
    df_bases = pd.merge(full_chatia_df, full_rewaghat_df, on='Date', how='inner')
    df_bases = pd.merge(df_bases, dumariaghat_df[['Date', 'Dumariaghat']], on='Date', how='inner')
    df_final = pd.merge(df_bases, rainfall_df, on='Date', how='inner')
    df_final.set_index('Date', inplace=True)
    df_final.sort_index(inplace=True)
    print('All datasets loaded and merged successfully.')
    return df_final

# --- Data Preparation (Identical to previous script) ---
class WaterLevelDataset(Dataset):
    """Sliding-window dataset pairing an input window with the targets that follow it.

    Sample ``i`` consists of
    ``data[i : i + input_length]`` and
    ``target_data[i + input_length : i + input_length + prediction_horizon]``,
    each returned as a ``float32`` tensor.

    Args:
        data: Sequence of feature rows (anything sliceable that
            ``torch.tensor`` accepts, e.g. a 2-D NumPy array).
        target_data: Sequence of target rows, indexed like ``data``.
        input_length: Number of rows in each input window.
        prediction_horizon: Number of target rows in each output window.
    """

    def __init__(self, data, target_data, input_length, prediction_horizon):
        self.data = data
        self.target_data = target_data
        self.input_length = input_length
        self.prediction_horizon = prediction_horizon

    def __len__(self):
        """Return the number of complete windows (0 when the data is too short)."""
        # Clamp at zero: a negative value would make len() itself raise.
        return max(0, len(self.data) - self.input_length - self.prediction_horizon + 1)

    def __getitem__(self, idx):
        """Return the ``(input_window, target_window)`` pair for sample ``idx``.

        Negative indices count from the end.

        Raises:
            IndexError: if ``idx`` does not address a complete window. Without
                this, out-of-range indices would yield truncated or empty
                tensors and plain iteration over the dataset would never end.
        """
        n_samples = len(self)
        if idx < 0:
            idx += n_samples
        if not 0 <= idx < n_samples:
            raise IndexError(f'sample index out of range for dataset of size {n_samples}')

        x = self.data[idx:idx + self.input_length]
        y_start_idx = idx + self.input_length
        y_end_idx = y_start_idx + self.prediction_horizon
        y = self.target_data[y_start_idx:y_end_idx]
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

# --- Seq2Seq Model Definition ---
class Encoder(nn.Module):
    """LSTM encoder that reduces an input sequence to its final recurrent state."""

    def __init__(self, input_size, hidden_size, num_layers, dropout=0.2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )

    def forward(self, x):
        """Encode ``x`` of shape ``(batch_size, seq_len, input_size)``.

        Returns:
            ``(hidden, cell)``, each of shape
            ``(num_layers, batch_size, hidden_size)``. The per-step LSTM
            outputs are discarded.
        """
        final_state = self.lstm(x)[1]
        hidden, cell = final_state
        return hidden, cell

class Decoder(nn.Module):
    """Single-step LSTM decoder followed by a linear projection to the output size."""

    def __init__(self, output_size, hidden_size, num_layers, dropout=0.2):
        super().__init__()
        self.output_size = output_size
        # The decoder consumes its own previous output, so the LSTM input
        # size equals the output size.
        self.lstm = nn.LSTM(
            input_size=output_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x, hidden, cell):
        """Advance the decoder by one time step.

        Args:
            x: Step input of shape ``(batch_size, 1, output_size)``.
            hidden, cell: Recurrent state, each of shape
                ``(num_layers, batch_size, hidden_size)``.

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` has shape
            ``(batch_size, 1, output_size)`` and the state tensors are the
            updated LSTM state.
        """
        lstm_out, next_state = self.lstm(x, (hidden, cell))
        hidden, cell = next_state
        # lstm_out: (batch_size, 1, hidden_size) -> (batch_size, 1, output_size)
        return self.fc(lstm_out), hidden, cell

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes one step at a time with optional teacher forcing."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Predict a sequence with the same batch size and length as ``trg``.

        Args:
            src: Encoder input, ``(batch_size, src_len, input_features)``.
            trg: Ground-truth targets, ``(batch_size, trg_len, output_features)``.
                Its shape fixes the output shape; its values are only read
                when a step is teacher-forced.
            teacher_forcing_ratio: Per-step probability of feeding the true
                value, rather than the model's own prediction, as the next
                decoder input.

        Returns:
            Tensor of shape ``(batch_size, trg_len, decoder.output_size)``
            allocated on ``self.device``.
        """
        batch_size, trg_len = trg.shape[0], trg.shape[1]
        step_width = self.decoder.output_size

        # Buffer that collects the prediction of every decoding step.
        outputs = torch.zeros(batch_size, trg_len, step_width, device=self.device)

        # The encoder's final state initialises the decoder.
        hidden, cell = self.encoder(src)

        # Decoding starts from an all-zero "start token".
        decoder_input = torch.zeros(batch_size, 1, step_width, device=self.device)

        for t in range(trg_len):
            step = slice(t, t + 1)
            step_pred, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[:, step, :] = step_pred

            # One random draw per step decides what the next input is.
            if random.random() < teacher_forcing_ratio:
                decoder_input = trg[:, step, :]
            else:
                decoder_input = step_pred

        return outputs

# --- Training and Evaluation Functions ---
def train_epoch(model, dataloader, optimizer, criterion, teacher_forcing_ratio):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(x_batch, y_batch, teacher_forcing_ratio)``.
        dataloader: Iterable of ``(x_batch, y_batch)`` tensor pairs.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss applied to ``(outputs, y_batch)``.
        teacher_forcing_ratio: Forwarded unchanged to the model.

    Returns:
        The unweighted mean of the per-batch losses as a float.

    Raises:
        ValueError: if ``dataloader`` yields no batches (previously this
            surfaced as an opaque ``ZeroDivisionError``).
    """
    # Move batches to wherever the model actually lives; only a model without
    # parameters falls back to the module-level ``device``.
    first_param = next(model.parameters(), None)
    batch_device = first_param.device if first_param is not None else device

    model.train()
    total_loss = 0.0
    num_batches = 0
    for x_batch, y_batch in dataloader:
        x_batch, y_batch = x_batch.to(batch_device), y_batch.to(batch_device)
        optimizer.zero_grad()
        # y_batch shape needs to be (batch_size, seq_len, features)
        outputs = model(x_batch, y_batch, teacher_forcing_ratio)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError('train_epoch received a dataloader that produced no batches.')
    return total_loss / num_batches

def evaluate_epoch(model, dataloader, criterion):
    """Compute the mean batch loss over ``dataloader`` without updating the model.

    The model is put in eval mode, gradients are disabled, and teacher
    forcing is switched off by passing a ratio of ``0``.

    Args:
        model: Module called as ``model(x_batch, y_batch, 0)``.
        dataloader: Iterable of ``(x_batch, y_batch)`` tensor pairs.
        criterion: Loss applied to ``(outputs, y_batch)``.

    Returns:
        The unweighted mean of the per-batch losses as a float.

    Raises:
        ValueError: if ``dataloader`` yields no batches (previously this
            surfaced as an opaque ``ZeroDivisionError``).
    """
    # Move batches to wherever the model actually lives; only a model without
    # parameters falls back to the module-level ``device``.
    first_param = next(model.parameters(), None)
    batch_device = first_param.device if first_param is not None else device

    model.eval()
    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for x_batch, y_batch in dataloader:
            x_batch, y_batch = x_batch.to(batch_device), y_batch.to(batch_device)
            # y_batch only supplies the output shape here: ratio 0 means the
            # decoder always feeds back its own predictions.
            outputs = model(x_batch, y_batch, 0)
            total_loss += criterion(outputs, y_batch).item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError('evaluate_epoch received a dataloader that produced no batches.')
    return total_loss / num_batches

# --- Main Execution Block ---
# --- Main Execution Block ---
# End-to-end pipeline: load the data, split it chronologically, scale it, train
# the seq2seq model with early stopping, then report the accuracy of the final
# forecast step on the test split.
if __name__ == '__main__':
    df = load_and_merge_data()
    # load_and_merge_data() returns None when an input file is missing.
    if df is not None:
        # The number of feature columns must equal INPUT_FEATURES (encoder input size).
        features = ['Chatia', 'Rewaghat', 'Dumariaghat', 'Rainfall']
        target = 'Rewaghat'

        # Chronological split relative to the newest timestamp:
        # final year -> test, the year before it -> validation, everything older -> train.
        train_df_raw = df.loc[df.index < df.index.max() - pd.DateOffset(years=2)]
        val_df_raw = df.loc[(df.index >= df.index.max() - pd.DateOffset(years=2)) & (df.index < df.index.max() - pd.DateOffset(years=1))]
        test_df_raw = df.loc[df.index >= df.index.max() - pd.DateOffset(years=1)]

        # Separate scalers so the target can be inverse-transformed on its own later.
        feature_scaler = MinMaxScaler()
        target_scaler = MinMaxScaler()

        # Both scalers are fitted on the training split only and then reused
        # for validation and test.
        train_features_scaled = feature_scaler.fit_transform(train_df_raw[features])
        train_target_scaled = target_scaler.fit_transform(train_df_raw[[target]])

        val_features_scaled = feature_scaler.transform(val_df_raw[features])
        val_target_scaled = target_scaler.transform(val_df_raw[[target]])

        test_features_scaled = feature_scaler.transform(test_df_raw[features])
        test_target_scaled = target_scaler.transform(test_df_raw[[target]])

        # Each split is windowed independently, so no window spans two splits.
        train_dataset = WaterLevelDataset(train_features_scaled, train_target_scaled, INPUT_LENGTH, PREDICTION_HORIZON)
        val_dataset = WaterLevelDataset(val_features_scaled, val_target_scaled, INPUT_LENGTH, PREDICTION_HORIZON)
        test_dataset = WaterLevelDataset(test_features_scaled, test_target_scaled, INPUT_LENGTH, PREDICTION_HORIZON)

        # Only the training windows are shuffled; validation and test keep their
        # order, which the timestamp alignment further down relies on.
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
        test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

        encoder = Encoder(INPUT_FEATURES, HIDDEN_SIZE, NUM_LAYERS).to(device)
        decoder = Decoder(OUTPUT_FEATURES, HIDDEN_SIZE, NUM_LAYERS).to(device)
        model = Seq2Seq(encoder, decoder, device).to(device)
        
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

        print('\n--- Starting True LSTM Sequence-to-Sequence Model Training ---')
        # Early stopping: checkpoint on every validation improvement and stop
        # after EARLY_STOPPING_PATIENCE consecutive epochs without one.
        best_val_loss = float('inf')
        patience_counter = 0
        for epoch in range(NUM_EPOCHS):
            train_loss = train_epoch(model, train_loader, optimizer, criterion, TEACHER_FORCING_RATIO)
            val_loss = evaluate_epoch(model, val_loader, criterion)
            print(f'Epoch {epoch + 1}/{NUM_EPOCHS}, Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}')
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                torch.save(model.state_dict(), 'best_true_seq2seq_model.pth')
                patience_counter = 0
                print('  -> Best model saved.')
            else:
                patience_counter += 1
            if patience_counter >= EARLY_STOPPING_PATIENCE:
                print('Early stopping triggered.')
                break
        print('--- Training Complete ---')

        print('\n--- Running Final Evaluation on Test Set ---')
        # Evaluate the best checkpoint, not the weights of the last epoch.
        model.load_state_dict(torch.load('best_true_seq2seq_model.pth'))
        
        all_preds_scaled = []
        all_actuals_scaled = []
        model.eval()
        with torch.no_grad():
            for x_batch, y_batch in test_loader:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                outputs = model(x_batch, y_batch, 0) # No teacher forcing
                # extend() adds one (PREDICTION_HORIZON, OUTPUT_FEATURES) array per sample.
                all_preds_scaled.extend(outputs.cpu().numpy())
                all_actuals_scaled.extend(y_batch.cpu().numpy())

        # Select only the 20th hour prediction for evaluation
        # (index -1 is the last step of the horizon, i.e. step PREDICTION_HORIZON).
        preds_scaled_20th_hour = np.array(all_preds_scaled)[:, -1, :].squeeze()
        actuals_scaled_20th_hour = np.array(all_actuals_scaled)[:, -1, :].squeeze()

        # Undo the target scaling so errors are measured in the original units.
        predictions = target_scaler.inverse_transform(preds_scaled_20th_hour.reshape(-1, 1)).flatten()
        actuals = target_scaler.inverse_transform(actuals_scaled_20th_hour.reshape(-1, 1)).flatten()

        # Sample i's last target row is row i + INPUT_LENGTH + PREDICTION_HORIZON - 1
        # of the test split, so the timestamps start at that offset.
        start_index = INPUT_LENGTH + PREDICTION_HORIZON - 1
        test_timestamps = test_df_raw.index[start_index:start_index + len(predictions)]
        results_df = pd.DataFrame({'Timestamp': test_timestamps, 'Actual': actuals, 'Predicted': predictions}).set_index('Timestamp')

        # --- Seasonal Accuracy Calculation ---
        # Months 6-10 count as monsoon; every other month counts as dry season.
        monsoon_months = [6, 7, 8, 9, 10]
        monsoon_df = results_df[results_df.index.month.isin(monsoon_months)]
        dry_df = results_df[~results_df.index.month.isin(monsoon_months)]

        def calculate_custom_accuracy(df):
            """Return the percentage of rows with |Predicted - Actual| <= 0.15, or NaN if ``df`` is empty."""
            if df.empty: return np.nan
            errors = df['Predicted'] - df['Actual']
            # 0.15 is an absolute tolerance in the target's original units
            # (presumably metres of water level - TODO confirm).
            return np.mean(np.abs(errors) <= 0.15) * 100

        accuracy_monsoon = calculate_custom_accuracy(monsoon_df)
        accuracy_full_year = calculate_custom_accuracy(results_df)
        accuracy_dry = calculate_custom_accuracy(dry_df)

        print('\n--- Final Accuracy Results (True Sequence-to-Sequence Model) ---')
        print(f'1. Custom Accuracy for Monsoon Season (June-Oct): {accuracy_monsoon:.2f}%')
        print(f'2. Custom Accuracy for Full Year: {accuracy_full_year:.2f}%')
        print(f'3. Custom Accuracy for Dry Season (Nov-May): {accuracy_dry:.2f}%')

Using device: cuda
All datasets loaded and merged successfully.

--- Starting True LSTM Sequence-to-Sequence Model Training ---
Epoch 1/100, Train Loss: 0.003594, Val Loss: 0.000888
  -> Best model saved.
Epoch 2/100, Train Loss: 0.000431, Val Loss: 0.000787
  -> Best model saved.
Epoch 3/100, Train Loss: 0.000372, Val Loss: 0.000628
  -> Best model saved.
Epoch 4/100, Train Loss: 0.000341, Val Loss: 0.000735
Epoch 5/100, Train Loss: 0.000282, Val Loss: 0.000657
Epoch 6/100, Train Loss: 0.000262, Val Loss: 0.000685
Epoch 7/100, Train Loss: 0.000266, Val Loss: 0.000611
  -> Best model saved.
Epoch 8/100, Train Loss: 0.000227, Val Loss: 0.000740
Epoch 9/100, Train Loss: 0.000228, Val Loss: 0.000643
Epoch 10/100, Train Loss: 0.000199, Val Loss: 0.000707
Epoch 11/100, Train Loss: 0.000189, Val Loss: 0.000667
Epoch 12/100, Train Loss: 0.000170, Val Loss: 0.000568
  -> Best model saved.
Epoch 13/100, Train Loss: 0.000189, Val Loss: 0.000539
  -> Best model saved.
Epoch 14/100, Train Loss: 0.