In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import MinMaxScaler
import random
import warnings

# --- Basic Setup ---
# Silence every warning category for the rest of the process.
# NOTE(review): this blanket filter also hides deprecation and runtime warnings
# raised by pandas, NumPy and PyTorch; consider narrowing it to the specific
# categories that are known to be noise.
warnings.filterwarnings('ignore')

def set_seed(seed_value=42):
    """Seed every random number generator used by this script.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (the default generator, plus every CUDA device when CUDA is available),
    then puts cuDNN into deterministic mode with benchmarking turned off.

    Args:
        seed_value: Seed handed to each generator.
    """
    # Pure-Python and NumPy generators.
    random.seed(seed_value)
    np.random.seed(seed_value)

    # PyTorch generators; the CUDA ones are only touched when a GPU is visible.
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)

    # cuDNN flags: no autotuner, deterministic algorithm selection.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Seed all generators once, before any model or DataLoader is created.
set_seed(42)
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# --- Hyperparameters ---
# Number of consecutive rows fed to the model as one input window.
INPUT_LENGTH = 48
# Number of future target values per sample; also the model's output width.
PREDICTION_HORIZON = 20 # This is now the output sequence length
# Hidden-state size of each LSTM layer.
HIDDEN_SIZE = 64
# Number of stacked LSTM layers.
NUM_LAYERS = 2
# Learning rate passed to the Adam optimizer.
LEARNING_RATE = 0.001
# Batch size shared by the train, validation and test DataLoaders.
BATCH_SIZE = 64
# Upper bound on training epochs (early stopping may end training sooner).
NUM_EPOCHS = 100
# Consecutive epochs without a validation-loss improvement before stopping.
EARLY_STOPPING_PATIENCE = 10

# --- Data Loading ---
def load_and_merge_data():
    """Read the six Excel workbooks and join them into one time-indexed frame.

    The train and test workbooks of Chatia and of Rewaghat are stacked and
    de-duplicated on ``Date``. Those two frames are then inner-joined on
    ``Date`` with the ``Dumariaghat`` column of the Dumariaghat workbook and
    with the rainfall workbook.

    Returns:
        A DataFrame indexed by ``Date`` and sorted by that index, or ``None``
        when any workbook is missing (an error message is printed instead).
    """
    # Order matters only for which missing file is reported first.
    workbook_names = (
        'Chatia_train.xlsx',
        'Rewaghat_train.xlsx',
        'Chatia_test.xlsx',
        'Rewaghat_test.xlsx',
        'rainfall_data.xlsx',
        'Dumariaghat_data.xlsx',
    )
    try:
        frames = [pd.read_excel(name) for name in workbook_names]
    except FileNotFoundError as e:
        print(f'Error: {e}. Please ensure all data files are in the same directory.')
        return None
    chatia_train, rewaghat_train, chatia_test, rewaghat_test, rainfall, dumariaghat = frames

    def _stack_unique(older, newer):
        # Stack both periods and keep the first row seen for each Date value.
        stacked = pd.concat([older, newer])
        return stacked.drop_duplicates(subset=['Date']).reset_index(drop=True)

    chatia = _stack_unique(chatia_train, chatia_test)
    rewaghat = _stack_unique(rewaghat_train, rewaghat_test)

    # Parse the Date column of every frame using the day-month-year layout.
    for frame in (chatia, rewaghat, rainfall, dumariaghat):
        frame['Date'] = pd.to_datetime(frame['Date'], format='%d-%m-%Y %H:%M')
    dumariaghat = dumariaghat.drop_duplicates(subset=['Date']).reset_index(drop=True)

    # Inner joins keep only the timestamps present in every source.
    merged = chatia.merge(rewaghat, on='Date', how='inner')
    merged = merged.merge(dumariaghat[['Date', 'Dumariaghat']], on='Date', how='inner')
    merged = merged.merge(rainfall, on='Date', how='inner')
    merged = merged.set_index('Date').sort_index()

    print('All datasets loaded and merged successfully.')
    return merged

# --- Seq2Seq Data Preparation ---
class WaterLevelDataset(Dataset):
    """Sliding-window dataset pairing an input window with the targets after it.

    Sample ``i`` consists of ``input_length`` consecutive rows of ``data``
    starting at row ``i`` and the ``prediction_horizon`` rows of
    ``target_data`` that immediately follow that window.

    Args:
        data: Sequence sliced along its first axis to build the inputs
            (presumably a ``(time, features)`` array; verify against caller).
        target_data: Sequence sliced along its first axis to build the
            targets; expected to be aligned row-for-row with ``data``.
        input_length: Number of rows in every input window.
        prediction_horizon: Number of target rows returned per sample.
    """

    def __init__(self, data, target_data, input_length, prediction_horizon):
        self.data = data
        self.target_data = target_data
        self.input_length = input_length
        self.prediction_horizon = prediction_horizon

    def __len__(self):
        """Return the number of complete windows (0 if the series is too short)."""
        n_windows = len(self.data) - self.input_length - self.prediction_horizon + 1
        # A series shorter than one full window would give a negative count,
        # which makes len() raise; report an empty dataset instead.
        return max(0, n_windows)

    def __getitem__(self, idx):
        """Return ``(x, y)`` float32 tensors for window ``idx``.

        Negative indices count from the end, as with a list.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_windows = len(self)
        if idx < 0:
            idx += n_windows
        # Without this check an out-of-range index would silently produce
        # truncated or empty slices, and plain iteration would never stop.
        if not 0 <= idx < n_windows:
            raise IndexError(f'window index out of range for dataset of length {n_windows}')

        x = self.data[idx:idx + self.input_length]
        y_start_idx = idx + self.input_length
        y_end_idx = y_start_idx + self.prediction_horizon
        y = self.target_data[y_start_idx:y_end_idx]
        # The target is flattened so a (horizon, 1) slice becomes a (horizon,) vector.
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32).flatten()

# --- Model Definition (LSTM encoder + linear multi-step output head) ---
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer that emits the whole forecast.

    Args:
        input_size: Number of features per time step.
        hidden_size: Hidden-state size of each LSTM layer.
        num_layers: Number of stacked LSTM layers.
        output_size: Width of the output vector produced per sample.
        dropout: Dropout probability passed to ``nn.LSTM``. It is ignored
            (treated as 0) when ``num_layers`` is 1.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.2):
        super().__init__()
        # nn.LSTM applies dropout only between stacked layers; with a single
        # layer a non-zero value does nothing and triggers a UserWarning.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=lstm_dropout)
        # Maps the final hidden output to all `output_size` values at once.
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return a ``(batch, output_size)`` tensor for a batch-first input ``x``."""
        out, _ = self.lstm(x)
        # Only the LSTM output at the last time step feeds the linear head.
        out = self.fc(out[:, -1, :])
        return out

# --- Training Function (Modified for sequence targets) ---
def train_model_loop(model, train_loader, val_loader, criterion, optimizer, num_epochs, patience,
                     checkpoint_path='best_lstm_seq2seq_model.pth'):
    """Train ``model`` with early stopping, checkpointing the best weights.

    Each epoch runs one optimisation pass over ``train_loader`` and one
    evaluation pass over ``val_loader``. The reported losses are the mean of
    the per-batch losses. Whenever the validation loss improves on the best
    seen so far, the model's ``state_dict`` is written to ``checkpoint_path``.

    Args:
        model: Module to optimise; batches are moved to the device of its
            first parameter.
        train_loader: Sized iterable of ``(x_batch, y_batch)`` training batches.
        val_loader: Sized iterable of ``(x_batch, y_batch)`` validation batches.
        criterion: Loss callable invoked as ``criterion(outputs, y_batch)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Maximum number of epochs to run.
        patience: Number of consecutive epochs without validation improvement
            after which training stops early.
        checkpoint_path: Destination for the best ``state_dict``.

    Raises:
        ValueError: If either loader yields no batches.
    """
    # Fail early with a clear message instead of a ZeroDivisionError when
    # averaging the loss over zero batches.
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError('train_loader and val_loader must each contain at least one batch.')

    # Follow the model's own device rather than depending on a module-level
    # global; a parameter-free model falls back to the CPU.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    best_val_loss = float('inf')
    patience_counter = 0
    for epoch in range(num_epochs):
        # --- optimisation pass ---
        model.train()
        train_loss = 0.0
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(run_device), y_batch.to(run_device)
            optimizer.zero_grad()
            outputs = model(x_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= len(train_loader)

        # --- validation pass (no gradients, eval-mode layers) ---
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for x_batch, y_batch in val_loader:
                x_batch, y_batch = x_batch.to(run_device), y_batch.to(run_device)
                outputs = model(x_batch)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item()
        val_loss /= len(val_loader)
        print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}')

        if val_loss < best_val_loss:
            # New best: checkpoint the weights and reset the patience counter.
            best_val_loss = val_loss
            torch.save(model.state_dict(), checkpoint_path)
            patience_counter = 0
            print('  -> Best model saved.')
        else:
            patience_counter += 1
        if patience_counter >= patience:
            print('Early stopping triggered.')
            break

if __name__ == '__main__':
    # End-to-end run: load data, split chronologically, scale, train, then
    # score the last forecast step on the held-out final year.
    df = load_and_merge_data()
    if df is not None:
        target = 'Rewaghat'
        features = ['Chatia', 'Rewaghat', 'Dumariaghat', 'Rainfall']

        # Chronological split: the final year is the test set, the year before
        # it is the validation set, and everything earlier is training data.
        last_timestamp = df.index.max()
        val_start = last_timestamp - pd.DateOffset(years=2)
        test_start = last_timestamp - pd.DateOffset(years=1)
        train_df_raw = df.loc[df.index < val_start]
        val_df_raw = df.loc[(df.index >= val_start) & (df.index < test_start)]
        test_df_raw = df.loc[df.index >= test_start]

        # Both scalers are fitted on the training period only and then reused
        # unchanged for the validation and test periods.
        feature_scaler = MinMaxScaler()
        target_scaler = MinMaxScaler()
        train_features_scaled = feature_scaler.fit_transform(train_df_raw[features])
        train_target_scaled = target_scaler.fit_transform(train_df_raw[[target]])
        val_features_scaled = feature_scaler.transform(val_df_raw[features])
        val_target_scaled = target_scaler.transform(val_df_raw[[target]])
        test_features_scaled = feature_scaler.transform(test_df_raw[features])
        test_target_scaled = target_scaler.transform(test_df_raw[[target]])

        # Same window geometry for all three datasets.
        window_args = (INPUT_LENGTH, PREDICTION_HORIZON)
        train_dataset = WaterLevelDataset(train_features_scaled, train_target_scaled, *window_args)
        val_dataset = WaterLevelDataset(val_features_scaled, val_target_scaled, *window_args)
        test_dataset = WaterLevelDataset(test_features_scaled, test_target_scaled, *window_args)

        # Only the training loader shuffles; evaluation order must stay
        # chronological so predictions can be matched to timestamps below.
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
        test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

        model_kwargs = dict(
            input_size=len(features),
            hidden_size=HIDDEN_SIZE,
            num_layers=NUM_LAYERS,
            output_size=PREDICTION_HORIZON,
        )
        model = LSTMModel(**model_kwargs).to(device)
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

        print('\n--- Starting LSTM Sequence-to-Sequence Model Training ---')
        train_model_loop(model, train_loader, val_loader, criterion, optimizer, NUM_EPOCHS, EARLY_STOPPING_PATIENCE)
        print('--- Training Complete ---')

        print('\n--- Running Final Evaluation on Test Set ---')
        # Rebuild the architecture and restore the best checkpoint written
        # during training rather than using the last-epoch weights.
        best_model = LSTMModel(**model_kwargs).to(device)
        best_model.load_state_dict(torch.load('best_lstm_seq2seq_model.pth'))
        best_model.eval()

        all_preds_scaled_seq = []
        all_actuals_scaled_seq = []
        with torch.no_grad():
            for x_batch, y_batch in test_loader:
                batch_outputs = best_model(x_batch.to(device))
                all_preds_scaled_seq.extend(batch_outputs.cpu().numpy())
                all_actuals_scaled_seq.extend(y_batch.cpu().numpy())

        # Keep only the final step of every predicted / actual sequence.
        preds_scaled_20th_hour = np.array(all_preds_scaled_seq)[:, -1]
        actuals_scaled_20th_hour = np.array(all_actuals_scaled_seq)[:, -1]

        # Undo the target scaling to get back to the original units.
        predictions = target_scaler.inverse_transform(preds_scaled_20th_hour.reshape(-1, 1)).flatten()
        actuals = target_scaler.inverse_transform(actuals_scaled_20th_hour.reshape(-1, 1)).flatten()

        # The last target of window i sits at row i + INPUT_LENGTH + PREDICTION_HORIZON - 1.
        start_index = INPUT_LENGTH + PREDICTION_HORIZON - 1
        test_timestamps = test_df_raw.index[start_index:start_index + len(predictions)]
        results_df = pd.DataFrame({'Timestamp': test_timestamps, 'Actual': actuals, 'Predicted': predictions})
        results_df = results_df.set_index('Timestamp')

        # --- Seasonal Accuracy Calculation ---
        monsoon_months = [6, 7, 8, 9, 10]
        in_monsoon = results_df.index.month.isin(monsoon_months)
        monsoon_df = results_df[in_monsoon]
        dry_df = results_df[~in_monsoon]

        def calculate_custom_accuracy(frame):
            """Return the percentage of rows whose absolute error is at most 0.15."""
            if frame.empty:
                return np.nan
            within_tolerance = (frame['Predicted'] - frame['Actual']).abs() <= 0.15
            return within_tolerance.mean() * 100

        accuracy_monsoon = calculate_custom_accuracy(monsoon_df)
        accuracy_full_year = calculate_custom_accuracy(results_df)
        accuracy_dry = calculate_custom_accuracy(dry_df)

        print('\n--- Final Accuracy Results (Sequence-to-vector stepwise Model) ---')
        print(f'1. Custom Accuracy for Monsoon Season (June-Oct): {accuracy_monsoon:.2f}%')
        print(f'2. Custom Accuracy for Full Year: {accuracy_full_year:.2f}%')
        print(f'3. Custom Accuracy for Dry Season (Nov-May): {accuracy_dry:.2f}%')

Using device: cuda
All datasets loaded and merged successfully.

--- Starting LSTM Sequence-to-Sequence Model Training ---
Epoch 1/100, Train Loss: 0.003574, Val Loss: 0.000729
  -> Best model saved.
Epoch 2/100, Train Loss: 0.000689, Val Loss: 0.000655
  -> Best model saved.
Epoch 3/100, Train Loss: 0.000629, Val Loss: 0.000641
  -> Best model saved.
Epoch 4/100, Train Loss: 0.000565, Val Loss: 0.000587
  -> Best model saved.
Epoch 5/100, Train Loss: 0.000516, Val Loss: 0.000743
Epoch 6/100, Train Loss: 0.000463, Val Loss: 0.000539
  -> Best model saved.
Epoch 7/100, Train Loss: 0.000436, Val Loss: 0.000507
  -> Best model saved.
Epoch 8/100, Train Loss: 0.000418, Val Loss: 0.000503
  -> Best model saved.
Epoch 9/100, Train Loss: 0.000406, Val Loss: 0.000499
  -> Best model saved.
Epoch 10/100, Train Loss: 0.000385, Val Loss: 0.000461
  -> Best model saved.
Epoch 11/100, Train Loss: 0.000372, Val Loss: 0.000496
Epoch 12/100, Train Loss: 0.000363, Val Loss: 0.000493
Epoch 13/100, Train