In [50]:
"""
================================================================================
DEEP NEURAL NETWORKS - ASSIGNMENT 3: RNN vs TRANSFORMER FOR TIME SERIES
Recurrent Neural Networks vs Transformers for Time Series Prediction
================================================================================
"""



In [51]:
"""
================================================================================
STUDENT INFORMATION (REQUIRED - DO NOT DELETE)
================================================================================

BITS ID: 2025AA05688
Name: Ramya D
Email: 2025aa05688@wilp.bits-pilani.ac.in
Date: 06-02-2026

================================================================================
"""



In [52]:
"""
================================================================================
ASSIGNMENT OVERVIEW
================================================================================

This assignment requires you to implement and compare two approaches for
time series forecasting:
1. LSTM or GRU using Keras/PyTorch
2. Transformer encoder using Keras/PyTorch layers

Learning Objectives:
- Build recurrent neural networks for sequential data
- Use transformer architecture for time series
- Implement or integrate positional encoding
- Compare RNN vs Transformer architectures
- Understand time series preprocessing and evaluation

IMPORTANT:
- Positional encoding MUST be added to transformer
- Use torch.nn.TransformerEncoder or keras.layers.MultiHeadAttention
- DO NOT use pre-trained transformers (HuggingFace, TimeGPT, etc.)
- Use temporal train/test split (NO shuffling)

================================================================================
"""



In [53]:
"""
================================================================================
‚ö†Ô∏è IMPORTANT SUBMISSION REQUIREMENTS - STRICTLY ENFORCED ‚ö†Ô∏è
================================================================================

1. FILENAME FORMAT: <BITS_ID>_rnn_assignment.ipynb
   Example: 2025AA05036_rnn_assignment.ipynb
   ‚ùå Wrong filename = Automatic 0 marks

2. STUDENT INFORMATION MUST MATCH:
   ‚úì BITS ID in filename = BITS ID in notebook (above)
   ‚úì Name in folder = Name in notebook (above)
   ‚ùå Mismatch = 0 marks

3. EXECUTE ALL CELLS BEFORE SUBMISSION:
   - Run: Kernel ‚Üí Restart & Run All
   - Verify all outputs are visible
   ‚ùå No outputs = 0 marks

4. FILE INTEGRITY:
   - Ensure notebook opens without errors
   - Check for corrupted cells
   ‚ùå Corrupted file = 0 marks

5. IMPLEMENTATION REQUIREMENTS:
   - MUST add positional encoding to transformer (custom or built-in)
   - CAN use torch.nn.TransformerEncoder or keras.layers.MultiHeadAttention
   - DO NOT use pre-trained transformers (HuggingFace, TimeGPT, etc.)
   - DO NOT shuffle time series data (temporal order required)
   ‚ùå Missing positional encoding = 0 marks for transformer section

6. DATASET REQUIREMENTS:
   - Minimum 1000 time steps
   - Train/test split: 90/10 OR 85/15 (temporal split only)
   - Sequence length: 10-50 time steps
   - Prediction horizon: 1-10 time steps

7. USE KERAS OR PYTORCH:
   - Use framework's LSTM/GRU layers
   - Use torch.nn.TransformerEncoder or keras.layers.MultiHeadAttention
   - Add positional encoding (custom implementation or built-in)
   - Use standard training methods

8. FILE SUBMISSION:
   - Submit ONLY the .ipynb file
   - NO zip files, NO separate data files, NO separate image files
   - All code and outputs must be in the notebook
   - Only one submission attempt allowed

================================================================================
"""



In [54]:
# ================================
# Import Required Libraries
# ================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import time
import json
import os
import math

# ================================
# PyTorch Imports (LSTM & Transformer)
# ================================

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# ================================
# Reproducibility
# ================================

torch.manual_seed(42)
np.random.seed(42)

if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

# ================================
# Device configuration
# ================================

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cpu


Deep learning frameworks (choose Keras or PyTorch)

In [55]:
"""
================================================================================
PART 1: DATASET LOADING AND EXPLORATION (Informational)
================================================================================

Instructions:
1. Choose ONE dataset from the allowed list
2. Load and explore the time series data
3. Fill in ALL required metadata fields below
4. Provide justification for your primary metric choice

ALLOWED DATASETS:
- Stock Prices (daily/hourly closing prices)
- Weather Data (temperature, humidity, pressure)
- Energy Consumption (electricity/power usage)
- Sensor Data (IoT sensor readings)
- Custom time series (with approval)

REQUIRED OUTPUT:
- Print all metadata fields
- Time series plots
- Stationarity analysis
- Train/test split visualization
================================================================================
"""



1.1 Dataset Selection and Loading
TODO: Load your chosen time series dataset

In [56]:
import kagglehub

# Download dataset
dataset_path = kagglehub.dataset_download("bhanupratapbiswas/weather-data")
print("Dataset downloaded to:", dataset_path)

Using Colab cache for faster access to the 'weather-data' dataset.
Dataset downloaded to: /kaggle/input/weather-data


In [57]:
# =============================================================================
# REQUIRED: Fill in these metadata fields
# =============================================================================
dataset_name = "Weather Data Dataset"
dataset_source = "Kaggle: bhanupratapbiswas/weather-data"
# NOTE: Exact number may change slightly after dropping NaNs
n_samples = 8784          # hourly data for 1 year (non-leap year)
n_features = 10           # 6 weather vars + hour_sin + hour_cos + dow_sin + dow_cos
sequence_length = 30      # Lookback window (30 hours)
prediction_horizon = 1    # Forecast 1 step ahead
problem_type = "time_series_forecasting"


In [58]:
primary_metric = "RMSE"

metric_justification = """
Root Mean Squared Error (RMSE) is selected because it penalizes larger errors
more heavily, making it suitable for evaluating forecasting models where
large deviations are undesirable in weather prediction tasks.
"""


In [59]:
print("\n" + "="*70)
print("DATASET INFORMATION")
print("="*70)
print(f"Dataset: {dataset_name}")
print(f"Source: {dataset_source}")
print(f"Total Samples: {n_samples}")
print(f"Number of Features: {n_features}")
print(f"Sequence Length: {sequence_length}")
print(f"Prediction Horizon: {prediction_horizon}")
print(f"Primary Metric: {primary_metric}")
print(f"Metric Justification: {metric_justification}")
print("="*70)


DATASET INFORMATION
Dataset: Weather Data Dataset
Source: Kaggle: bhanupratapbiswas/weather-data
Total Samples: 8784
Number of Features: 10
Sequence Length: 30
Prediction Horizon: 1
Primary Metric: RMSE
Metric Justification: 
Root Mean Squared Error (RMSE) is selected because it penalizes larger errors
more heavily, making it suitable for evaluating forecasting models where
large deviations are undesirable in weather prediction tasks.



In [60]:
# =============================================================================
# Load Weather Dataset (Kaggle / Colab / Local compatible)
# =============================================================================

import pandas as pd
import glob
import os
import numpy as np

# --------------------------------------------------
# Locate CSV file
# --------------------------------------------------
csv_files = glob.glob(os.path.join(dataset_path, "*.csv"))
assert len(csv_files) > 0, "No CSV files found in dataset directory!"

csv_path = csv_files[0]
print("Using file:", csv_path)

# --------------------------------------------------
# Load dataset
# --------------------------------------------------
df = pd.read_csv(csv_path).copy()

print("Raw dataset shape:", df.shape)
print(df.head())

# --------------------------------------------------
# Parse datetime column
# --------------------------------------------------
if "Formatted Date" in df.columns:
    df["Formatted Date"] = pd.to_datetime(df["Formatted Date"], utc=True)
    df = df.sort_values("Formatted Date")
    df.set_index("Formatted Date", inplace=True)

elif "Date/Time" in df.columns:
    df["Date/Time"] = pd.to_datetime(df["Date/Time"])
    df = df.sort_values("Date/Time")
    df.set_index("Date/Time", inplace=True)

else:
    raise ValueError("No recognizable datetime column found!")

print("After datetime parsing:", df.shape)

# --------------------------------------------------
# Time-based feature engineering (CYCLIC ENCODING)
# --------------------------------------------------
# Hour of day
df["hour"] = df.index.hour
df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)

# Day of year (cyclic)
days_in_year = 366 if df.index.is_leap_year.any() else 365
df["doy_sin"] = np.sin(2 * np.pi * df["dayofyear"] / days_in_year)
df["doy_cos"] = np.cos(2 * np.pi * df["dayofyear"] / days_in_year)

# --------------------------------------------------
# Select numeric features for forecasting
# --------------------------------------------------
FEATURE_COLUMNS = [
    "Temp_C",               # Target
    "Rel Hum_%",            # Humidity
    "Wind Speed_km/h",      # Wind speed
    "Visibility_km",        # Visibility
    "Press_kPa",            # Pressure
    "hour_sin",             # Cyclic hour
    "hour_cos",
    "doy_sin",              # Cyclic day of year
    "doy_cos"
]

# Keep only required columns
data = df[FEATURE_COLUMNS].copy()

# Handle missing values (time-series safe)
data = data.ffill().bfill()

# --------------------------------------------------
# Update metadata (AUTOGRADER-CRITICAL)
# --------------------------------------------------
n_samples = data.shape[0]
n_features = data.shape[1]   # MUST be 9

print("Selected data shape:", data.shape)
print(data.head())

print("\n================ PREPROCESSING CHECK =================")
print("Missing values per column:\n", data.isna().sum())
print("Final data shape:", data.shape)


Using file: /kaggle/input/weather-data/Weather Data.csv
Raw dataset shape: (8784, 8)
       Date/Time  Temp_C  Dew Point Temp_C  Rel Hum_%  Wind Speed_km/h  \
0  1/1/2012 0:00    -1.8              -3.9         86                4   
1  1/1/2012 1:00    -1.8              -3.7         87                4   
2  1/1/2012 2:00    -1.8              -3.4         89                7   
3  1/1/2012 3:00    -1.5              -3.2         88                6   
4  1/1/2012 4:00    -1.5              -3.3         88                7   

   Visibility_km  Press_kPa               Weather  
0            8.0     101.24                   Fog  
1            8.0     101.24                   Fog  
2            4.0     101.26  Freezing Drizzle,Fog  
3            4.0     101.27  Freezing Drizzle,Fog  
4            4.8     101.23                   Fog  
After datetime parsing: (8784, 7)


KeyError: 'dayofyear'

1.2 Time Series Exploration
TODO: Plot time series data
TODO: Check for trends, seasonality
TODO: Perform stationarity tests (optional but recommended)

In [None]:
# =============================================================================
# 1.2 Time Series Exploration
# =============================================================================

import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller

# --------------------------------------------------
# 1. Plot time series data
# --------------------------------------------------
plt.figure(figsize=(14, 6))
plt.plot(data.index, data["Temp_C"], label="Temperature (C)")
plt.title("Time Series of Temperature")
plt.xlabel("Date")
plt.ylabel("Temperature (C)")
plt.legend()
plt.show()

# Optional: plot other features
data[["Rel Hum_%", "Wind Speed_km/h", "Visibility_km", "Press_kPa"]].plot(
    figsize=(14, 6), subplots=True, layout=(2,2), title="Weather Features"
)
plt.tight_layout()
plt.show()

# --------------------------------------------------
# 2. Decompose time series (trend, seasonality, residual)
# --------------------------------------------------
# Using additive model (suitable if seasonal variations are roughly constant)
decompose_result = seasonal_decompose(data["Temp_C"], model='additive', period=24)  # period=24 if hourly, adjust if daily

plt.figure(figsize=(12, 9))
decompose_result.plot()
plt.suptitle("Seasonal Decomposition of Temperature", fontsize=16)
plt.show()

# --------------------------------------------------
# 3. Stationarity test (Augmented Dickey-Fuller)
# --------------------------------------------------
adf_result = adfuller(data["Temp_C"].dropna())
print("ADF Statistic:", adf_result[0])
print("p-value:", adf_result[1])
for key, value in adf_result[4].items():
    print("Critical Value ({}): {:.3f}".format(key, value))

if adf_result[1] < 0.05:
    print("=> The series is likely stationary.")
else:
    print("=> The series is likely non-stationary, differencing may be needed.")


1.3 Data Preprocessing

In [None]:
def preprocess_timeseries(data):
    """
    Preprocess time series data

    Args:
        data: raw time series data (pandas DataFrame)

    Returns:
        preprocessed data (numpy array), scaler
    """

    import numpy as np
    from sklearn.preprocessing import MinMaxScaler

    # --------------------------------------------------
    # Handle missing values (time-series safe)
    # --------------------------------------------------
    data = data.copy()
    data = data.fillna(method="ffill").fillna(method="bfill")

    # --------------------------------------------------
    # Normalize data
    # --------------------------------------------------
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(data.values)

    return scaled_data, scaler


TODO: Preprocess data
TODO: Create sequences

In [None]:
# =============================================================================
# 1.4 Sequence Generation
# =============================================================================

def create_sequences(data, seq_length, pred_horizon):
    """
    Create sequences for time series prediction

    Args:
        data (np.ndarray): normalized data (n_samples, n_features)
        seq_length (int): lookback window
        pred_horizon (int): forecast horizon

    Returns:
        X (np.ndarray): (samples, seq_length, n_features)
        y (np.ndarray): (samples, 1) or (samples, horizon)
    """

    assert isinstance(data, np.ndarray), "Data must be NumPy array"
    assert data.ndim == 2, "Data must be 2D"
    assert seq_length > 0 and pred_horizon > 0

    X, y = [], []

    for i in range(len(data) - seq_length - pred_horizon + 1):

        # Input window
        X.append(data[i : i + seq_length])

        # Target: Temperature only (first column)
        y.append(
            data[i + seq_length : i + seq_length + pred_horizon, 0]
        )

    X = np.array(X, dtype=np.float32)
    y = np.array(y, dtype=np.float32)

    # Shape normalization for single-step forecasting
    if pred_horizon == 1:
        y = y.reshape(-1, 1)

    print("\n================ SEQUENCE GENERATION =================")
    print("X shape:", X.shape)
    print("y shape:", y.shape)

    return X, y


In [None]:
# =============================================================================
# Feature Selection
# =============================================================================

feature_cols = [
    "Temp_C",            # Target (kept for scaling + sequence creation)
    "Rel Hum_%",
    "Wind Speed_km/h",
    "Visibility_km",
    "Press_kPa",
    "hour_sin",
    "hour_cos",
    "doy_sin",
    "doy_cos"
]

scaled_data, scaler = preprocess_timeseries(
    data=df,
    feature_cols=feature_cols
)

X, y = create_sequences(
    data=scaled_data,
    seq_length=sequence_length,
    pred_horizon=prediction_horizon
)


In [None]:
# =============================================================================
# 1.5 Train / Test Split (Temporal ‚Äì NO SHUFFLING)
# =============================================================================

train_test_ratio = 0.9  # 90% train, 10% test

n_total = X.shape[0]
train_samples = int(n_total * train_test_ratio)
test_samples = n_total - train_samples

# Temporal split (VERY IMPORTANT)
X_train = X[:train_samples]
y_train = y[:train_samples]

X_test = X[train_samples:]
y_test = y[train_samples:]

# Sanity checks
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]

In [None]:
print(f"\nTrain/Test Split: {train_test_ratio}")
print(f"Training Samples: {train_samples}")
print(f"Test Samples: {test_samples}")
print("‚ö†Ô∏è  IMPORTANT: Temporal split used (NO shuffling)")

In [None]:
"""
================================================================================
PART 2: LSTM/GRU IMPLEMENTATION (5 MARKS)
================================================================================

REQUIREMENTS:
- Build LSTM OR GRU using Keras/PyTorch layers
- Architecture must include:
  * At least 2 stacked recurrent layers
  * Output layer for prediction
- Use model.compile() and model.fit() (Keras) OR standard PyTorch training
- Track initial_loss and final_loss

GRADING:
- LSTM/GRU architecture with stacked layers: 2 marks
- Model properly compiled/configured: 1 mark
- Training completed with loss tracking: 1 mark
- All metrics calculated correctly: 1 mark
================================================================================
"""

2.1 LSTM/GRU Architecture Design
TODO: Choose LSTM or GRU
TODO: Design architecture with stacked layers

In [None]:
# =============================================================================
# 2.1 LSTM/GRU Architecture Design
# =============================================================================
# - Model Type: LSTM or GRU
# - Stacked recurrent layers (n_layers >= 2)
# - Output layer for forecasting
# =============================================================================

import torch
import torch.nn as nn

def build_rnn_model(model_type, input_shape, hidden_units, n_layers, output_size):
    """
    Build LSTM or GRU model
    """

    # ------------------------------------------------------------------
    # Assertions (ENFORCED by assignment)
    # ------------------------------------------------------------------
    assert model_type in ["LSTM", "GRU"], "model_type must be 'LSTM' or 'GRU'"
    assert n_layers >= 2, "n_layers must be at least 2 (stacked layers REQUIRED)"

    _, n_features = input_shape

    # ------------------------------------------------------------------
    # RNN Model Definition
    # ------------------------------------------------------------------
    class RNNModel(nn.Module):
        def __init__(self):
            super(RNNModel, self).__init__()

            if model_type == "LSTM":
                self.rnn = nn.LSTM(
                    input_size=n_features,
                    hidden_size=hidden_units,
                    num_layers=n_layers,
                    batch_first=True
                )
            else:  # GRU
                self.rnn = nn.GRU(
                    input_size=n_features,
                    hidden_size=hidden_units,
                    num_layers=n_layers,
                    batch_first=True
                )

            # Output layer
            self.fc = nn.Linear(hidden_units, output_size)

        def forward(self, x):
            out, _ = self.rnn(x)
            out = out[:, -1, :]   # last time step
            out = self.fc(out)
            return out

    model = RNNModel()
    return model


TODO: Create RNN model
rnn_model = build_rnn_model('LSTM', (sequence_length, n_features), 64, 2, prediction_horizon)

In [None]:
# =============================================================================
# Create RNN Model
# =============================================================================

rnn_model = build_rnn_model(
    model_type="LSTM",
    input_shape=(sequence_length, n_features),
    hidden_units=64,
    n_layers=2,
    output_size=prediction_horizon
)

rnn_model = rnn_model.to(device)

print(rnn_model)


TODO: Compile model
For Keras: model.compile(optimizer='adam', loss='mse', metrics=['mae'])
For PyTorch: define optimizer and loss function

In [None]:
criterion = nn.MSELoss()
optimizer = optim.Adam(
    rnn_model.parameters(),
    lr=0.001
)


2.2 Train RNN Model

In [None]:
print("\n" + "="*70)
print("RNN MODEL TRAINING")
print("="*70)

In [None]:
# Track training time
rnn_start_time = time.time()

TODO: Train your model
For Keras: history = rnn_model.fit(X_train, y_train, epochs=50, batch_size=32)
For PyTorch: write training loop

In [None]:
from torch.utils.data import DataLoader, TensorDataset
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# -------------------------------
# Move model to device
# -------------------------------
rnn_model = rnn_model.to(device)

batch_size = 64

train_dataset = TensorDataset(X_train_t, y_train_t)
train_loader  = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

epochs = 25
loss_history = []

rnn_model.train()

for epoch in range(epochs):
    epoch_loss = 0.0

    for xb, yb in train_loader:

        xb = xb.to(device)
        yb = yb.to(device)

        optimizer.zero_grad()

        preds = rnn_model(xb)
        loss = criterion(preds, yb)

        loss.backward()

        # RNN gradient safety
        torch.nn.utils.clip_grad_norm_(rnn_model.parameters(), 1.0)

        optimizer.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(train_loader)
    loss_history.append(avg_loss)

    print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.6f}")


In [None]:
rnn_training_time = time.time() - rnn_start_time

In [None]:
# REQUIRED: Track initial and final loss
rnn_initial_loss = loss_history[0]
rnn_final_loss   = loss_history[-1]


In [None]:
print(f"Training completed in {rnn_training_time:.2f} seconds")
print(f"Initial Loss: {rnn_initial_loss:.4f}")
print(f"Final Loss: {rnn_final_loss:.4f}")
print("="*70)

# Alias for later comparison plots
rnn_loss_history = loss_history

2.3 Evaluate RNN Model

TODO: Make predictions on test set
TODO: Inverse transform if data was normalized
TODO: Calculate all 4 required metrics

In [None]:
# =============================================================================
# 2.3 Evaluate RNN Model
# =============================================================================

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import torch

# -------------------------------
# Model inference
# -------------------------------
rnn_model.eval()

y_preds = []
y_trues = []

with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(device)
        yb = yb.to(device)

        preds = rnn_model(xb)

        y_preds.append(preds.cpu().numpy())
        y_trues.append(yb.cpu().numpy())

y_pred_scaled = np.vstack(y_preds)
y_true_scaled = np.vstack(y_trues)


if prediction_horizon > 1:
    y_pred_scaled = y_pred_scaled[:, 0]
    y_true_scaled = y_true_scaled[:, 0]
else:
    y_pred_scaled = y_pred_scaled.reshape(-1)
    y_true_scaled = y_true_scaled.reshape(-1)

n_features = scaled_data.shape[1]

y_pred_dummy = np.zeros((len(y_pred_scaled), n_features))
y_true_dummy = np.zeros((len(y_true_scaled), n_features))

y_pred_dummy[:, 0] = y_pred_scaled
y_true_dummy[:, 0] = y_true_scaled

y_pred = scaler.inverse_transform(y_pred_dummy)[:, 0]
y_true = scaler.inverse_transform(y_true_dummy)[:, 0]

rnn_y_true = y_true
rnn_y_pred = y_pred


In [None]:
def calculate_mape(y_true, y_pred):
    epsilon = 1e-8
    return np.mean(np.abs((y_true - y_pred) / (y_true + epsilon))) * 100


In [None]:
# REQUIRED: Calculate all 4 metrics
rnn_mae  = mean_absolute_error(y_true, y_pred)
rnn_rmse = np.sqrt(mean_squared_error(y_true, y_pred))
rnn_mape = calculate_mape(y_true, y_pred)
rnn_r2   = r2_score(y_true, y_pred)

In [None]:
print("\nRNN Model Performance:")
print(f"MAE:   {rnn_mae:.4f}")
print(f"RMSE:  {rnn_rmse:.4f}")
print(f"MAPE:  {rnn_mape:.4f}%")
print(f"R¬≤ Score: {rnn_r2:.4f}")

2.4 Visualize RNN Results
TODO: Plot training loss curve
TODO: Plot actual vs predicted values
TODO: Plot residuals

In [None]:
# =============================================================================
# 2.4 Visualize RNN Results
# =============================================================================

import matplotlib.pyplot as plt

# ------------------------------------------------------------------
# Plot training loss curve
# ------------------------------------------------------------------
plt.figure(figsize=(8, 5))
plt.plot(loss_history, marker="o", linewidth=2)
plt.title("RNN Training Loss Curve")
plt.xlabel("Epoch")
plt.ylabel("MSE Loss")
plt.grid(True)
plt.show()


# ------------------------------------------------------------------
# Plot Actual vs Predicted values
# ------------------------------------------------------------------
n_plot = 500

plt.figure(figsize=(14, 5))
plt.plot(y_true[:n_plot], label="Actual", linewidth=2)
plt.plot(y_pred[:n_plot], label="Predicted", alpha=0.8)
plt.title("Actual vs Predicted Temperature (RNN)")
plt.xlabel("Time Step")
plt.ylabel("Temperature (¬∞C)")
plt.legend()
plt.grid(True)
plt.show()


# ------------------------------------------------------------------
# Plot Residuals
# ------------------------------------------------------------------
residuals = y_true[:n_plot] - y_pred[:n_plot]

plt.figure(figsize=(14, 5))
plt.plot(residuals, color="red", alpha=0.8)
plt.axhline(0, linestyle="--", color="black")
plt.title("Residuals (Actual ‚àí Predicted)")
plt.xlabel("Time Step")
plt.ylabel("Prediction Error (¬∞C)")
plt.grid(True)
plt.show()


In [None]:
"""
================================================================================
PART 3: TRANSFORMER IMPLEMENTATION (5 MARKS)
================================================================================

REQUIREMENTS:
- Build Transformer encoder using Keras/PyTorch layers
- MUST add positional encoding to input:
  * Custom sinusoidal implementation OR
  * Use built-in positional encoding (if framework provides)
- Use torch.nn.TransformerEncoder or keras.layers.MultiHeadAttention
- Use standard training methods
- Track initial_loss and final_loss

PROHIBITED:
- Using pre-trained transformers (HuggingFace, TimeGPT, etc.)
- Skipping positional encoding entirely

GRADING:
- Positional encoding added: 1 mark
- Transformer architecture properly configured: 2 marks
- Training completed with loss tracking: 1 mark
- All metrics calculated correctly: 1 mark
================================================================================
"""

3.1 Positional Encoding Implementation

In [None]:
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)

        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float32)
            * (-math.log(10000.0) / d_model)
        )

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        self.register_buffer("pe", pe)

    def forward(self, x):
        # x: (batch_size, seq_len, d_model)
        return x + self.pe[:, :x.size(1), :]


3.2 Transformer Encoder Architecture

In [None]:
class TransformerModel(nn.Module):
    def __init__(self, n_features, d_model, n_heads, n_layers, d_ff, output_size):
        super().__init__()

        self.input_projection = nn.Linear(n_features, d_model)
        self.pos_encoder = PositionalEncoding(d_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_ff,
            batch_first=True
        )

        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=n_layers
        )

        self.fc = nn.Linear(d_model, output_size)

    def forward(self, x):
        # x: (batch, seq_len, n_features)
        x = self.input_projection(x)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        x = x.mean(dim=1)  # Global average pooling
        return self.fc(x)


3.3 Build Your Transformer Model

TODO: Create Transformer model using PyTorch or Keras
Example for PyTorch:
transformer_model = TransformerModel(n_features, d_model=64, n_heads=4, n_layers=2, d_ff=256, output_size=prediction_horizon)
Example for Keras:
transformer_model = build_transformer_model(sequence_length, n_features, d_model=64, n_heads=4, n_layers=2, d_ff=256, output_size=prediction_horizon)

In [None]:
# =============================================================================
# 3.3 Build Transformer Model
# =============================================================================

import torch
import torch.nn as nn

# Create Transformer model
transformer_model = TransformerModel(
    n_features=n_features,
    d_model=64,
    n_heads=4,
    n_layers=2,
    d_ff=256,
    output_size=prediction_horizon
)

# Move model to device
transformer_model = transformer_model.to(device)


TODO: Define optimizer and loss
For PyTorch: optimizer = torch.optim.Adam(transformer_model.parameters(), lr=0.001); criterion = nn.MSELoss()
For Keras: model.compile(optimizer='adam', loss='mse', metrics=['mae'])
For PyTorch: define optimizer and loss function

In [None]:
# Define optimizer and loss function
optimizer = torch.optim.Adam(transformer_model.parameters(), lr=0.001)
criterion = nn.MSELoss()


3.4 Train Transformer Model

In [None]:
print("\n" + "="*70)
print("TRANSFORMER MODEL TRAINING")
print("="*70)

In [None]:
# Track training time
transformer_start_time = time.time()

TODO: Train your model
For Keras: history = transformer_model.fit(X_train, y_train, epochs=50, batch_size=32)
For PyTorch: write training loop

In [None]:
from torch.utils.data import DataLoader, TensorDataset

batch_size = 64
epochs = 25

train_dataset = TensorDataset(X_train_t, y_train_t)
train_loader  = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

transformer_loss_history = []

transformer_model.train()

for epoch in range(epochs):
    epoch_loss = 0.0

    for xb, yb in train_loader:

        xb = xb.to(device)
        yb = yb.to(device)

        optimizer.zero_grad()

        preds = transformer_model(xb)
        loss = criterion(preds, yb)

        loss.backward()

        # üîí Safety: gradient clipping (good practice for transformers too)
        torch.nn.utils.clip_grad_norm_(transformer_model.parameters(), 1.0)

        optimizer.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(train_loader)
    transformer_loss_history.append(avg_loss)

    print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.6f}")


In [None]:
transformer_training_time = time.time() - transformer_start_time

In [None]:
# REQUIRED: Track initial and final loss
transformer_initial_loss = transformer_loss_history[0]
transformer_final_loss   = transformer_loss_history[-1]

In [None]:
print(f"Training completed in {transformer_training_time:.2f} seconds")
print(f"Initial Loss: {transformer_initial_loss:.4f}")
print(f"Final Loss: {transformer_final_loss:.4f}")
print("="*70)

3.5 Evaluate Transformer Model

TODO: Make predictions on test set
TODO: Inverse transform if data was normalized
TODO: Calculate all 4 required metrics

In [None]:
# =============================================================================
# 3.5 Evaluate Transformer Model
# =============================================================================

transformer_model.eval()

y_preds = []
y_trues = []

with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(device)
        yb = yb.to(device)

        preds = transformer_model(xb)

        y_preds.append(preds.cpu().numpy())
        y_trues.append(yb.cpu().numpy())

y_pred_scaled = np.vstack(y_preds)
y_true_scaled = np.vstack(y_trues)

if prediction_horizon > 1:
    y_pred_scaled = y_pred_scaled[:, 0]
    y_true_scaled = y_true_scaled[:, 0]
else:
    y_pred_scaled = y_pred_scaled.reshape(-1)
    y_true_scaled = y_true_scaled.reshape(-1)

n_features = scaled_data.shape[1]

y_pred_dummy = np.zeros((len(y_pred_scaled), n_features))
y_true_dummy = np.zeros((len(y_true_scaled), n_features))

y_pred_dummy[:, 0] = y_pred_scaled
y_true_dummy[:, 0] = y_true_scaled

y_pred = scaler.inverse_transform(y_pred_dummy)[:, 0]
y_true = scaler.inverse_transform(y_true_dummy)[:, 0]

transformer_y_true = y_true
transformer_y_pred = y_pred



In [None]:
# REQUIRED: Calculate all 4 metrics
transformer_mae  = mean_absolute_error(y_true, y_pred)
transformer_rmse = np.sqrt(mean_squared_error(y_true, y_pred))
transformer_mape = calculate_mape(y_true, y_pred)
transformer_r2   = r2_score(y_true, y_pred)

In [None]:
print("\nTransformer Model Performance:")
print(f"MAE:   {transformer_mae:.4f}")
print(f"RMSE:  {transformer_rmse:.4f}")
print(f"MAPE:  {transformer_mape:.4f}%")
print(f"R¬≤ Score: {transformer_r2:.4f}")

3.6 Visualize Transformer Results
TODO: Plot training loss curve
TODO: Plot actual vs predicted values
TODO: Plot attention weights (optional but informative)

In [None]:
# =============================================================================
# 3.6 Visualize Transformer Results
# =============================================================================

import matplotlib.pyplot as plt

# ------------------------------------------------------------------
# Plot training loss curve
# ------------------------------------------------------------------
plt.figure(figsize=(8, 5))
plt.plot(transformer_loss_history, marker="o", linewidth=2)
plt.title("Transformer Training Loss Curve")
plt.xlabel("Epoch")
plt.ylabel("MSE Loss")
plt.grid(True)
plt.show()

# ------------------------------------------------------------------
# Plot Actual vs Predicted values
# ------------------------------------------------------------------
n_plot = 500

plt.figure(figsize=(14, 5))
plt.plot(y_true[:n_plot], label="Actual", linewidth=2)
plt.plot(y_pred[:n_plot], label="Predicted", alpha=0.8)
plt.title("Actual vs Predicted Temperature (Transformer)")
plt.xlabel("Time Step")
plt.ylabel("Temperature (¬∞C)")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
"""
================================================================================
PART 4: MODEL COMPARISON AND VISUALIZATION (Informational)
================================================================================

Compare both models on:
- Performance metrics
- Training time
- Model complexity
- Convergence behavior
- Ability to capture long-term dependencies
================================================================================
"""

4.1 Metrics Comparison

In [None]:
print("\n" + "="*70)
print("MODEL COMPARISON")
print("="*70)

In [None]:
comparison_df = pd.DataFrame({
    'Metric': ['MAE', 'RMSE', 'MAPE (%)', 'R¬≤ Score', 'Training Time (s)', 'Parameters'],
    'RNN (LSTM/GRU)': [
        rnn_mae,
        rnn_rmse,
        rnn_mape,
        rnn_r2,
        rnn_training_time,
        sum(p.numel() for p in rnn_model.parameters() if p.requires_grad)
    ],
    'Transformer': [
        transformer_mae,
        transformer_rmse,
        transformer_mape,
        transformer_r2,
        transformer_training_time,
        sum(p.numel() for p in transformer_model.parameters() if p.requires_grad)
    ]
})

In [None]:
print(comparison_df.to_string(index=False))
print("="*70)

4.2 Visual Comparison
TODO: Create bar plot comparing metrics
TODO: Plot predictions comparison (both models vs actual)
TODO: Plot training curves comparison

In [None]:
import numpy as np
import matplotlib.pyplot as plt

metrics = ['MAE', 'RMSE', 'MAPE (%)', 'R¬≤ Score']

rnn_metrics = [
    rnn_mae,
    rnn_rmse,
    rnn_mape,
    rnn_r2
]

transformer_metrics = [
    transformer_mae,
    transformer_rmse,
    transformer_mape,
    transformer_r2
]

x = np.arange(len(metrics))
width = 0.35

plt.figure(figsize=(10, 5))
plt.bar(x - width/2, rnn_metrics, width, label='RNN')
plt.bar(x + width/2, transformer_metrics, width, label='Transformer')

plt.xlabel('Metrics')
plt.ylabel('Score')
plt.title('Model Performance Comparison')
plt.xticks(x, metrics)
plt.legend()
plt.grid(axis='y')
plt.show()


n_plot = 500  # number of time steps to visualize

plt.figure(figsize=(14, 5))
plt.plot(y_true[:n_plot], label='Actual', linewidth=2)
plt.plot(rnn_y_pred[:n_plot], label='RNN Prediction', alpha=0.8)
plt.plot(transformer_y_pred[:n_plot], label='Transformer Prediction', alpha=0.8)

plt.title('Actual vs Predicted Temperature')
plt.xlabel('Time Step')
plt.ylabel('Temperature (¬∞C)')
plt.legend()
plt.grid(True)
plt.show()

plt.figure(figsize=(8, 5))
plt.plot(rnn_loss_history, label='RNN Training Loss', linewidth=2)
plt.plot(transformer_loss_history, label='Transformer Training Loss', linewidth=2)

plt.title('Training Loss Comparison')
plt.xlabel('Epoch')
plt.ylabel('MSE Loss')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
"""
================================================================================
PART 5: ANALYSIS (2 MARKS)
================================================================================

REQUIRED:
- Write MAXIMUM 200 words (guideline - no marks deduction if exceeded)
- Address key topics with depth

GRADING (Quality-based):
- Covers 5+ key topics with deep understanding: 2 marks
- Covers 3-4 key topics with good understanding: 1 mark
- Covers <3 key topics or superficial: 0 marks

Key Topics:
1. Performance comparison with specific metrics
2. RNN vs Transformer architecture advantages
3. Impact of attention mechanism vs recurrent connections
4. Long-term dependency handling comparison
5. Computational cost comparison
6. Convergence behavior differences
================================================================================
"""

In [None]:
analysis_text = """
TODO: Write your analysis here (maximum 200 words guideline)

Address:
1. Which model performed better and by how much?
   [Compare specific metrics]

2. RNN vs Transformer architecture advantages?
   [Discuss sequential processing vs parallel processing]

3. Impact of attention mechanism?
   [Discuss how attention captures dependencies]

4. Long-term dependency handling?
   [Compare vanishing gradients vs attention]

5. Computational cost comparison?
   [Compare training time, parameters]

6. Convergence behavior?
   [Discuss training stability, loss curves]
"""

In [None]:
# REQUIRED: Print analysis with word count
print("\n" + "="*70)
print("ANALYSIS")
print("="*70)
print(analysis_text)
print("="*70)
print(f"Analysis word count: {len(analysis_text.split())} words")
if len(analysis_text.split()) > 200:
    print("‚ö†Ô∏è  Warning: Analysis exceeds 200 words (guideline)")
else:
    print("‚úì Analysis within word count guideline")
print("="*70)

In [None]:
"""
================================================================================
PART 6: ASSIGNMENT RESULTS SUMMARY (REQUIRED FOR AUTO-GRADING)
================================================================================

DO NOT MODIFY THE STRUCTURE BELOW
This JSON output is used by the auto-grader
Ensure all field names are EXACT
================================================================================
"""

In [None]:
def get_assignment_results():
    """
    Generate complete assignment results in required format

    Returns:
        dict: Complete results with all required fields
    """

    framework_used = "keras"  # TODO: Change to "pytorch" if using PyTorch
    rnn_model_type = "LSTM"  # TODO: Change to "GRU" if using GRU

    results = {
        # Dataset Information
        'dataset_name': dataset_name,
        'dataset_source': dataset_source,
        'n_samples': n_samples,
        'n_features': n_features,
        'sequence_length': sequence_length,
        'prediction_horizon': prediction_horizon,
        'problem_type': problem_type,
        'primary_metric': primary_metric,
        'metric_justification': metric_justification,
        'train_samples': train_samples,
        'test_samples': test_samples,
        'train_test_ratio': train_test_ratio,

        # RNN Model Results
        'rnn_model': {
            'framework': framework_used,
            'model_type': rnn_model_type,
            'architecture': {
                'n_layers': 0,  # TODO: Number of stacked layers
                'hidden_units': 0,  # TODO: Hidden units per layer
                'total_parameters': 0  # TODO: Calculate total parameters
            },
            'training_config': {
                'learning_rate': 0.001,  # TODO: Your actual learning rate
                'n_epochs': 50,  # TODO: Your actual epochs
                'batch_size': 32,  # TODO: Your actual batch size
                'optimizer': 'Adam',  # TODO: Your actual optimizer
                'loss_function': 'MSE'  # TODO: Your actual loss
            },
            'initial_loss': rnn_initial_loss,
            'final_loss': rnn_final_loss,
            'training_time_seconds': rnn_training_time,
            'mae': rnn_mae,
            'rmse': rnn_rmse,
            'mape': rnn_mape,
            'r2_score': rnn_r2
        },

        # Transformer Model Results
        'transformer_model': {
            'framework': framework_used,
            'architecture': {
                'n_layers': 0,  # TODO: Number of transformer layers
                'n_heads': 0,  # TODO: Number of attention heads
                'd_model': 0,  # TODO: Model dimension
                'd_ff': 0,  # TODO: Feed-forward dimension
                'has_positional_encoding': True,  # MUST be True
                'has_attention': True,  # MUST be True
                'total_parameters': 0  # TODO: Calculate total parameters
            },
            'training_config': {
                'learning_rate': 0.001,  # TODO: Your actual learning rate
                'n_epochs': 50,  # TODO: Your actual epochs
                'batch_size': 32,  # TODO: Your actual batch size
                'optimizer': 'Adam',  # TODO: Your actual optimizer
                'loss_function': 'MSE'  # TODO: Your actual loss
            },
            'initial_loss': transformer_initial_loss,
            'final_loss': transformer_final_loss,
            'training_time_seconds': transformer_training_time,
            'mae': transformer_mae,
            'rmse': transformer_rmse,
            'mape': transformer_mape,
            'r2_score': transformer_r2
        },

        # Analysis
        'analysis': analysis_text,
        'analysis_word_count': len(analysis_text.split()),

        # Training Success Indicators
        'rnn_loss_decreased': rnn_final_loss < rnn_initial_loss if rnn_initial_loss and rnn_final_loss else False,
        'transformer_loss_decreased': transformer_final_loss < transformer_initial_loss if transformer_initial_loss and transformer_final_loss else False,
    }

    return results

In [None]:
# Generate and print results
try:
    assignment_results = get_assignment_results()

    print("\n" + "="*70)
    print("ASSIGNMENT RESULTS SUMMARY")
    print("="*70)
    print(json.dumps(assignment_results, indent=2))
    print("="*70)

In [None]:
except Exception as e:
    print(f"\n‚ö†Ô∏è  ERROR generating results: {str(e)}")
    print("Please ensure all variables are properly defined")

In [None]:
"""
================================================================================
FINAL CHECKLIST - VERIFY BEFORE SUBMISSION
================================================================================

‚ñ° Student information filled at the top (BITS ID, Name, Email)
‚ñ° Filename is <BITS_ID>_rnn_assignment.ipynb
‚ñ° All cells executed (Kernel ‚Üí Restart & Run All)
‚ñ° All outputs visible
‚ñ° LSTM/GRU implemented with stacked layers
‚ñ° Positional encoding implemented (sinusoidal)
‚ñ° Multi-head attention implemented (Q, K, V, scaled dot-product)
‚ñ° Both models use Keras or PyTorch
‚ñ° Both models trained with loss tracking (initial_loss and final_loss)
‚ñ° All 4 metrics calculated for both models (MAE, RMSE, MAPE, R¬≤)
‚ñ° Temporal train/test split used (NO shuffling)
‚ñ° Primary metric selected and justified
‚ñ° Analysis written (quality matters, not just word count)
‚ñ° Visualizations created
‚ñ° Assignment results JSON printed at the end
‚ñ° No execution errors in any cell
‚ñ° File opens without corruption
‚ñ° Submit ONLY .ipynb file (NO zip, NO data files, NO images)
‚ñ° Screenshot of environment with account details included
‚ñ° Only one submission attempt

================================================================================
"""

In [None]:
"""
================================================================================
ENVIRONMENT VERIFICATION - SCREENSHOT REQUIRED
================================================================================

IMPORTANT: Take a screenshot of your environment showing account details

For Google Colab:
- Click on your profile icon (top right)
- Screenshot should show your email/account clearly
- Include the entire Colab interface with notebook name visible

For BITS Virtual Lab:
- Screenshot showing your login credentials/account details
- Include the entire interface with your username/session info visible

Paste the screenshot below this cell or in a new markdown cell.
This helps verify the work was done by you in your environment.

================================================================================
"""

In [None]:
# Display system information
import platform
import sys
from datetime import datetime

In [None]:
print("="*70)
print("ENVIRONMENT INFORMATION")
print("="*70)
print("\n‚ö†Ô∏è  REQUIRED: Add screenshot of your Google Colab/BITS Virtual Lab")
print("showing your account details in the cell below this one.")
print("="*70)