In [None]:
from datasets import load_dataset
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt

# Load the Gigaword corpus through the Hugging Face `datasets` library.
# trust_remote_code=True allows `datasets` to run the dataset's own loading script.
dataset = load_dataset("gigaword",trust_remote_code=True)

# Pull the three predefined splits out of the returned mapping
train_data = dataset["train"]
val_data = dataset["validation"]
test_data = dataset["test"]

In [None]:
from transformers import AutoTokenizer

# One pretrained tokenizer is shared by the source documents and the target summaries.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large")

def preprocess_data(example):
    """Tokenize documents and summaries into fixed-length model features.

    Args:
        example: mapping with "document" and "summary" entries (a batch of
            texts when used with ``Dataset.map(..., batched=True)``).

    Returns:
        dict with "input_ids" and "attention_mask" for the document
        (truncated/padded to 100 tokens) and "labels" holding the summary
        token ids (truncated/padded to 50 tokens).
    """
    document_enc = tokenizer(
        example["document"],
        padding="max_length",
        truncation=True,
        max_length=100,
    )
    summary_enc = tokenizer(
        example["summary"],
        padding="max_length",
        truncation=True,
        max_length=50,
    )

    # Source-side features are copied through; the summary ids become the labels.
    features = {key: document_enc[key] for key in ("input_ids", "attention_mask")}
    features["labels"] = summary_enc["input_ids"]
    return features

# Tokenize every split. With batched=True, preprocess_data receives a batch of
# examples per call instead of a single example.
train_data = train_data.map(preprocess_data, batched=True)
val_data = val_data.map(preprocess_data, batched=True)
test_data = test_data.map(preprocess_data, batched=True)

In [None]:
import torch
from torch.utils.data import DataLoader
import os
# Debugging aid: makes CUDA kernel launches synchronous so a device-side error is
# reported at the call that caused it. This slows GPU execution.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


train_data_sample = train_data.select(range(256000))  # Use the first 256,000 training examples
val_data_sample = val_data.select(range(20))  # Use 20 samples
test_data_sample = test_data.select(range(256))  # Use the first 256 test examples

# Expose only the tensor columns the model consumes, as torch tensors
train_data_sample.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_data_sample.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_data_sample.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Only the training loader is shuffled; 256,000 / 256 gives 1,000 full training batches
train_loader = DataLoader(train_data_sample, batch_size=256, shuffle=True)
val_loader = DataLoader(val_data_sample, batch_size=256, shuffle=False)
test_loader = DataLoader(test_data_sample, batch_size=256, shuffle=False)

In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torch.optim as optim

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



class VariationalEncoder(nn.Module):
    """GRU encoder over embedded source token ids.

    Args:
        input_size: vocabulary size of the embedding table.
        hidden_size: GRU hidden width per direction.
        embedding_dim: embedding width.
        n_layers: number of stacked GRU layers.
        bi_dir: run the GRU in both directions when True.
    """

    def __init__(self, input_size=50265, hidden_size=250, embedding_dim = 300, n_layers=1, bi_dir=True ):
        super(VariationalEncoder, self).__init__()
        self.bi = 2 if bi_dir else 1
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size, n_layers, batch_first=True, bidirectional=bi_dir)

    def forward(self, input, intial_hidden , attention_mask):
        """Encode a batch of token ids.

        Args:
            input: token ids, batch first (the GRU is built with batch_first=True).
            intial_hidden: initial GRU state, e.g. from ``initHidden``.
            attention_mask: optional mask aligned with ``input``; embeddings are
                multiplied by it, so positions where it is 0 are zeroed. May be None.

        Returns:
            The GRU's per-step outputs; the final hidden state is discarded.
        """
        embedded = self.embedding(input)
        if attention_mask is not None:
            # Broadcast the mask over the embedding dimension.
            embedded = embedded * attention_mask.unsqueeze(-1)
        output, _ = self.gru(embedded, intial_hidden)
        return output

    def initHidden(self , batch_size):
        """Return an all-zero initial GRU state of shape
        (n_layers * directions, batch_size, hidden_size).

        The tensor is created on the device that holds this module's
        parameters, so it does not depend on a module-level ``device`` global.
        """
        return torch.zeros(
            self.n_layers * self.bi,
            batch_size,
            self.hidden_size,
            device=self.embedding.weight.device,
        )

class latent(nn.Module):
    """Per-step latent-variable module.

    Combines the current token embedding, the previous latent input and a
    decoder hidden state into a sigmoid-activated hidden vector, then draws a
    Gaussian sample from the mean / log-variance predicted from that vector.
    """

    def __init__(self, input_size=50265, hidden_size=500, embedding_dim = 300, latent_size=500, n_layers=1, bi_dir=True):
        super().__init__()
        # Input projections (the extra additive bias `bh` is applied on top of them).
        self.wyh = nn.Linear(embedding_dim, hidden_size)
        self.wzh = nn.Linear(latent_size, hidden_size)
        self.whh = nn.Linear(hidden_size, hidden_size)
        self.bh = nn.Parameter(torch.randn(hidden_size))
        self.sigmoid = nn.Sigmoid()
        # Heads producing the Gaussian parameters.
        self.fc_mu = nn.Linear(hidden_size , latent_size)
        self.fc_logvar = nn.Linear(hidden_size, latent_size)

    def forward(self, output_embedding,decoder_hidden,previous_z):
        """Return ``(h_z, z_t)``: the hidden vector and a sampled latent.

        ``z_t`` is ``mu + eps * exp(0.5 * logvar)`` with ``eps`` drawn from a
        standard normal on every call.
        """
        pre_activation = (
            self.wyh(output_embedding)
            + self.wzh(previous_z)
            + self.whh(decoder_hidden)
            + self.bh
        )
        h_z = self.sigmoid(pre_activation)

        mean = self.fc_mu(h_z)
        log_variance = self.fc_logvar(h_z)
        sigma = torch.exp(0.5 * log_variance)
        noise = torch.randn_like(sigma)
        sample = mean + (noise * sigma)
        return h_z, sample

class generate(nn.Module):
    """Output layer mapping a latent sample and a GRU output to vocabulary logits.

    ``forward`` returns unnormalised scores; it does not apply ``self.softmax``.
    """

    def __init__(self, input_size=50265, hidden_size=500, embedding_dim = 300, latent_size=500, n_layers=1, bi_dir=True):
        super(generate, self).__init__()
        self.wzh = nn.Linear(latent_size, hidden_size)
        self.whh = nn.Linear(hidden_size, hidden_size)
        # `bh` is not read by forward(); it is kept so existing attribute and
        # state_dict names remain available.
        self.bh = nn.Parameter(torch.randn(hidden_size))
        # Explicit dim: constructing LogSoftmax without one relies on the
        # deprecated implicit-dimension behaviour when the module is applied.
        self.softmax = nn.LogSoftmax(dim=-1)
        self.why = nn.Linear(hidden_size, input_size)
        self.bhy = nn.Parameter(torch.randn(input_size))

    def forward(self, z_t, hidden_secondgru):
        """Return ``why(tanh(wzh(z_t) + whh(hidden_secondgru))) + bhy``.

        Args:
            z_t: latent sample whose last dimension is ``latent_size``.
            hidden_secondgru: tensor whose last dimension is ``hidden_size``.

        Returns:
            Scores whose last dimension is ``input_size``.
        """
        h_y = torch.tanh(self.wzh(z_t) + self.whh(hidden_secondgru))
        y_t = self.why(h_y) + self.bhy
        return y_t


class Attention(nn.Module):
    """Additive attention of a decoder state over a sequence of encoder states."""

    def __init__(self, input_size=50265, hidden_size=500, embedding_dim = 300, latent_size=500, n_layers=1, bi_dir=False  ):
        super().__init__()
        self.wd = nn.Linear(hidden_size, hidden_size)   # projects the decoder state
        self.we = nn.Linear(hidden_size, hidden_size)   # projects the encoder states
        self.v = nn.Parameter(torch.randn(hidden_size))   # scoring vector
        self.ba = nn.Parameter(torch.randn(hidden_size))  # additive bias inside tanh

    def score(self, dec_st, enc_st):
        """Unnormalised attention energies.

        ``dec_st`` has its first two axes swapped before projection so that it
        broadcasts against ``enc_st``; the result keeps a trailing axis of size 1.
        """
        projected_decoder = self.wd(dec_st.permute(1, 0, 2))
        projected_encoder = self.we(enc_st)
        energy = torch.tanh(projected_decoder + projected_encoder + self.ba)
        scoring_column = self.v.unsqueeze(1)
        return torch.matmul(energy, scoring_column)

    def forward(self, dec_state, enc_states):
        """Return ``(context, weights)``.

        The weights are a softmax over dim 1 of the scores, moved to the last
        axis; the context is their batched matrix product with ``enc_states``.
        """
        energies = self.score(dec_state, enc_states)
        weights = torch.softmax(energies, dim=1).permute(0, 2, 1)
        context = torch.bmm(weights, enc_states)
        return context, weights



class AttnDecoderRNN(nn.Module):
    """One-step decoder: GRU -> attention -> second GRU, plus a latent branch.

    Args:
        hidden_size: width of both GRUs and of the attention/latent sub-modules.
        output_size: number of output classes produced by the generator.
        attn_type: stored on the instance; not read by this class.
        n_layers: number of layers of the first GRU.
        dropout_p: inter-layer dropout of the first GRU (only meaningful when
            ``n_layers > 1``).
        bi_dir: whether the first GRU is bidirectional.
        input_dim: vocabulary size of the decoder embedding.
        embedding_dim: embedding width.
    """

    def __init__(self, hidden_size= 500, output_size = 50265, attn_type = 'gen', n_layers=1, dropout_p=0.1, bi_dir=False , input_dim = 50265 , embedding_dim = 300):
        super(AttnDecoderRNN, self).__init__()
        self.bi = 2 if bi_dir else 1
        self.hidden_size = hidden_size
        self.maxlen = 0
        self.attn_type = attn_type

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        # GRU dropout acts only between stacked layers; passing a non-zero value
        # to a single-layer GRU has no effect other than a UserWarning.
        gru_dropout = dropout_p if n_layers > 1 else 0.0
        self.gru = nn.GRU(embedding_dim, hidden_size, n_layers, dropout=gru_dropout,
                          batch_first=True, bidirectional=bi_dir)
        self.gru2 = nn.GRU(embedding_dim + hidden_size, hidden_size, 1, batch_first=True, bidirectional=False)
        # NOTE(review): `out` and `out2` are not used by forward(); they are kept so
        # existing attribute / state_dict names stay available.
        self.out = nn.Linear(hidden_size, output_size)
        self.out2 = nn.Linear(hidden_size*2, output_size)
        # Sub-modules follow this decoder's sizes instead of their own defaults.
        # forward() feeds the returned latent hidden vector back in as the next
        # step's latent input, so the latent width must equal hidden_size.
        # No .to(device) here: the owner moves the whole decoder in one go, which
        # keeps every sub-module on the same device as the embedding and GRUs.
        self.latent = latent(input_size=input_dim, hidden_size=hidden_size,
                             embedding_dim=embedding_dim, latent_size=hidden_size)
        self.generate = generate(input_size=output_size, hidden_size=hidden_size,
                                 embedding_dim=embedding_dim, latent_size=hidden_size)
        self.attention = Attention(input_size=input_dim, hidden_size=hidden_size,
                                   embedding_dim=embedding_dim, latent_size=hidden_size)

    def forward(self, input, previous_hidden1 , previous_hidden2 , encoder_hidden , hidden_latent_previous ):
        """Run a single decoding step.

        Args:
            input: token ids for this step (batch first).
            previous_hidden1: first-GRU state, batch first; its first two axes
                are swapped before it is handed to the GRU.
            previous_hidden2: second-GRU state; its first two axes are swapped
                before it is concatenated with the embedding.
            encoder_hidden: encoder outputs attended over.
            hidden_latent_previous: latent hidden vector from the previous step.

        Returns:
            ``(scores, hidden1, hidden2, attn_weights, hidden_latent)`` where
            both hidden states are returned with their first two axes swapped
            back (batch first) and ``scores`` are unnormalised.
        """
        output_embedding = self.embedding(input)
        _, hidden1 = self.gru(output_embedding, previous_hidden1.permute(1, 0, 2))
        # The fresh first-GRU state is the attention query.
        c_t, attn_wt = self.attention(hidden1, encoder_hidden)
        prev_hidden2 = previous_hidden2.permute(1, 0, 2)
        # Last dimension: embedding_dim + hidden_size
        gru2_input = torch.cat((output_embedding, prev_hidden2), dim=-1)
        # The attention context is used as the second GRU's initial state.
        output_gru2, hidden2 = self.gru2(gru2_input, c_t.permute(1,0,2))
        hidden_latent , z_t = self.latent(output_embedding , prev_hidden2 , hidden_latent_previous)
        final_output = self.generate(z_t , output_gru2)
        return final_output, hidden1.permute(1,0,2), hidden2.permute(1,0,2), attn_wt , hidden_latent

class Seq2Seq_Model(nn.Module):
    """Container holding the encoder and the decoder as sub-modules.

    It defines no ``forward``; callers drive ``.encoder`` and ``.decoder``
    directly.
    """

    def __init__(self):
        super(Seq2Seq_Model, self).__init__()
        # Both halves are built with their default sizes and moved to `device`.
        encoder = VariationalEncoder()
        self.encoder = encoder.to(device)
        decoder = AttnDecoderRNN()
        self.decoder = decoder.to(device)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
from torch.autograd import Variable
from tqdm import tqdm


# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---------------------------
#  Model Parameters
# ---------------------------
# NOTE: hidden_size, vocab_size, n_layers and clip are not referenced by the
# visible cells; the model classes are constructed with their own defaults.
hidden_size = 500
vocab_size = 50265  # Same value as the model classes' default input_size/output_size
n_layers = 1
learning_rate = 0.5
teacher_forcing_ratio = 0  # 0 = never teacher-force; train_epoch is not passed this global
clip = 5.0
num_epochs = 10 # Change as needed

# ---------------------------
#  Initialize Model, Loss, Optimizer
# ---------------------------

seq_Model = Seq2Seq_Model().to(device)

seq_Model_optimizer = optim.Adadelta(seq_Model.parameters(), lr=learning_rate)
# Targets equal to 1 do not contribute to the loss.
# NOTE(review): presumably 1 is the tokenizer's padding id - confirm with tokenizer.pad_token_id.
criterion = nn.NLLLoss(ignore_index=1)

def train_epoch(seq_Model, dataloader, criterion, seq_Model_optimizer, device, teacher_forcing_ratio=0.0):
    """Train the model for one pass over ``dataloader``.

    Args:
        seq_Model: model exposing ``.encoder`` and ``.decoder``.
        dataloader: yields dicts with "input_ids", "attention_mask" and "labels".
        criterion: loss applied to log-probabilities and target ids.
        seq_Model_optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        teacher_forcing_ratio: probability that a batch is decoded with the
            ground-truth token as next input. It is multiplied by 0.95 after
            every batch. The default 0.0 always feeds back the model's own
            prediction.

    Returns:
        Mean over batches of the per-batch loss, which is itself the sum of
        the per-step losses. Returns 0.0 for an empty dataloader.
    """
    seq_Model.train()
    total_loss = 0.0
    current_tf_ratio = teacher_forcing_ratio
    # Width of the latent recurrent input, read from the layer that consumes it
    # rather than hard-coded.
    latent_dim = seq_Model.decoder.latent.wzh.in_features
    progress_bar = tqdm(enumerate(dataloader), total=len(dataloader), desc="Training")
    for _, batch in progress_bar:
        seq_Model.zero_grad()

        input_tensor = batch["input_ids"].to(device)         # source token ids
        target_tensor = batch["labels"].to(device)           # target token ids
        attention_mask = batch["attention_mask"].to(device)
        batch_size = input_tensor.size(0)

        initial_hidden = seq_Model.encoder.initHidden(batch_size)
        encoder_outputs = seq_Model.encoder(input_tensor, initial_hidden, attention_mask)

        # Decoding is seeded with token id 1, the same id beam_search_decode starts from.
        # NOTE(review): with facebook/bart-large, id 1 is presumably <pad> rather
        # than <s> - confirm against the tokenizer before changing either place.
        decoder_input = torch.tensor([[1]] * batch_size, device=device)
        # Both decoder GRU states start from the mean of the encoder outputs.
        first_gru_hidden = encoder_outputs.mean(dim=1, keepdim=True)
        second_gru_hidden = encoder_outputs.mean(dim=1, keepdim=True)
        # Sized from the actual batch so a final, smaller batch does not crash.
        hidden_latent_previous = torch.zeros(batch_size, 1, latent_dim, device=device)

        use_teacher_forcing = random.random() < current_tf_ratio
        loss = None
        for t in range(target_tensor.size(1) - 1):
            decoder_output, first_gru_hidden, second_gru_hidden, _, hidden_latent_previous = seq_Model.decoder(
                decoder_input,
                first_gru_hidden,
                second_gru_hidden.permute(1, 0, 2),
                encoder_outputs,
                hidden_latent_previous,
            )
            log_probs = F.log_softmax(decoder_output.squeeze(1), dim=-1)
            step_target = target_tensor[:, t + 1]
            step_loss = criterion(log_probs, step_target)  # evaluated once per step

            # A NaN step loss is not accumulated (presumably it arises when every
            # target in the column is ignored by the criterion - confirm).
            if torch.isnan(step_loss):
                if use_teacher_forcing:
                    break
                continue

            loss = step_loss if loss is None else loss + step_loss
            if use_teacher_forcing:
                decoder_input = step_target.unsqueeze(1)   # feed the reference token
            else:
                _, top_index = log_probs.detach().topk(1)  # feed the greedy prediction
                decoder_input = top_index

        current_tf_ratio = 0.95 * current_tf_ratio

        # No step contributed a loss: nothing to back-propagate for this batch.
        if loss is None:
            continue

        loss.backward()
        seq_Model_optimizer.step()
        batch_loss = loss.item()
        total_loss += batch_loss
        progress_bar.set_postfix(loss=(batch_loss / target_tensor.size(0)))

    # Average over the loader that was actually iterated, not a global one.
    return total_loss / max(len(dataloader), 1)

# Run num_epochs passes over the training loader and report each epoch's loss.
for epoch in range(num_epochs):
    train_loss = train_epoch(seq_Model, train_loader, criterion, seq_Model_optimizer, device)
    print(f"Epoch {epoch+1}/{num_epochs}  - Train Loss: {train_loss:.4f}")

In [None]:
import numpy as np
from collections import defaultdict

def beam_search_decode(seq_Model, input_tensor, beam_width, max_length,attention_mask, device,length_penalty=0.6, min_length=10):
    """Decode one source sequence with beam search.

    Args:
        seq_Model: model exposing ``.encoder`` and ``.decoder``.
        input_tensor: token ids of a single source sequence (no batch axis).
        beam_width: number of hypotheses kept after every step.
        max_length: maximum number of decoding steps.
        attention_mask: mask for ``input_tensor`` (no batch axis).
        device: device on which decoder inputs are created.
        length_penalty: accepted for interface compatibility; currently unused.
        min_length: accepted for interface compatibility; currently unused.

    Returns:
        ``(sequence, log_prob)``: the highest-scoring hypothesis as a list of
        Python ints (starting with the start token) and its summed
        log-probability as a float.
    """
    start_token = 1  # id used to seed decoding (train_epoch seeds with the same id)
    eos_token = 2    # hypotheses ending in this id are not expanded further

    seq_Model.eval()
    with torch.no_grad():
        # Beam search works on one sequence at a time.
        batch_size = 1
        encoder_hidden = seq_Model.encoder.initHidden(batch_size)
        encoder_hidden = seq_Model.encoder(input_tensor.unsqueeze(0), encoder_hidden, attention_mask.unsqueeze(0))

        # Both decoder GRU states start from the mean of the encoder outputs.
        first_gru_hidden = encoder_hidden.mean(dim=1, keepdim=True)
        second_gru_hidden = encoder_hidden.mean(dim=1, keepdim=True)
        # Latent input width is read from the layer that consumes it rather than hard-coded.
        latent_dim = seq_Model.decoder.latent.wzh.in_features
        hidden_latent_previous = torch.zeros(batch_size, 1, latent_dim, device=device)

        beam = [{
            'sequence': [start_token],
            'log_prob': 0.0,
            'hidden1': first_gru_hidden,
            'hidden2': second_gru_hidden,
            'hidden_latent': hidden_latent_previous
        }]

        for step in range(max_length):
            # Every hypothesis already ended: further steps cannot change the beam.
            if all(hyp['sequence'][-1] == eos_token for hyp in beam):
                break

            candidates = []
            for candidate in beam:
                # Finished hypotheses are carried over unchanged.
                if candidate['sequence'][-1] == eos_token:
                    candidates.append(candidate)
                    continue

                decoder_input = torch.tensor([[candidate['sequence'][-1]]], device=device)  # (1, 1)
                decoder_output, hidden1, hidden2, _, hidden_latent = seq_Model.decoder(
                    decoder_input, candidate['hidden1'], candidate['hidden2'],
                    encoder_hidden, candidate['hidden_latent']
                )
                log_probs = F.log_softmax(decoder_output.squeeze(1), dim=-1)  # (1, vocab)

                topk_log_probs, topk_tokens = log_probs.topk(beam_width, dim=-1)
                topk_log_probs = topk_log_probs.squeeze(0).cpu().numpy()
                topk_tokens = topk_tokens.squeeze(0).cpu().numpy()

                for i in range(beam_width):
                    candidates.append({
                        # Plain Python ints/floats, not numpy scalars.
                        'sequence': candidate['sequence'] + [int(topk_tokens[i])],
                        'log_prob': float(candidate['log_prob']) + float(topk_log_probs[i]),
                        'hidden1': hidden1,
                        'hidden2': hidden2,
                        'hidden_latent': hidden_latent
                    })

            # Keep the beam_width best hypotheses by summed log-probability.
            beam = sorted(candidates, key=lambda x: x['log_prob'], reverse=True)[:beam_width]

        best_candidate = beam[0]
        return best_candidate['sequence'], best_candidate['log_prob']


def evaluate_with_beam_search(seq_Model, dataloader, beam_width, max_length, device):
    """Beam-search decode every example produced by ``dataloader``.

    Args:
        seq_Model: model passed through to ``beam_search_decode``.
        dataloader: yields dicts with "input_ids" and "attention_mask".
        beam_width: beam size for each decode.
        max_length: maximum number of decoding steps per example.
        device: device the batch tensors are moved to.

    Returns:
        List of decoded token-id sequences, one per example, in dataloader
        order.
    """
    predictions = []
    seq_Model.eval()

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating with Beam Search"):
            input_tensor = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            # Walk over the rows actually present in the batch; a fixed count
            # would crash on smaller batches and skip rows of larger ones.
            for source_ids, source_mask in zip(input_tensor, attention_mask):
                sequence, _ = beam_search_decode(
                    seq_Model, source_ids, beam_width, max_length, source_mask, device
                )
                predictions.append(sequence)

    return predictions


# Example usage: decode the test loader with beam search
beam_width = 10  # Number of beams
max_length = 100  # Maximum sequence length
test_predictions = evaluate_with_beam_search(seq_Model, test_loader, beam_width, max_length, device)
# Wrapped in a one-element list because the decoding cell iterates
# `for batch in wrapped_predictions for tokens in batch`.
wrapped_predictions = [test_predictions]
print(wrapped_predictions)

In [None]:
# Turn predicted token ids back into text, dropping special tokens
decoded_sentences = [
    tokenizer.decode(tokens, skip_special_tokens=True).strip()
    for batch in wrapped_predictions for tokens in batch
]

# Decode the source documents of the dataset behind test_loader
decoded_inputs = [
    tokenizer.decode(tokens, skip_special_tokens=True).strip()
    for tokens in test_loader.dataset["input_ids"]
]

# Decode the reference summaries of the same dataset
decoded_targets = [
    tokenizer.decode(tokens, skip_special_tokens=True).strip()
    for tokens in test_loader.dataset["labels"]
]

# Print up to 20 examples; index i is assumed to refer to the same example in all three lists
for i in range(min(20, len(decoded_sentences))):
    print(f"Example {i+1}:")
    print(f"Input: {decoded_inputs[i]}")
    print(f"Target: {decoded_targets[i]}")
    print(f"Predicted: {decoded_sentences[i]}")
    print("-" * 50)