In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler
import json
import pandas as pd


In [14]:
import torch
import torch.nn as nn
from transformers import PreTrainedTokenizerFast

# Step 1: Load Tokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
tokenizer.add_special_tokens({'pad_token': '[PAD]', 'bos_token': '<s>', 'eos_token': '</s>'})

vocab_size = 50265
embedding_dim = 512  # Hidden size of the model


# Positional and Token Embeddings
class TokenEmbeddingLayer(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(TokenEmbeddingLayer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)

    def forward(self, input_ids):
        return self.embedding(input_ids)


class PositionalEncoding(nn.Module):
    def __init__(self, embed_dim, max_len=1024):
        super(PositionalEncoding, self).__init__()
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2) * (-torch.log(torch.tensor(10000.0)) / embed_dim))
        pe = torch.zeros(max_len, embed_dim)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]


class TokenAndPositionEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_dim, max_len=512):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_embedding = TokenEmbeddingLayer(vocab_size, embed_dim)
        self.position_encoding = PositionalEncoding(embed_dim, max_len)

    def forward(self, input_ids):
        token_embeddings = self.token_embedding(input_ids)
        return self.position_encoding(token_embeddings)


# Encoder and Decoder
class BARTEncoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, hidden_dim, num_layers=4, max_len=512):
        super(BARTEncoder, self).__init__()
        self.embedding = TokenAndPositionEmbedding(vocab_size, embed_dim, max_len)
        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=hidden_dim, batch_first=True)
            for _ in range(num_layers)
        ])

    def forward(self, input_ids, attention_mask=None):
      x = self.embedding(input_ids)
      # print("Input IDs shape:", input_ids.shape)
      # print("Attention mask shape before conversion:", attention_mask.shape)

      if attention_mask is not None:
          attention_mask = ~attention_mask.bool()  # Convert and invert mask
          # print("Attention mask shape after conversion:", attention_mask.shape)

      for layer in self.layers:
          x = layer(x, src_key_padding_mask=attention_mask)
      return x



class BARTDecoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, hidden_dim, num_layers=4, max_len=512):
        super(BARTDecoder, self).__init__()
        self.embedding = TokenAndPositionEmbedding(vocab_size, embed_dim, max_len)
        self.layers = nn.ModuleList([
            nn.TransformerDecoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=hidden_dim, batch_first=True)
            for _ in range(num_layers)
        ])

    def forward(self, input_ids, encoder_output, attention_mask=None, tgt_mask=None):
        x = self.embedding(input_ids)
        # print("Decoder input_ids shape:", input_ids.shape)
        # print("Encoder output shape:", encoder_output.shape)

        if attention_mask is not None:
            # print("Attention mask shape before conversion:", attention_mask.shape)
            attention_mask = ~attention_mask.bool()  # Convert and invert mask
            # print("Attention mask shape after conversion:", attention_mask.shape)

        if tgt_mask is not None:
            print("Tgt mask shape:", tgt_mask.shape)

        for layer in self.layers:
            x = layer(
                x,
                memory=encoder_output,
                tgt_key_padding_mask=attention_mask,
                memory_key_padding_mask=attention_mask
            )
        return x



# Combined BART Model
class BARTModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_layers, num_heads, hidden_dim, max_len=512):
        super(BARTModel, self).__init__()
        self.encoder = BARTEncoder(vocab_size, embed_dim, num_heads, hidden_dim, num_layers, max_len)
        self.decoder = BARTDecoder(vocab_size, embed_dim, num_heads, hidden_dim, num_layers, max_len)
        self.output_projection = nn.Linear(embed_dim, vocab_size)

    def forward(self, input_ids, attention_mask, labels=None):
        # Encoder forward pass
        encoder_outputs = self.encoder(input_ids, attention_mask)

        # Prepare decoder inputs
        if labels is not None:
            decoder_input_ids = torch.cat([labels[:, :1], labels[:, :-1]], dim=1)  # Shift target for teacher forcing
        else:
            raise ValueError("Decoder input required for training. Use inference mode for generation.")

        # Decoder forward pass
        decoder_outputs = self.decoder(decoder_input_ids, encoder_outputs, attention_mask)

        # Project to vocabulary
        logits = self.output_projection(decoder_outputs)
        return logits


# Initialize Model
num_layers = 4
num_heads = 8
hidden_dim = 1024
max_len = 512

model = BARTModel(vocab_size, embedding_dim, num_layers, num_heads, hidden_dim, max_len)

# Example Input
example_input = ["This is an example."]
input_data = tokenizer.batch_encode_plus(
    example_input,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=128
)
input_ids = input_data['input_ids']
attention_mask = input_data['attention_mask']

# Example Target
target_data = tokenizer.batch_encode_plus(
    ["This is the output."],
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=128
)
labels = target_data['input_ids']
print(labels)

# Forward Pass
logits = model(input_ids, attention_mask, labels)
print("Logits shape:", logits.shape)  # Should be (batch_size, seq_len, vocab_size)


tensor([[   0,  713,   16,    5, 4195,    4,    2]])
Logits shape: torch.Size([1, 7, 50265])


In [4]:
# Move model to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)
if torch.cuda.is_available():
    print("GPU device name:", torch.cuda.get_device_name(0))

PyTorch version: 2.5.1+cu124
CUDA available: True
CUDA version: 12.4
GPU device name: NVIDIA GeForce RTX 2050


In [4]:
model.to(device)

BARTModel(
  (encoder): BARTEncoder(
    (embedding): TokenAndPositionEmbedding(
      (token_embedding): TokenEmbeddingLayer(
        (embedding): Embedding(50265, 512)
      )
      (position_encoding): PositionalEncoding()
    )
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=1024, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=1024, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (decoder): BARTDecoder(
    (embedding): TokenAndPositionEmbedding(
      (token_embedding):

In [5]:
# Define the dataset class for conversations
class ConversationDataset(Dataset):
    def __init__(self, dialogues, summaries, tokenizer, max_input_length=512, max_target_length=150):
        self.dialogues = dialogues
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.dialogues)

    def __getitem__(self, idx):
        # Get the dialogue and summary for the given index
        dialogue = self.dialogues[idx]
        summary = self.summaries[idx]

        # Tokenize the dialogue and summary
        input_encodings = self.tokenizer(
            dialogue,
            max_length=150,  # Adjusted to match `max_position_embeddings`
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )

        target_encodings = self.tokenizer(
            summary,
            max_length=self.max_target_length,
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )

        return {
            'input_ids': input_encodings['input_ids'].squeeze(0),  # Remove the batch dimension
            'attention_mask': input_encodings['attention_mask'].squeeze(0),
            'labels': target_encodings['input_ids'].squeeze(0)  # Ensure correct shape
        }


In [26]:
%pip install datasets

from datasets import load_dataset

# Load the Gigaword dataset with custom code execution enabled
dataset = load_dataset("gigaword", trust_remote_code=True)


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from tqdm import tqdm  # For progress bar during training

# Assuming you have the dataset class already
# Create DataLoader



batch_size = 8
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
# Extract training data and limit to 70,000 samples
train_data = dataset['train']
train_dialogues = train_data['document'][:30000]
train_summaries = train_data['summary'][:30000]

print(train_dialogues[0])
print(train_summaries[0])

# Limit to 70,000 samples
train_dialogues = [item for item in train_dialogues]
train_summaries = [item for item in train_summaries]
train_dataset = ConversationDataset(train_dialogues, train_summaries, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Initialize model
model = BARTModel(vocab_size, embedding_dim, num_layers, num_heads, hidden_dim, max_len=512)

# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Define loss function
criterion = CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Training loop
num_epochs = 3  # Set number of epochs
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    total_loss = 0  # Initialize total loss for the epoch

    # Iterate through batches
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
        # Move input data to device (GPU/CPU)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass through the model
        outputs = model(input_ids, attention_mask, labels=labels)
        logits = outputs  # logits should be of shape (batch_size, seq_len, vocab_size)

        # Compute loss (we only compute loss on the tokens, not the padding tokens)
        loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))

        # Backpropagation
        loss.backward()

        # Update weights
        optimizer.step()

        total_loss += loss.item()

    # Print loss for the epoch
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")

# Save the trained model
# torch.save(model.state_dict(), "bart_model.pth")


australia 's current account deficit shrunk by a record #.## billion dollars -lrb- #.## billion us -rrb- in the june quarter due to soaring commodity prices , figures released monday showed .
australian current account deficit narrows sharply


Epoch 1/3: 100%|███████████████████████████████████████████████████████████████████| 3750/3750 [16:58<00:00,  3.68it/s]


Epoch 1/3, Loss: 4.3875


Epoch 2/3: 100%|███████████████████████████████████████████████████████████████████| 3750/3750 [17:00<00:00,  3.67it/s]


Epoch 2/3, Loss: 1.4621


Epoch 3/3: 100%|███████████████████████████████████████████████████████████████████| 3750/3750 [16:57<00:00,  3.69it/s]

Epoch 3/3, Loss: 0.6420





In [13]:
import torch
# torch.save(model.state_dict(), "bart_model.pth")

state_dict = torch.load("bart_model.pth", map_location="cpu")

# for key in state_dict.keys():
#     print(key, state_dict[key].shape)


from transformers import BartForConditionalGeneration, BartConfig

config = BartConfig(
    vocab_size=50265,  # Match tokenizer vocab size
    encoder_layers=4,  # Match number of encoder layers
    decoder_layers=4,  # Match number of decoder layers
    d_model=512,  # Match hidden size
    decoder_ffn_dim=1024,  # FFN hidden size for decoder
    encoder_ffn_dim=1024,  # FFN hidden size for encoder
    max_position_embeddings=512  # Match max position embeddings
)


model = BartForConditionalGeneration(config)
for key in model.state_dict().keys():
    print(key, model.state_dict()[key].shape)


new_state_dict = {}
for key in state_dict.keys():
    if "encoder" in key:
        new_key = key.replace("encoder", "model.encoder")
        new_state_dict[new_key] = state_dict[key]

    elif "decoder" in key:
        new_key = key.replace("decoder", "model.decoder")
        new_state_dict[new_key] = state_dict[key]



print(new_state_dict.keys())

  state_dict = torch.load("bart_model.pth", map_location="cpu")


final_logits_bias torch.Size([1, 50265])
model.shared.weight torch.Size([50265, 512])
model.encoder.embed_tokens.weight torch.Size([50265, 512])
model.encoder.embed_positions.weight torch.Size([514, 512])
model.encoder.layers.0.self_attn.k_proj.weight torch.Size([512, 512])
model.encoder.layers.0.self_attn.k_proj.bias torch.Size([512])
model.encoder.layers.0.self_attn.v_proj.weight torch.Size([512, 512])
model.encoder.layers.0.self_attn.v_proj.bias torch.Size([512])
model.encoder.layers.0.self_attn.q_proj.weight torch.Size([512, 512])
model.encoder.layers.0.self_attn.q_proj.bias torch.Size([512])
model.encoder.layers.0.self_attn.out_proj.weight torch.Size([512, 512])
model.encoder.layers.0.self_attn.out_proj.bias torch.Size([512])
model.encoder.layers.0.self_attn_layer_norm.weight torch.Size([512])
model.encoder.layers.0.self_attn_layer_norm.bias torch.Size([512])
model.encoder.layers.0.fc1.weight torch.Size([1024, 512])
model.encoder.layers.0.fc1.bias torch.Size([1024])
model.encoder.

In [None]:
import io
import torch
from transformers import BartForConditionalGeneration, BartTokenizer, BartConfig

# Load the tokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

# Define the model configuration
config = BartConfig(
    vocab_size=50265,  # Match tokenizer vocab size
    encoder_layers=4,  # Match number of encoder layers
    decoder_layers=4,  # Match number of decoder layers
    d_model=512,  # Match hidden size
    decoder_ffn_dim=1024,  # FFN hidden size for decoder
    encoder_ffn_dim=1024,  # FFN hidden size for encoder
    max_position_embeddings=512  # Match max position embeddings
)

# Initialize the model with the custom config
model = BartForConditionalGeneration(config)

# Load the updated state_dict
buffer = io.BytesIO()
torch.save(new_state_dict, buffer)  # Save the state_dict to a buffer
buffer.seek(0)  # Reset buffer position
loaded_state_dict = torch.load(buffer, map_location="cpu")

# Load the weights into the model
model.load_state_dict(loaded_state_dict, strict=False)

# Set model to evaluation mode and move to the correct device
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Prepare input text for testing
# input_text = "This is a test input for the BART model."
input_text = "white house hopeful barack obama professes no anxiety about polls that show his longstanding lead evaporating, but senior democrats are rattled at the republicans ' Sarah UNK charge ."

inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)

# Generate output
outputs = model.generate(inputs['input_ids'], max_length=50, num_beams=5, early_stopping=True)

# Decode and print the generated text
print("Generated text:", tokenizer.decode(outputs[0], skip_special_tokens=True))

# Example Input

# train_dataset_test
# input_text = "white house hopeful barack obama professes no anxiety about polls that show his longstanding lead evaporating , but senior democrats are rattled at the republicans ' sarah UNK charge ."




In [6]:
with open('train.json', 'r', encoding='utf-8', errors='ignore') as file:
    train = json.load(file)
# Since it's too computationally expensive to run this model I won't be doing any validation testing.
# with open('val.json', 'r', encoding='utf-8', errors='ignore') as file:
#     val = json.load(file)

with open('test.json', 'r', encoding='utf-8', errors='ignore') as file:
    test = json.load(file)

# Convert to DataFrame
df_train = pd.DataFrame(train)
# df_val = pd.DataFrame(val)
df_test = pd.DataFrame(test)


In [18]:
train_dataset = ConversationDataset(df_train['dialogue'].tolist(), df_train['summary'].tolist(), tokenizer)
train_model(model)



  scaler = GradScaler()
  with autocast():
Epoch 1: 100%|█████████████████████████████████████████████████████████████████████| 2456/2456 [16:29<00:00,  2.48it/s]


Epoch: 1, Average Loss: 1.2441


Epoch 2: 100%|█████████████████████████████████████████████████████████████████████| 2456/2456 [15:30<00:00,  2.64it/s]


Epoch: 2, Average Loss: 0.7692


Epoch 3: 100%|█████████████████████████████████████████████████████████████████████| 2456/2456 [16:12<00:00,  2.52it/s]

Epoch: 3, Average Loss: 0.6460





In [10]:
import evaluate

df_train = df_train.sample(n=300, random_state=42)
# Define the test dataset and dataloader
test_dataset = ConversationDataset(df_train['dialogue'].tolist(), df_train['summary'].tolist(), tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=6, shuffle=False)

# Set model to evaluation mode
model.eval()

# Load ROUGE metric
rouge = evaluate.load("rouge")

predictions = []
references = []

# Generate predictions and collect references
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']

        # Generate predictions
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=150, num_beams=4)
        
        # Decode predictions and references
        decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        decoded_refs = tokenizer.batch_decode(batch['labels'], skip_special_tokens=True)

        predictions.extend(decoded_preds)
        references.extend(decoded_refs)

# Compute ROUGE scores
results_training = rouge.compute(predictions=predictions, references=references, use_stemmer=True)

Testing: 100%|█████████████████████████████████████████████████████████████████████████| 50/50 [00:53<00:00,  1.06s/it]


In [11]:
# Testing on the training dataset
for key, value in results_training.items():
    print(f"{key}: {value:.4f}")

rouge1: 0.2469
rouge2: 0.0886
rougeL: 0.2165
rougeLsum: 0.2161


In [12]:
test_dataset = ConversationDataset(df_test['dialogue'].tolist(), df_test['summary'].tolist(), tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=6, shuffle=False)

# Set model to evaluation mode
model.eval()

# Load ROUGE metric
rouge = evaluate.load("rouge")

predictions = []
references = []

# Generate predictions and collect references
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']

        # Generate predictions
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=150, num_beams=4)
        
        # Decode predictions and references
        decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        decoded_refs = tokenizer.batch_decode(batch['labels'], skip_special_tokens=True)

        predictions.extend(decoded_preds)
        references.extend(decoded_refs)

# Compute ROUGE scores
results_test = rouge.compute(predictions=predictions, references=references, use_stemmer=True)

Testing: 100%|███████████████████████████████████████████████████████████████████████| 137/137 [02:02<00:00,  1.12it/s]


In [13]:
# Testing on the test dataset
for key, value in results_test.items():
    print(f"{key}: {value:.4f}")

rouge1: 0.2292
rouge2: 0.0723
rougeL: 0.2000
rougeLsum: 0.1998
