In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler
import json
import pandas as pd

In [2]:
config = BartConfig(
    vocab_size=50265,  # Adjust according to your tokenizer
    encoder_layers=4,  # Number of encoder layers
    decoder_layers=4,  # Number of decoder layers
    d_model=256,       # Dimensionality of the model
    decoder_ffn_dim=1024,  # FFN size
    encoder_ffn_dim=1024,
    max_position_embeddings=512
)

In [3]:
model = BartForConditionalGeneration(config)

In [4]:
def reset_weights(model):
    for module in model.modules():
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.reset_parameters()
        elif isinstance(module, nn.LayerNorm):
            module.reset_parameters()
reset_weights(model)

In [5]:
# Move model to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)
if torch.cuda.is_available():
    print("GPU device name:", torch.cuda.get_device_name(0))

PyTorch version: 2.5.1+cu124
CUDA available: True
CUDA version: 12.4
GPU device name: NVIDIA GeForce RTX 2050


In [6]:
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50265, 256, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 256, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(514, 256)
      (layers): ModuleList(
        (0-3): 4 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=256, out_features=256, bias=True)
            (v_proj): Linear(in_features=256, out_features=256, bias=True)
            (q_proj): Linear(in_features=256, out_features=256, bias=True)
            (out_proj): Linear(in_features=256, out_features=256, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=256, out_features=1024, bias=True)
          (fc2): Linear(in_features=1024, out_features=256, bias=True)
          (final_laye

In [7]:
# Define the dataset class for conversations
class ConversationDataset(Dataset):
    def __init__(self, dialogues, summaries, tokenizer, max_input_length=512, max_target_length=150):
        self.dialogues = dialogues
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.dialogues)

    def __getitem__(self, idx):
        # Get the dialogue and summary for the given index
        dialogue = self.dialogues[idx]
        summary = self.summaries[idx]

        # Tokenize the dialogue and summary
        input_encodings = self.tokenizer(
            dialogue,
            max_length=self.max_input_length,  # Adjusted to match `max_position_embeddings`
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )
        
        target_encodings = self.tokenizer(
            summary,
            max_length=self.max_target_length,
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )

        return {
            'input_ids': input_encodings['input_ids'].squeeze(0),  # Remove the batch dimension
            'attention_mask': input_encodings['attention_mask'].squeeze(0),
            'labels': target_encodings['input_ids'].squeeze(0)  # Ensure correct shape
        }


In [8]:
# Define the training function
def train_model(model, epochs=3, lr=0.0001):
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    scaler = GradScaler()

    for epoch in range(epochs):
        train_dataloader = DataLoader(train_dataset, batch_size=6, shuffle=True)
        model.train()
        total_loss = 0

        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()

            with autocast():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            # Clear memory cache after each batch
            torch.cuda.empty_cache()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_dataloader)
        print(f"Epoch: {epoch + 1}, Average Loss: {avg_loss:.4f}")


In [9]:

# with open('train.json', 'r', encoding='utf-8', errors='ignore') as file:
#     train = json.load(file)
# # Since it's too computationally expensive to run this model I won't be doing any validation testing.
# # with open('val.json', 'r', encoding='utf-8', errors='ignore') as file:
# #     val = json.load(file)

# with open('test.json', 'r', encoding='utf-8', errors='ignore') as file:
#     test = json.load(file)

# # Convert to DataFrame
# df_train = pd.DataFrame(train)
# # df_val = pd.DataFrame(val)
# df_test = pd.DataFrame(test)

from datasets import load_dataset

# Load the Gigaword dataset with custom code execution enabled
dataset = load_dataset("gigaword", trust_remote_code=True)



In [10]:
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
# Extract training data and limit to 70,000 samples
train_data = dataset['train']
train_dialogues = train_data['document'][:30000]
train_summaries = train_data['summary'][:30000]

print(train_dialogues[0])
print(train_summaries[0])

# Limit to 70,000 samples
train_dialogues = [item for item in train_dialogues]
train_summaries = [item for item in train_summaries]
# val_dialogues = [item['document'] for item in val_data]
# val_summaries = [item['summary'] for item in val_data]
# Prepare the dataset
train_dataset = ConversationDataset(train_dialogues, train_summaries, tokenizer)
# Train the model
train_model(model)

  scaler = GradScaler()


australia 's current account deficit shrunk by a record #.## billion dollars -lrb- #.## billion us -rrb- in the june quarter due to soaring commodity prices , figures released monday showed .
australian current account deficit narrows sharply


  with autocast():
Epoch 1: 100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [12:41<00:00,  6.57it/s]


Epoch: 1, Average Loss: 0.6145


Epoch 2: 100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [11:08<00:00,  7.47it/s]


Epoch: 2, Average Loss: 0.5352


Epoch 3: 100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [11:07<00:00,  7.49it/s]

Epoch: 3, Average Loss: 0.5137





In [11]:
input_text = "white house hopeful barack obama professes no anxiety about polls that show his longstanding lead evaporating, but senior democrats are rattled at the republicans ' Sarah UNK charge ."

inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)

# Generate output
outputs = model.generate(inputs['input_ids'], max_length=50, num_beams=5, early_stopping=True)

# Decode and print the generated text
print("Generated text:", tokenizer.decode(outputs[0], skip_special_tokens=True))

Generated text: brit 's to for to in to


In [11]:
with open('train.json', 'r', encoding='utf-8', errors='ignore') as file:
    train = json.load(file)
# Since it's too computationally expensive to run this model I won't be doing any validation testing.
# with open('val.json', 'r', encoding='utf-8', errors='ignore') as file:
#     val = json.load(file)

with open('test.json', 'r', encoding='utf-8', errors='ignore') as file:
    test = json.load(file)

# Convert to DataFrame
df_train = pd.DataFrame(train)
# df_val = pd.DataFrame(val)
df_test = pd.DataFrame(test)

In [12]:
# Fine tuning
train_dataset = ConversationDataset(df_train['dialogue'].tolist(), df_train['summary'].tolist(), tokenizer)
train_model(model)

  scaler = GradScaler()
  with autocast():
Epoch 1: 100%|█████████████████████████████████████████████████████████████████████| 2456/2456 [05:09<00:00,  7.93it/s]


Epoch: 1, Average Loss: 0.9571


Epoch 2: 100%|█████████████████████████████████████████████████████████████████████| 2456/2456 [05:05<00:00,  8.04it/s]


Epoch: 2, Average Loss: 0.8297


Epoch 3: 100%|█████████████████████████████████████████████████████████████████████| 2456/2456 [05:04<00:00,  8.05it/s]

Epoch: 3, Average Loss: 0.7730





In [17]:
conversations = [
    "A: Hi Tom, are you busy tomorrow’s afternoon?\r\nB: I’m pretty sure I am. What’s up?\r\nA: Can you go with me to the animal shelter?.\r\nB: What do you want to do?\r\nA: I want to get a puppy for my son.\r\nB: That will make him so happy.\r\nA: Yeah, we’ve discussed it many times. I think he’s ready now.\r\nB: That’s good. Raising a dog is a tough issue. Like having a baby ;-) \r\nA: I'll get him one of those little dogs.\r\nB: One that won't grow up too big;-)\r\nA: And eat too much;-))\r\nB: Do you know which one he would like?\r\nA: Oh, yes, I took him there last Monday. He showed me one that he really liked.\r\nB: I bet you had to drag him away.\r\nA: He wanted to take it home right away ;-).\r\nB: I wonder what he'll name it.\r\nA: He said he’d name it after his dead hamster – Lemmy  - he's  a great Motorhead fan :-)))",
    "Emma: I’ve just fallen in love with this advent calendar! Awesome! I wanna one for my kids!\r\nRob: I used to get one every year as a child! Loved them! \r\nEmma: Yeah, i remember! they were filled with chocolates!\r\nLauren: they are different these days! much more sophisticated! Haha!\r\nRob: yeah, they can be fabric/ wooden, shop bought/ homemade, filled with various stuff\r\nEmma: what do you fit inside?\r\nLauren: small toys, Christmas decorations, creative stuff, hair bands & clips, stickers, pencils & rubbers, small puzzles, sweets\r\nEmma: WOW! That’s brill! X\r\nLauren: i add one more very special thing as well- little notes asking my children to do something nice for someone else\r\nRob: i like that! My sister adds notes asking her kids questions about christmas such as What did the 3 wise men bring? etc\r\nLauren: i reckon it prepares them for Christmas \r\nEmma: and makes it more about traditions and being kind to other people\r\nLauren: my children get very excited every time they get one!\r\nEmma: i can see why! :)",
    "Behrouz: This model is very weak.\nJunaid: Yea bro I have to agree.\nBehrouz: So should we train another one and scrap this?\nJunaid: Nah It is what it is."
]

# Tokenize and generate summaries
model.eval()  # Set the model to evaluation mode
summaries = []
for convo in conversations:
    inputs = tokenizer(convo, max_length=1024, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=150, early_stopping=True)
    # Decode the summary and add it to the list
    summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))


for i, conversation in enumerate(conversations):
    print(f" Conversation {i + 1} \n{conversation}\n")

 Conversation 1 
A: Hi Tom, are you busy tomorrow’s afternoon?
B: I’m pretty sure I am. What’s up?
A: Can you go with me to the animal shelter?.
B: What do you want to do?
A: I want to get a puppy for my son.
B: That will make him so happy.
A: Yeah, we’ve discussed it many times. I think he’s ready now.
B: That’s good. Raising a dog is a tough issue. Like having a baby ;-) 
A: I'll get him one of those little dogs.
B: One that won't grow up too big;-)
A: And eat too much;-))
B: Do you know which one he would like?
A: Oh, yes, I took him there last Monday. He showed me one that he really liked.
B: I bet you had to drag him away.
A: He wanted to take it home right away ;-).
B: I wonder what he'll name it.
A: He said he’d name it after his dead hamster – Lemmy  - he's  a great Motorhead fan :-)))

 Conversation 2 
Emma: I’ve just fallen in love with this advent calendar! Awesome! I wanna one for my kids!
Rob: I used to get one every year as a child! Loved them! 
Emma: Yeah, i remember! th

In [18]:
print("------SUMMARY RESULTS------")
for i, summary in enumerate(summaries):
    print(f"Conversation {i + 1} \n Summary: {summary} \n")

------SUMMARY RESULTS------
Conversation 1 
 Summary: Anna is going to the animal tree. She is going to the restaurant. She is going to the store. 

Conversation 2 
 Summary: Emma is in love with this semester seats. She is going to get a baby and a baby for her children. 

Conversation 3 
 Summary: Eaid is very slow. She will be train another one. 



In [19]:
model.save_pretrained("save/model_reduced")
tokenizer.save_pretrained("save/model_reduced")

('save/model_reduced\\tokenizer_config.json',
 'save/model_reduced\\special_tokens_map.json',
 'save/model_reduced\\vocab.json',
 'save/model_reduced\\merges.txt',
 'save/model_reduced\\added_tokens.json')