### **Importing Libraries**

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import nltk
nltk.download('punkt')
from transformers import GPT2LMHeadModel, GPT2Tokenizer,GPT2Model, GPT2Config, AdamW
from nltk.tokenize import word_tokenize
import numpy as np
import torch.optim as optim
# from rouge_score import rouge_scorer
import string
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import os
import re
import random
# !pip install peft
from peft import LoraConfig, get_peft_model, TaskType
import csv
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


[nltk_data] Downloading package punkt to /home/chetan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### **Dataset Loading**

In [3]:
def load_csv(file_path):
    data = []

    with open(file_path, mode='r', encoding='utf-8') as file:
        reader = csv.DictReader(file)

        for row in reader:
            data.append({
                'id': row['id'],
                'article': row['article'],
                'highlights': row['highlights']
            })

    return data


train_data = load_csv('./train.csv')
test_data = load_csv('./test.csv')
val_data = load_csv('./validation.csv')


In [4]:
def clean_text(text):
    pattern = r"(?i)(PUBLISHED:\s*.\s*\d{1,2}:\d{2}\s*(EST|PST),\s*\d{1,2}\s\w+\s\d{4}\s*.\s*\|\s*.\s*UPDATED:\s*.\s*\d{1,2}:\d{2}\s*(EST|PST),\s*\d{1,2}\s\w+\s\d{4})|" \
              r"(By\s*.\s*[A-Za-z\s]+.)|" \
              r"(\([A-Za-z\s]*CNN\)\s*--)|" \
              r"(Follow\s*@@[A-Za-z0-9_]+)|" \
              r"(UPDATED:\s*.\s*\d{1,2}:\d{2}\s*(EST|PST),\s*\d{1,2}\s\w+\s\d{4})|" \
              r"(Last\s*updated\s*at\s*\d{1,2}:\d{2}\s*(AM|PM)\s*on\s*\d{1,2}(st|nd|rd|th)\s*\w+\s\d{4}\s*.)|" \
              r"(\(CNN\))"

    cleaned_text = re.sub(pattern, '', text).strip()
    cleaned_text = cleaned_text.lower()

    return cleaned_text

def clean_articles(data):
    for entry in data:
        entry['article'] = clean_text(entry['article'])
        entry['highlights'] = clean_text(entry['highlights'])

    return data

def write_csv(file_path, cleaned_data):
    with open(file_path, mode='w', encoding='utf-8', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=['id', 'article', 'highlights'])
        writer=writer
        writer.writeheader()

        for row in cleaned_data:
            writer.writerow(row)

# train_data = clean_articles(train_data)
# test_data = clean_articles(test_data)
# val_data = clean_articles(val_data)

# write_csv("./Cleaned_Dataset/train.csv", train_data)
# write_csv("./Cleaned_Dataset/test.csv", test_data)
# write_csv("./Cleaned_Dataset/validation.csv", val_data)

# def write_csv(file_path, cleaned_data, percentage=1):
#     # Calculate how many rows to write based on the percentage
#     data_size = len(cleaned_data)
#     num_rows = data_size * percentage // 100

#     with open(file_path, mode='w', encoding='utf-8', newline='') as file:
#         writer = csv.DictWriter(file, fieldnames=['id', 'article', 'highlights'])
#         writer.writeheader()

#         # Write only the first 'num_rows' rows of the data
#         for row in cleaned_data[:num_rows]:
#             writer.writerow(row)

# # Assuming train_data, test_data, val_data are your datasets
# write_csv("./train.csv", train_data)
# write_csv("./test.csv", test_data)
# write_csv("./validation.csv", val_data)



### **Intialize special Tokens**

In [5]:

# import tokenizer for padding
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_special_tokens({
    'additional_special_tokens': ['[SUM]']  # Add summary token
})

# # Initializing Pad tokens
# pad_token = tokenizer.eos_token_id
# print(pad_token)
# tokenizer.add_tokens([pad_token])




1

### **Tokenizing the Data**

In [5]:
def convertCSV(data):
    inp = []
    out = []
    for row in data:
        inp.append(row['article'])
        out.append(row['highlights'])

    return inp, out

inp_train, out_train = convertCSV(train_data)
inp_test, out_test = convertCSV(test_data)

train_size = int(0.1 * len(inp_train))
inp_train_10 = inp_train[:train_size]
out_train_10 = out_train[:train_size]

test_size = int(0.1 * len(inp_test))
inp_test_10 = inp_test[:test_size]
out_test_10 = out_test[:test_size]

# print(inp_train[0])
def remove_punctuation(tokenized_sentence):
    return [word for word in tokenized_sentence if word not in string.punctuation]
#Using NLTK Tokenize

inp_train = [remove_punctuation(word_tokenize(sentence)) for sentence in inp_train_10]
inp_test = [remove_punctuation(word_tokenize(sentence)) for sentence in inp_test_10]
out_train = [remove_punctuation(word_tokenize(sentence)) for sentence in out_train_10]
out_test = [remove_punctuation(word_tokenize(sentence)) for sentence in out_test_10]

print(len(inp_train))
# print(out_train[0])

max_len = 0
for i in inp_train:
    # if max_len < len(i):
    max_len += len(i)

# print(max_len/len(inp_train))

# def tokenize(data,max_len = 1000):
def prepare_data(articles, summaries, max_len=1024):
    input_ids = []
    labels = []
    attention_masks = []

    for article, summary in zip(articles, summaries):
        # Format: Article [SUM] Summary
        article_tokens = tokenizer.encode(article, truncation=True, max_length=max_len-200)
        summary_tokens = tokenizer.encode(summary, truncation=True, max_length=200)

        # Create input sequence: article [SUM] summary
        input_sequence = (
            article_tokens +
            [tokenizer.additional_special_tokens_ids[0]] +  # [SUM] token
            summary_tokens
        )

        # Ensure we don't exceed max_len
        if len(input_sequence) > max_len:
            input_sequence = input_sequence[:max_len]

        # Create attention mask for the truncated sequence
        attention_mask = [1] * len(input_sequence)

        # Create labels array of the same length as input_sequence
        labels_array = (
            [-100] * len(article_tokens) +  # Don't compute loss for article tokens
            [-100] * 1 +  # Don't compute loss for [SUM] token
            summary_tokens  # Compute loss for summary tokens
        )

        # Truncate labels if needed
        if len(labels_array) > max_len:
            labels_array = labels_array[:max_len]

        # Pad all sequences to max_len
        padding_length = max_len - len(input_sequence)

        input_sequence = input_sequence + [tokenizer.pad_token_id] * padding_length
        attention_mask = attention_mask + [0] * padding_length
        labels_array = labels_array + [-100] * (max_len - len(labels_array))

        # Verify lengths before adding to lists
        if len(input_sequence) == max_len and len(attention_mask) == max_len and len(labels_array) == max_len:
            input_ids.append(torch.tensor(input_sequence))
            attention_masks.append(torch.tensor(attention_mask))
            labels.append(torch.tensor(labels_array))

    # Convert lists to tensors
    input_ids_tensor = torch.stack(input_ids)
    attention_masks_tensor = torch.stack(attention_masks)
    labels_tensor = torch.stack(labels)

    # print(f"Input shape: {input_ids_tensor.shape}")
    # print(f"Attention mask shape: {attention_masks_tensor.shape}")
    # print(f"Labels shape: {labels_tensor.shape}")

    return input_ids_tensor, attention_masks_tensor, labels_tensor

# Usage
train_inp, masks_train, train_out = prepare_data(inp_train, out_train, 1024)
test_inp, masks_test, test_out = prepare_data(inp_test, out_test, 1024)

# Usage
train_inp, masks_train, train_out = prepare_data(inp_train, out_train, 1024)
test_inp, masks_test, test_out = prepare_data(inp_test, out_test, 1024)
# print(train_inp[0])
# print(train_out[0][1023])


287


### **Model**

In [6]:
base_model = GPT2LMHeadModel.from_pretrained("gpt2")
base_model.resize_token_embeddings(len(tokenizer))

# Configure LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# Get LoRA model
model = get_peft_model(base_model, lora_config)
model.to(device)



PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50258, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2SdpaAttention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
     

### **Evaluation Metric**

In [7]:
def calculate_rouge_scores(generated_answers, ground_truth):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    total_rouge1, total_rouge2, total_rougeL = 0, 0, 0
    for gen, ref in zip(generated_answers, ground_truth):
        scores = scorer.score(gen, ref)
        total_rouge1 += scores['rouge1'].fmeasure
        total_rouge2 += scores['rouge2'].fmeasure
        total_rougeL += scores['rougeL'].fmeasure
    average_rouge1 = total_rouge1 / len(generated_answers)
    average_rouge2 = total_rouge2 / len(generated_answers)
    average_rougeL = total_rougeL / len(generated_answers)
    return average_rouge1, average_rouge2, average_rougeL

### **Train Model**

In [9]:
# model = GPT2SoftPrompt("gpt2", num_prompts)

# # Freeze GPT-2 model weights
# for param in model.gpt2_model.parameters():
#     param.requires_grad = False
# lora_model = lora_model.to(device)

train_dataset = torch.utils.data.TensorDataset(train_inp, masks_train, train_out)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=4, shuffle=True)


num_epochs = 3
learning_rate = 5e-5
clip_value = 1.0
# criterion = nn.CrossEntropyLoss(ignore_index)
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs)# prompt_id = prompt_id.to(device)
train_losses = []
# val_losses = []


for epoch in range(num_epochs):
    train_rouge1_scores = []
    train_rouge2_scores = []
    train_rougeL_scores = []
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}')
    for batch in progress_bar:
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, labels = batch

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_postfix({'loss': loss.item()})

    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1} Average Loss: {avg_loss:.4f}')
        #     # Convert tensor predictions and references to lists
        #     predictions = logits.argmax(dim=-1).squeeze(0).tolist()
        #     references = summary.squeeze(0).tolist()

        #     # Decode predictions and references, ignoring pad tokens
        #     decoded_predictions = tokenizer.decode([token for token in predictions if token != pad_token])
        #     decoded_references = tokenizer.decode([token for token in references if token != pad_token])

        #     rouge1, rouge2, rougeL = calculate_rouge_scores([decoded_predictions], [decoded_references])
        #     train_rouge1_scores.append(rouge1)
        #     train_rouge2_scores.append(rouge2)
        #     train_rougeL_scores.append(rougeL)


        # # # Calculate average training loss
        # avg_train_loss = total_loss / len(train_inp)  # Use len(train_inp) for average
        # train_losses.append(avg_train_loss)
        # avg_train_rouge1_score = sum(train_rouge1_scores) / len(train_rouge1_scores)
        # avg_train_rouge2_score = sum(train_rouge2_scores) / len(train_rouge2_scores)
        # avg_train_rougeL_score = sum(train_rougeL_scores) / len(train_rougeL_scores)

        # print("Average Training ROUGE-1 Score:", avg_train_rouge1_score)
        # print("Average Training ROUGE-2 Score:", avg_train_rouge2_score)
        # print("Average Training ROUGE-L Score:", avg_train_rougeL_score)



        # Validation loop
        # model.eval()
        # total_val_loss = 0
        # correct = 0
        # total = 0

        # with torch.no_grad():
        #     for batch in dataloader_val:
        #         context_words, target_words = batch
        #         context_words = context_words.to(device)
        #         target_words = target_words.to(device)

        #         outputs = model(context_words, target_words[:, :-1])

        #         outputs = outputs.contiguous().view(-1, outputs.size(-1))
        #         target_words_out = target_words[:, 1:].contiguous().view(-1)

        #         loss = criterion(outputs, target_words_out)

        #         total_val_loss += loss.item()

        #         _, predicted = torch.max(outputs, 1)
        #         total += target_words_out.size(0)
        #         correct += (predicted == target_words_out).sum().item()


        # avg_val_loss = total_val_loss / len(dataloader_val)
        # val_losses.append(avg_val_loss)
        # accuracy = 100 * correct / total

        # print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f},')
            #    Val Loss: {avg_val_loss:.4f}, Val Accuracy: {accuracy:.2f}%')



Epoch 1/3: 100%|██████████| 72/72 [01:28<00:00,  1.23s/it, loss=5.27]


Epoch 1 Average Loss: 18.3177


Epoch 2/3: 100%|██████████| 72/72 [01:24<00:00,  1.17s/it, loss=5.61]


Epoch 2 Average Loss: 5.2630


Epoch 3/3: 100%|██████████| 72/72 [01:24<00:00,  1.18s/it, loss=4.51]

Epoch 3 Average Loss: 4.9562





In [10]:
# lora_model.eval()

# # Input text for summarization
article = "archaeologists have uncovered the complete skeleton of a 17th-century camel that was likely used in the second ottoman-habsburg war. they believe the camel - the first intact camel skeleton found in central europe - may have been left in the town of tulln for trading after the siege of vienna in 1683. in a country where cows dominate the rural landscape, the discovery in an austrian cellar shocked scientists. the researchers described it as a 'sunken ship in the desert'. archaeologists have uncovered the complete skeleton of an 'alien' 17th-century camel that was likely used in the second ottoman-habsburg war. they believe the camel - the first intact camel skeleton found in central europe - may have been left in the town of tulln for trading after the siege of vienna in 1683 . genetic analysis of the beast show that it was a bactrian-dromedary hybrid – a breed popular in the ottoman army at the time. 'the partly excavated skeleton was at first suspected to be a large horse or cattle,' said alfred galik, a researcher at the university of veterinary medicine vienna. 'but one look at the cervical vertebrae, the lower jaw and the metacarpal bones immediately revealed that this was a camel.' the camel was male, around seven years old and most likely castrated. the camel was male, around seven years old and most likely castrated. along with dna evidence, the shape of the animal's skull indicated it was a hybrid . the cross-breed camel had been most likely used as a riding and transport animal 683. pictured are various views of the camel's metacarpus, which is part of its legs . the battle that took place on 11 september 1683 after vienna had been seiged  it was won  historians claim the battle marked the turning-point in the ottoman–habsburg wars, a 300-year struggle between the holy roman empire and the ottoman empire . the loot that fell into the hands of the holy league troops and the viennese was large. king john sobieski vividly described in a letter to his wife a few days after the battle: . 'ours are treasures unheard of... tents, sheep, cattle and no small number of camels... it is victory as nobody ever knew before.' galik and his team also said the cross-breed camel had been most likely used as a riding and transport animal 683. the remarkable find was made during an archaeological dig that took place amid preparations for a new shopping centre in the town. if modern-day scientists were stumped  it would have been an even greater shock for residents of 17th-century tulln. 'the animal was certainly exotic for the people of tulln. they probably didn't know what to feed it or whether one could eat it,' galik said. while roman-era camel bones occasionally surface in austria, serbia and belgium, the tulln discovery was the first complete camel skeleton to emerge in central europe. 'this means that the animal was not killed and then butchered. it may have been acquired as part of an exchange,' said galik . the remarkable find was made during an archaeological dig that took place amid preparations for a new shopping centre in the town. pictured the camel's shoulder blades . in addition to horses, the ottoman army also used camels for transportation and as riding animals. in cases of scarcity, the soldiers also ate the animal's flesh. the camel was likely used in the 1683 battle of vienna, which took place on 11 september after vienna had been seiged  it was won  historians claim the battle marked the turning-point in the ottoman–habsburg wars, a 300-year struggle between the holy roman empire and the ottoman empire . the loot that fell into the hands of the holy league troops and the viennese was large. king john sobieski vividly described in a letter to his wife a few days after the battle: . 'ours are treasures unheard of... tents, sheep, cattle and no small number of camels... it is victory as nobody ever knew before.' in a country where cows dominate the rural landscape, the discovery in an austrian cellar shocked scientists. the researchers described it as a 'sunken ship in the desert'. the find was made it the town of tulln . genetic analysis of the beast show that it was a bactrian (pictured)-dromedary hybrid – a breed popular in the ottoman army at the time ."
# # Tokenize and encode the input text
# input_ids = tokenizer.encode(input_text, truncation=True, max_length=512)
# input_ids = torch.tensor(input_ids)
# input_ids = input_ids.unsqueeze(0) if input_ids.dim() == 1 else input_ids
# input_ids = input_ids.to(device)

# # Convert the input_ids to a PyTorch tensor

# with torch.no_grad():
#     generated_ids = lora_model.generate(
#         input_ids,
#         max_length=513,  # Adjust as needed for summary length
#         num_beams=5,     # Beam search for better generation quality
#         early_stopping=True
#     )


# # Get the token IDs with the highest probability for each position
# # predicted_token_ids = torch.argmax(pred_logits, dim=-1)

# # Convert token IDs into words using the tokenizer
# predicted_tokens = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
# print(predicted_tokens)
model.eval()

# Encode article
inputs = tokenizer.encode(
    article,
    truncation=True,
    max_length=1024-150,
    return_tensors='pt'
).to(device)

# Add summary token
input_ids = torch.cat([
    inputs,
    torch.tensor([[tokenizer.additional_special_tokens_ids[0]]]).to(device)
], dim=1)

# Generate summary
with torch.no_grad():
    outputs = model.generate(
        input_ids=input_ids,
        max_length=input_ids.shape[1] + 150,
        min_length=input_ids.shape[1] + 30,
        num_beams=4,
        length_penalty=2.0,
        no_repeat_ngram_size=3,
        early_stopping=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

# Decode and extract summary
decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)
summary = decoded.split('[SUM]')[1].strip()
print(summary)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


triumph as
