### **Importing Libraries**

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from transformers import GPT2LMHeadModel, GPT2Tokenizer,GPT2Model, GPT2Config, AdamW
from nltk.tokenize import word_tokenize
import numpy as np
import torch.optim as optim
from rouge_score import rouge_scorer
import string
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction 
import os
import re
import random
import csv
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

[nltk_data] Downloading package punkt to /home/chetan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/chetan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


cuda


### **Dataset Loading**

In [2]:
def load_csv(file_path):
    data = []

    with open(file_path, mode='r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        
        for row in reader:
            data.append({
                'id': row['id'],
                'article': row['article'],
                'highlights': row['highlights']
            })

    return data


train_data = load_csv('./Cleaned_Dataset/train.csv')
test_data = load_csv('./Cleaned_Dataset/test.csv')
val_data = load_csv('./Cleaned_Dataset/validation.csv')


In [3]:
def clean_text(text):
    pattern = r"(?i)(PUBLISHED:\s*.\s*\d{1,2}:\d{2}\s*(EST|PST),\s*\d{1,2}\s\w+\s\d{4}\s*.\s*\|\s*.\s*UPDATED:\s*.\s*\d{1,2}:\d{2}\s*(EST|PST),\s*\d{1,2}\s\w+\s\d{4})|" \
              r"(By\s*.\s*[A-Za-z\s]+.)|" \
              r"(\([A-Za-z\s]*CNN\)\s*--)|" \
              r"(Follow\s*@@[A-Za-z0-9_]+)|" \
              r"(UPDATED:\s*.\s*\d{1,2}:\d{2}\s*(EST|PST),\s*\d{1,2}\s\w+\s\d{4})|" \
              r"(Last\s*updated\s*at\s*\d{1,2}:\d{2}\s*(AM|PM)\s*on\s*\d{1,2}(st|nd|rd|th)\s*\w+\s\d{4}\s*.)|" \
              r"(\(CNN\))"
    
    cleaned_text = re.sub(pattern, '', text).strip()
    cleaned_text = cleaned_text.lower()
    
    return cleaned_text

def clean_articles(data):
    for entry in data:
        entry['article'] = clean_text(entry['article'])
        entry['highlights'] = clean_text(entry['highlights'])
    
    return data

def write_csv(file_path, cleaned_data):
    with open(file_path, mode='w', encoding='utf-8', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=['id', 'article', 'highlights'])
        writer=writer
        writer.writeheader()
        
        for row in cleaned_data:
            writer.writerow(row)

# train_data = clean_articles(train_data)
# test_data = clean_articles(test_data)
# val_data = clean_articles(val_data)

# write_csv("./Cleaned_Dataset/train.csv", train_data)
# write_csv("./Cleaned_Dataset/test.csv", test_data)
# write_csv("./Cleaned_Dataset/validation.csv", val_data)
            
# def write_csv(file_path, cleaned_data, percentage=1):
#     # Calculate how many rows to write based on the percentage
#     data_size = len(cleaned_data)
#     num_rows = data_size * percentage // 100

#     with open(file_path, mode='w', encoding='utf-8', newline='') as file:
#         writer = csv.DictWriter(file, fieldnames=['id', 'article', 'highlights'])
#         writer.writeheader()
        
#         # Write only the first 'num_rows' rows of the data
#         for row in cleaned_data[:num_rows]:
#             writer.writerow(row)

# # Assuming train_data, test_data, val_data are your datasets
# write_csv("./train.csv", train_data)
# write_csv("./test.csv", test_data)
# write_csv("./validation.csv", val_data)



### **Intialize special Tokens**

In [4]:
# import tokenizer for padding
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add the special <SEP> token to the tokenizer
special_tokens_dict = {'sep_token': '<SEP>','pad_token': '<PAD>'}
tokenizer.add_special_tokens(special_tokens_dict)
# Verify that the <SEP> token has been added
sep_token_id = tokenizer.convert_tokens_to_ids('<SEP>')
pad_token_id = tokenizer.convert_tokens_to_ids('<PAD>')
print(f"<SEP> token ID: {sep_token_id}")
print(f"<PAD> token ID: {pad_token_id}")


# Initializing Pad tokens
pad_token = tokenizer.eos_token_id
# tokenizer.add_tokens([pad_token])


<SEP> token ID: 50257
<PAD> token ID: 50258




### **Tokenizing the Data**

In [5]:
def convertCSV(data):
    inp = []
    out = []
    for row in data:
        inp.append(row['article'])
        out.append(row['highlights'])
    
    return inp, out

inp_train, out_train = convertCSV(train_data)
inp_test, out_test = convertCSV(test_data)

train_size = int(0.004 * len(inp_train))
inp_train_10 = inp_train[:train_size]
out_train_10 = out_train[:train_size]

test_size = int(0.004 * len(inp_test))
inp_test_10 = inp_test[:test_size]
out_test_10 = out_test[:test_size]

# print(inp_train[0])
def remove_punctuation(tokenized_sentence):
    return [word for word in tokenized_sentence if word not in string.punctuation]
#Using NLTK Tokenize

inp_train = [remove_punctuation(word_tokenize(sentence)) for sentence in inp_train_10]
inp_test = [remove_punctuation(word_tokenize(sentence)) for sentence in inp_test_10]
out_train = [remove_punctuation(word_tokenize(sentence)) for sentence in out_train_10]
out_test = [remove_punctuation(word_tokenize(sentence)) for sentence in out_test_10]

print(len(inp_train))
# print(out_train[0])

max_len = 0
for i in inp_train:
    # if max_len < len(i):
    max_len += len(i)
    
print(max_len/len(inp_train))

# def tokenize(data,max_len = 1000):

# def prepare_data(sentences,pad, max_len=1024):
#     all_indices = []
#     for _, sentence in enumerate(sentences):

#         tokens = tokenizer.encode(sentence,truncation=True,max_length=max_len)
#         padded_tokens = torch.tensor(tokens + [pad] * (max_len - len(tokens)))
        
#         all_indices.append(padded_tokens)
        
#     return all_indices


# train_inp = prepare_data(inp_train,pad_token,1024)
# test_inp = prepare_data(inp_test,pad_token,1024)
# train_out = prepare_data(out_train,pad_token,1024)
# test_out = prepare_data(out_test,pad_token, 1024)
def prepare_data(articles, summaries, pad_token_id, max_len=1024):
    all_data = []
    for article, summary in zip(articles, summaries):
        input_text = f"{article} <SEP> {summary}"
        
        tokens = tokenizer.encode(input_text, truncation=True, max_length=max_len)
        
        input_ids = tokens[:-1]
        labels = tokens[1:]
        
        padded_input_ids = input_ids + [pad_token_id] * (max_len - 1 - len(input_ids))
        padded_labels = labels + [pad_token_id] * (max_len - 1 - len(labels))
        
        # Convert to tensors
        input_tensor = torch.tensor(padded_input_ids)
        label_tensor = torch.tensor(padded_labels)
        
        all_data.append((input_tensor, label_tensor))
    
    return all_data
# Prepare the training and testing datasets
train_data = prepare_data(inp_train, out_train, pad_token_id, 1024)
test_data = prepare_data(inp_test, out_test, pad_token_id, 1024)

# print(train_inp[0])
# print(train_out[0][1023])


1148
670.3214285714286


### **Model**

In [6]:
class GPT2SummarizationFineTune(nn.Module):
    def __init__(self, model_name="gpt2"):
        super().__init__()
        self.gpt2 = GPT2LMHeadModel.from_pretrained(model_name)
        
        for param in self.gpt2.parameters():
            param.requires_grad = False
        
        for param in self.gpt2.lm_head.parameters():
            param.requires_grad = True
        
        for param in self.gpt2.transformer.h[-1].parameters():
            param.requires_grad = True

    def forward(self, input_ids, labels=None):
        if labels is not None:
            outputs = self.gpt2(input_ids=input_ids, labels=labels)
        else:
            outputs = self.gpt2(input_ids=input_ids)
        return outputs

### **Dataset**

In [7]:
class SummarizationDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]

### **Evaluation Metric**

In [8]:
def calculate_rouge_scores(generated_answers, ground_truth):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    total_rouge1, total_rouge2, total_rougeL = 0, 0, 0
    for gen, ref in zip(generated_answers, ground_truth):
        scores = scorer.score(gen, ref)
        total_rouge1 += scores['rouge1'].fmeasure
        total_rouge2 += scores['rouge2'].fmeasure
        total_rougeL += scores['rougeL'].fmeasure
    average_rouge1 = total_rouge1 / len(generated_answers)
    average_rouge2 = total_rouge2 / len(generated_answers)
    average_rougeL = total_rougeL / len(generated_answers)
    return average_rouge1, average_rouge2, average_rougeL

### **Train Model**

In [9]:
# model = GPT2SummarizationFineTune("gpt2")

# # # Freeze GPT-2 model weights
# # for param in model.gpt2_model.parameters():
# #     param.requires_grad = False
# model = model.to(device)

# num_epochs = 1
# learning_rate = 2e-3
# clip_value = 1.0
# criterion = nn.CrossEntropyLoss(ignore_index=pad_token)
# optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# prompt_id = prompt_id.to(device)
# train_losses = []
# # val_losses = []


# for epoch in range(num_epochs):
#     train_rouge1_scores = []
#     train_rouge2_scores = []
#     train_rougeL_scores = []
#     model.train()
#     total_loss = 0
#     with tqdm(enumerate(zip(train_inp, train_out)), total=len(train_inp), desc=f"Epoch {epoch + 1}/{num_epochs}", unit="batch") as progress:
#         train_percentage_matched = 0
#         train_percentage_matched_ct = 0
#         for _, (article, summary) in progress:
#             context_words = article.to(device)
#             target_words = summary.to(device)

#             optimizer.zero_grad()  # Zero the gradients

#             # Forward pass
#             outputs = model(context_words, labels=target_words)
#             logits = outputs.logits if hasattr(outputs, "logits") else outputs.last_hidden_state

#             # Reshape outputs and targets for loss calculation
#             # outputs = outputs.view(-1, outputs.size(-1))
#             # target_words_out = target_words[:, 1:].view(-1)

#             # Calculate loss
#             loss = criterion(outputs.logits, target_words)

#             # Backward pass and optimization step
#             loss.backward()
#             torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)

#             optimizer.step()

#             total_loss += loss.item()  # Accumulate loss

#             # Convert tensor predictions and references to lists
#             predictions = logits.argmax(dim=-1).squeeze(0).tolist()
#             references = summary.squeeze(0).tolist()

#             # Decode predictions and references, ignoring pad tokens
#             decoded_predictions = tokenizer.decode([token for token in predictions if token != pad_token])
#             decoded_references = tokenizer.decode([token for token in references if token != pad_token])

#             rouge1, rouge2, rougeL = calculate_rouge_scores([decoded_predictions], [decoded_references])
#             train_rouge1_scores.append(rouge1)
#             train_rouge2_scores.append(rouge2)
#             train_rougeL_scores.append(rougeL)


#         # # Calculate average training loss
#         avg_train_loss = total_loss / len(train_inp)  # Use len(train_inp) for average
#         train_losses.append(avg_train_loss)
#         avg_train_rouge1_score = sum(train_rouge1_scores) / len(train_rouge1_scores)
#         avg_train_rouge2_score = sum(train_rouge2_scores) / len(train_rouge2_scores)
#         avg_train_rougeL_score = sum(train_rougeL_scores) / len(train_rougeL_scores)

#         print("Average Training ROUGE-1 Score:", avg_train_rouge1_score)
#         print("Average Training ROUGE-2 Score:", avg_train_rouge2_score)
#         print("Average Training ROUGE-L Score:", avg_train_rougeL_score)



#         # Validation loop
#         # model.eval()
#         # total_val_loss = 0
#         # correct = 0
#         # total = 0

#         # with torch.no_grad():
#         #     for batch in dataloader_val:
#         #         context_words, target_words = batch
#         #         context_words = context_words.to(device)
#         #         target_words = target_words.to(device)

#         #         outputs = model(context_words, target_words[:, :-1])

#         #         outputs = outputs.contiguous().view(-1, outputs.size(-1))
#         #         target_words_out = target_words[:, 1:].contiguous().view(-1)

#         #         loss = criterion(outputs, target_words_out)

#         #         total_val_loss += loss.item()

#         #         _, predicted = torch.max(outputs, 1)
#         #         total += target_words_out.size(0)
#         #         correct += (predicted == target_words_out).sum().item()


#         # avg_val_loss = total_val_loss / len(dataloader_val)
#         # val_losses.append(avg_val_loss)
#         # accuracy = 100 * correct / total

#         print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f},')
#             #    Val Loss: {avg_val_loss:.4f}, Val Accuracy: {accuracy:.2f}%')


def train_model(model, train_dataloader, num_epochs, device, learning_rate=2e-3, pad_token=None):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=pad_token)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate)
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        
        with tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}") as progress:
            for batch_idx, (input_ids, labels) in enumerate(progress):
                input_ids = input_ids.to(device)
                labels = labels.to(device)
                
                optimizer.zero_grad()
                
                # Forward pass
                outputs = model(input_ids)
                logits = outputs.logits
                
                # Reshape logits and labels for loss calculation
                shift_logits = logits[..., :-1, :].contiguous()
                shift_labels = labels[..., 1:].contiguous()
                
                # Calculate loss
                loss = criterion(shift_logits.view(-1, shift_logits.size(-1)),
                               shift_labels.view(-1))
                
                # Backward pass
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                
                total_loss += loss.item()
                
                progress.set_postfix({'loss': loss.item()})
        
        avg_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")
    
    return model


In [10]:
# model.eval()

# # Input text for summarization
# input_text = "the bishop of the fargo catholic diocese in north dakota has exposed potentially hundreds of church members in fargo, grand forks and jamestown to the hepatitis a virus in late september and early october. the state health department has issued an advisory of exposure for anyone who attended five churches and took communion. bishop john folda (pictured) of the fargo catholic diocese in north dakota has exposed potentially hundreds of church members in fargo, grand forks and jamestown to the hepatitis a . state immunization program manager molly howell says the risk is low, but officials feel it's important to alert people to the possible exposure. the diocese announced on monday that bishop john folda is taking time off after being diagnosed with hepatitis a. the diocese says he contracted the infection through contaminated food while attending a conference for newly ordained bishops in italy last month. symptoms of hepatitis a include fever, tiredness, loss of appetite, nausea and abdominal discomfort. fargo catholic diocese in north dakota (pictured) is where the bishop is located ."
# # Tokenize and encode the input text
# input_ids = tokenizer.encode(input_text, truncation=True, max_length=1023)

# # Convert the input_ids to a PyTorch tensor
# input_ids = torch.tensor(input_ids)

# # Generate a summary
# with torch.no_grad():
#     # Assuming single prompt
#     outputs = model(input_ids.to(device),prompt_id)
#     pred_logits = outputs.logits
#     # print(pred_logits.shape)


# # Get the token IDs with the highest probability for each position
# predicted_token_ids = torch.argmax(pred_logits, dim=-1)

# # Convert token IDs into words using the tokenizer
# predicted_tokens = tokenizer.decode([token for token in predicted_token_ids if token != pad_token])
# print(predicted_tokens)

def generate_summary(model, article, tokenizer, device, max_length=150):
    model.eval()
    
    # Prepare input text with SEP token
    input_text = f"{article} <SEP>"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    
    # Generate summary
    with torch.no_grad():
        outputs = model.gpt2.generate(
            input_ids,
            max_length=max_length,
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id,
            # eos_token_id=tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            no_repeat_ngram_size=3,
            # Stop at <SEP> token if encountered again
            eos_token_id=sep_token_id
        )
    
    # Decode the generated summary
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)
    
    # Extract only the summary part (after <SEP>)
    summary = generated_text.split("<SEP>")[-1].strip()
    
    return summary

### **Main Function**

In [11]:

# First, add the special token to the tokenizer
# tokenizer.add_special_tokens({'additional_special_tokens': ['<SEP>']})
# Resize model embeddings to account for new token

# Prepare the data
model = GPT2SummarizationFineTune("gpt2")
model.gpt2.resize_token_embeddings(len(tokenizer))

# Create dataset and dataloader
train_dataset = SummarizationDataset(train_data)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# Train the model
model = train_model(
    model=model,
    train_dataloader=train_dataloader,
    num_epochs=1,
    device=device,
    learning_rate=2e-3,
    pad_token=tokenizer.pad_token_id
)

# # Generate example
# article = "Your test article here..."
# summary = generate_summary(model, article, tokenizer, device)
# print(f"Generated Summary: {summary}")


Epoch 1/1:   0%|          | 0/287 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB. GPU 0 has a total capacity of 3.81 GiB of which 39.31 MiB is free. Including non-PyTorch memory, this process has 3.71 GiB memory in use. Of the allocated memory 3.56 GiB is allocated by PyTorch, and 66.55 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)