### **Importing Libraries**

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import nltk
# nltk.download('punkt')
from transformers import GPT2LMHeadModel, GPT2Tokenizer,GPT2Model, GPT2Config, AdamW
from nltk.tokenize import word_tokenize
import numpy as np
import torch.optim as optim
import string
import matplotlib.pyplot as plt
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction 
import os
import re
import csv
# CUDA_LAUNCH_BLOCKING=1 is the CUDA switch for synchronous kernel launches
# (normally a debugging aid, so errors are reported at the call that caused them).
os.environ.update(CUDA_LAUNCH_BLOCKING='1')

# Run on the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

# Load the pretrained GPT-2 tokenizer once; later cells use it both to encode
# text and to look up the id of the padding token.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

cuda




### **Dataset Loading**

In [2]:
def load_csv(file_path):
    """Read a summarisation CSV into a list of plain dicts.

    Parameters
    ----------
    file_path : str or path-like
        UTF-8 CSV file whose header row contains at least the columns
        ``id``, ``article`` and ``highlights``.

    Returns
    -------
    list of dict
        One ``{'id', 'article', 'highlights'}`` dict per data row, in file
        order. Any additional columns are dropped.

    Raises
    ------
    KeyError
        If one of the three required columns is missing.
    """
    columns = ('id', 'article', 'highlights')

    # newline='' passes raw line endings through to the csv module, as its
    # documentation requires. Without it the text layer rewrites line breaks
    # that sit inside quoted fields before the parser ever sees them.
    with open(file_path, mode='r', encoding='utf-8', newline='') as file:
        reader = csv.DictReader(file)
        return [{column: row[column] for column in columns} for row in reader]


# Read the train, test and validation splits, in that order.
train_data, test_data, val_data = map(
    load_csv,
    ('./Dataset/train.csv', './Dataset/test.csv', './Dataset/validation.csv'),
)


In [None]:
# Timestamp, byline, CNN dateline and "Follow @@handle" boilerplate.
# Compiled once at definition time instead of being rebuilt on every call.
_BOILERPLATE_RE = re.compile(
    r"(?i)(PUBLISHED:\s*.\s*\d{1,2}:\d{2}\s*(EST|PST),\s*\d{1,2}\s\w+\s\d{4}\s*.\s*\|\s*.\s*UPDATED:\s*.\s*\d{1,2}:\d{2}\s*(EST|PST),\s*\d{1,2}\s\w+\s\d{4})|"
    # Byline: only the "By . Some Name ." layout is removed. The dots are
    # literal and "By" must start a word; a wildcard dot here would also eat
    # ordinary prose such as "hit by a car." or the tail of "nearby ...".
    r"(\bBy\s+\.\s+[A-Za-z\s]+\.)|"
    r"(\([A-Za-z\s]*CNN\)\s*--)|"
    r"(Follow\s*@@[A-Za-z0-9_]+)|"
    r"(UPDATED:\s*.\s*\d{1,2}:\d{2}\s*(EST|PST),\s*\d{1,2}\s\w+\s\d{4})|"
    r"(Last\s*updated\s*at\s*\d{1,2}:\d{2}\s*(AM|PM)\s*on\s*\d{1,2}(st|nd|rd|th)\s*\w+\s\d{4}\s*.)|"
    r"(\(CNN\))"
)


def clean_text(text):
    """Strip publishing boilerplate from a text and lower-case it.

    Every match of the boilerplate pattern (PUBLISHED/UPDATED timestamps,
    "By . Name ." bylines, "(CNN) --" datelines, "Follow @@handle" prompts and
    "Last updated at ..." stamps) is deleted, matching case-insensitively.

    Parameters
    ----------
    text : str
        Raw article or highlights text.

    Returns
    -------
    str
        The text without boilerplate, with leading/trailing whitespace
        removed and all characters lower-cased.
    """
    without_boilerplate = _BOILERPLATE_RE.sub('', text)
    return without_boilerplate.strip().lower()

def clean_articles(data):
    """Run ``clean_text`` over the text fields of every record, in place.

    For each dict in ``data`` the ``'article'`` value is replaced first, then
    the ``'highlights'`` value. The same ``data`` object is returned so the
    call can be used in an assignment.
    """
    text_fields = ('article', 'highlights')

    for record in data:
        for field in text_fields:
            record[field] = clean_text(record[field])

    return data

# def write_csv(file_path, cleaned_data):
#     with open(file_path, mode='w', encoding='utf-8', newline='') as file:
#         writer = csv.DictWriter(file, fieldnames=['id', 'article', 'highlights'])
#         writer=writer
#         writer.writeheader()
        
#         for row in cleaned_data:
#             writer.writerow(row)

# Clean the three splits in the order train, test, validation.
train_data, test_data, val_data = (
    clean_articles(split) for split in (train_data, test_data, val_data)
)

# write_csv("./Cleaned_Dataset/train.csv", train_data)
# write_csv("./Cleaned_Dataset/test.csv", test_data)
# write_csv("./Cleaned_Dataset/validation.csv", val_data)


### **Initialize Special Tokens**

In [None]:
# Soft-prompt vocabulary: the prompt text is split on whitespace and each
# piece is mapped to an integer index.
prompt_tokken = "[SUMMARIZE]"
soft_prompt_vocab = ["[SUMMARIZE]"]
soft_prompt_word2idx = dict(zip(soft_prompt_vocab, range(len(soft_prompt_vocab))))

# Indices of the prompt pieces as a 1-D tensor, and how many pieces there are.
prompt_id = torch.tensor([soft_prompt_word2idx[piece] for piece in prompt_tokken.split()])
num_prompts = len(prompt_id)

# Initializing Pad tokens
# The tokenizer's end-of-sequence token is reused as the padding token.
pad_token = tokenizer.eos_token
# NOTE(review): the EOS token is presumably already in the vocabulary, in which
# case add_tokens() adds nothing — confirm, and consider setting
# `tokenizer.pad_token = pad_token` if the tokenizer itself should know the pad token.
tokenizer.add_tokens([pad_token])


### **Tokenizing the Data**

In [None]:
def convertCSV(data):
    """Split records into parallel lists of articles and highlights.

    Parameters
    ----------
    data : iterable of dict
        Records that each carry an ``'article'`` and a ``'highlights'`` key.

    Returns
    -------
    tuple of (list, list)
        ``(articles, highlights)`` in the same order as ``data``.
    """
    # Single pass over the records, so one-shot iterables behave as before.
    pairs = [(record['article'], record['highlights']) for record in data]

    articles = [article for article, _ in pairs]
    summaries = [summary for _, summary in pairs]
    return articles, summaries

# Separate the train and test splits into parallel article / summary lists.
inp_train, out_train = convertCSV(train_data)
inp_test, out_test = convertCSV(test_data)

# Keep only the leading 10% of the training pairs ...
train_size = int(len(inp_train) * 0.1)
inp_train_10, out_train_10 = inp_train[:train_size], out_train[:train_size]

# ... and the leading 10% of the test pairs.
test_size = int(len(inp_test) * 0.1)
inp_test_10, out_test_10 = inp_test[:test_size], out_test[:test_size]

# print(inp_train[0])

# Split every article and summary of the 10% subsets into word-level tokens
# with NLTK. Note that inp_*/out_* are rebound from lists of strings to lists
# of token lists here.
inp_train = list(map(word_tokenize, inp_train_10))
inp_test = list(map(word_tokenize, inp_test_10))
out_train = list(map(word_tokenize, out_train_10))
out_test = list(map(word_tokenize, out_test_10))

# print(inp_train[0])


# def tokenize(data,max_len = 1000):

def prepare_data(sentences,pad, max_len=1024):
    """Encode texts with the module-level ``tokenizer`` and right-pad them.

    Parameters
    ----------
    sentences : iterable of str or of list[str]
        Texts to encode. An item may be a raw string or a sequence of word
        strings (e.g. NLTK ``word_tokenize`` output); word sequences are
        re-joined with single spaces so the tokenizer receives ordinary text.
    pad : str
        Token whose vocabulary id fills the unused tail of every sequence.
    max_len : int, default 1024
        Length of every returned sequence; longer inputs are truncated.

    Returns
    -------
    list of torch.Tensor
        One 1-D tensor of ``max_len`` token ids per input, in input order.
    """
    # Loop-invariant: resolve the padding id once rather than per sentence.
    pad_id = tokenizer.convert_tokens_to_ids(pad)

    all_indices = []
    for sentence in sentences:
        if isinstance(sentence, (list, tuple)) and all(isinstance(word, str) for word in sentence):
            # A list of strings passed to encode() without is_split_into_words
            # is treated as ready-made vocabulary tokens rather than text, so
            # words missing from the vocabulary map to the unknown id.
            # Encode the joined text instead.
            sentence = ' '.join(sentence)

        tokens = tokenizer.encode(sentence, add_special_tokens=True, truncation=True, max_length=max_len)
        padding = [pad_id] * (max_len - len(tokens))
        all_indices.append(torch.tensor(tokens + padding))

    return all_indices


# Model inputs are encoded to 1024 - num_prompts ids.
train_inp, test_inp = (
    prepare_data(split, pad_token, 1024 - num_prompts) for split in (inp_train, inp_test)
)
# Targets are encoded to the full 1024 ids.
train_out, test_out = (
    prepare_data(split, pad_token, 1024) for split in (out_train, out_test)
)
print(len(train_inp[0]))


### **Model**

In [None]:
class GPT2SoftPrompt(torch.nn.Module):
    def __init__(self, model, num_prompts, emb_size = 768):
        super().__init__()
        self.gpt2 = GPT2LMHeadModel.from_pretrained(model)
        self.prompt = torch.nn.Embedding(num_prompts, emb_size)