**Imports and Data Loading**

In [1]:
import re
import random
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelWithLMHead
import torch.optim as optim

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

**Amazon Reviews Dataset**

In [3]:
reviews = pd.read_csv('/kaggle/input/amazon-fine-food-reviews/Reviews.csv')
reviews.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
len(reviews)

568454

In [None]:
reviews.columns

In [None]:
reviews.Text.values[:5]

In [None]:
reviews.Summary.values[:5]

In [98]:
len(reviews['Text'][0])

263

In [107]:
max_len = np.max([len(tokenizer.encode(length)) for length in reviews['Text']])
max_len

1384

In [108]:
mean_len = np.mean([len(tokenizer.encode(length)) for length in reviews['Text']])
mean_len

93.542

**Pre Processing**

In [5]:
reviews.dropna(subset=['Text', 'Summary'], inplace=True)
reviews.drop_duplicates(subset=['Text', 'Summary'], inplace=True)

In [6]:
len(reviews)

394967

In [7]:
reviews = reviews.head(10000)

In [8]:
# import spacy
# import string
# import pandas as pd

# # Load the spaCy model
# nlp = spacy.load("en_core_web_sm")

**Data Preprocessing**

In [9]:
# def preprocess_text(text):
#     # Create a spaCy document object
#     doc = nlp(text)
    
#     # Generate a list of lemmatized tokens that are not punctuation, special characters, or stop words
#     tokens = [
#         token.lemma_.lower()  # Use lemma and lower case
#         for token in doc
#         if token.text.isalnum() and not token.is_punct and not token.is_space and not token.is_stop
#     ]
    
#     # Join tokens to form the back the sentence
#     clean_text = ' '.join(tokens)
#     return clean_text

In [10]:
# reviews['Text'] = reviews['Text'].apply(preprocess_text)
# reviews['Text'] = reviews['Text'].apply(preprocess_text)

In [11]:
reviews.reset_index(drop=True, inplace=True)

**Setup and Training**

In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side='left')
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")



In [13]:
model.load_state_dict(torch.load("/kaggle/working/gpt2_model_state_dict.pth"))

<All keys matched successfully>

In [14]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token

'<|endoftext|>'

In [23]:
# model = model.to(device)
# optimizer = optim.AdamW(model.parameters(), lr=5e-4)
optimizer = optim.AdamW(model.parameters(), lr=1e-7, weight_decay=0.00001)

In [17]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(reviews, test_size=0.25, random_state=42)

train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [18]:
class GPT2SummaryDataset(Dataset):
    def __init__(self, tokenizer, data, max_length=100):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data['Text'][idx]
        summary = self.data['Summary'][idx]
        combined_text = text + " " + self.tokenizer.eos_token + " " + summary
        encoding = self.tokenizer.encode_plus(
            combined_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        labels = input_ids.clone()
        labels = torch.roll(labels, -1)
        labels[-1] = -100  
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }
    
train_dataset = GPT2SummaryDataset(tokenizer, train_df)
test_dataset = GPT2SummaryDataset(tokenizer, test_df)

In [19]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

**GPT-2 Model Finetuning**

In [20]:
from tqdm import tqdm 

def train_model(model, data_loader, optimizer, device, epochs=1):
    model.train()
    for epoch in tqdm(range(epochs)):
        idx = 0
        for batch in data_loader:
            idx += 1
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
#             print("Input: " + tokenizer.decode(input_ids[0]))
#             logits = outputs.logits
#             predicted_ids = torch.argmax(logits, dim=-1)
#             print("Output: " + tokenizer.decode(predicted_ids[0]))

            loss = outputs.loss
            loss.backward()
            optimizer.step()
            if idx % 5 == 0:
                print(f"Loss: {loss.item()}, {idx}")

In [None]:
train_model(model, train_loader, optimizer, device, epochs=10)

In [116]:
model_path = "/kaggle/working/gpt2_model_state_dict.pth"
torch.save(model.state_dict(), model_path)

In [22]:
# from transformers import AutoModelForCausalLM

# model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

model.load_state_dict(torch.load("/kaggle/working/gpt2_model_state_dict.pth"))
model = model.to(device)

# model.eval()

**Review Summarization**

In [82]:
def model_infer(model, tokenizer, text, max_length=100, device='cpu'):
    model.eval()
    model.to(device)

    encoding = tokenizer.encode_plus(
        text,
        return_tensors='pt',
        max_length=max_length,
        truncation=True,
        padding='max_length',
        add_special_tokens=True
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    summary_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length + 2,
        pad_token_id=tokenizer.eos_token_id
    )

    review_text_len = len(text.split(tokenizer.pad_token)[0])

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)[review_text_len:]

    return summary

In [74]:
from tqdm import tqdm
# from rouge import Rouge

def test_model(model, test_loader, tokenizer, device='cpu'):
    model.eval()
    model.to(device)
    output_summaries = []
    real_summaries = []
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
#         print(tokenizer.decode(input_ids[0]))

        for i in range(input_ids.shape[0]):
            input_text = tokenizer.decode(input_ids[i], skip_special_tokens=True)
            summary = model_infer(model, tokenizer, input_text, device=device)
            output_summaries.append(summary)
#             print(tokenizer.decode(input_ids[i]))
#             print()
            real_summary = tokenizer.decode(input_ids[i]).rsplit(tokenizer.pad_token)[-1]
            real_summaries.append(real_summary)

    return output_summaries, real_summaries

output_summaries, real_summaries = test_model(model, test_loader, tokenizer, device=device)

# rouge = Rouge()
# scores = rouge.get_scores(output_summaries, real_summaries, avg=True)
# print(scores)

100%|██████████| 24/24 [01:07<00:00,  2.82s/it]


In [None]:
# !pip install rouge_score
# !pip install rouge

In [35]:
from rouge_score import rouge_scorer

def compute_rouge_scores(references, predictions):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

    for ref, pred in zip(references, predictions):
        score = scorer.score(ref, pred)
        scores['rouge1'].append(score['rouge1'].fmeasure)
        scores['rouge2'].append(score['rouge2'].fmeasure)
        scores['rougeL'].append(score['rougeL'].fmeasure)

    avg_scores = {key: np.mean(val) for key, val in scores.items()}
    return avg_scores

In [117]:
rouge_scores = compute_rouge_scores(real_summaries, output_summaries)
print(rouge_scores)

{'rouge1': 0.6514285714285715, 'rouge2': 0.5766666666666665, 'rougeL': 0.6514285714285715}


In [126]:
n = 20

sample_reviews = [reviews.iloc[i]['Text'] + tokenizer.pad_token + " " for i in range(n)]
sample_summaries = [reviews.iloc[i]['Summary'] for i in range(n)]
sample_inputs = [reviews.iloc[i]['Text'] + " " + tokenizer.pad_token + " " + reviews.iloc[i]['Summary'] for i in range(n)]

output_summaries = []
for review in sample_inputs:
    summary = model_infer(model, tokenizer, review, max_length=100, device=device)
    output_summaries.append(summary)

In [127]:
sample_summaries[:10]

['Good Quality Dog Food',
 'Not as Advertised',
 '"Delight" says it all',
 'Cough Medicine',
 'Great taffy',
 'Nice Taffy',
 'Great!  Just as good as the expensive brands!',
 'Wonderful, tasty taffy',
 'Yay Barley',
 'Healthy Dog Food']

In [128]:
output_summaries[:10]

[' Good Quality Dog Food Quality Food',
 ' Not as Advertised any.',
 '',
 ' Cough Medicine Sassr',
 ' Great taffy a.',
 '',
 ' Great!  Just as good as the expensive brands! Very!',
 ' Wonderful, tasty taffy fullied',
 ' Yay Barley  for',
 ' Healthy Dog Food Good Dog']

In [129]:
references = sample_summaries
predictions = output_summaries
rouge_scores = compute_rouge_scores(references, predictions)
print(rouge_scores)

{'rouge1': 0.6705396723906955, 'rouge2': 0.6140873015873015, 'rougeL': 0.6705396723906955}
