In [1]:
#make imports

import os
import sys
import pandas as pd
import numpy as np
import re
from nltk.stem import WordNetLemmatizer

In [2]:
#import data

data_fp = "Reviews.csv"

df = pd.read_csv(data_fp, on_bad_lines="skip")
df.dropna(inplace=True)
print(df.head)

<bound method NDFrame.head of             Id   ProductId          UserId                      ProfileName  \
0            1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1            2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2            3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3            4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4            5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   
...        ...         ...             ...                              ...   
568449  568450  B001EO7N10  A28KG5XORO54AY                 Lettie D. Carter   
568450  568451  B003S1WTCU  A3I8AFVPEE8KI5                        R. Sawyer   
568451  568452  B004I613EE  A121AA1GQV751Z                    pksd "pk_007"   
568452  568453  B004I613EE   A3IBEVCTXKNOH          Kathy A. Welch "katwel"   
568453  568454  B001LR2CU2  A3LGQPJCZVL9UC                         srfell17   

        HelpfulnessNu

In [3]:
def preprocess_text(text):

    #remove the html tags / links
    text = re.sub(r'<.*?>', '', text)

    #remove special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    #lemmatize the text
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

    #normalize the text
    text = text.lower()

    return text

In [4]:
#preprocess the 'Text' and 'Summary' columns and store them in new columns 'clean_text' and 'clean_summary'

df['clean_text'] = df['Text'].apply(preprocess_text)

In [5]:
df['clean_summary'] = df['Summary'].apply(preprocess_text)

In [6]:
#pick a subset of the dataframe to work with

df = df.sample(3200)

In [7]:
#make train-test split

from sklearn.model_selection import train_test_split

X = df['clean_text'].tolist()
y = df['clean_summary'].tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [8]:
#import tokenizer and model
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", pad_token="<|endoftext|>")
model = GPT2LMHeadModel.from_pretrained("gpt2")

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [9]:
#implement custom dataset class
from torch.utils.data import Dataset, DataLoader

class SummaryDataset(Dataset):

      def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = 100

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]

        encoded_input = self.prepare_sequence(text)
        encoded_label = self.prepare_sequence(label)

        return encoded_input, encoded_label
    
    def __len__(self):
        return len(self.labels)
    
    def prepare_sequence(self, text):

        # add "TL;DR " at the end so that
        # the model understands the designated summarization task

        # find length of encoded " TL;DR " and encoded text
        tldr_length = len(self.tokenizer.encode(" TL;DR "))
        encoded_text_length = len(self.tokenizer.encode(text))

        if (encoded_text_length + tldr_length >= self.max_length):
            text = text[:self.max_length - tldr_length]
            text += " TL;DR "
        else:
            text += " TL;DR"

        # tokenize the text
        tokenized_text = self.tokenizer.encode(text)

        # pad and truncate the tokenized text
        if len(tokenized_text) < self.max_length:
            padded_text = tokenized_text + [self.tokenizer.pad_token_id] * (self.max_length - len(tokenized_text))
        else:
            padded_text = tokenized_text[:self.max_length]

        return torch.tensor(padded_text)

In [10]:
#initialize custom dataset class into a dataloader

BATCH_SIZE = 8

train_dataset = SummaryDataset(X_train, y_train, tokenizer)
test_dataset = SummaryDataset(X_test, y_test, tokenizer)

train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle=False)

In [11]:
#load optimizer and loss function
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-4)
criterion = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

#send model to device
model = model.to("mps")

In [12]:
from tqdm import tqdm

def run_epochs(model, optimizer, train_loader):

    # train and evaluate the model
    num_epochs = 3

    for epoch in tqdm(range(num_epochs), desc="Epochs"):

        model.train()
        train_loss = 0

        for batch in train_loader:

            #zero the gradients
            optimizer.zero_grad()

            input_batch, label_batch = batch
            input_batch = input_batch.to("mps")
            label_batch = label_batch.to("mps")

            #forward pass to calculate loss
            outputs = model(input_batch)

            logits = outputs[0]

            loss = criterion(logits.view(-1, logits.size(-1)), label_batch.view(-1))

            #backward pass to calculate gradients
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

            torch.mps.empty_cache()
        
        # calculate average train_loss for each epoch
        train_loss /= len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss}")

In [13]:
# run the training loop

run_epochs(model, optimizer, train_loader)

Epochs:  33%|███▎      | 1/3 [12:11<24:22, 731.10s/it]

Epoch 1/3 - Train Loss: 6.32480997244517


Epochs:  67%|██████▋   | 2/3 [24:00<11:58, 718.28s/it]

Epoch 2/3 - Train Loss: 5.359549331665039


Epochs: 100%|██████████| 3/3 [36:05<00:00, 721.79s/it]

Epoch 3/3 - Train Loss: 5.195158469676971





In [14]:
# save the model

torch.save(model.state_dict(), "gpt2_finetuned.pth")

In [34]:
# calculate rouge score on test set

from rouge import Rouge

def calculate_rouge_score(model):
    
        model.eval()
    
        predictions = []
        actuals = []

        for text, summary in zip(X_test, y_test):

            text += " TL;DR "
            
            encoded_text = tokenizer(text, return_tensors="pt")
            encoded_text_length = len(encoded_text)
            encoded_text = encoded_text.to("mps")

            encoded_summary_length = len(tokenizer.encode(summary, return_tensors="pt"))

            # generate the summary
            summary_ids = model.generate(**encoded_text, max_new_tokens = encoded_summary_length, num_beams=4, no_repeat_ngram_size=2, early_stopping=True)
            predicted_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

            predicted_summary = predicted_summary[encoded_text_length:]

            predictions.append(predicted_summary)
            actuals.append(summary)

            print("the length of generated summary is: ", len(predicted_summary))
            print("the length of actual summary is: ", len(summary))
            print(text)
            print("the generated summary is: ", predicted_summary)
            print("the actual summary is: ", summary)

        rouge = Rouge()
        scores = rouge.get_scores(predictions, actuals, avg=True)

        return scores

# calculate the rouge score
rouge_score = calculate_rouge_score(model)
print(rouge_score)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


the length of generated summary is:  1542
the length of actual summary is:  8
i love manuka honey and use it for many purpose i wa on vacation last week and forgot to bring my jar of apihealth active manuka honey with me so i went onto amazon from my iphone and found one that i could have shipped quickly on my amazon prime account which wa the manuka health new zealand active mgo the price at is the going rate albeit expensive so i didnt give that a thought when i ordered itbutwhat i didnt notice is that it only gram and the high quality active manuka honey for which the going rate is is for gram so for a product that is already too expensive theyre selling half the size for the same price making it outrageously expensive in addition when i got home i checked on the amha active manuka honey assoc web site and manuka health new zealand is not a licensee nor do they use the umf standard therefore there is no way to verify the quality or the true activity level of the manuka honey in the 

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


the length of generated summary is:  239
the length of actual summary is:  11
i order these treat because i have a picky chihuahua and they are the only treat that she will eat and can eatshe will eat or at a time running from her bowl to the carpet especially when i come home from workkeep these treat coming TL;DR 
the generated summary is:  order these treat because i have a picky chihuahua and they are the only treat that she will eat and can eatshe will eat or at a time running from her bowl to the carpet especially when i come home from workkeep these treat coming TL;DR DR
the actual summary is:  pennys only


KeyboardInterrupt: 

: 

In [None]:
# print summaries of few texts from the test data

for i in range(5):

    input_text = X_test[i]
    input_text = tokenizer.encode(input_text, return_tensors="pt").to("mps")

    output = model.generate(input_text, max_length=100)
    output = tokenizer.decode(output[0], skip_special_tokens=True)

    print(f"Input Text: {X_test[i]}")
    print(f"Predicted Summary: {output}")
    print(f"Actual Summary: {y_test[i]}")
    print("\n")