In [1]:
#make imports

import os
import sys
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import WordNetLemmatizer

In [2]:
nltk.download('wordnet')
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/co

In [3]:
#import data

data_fp = "/kaggle/input/review-data/Reviews.csv"

df = pd.read_csv(data_fp, on_bad_lines="skip")
df.dropna(inplace=True)
print(df.head)

<bound method NDFrame.head of             Id   ProductId          UserId                      ProfileName  \
0            1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1            2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2            3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3            4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4            5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   
...        ...         ...             ...                              ...   
568449  568450  B001EO7N10  A28KG5XORO54AY                 Lettie D. Carter   
568450  568451  B003S1WTCU  A3I8AFVPEE8KI5                        R. Sawyer   
568451  568452  B004I613EE  A121AA1GQV751Z                    pksd "pk_007"   
568452  568453  B004I613EE   A3IBEVCTXKNOH          Kathy A. Welch "katwel"   
568453  568454  B001LR2CU2  A3LGQPJCZVL9UC                         srfell17   

        HelpfulnessNu

In [4]:
def preprocess_text(text):

    #remove the html tags / links
    text = re.sub(r'<.*?>', '', text)

    #remove special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    #lemmatize the text
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

    #normalize the text
    text = text.lower()

    return text

In [5]:
#pick a subset of the dataframe to work with

df = df.sample(1600)

In [6]:
df['clean_summary'] = df['Summary'].apply(preprocess_text)

In [7]:
#preprocess the 'Text' and 'Summary' columns and store them in new columns 'clean_text' and 'clean_summary'

df['clean_text'] = df['Text'].apply(preprocess_text)

In [8]:
#make train-test split

from sklearn.model_selection import train_test_split

X = df['clean_text'].tolist()
y = df['clean_summary'].tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [9]:
#make reviews list with appropriate task designator

reviews = [(text + " TL;DR " + summary) for text, summary in zip(X_train, y_train)]

In [10]:
#print average length of the text + tldr + summary

print(np.mean([len(review.split()) for review in reviews]))

81.50833333333334


In [11]:
#import tokenizer and model

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [12]:
#set gpu

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [13]:
#implement custom dataset class
from torch.utils.data import Dataset, DataLoader

class SummaryDataset(Dataset):

    def __init__(self, reviews, tokenizer):
        self.reviews = reviews
        self.max_length = 200
        self.tokenizer = tokenizer
        self.eos = self.tokenizer.eos_token
        self.eos_id = self.tokenizer.eos_token_id
        self.seqs = []

        for review in self.reviews:
            
            #encode the text and add eos to the end
            encoded_review = self.tokenizer.encode(review + self.eos)

            #pad and truncate
            padded_seq = self.prepare_sequence(encoded_review)

            #create a tensor and add to the list of sequences
            self.seqs.append(padded_seq)

    def __getitem__(self, idx):
        return self.seqs[idx]
    
    def __len__(self):
        return len(self.seqs)
    
    def prepare_sequence(self, text):

        # find length of encoded " TL;DR " and encoded text
        tldr_length = len(self.tokenizer.encode(" TL;DR "))
        encoded_text_length = len(text) - tldr_length

        if (encoded_text_length <= self.max_length):
            difference = self.max_length - encoded_text_length
            sequence = text + [self.eos_id]*difference
        else:
            sequence = text[:self.max_length+3] + [self.eos_id]
            
        labels = sequence[1:] + [self.eos_id]

        return {
            'input_ids': torch.tensor(sequence).to(device),
            'labels': torch.tensor(labels).to(device)
        }

In [14]:
#init custom dataset class into a dataloader

BATCH_SIZE = 24

train_dataset = SummaryDataset(reviews, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [15]:
#load optimizer 
import torch.optim as optim
optimizer = optim.AdamW(model.parameters(), lr=0.01, weight_decay=1e-4)

#send model to device
model = model.to(device)

In [16]:
from tqdm import tqdm

def run_epochs(model, optimizer, train_loader, num_epochs):

    for epoch in tqdm(range(num_epochs), desc="Epochs"):

        model.train()
        train_loss = 0

        for idx,batch in enumerate(train_loader):

            #zero the gradients
            optimizer.zero_grad()

            #forward pass to calculate loss
            outputs = model(**batch)
            loss = outputs[0]

            #backward pass to calculate gradients
            loss.backward()
            optimizer.step()

            torch.cuda.empty_cache()

            if (idx%500 == 0):
                print(f"Loss: {loss.item()}")

In [17]:
run_epochs(model, optimizer, train_loader, 5)

Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Loss: 11.383487701416016


Epochs:  20%|██        | 1/5 [01:08<04:32, 68.12s/it]

Loss: 2.850433826446533


Epochs:  40%|████      | 2/5 [02:19<03:29, 69.90s/it]

Loss: 2.6383824348449707


Epochs:  60%|██████    | 3/5 [03:31<02:21, 70.85s/it]

Loss: 2.605344533920288


Epochs:  80%|████████  | 4/5 [04:43<01:11, 71.32s/it]

Loss: 2.5957252979278564


Epochs: 100%|██████████| 5/5 [05:55<00:00, 71.11s/it]


In [18]:
#generate summaries from the validation set and calculate rouge scores

model.eval()

def generate_summary(review_text):
    # tokenize input review
    inputs = tokenizer.encode(review_text+" TL;DR ", return_tensors='pt').to(device)

    # generate summary
    with torch.no_grad():
        summary_ids = model.generate(inputs, max_length=len(inputs[0])+30, num_beams=10, repetition_penalty=3.0, length_penalty=3.0, early_stopping=False)
    
    # decode summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
    
# Example review
sample_review = X_test[15]

# Generate summary
gen_summary = generate_summary(sample_review).split("TL;DR")[-1].strip()
print("Original review text was: ", sample_review)
print("Original summary was: ", y_test[15])
print("The generated summary is: ", gen_summary)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Original review text was:  i steep in half the water for min and drink it down it certainly seems to open up my passage way and moreso clear my chest fairly quickly its a simple easy fast remedy when you feel like heck all over i wa shocked how it opened up my airway a if i took a shot of an inhaler but how nice it wa not i didnt need to drink it all day either a cup or two a day did me good on a typical congestive day try it at least once a an alternative to heavier drug
Original summary was:  opens up the airway
The generated summary is:  tea great very best is k product love taste coffee so flavor delicious chip this be justant barented badel favorite tasty price better they potato


In [20]:
#calculate rouge scores
!pip install rouge_score

from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

rouge_1_scores = []
rouge_2_scores = []
rouge_L_scores = []

for review_text, summary in zip(X_test, y_test):
    generated_summary = generate_summary(review_text[:200])
    scores = scorer.score(summary[:200], generated_summary) #reference summary, generated summary
    rouge_1_scores.append(scores["rouge1"])
    rouge_2_scores.append(scores["rouge2"])
    rouge_L_scores.append(scores["rougeL"])



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

In [21]:
#print average rouge scores

np_rouge1 = np.array(rouge_1_scores)
np_rouge2 = np.array(rouge_2_scores)
np_rougeL = np.array(rouge_L_scores)

average_precision = np.mean(np_rouge1[:, 0])  #precision is at index 0
average_recall = np.mean(np_rouge1[:, 1])  #recall is at index 1
average_f1 = np.mean(np_rouge1[:, 2])  #F1-score is at index 2

# Print average scores
print("Average Precision Rouge1:", average_precision)
print("Average Recall Rouge1:", average_recall)
print("Average F1-Score Rouge1:", average_f1)
print("\n\n\n")

average_precision = np.mean(np_rouge2[:, 0])  #precision is at index 0
average_recall = np.mean(np_rouge2[:, 1])  #recall is at index 1
average_f1 = np.mean(np_rouge2[:, 2])  #F1-score is at index 2

# Print average scores
print("Average Precision Rouge2:", average_precision)
print("Average Recall Rouge2:", average_recall)
print("Average F1-Score Rouge2:", average_f1)
print("\n\n\n")

average_precision = np.mean(np_rougeL[:, 0])  #precision is at index 0
average_recall = np.mean(np_rougeL[:, 1])  #recall is at index 1
average_f1 = np.mean(np_rougeL[:, 2])  #F1-score is at index 2

# Print average scores
print("Average Precision RougeL:", average_precision)
print("Average Recall RougeL:", average_recall)
print("Average F1-Score RougeL:", average_f1)
print("\n\n\n")

Average Precision Rouge1: 0.03824570665682176
Average Recall Rouge1: 0.581611007048507
Average F1-Score Rouge1: 0.07036157672454921




Average Precision Rouge2: 0.008621854834841156
Average Recall Rouge2: 0.15591332972582972
Average F1-Score Rouge2: 0.015921809058515916




Average Precision RougeL: 0.030266984003462373
Average Recall RougeL: 0.47758377733377727
Average F1-Score RougeL: 0.055790309895016106






In [22]:
#save the model

torch.save(model.state_dict(), "finetuned_final_final.pth")