In [1]:
#make imports

import os
import sys
import pandas as pd
import numpy as np
import re
from nltk.stem import WordNetLemmatizer

In [2]:
#import data

data_fp = "Reviews.csv"

df = pd.read_csv(data_fp, on_bad_lines="skip")
df.dropna(inplace=True)
print(df.head)

<bound method NDFrame.head of             Id   ProductId          UserId                      ProfileName  \
0            1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1            2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2            3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3            4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4            5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   
...        ...         ...             ...                              ...   
568449  568450  B001EO7N10  A28KG5XORO54AY                 Lettie D. Carter   
568450  568451  B003S1WTCU  A3I8AFVPEE8KI5                        R. Sawyer   
568451  568452  B004I613EE  A121AA1GQV751Z                    pksd "pk_007"   
568452  568453  B004I613EE   A3IBEVCTXKNOH          Kathy A. Welch "katwel"   
568453  568454  B001LR2CU2  A3LGQPJCZVL9UC                         srfell17   

        HelpfulnessNu

In [3]:
def preprocess_text(text):

    #remove the html tags / links
    text = re.sub(r'<.*?>', '', text)

    #remove special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    #lemmatize the text
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

    #normalize the text
    text = text.lower()

    return text

In [4]:
#preprocess the 'Text' and 'Summary' columns and store them in new columns 'clean_text' and 'clean_summary'

df['clean_text'] = df['Text'].apply(preprocess_text)

In [5]:
df['clean_summary'] = df['Summary'].apply(preprocess_text)

In [6]:
#pick a subset of the dataframe to work with

df = df.sample(3200)

In [7]:
#make train-test split

from sklearn.model_selection import train_test_split

X = df['clean_text'].tolist()
y = df['clean_summary'].tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [8]:
#import tokenizer and model
import torch
from transformers import GPT2Tokenizer, GPT2Model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", pad_token="<|endoftext|>")
model = GPT2Model.from_pretrained("gpt2")

In [9]:
#implement custom dataset class
from torch.utils.data import Dataset, DataLoader

class SummaryDataset(Dataset):

    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = 100

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]

        encoded_input = self.prepare_sequence(text)
        encoded_label = self.prepare_sequence(label)

        return encoded_input, encoded_label
    
    def __len__(self):
        return len(self.labels)
    
    def prepare_sequence(self, text):
        # tokenize the text
        tokenized_text = self.tokenizer.encode(text, add_special_tokens=True)

        # pad and truncate the tokenized text
        if len(tokenized_text) < self.max_length:
            padded_text = tokenized_text + [self.tokenizer.pad_token_id] * (self.max_length - len(tokenized_text))
        else:
            padded_text = tokenized_text[:self.max_length]

        return torch.tensor(padded_text)

In [10]:
#initialize custom dataset class into a dataloader

BATCH_SIZE = 8

train_dataset = SummaryDataset(X_train, y_train, tokenizer)
test_dataset = SummaryDataset(X_test, y_test, tokenizer)

train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size = BATCH_SIZE, shuffle=False)

In [18]:
#load optimizer and loss function
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-4)
criterion = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

#send model to device
model = model.to("mps")

In [19]:
from tqdm import tqdm

def run_epochs(model, optimizer, train_loader):

    # train and evaluate the model
    num_epochs = 3

    for epoch in tqdm(range(num_epochs), desc="Epochs"):

        model.train()
        train_loss = 0

        for batch in train_loader:

            #zero the gradients
            optimizer.zero_grad()

            input_batch, label_batch = batch
            input_batch = input_batch.to("mps")
            label_batch = label_batch.to("mps")

            #forward pass to calculate loss
            outputs = model(input_batch)

            logits = outputs[0]

            loss = criterion(logits.view(-1, logits.size(-1)), label_batch.view(-1))

            #backward pass to calculate gradients
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

            torch.mps.empty_cache()
        
        # calculate average train_loss for each epoch

In [20]:
# run the training loop

run_epochs(model, optimizer, train_loader)

Epochs:   0%|          | 0/3 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1649 > 1024). Running this sequence through the model will result in indexing errors
Epochs: 100%|██████████| 3/3 [10:02<00:00, 200.69s/it]


In [21]:
# save the model

torch.save(model.state_dict(), "gpt2_finetuned.pth")