In [1]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

Loading the Dataset

In [None]:
reviews_df = pd.read_csv('/content/drive/MyDrive/code/IR ass 4/Reviews.csv')

Taking a smaller Dataset

In [None]:
reviews_small_df = reviews_df.head(15000)

Cleaning and PreProcessing the Text and Summary columns

In [2]:
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Functions to clean and preprocess the text

In [None]:
def preprocess_text(text):
    # Converting to lowercase
    text = text.lower()
    # Removing special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Removing HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    # Removing accented characters
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # Performing Tokenization
    tokens = word_tokenize(text)
    # Removing stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Performing Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    # Joining tokens back into text
    cleaned_text = ' '.join(lemmatized_text)
    return cleaned_text

In [None]:
reviews_small_df['Text'] = reviews_small_df['Text'].fillna("")
reviews_small_df['Summary'] = reviews_small_df['Summary'].fillna("")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_small_df['Text'] = reviews_small_df['Text'].fillna("")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_small_df['Summary'] = reviews_small_df['Summary'].fillna("")


In [None]:
reviews_small_df['Text'] = reviews_small_df['Text'].apply(preprocess_text)
reviews_small_df['Summary'] = reviews_small_df['Summary'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_small_df['Text'] = reviews_small_df['Text'].apply(preprocess_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_small_df['Summary'] = reviews_small_df['Summary'].apply(preprocess_text)


In [None]:
print(reviews_small_df['Text'].nunique)

<bound method IndexOpsMixin.nunique of 0        bought several vitality canned dog food produc...
1        product arrived labeled jumbo salted peanutsth...
2        confection around century light pillowy citrus...
3        looking secret ingredient robitussin believe f...
4        great taffy great price wide assortment yummy ...
                               ...                        
14995    could longer find rice select whole wheat orzo...
14996    discovered local qfc grocery store place ive e...
14997    hard find whole wheat orzo supermarket happy f...
14998    product delicious healthier regular orzo espec...
14999    love whole wheat pasta husband diagnosis diabe...
Name: Text, Length: 15000, dtype: object>


**MODEL TRAINING**

Initializing a GPT-2 tokenizer and model from Hugging Face

In [4]:
!pip install transformers



In [9]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [29]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2',padding_side='left')
model = GPT2LMHeadModel.from_pretrained('gpt2')

Dividing the Dataset into Training and Testing (75:25)

In [10]:
from sklearn.model_selection import train_test_split

In [None]:
# Spliting the dataset into training and testing sets (75:25)
training_data, testing_data = train_test_split(reviews_small_df, test_size=0.25, random_state=42)

Implementing a custom dataset class to prepare the data for training.

In [None]:
from torch.utils.data import Dataset

In [None]:
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]['Text']
        summary = self.data.iloc[index]['Summary']

        # Tokenizing the review and summary
        inputs = self.tokenizer(text, summary, return_tensors="pt", max_length=self.max_length, truncation=True, padding="max_length")

        return {"input_ids": inputs["input_ids"].flatten(), "attention_mask": inputs["attention_mask"].flatten(), "labels": inputs["input_ids"].flatten()}

Fine-tuning the GPT-2 model on the review dataset to generate summaries. Also experimenting with the different hyperparameters.

In [5]:
!pip install transformers[torch] -U

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.30.0-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [None]:
!pip install accelerate -U



In [None]:
out_result="/content/drive/MyDrive/code/IR ass 4/Results"
log_dir="/content/drive/MyDrive/code/IR ass 4/Logs"
save_model="/content/drive/MyDrive/code/IR ass 4/Model"

In [31]:
from transformers import Trainer, TrainingArguments


# Checking if the tokenizer has a padding token; if not, set it to eos_token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Checking again if the tokenizer has a padding token; if still not, add a special token for padding
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [None]:


# Creating instances of dataset and dataloader
training_dataset = CustomDataset(training_data, tokenizer, max_length=128)
testing_dataset = CustomDataset(testing_data, tokenizer, max_length=128)

# Define training arguments
training_args = TrainingArguments(
        output_dir=out_result,
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        save_steps=2000,
        save_total_limit=5,
        logging_steps=2000,
        logging_dir=log_dir,
        learning_rate=5e-5,
        evaluation_strategy="steps",
        eval_steps=2000
)

# Defining Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_dataset,
    eval_dataset=testing_dataset
)

trainer.train()
trainer.save_model(save_model)

Step,Training Loss,Validation Loss
2000,2.0789,1.982101
4000,1.9017,1.955564


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Calculating Rouge Score

In [7]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=88dc50e23768c84ad4c02f021fcd23fd92b28f285666a832e3c848c7358be0d8
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [32]:
# Function to generate summaries using the trained model
def generate_summary(text, model, tokenizer, device):
    inputs = tokenizer(text, return_tensors="pt", max_length=128, truncation=True, padding="max_length")
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)

    # Generate summary
    with torch.no_grad():
        output = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=128, num_beams=4, early_stopping=True)

    # Decode the generated summary
    generated_summary = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_summary

In [33]:
from rouge_score import rouge_scorer
import torch

model_path="/content/drive/MyDrive/code/IR ass 4/Model/"
model_f = GPT2LMHeadModel.from_pretrained(model_path)
# Iterating through the test set and generating summaries, then computing ROUGE scores
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = []
device = "cuda"

input_text = "Tree provides us with oxygen"
actual_summary = "Oxygen gives us life"
predicted_summary = generate_summary(input_text, model_f, tokenizer, device)
print(predicted_summary)
scores = scorer.score(actual_summary, predicted_summary)
rouge_scores.append(scores)
#for i in range(len(testing_data)):
#    input_text = testing_data.iloc[i]['Text']
#    actual_summary = testing_data.iloc[i]['Summary']
#    predicted_summary = generate_summary(input_text, model, tokenizer, device)
#    scores = scorer.score(actual_summary, predicted_summary)
#    rouge_scores.append(scores)

rouge_review = pd.DataFrame(rouge_scores)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Tree provides us with oxygen free energy great way start daygreat energy


In [28]:
rouge_review

Unnamed: 0,rouge1,rouge2,rougeL
0,"(0.4, 0.5, 0.4444444444444445)","(0.0, 0.0, 0.0)","(0.2, 0.25, 0.22222222222222224)"


In [None]:
rouge_review.to_csv("/content/drive/MyDrive/code/IR ass 4/Rouge.csv", index = False)