*To clear the graphics memory*

In [None]:
# Install numba so that its CUDA bindings can be imported below.
!pip install numba

from numba import cuda
# Fetch the current CUDA device and reset it; per the section heading this is
# meant to clear GPU memory left over from earlier work in the session.
device = cuda.get_current_device()
device.reset()

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; every dataset and model path used
# later in this notebook lives under that mount point.
drive.mount('/content/drive')

In [None]:
# Install the libraries named on the command line (transformers, torch, datasets).
!pip install transformers torch datasets

In [None]:
# Upgrade transformers together with its "torch" extra (-U upgrades an existing install).
!pip install transformers[torch] -U

In [44]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import torch
from torch.utils.data import Dataset, random_split, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel

*Cleaning the dataset*

In [None]:
# Load the raw reviews CSV from the mounted Drive folder.
df = pd.read_csv('/content/drive/MyDrive/IR_assignment4/Reviews.csv')

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# Discard duplicate rows first, then every row that still has a missing value.
df = df.drop_duplicates().dropna()

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

In [None]:
# # Define function for text cleaning and preprocessing
def _text_resources():
    """Return the shared (stop-word set, lemmatizer) pair, building it on first use.

    Building these once instead of once per row avoids re-reading the stop-word
    list and re-creating the lemmatizer for every review in the DataFrame.
    """
    if not hasattr(_text_resources, 'cached'):
        _text_resources.cached = (
            set(stopwords.words('english')),
            WordNetLemmatizer(),
        )
    return _text_resources.cached


def preprocess_text(text):
    """Normalise a raw review string into space-separated, lemmatized tokens.

    Steps, in order: replace HTML tags with a space, delete every character
    that is not an ASCII letter or whitespace, lowercase, tokenize with
    ``word_tokenize``, drop English stop words, lemmatize each remaining token.

    Args:
        text: Raw text. Any non-string value (e.g. ``None`` or a float NaN)
            is treated as missing.

    Returns:
        The cleaned text as a single string; ``''`` for missing input.

    Note:
        ``word_tokenize``, ``stopwords`` and ``WordNetLemmatizer`` rely on NLTK
        data packages; no ``nltk.download`` call is visible in this notebook,
        so they are presumably already installed in the runtime (TODO confirm).
    """
    if not isinstance(text, str):
        return ''

    # Replace tags with a space (not the empty string) so that markup between
    # words, e.g. "good<br />product", does not fuse them into "goodproduct".
    text = re.sub(r'<.*?>', ' ', text)
    # Remove special characters, digits and punctuation.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()

    tokens = word_tokenize(text)
    stop_words, lemmatizer = _text_resources()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Clean both free-text columns, storing each result in a new "Clean_*" column.
for source_col, cleaned_col in (('Text', 'Clean_Text'), ('Summary', 'Clean_Summary')):
    df[cleaned_col] = df[source_col].apply(preprocess_text)

# Show the first few cleaned rows.
cleaned_preview = df[['Clean_Text', 'Clean_Summary']].head()
print(cleaned_preview)

In [None]:
# Inspect the frame now that the Clean_Text / Clean_Summary columns exist.
df.head()

In [None]:
# Checkpoint the cleaned frame to Drive (index=False: do not write the row index as a column).
df.to_csv('/content/drive/MyDrive/IR_assignment4/Clean_reviews.csv', index = False)

In [None]:
# Preview the frame that was just written to disk.
df.head()

In [None]:
# Reload the checkpoint so the following cells can run without repeating the cleaning.
# NOTE(review): cleaned strings that ended up empty presumably come back as NaN
# after the CSV round-trip — confirm before relying on these columns being str.
df = pd.read_csv('/content/drive/MyDrive/IR_assignment4/Clean_reviews.csv')

In [None]:
# The raw columns are no longer needed once their cleaned counterparts exist.
df = df.drop(columns=['Text', 'Summary'])

In [None]:
# Second checkpoint: the frame without the raw Text / Summary columns.
df.to_csv('/content/drive/MyDrive/IR_assignment4/Clean_reviews1.csv', index = False)

In [None]:
# Reload the second checkpoint.
df = pd.read_csv('/content/drive/MyDrive/IR_assignment4/Clean_reviews1.csv')

In [None]:
# Drop the review metadata columns; the dataset class below only reads
# Clean_Text and Clean_Summary.
df = df.drop(columns=['ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time'])

In [None]:
# Final checkpoint used for training. index=False matches the two earlier
# checkpoints and keeps the row index from being written as an extra
# "Unnamed: 0" column that would reappear when the file is read back.
df.to_csv('/content/drive/MyDrive/IR_assignment4/Clean_reviews2.csv', index=False)

*Initialize the GPT2 model and tokenizer*

In [45]:
# Load the pretrained "gpt2" tokenizer and the GPT-2 model with a language-modelling head.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token as the padding token so padded batches can be built.
tokenizer.pad_token = tokenizer.eos_token
# Size the embedding matrix to the tokenizer's vocabulary; as the last
# expression of the cell, the returned embedding module is displayed as output.
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 768)

*Create a custom dataset class that tokenizes each review and its summary*

In [46]:
from torch.utils.data import Dataset, DataLoader, random_split
import torch
import pandas as pd

class ReviewsDataset(Dataset):
    """Causal-LM fine-tuning dataset built from cleaned review/summary pairs.

    Each item is the single token sequence
    ``<review tokens> <separator tokens> <summary tokens> <eos>`` padded to
    ``max_length``. A causal language model shifts ``labels`` against
    ``input_ids`` internally, so the labels must be position-aligned with the
    inputs; tokenizing the summary on its own (as was done previously) pairs
    review position *i* with an unrelated summary token and trains on padding.
    Here the labels copy ``input_ids`` on the summary span and are ``-100``
    (ignored by the loss) on the review prompt and on padding.

    Args:
        dataframe: Frame with ``Clean_Text`` and ``Clean_Summary`` columns.
        tokenizer: Tokenizer exposing ``encode``, ``eos_token_id`` and
            ``pad_token_id``.
        max_length: Fixed length of every returned tensor.
        special_token: Placeholder string substituted for a missing (NaN)
            review or summary.
        separator: Text inserted between review and summary. The default
            matches the ``"\\nTL;DR:\\n"`` prompt suffix used at inference time.
    """

    def __init__(self, dataframe, tokenizer, max_length=512, special_token='[PAD]',
                 separator='\nTL;DR:\n'):
        self.tokenizer = tokenizer
        self.dataframe = dataframe
        self.max_length = max_length
        self.special_token = special_token
        self.separator = separator

    def __len__(self):
        return len(self.dataframe)

    def _encode(self, value):
        """Tokenize ``value`` to a plain list of ids, capped at ``max_length`` tokens."""
        ids = self.tokenizer.encode(str(value), truncation=True, max_length=self.max_length)
        return list(ids)[:self.max_length]

    def __getitem__(self, idx):
        """Return the tensors (plus the raw strings) for row ``idx``.

        Returns:
            dict with ``text`` and ``summary`` (str) and ``input_ids``,
            ``attention_mask``, ``labels`` (1-D long tensors of ``max_length``).
        """
        data_row = self.dataframe.iloc[idx]
        text = data_row['Clean_Text']
        summary = data_row['Clean_Summary']

        # Empty cleaned strings come back from CSV as NaN; guard both columns,
        # not just the summary, so the tokenizer never receives a float.
        if pd.isna(text):
            text = self.special_token
        if pd.isna(summary):
            summary = self.special_token

        ignore_index = -100  # label value skipped by the cross-entropy loss
        eos_id = self.tokenizer.eos_token_id
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = eos_id

        sep_ids = self._encode(self.separator)

        # Reserve room for the separator and the closing EOS first, then give
        # whatever is left to the review so the summary is never truncated away.
        summary_budget = max(self.max_length - len(sep_ids) - 1, 0)
        summary_ids = self._encode(summary)[:summary_budget] + [eos_id]
        text_budget = max(self.max_length - len(sep_ids) - len(summary_ids), 0)
        text_ids = self._encode(text)[:text_budget]

        prompt_ids = text_ids + sep_ids
        input_ids = (prompt_ids + summary_ids)[:self.max_length]
        labels = ([ignore_index] * len(prompt_ids) + summary_ids)[:self.max_length]

        pad_count = self.max_length - len(input_ids)
        attention_mask = [1] * len(input_ids) + [0] * pad_count
        input_ids = input_ids + [pad_id] * pad_count
        labels = labels + [ignore_index] * pad_count

        return {
            'text': text,
            'summary': summary,
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long),
        }


In [47]:
# Load the final cleaned checkpoint and keep only the first 10,000 rows.
df = pd.read_csv('/content/drive/MyDrive/IR_assignment4/Clean_reviews2.csv')
df = df[:10000]

# 75% / 25% train/test split. random_split only needs len(df) and returns
# Subset objects whose .indices are row positions, which are then used with
# .iloc below. A seeded generator makes the partition identical on every run
# instead of reshuffling each time the cell is executed.
train_size = int(0.75 * len(df))
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(
    df,
    [train_size, len(df) - train_size],
    generator=split_generator,
)

train_data = ReviewsDataset(df.iloc[train_dataset.indices], tokenizer, special_token='[PAD]')
test_data = ReviewsDataset(df.iloc[test_dataset.indices], tokenizer, special_token='[PAD]')

*Model training and hyperparameter configuration*

In [None]:
# NOTE(review): GPT2Model is imported here but not referenced anywhere in this cell.
from transformers import GPT2Tokenizer, GPT2Model


# Define hyperparameters
learning_rate = 5e-4
batch_size = 8
num_epochs = 5
weight_decay = 0.01
warmup_steps = 500
# NOTE(review): the three values below are defined but never passed to
# TrainingArguments or Trainer in this cell, so they do not influence training.
optimizer = "adamw"  # AdamW optimizer
dropout_rate = 0.1
max_seq_length = 512


from transformers import Trainer, TrainingArguments

# Training configuration: checkpoints go to ./results1 and logs to ./logs1;
# evaluation and checkpoint saving both happen once per epoch, and metrics
# are logged every 10 steps (including the first step).
training_args = TrainingArguments(
    output_dir='./results1',
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    logging_dir='./logs1',
    logging_steps=10,
    learning_rate=learning_rate,
    logging_first_step=True,
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# Fine-tune the `model` created earlier on train_data, evaluating on test_data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,

)

# Start training
trainer.train()
# Persist the fine-tuned weights and the tokenizer side by side in the same Drive folder.
trainer.save_model('/content/drive/MyDrive/IR_assignment4/savedModel')
tokenizer.save_pretrained('/content/drive/MyDrive/IR_assignment4/savedModel')

*Loading the model*

In [None]:
from transformers import pipeline, GPT2Tokenizer, GPT2LMHeadModel

# Load the fine-tuned tokenizer and model. The training cell writes both to
# ".../savedModel"; the path used here before (".../saved_model") is never
# written by this notebook, so loading from it fails on a fresh run.
tokenizer = GPT2Tokenizer.from_pretrained("/content/drive/MyDrive/IR_assignment4/savedModel")
model = GPT2LMHeadModel.from_pretrained("/content/drive/MyDrive/IR_assignment4/savedModel")


*Text summarisation using the fine-tuned model*

In [None]:

# Wrap the loaded model and tokenizer in a text-generation pipeline.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)



In [None]:
# Prompt = review text (the triple-quoted string, left empty here as a
# placeholder to paste a review into) followed by the "TL;DR:" cue.
input_text = """""" + "\nTL;DR:\n"

# Generate a continuation of the prompt. max_length=512 caps the sequence
# length (TODO confirm whether the prompt tokens count towards this limit).
pipe_out = pipe(input_text, max_length=512, clean_up_tokenization_spaces=True)

In [None]:

# 'generated_text' holds the prompt followed by the model's continuation.
# Keep only the continuation so the "summary" does not contain the review
# and the TL;DR cue; fall back to the full text if the prompt prefix is absent.
full_generated_text = pipe_out[0]['generated_text']
if full_generated_text.startswith(input_text):
    generated_summary = full_generated_text[len(input_text):].strip()
else:
    generated_summary = full_generated_text


In [None]:
# Display the generated summary as the cell output.
generated_summary

In [None]:
# Install the package that provides the rouge_score module imported in the next cell.
!pip install rouge-score

In [31]:
from rouge_score import rouge_scorer

def Rouge_scores(given, generated):
    """Score a generated summary against a reference with ROUGE-1, ROUGE-2 and ROUGE-L.

    Args:
        given: Reference summary text.
        generated: Candidate summary text to evaluate.

    Returns:
        dict keyed by "ROUGE-1", "ROUGE-2" and "ROUGE-L"; each value is a dict
        with 'Precision', 'Recall' and 'F1-Score', rounded to three decimals.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    raw_scores = rouge_scorer.RougeScorer(metric_names, use_stemmer=True).score(given, generated)

    # "rouge1" -> "ROUGE1" -> "ROUGE-1": upper-case, then insert the hyphen.
    return {
        metric.upper().replace("ROUGE", "ROUGE-"): {
            'Precision': round(result.precision, 3),
            'Recall': round(result.recall, 3),
            'F1-Score': round(result.fmeasure, 3),
        }
        for metric, result in raw_scores.items()
    }





In [32]:
# Hard-coded reference/candidate pair used to exercise Rouge_scores.
# NOTE(review): this assignment replaces whatever `generated_summary` held
# from the generation cells above with a fixed example string.
given_summary = '''The Fender CD-60S Dreadnought Acoustic Guitar is a
great instrument for beginners. It has a solid construction, produces a rich sound,
and feels comfortable to play. However, some users have reported issues with the
tuning stability.'''
generated_summary = 'The Fender CD-60S Acoustic Guitar is suitable for beginners, but there are reported tuning stability issues.'


In [33]:
# Score the candidate summary against the reference (reference first, candidate second).
rouge_results = Rouge_scores(given_summary, generated_summary)



In [None]:
# Print one line per ROUGE variant with its precision, recall and F1.
for score_type, values in rouge_results.items():
    precision, recall, f1_score = values['Precision'], values['Recall'], values['F1-Score']
    print(f"{score_type}: Precision: {precision}, Recall: {recall}, F1-Score: {f1_score}")