In [1]:

!pip install datasets
!pip install wandb
!pip install evaluate
!pip install rouge_score



In [2]:
import pandas as pd
import numpy as np
import nltk
import transformers
from datasets import Dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
from transformers import TrainingArguments, Trainer
import wandb

## Data pre-processing

# Loading Datasets

In [3]:
# Load the datasets
df = load_from_disk("/content/drive/MyDrive/datasets_finetuning_BART/training_dataset_splits")

df


DatasetDict({
    train: Dataset({
        features: ['clean_review_text', 'clean_summary', 'id', '__index_level_0__'],
        num_rows: 7738
    })
    validation: Dataset({
        features: ['clean_review_text', 'clean_summary', 'id', '__index_level_0__'],
        num_rows: 773
    })
    test: Dataset({
        features: ['clean_review_text', 'clean_summary', 'id', '__index_level_0__'],
        num_rows: 1159
    })
})

In [4]:
df['train'][1]['clean_review_text']

'im heaven thyroid problem affecting skin nothing mean nothing would work matter topical rx doctors gave horrible dry eczema like patches skin well josie maran thank bottom heart product saved skin healing healing husband family coworkers complimented skin askedif ive surgery kind andor lost weight nothing else prove stuff works fantastic skin nice compliments like thatyou gave back confidence drops morning night skin neck lovely line free im glowing patches hypothyroidism disappearing healthy ps always wear argan oil night use products extra benefits needs'

In [5]:
df['train'][1]['clean_summary']

'excellent must'

In [6]:
# Load model directly

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-base")

In [7]:
#  preprocessing function
def preprocess_function(batch):
    model_inputs = tokenizer(batch['clean_review_text'], padding="max_length", max_length=1024, truncation=True
    )
    labels = tokenizer(text_target=batch["clean_summary"],  padding="max_length", max_length=128, truncation=True
    )
    # Replace pad token IDs in the labels with -100 to ignore them in loss computation
    labels["input_ids"] = [[(label if label != tokenizer.pad_token_id else -100) for label in label_seq] for label_seq in labels["input_ids"]]
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [8]:
# tokenize dataset
tokenized_dataset = df.map(preprocess_function, batched=True)

Map:   0%|          | 0/773 [00:00<?, ? examples/s]

In [15]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7738
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 773
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1159
    })
})

In [10]:
# Remove unused columns from the tokenized datasets
tokenized_dataset = tokenized_dataset.remove_columns(["__index_level_0__", "clean_review_text", "clean_summary"])

In [14]:
# Remove unused columns from the tokenized datasets
tokenized_dataset = tokenized_dataset.remove_columns(["id"])

# Fine-tuning

## Training

In [12]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/datasets_finetuning_BART/training",
    per_device_train_batch_size=4,
    num_train_epochs=2,
    remove_unused_columns=False
)


In [16]:
# Create Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer

)

# Train the model
trainer.train()

  trainer = Trainer(


Step,Training Loss
500,3.5335
1000,3.1706
1500,2.9443
2000,2.9304
2500,2.4383
3000,2.4916
3500,2.4215




TrainOutput(global_step=3870, training_loss=2.7982961570877745, metrics={'train_runtime': 622.1701, 'train_samples_per_second': 24.874, 'train_steps_per_second': 6.22, 'total_flos': 9436281427722240.0, 'train_loss': 2.7982961570877745, 'epoch': 2.0})

# Evaluate the model

In [17]:
# Evaluate the model
eval_results = trainer.evaluate()

eval_results

{'eval_loss': 2.633557081222534,
 'eval_runtime': 12.6783,
 'eval_samples_per_second': 91.416,
 'eval_steps_per_second': 11.437,
 'epoch': 2.0}

## Save the model

In [18]:
# Save the model and tokenizer after training
model.save_pretrained("/content/drive/MyDrive/datasets_finetuning_BART/summarization_model")
tokenizer.save_pretrained("/content/drive/MyDrive/datasets_finetuning_BART/summarization_model")

('/content/drive/MyDrive/datasets_finetuning_BART/summarization_model/tokenizer_config.json',
 '/content/drive/MyDrive/datasets_finetuning_BART/summarization_model/special_tokens_map.json',
 '/content/drive/MyDrive/datasets_finetuning_BART/summarization_model/vocab.json',
 '/content/drive/MyDrive/datasets_finetuning_BART/summarization_model/merges.txt',
 '/content/drive/MyDrive/datasets_finetuning_BART/summarization_model/added_tokens.json',
 '/content/drive/MyDrive/datasets_finetuning_BART/summarization_model/tokenizer.json')

## Testing the model to summarize review

In [30]:
# Load the trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/datasets_finetuning_BART/summarization_model")
model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/datasets_finetuning_BART/summarization_model")

# Function to summarize a review
def summarize(review):
    # Tokenize the input review text
    inputs = tokenizer(review, max_length=1024, truncation=True, return_tensors="pt")

    # Generate the summary
    summary_ids = model.generate(inputs["input_ids"], max_length=40, min_length=10, length_penalty=2.0, num_beams=8, early_stopping=True)

    # Decode the summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


In [31]:
# Example of a review
review = """
Oh wow, I love my new phone, the iphone 13. It takes clear pictures, it is lightweight, and I enjoy it. Though it's battery doesn't last as
long as my previous phone Redmi Note 11. Overall, I love it!.
"""

# Get the summary
summary = summarize(review)
print("Summary:", summary)

Summary: great new phone iphone stylus


In [32]:
# Load the validation dataset
dataset = load_from_disk("/content/drive/MyDrive/datasets_BART_project/validation_dataset")

In [33]:
from tqdm import tqdm


# Generate and compare summaries
generated_summaries = []
ground_truth_summaries = []


for sample in tqdm(dataset):
    # Summarize the 'clean_review_text'
    generated_summary = summarize(sample["clean_review_text"])
    generated_summaries.append(generated_summary)

    # Append the 'clean_summary' as ground truth
    ground_truth_summaries.append(sample["clean_summary"])

100%|██████████| 233/233 [01:25<00:00,  2.73it/s]


In [34]:
# compare the summaries
for i in range(10):
    print(f"Review: {dataset[i]['clean_review_text']}")
    print(f"Generated Summary: {generated_summaries[i]}")
    print(f"Ground Truth Summary: {ground_truth_summaries[i]}")
    print("---- * 50")

Review: product great son really dry skin hands always really dry product amazing fragance
Generated Summary: amazing product dry skin great dry skin
Ground Truth Summary: great
------------------------------------------------------------------------------
Review: believe fantastic product affordable price ill save lengthy review say buy
Generated Summary: fantastic product affordable price review say buy
Ground Truth Summary: get
------------------------------------------------------------------------------
Review: usually wouldve given product stars love tatcha products work wonders oily acneprone sensitive skin cream used keep oil bay never ever broke ive using water cream years ran last bottle repurchased usual days using new bottle started breaking painful cystic acne face oily broken places never breaks much pain refused believe cream hadnt introduced new products routine decided stop using days see breakouts subsided low behold face clearing im oily using different moisturizer i

## Evaluating the summaries

In [37]:
# Loading the ROGUE metric

rouge_score = evaluate.load("rouge")
scores = rouge_score.compute(
    predictions=[generated_summary], references=[ground_truth_summaries]
)
scores

{'rouge1': 0.4, 'rouge2': 0.25, 'rougeL': 0.4, 'rougeLsum': 0.4}