In [1]:

!pip install datasets
!pip install wandb
!pip install evaluate
!pip install rouge_score

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
import pandas as pd
import numpy as np
import nltk
import transformers
from datasets import Dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
from transformers import TrainingArguments, Trainer
import wandb

## Data pre-processing

# Loading Datasets

In [3]:
# Load the datasets
df = load_from_disk("/content/drive/MyDrive/datasets_finetuning_BART/training_dataset_splits")

df


DatasetDict({
    train: Dataset({
        features: ['clean_review_text', 'clean_summary', 'id', '__index_level_0__'],
        num_rows: 7738
    })
    validation: Dataset({
        features: ['clean_review_text', 'clean_summary', 'id', '__index_level_0__'],
        num_rows: 773
    })
    test: Dataset({
        features: ['clean_review_text', 'clean_summary', 'id', '__index_level_0__'],
        num_rows: 1159
    })
})

In [4]:
df['train'][1]['clean_review_text']

'im heaven thyroid problem affecting skin nothing mean nothing would work matter topical rx doctors gave horrible dry eczema like patches skin well josie maran thank bottom heart product saved skin healing healing husband family coworkers complimented skin askedif ive surgery kind andor lost weight nothing else prove stuff works fantastic skin nice compliments like thatyou gave back confidence drops morning night skin neck lovely line free im glowing patches hypothyroidism disappearing healthy ps always wear argan oil night use products extra benefits needs'

In [5]:
df['train'][1]['clean_summary']

'excellent must'

In [6]:
# Load model directly

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-base")

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [7]:
#  preprocessing function
def preprocess_function(batch):
    model_inputs = tokenizer(batch['clean_review_text'], padding="max_length", max_length=1024, truncation=True
    )
    labels = tokenizer(text_target=batch["clean_summary"],  padding="max_length", max_length=128, truncation=True
    )
    # Replace pad token IDs in the labels with -100 to ignore them in loss computation
    labels["input_ids"] = [[(label if label != tokenizer.pad_token_id else -100) for label in label_seq] for label_seq in labels["input_ids"]]
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [8]:
# tokenize dataset
tokenized_dataset = df.map(preprocess_function, batched=True)

In [9]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['clean_review_text', 'clean_summary', 'id', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 7738
    })
    validation: Dataset({
        features: ['clean_review_text', 'clean_summary', 'id', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 773
    })
    test: Dataset({
        features: ['clean_review_text', 'clean_summary', 'id', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1159
    })
})

In [10]:
# Remove unused columns from the tokenized datasets
tokenized_dataset = tokenized_dataset.remove_columns(["__index_level_0__", "clean_review_text", "clean_summary"])

In [11]:
# Remove unused columns from the tokenized datasets
tokenized_dataset = tokenized_dataset.remove_columns(["id"])

# Fine-tuning

## Training

In [12]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/datasets_finetuning_BART/training",
    per_device_train_batch_size=4,
    num_train_epochs=2,
    remove_unused_columns=False
)


In [13]:
# Create Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer

)

# Train the model
trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
500,3.5328
1000,3.2288
1500,2.9959
2000,2.9331
2500,2.4433
3000,2.4956
3500,2.4347




TrainOutput(global_step=3870, training_loss=2.8158371089964875, metrics={'train_runtime': 644.7155, 'train_samples_per_second': 24.004, 'train_steps_per_second': 6.003, 'total_flos': 9436281427722240.0, 'train_loss': 2.8158371089964875, 'epoch': 2.0})

# Evaluate the model

In [14]:
# Evaluate the model
eval_results = trainer.evaluate()

eval_results

{'eval_loss': 2.6335978507995605,
 'eval_runtime': 12.7433,
 'eval_samples_per_second': 90.95,
 'eval_steps_per_second': 11.378,
 'epoch': 2.0}

## Save the model

In [20]:
import torch

# Save the model's state_dict
torch.save(model.state_dict(), '/content/drive/MyDrive/datasets_finetuning_BART/summarization_model/fine_tuned_model.pth')

# Save the tokenizer
tokenizer.save_pretrained('/content/drive/MyDrive/datasets_finetuning_BART/summarization_model')



('/content/drive/MyDrive/datasets_finetuning_BART/summarization_model/tokenizer_config.json',
 '/content/drive/MyDrive/datasets_finetuning_BART/summarization_model/special_tokens_map.json',
 '/content/drive/MyDrive/datasets_finetuning_BART/summarization_model/vocab.json',
 '/content/drive/MyDrive/datasets_finetuning_BART/summarization_model/merges.txt',
 '/content/drive/MyDrive/datasets_finetuning_BART/summarization_model/added_tokens.json',
 '/content/drive/MyDrive/datasets_finetuning_BART/summarization_model/tokenizer.json')

In [23]:
# Define the epoch value before saving the model
epoch = 1
torch.save(model.state_dict(), f'/content/drive/MyDrive/datasets_finetuning_BART/summarization_model/pytorch_model_{epoch}.bin')


In [25]:
#saving the model
model.save_pretrained('/content/drive/MyDrive/datasets_finetuning_BART/summarization_model')


## Testing the model to summarize review

In [26]:
# Load the saved model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('/content/drive/MyDrive/datasets_finetuning_BART/summarization_model')
model = AutoModelForSeq2SeqLM.from_pretrained('/content/drive/MyDrive/datasets_finetuning_BART/summarization_model')

# Function to summarize a review
def summarize(review):
    # Tokenize the input review text
    inputs = tokenizer(review, max_length=1024, truncation=True, return_tensors="pt")

    # Generate the summary
    summary_ids = model.generate(inputs["input_ids"], max_length=40, min_length=10, length_penalty=2.0, num_beams=8, early_stopping=True)

    # Decode the summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


In [27]:
# Example of a review
review = """
Oh wow, I love my new phone, the iphone 13. It takes clear pictures, it is lightweight, and I enjoy it. Though it's battery doesn't last as
long as my previous phone Redmi Note 11. Overall, I love it!.
"""

# Get the summary
summary = summarize(review)
print("Summary:", summary)

Summary: great new phone iphone stylus


In [28]:
# Load the validation dataset
dataset = load_from_disk("/content/drive/MyDrive/datasets_BART_project/validation_dataset")

In [29]:
from tqdm import tqdm


# Generate and compare summaries
generated_summaries = []
ground_truth_summaries = []


for sample in tqdm(dataset):
    # Summarize the 'clean_review_text'
    generated_summary = summarize(sample["clean_review_text"])
    generated_summaries.append(generated_summary)

    # Append the 'clean_summary' as ground truth
    ground_truth_summaries.append(sample["clean_summary"])

100%|██████████| 233/233 [01:24<00:00,  2.75it/s]


In [30]:
# compare the summaries
for i in range(10):
    print(f"Review: {dataset[i]['clean_review_text']}")
    print(f"Generated Summary: {generated_summaries[i]}")
    print(f"Ground Truth Summary: {ground_truth_summaries[i]}")
    print("------------------------------------------------------------------------------" )

Review: product great son really dry skin hands always really dry product amazing fragance
Generated Summary: amazing product dry skin great dry skin
Ground Truth Summary: great
------------------------------------------------------------------------------
Review: believe fantastic product affordable price ill save lengthy review say buy
Generated Summary: fantastic product affordable price great price
Ground Truth Summary: get
------------------------------------------------------------------------------
Review: usually wouldve given product stars love tatcha products work wonders oily acneprone sensitive skin cream used keep oil bay never ever broke ive using water cream years ran last bottle repurchased usual days using new bottle started breaking painful cystic acne face oily broken places never breaks much pain refused believe cream hadnt introduced new products routine decided stop using days see breakouts subsided low behold face clearing im oily using different moisturizer im h

## Evaluating the summaries

In [31]:
# Loading the ROGUE metric

rouge_score = evaluate.load("rouge")
scores = rouge_score.compute(
    predictions=[generated_summary], references=[ground_truth_summaries]
)
scores

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': 0.4, 'rouge2': 0.25, 'rougeL': 0.4, 'rougeLsum': 0.4}