In [1]:
import os
# Expose only the GPU with index 1 to CUDA. This runs in the first cell,
# before torch is imported, so the setting is in place when CUDA starts up.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

In [1]:
import torch
import pandas as pd
from torchmetrics.text import ROUGEScore
from transformers import BartForConditionalGeneration, BartTokenizerFast
from tqdm.notebook import tqdm

In [2]:
# Load the test split; the evaluation loop reads its `product_info` and `summary` columns.
# NOTE(review): the df.head() output further down shows an "Unnamed: 0" column, which
# suggests test.csv was written with its index -- confirm whether index_col=0 is wanted.
df = pd.read_csv("../../data/labelled/metadata/splits/test.csv")

In [4]:
# Pick the compute backend in priority order: CUDA first, then Apple MPS,
# and fall back to the CPU when neither accelerator is available.
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

device

'cuda'

In [3]:
# Local checkpoint directory to evaluate; both the model and the tokenizer are loaded from it.
model_path = "../../train/finetune/product_descriptions/bart_product_info1"

In [4]:
# Load the BART conditional-generation model from the local checkpoint directory.
model = BartForConditionalGeneration.from_pretrained(model_path)

In [7]:
%%capture
# Move the model to the selected device. %%capture hides the cell output
# (model.to returns the module, whose long repr would otherwise be printed).
model.to(device)

In [5]:
# Load the tokenizer from the same checkpoint directory as the model.
tokenizer = BartTokenizerFast.from_pretrained(model_path)

In [9]:
# ROUGE metric used to score each generated summary against its reference.
# Only the three variants that the evaluation loop reads are requested; the
# default key set also includes "rougeLsum", which is never used here and
# requires sentence splitting via nltk.
rouge = ROUGEScore(rouge_keys=("rouge1", "rouge2", "rougeL"))

In [10]:
# Number of test rows tokenized and passed to model.generate per step of the evaluation loop.
batch_size = 8

In [None]:
# Evaluate on the test split: generate one summary per product description and
# record the per-example ROUGE-1 / ROUGE-2 / ROUGE-L F-measures, in row order.
rouge_scores = {"rouge1": [], "rouge2": [], "rougeL": []}
for idx in tqdm(range(0, len(df), batch_size), desc="Scoring test set"):
    # .iloc makes it explicit that rows are sliced by position, not by label.
    batch = df.iloc[idx:idx + batch_size]
    product_descriptions = batch.product_info.to_list()
    actual_summary = batch.summary.to_list()

    # Keep the full encoding (input_ids and attention_mask) and move it to the device.
    encoded = tokenizer(
        product_descriptions,
        max_length=512,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(device)

    # The batch is padded to its longest sequence, so pass the attention mask
    # explicitly instead of relying on generate() to infer it from pad tokens.
    summary_ids = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        min_length=0,
        max_length=256,
    )
    preds = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

    # Score each prediction on its own so every row gets individual values.
    for pred, target in zip(preds, actual_summary):
        rouge_all = rouge(pred, target)
        for metric_name, values in rouge_scores.items():
            values.append(rouge_all[f"{metric_name}_fmeasure"].item())

# Fail loudly on a partial run: the score lists are later attached to df as
# columns, which requires exactly one score per row.
assert all(len(values) == len(df) for values in rouge_scores.values()), \
    "number of ROUGE scores does not match number of rows in df"


In [12]:
# Attach the per-example scores collected in rouge_scores as columns on df.
for metric_name in ("rouge1", "rouge2", "rougeL"):
    df[metric_name] = rouge_scores[metric_name]

In [13]:
# Persist the test frame together with its ROUGE columns; index=False leaves the
# DataFrame index out of the file.
df.to_csv("test_scores.csv", index=False)

In [14]:
# Reload the saved scores so the cells below work from the file on disk.
df = pd.read_csv("test_scores.csv")

In [15]:
# Preview the first rows, including the per-example ROUGE columns.
df.head()

Unnamed: 0,product_info,summary,rouge1,rouge2,rougeL
0,<b>CONTENTS - In the Friendly Swede Retail pa...,The Friendly Swede Retail packaging includes a...,0.78733,0.666667,0.742081
1,EMPIRE's two piece snap-on cases are made of h...,EMPIRE's two piece snap-on cases provide maxim...,1.0,1.0,1.0
2,This protector case is custom made for your de...,"A custom-made protector case for your device, ...",1.0,1.0,1.0
3,"120 Volt 15 Amp Surge Protected, 19 Inch Rack ...",Surge-protected power distribution unit for 19...,0.327869,0.067797,0.229508
4,"Your digital data, photos, videos, email, docu...",The Maxtor OneTouch III provides an automated ...,0.53211,0.261682,0.385321


In [16]:
# Corpus-level result per metric: the mean of the per-example values stored in df.
scores = {metric: df[metric].mean() for metric in ("rouge1", "rouge2", "rougeL")}

In [17]:
# Display the mean ROUGE-1 / ROUGE-2 / ROUGE-L values.
scores

{'rouge1': 0.7250186611744471,
 'rouge2': 0.5554247786211719,
 'rougeL': 0.6442226495213327}

In [6]:
# Hugging Face Hub repository id that the model was uploaded to.
model_name = "saichandrapandraju/bart-summarization-amazon-product-info"

In [9]:
# Load the model from the Hub repository. This rebinds `model`; the new object
# is not moved to `device` in this cell.
model = BartForConditionalGeneration.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/328 [00:00<?, ?B/s]

In [10]:
# Load the tokenizer from the same Hub repository (rebinds `tokenizer`).
tokenizer = BartTokenizerFast.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]