In [1]:
import pandas as pd 
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc
import joblib

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from torch import nn,optim
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.model_selection import train_test_split

from tqdm.auto import tqdm

In [2]:
class SummaryDataset(Dataset):
    def __init__(
        self,
        data,
        text_max_token_len = 2000,
        summary_max_token_len = 150
    ):
        self.tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
        self.data = data
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]

        text = data_row['text']

        text_encoding = self.tokenizer(
            text,
            max_length=self.text_max_token_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

        summary_encoding = self.tokenizer(
            data_row['summary'],
            max_length=self.summary_max_token_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )
        
        labels = summary_encoding['input_ids']
        labels[labels == self.tokenizer.pad_token_id] = -100
        
        return dict(
            input_ids=text_encoding['input_ids'].flatten(),
            attention_mask=text_encoding['attention_mask'].flatten(),
            labels=labels.flatten(),
            decoder_attention_mask=summary_encoding['attention_mask'].flatten()
        )

In [3]:
class SummaryDataModule(pl.LightningDataModule):
    def __init__(self,train_path,val_path,batch_size=16, text_max_token_len = 2000, summary_max_token_len = 150):
        super().__init__()
        self.train_path,self.val_path= train_path,val_path
        self.batch_size = batch_size
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len
    
    def setup(self,stage=None):
        train = pd.read_csv(self.train_path)
        val = pd.read_csv(self.val_path)
        self.train_dataset = SummaryDataset(data=train,
                                            text_max_token_len=self.text_max_token_len,
                                            summary_max_token_len=self.summary_max_token_len)
        self.val_dataset = SummaryDataset(data=val,
                                          text_max_token_len=self.text_max_token_len,
                                          summary_max_token_len=self.summary_max_token_len)
    
    def train_dataloader(self):
        return DataLoader(self.train_dataset, shuffle=True, num_workers=4)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, shuffle=False, num_workers=4)


In [4]:
class T5SumModel(pl.LightningModule):
    def __init__(self, lr=0.0001):
        super().__init__()
        self.lr = lr
        self.model = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
    
    def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask
        )
        return output.loss, output.logits
    
    def configure_optimizers(self):
        return optim.AdamW(self.parameters(), lr=self.lr)
    
    def training_step(self, batch, batch_size):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        decoder_attention_mask = batch['decoder_attention_mask']

        loss, outputs = self(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels
        )

        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def validation_step(self, batch, batch_size):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        decoder_attention_mask = batch['decoder_attention_mask']

        loss, outputs = self(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels
        )

        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss

In [5]:
# Load checkpoint
trained_model = T5SumModel.load_from_checkpoint(
'/kaggle/input/fork-of-t5-fine-tuned-load/lightning_logs/version_0/checkpoints/epoch=2-step=77937.ckpt')

  return self.fget.__get__(instance, owner)()


In [6]:
tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
model = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")

#trained_model.freeze()

def summarizeText(text, mymodel):
    text_encoding = tokenizer(
        text,
        max_length=1000,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )
    generated_ids = mymodel.generate(
        input_ids=text_encoding['input_ids'],
        attention_mask=text_encoding['attention_mask'],
        max_length=150,
        num_beams=2,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True
    )

    preds = [
            tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for gen_id in generated_ids
    ]
    return "".join(preds)

tokenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
#train=pd.read_csv('train.csv')
val=pd.read_csv('/kaggle/input/fine-tuning-t5-arabic-summarization-2/val.csv')

In [13]:
import torch

# Check if CUDA is available and set the device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to the appropriate device
model.to(device)

def summarizeText(text, mymodel):
    # Move input tensors to the same device as the model
    text_encoding = tokenizer(
        text,
        max_length=1000,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    ).to(device)

    generated_ids = mymodel.generate(
        input_ids=text_encoding['input_ids'],
        attention_mask=text_encoding['attention_mask'],
        max_length=150,
        num_beams=2,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True
    )

    preds = [
            tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for gen_id in generated_ids
    ]
    return "".join(preds)

test = val.sample(10)
for i in range(len(test)):
    sample_row = test.iloc[i]
    text = sample_row['text']
    print('Text : ', text)
    print('\n')
    print('Summary (Ground truth) : ',sample_row['summary'])
    print('\n')
    summary_without_tuning = summarizeText(text, model)
    print("Summary before tuning", summary_without_tuning)
    print('\n')
    summary_with_tuning = summarizeText(text, trained_model.model)  # Use the same model for tuning
    print("Summary after tuning", summary_with_tuning)
    print('\n')
    print('-------------------------------------------------------------------------------------------------')


Text :  لسان الحمل يأتي من نبات له أوراق مسطحة وطويلة ونحيلة وتنمو في كل مكان تقريبا بما في ذلك معظم حدائق المنازل. هناك عدة أنواع منها ولكن جميعها لها عروق عمودية في أوراقها.بعض الناس يعتقدون أنها أعشاب ضارة ولكن لها العديد من الفوائد الطبية. يمكن استخدام النبات على الجرح مباشرة لحمايته وتعزيز عملية الالتئام.ولأوراقها أيضا خواص المضادات الحيوية.  لا يدرك الكثير من الناس أن النبات الموجود في حديقتهم هو لسان الحمل ويرجع ذلك في الأساس إلى وجود طعام آخر يسمى "موز الجنة" وهو فاكهة تشبه الموز، ويحمل كلاهما نفس الاسم العلمي (Plantain). أوراق لسان الحمل هي أعشاب وتختلف عن نبات موز الجنة الذي هو فاكهة.  إن لم تجد عشبة لسان الحمل في مكان حولك فيمكنك طلب أوراق النبات المجففة والأعشاب من بائعي الأعشاب الطبيعية. يمكنك أيضا شراء المراهم والدهانات المصنوعة من لسان الحمل. اقطف عشرة أوراق من لسان الحمل الطازجة. اغل الأوراق بتمهل في 2.5 سم من الماء تقريبا حتى تصبح لينة. خذ الأوراق وقم بهرسها باستخدام ظهر ملعقة ثم اترك العجينة لتبرد.  بعد أن يبرد الخليط أضف بضع قطرات من أي زيت من اختيارك. من الخيارات ال

In [9]:
test = val.sample(100)
sum_results=[[]]
for i in range(len(test)):
    sample_row = test.iloc[i]
    text = sample_row['text']
    summary_with_tuning = summarizeText(text, trained_model.model) 
    sum_results.append(summary_with_tuning)
    

In [12]:
from rouge_score import rouge_scorer

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Initialize lists to store individual ROUGE scores
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

# Iterate through test samples and corresponding summaries
for i in range(len(test)):
    sample_row = test.iloc[i]
    text = sample_row['text']
    summary_with_tuning = summarizeText(text, trained_model.model)
    
    summary = sample_row['summary']

    # Compute ROUGE scores for each sample
    scores = scorer.score(summary, summary_with_tuning)
    
    # Store individual ROUGE scores
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rouge2_scores.append(scores['rouge2'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

# Compute average ROUGE scores
avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
avg_rouge2 = sum(rouge2_scores) / len(rouge2_scores)
avg_rougeL = sum(rougeL_scores) / len(rougeL_scores)

print("Average ROUGE-1 F1 Score:", avg_rouge1)
print("Average ROUGE-2 F1 Score:", avg_rouge2)
print("Average ROUGE-L F1 Score:", avg_rougeL)


Average ROUGE-1 F1 Score: 0.027266899766899767
Average ROUGE-2 F1 Score: 0.004318181818181817
Average ROUGE-L F1 Score: 0.027266899766899767
