### ***Import Libraries***

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import AdamW, get_scheduler
from datasets import load_metric

import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

from tqdm.auto import tqdm

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# %matplotlib inline
# %config InlineBackend.figure_format='retina'
# sns.set(style='whitegrid', palette='muted', font_scale=1.2)
# rcParams['figure.figsize'] = 16, 10

### ***Load Model***

In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Load the pretrained mBART-50 weights and the matching fast tokenizer.
# Source and target language codes are both Arabic ("ar_AR").
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")
tokenizer = MBart50TokenizerFast.from_pretrained(
    "facebook/mbart-large-50",
    src_lang="ar_AR",
    tgt_lang="ar_AR",
)

### ***Load Dataset***

In [None]:
# Read the summarisation corpus from the Kaggle input directory.
data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/arabicsummarization/summarizdataset.csv"
)

# Preview the first rows (rendered as the cell output).
data.head()

In [None]:
# The 'type' column is not used further down; remove it in place.
data.drop(columns='type', inplace=True)

In [None]:
# The 'text' column is not used further down ('Processed Text' is); remove it in place.
data.drop(columns='text', inplace=True)

In [None]:
data

### ***Data Preprocessing***

In [None]:
# Data Preprocessing
# Step 1: collect fully duplicated rows and drop them when any exist.
dupl = data.loc[data.duplicated()]
if len(dupl):
    # Report how many rows are being removed, then drop the repeats.
    print(len(dupl))
    data = data.drop_duplicates()
    
# !pip install nltk
import re
import nltk
### from nltk.stem.isri import ISRIStemmer
### st = ISRIStemmer()

# Data cleaning
def clean_text(text):
    """Strip unwanted characters from Arabic text and normalise its spacing.

    Keeps Arabic letters (U+0621-U+064A), Arabic-Indic digits (U+0660-U+0669),
    ASCII digits, whitespace and the punctuation marks ``( ) : . ، ؛``; every
    other character is deleted.  Runs of two or more dots are then removed and
    each whitespace run is collapsed to a single space.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string (leading/trailing spaces are not stripped).
    """
    # Delete everything outside the allow-list.  Digits are kept on purpose:
    # both digit ranges are part of the allowed set.
    text = re.sub(r'[^\u0621-\u064A\u0660-\u06690-9\s():.،؛]+', '', text)

    # Remove ellipsis-style dot runs *before* collapsing whitespace; doing it
    # afterwards leaves a double space where "word ... word" used to be.
    text = re.sub(r'\.{2,}', '', text)

    # Collapse whitespace runs (raw string avoids the invalid "\s" escape).
    text = re.sub(r'\s+', ' ', text)

    return text

# Data preprocessing
def pre_process(text):
    """Tokenise ``text`` with NLTK and drop a short list of Arabic stop words.

    Returns the surviving tokens joined by single spaces.
    """
    stop_words = ["و", "في", "من", "على", "إلى", "عن", "فيه", "عليه", "هو", "هي"]

    kept_tokens = []
    for token in nltk.word_tokenize(text):
        if token in stop_words:
            continue
        kept_tokens.append(token)

    return " ".join(kept_tokens)

# Clean the 'Processed Text' column and echo it after each pass.  The cleaner
# is run twice, exactly as before: the second pass re-normalises whatever the
# first pass's final substitution may have left behind.
for pass_number in range(2):
    data['Processed Text'] = data['Processed Text'].apply(clean_text)
    print(data['Processed Text'])


In [None]:
import re
import nltk
from nltk.corpus import stopwords
# Fetch NLTK's stop-word corpus and build the Arabic stop-word lookup set.
# NOTE(review): nltk.word_tokenize usually needs the 'punkt' resource as well;
# it is not downloaded here - confirm it is already available in the runtime.
nltk.download('stopwords')
stop_words = set(stopwords.words('arabic'))
preprocessed_text = []  # module-level list; the function below binds its own local name


def preprocess_arabic_Remove_stop(text):
    """Return ``text`` with NLTK's Arabic stop words removed.

    The text is split with ``nltk.word_tokenize``; tokens present in the
    module-level ``stop_words`` set are discarded and the remaining tokens are
    joined back together with single spaces.  No stemming is applied.
    """
    surviving_tokens = [
        token for token in nltk.word_tokenize(text) if token not in stop_words
    ]
    preprocessed_text = ' '.join(surviving_tokens)
    return preprocessed_text

# Remove stop words, then lower-case every whitespace-separated token and
# re-join the tokens with single spaces.
data['Processed Text'] = (
    data['Processed Text']
    .apply(preprocess_arabic_Remove_stop)
    .apply(lambda sentence: " ".join(token.lower() for token in sentence.split()))
)
data.head()

In [None]:
!pip install rouge_score

In [None]:
class SummaryDataset(Dataset):
    """Torch dataset that yields tokenised (article, summary) pairs.

    Each item is built from one DataFrame row: the article is read from the
    ``'Processed Text'`` column and the reference summary from ``'summarizer'``.
    """

    def __init__(
        self,
        data=data,
        tokenizer=tokenizer,
        text_max_token_len = 800,
        summary_max_token_len = 150
    ):
        """Store the frame, the tokenizer and the two length limits.

        Args:
            data: DataFrame holding the 'Processed Text' and 'summarizer' columns.
            tokenizer: Callable tokenizer exposing ``pad_token_id``.
            text_max_token_len: Articles are padded/truncated to this length.
            summary_max_token_len: Summaries are padded/truncated to this length.

        Note:
            The ``data`` and ``tokenizer`` defaults are evaluated once, when
            this class statement runs, so they refer to whatever those
            module-level names were bound to at that moment.
        """
        self.tokenizer = tokenizer
        self.data = data
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def _encode(self, text, max_length):
        """Tokenise ``text`` to exactly ``max_length`` tokens as PyTorch tensors."""
        # Use the tokenizer given to the constructor, not the module global,
        # so a dataset built with a different tokenizer actually uses it.
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

    def __getitem__(self, index: int):
        """Return the model inputs for the row at position ``index``.

        Returns:
            dict with 1-D tensors ``input_ids``, ``attention_mask``, ``labels``
            (padding positions replaced by -100) and ``decoder_attention_mask``.
        """
        data_row = self.data.iloc[index]

        text_encoding = self._encode(
            data_row['Processed Text'], self.text_max_token_len
        )
        summary_encoding = self._encode(
            data_row['summarizer'], self.summary_max_token_len
        )

        # Replace padding ids with -100 so those positions are excluded from
        # the loss (-100 is the ignore index of the cross-entropy loss).
        labels = summary_encoding['input_ids']
        labels[labels == self.tokenizer.pad_token_id] = -100

        # The tokenizer returns (1, length) tensors; flatten drops the batch axis
        # so the DataLoader can stack items itself.
        return dict(
            input_ids=text_encoding['input_ids'].flatten(),
            attention_mask=text_encoding['attention_mask'].flatten(),
            labels=labels.flatten(),
            decoder_attention_mask=summary_encoding['attention_mask'].flatten()
        )

In [None]:
# Data Spliting into train and val  
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
df_train, df_test = train_test_split(data, random_state=42, test_size=0.2)

# Wrap each split in the tokenising dataset.
train_dataset, test_dataset = SummaryDataset(data=df_train), SummaryDataset(data=df_test)

# Batches of two examples; only the training loader shuffles.
train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
eval_dataloader = DataLoader(test_dataset, batch_size=2)

In [None]:
# Persist both splits to the working directory (the row index is written too).
for split_frame, csv_name in ((df_train, 'train.csv'), (df_test, 'val.csv')):
    split_frame.to_csv(csv_name)

In [None]:
df_train

In [None]:
df_test

### ***Train mBART Model***

In [None]:
# Fine-tune the model with a manual training loop and checkpoint after each epoch.
num_epochs = 4

# One scheduler step is taken per batch, so the schedule spans every batch of
# every epoch.
num_training_steps = num_epochs * len(train_dataloader)

# NOTE(review): no learning rate is passed, so the optimizer's default is used;
# that default is probably too large for fine-tuning a pretrained model - confirm.
optimizer = AdamW(model.parameters())
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

progress_bar = tqdm(range(num_training_steps))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

model.train()
for epoch in range(num_epochs):
    # Accumulate plain floats so the reported loss is the epoch mean rather
    # than whatever the last batch happened to produce.
    running_loss = 0.0
    batches_seen = 0

    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}

        # The batch carries 'labels', so the model output includes the loss.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update()

        running_loss += loss.item()
        batches_seen += 1

    # max(..., 1) keeps an empty loader from raising instead of reporting 0.0.
    epoch_loss = running_loss / max(batches_seen, 1)

    # 'loss' is stored as a Python float (mean over the epoch) instead of a
    # graph-attached device tensor, so the checkpoint loads on any device.
    # The file name is kept unchanged even though the model is mBART; the same
    # file is overwritten every epoch.
    torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': epoch_loss,
            }, './t5-Arabic.pth')

    print(f'epoch: {epoch + 1} -- loss: {epoch_loss}')

In [None]:
# Score the fine-tuned model on the held-out split with ROUGE.
# ROUGE compares text, so both predictions and references are decoded to
# strings before being handed to the metric (token-id tensors are not valid input).
# NOTE(review): the default ROUGE tokenizer is believed to keep only ASCII
# alphanumerics, which would give near-zero scores on Arabic text - confirm
# and supply an Arabic-aware tokenizer if so.
metric = load_metric("rouge")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}

    # Generate summaries the same way summarizeText does, instead of taking the
    # arg-max of teacher-forced logits (which sees the reference at every step).
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_length=150,
            num_beams=4,
            repetition_penalty=2.5,
            length_penalty=1.0,
            early_stopping=True,
        )

    # The dataset replaced padding with -100 for the loss; restore the real
    # pad id so the labels can be decoded.
    label_ids = batch["labels"].masked_fill(
        batch["labels"] == -100, tokenizer.pad_token_id
    )

    metric.add_batch(
        predictions=tokenizer.batch_decode(generated_ids, skip_special_tokens=True),
        references=tokenizer.batch_decode(label_ids, skip_special_tokens=True),
    )

metric.compute()

In [None]:
def summarizeText(text, model=model, *, max_input_len=1000, max_summary_len=150, num_beams=4):
    """Generate a summary for ``text`` with beam search.

    Uses the module-level ``tokenizer`` and ``device``.

    Args:
        text: The article to summarise (a string, or a list of strings).
        model: Sequence-to-sequence model exposing ``generate``; defaults to
            the model bound to the module-level name when this function was
            defined.
        max_input_len: Inputs longer than this many tokens are truncated.
        max_summary_len: Upper bound on the generated sequence length.
        num_beams: Beam width used during generation.

    Returns:
        The decoded summary.  When several sequences are generated they are
        concatenated without a separator.
    """
    # Dropout must be off for inference; this is a no-op if already in eval mode.
    model.eval()

    # padding=True pads only up to the longest sequence in the call, so a
    # single article is not inflated to max_input_len tokens of padding.
    text_encoding = tokenizer(
        text,
        max_length=max_input_len,
        padding=True,
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )
    generated_ids = model.generate(
        input_ids=text_encoding['input_ids'].to(device),
        attention_mask=text_encoding['attention_mask'].to(device),
        max_length=max_summary_len,
        num_beams=num_beams,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True
    )

    preds = [
        tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for gen_id in generated_ids
    ]
    return "".join(preds)


In [None]:
# Location of the validation split.
path_input = '/kaggle/working/val.csv'

# Load it back into a frame and preview the first rows.
dataset = pd.read_csv(filepath_or_buffer=path_input)

dataset.head()

In [None]:
# 'Unnamed: 0' is the unnamed leading column pandas reads from the CSV; remove it in place.
dataset.drop(columns='Unnamed: 0', inplace=True)

In [None]:
dataset

In [None]:
# Data Preprocessing
# Step 1: collect fully duplicated rows of the validation frame and drop them
# when any exist.
dupl = dataset.loc[dataset.duplicated()]
if len(dupl):
    # Report how many rows are being removed, then drop the repeats.
    print(len(dupl))
    dataset = dataset.drop_duplicates()
    
    # Second Checking 'empty cells' and removing them if they are exist
    # Here we deal with empty cells (either removing them Or setting them with default data) but we won't do that because data is cleaned

    # Third Cleaning data from (Arabic and English digits, special characters, and extra spaces)
import re
def clean_text(text):
    """Reduce ``text`` to Arabic letters separated by single spaces.

    Everything except Arabic letters (U+0621-U+064A) and whitespace is removed,
    which covers punctuation, Latin text and every kind of digit.  Whitespace
    runs are then collapsed and the result is stripped.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # One well-formed allow-list replaces the previous two-step removal: the
    # old character class was malformed and only gave the right result because
    # a later digit-stripping pass removed what it accidentally let through.
    text = re.sub(r'[^\u0621-\u064A\s]+', '', text)

    # Remove extra whitespace (raw string avoids the invalid "\s" escape).
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Apply the clean_text function to the content column
# Run the cleaner over every validation text, then show the cleaned column
# (rendered as the cell output).
dataset['Processed Text'] = dataset['Processed Text'].map(clean_text)
dataset['Processed Text']

In [None]:
import pandas as pd

# Function to apply the model to each instance
def apply_model(instance):
    """Return the summary that ``summarizeText`` produces for ``instance``."""
    return summarizeText(instance)

# Create an empty list to store the model outputs
# Summarise every validation text.  Iterating the column directly (instead of
# iterrows) avoids building a Series per row, and passing the total lets the
# progress bar show completion and an ETA.
model_outputs = [
    apply_model(instance)
    for instance in tqdm(dataset['Processed Text'], total=len(dataset))
]

# Store the generated summaries next to their inputs.
# NOTE(review): this overwrites any existing 'summarizer' column in `dataset`
# (the reference summaries read from val.csv, if present) - confirm intended.
dataset['summarizer'] = model_outputs

# Display the generated summaries.
print(dataset['summarizer'])


In [None]:
# Data Preprocessing
# Step 1: collect fully duplicated rows and drop them when any exist.
# NOTE(review): this de-duplicates `data`, while the rest of this cell works
# on `dataset` - confirm which frame was meant.
dupl = data.loc[data.duplicated()]
if len(dupl):
    # Report how many rows are being removed, then drop the repeats.
    print(len(dupl))
    data = data.drop_duplicates()
    
    #Second Checking 'empty cells' and removing them if they are exist
    #Here we deal with empty cells (either removing them Or setting them with default data) but we won't do that because data is cleaned

    #Third Cleaning data from (Arabic and English digits, special characters, and extra spaces)
import re
def clean_text(text):
    """Keep only Arabic letters, Arabic-Indic digits and single spaces.

    Characters outside U+0621-U+064A (letters), U+0660-U+0669 (Arabic-Indic
    digits) and whitespace are removed; whitespace runs are then collapsed and
    the result is stripped.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # The digit range is written as a real range.  The previous class
    # ("\u0660 - \0669") was parsed as U+0660, a space, the octal escape \066
    # ('6') and a literal '9', so only those three digits survived.
    # NOTE(review): ASCII digits are removed by this allow-list; the separate
    # ASCII-digit pass was disabled in the original - confirm whether ASCII
    # digits should instead be kept.
    text = re.sub(r'[^\u0621-\u064A\u0660-\u0669\s]+', '', text)

    # Remove extra whitespace (raw string avoids the invalid "\s" escape).
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Apply the clean_text function to the content column
dataset['summarizer'] = dataset['summarizer'].apply(clean_text)
print(dataset['summarizer'])