## Project Overview
##### This project involves preprocessing Arabic text, generating summaries using the pre-trained mBART model, and applying these processes to a dataset.

## 1. Import Libraries

In [1]:
import pandas as pd
import re
import regex
import nltk
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
from transformers import MBartForConditionalGeneration, MBartTokenizer
import torch
import warnings

warnings.filterwarnings('ignore', message="The tokenizer class you load from this checkpoint is not the same type as the class this function is called from.*")




## 2. Load mBART Model and Tokenizer

In [2]:
tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-50')
model = MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-50').to("cuda")

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.


pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

##  3. Preprocess Arabic Text

In [3]:
def preprocess_arabic_text(text):
    text = re.sub(r"(.)\1{2,}", r"\1\1", text)
    text = re.sub(r"[!؟،؛\.\(\)\[\]\{\}<>\"\'\`~@#\$%\^&\*\+=_\|\\/]+", " ", text)
    text = re.sub(r"[ٌٍَُِّْ]", "", text)
    text = re.sub(r"[^\u0621-\u064A0-9\s]", "", text)
    text = regex.sub(r"[^\p{Script=Arabic}\s.,]","",text)
    text = regex.sub(r"\p{M}", "", text)
    text = regex.sub(r"[\u064b-\u065f\u0640]","", text)
    text = text.replace('""', '')
    text = regex.sub(r"^\d+\s+", "", text)
    text = regex.sub(r"\n+", "\n", text)
    text = regex.sub(r"\t+", "", text)
    text = regex.sub(r'[^a-zA-Z0-9ء-ي\s]', '', text)
    text = regex.sub(r"\s+", " ", text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text

## 4. Generate Summary Function

In [4]:
def generate_summary(text, min_len, max_len):
    # Load the pre-trained mBART model and tokenizer
    tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-50')
    model = MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-50').to("cuda")
    # Tokenize the input text
    inputs = tokenizer.encode_plus(text, return_tensors='pt', max_length=2000, truncation=True)
    # Move the input to the GPU
    inputs = {k: v.to('cuda') for k, v in inputs.items()}
    # Generate the summary
    summary_ids = model.generate(inputs['input_ids'], num_beams=60, max_length=max_len, min_length=min_len, early_stopping=True,)
    # Move the summary IDs to the CPU
    summary_ids = summary_ids[0].cpu()
    # Decode the generated summary
    summary = tokenizer.decode(summary_ids, skip_special_tokens=True)

    return summary


## 5. Generate Summaries for DataFrame

In [5]:
def generate_summaries(df):
    # Add columns for minimum and maximum summary lengths
    df["len_para"]=df['paragraph'].apply(len)
    df["min_len"]=df["len_para"]*.06
    df["max_len"]=df["len_para"]*.13
    df["min_len"]=df["min_len"].apply(lambda x: round(x))
    df["max_len"]=df["max_len"].apply(lambda x: round(x))
    df['summary'] = df.apply(lambda row: generate_summary(row['paragraph'], min_len=row['min_len'], max_len=row['max_len']), axis=1)
    df['summary'] = df['summary'].apply(preprocess_arabic_text)
    df=df.drop(['min_len','max_len','len_para'],axis=1)
    return  df


# Load the dataset
df = pd.read_csv("/kaggle/input/labeled/labeled.csv")
df = df[:10]  # Selecting a subset of the dataset for summary generation
result = generate_summaries(df)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result

## 6. Display Original Text and Summaries

In [6]:
def display_original_and_summary(df):
    for index, row in df.iterrows():
        print(f"Original Text (Paragraph {index+1}):")
        print(row['paragraph'])
        print("\nGenerated Summary:")
        print(row['summary'])
        print("\n" + "-"*50 + "\n")

# Displaying the original texts and their summaries
display_original_and_summary(result)


Original Text (Paragraph 1):
وتحت عنوان من الكارثة إلى التحدى يبدأ الكاتب عرض الكتاب الرابع ، حيث يوضح كيف كانت إسرائيل فرحة بنصرها عام 67 وأنها ارتاحت لاعتقادها بأن هناك وقتا طويلا وطويلا جدا قبل أن يفيق العرب من صدمة 67، وكيف أن القوات الجوية للجمهورية العربية المتحدة قد فاجأتها بعد شهر واحد من نهاية حرب 67 بهجوم جوى عنيف على مواقعها فى سيناء وكان هذا إعلانا عن بداية حرب من نوع جديد هى حرب الاستنزاف التى استمرت حتى تم وقف إطلاق النار بين الطرفين فى 8 أغسطس 1970، ثم وفاة عبدالناصر وتولى أنور السادات حكم مصر واستعداده للحرب . ويتعرض الكاتب أيضا وبصورة سريعة لفلسطين والأردن وسوريا قبل أن ينتقل إلى الكتاب الخامس عن حرب أكتوبر ، حيث يعرض الخطط والاستعدادات المصرية ثم الاستعدادات الإسرائيلية ثم يبدأ بعرض وقائع الحرب بداية من الضربة الجوية وانهيار خط بارليف واختراقه ، ويتوقف الكاتب عند يوم 8 أكتوبر ، ويقول : إن هذا اليوم كان اسوأ هزيمة فى تاريخ الجيش الإسرائيلى ثم ينتقل بنا المؤلف إلى الجبهة السورية ثم يعود ثانية إلى يوميات الحرب حتى 7 9 أكتوبر إلى 9 13 أكتوبر ثم 14 أكتوبر ، ثم يعرض للثغرة 