In [2]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
from nltk.stem import ISRIStemmer
from pathlib import Path

stemmer = ISRIStemmer()
tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")

pd.set_option('display.max_rows', None)


df = pd.read_csv('WikiLingua_cleaned.csv')

print(df.tail())


2023-06-29 02:08:33.448448: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-29 02:08:36.544281: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-29 02:08:36.784408: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /tmp/.mount_CursorQ9GjqY/usr/lib:
2023-06-29 02:08:36.784437: I tensorflow/compiler

          idx                                           document  \
29193  9993_1  قد تصبح لديك مناطق من الماء الآسن إذا تشكلت كت...   
29194  9993_2  . يجب ألا تنمو الطحالب إذا بقيت مطلعًا على كيم...   
29195  9994_0  عند الانتهاء من تناول الوجبة وتدوين الملاحظات،...   
29196  9994_1  يجب أن تحصل على نفس التجربة التي سيحصل عليها أ...   
29197  9994_2  عندما يصل الطبق إلى طاولتك، دوّن ملاحظتك عن مظ...   

                                                 summary  
29193  قم بتحسين الدورة للتعامل مع مع البقع الصغيرة م...  
29194   حافظ على ماء حمام السباحة أضف مبيد الطحالب كو...  
29195  قم ببعض البحث. افتتح التقييم بجملة جذابة. صف 3...  
29196  تجنب إخبار العاملين بالمطعم أنك ناقد أو مقيّم ...  
29197  لاحظ طريقة تقديم الطبق. استمتع بأول قضمات. اكت...  


In [3]:
with open(Path("Stopwords/Stopwords_List.txt"), "r", encoding="utf-8") as f:
    arabic_stopwords = set(f.read().splitlines())

def preprocess(text):
    if isinstance(text,str):
        text = re.sub(r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]+', ' ', text)
        tokens = tokenizer.tokenize(text, add_special_tokens=False)
        tokens = [token for token in tokens if token not in arabic_stopwords]
        # tokens = [stemmer.stem(token) for token in tokens]
        tokens = [token.translate(str.maketrans("", "", string.punctuation)) for token in tokens]
        tokens = [token for token in tokens if not token.isdigit()]
        tokens = [token for token in tokens if token]
        preprocessed_text = ' '.join(tokens)
        return preprocessed_text
    else:
        return ''

batch_size = 8

texts = df['document'].tolist()
summaries = df['summary'].tolist()

text_batches = [texts[i:i+batch_size] for i in range(0, len(texts), batch_size)]
summary_batches = [summaries[i:i+batch_size] for i in range(0, len(summaries), batch_size)]

processed_texts = []
processed_summaries = []

with open("Preprocessed_Texts/preprocessed_texts.txt", "w", encoding="utf-8") as text_file, \
        open("Preprocessed_Texts/preprocessed_summaries.txt", "w", encoding="utf-8") as summary_file:

    for i, (text_batch, summary_batch) in enumerate(zip(text_batches, summary_batches)):
        batch_texts = [preprocess(text) for text in text_batch]
        batch_summaries = [preprocess(summary) for summary in summary_batch]

        processed_texts.extend(batch_texts)
        processed_summaries.extend(batch_summaries)

        for j, text in enumerate(batch_texts):
            text_file.write(f"Text {i * batch_size + j + 1}: {text}\n")

        for j, summary in enumerate(batch_summaries):
            summary_file.write(f"Summary {i * batch_size + j + 1}: {summary}\n")

df['text'] = processed_texts[:len(df)]
df['summary'] = processed_summaries[:len(df)]
print(df.head())

   idx                                           document  \
0  0_0  يكون سعر الفاكهة والخضراوات في موسم إنباتها أق...   
1  0_1  الأطعمة الصحية ليست باهظة الثمن بالضرورة، بل ف...   
2  0_2  استفد من حديقتك المنزلية أو أصيص الزرع الصغير ...   
3  0_3  تساعدك الخطط المسبقة في كل نواحي حياتك على وضع...   
4  1_0  نظرًا لأن السبب الرئيسي لضغط العين هو أن ثقافة...   

                                             summary  \
0  تناول الفاكهة والخض راوات موسم ها تعرف اسعار ا...   
1  فضل خيارات الاطعمة الار خص ثمنا تب اطا استهلاك...   
2  از رع يمكنك خضر اوا ت وفا كهة اط ه بنفسك وجبات...   
3  خطط مسبقا لوج بات ك الرييسية لمدة اسبوع التزم ...   
4               قل ل وقت التعرض للش اشات اذهب الطبيب   

                                                text  
0  سعر الفاكهة والخض راوات موسم انبا تها اقل غيره...  
1  الاطعمة الصحية ليست باه ظة الثمن بالضرورة حقيق...  
2  استف د حد يق تك المنزلية اص يص الزرع الصغير ال...  
3  تساعدك الخطط المسب قة نواحي حياتك وضع تصور كام...  
4  نظرا لان السبب الر

In [4]:
model = TFAutoModel.from_pretrained("aubmindlab/bert-base-arabertv02")

train_size = int(0.8 * len(df))
train_texts = df['document'][:train_size].tolist()
train_summaries = df['summary'][:train_size].tolist()
test_texts = df['document'][train_size:].tolist()
test_summaries = df['summary'][train_size:].tolist()

optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss_fn = tf.keras.losses.MeanSquaredError()

def train_step(inputs, targets):
    with tf.GradientTape() as tape:
        predictions = model(inputs)[0]
        loss = loss_fn(targets, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

def evaluate(texts, summaries):
    rouge = tf.keras.metrics.Mean()
    for i in range(len(texts)):
        inputs = tokenizer.encode(texts[i], max_length=512, truncation=True, padding='max_length', return_tensors='tf')
        targets = tokenizer.encode(summaries[i], max_length=128, truncation=True, padding='max_length', return_tensors='tf')
        predictions = model(inputs)[0]
        rouge.update_state(targets, predictions)
    return rouge.result().numpy()

epochs = 20

for epoch in range(epochs):
    for i in range(0, len(train_texts), batch_size):
        batch_texts = train_texts[i:i+batch_size]
        batch_summaries = train_summaries[i:i+batch_size]
        inputs = tokenizer.batch_encode_plus(batch_texts, max_length=512, truncation=True, padding='max_length', return_tensors='tf')
        targets = tokenizer.batch_encode_plus(batch_summaries, max_length=128, truncation=True, padding='max_length', return_tensors='tf')
        loss = train_step(inputs['input_ids'], targets['input_ids'])
        print(f"Epoch {epoch + 1}, Batch{int(i/batch_size) + 1} Loss: {loss:.4f}")

    rouge_score = evaluate(test_texts, test_summaries)
    print(f"Epoch {epoch + 1} ROUGE: {rouge_score:.4f}")

2023-06-29 02:10:15.102287: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-29 02:10:15.161095: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /tmp/.mount_CursorQ9GjqY/usr/lib:
2023-06-29 02:10:15.161364: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusparse.so.11'; dlerror: libcusparse.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /tmp/.mount_CursorQ9GjqY/usr/lib:
2023-06-29 02:10:15.161434: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are