In [1]:
import pandas as pd
import json

# Read only the useful columns (paragraph, label) into a dataframe
def create_dataframe_from_jsonl(file_path):
    """Load the ``paragraph`` and ``label`` fields of a JSON Lines file.

    Every non-blank line of the file is parsed as one JSON object and only its
    ``paragraph`` and ``label`` values are kept.  Blank or whitespace-only
    lines (for example a trailing empty line at the end of the file) are
    skipped instead of crashing ``json.loads``.

    Args:
        file_path: Path to a UTF-8 encoded ``.jsonl`` file.

    Returns:
        A ``pandas.DataFrame`` with the columns ``paragraph`` and ``label``,
        one row per record, in file order.  The two columns are present even
        when the file contains no records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``paragraph`` or ``label``.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Tolerate empty lines; they carry no record.
            if not line.strip():
                continue
            record = json.loads(line)
            records.append({'paragraph': record['paragraph'], 'label': record['label']})
    # Naming the columns explicitly keeps the schema stable for empty inputs.
    return pd.DataFrame(records, columns=['paragraph', 'label'])

# Locations of the three JSONL splits (train / dev / test).
train_path, dev_path, test_path = (
    'ArMPro_binary_train.jsonl',
    'ArMPro_binary_dev.jsonl',
    'ArMPro_binary_test.jsonl',
)

# Load each split into its own dataframe, in train -> dev -> test order.
train, dev, test = (
    create_dataframe_from_jsonl(split_path)
    for split_path in (train_path, dev_path, test_path)
)

# Preview the first rows of the test split.
test.head()

Unnamed: 0,paragraph,label
0,فيما أشار الدكتور أحمد خليفة، الرئيس التنفيذي ...,False
1,"وقال ""لديه الموهبة والجودة، الأمر لا يتعلق بتس...",True
2,الرعاية الصحية والنفسية للأطفال هي من بين الاو...,True
3,- جرى تداول معلومات مؤخرا عن رغبة دول الحصار ب...,True
4,وجاء إعلان السلطات السورية لينفي معلومات نشرها...,False


In [2]:
# Count how many training rows carry each label value (class-balance check).
label_counts = train['label'].value_counts()
print(label_counts)

label
true     3777
false    2225
Name: count, dtype: int64


In [3]:
# Alharbi, Alaa, and Mark Lee. "Kawarith: an Arabic Twitter Corpus for Crisis Events."
# Proceedings of the Sixth Arabic Natural Language Processing Workshop. 2021

#!wget https://raw.githubusercontent.com/alaa-a-a/multi-dialect-arabic-stop-words/main/Stop-words/stop_list_1177.txt
# Read the stop-word file into a list: one entry per line, trailing newline dropped.
with open('./stop_list_1177.txt', encoding='utf-8') as f:
    arabic_stop_words = [word.rstrip("\n") for word in f]

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
import re
import string
nltk.download('stopwords')
#!pip install datasets
from datasets import Dataset



def normalize_arabic(text):
    """Collapse spelling variants of several Arabic letters to a single form.

    The substitutions are applied one after another, in the order listed:
    the alef variants in ``[إأآا]`` become ``ا``; ``ى`` becomes ``ي``;
    ``ؤ`` and ``ئ`` become ``ء``; ``ة`` becomes ``ه``; ``گ`` becomes ``ك``.

    Args:
        text: The string to normalize.

    Returns:
        The string with every listed character replaced.
    """
    substitutions = (
        ("[إأآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "ء"),
        ("ئ", "ء"),
        ("ة", "ه"),
        ("گ", "ك"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def remove_diacritics(text):
    """Delete the Arabic marks listed in the pattern below from ``text``.

    The pattern is written in verbose mode, so the whitespace and the ``#``
    comments inside it are ignored; each remaining character is one
    alternative to remove.

    Args:
        text: The string to strip.

    Returns:
        ``text`` without any of the listed marks.
    """
    diacritics_pattern = re.compile("""
                             ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    return diacritics_pattern.sub('', text)


def remove_punctuations(text):
    """Delete punctuation characters from ``text``.

    A character is removed when it appears either in the Arabic/typographic
    punctuation string defined below or in ``string.punctuation``.

    Args:
        text: The string to strip.

    Returns:
        ``text`` with every listed punctuation character removed.
    """
    arabic_marks = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
    # Mapping a code point to None makes str.translate drop that character.
    deletion_table = {ord(mark): None for mark in arabic_marks + string.punctuation}
    return text.translate(deletion_table)


def remove_stop_words(text):
    """Drop every token of ``text`` that appears in ``arabic_stop_words``.

    The text is lower-cased and split with NLTK's ``wordpunct_tokenize``;
    the surviving tokens are re-joined with single spaces.

    Args:
        text: The string to filter.

    Returns:
        The remaining tokens joined by single spaces, stripped of leading and
        trailing whitespace.
    """
    # A set gives O(1) membership tests; scanning the stop-word list for
    # every token is needlessly quadratic on long paragraphs.
    stop_words = set(arabic_stop_words)
    tokens = nltk.tokenize.wordpunct_tokenize(text.lower())
    kept_tokens = [token for token in tokens if token not in stop_words]
    return " ".join(kept_tokens).strip()

def remove_non_arabic_letters(text):
    """Blank out URLs, Latin/digit runs, mentions and ``#``; drop tatweels.

    Each URL (``http`` followed by non-space characters), each run of
    ``@``/ASCII letters/digits/underscore, and each ``#`` sign is replaced by
    a single space, so the result may contain consecutive spaces.  Runs of
    the tatweel character (U+0640) are then deleted outright.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # The URL alternative must come first: regex alternation is tried left to
    # right, and the character class would otherwise consume only the leading
    # "http" and leave "://host/path" behind.
    text = re.sub(r'http\S+|[@A-Za-z0-9_]+|#', ' ', text)
    # Remove tatweel (kashida) runs of any length, not just one fixed length.
    text = re.sub('\u0640+', '', text)
    return text

def clean_str(text):
    """Run the complete text-cleaning pipeline on ``text``.

    The steps run in this fixed order: letter normalization, diacritic
    removal, punctuation removal, stop-word removal, and finally removal of
    Latin/digit runs.  Because that last step substitutes spaces, the result
    can contain runs of several spaces.

    NOTE(review): stop words are matched after normalization; if the stop-word
    list itself is not normalized, some entries may never match -- confirm.

    Args:
        text: Raw paragraph text.

    Returns:
        The cleaned string.
    """
    pipeline = (
        normalize_arabic,
        remove_diacritics,
        remove_punctuations,
        remove_stop_words,
        remove_non_arabic_letters,
    )
    for step in pipeline:
        text = step(text)
    return text

# Smoke-test the cleaning pipeline on one example sentence; as the cell's last
# expression, the cleaned string is displayed.
sample_text = "فلاديمير بوتين: الاستفتاء الذي قد يُبقي الزعيم الروسي 36 عاما في السلطة"
clean_str(sample_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alyfa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'فلاديمير بوتين الاستفتاء يبقي الزعيم الروسي   السلطه'

In [5]:
#preparing the data for training

def _label_to_int(value):
    """Map a raw label to 1 (positive) or 0.

    The string ``'true'`` maps to 1 and any other string to 0.  Non-string
    values equal to 1 (``True`` or an already-converted ``1``) also map to 1,
    which makes the conversion idempotent: re-running this cell no longer
    turns every previously converted label into 0.
    """
    if isinstance(value, str):
        return 1 if value == 'true' else 0
    return 1 if value == 1 else 0


for frame in (train, dev, test):
    # cleaning data
    frame['cleaned_paragraph'] = frame['paragraph'].apply(clean_str)
    # mapping labels to binary
    frame['label'] = frame['label'].apply(_label_to_int)

# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train[['cleaned_paragraph', 'label']])
dev_dataset = Dataset.from_pandas(dev[['cleaned_paragraph', 'label']])
test_dataset = Dataset.from_pandas(test[['cleaned_paragraph', 'label']])


In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification ,Trainer, TrainingArguments
import torch 
# Load the pretrained tokenizer for the checkpoint.
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")

# Prefer the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load the sequence-classification model from the same checkpoint and move it
# to the chosen device.
model = AutoModelForSequenceClassification.from_pretrained("aubmindlab/bert-base-arabertv2")
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def tokenize_function(examples):
    """Tokenize the ``cleaned_paragraph`` field of ``examples``.

    Inputs are truncated and padded to a fixed length of 512 tokens using the
    module-level ``tokenizer``.

    Args:
        examples: Mapping that provides a ``cleaned_paragraph`` entry.

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    texts = examples['cleaned_paragraph']
    encoding_options = {"padding": "max_length", "truncation": True, "max_length": 512}
    return tokenizer(texts, **encoding_options)

#tokenize datasets (train, dev, test -- processed in that order)
train_dataset, dev_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, dev_dataset, test_dataset)
)

# Set format for PyTorch: expose only the model inputs and the label as tensors
for split in (train_dataset, dev_dataset, test_dataset):
    split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/6002 [00:00<?, ? examples/s]

Map:   0%|          | 0/672 [00:00<?, ? examples/s]

Map:   0%|          | 0/1326 [00:00<?, ? examples/s]

In [8]:
# Fine-tuning configuration: 3 epochs, batch size 8, evaluation after each epoch.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./binary_model',
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
)

# Trainer that fits on the train split and evaluates on the dev split.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=dev_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch,Training Loss,Validation Loss
1,0.5806,0.480917
2,0.4363,0.476576
3,0.3279,0.619852


TrainOutput(global_step=2253, training_loss=0.44349142820423465, metrics={'train_runtime': 1851.4138, 'train_samples_per_second': 9.726, 'train_steps_per_second': 1.217, 'total_flos': 4737577662812160.0, 'train_loss': 0.44349142820423465, 'epoch': 3.0})

In [9]:
# Evaluate with `trainer` on the eval_dataset it was constructed with and
# print the resulting metrics dict.
results = trainer.evaluate()
print(results)

{'eval_loss': 0.6198519468307495, 'eval_runtime': 20.932, 'eval_samples_per_second': 32.104, 'eval_steps_per_second': 4.013, 'epoch': 3.0}


In [10]:
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for a prediction object.

    Predicted classes are taken as the argmax over the last axis of
    ``pred.predictions`` and compared against ``pred.label_ids``.  Precision,
    recall and F1 use scikit-learn's ``average='binary'`` setting.

    Args:
        pred: Object exposing ``predictions`` and ``label_ids`` attributes.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        precision=precision,
        recall=recall,
        f1=f1,
    )

# Evaluation-only arguments (note: this rebinds the name `training_args`).
training_args = TrainingArguments(output_dir='./results', per_device_eval_batch_size=8)

# Separate Trainer that scores the current `model` on the test split and
# reports the metrics produced by `compute_metrics`.
eval_trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
)

In [11]:
# Run evaluation with `eval_trainer`; the returned metrics dict is displayed
# as the cell output.
eval_trainer.evaluate()

{'eval_loss': 0.6862267851829529,
 'eval_accuracy': 0.7435897435897436,
 'eval_precision': 0.8137755102040817,
 'eval_recall': 0.7668269230769231,
 'eval_f1': 0.7896039603960396,
 'eval_runtime': 41.3331,
 'eval_samples_per_second': 32.081,
 'eval_steps_per_second': 4.016}