In [5]:
import pandas as pd
import json

# Read the useful columns (paragraph, label) into a DataFrame
def create_dataframe_from_jsonl(file_path):
    """Load the ``paragraph`` and ``label`` fields of a JSON Lines file.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON object per line.

    Returns:
        A ``pandas.DataFrame`` with exactly the columns ``paragraph`` and
        ``label`` (in that order), one row per non-blank line of the file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``paragraph`` or ``label`` key.
    """
    columns = ['paragraph', 'label']
    rows = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at end of file) are not
                # records; json.loads would raise on them.
                continue
            record = json.loads(line)
            rows.append({key: record[key] for key in columns})
    # Passing ``columns`` keeps both columns present even when the file is empty.
    return pd.DataFrame(rows, columns=columns)

# Locations of the three ArMPro binary splits
train_path = '/content/ArMPro_binary_train.jsonl'
dev_path = '/content/ArMPro_binary_dev.jsonl'
test_path = '/content/ArMPro_binary_test.jsonl'

# Read every split with the shared JSONL reader (train, then dev, then test)
train, dev, test = map(create_dataframe_from_jsonl, (train_path, dev_path, test_path))

# Preview the first rows of the test split
test.head()

Unnamed: 0,paragraph,label
0,فيما أشار الدكتور أحمد خليفة، الرئيس التنفيذي ...,False
1,"وقال ""لديه الموهبة والجودة، الأمر لا يتعلق بتس...",True
2,الرعاية الصحية والنفسية للأطفال هي من بين الاو...,True
3,- جرى تداول معلومات مؤخرا عن رغبة دول الحصار ب...,True
4,وجاء إعلان السلطات السورية لينفي معلومات نشرها...,False


In [6]:
# Tally how many training rows carry each label value and print the counts.
label_counts = train['label'].value_counts()
print(label_counts)

label
true     3777
false    2225
Name: count, dtype: int64


In [2]:
# Alharbi, Alaa, and Mark Lee. "Kawarith: an Arabic Twitter Corpus for Crisis Events."
# Proceedings of the Sixth Arabic Natural Language Processing Workshop. 2021

!wget https://raw.githubusercontent.com/alaa-a-a/multi-dialect-arabic-stop-words/main/Stop-words/stop_list_1177.txt
arabic_stop_words = []
with open ('./stop_list_1177.txt',encoding='utf-8') as f :
    for word in f.readlines() :
        arabic_stop_words.append(word.split("\n")[0])

--2024-07-27 13:21:16--  https://raw.githubusercontent.com/alaa-a-a/multi-dialect-arabic-stop-words/main/Stop-words/stop_list_1177.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11468 (11K) [text/plain]
Saving to: ‘stop_list_1177.txt.1’


2024-07-27 13:21:16 (84.7 MB/s) - ‘stop_list_1177.txt.1’ saved [11468/11468]



In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
import re
import string
#!pip install datasets
from datasets import Dataset
# NOTE(review): none of the code visible in this notebook reads NLTK's
# 'stopwords' corpus (stop words come from arabic_stop_words) — confirm this
# download is still needed.
nltk.download('stopwords')


def normalize_arabic(text):
    """Fold Arabic letter variants onto one canonical letter each.

    The substitutions are applied one after another, in the order listed in
    the table below, and the rewritten text is returned.
    """
    # (regex pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        ("[إأآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "ء"),
        ("ئ", "ء"),
        ("ة", "ه"),
        ("گ", "ك"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def remove_diacritics(text):
    """Return ``text`` with the marks listed in the pattern below deleted."""
    # re.VERBOSE allows one mark per line with a trailing name comment; the
    # whitespace and '#' comments inside the pattern are ignored by the engine.
    diacritics_pattern = re.compile("""
                             ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    return diacritics_pattern.sub('', text)


def remove_punctuations(text):
    """Delete every Arabic and ASCII punctuation character from ``text``."""
    arabic_marks = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
    # Map each punctuation code point to None so that str.translate drops it.
    deletions = dict.fromkeys(map(ord, arabic_marks + string.punctuation))
    return text.translate(deletions)


def remove_stop_words(text):
    """Drop every token of ``text`` that appears in ``arabic_stop_words``.

    The text is lower-cased and split with NLTK's ``wordpunct_tokenize``; the
    surviving tokens are returned joined by single spaces.
    """
    # Build a set once per call: each token lookup is then a hash probe
    # instead of a scan over the whole stop-word list.
    stop_words = set(arabic_stop_words)
    tokens = nltk.tokenize.wordpunct_tokenize(text.lower())
    kept = [token for token in tokens if token not in stop_words]
    return " ".join(kept).strip()

def remove_non_arabic_letters(text):
    """Blank out URLs, Latin letters, digits, ``@``/``_`` runs and ``#`` signs.

    Every match is replaced by a single space, so the result can contain runs
    of spaces. A fixed-length run of tatweel characters is deleted afterwards.
    """
    # The URL branch must come first: alternation is tried left to right, so
    # with the Latin-run branch in front it consumed just "http" and the URL
    # branch could never match, leaving "://", "." and "/" debris in the text.
    text = re.sub(r'http\S+|[@A-Za-z0-9_]+|#', ' ', text)
    # Delete a literal run of tatweel (kashida) characters.
    text = re.sub(r'ـــــــــــــ', '', text)
    return text

#cleaning data
def clean_str(text):
    """Run the full cleaning pipeline on one piece of text.

    Steps, in order: letter normalisation, diacritic removal, punctuation
    removal, stop-word removal, then blanking of Latin letters, digits and URLs.

    Returns:
        The cleaned text with words separated by single spaces and no leading
        or trailing whitespace.
    """
    text = normalize_arabic(text)
    text = remove_diacritics(text)
    text = remove_punctuations(text)
    # NOTE(review): stop words are matched after normalize_arabic has rewritten
    # the text, while arabic_stop_words is loaded as-is — confirm that list
    # entries containing letters rewritten by normalize_arabic still match.
    text = remove_stop_words(text)
    text = remove_non_arabic_letters(text)
    # remove_non_arabic_letters puts a space in place of every match, which
    # leaves runs of spaces (e.g. where a number was); collapse and trim them.
    return " ".join(text.split())



# Quick sanity check of the cleaning pipeline on one headline that contains
# punctuation, a diacritic and digits.
sample_text = "فلاديمير بوتين: الاستفتاء الذي قد يُبقي الزعيم الروسي 36 عاما في السلطة"

clean_str(sample_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'فلاديمير بوتين الاستفتاء يبقي الزعيم الروسي   السلطه'

In [8]:
# Add the cleaned text as a new column on every split
for split_frame in (train, dev, test):
    split_frame['cleaned_paragraph'] = split_frame['paragraph'].apply(clean_str)

# Preview the dev split with its new column
dev.head()

Unnamed: 0,paragraph,label,cleaned_paragraph
0,وقالت الحكومة السودانية الثلاثاء إنها رفضت مقت...,False,الحكومه السودانيه الثلاثاء رفضت مقترحا اثيوبيا...
1,وأكد الأستاذ نصر باغريب، أن إنشاء مركز التدريب...,True,الاستاذ نصر باغريب انشاء مركز التدريب والتاهيل...
2,وتلعب العوامل الإقليمية والعرقية والاجتماعية و...,False,وتلعب العوامل الاقليميه والعرقيه والاجتماعيه و...
3,وسيتابع موقع الصحراوي هذا الموضوع عن كثب خلال ...,True,وسيتابع موقع الصحراوي الموضوع كثب الايام والاش...
4,المحاور: الولايات المتحدة الأميركية حذرت رعايا...,False,المحاور الولايات المتحده الاميركيه حذرت رعاياه...


In [9]:
# mapping labels to binary
def _label_to_int(value):
    """Map one raw label to 1 (positive) or 0 (negative).

    Strings are compared case-insensitively with 'true'; any other value maps
    to 1 only when it equals 1 (which covers ``True`` and an already-mapped 1).
    This makes the mapping idempotent: re-running the cell on 0/1 labels keeps
    them unchanged instead of turning every label into 0.
    """
    if isinstance(value, str):
        return int(value.strip().lower() == 'true')
    return int(value == 1)


for split_frame in (train, dev, test):
    split_frame['label'] = split_frame['label'].apply(_label_to_int)

# Tokenization parameters
max_num_words = 10000  # Maximum number of words to keep, based on word frequency
max_sequence_length = 512  # Maximum sequence length (matching the tokenizer settings)

In [10]:
from datasets import Dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Dropout
import numpy as np
# Fit the word index on the training split only
tokenizer = Tokenizer(num_words=max_num_words)
tokenizer.fit_on_texts(train['cleaned_paragraph'])

# Integer-encode the cleaned text of each split
train_sequences, dev_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split_frame['cleaned_paragraph'])
    for split_frame in (train, dev, test)
)

# Pad / truncate every sequence to max_sequence_length
train_padded, dev_padded, test_padded = (
    pad_sequences(sequences, maxlen=max_sequence_length)
    for sequences in (train_sequences, dev_sequences, test_sequences)
)

# Labels as NumPy arrays
train_labels, dev_labels, test_labels = (
    np.array(split_frame['label']) for split_frame in (train, dev, test)
)

# Model hyper-parameters
embedding_dim = 128
num_filters = 64
kernel_size = 5
lstm_units = 64
dropout_rate = 0.5

In [11]:
# Seed Python, NumPy and TensorFlow before any layer is created so that weight
# initialisation, dropout and batch shuffling repeat across "Restart & Run All"
# (some GPU kernels may still introduce small run-to-run differences).
import tensorflow as tf
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Build the CNN-LSTM model:
# embedding -> 1D convolution -> max pooling -> LSTM -> dropout -> sigmoid unit
model = Sequential()
model.add(Embedding(input_dim=max_num_words, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(Conv1D(filters=num_filters, kernel_size=kernel_size, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(units=lstm_units))
model.add(Dropout(rate=dropout_rate))
model.add(Dense(1, activation='sigmoid'))

# Compile the model for binary classification
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train on the training split, reporting validation metrics on the dev split
history = model.fit(train_padded, train_labels, epochs=5, batch_size=32, validation_data=(dev_padded, dev_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [17]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
# Evaluate the model on the test split
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions
test_probabilities = model.predict(test_padded)
test_predictions = (test_probabilities > 0.5).astype("int32")

# Compute all four metrics against the true test labels
test_accuracy, test_precision, test_recall, test_f1 = (
    metric(test_labels, test_predictions)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f'Test Accuracy: {test_accuracy}')
print(f'Test Precision: {test_precision}')
print(f'Test Recall: {test_recall}')
print(f'Test F1-Score: {test_f1}')

Test Accuracy: 0.6523378582202112
Test Precision: 0.7253948967193196
Test Recall: 0.7175480769230769
Test F1-Score: 0.7214501510574017
