In [1]:
import pandas as pd
import json
from sklearn.preprocessing import MultiLabelBinarizer

# Function to create dataframe from jsonl file
def create_dataframe_from_jsonl(file_path):
    """Load a JSON Lines file into a DataFrame with ``paragraph`` and ``labels`` columns.

    Parameters
    ----------
    file_path : str or path-like
        Path to a UTF-8 encoded file holding one JSON object per line. Every
        object must provide the keys ``'paragraph'`` and ``'labels'``; any
        other keys are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per record, with exactly the columns ``paragraph`` and
        ``labels`` (also when the file holds no records).

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a record lacks ``'paragraph'`` or ``'labels'``.
    """
    columns = ['paragraph', 'labels']
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (e.g. an empty line at the end of the file) are not
            # records; json.loads would raise on them.
            if not line.strip():
                continue
            record = json.loads(line)
            data.append({key: record[key] for key in columns})
    # Passing the columns explicitly keeps the schema stable for an empty file.
    return pd.DataFrame(data, columns=columns)

# File locations of the three dataset splits.
train_path, dev_path, test_path = (
    'ArMPro_multilabel_train.jsonl',
    'ArMPro_multilabel_dev.jsonl',
    'ArMPro_multilabel_test.jsonl',
)

# Load every split into its own DataFrame (columns: paragraph, labels).
train, dev, test = map(create_dataframe_from_jsonl, (train_path, dev_path, test_path))

In [2]:
# Read the label inventory: one label name per line.
# The encoding is stated explicitly (as for the other files opened in this
# notebook) instead of relying on the platform default.
with open('persuasion_techniques_list.txt', 'r', encoding='utf-8') as file:
    # Skip blank lines so that an empty line in the file does not become a
    # bogus '' class (len(all_labels) also sizes the model's output layer).
    all_labels = [line.strip() for line in file if line.strip()]

# Encode each sample's label list as a fixed-length 0/1 vector whose positions
# follow the order of all_labels.
mlb = MultiLabelBinarizer(classes=all_labels)
train['binary_labels'] = list(mlb.fit_transform(train['labels']))
dev['binary_labels'] = list(mlb.transform(dev['labels']))
test['binary_labels'] = list(mlb.transform(test['labels']))

train.head()

Unnamed: 0,paragraph,labels,binary_labels
0,ندوة «من سلب اسكندرون إلى سلب فلسطين» على مدرج...,"[Flag_Waving, Loaded_Language, Exaggeration-Mi...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, ..."
1,بعد استقلال الجزائر، تزوجت جميلة بوحيرد بالمحا...,[Name_Calling-Labeling],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,وفي الوقت الذي لم تفصح فيه الخاطر عن فحوى ما ح...,[Loaded_Language],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
3,"وبدورها، كتبت نور الهجري، على فيسبوك "" احتفال ...","[Loaded_Language, Causal_Oversimplification]","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, ..."
4,وظل بوتفليقة في سدة الرئاسة الجزائرية نحو 20 ع...,[Doubt],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."


In [3]:
# Alharbi, Alaa, and Mark Lee. "Kawarith: an Arabic Twitter Corpus for Crisis Events."
# Proceedings of the Sixth Arabic Natural Language Processing Workshop. 2021

!wget https://raw.githubusercontent.com/alaa-a-a/multi-dialect-arabic-stop-words/main/Stop-words/stop_list_1177.txt
arabic_stop_words = []
with open ('./stop_list_1177.txt',encoding='utf-8') as f :
    for word in f.readlines() :
        arabic_stop_words.append(word.split("\n")[0])

--2024-07-27 13:40:46--  https://raw.githubusercontent.com/alaa-a-a/multi-dialect-arabic-stop-words/main/Stop-words/stop_list_1177.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11468 (11K) [text/plain]
Saving to: ‘stop_list_1177.txt.1’


2024-07-27 13:40:46 (70.6 MB/s) - ‘stop_list_1177.txt.1’ saved [11468/11468]



In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
import re
import string
#nltk.download('stopwords')
#!pip install datasets
from datasets import Dataset



def normalize_arabic(text):
    """Collapse common Arabic spelling variants onto a single letter each.

    The substitutions are applied in the order listed below: alef variants to
    bare alef, alef maqsura to ya, hamza on waw / on ya to standalone hamza,
    ta marbuta to ha, and gaf to kaf. Every other character is left untouched.

    Returns the normalized string.
    """
    substitutions = (
        ("[إأآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "ء"),
        ("ئ", "ء"),
        ("ة", "ه"),
        ("گ", "ك"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def remove_diacritics(text):
    """Strip the listed Arabic diacritic marks and the tatweel character from ``text``.

    The pattern is compiled in verbose mode, so the whitespace and the ``#``
    comments inside it are not part of the match; only the marks themselves
    are alternatives.
    """
    diacritics_pattern = """
                             ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """
    diacritics_re = re.compile(diacritics_pattern, re.VERBOSE)
    return diacritics_re.sub('', text)


def remove_punctuations(text):
    """Delete every listed Arabic/typographic mark and ASCII punctuation character.

    The removal set is the hand-picked string below plus ``string.punctuation``;
    all other characters (including whitespace) are kept as they are.
    """
    arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = dict.fromkeys(map(ord, arabic_punctuations + string.punctuation))
    return text.translate(deletion_table)


def remove_stop_words(text):
    """Lower-case ``text``, tokenize it and drop every token found in ``arabic_stop_words``.

    Tokenization uses NLTK's ``wordpunct_tokenize``; the surviving tokens are
    re-joined with single spaces.

    Returns
    -------
    str
        The filtered text, without leading or trailing whitespace.
    """
    # Hash the stop words once per call: each membership test is then a set
    # lookup instead of a linear scan of the module-level list for every token.
    stop_words = set(arabic_stop_words)
    tokens = nltk.tokenize.wordpunct_tokenize(text.lower())
    kept = [token for token in tokens if token not in stop_words]
    return " ".join(kept).strip()

def remove_non_arabic_letters(text):
    """Blank out URLs, Latin/digit/``@``/``_`` runs and ``#``, and drop tatweel.

    Each URL (``http`` followed by non-whitespace), each run of
    ``[@A-Za-z0-9_]`` and each ``#`` is replaced by a single space. Tatweel
    (kashida, U+0640) characters are removed without a replacement.

    Returns the resulting string; whitespace is not collapsed or trimmed.
    """
    # URLs must be tried first: with the character-class branch in front, the
    # leading "http" was consumed by it, so the URL branch could never match
    # and the rest of the link ("://", ".", "/", ...) stayed in the text.
    text = re.sub(r'http\S+|[@A-Za-z0-9_]+|#', ' ', text)
    # Remove tatweel runs of any length rather than one fixed-length run.
    text = re.sub(r'\u0640+', '', text)
    return text

def clean_str(text):
    """Run the full cleaning pipeline on one text and return the cleaned string.

    The steps run in the order listed; each one receives the output of the
    previous step.
    """
    pipeline = (
        normalize_arabic,
        remove_diacritics,
        remove_punctuations,
        remove_stop_words,
        remove_non_arabic_letters,
    )
    for step in pipeline:
        text = step(text)
    return text

In [5]:
# Preparing the data for training: add a cleaned copy of every paragraph
# to each split.
for split_df in (train, dev, test):
    split_df['cleaned_paragraph'] = split_df['paragraph'].apply(clean_str)

dev.head()

Unnamed: 0,paragraph,labels,binary_labels,cleaned_paragraph
0,وقالت الحكومة السودانية الثلاثاء إنها رفضت مقت...,[no_technique],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",الحكومه السودانيه الثلاثاء رفضت مقترحا اثيوبيا...
1,وأكد الأستاذ نصر باغريب، أن إنشاء مركز التدريب...,"[Name_Calling-Labeling, Loaded_Language, Exagg...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, ...",الاستاذ نصر باغريب انشاء مركز التدريب والتاهيل...
2,وتلعب العوامل الإقليمية والعرقية والاجتماعية و...,[no_technique],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",وتلعب العوامل الاقليميه والعرقيه والاجتماعيه و...
3,وسيتابع موقع الصحراوي هذا الموضوع عن كثب خلال ...,[Obfuscation-Vagueness-Confusion],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",وسيتابع موقع الصحراوي الموضوع كثب الايام والاش...
4,المحاور: الولايات المتحدة الأميركية حذرت رعايا...,[no_technique],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",المحاور الولايات المتحده الاميركيه حذرت رعاياه...


In [6]:
# Vocabulary cap passed to the tokenizer: the maximum number of words to keep,
# ranked by word frequency.
max_num_words = 10_000

In [7]:
from datasets import Dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
# Fit the tokenizer on the training split only; dev and test are encoded
# with the vocabulary learned from it.
tokenizer = Tokenizer(num_words=max_num_words)
tokenizer.fit_on_texts(train['cleaned_paragraph'])

# Encode the cleaned paragraphs of every split as sequences.
train_sequences, dev_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split_df['cleaned_paragraph'])
    for split_df in (train, dev, test)
)


In [8]:
# Pad every split (padding='post') to the length of the longest training sequence.
max_len = len(max(train_sequences, key=len))
train_padded, dev_padded, test_padded = (
    pad_sequences(sequences, maxlen=max_len, padding='post')
    for sequences in (train_sequences, dev_sequences, test_sequences)
)

In [9]:
# Gather the per-row binary label vectors of each split into one numpy array.
train_labels, dev_labels, test_labels = (
    np.array(list(split_df['binary_labels']))
    for split_df in (train, dev, test)
)

In [10]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dropout, Dense

# Hyper-parameters of the network.
embedding_dim = 256
num_filters = 256
kernel_size = 3
lstm_units = 128
dropout_rate = 0.4

# Embedding -> 2 x (Conv1D + max-pooling) -> 2 stacked LSTMs -> dropout ->
# one sigmoid output per entry of all_labels.
model = Sequential([
    Embedding(input_dim=max_num_words, output_dim=embedding_dim, input_length=max_len),
    Conv1D(filters=num_filters, kernel_size=kernel_size, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=num_filters, kernel_size=kernel_size, activation='relu'),
    MaxPooling1D(pool_size=2),
    # The first LSTM returns the whole sequence so the second one can consume it.
    LSTM(units=lstm_units, return_sequences=True),
    LSTM(units=lstm_units),
    Dropout(rate=dropout_rate),
    Dense(len(all_labels), activation='sigmoid'),
])

In [11]:
# Configure training: Adam optimizer, binary cross-entropy loss, and
# accuracy as the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [12]:
# Fit on the training split for 3 epochs, using the dev split as validation data.
history = model.fit(
    train_padded,
    train_labels,
    validation_data=(dev_padded, dev_labels),
    batch_size=64,
    epochs=3,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [13]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
# Evaluate on the test split.
# A label counts as predicted when the model's output for it exceeds 0.5.
test_probabilities = model.predict(test_padded)
test_predictions = (test_probabilities > 0.5).astype("int32")

# Precision, recall and F1 are micro-averaged.
# NOTE(review): scikit-learn documents accuracy_score on multilabel input as
# subset (exact-match) accuracy, which would explain why it is much lower than
# the micro-averaged scores - confirm this is the intended metric.
micro = {'average': 'micro'}
test_accuracy = accuracy_score(test_labels, test_predictions)
test_precision = precision_score(test_labels, test_predictions, **micro)
test_recall = recall_score(test_labels, test_predictions, **micro)
test_f1 = f1_score(test_labels, test_predictions, **micro)

print(f'Test Accuracy: {test_accuracy}')
print(f'Test Precision: {test_precision}')
print(f'Test Recall: {test_recall}')
print(f'Test F1-Score: {test_f1}')

Test Accuracy: 0.17345399698340874
Test Precision: 0.5392156862745098
Test Recall: 0.33599624060150374
Test F1-Score: 0.4140127388535032
