In [1]:
import pandas as pd
import json
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import nltk
import re
import string

# Function to create dataframe from jsonl file
def create_dataframe_from_jsonl(file_path):
    """Load a JSON-Lines file into a two-column DataFrame.

    Each non-blank line of ``file_path`` is parsed as a JSON object and only
    its ``'paragraph'`` and ``'labels'`` entries are kept.

    Args:
        file_path: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A ``pandas.DataFrame`` with the columns ``paragraph`` and ``labels``
        (in that order), one row per JSON record. The columns exist even when
        the file contains no records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``'paragraph'`` or ``'labels'``.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # are not records; json.loads('') would raise on them.
            if not line.strip():
                continue
            record = json.loads(line)
            data.append({'paragraph': record['paragraph'], 'labels': record['labels']})
    # Explicit columns keep the schema stable for an empty input file.
    return pd.DataFrame(data, columns=['paragraph', 'labels'])

train_path = 'ArMPro_multilabel_train.jsonl'
dev_path = 'ArMPro_multilabel_dev.jsonl'
test_path = 'ArMPro_multilabel_test.jsonl'

train = create_dataframe_from_jsonl(train_path)
dev = create_dataframe_from_jsonl(dev_path)
test = create_dataframe_from_jsonl(test_path)

# Reading labels: one label name per line.
# The encoding is given explicitly (like every other file read in this
# notebook) so the result does not depend on the platform default, and blank
# lines are skipped so they cannot turn into an empty-string class.
with open('persuasion_techniques_list.txt', 'r', encoding='utf-8') as file:
    all_labels = [line.strip() for line in file if line.strip()]

# Convert labels to binary indicator vectors; the column order follows
# `all_labels` because it is passed as `classes`.
mlb = MultiLabelBinarizer(classes=all_labels)
train['binary_labels'] = list(mlb.fit_transform(train['labels']))
dev['binary_labels'] = list(mlb.transform(dev['labels']))
test['binary_labels'] = list(mlb.transform(test['labels']))

In [2]:
# Alharbi, Alaa, and Mark Lee. "Kawarith: an Arabic Twitter Corpus for Crisis Events."
# Proceedings of the Sixth Arabic Natural Language Processing Workshop. 2021

#!wget https://raw.githubusercontent.com/alaa-a-a/multi-dialect-arabic-stop-words/main/Stop-words/stop_list_1177.txt
# Stop-word list: one word per line, with the trailing newline cut off.
arabic_stop_words = []
with open ('./stop_list_1177.txt',encoding='utf-8') as f :
    arabic_stop_words.extend(entry.split("\n")[0] for entry in f.readlines())

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
import re
import string
nltk.download('stopwords')
#!pip install datasets
from datasets import Dataset



def normalize_arabic(text):
    """Map several Arabic letter variants onto one canonical letter each.

    The substitutions are applied one after another in the order listed in
    ``substitutions``; every other character is left untouched.
    """
    substitutions = (
        ("[إأآا]", "ا"),  # alef variants -> bare alef
        ("ى", "ي"),       # alef maqsura -> ya
        ("ؤ", "ء"),       # hamza on waw -> hamza
        ("ئ", "ء"),       # hamza on ya -> hamza
        ("ة", "ه"),       # ta marbuta -> ha
        ("گ", "ك"),       # gaf -> kaf
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def remove_diacritics(text):
    """Return ``text`` with the listed Arabic diacritics and the tatweel removed.

    The pattern is written in verbose mode, so the whitespace and the ``#``
    comments inside the literal are not part of what is matched.
    """
    diacritics_pattern = re.compile("""
                             ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    return diacritics_pattern.sub('', text)


def remove_punctuations(text):
    """Strip every character listed as Arabic or ASCII punctuation from ``text``."""
    arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
    # A character is dropped when it appears in either punctuation set.
    unwanted = set(arabic_punctuations) | set(string.punctuation)
    return "".join(ch for ch in text if ch not in unwanted)


def remove_stop_words(text):
    """Lower-case and tokenize ``text``, drop stop words, and re-join with spaces.

    Tokens are produced by ``nltk.tokenize.wordpunct_tokenize``; a token is
    discarded when it is found in the module-level ``arabic_stop_words``.
    """
    tokens = nltk.tokenize.wordpunct_tokenize(text.lower())
    kept_tokens = []
    for token in tokens:
        if token not in arabic_stop_words:
            kept_tokens.append(token)
    return " ".join(kept_tokens).strip()

def remove_non_arabic_letters(text):
    """Blank out URLs, mentions, Latin letters, digits, underscores and ``#``.

    Every match of the first pattern is replaced by a single space, so the
    result may contain runs of spaces; it is not re-stripped here.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each URL (``http`` followed by non-whitespace), each run
        of ``@``/ASCII letters/digits/underscores and each ``#`` replaced by a
        space, and each run of thirteen tatweel characters deleted.
    """
    # The URL alternative must come first: with the character class in front,
    # "http" is always consumed as plain letters and `http\S+` can never match,
    # which leaves residue such as "://." from the URL behind.
    text = re.sub(r'http\S+|[@A-Za-z0-9_]+|#', ' ', text)
    text = re.sub(r'ـــــــــــــ', '', text) # removes a long tatweel (kashida) run
    return text

def clean_str(text):
    """Run ``text`` through the full cleaning pipeline and return the result.

    The steps run in this fixed order: letter normalization, diacritic
    removal, punctuation removal, stop-word removal, and finally removal of
    non-Arabic characters.
    """
    pipeline = (
        normalize_arabic,
        remove_diacritics,
        remove_punctuations,
        remove_stop_words,
        remove_non_arabic_letters,
    )
    for step in pipeline:
        text = step(text)
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alyfa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Preparing the data for training
# Clean the raw paragraph text of every split (adds a column to each frame)
for split_frame in (train, dev, test):
    split_frame['cleaned_paragraph'] = split_frame['paragraph'].apply(clean_str)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")

def tokenize_function(examples):
    """Tokenize the ``cleaned_paragraph`` field with max-length padding and truncation."""
    return tokenizer(examples['cleaned_paragraph'], padding="max_length", truncation=True)

def _build_torch_dataset(frame):
    """Turn one split's DataFrame into a tokenized, torch-formatted Hugging Face dataset."""
    # Convert to a Hugging Face dataset, keeping only the text and the targets
    dataset = Dataset.from_pandas(frame[['cleaned_paragraph', 'binary_labels']])
    # Apply tokenization
    dataset = dataset.map(tokenize_function, batched=True)
    # Rename the target column to match the expected input format
    dataset = dataset.rename_column('binary_labels', 'labels')
    # Set format for PyTorch for multilabel classification
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    return dataset

train_dataset = _build_torch_dataset(train)
dev_dataset = _build_torch_dataset(dev)
test_dataset = _build_torch_dataset(test)

Map:   0%|          | 0/6002 [00:00<?, ? examples/s]

Map:   0%|          | 0/672 [00:00<?, ? examples/s]

Map:   0%|          | 0/1326 [00:00<?, ? examples/s]

In [1]:
# Model and training arguments
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# One output per entry of `all_labels`.
model = AutoModelForSequenceClassification.from_pretrained(
    "aubmindlab/bert-base-arabertv2",
    num_labels=len(all_labels),
)
model = model.to(device)

training_options = {
    'output_dir': './multilabel_model',
    'evaluation_strategy': "epoch",
    'learning_rate': 2e-5,
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 8,
    'num_train_epochs': 3,
    'weight_decay': 0.01,
}
training_args = TrainingArguments(**training_options)


def compute_metrics(p, threshold=0.5):
    """Compute multilabel accuracy and micro/macro precision, recall and F1.

    ``p.predictions`` holds the model's raw logits: the training loss in this
    notebook is ``BCEWithLogitsLoss``, so no sigmoid has been applied to them.
    ``threshold`` is a probability cutoff, so it is converted to the
    equivalent logit cutoff before comparing. Comparing the logits directly
    against 0.5 would silently use a probability cutoff of about 0.62.

    Args:
        p: Object exposing ``predictions`` (logits, or a tuple whose first
            element is the logits) and ``label_ids`` (binary indicator array).
        threshold: Probability at or above which a label counts as predicted.
            Must lie strictly between 0 and 1.

    Returns:
        Dict with the keys ``accuracy``, ``precision_micro``, ``recall_micro``,
        ``f1_micro``, ``precision_macro``, ``recall_macro`` and ``f1_macro``.

    Raises:
        ValueError: If ``threshold`` is not strictly between 0 and 1.
    """
    if not 0.0 < threshold < 1.0:
        raise ValueError("threshold must be strictly between 0 and 1")

    logits = p.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.asarray(logits)

    # sigmoid(x) >= t  <=>  x >= log(t / (1 - t)); working in logit space
    # avoids evaluating exp() on large-magnitude logits.
    logit_threshold = np.log(threshold / (1.0 - threshold))
    preds = (logits >= logit_threshold).astype(int)
    labels = p.label_ids

    # Calculate precision, recall, f1 for both micro and macro
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(labels, preds, average='micro', zero_division=0)
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(labels, preds, average='macro', zero_division=0)

    # Calculate accuracy (an example counts only if all its labels match)
    accuracy = accuracy_score(labels, preds)

    return {
        "accuracy": accuracy,
        "precision_micro": precision_micro,
        "recall_micro": recall_micro,
        "f1_micro": f1_micro,
        "precision_macro": precision_macro,
        "recall_macro": recall_macro,
        "f1_macro": f1_macro
    }

   

class CustomTrainer(Trainer):
    """Trainer whose loss is binary cross-entropy on the logits (multilabel)."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute ``BCEWithLogitsLoss`` between the model logits and the labels.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch dict; its ``"labels"`` entry is popped and the
                remaining entries are passed to ``model`` as keyword arguments.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted for compatibility with transformers
                releases that pass this keyword to ``compute_loss``; unused,
                because the loss here uses the default mean reduction.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels").to(device)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.BCEWithLogitsLoss()
        # BCEWithLogitsLoss needs float targets; the labels are cast here.
        loss = loss_fct(logits, labels.float())
        return (loss, outputs) if return_outputs else loss

# Wire the model, arguments, data splits and metric function together.
trainer_options = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': dev_dataset,
    'compute_metrics': compute_metrics,
}
trainer = CustomTrainer(**trainer_options)

# Fine-tune (the dev split is the evaluation set during training), then
# report the metrics on the held-out test split as the cell's output.
trainer.train()
trainer.evaluate(test_dataset)

NameError: name 'torch' is not defined