In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1" 

import numpy as np
import tensorflow as tf
import pandas as pd
import pyarabic.araby as araby
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
import torch
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, Dataset, concatenate_datasets
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 1000)
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer, BertModel
from transformers.modeling_outputs import SequenceClassifierOutput
import numpy as np




log_file = 'ArSAS_sentiment_2.txt'
with open(log_file, 'w') as f:
    f.write('Model,Accuracy,F1\n')


ds = load_dataset('arbml/ArSAS')
display(ds)

df = pd.DataFrame(ds['train'])



display(df.columns)
df.fillna('', inplace=True)

display(df[:4])

# return

df = df[df['Tweet_text'] != '']

classes = set(df['label'].values)
display(classes)

# df['group'] = df['group'].astype('category')
# df['label'] = df['group'].cat.codes



df = df[['Tweet_text', 'label']]


classes_num = len(classes)
display(classes_num)
display(len(df))


ds = Dataset.from_pandas(df)

ds = ds.train_test_split(test_size=0.2)
display(ds)

max_sequence_length = 128

class BertLSTMModel(nn.Module):
    def __init__(self, model_name, num_labels, hidden_dim=128, num_layers=2):
        super(BertLSTMModel, self).__init__()
        
        self.bert = AutoModel.from_pretrained(model_name)
        self.lstm = nn.LSTM(input_size=self.bert.config.hidden_size, 
                            hidden_size=hidden_dim, 
                            num_layers=num_layers, 
                            batch_first=True, 
                            bidirectional=True)
        self.classifier = nn.Linear(hidden_dim * 2, num_labels)
        self.num_labels = num_labels
    
    def forward(self, input_ids, attention_mask, labels=None):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = bert_outputs.last_hidden_state
        lstm_output, _ = self.lstm(sequence_output)
        lstm_output = lstm_output[:, -1, :]  
        logits = self.classifier(lstm_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
        )



class BertCNNModel(nn.Module):
    def __init__(self, model_name, num_labels, num_filters=100, filter_sizes=[3, 4, 5]):
        super(BertCNNModel, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (fs, self.bert.config.hidden_size)) for fs in filter_sizes
        ])
        self.classifier = nn.Linear(len(filter_sizes) * num_filters, num_labels)
        self.num_labels = num_labels

    
    def forward(self, input_ids, attention_mask, labels=None):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = bert_outputs.last_hidden_state.unsqueeze(1)  
        
        conv_outputs = [torch.relu(conv(sequence_output)).squeeze(3) for conv in self.convs]
        pooled_outputs = [torch.max(output, 2)[0] for output in conv_outputs]
        cat_output = torch.cat(pooled_outputs, 1)
        
        logits = self.classifier(cat_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
        )








for j in range(2):
    if j == 0:
        model_name = 'EgyBERT_lstm'    
    else:
        model_name = 'EgyBERT_cnn'
    
    for i in range(3):
        print(f'{model_name}, try:{i}')
              
        tokenizer = AutoTokenizer.from_pretrained('faisalq/EgyBERT')
        # model = AutoModelForSequenceClassification.from_pretrained(model_name,
        #                                                       num_labels=classes_num).to('cuda')                                                 

        if j == 0:
            model = BertLSTMModel(model_name='faisalq/EgyBERT', num_labels=classes_num).to('cuda')       
        else:
            model = BertCNNModel(model_name='faisalq/EgyBERT', num_labels=classes_num).to('cuda')
                                                        
        dataset_train = ds['train']
        dataset_validation = ds['test']                                                    
        
      

        def preprocess_function(examples):
            return tokenizer(examples['Tweet_text'], truncation=True, padding="max_length",
                            max_length=max_sequence_length, add_special_tokens=True)
        
        
        dataset_train = dataset_train.map(preprocess_function, batched=True)
        dataset_validation = dataset_validation.map(preprocess_function, batched=True)
        
       
        
        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            predictions = np.argmax(logits, axis=-1)    
            acc = accuracy_score(labels, predictions)        
            f1 = f1_score(labels, predictions, average='macro')   
            with open(log_file, 'a') as f:
                f.write(f'{model_name},{acc},{f1}\n')
            return {'accuracy': acc, 'f1_score': f1}

            
        epochs = 8
        save_steps = 10000 #save checkpoint every 10000 steps
        batch_size = 64
        
        training_args = TrainingArguments(
            output_dir = 'bert/',
            overwrite_output_dir=True,
            num_train_epochs = epochs,
            per_device_train_batch_size = batch_size,
            per_device_eval_batch_size = batch_size,
            save_steps = save_steps,
            save_total_limit = 1, #only save the last 5 checkpoints
            fp16=True,
            learning_rate = 5e-5,  # 5e-5 is the default
            logging_steps = 50, #50_000
            evaluation_strategy = 'steps',
            # evaluate_during_training = True,
            eval_steps = 50
            
        )
        
        trainer = Trainer(
            model = model,
            args = training_args,
            # data_collator=data_collator,
            train_dataset=dataset_train,
            eval_dataset=dataset_validation,
            compute_metrics = compute_metrics
        )
        
        
        # trainer.train(resume_from_checkpoint=True)
        trainer.train()


results = pd.read_csv(log_file)

best_results = results.groupby('Model', as_index=False)['F1'].max()

best_results = pd.merge(best_results, results, on=['Model', 'F1'])
best_results = best_results[['Model', 'Accuracy', 'F1']]
best_results = best_results.drop_duplicates()
best_results.to_csv('ArSAS_sentiment_results_2.csv')
display(best_results)



2024-09-02 00:22:01.887852: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-02 00:22:01.910865: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Found cached dataset parquet (/home/ffq/.cache/huggingface/datasets/arbml___parquet/arbml--ArSAS-1d9da4f767fa2dec/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['#Tweet_ID', 'Tweet_text', 'Topic', 'Sentiment_label_confidence', 'Speech_act_label', 'Speech_act_label_confidence', 'label'],
        num_rows: 19897
    })
})

Index(['#Tweet_ID', 'Tweet_text', 'Topic', 'Sentiment_label_confidence',
       'Speech_act_label', 'Speech_act_label_confidence', 'label'],
      dtype='object')

Unnamed: 0,#Tweet_ID,Tweet_text,Topic,Sentiment_label_confidence,Speech_act_label,Speech_act_label_confidence,label
0,929241870508724224,المباراة القـادمة #غانا x #مصر الجولة الأخيرة من المجموعة الـ 5 تصفيات كاس العالم 2018 روسـيا ترتيب مصر : المركز الاول 12 نقطة ( تم حسم التأهل للمونديال ) غــدا الساعة 5:30 ع قناة : بين ســبورت 1 تـــوقعاتكم لـ نتيجة الماتش .؟ 😀😁 https://t.co/RTQBNZXDqM,Event,0.38,Assertion,0.62,2
1,928942264583376897,هل هذه هي سياسة خارجيه لدوله تحترم نفسها والآخرين :#اليمن عدوان وحصار ل 3 سنوات #البحرين قمع حراك شعبها المسالم المطالب بالمساواة والعداله #سوريا #العراق دعموا الإرهاب وارسلوا المال والسلاح والانتحاريين #قطر حصار ومحاولة فرض الوصايه والآن #لبنان محاولة فرض وصايه علني!!,Entity,1.0,Expression,0.68,0
2,928615163250520065,وزير خارجية فرنسا عن منتدى شباب العالم: شعرت بارتياح وأنا أتابعه من باريس - https://t.co/hSvsbEaeUz #youm,Event,0.69,Assertion,1.0,2
3,931614713368186880,ومع السيسي و بشار و ايران و بن زايد و والا خليفه و روافض إلعراق و حفتر و علي صالح كل طواغيت العرب العلاقات عسل علي سمن,Event,1.0,Expression,1.0,0


{0, 1, 2, 3}

4

19897

DatasetDict({
    train: Dataset({
        features: ['Tweet_text', 'label'],
        num_rows: 15917
    })
    test: Dataset({
        features: ['Tweet_text', 'label'],
        num_rows: 3980
    })
})

EgyBERT_lstm, try:0


Map:   0%|          | 0/15917 [00:00<?, ? examples/s]

Map:   0%|          | 0/3980 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
50,1.3556,1.286725,0.623367,0.366521
100,1.173,0.999154,0.644975,0.378304
150,0.8802,0.813012,0.734673,0.536847
200,0.7878,0.743587,0.757789,0.566526
250,0.7417,0.691808,0.773618,0.586465
300,0.647,0.649251,0.78392,0.598763
350,0.5915,0.660033,0.755276,0.579311
400,0.5912,0.638747,0.778392,0.595812
450,0.591,0.625552,0.783166,0.60052
500,0.5614,0.615992,0.779146,0.596718


EgyBERT_lstm, try:1


Map:   0%|          | 0/15917 [00:00<?, ? examples/s]

Map:   0%|          | 0/3980 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
50,1.3498,1.276801,0.63191,0.371445
100,1.1536,0.957944,0.646734,0.380623
150,0.8655,0.819785,0.694975,0.471886
200,0.7924,0.749727,0.757035,0.564967
250,0.7416,0.70251,0.777387,0.588933
300,0.6386,0.649902,0.776633,0.593837
350,0.5666,0.631489,0.776131,0.595287
400,0.5775,0.632832,0.782161,0.598925
450,0.5696,0.596947,0.787437,0.603839
500,0.5395,0.595701,0.78392,0.599529


EgyBERT_lstm, try:2


Map:   0%|          | 0/15917 [00:00<?, ? examples/s]

Map:   0%|          | 0/3980 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
50,1.3498,1.276801,0.63191,0.371445
100,1.1536,0.957944,0.646734,0.380623
150,0.8655,0.819785,0.694975,0.471886
200,0.7924,0.749727,0.757035,0.564967
250,0.7416,0.70251,0.777387,0.588933
300,0.6386,0.649902,0.776633,0.593837
350,0.5666,0.631489,0.776131,0.595287
400,0.5775,0.632832,0.782161,0.598925
450,0.5696,0.596947,0.787437,0.603839
500,0.5395,0.595701,0.78392,0.599529


EgyBERT_cnn, try:0


Map:   0%|          | 0/15917 [00:00<?, ? examples/s]

Map:   0%|          | 0/3980 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
50,1.2604,1.075714,0.644472,0.375219
100,0.9625,0.819736,0.768342,0.580458
150,0.7611,0.723329,0.775628,0.590749
200,0.7061,0.668026,0.78593,0.601538
250,0.6747,0.643743,0.782915,0.600416
300,0.5985,0.617117,0.790452,0.60496
350,0.5489,0.61879,0.781658,0.600566
400,0.5488,0.600075,0.788442,0.605152
450,0.5462,0.581138,0.788442,0.603996
500,0.5288,0.603183,0.783668,0.601614


EgyBERT_cnn, try:1


Map:   0%|          | 0/15917 [00:00<?, ? examples/s]

Map:   0%|          | 0/3980 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
50,1.2639,1.079039,0.649749,0.378623
100,0.9494,0.797063,0.770352,0.58372
150,0.7493,0.704627,0.770603,0.583619
200,0.6817,0.643641,0.789698,0.6046
250,0.644,0.619941,0.78593,0.600632
300,0.5541,0.602718,0.787437,0.601476
350,0.521,0.602328,0.781407,0.599558
400,0.5165,0.597811,0.790955,0.606515
450,0.5275,0.579825,0.790955,0.608263
500,0.5129,0.594782,0.78392,0.601597


EgyBERT_cnn, try:2


Map:   0%|          | 0/15917 [00:00<?, ? examples/s]

Map:   0%|          | 0/3980 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
50,1.2679,1.085988,0.648492,0.377792
100,0.9577,0.806327,0.771608,0.584213
150,0.7481,0.696887,0.782161,0.597967
200,0.6782,0.660554,0.775628,0.594317
250,0.6444,0.62225,0.788945,0.603785
300,0.5558,0.595183,0.788693,0.60309
350,0.5137,0.602112,0.784925,0.602555
400,0.5201,0.605673,0.788442,0.605359
450,0.5253,0.576481,0.792211,0.608118
500,0.5095,0.58904,0.785427,0.602692


Unnamed: 0,Model,Accuracy,F1
0,EgyBERT_cnn,0.773869,0.680343
1,EgyBERT_lstm,0.770854,0.671665
