In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0" 

import numpy as np
import tensorflow as tf
import pandas as pd
import pyarabic.araby as araby
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
import torch
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, Dataset, concatenate_datasets
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 1000)
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer, BertModel
from transformers.modeling_outputs import SequenceClassifierOutput
import numpy as np


log_file = 'Abusive_mubarak17_3.txt'
with open(log_file, 'w') as f:
    f.write('Model,Accuracy,F1\n')


df = pd.read_csv('datasets/Abusive_mubarak17.csv', encoding='utf-8', engine='python') #, sep='\t' , quotechar="'"  , quoting=3
display(df.columns)
df.fillna('', inplace=True)

display(df[:4])



df = df[df['text'] != '']

classes = set(df['aggregatedAnnotation'].values)
display(classes)

df['aggregatedAnnotation'] = df['aggregatedAnnotation'].astype('category')
df['label'] = df['aggregatedAnnotation'].cat.codes



df = df[['text', 'label']]


classes_num = len(classes)
display(classes_num)
display(len(df))


ds = Dataset.from_pandas(df)

ds = ds.train_test_split(test_size=0.2)
display(ds)

max_sequence_length = 128



class BertLSTMModel(nn.Module):
    def __init__(self, model_name, num_labels, hidden_dim=128, num_layers=2):
        super(BertLSTMModel, self).__init__()
        
        self.bert = AutoModel.from_pretrained(model_name)
        self.lstm = nn.LSTM(input_size=self.bert.config.hidden_size, 
                            hidden_size=hidden_dim, 
                            num_layers=num_layers, 
                            batch_first=True, 
                            bidirectional=True)
        self.classifier = nn.Linear(hidden_dim * 2, num_labels)
        self.num_labels = num_labels
    
    def forward(self, input_ids, attention_mask, labels=None):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = bert_outputs.last_hidden_state
        lstm_output, _ = self.lstm(sequence_output)
        lstm_output = lstm_output[:, -1, :]  
        logits = self.classifier(lstm_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
        )



class BertCNNModel(nn.Module):
    def __init__(self, model_name, num_labels, num_filters=100, filter_sizes=[3, 4, 5]):
        super(BertCNNModel, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (fs, self.bert.config.hidden_size)) for fs in filter_sizes
        ])
        self.classifier = nn.Linear(len(filter_sizes) * num_filters, num_labels)
        self.num_labels = num_labels

    
    def forward(self, input_ids, attention_mask, labels=None):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = bert_outputs.last_hidden_state.unsqueeze(1)  
        
        conv_outputs = [torch.relu(conv(sequence_output)).squeeze(3) for conv in self.convs]
        pooled_outputs = [torch.max(output, 2)[0] for output in conv_outputs]
        cat_output = torch.cat(pooled_outputs, 1)
        
        logits = self.classifier(cat_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
        )








for j in range(2):
    if j == 0:
        model_name = 'EgyBERT_lstm'    
    else:
        model_name = 'EgyBERT_cnn'
    
    for i in range(3):
        print(f'{model_name}, try:{i}')
              
        tokenizer = AutoTokenizer.from_pretrained('faisalq/EgyBERT')
        # model = AutoModelForSequenceClassification.from_pretrained(model_name,
        #                                                       num_labels=classes_num).to('cuda')                                                 

        if j == 0:
            model = BertLSTMModel(model_name='faisalq/EgyBERT', num_labels=classes_num).to('cuda')       
        else:
            model = BertCNNModel(model_name='faisalq/EgyBERT', num_labels=classes_num).to('cuda')
       
        
        dataset_train = ds['train']
        dataset_validation = ds['test']                                                    
        
      

        def preprocess_function(examples):
            return tokenizer(examples['text'], truncation=True, padding="max_length",
                            max_length=max_sequence_length, add_special_tokens=True)
        
        
        dataset_train = dataset_train.map(preprocess_function, batched=True)
        dataset_validation = dataset_validation.map(preprocess_function, batched=True)
        
       
        
        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            predictions = np.argmax(logits, axis=-1)    
            acc = accuracy_score(labels, predictions)        
            f1 = f1_score(labels, predictions, average='macro')   
            with open(log_file, 'a') as f:
                f.write(f'{model_name},{acc},{f1}\n')
            return {'accuracy': acc, 'f1_score': f1}

            
        epochs = 25
        save_steps = 10000 #save checkpoint every 10000 steps
        batch_size = 64
        
        training_args = TrainingArguments(
            output_dir = 'bert/',
            overwrite_output_dir=True,
            num_train_epochs = epochs,
            per_device_train_batch_size = batch_size,
            per_device_eval_batch_size = batch_size,
            save_steps = save_steps,
            save_total_limit = 1, #only save the last 5 checkpoints
            fp16=True,
            learning_rate = 5e-5,  # 5e-5 is the default
            logging_steps = 10, #50_000
            evaluation_strategy = 'steps',
            # evaluate_during_training = True,
            eval_steps = 10
            
        )
        
        trainer = Trainer(
            model = model,
            args = training_args,
            # data_collator=data_collator,
            train_dataset=dataset_train,
            eval_dataset=dataset_validation,
            compute_metrics = compute_metrics
        )
        
        
        # trainer.train(resume_from_checkpoint=True)
        trainer.train()


results = pd.read_csv(log_file)

best_results = results.groupby('Model', as_index=False)['F1'].max()

best_results = pd.merge(best_results, results, on=['Model', 'F1'])
best_results = best_results[['Model', 'Accuracy', 'F1']]
best_results = best_results.drop_duplicates()
best_results.to_csv('Abusive_mubarak17_results_3.csv')
display(best_results)



2024-09-01 23:35:56.203237: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-01 23:35:56.227631: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Index(['#', 'type', 'text', 'aggregatedAnnotation',
       'aggregatedAnnotationConfidence', 'annotator1', 'annotator2',
       'annotator3'],
      dtype='object')

Unnamed: 0,#,type,text,aggregatedAnnotation,aggregatedAnnotationConfidence,annotator1,annotator2,annotator3
0,1,TWEET,مبروك و سامحونا لعجزنا التام. عقبال اللي جوه. اللي بره يا عاجز يا بيزايد على العاجز,0,0.6667,-1,0,0
1,2,C1,كلنا بره ومش هنبطل نزايد على العجايز الي جابونا ورى,-1,0.6667,-1,-1,0
2,3,C2,بدل ما انت قاعد بره كده تعالي ازرع الصحرا,0,1.0,0,0,0
3,4,C3,قذر اتفووو ماتيجى مصر وتورينا نفسك كدا ياجبان,-1,1.0,-1,-1,-1


{-2, -1, 0}

3

1100

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 880
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 220
    })
})

EgyBERT_lstm, try:0


Map:   0%|          | 0/880 [00:00<?, ? examples/s]

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
10,1.1003,1.095379,0.431818,0.201058
20,1.0913,1.080478,0.431818,0.201058
30,1.0756,1.057843,0.690909,0.499244
40,1.0497,1.027515,0.640909,0.454337
50,1.0164,0.990394,0.663636,0.473188
60,0.9698,0.945122,0.745455,0.604396
70,0.9007,0.895343,0.777273,0.776937
80,0.8247,0.822681,0.804545,0.791178
90,0.7314,0.786571,0.763636,0.763257
100,0.6484,0.714801,0.818182,0.807303


EgyBERT_lstm, try:1


Map:   0%|          | 0/880 [00:00<?, ? examples/s]

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
10,1.1006,1.09478,0.431818,0.201058
20,1.0904,1.078343,0.431818,0.201058
30,1.0751,1.057231,0.45,0.230855
40,1.0537,1.03105,0.704545,0.506742
50,1.0243,1.000635,0.613636,0.428799
60,0.9817,0.954763,0.663636,0.471858
70,0.9166,0.907284,0.777273,0.776665
80,0.8572,0.846451,0.768182,0.762754
90,0.7868,0.820812,0.754545,0.748668
100,0.7173,0.755216,0.790909,0.787649


EgyBERT_lstm, try:2


Map:   0%|          | 0/880 [00:00<?, ? examples/s]

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
10,1.1006,1.09478,0.431818,0.201058
20,1.0904,1.078343,0.431818,0.201058
30,1.0751,1.057231,0.45,0.230855
40,1.0537,1.03105,0.704545,0.506742
50,1.0243,1.000635,0.613636,0.428799
60,0.9817,0.954763,0.663636,0.471858
70,0.9166,0.907284,0.777273,0.776665
80,0.8572,0.846451,0.768182,0.762754
90,0.7868,0.820812,0.754545,0.748668
100,0.7173,0.755216,0.790909,0.787649


EgyBERT_cnn, try:0


Map:   0%|          | 0/880 [00:00<?, ? examples/s]

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
10,1.0929,1.079772,0.431818,0.201058
20,1.074,1.044154,0.563636,0.380657
30,1.0358,0.990161,0.686364,0.495218
40,0.9695,0.917014,0.627273,0.438233
50,0.886,0.83629,0.759091,0.759932
60,0.7843,0.760676,0.818182,0.818955
70,0.6698,0.716357,0.763636,0.760617
80,0.5834,0.650599,0.795455,0.794023
90,0.4874,0.640271,0.777273,0.775375
100,0.4125,0.579996,0.804545,0.79938


EgyBERT_cnn, try:1


Map:   0%|          | 0/880 [00:00<?, ? examples/s]

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
10,1.0929,1.079705,0.431818,0.201058
20,1.0738,1.044238,0.554545,0.370876
30,1.0353,0.989695,0.686364,0.495604
40,0.9685,0.916047,0.627273,0.439329
50,0.8877,0.840343,0.772727,0.774899
60,0.7876,0.76497,0.790909,0.793059
70,0.6735,0.723448,0.759091,0.755159
80,0.5856,0.649432,0.804545,0.801164
90,0.4932,0.631214,0.777273,0.775375
100,0.4154,0.582118,0.8,0.800466


EgyBERT_cnn, try:2


Map:   0%|          | 0/880 [00:00<?, ? examples/s]

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
10,1.0929,1.079728,0.431818,0.201058
20,1.0739,1.04422,0.559091,0.375797
30,1.0357,0.990135,0.690909,0.498976
40,0.9693,0.916386,0.622727,0.436538
50,0.8862,0.836202,0.772727,0.779795
60,0.7837,0.760815,0.809091,0.807811
70,0.6776,0.716006,0.768182,0.768849
80,0.5902,0.656706,0.809091,0.807263
90,0.4972,0.637943,0.772727,0.769666
100,0.4236,0.583689,0.804545,0.804018


Unnamed: 0,Model,Accuracy,F1
0,EgyBERT_cnn,0.818182,0.818955
1,EgyBERT_lstm,0.818182,0.817922
