In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0" 

import numpy as np
import tensorflow as tf
import pandas as pd
import pyarabic.araby as araby
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
import torch
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, Dataset, concatenate_datasets
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 1000)
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer, BertModel
from transformers.modeling_outputs import SequenceClassifierOutput
import numpy as np


log_file = 'ArEgyCorpus2_2.txt'
with open(log_file, 'w') as f:
    f.write('Model,Accuracy,F1\n')


df = pd.read_csv('datasets/Arabic-Egyptian-Corpus-2.csv', encoding='utf-8', engine='python') #, sep='\t' , quotechar="'"  , quoting=3
display(df.columns)
df.fillna('', inplace=True)

display(df[:4])



df = df[df['review'] != '']


df.loc[df['label'].str.contains('negative', na=False), 'label'] = 'negative'
df.loc[df['label'].str.contains('positive', na=False), 'label'] = 'positive'

classes = set(df['label'].values)
display(classes)

df['label'] = df['label'].astype('category')
df['label'] = df['label'].cat.codes



df = df[['review', 'label']]


classes_num = len(classes)
display(classes_num)
display(len(df))


ds = Dataset.from_pandas(df)

ds = ds.train_test_split(test_size=0.2)
display(ds)

max_sequence_length = 128

class BertLSTMModel(nn.Module):
    def __init__(self, model_name, num_labels, hidden_dim=128, num_layers=2):
        super(BertLSTMModel, self).__init__()
        
        self.bert = AutoModel.from_pretrained(model_name)
        self.lstm = nn.LSTM(input_size=self.bert.config.hidden_size, 
                            hidden_size=hidden_dim, 
                            num_layers=num_layers, 
                            batch_first=True, 
                            bidirectional=True)
        self.classifier = nn.Linear(hidden_dim * 2, num_labels)
        self.num_labels = num_labels
    
    def forward(self, input_ids, attention_mask, labels=None):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = bert_outputs.last_hidden_state
        lstm_output, _ = self.lstm(sequence_output)
        lstm_output = lstm_output[:, -1, :]  
        logits = self.classifier(lstm_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
        )



class BertCNNModel(nn.Module):
    def __init__(self, model_name, num_labels, num_filters=100, filter_sizes=[3, 4, 5]):
        super(BertCNNModel, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (fs, self.bert.config.hidden_size)) for fs in filter_sizes
        ])
        self.classifier = nn.Linear(len(filter_sizes) * num_filters, num_labels)
        self.num_labels = num_labels

    
    def forward(self, input_ids, attention_mask, labels=None):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = bert_outputs.last_hidden_state.unsqueeze(1)  
        
        conv_outputs = [torch.relu(conv(sequence_output)).squeeze(3) for conv in self.convs]
        pooled_outputs = [torch.max(output, 2)[0] for output in conv_outputs]
        cat_output = torch.cat(pooled_outputs, 1)
        
        logits = self.classifier(cat_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
        )








for j in range(2):
    if j == 0:
        model_name = 'EgyBERT_lstm'    
    else:
        model_name = 'EgyBERT_cnn'
    
    for i in range(3):
        print(f'{model_name}, try:{i}')
              
        tokenizer = AutoTokenizer.from_pretrained('faisalq/EgyBERT')
        # model = AutoModelForSequenceClassification.from_pretrained(model_name,
        #                                                       num_labels=classes_num).to('cuda')                                                 

        if j == 0:
            model = BertLSTMModel(model_name='faisalq/EgyBERT', num_labels=classes_num).to('cuda')       
        else:
            model = BertCNNModel(model_name='faisalq/EgyBERT', num_labels=classes_num).to('cuda')
                                                        
        dataset_train = ds['train']
        dataset_validation = ds['test']                                                    
        
      

        def preprocess_function(examples):
            return tokenizer(examples['review'], truncation=True, padding="max_length",
                            max_length=max_sequence_length, add_special_tokens=True)
        
        
        dataset_train = dataset_train.map(preprocess_function, batched=True)
        dataset_validation = dataset_validation.map(preprocess_function, batched=True)
        
       
        
        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            predictions = np.argmax(logits, axis=-1)    
            acc = accuracy_score(labels, predictions)        
            f1 = f1_score(labels, predictions, average='macro')   
            with open(log_file, 'a') as f:
                f.write(f'{model_name},{acc},{f1}\n')
            return {'accuracy': acc, 'f1_score': f1}

            
        epochs = 5
        save_steps = 10000 #save checkpoint every 10000 steps
        batch_size = 64
        
        training_args = TrainingArguments(
            output_dir = 'bert/',
            overwrite_output_dir=True,
            num_train_epochs = epochs,
            per_device_train_batch_size = batch_size,
            per_device_eval_batch_size = batch_size,
            save_steps = save_steps,
            save_total_limit = 1, #only save the last 5 checkpoints
            fp16=True,
            learning_rate = 5e-5,  # 5e-5 is the default
            logging_steps = 20, #50_000
            evaluation_strategy = 'steps',
            # evaluate_during_training = True,
            eval_steps = 20
            
        )
        
        trainer = Trainer(
            model = model,
            args = training_args,
            # data_collator=data_collator,
            train_dataset=dataset_train,
            eval_dataset=dataset_validation,
            compute_metrics = compute_metrics
        )
        
        
        # trainer.train(resume_from_checkpoint=True)
        trainer.train()


results = pd.read_csv(log_file)

best_results = results.groupby('Model', as_index=False)['F1'].max()

best_results = pd.merge(best_results, results, on=['Model', 'F1'])
best_results = best_results[['Model', 'Accuracy', 'F1']]
best_results = best_results.drop_duplicates()
best_results.to_csv('ArEgyCorpus2_results_2.csv')
display(best_results)



2024-09-02 00:20:26.791018: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-02 00:20:26.813771: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Index(['review', 'label'], dtype='object')

Unnamed: 0,review,label
0,يالاهوى لسه الناس بتسلم و تبوس كل اللي اتغير انهم بيقولوا مع ان في كورونا بس هات بوسة.,negative
1,هقول ايه مريض نفسي للاسف.,negative
2,دعوة ليا وليكم يارب الدنيا بخيرها والأخره بفردوسها.,positive
3,ياريت نطلع رحلة بعد الامتحانات.,positive


{'negative', 'positive'}

2

10000

DatasetDict({
    train: Dataset({
        features: ['review', 'label'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['review', 'label'],
        num_rows: 2000
    })
})

EgyBERT_lstm, try:0


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
20,0.6856,0.661954,0.933,0.932852
40,0.6196,0.553792,0.9725,0.972492
60,0.5086,0.429217,0.9735,0.973499
80,0.3717,0.283382,0.979,0.978999
100,0.2382,0.17341,0.98,0.979997
120,0.1541,0.124149,0.9795,0.979499
140,0.0995,0.103919,0.979,0.978998
160,0.0884,0.093711,0.98,0.979999
180,0.0778,0.084067,0.981,0.980997
200,0.0623,0.118892,0.971,0.970993


EgyBERT_lstm, try:1


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
20,0.6857,0.657972,0.96,0.959962
40,0.6181,0.551343,0.968,0.967987
60,0.5014,0.414519,0.974,0.974
80,0.3591,0.275329,0.9755,0.9755
100,0.2324,0.178832,0.974,0.973981
120,0.1501,0.119738,0.9795,0.979495
140,0.0994,0.108311,0.9765,0.976499
160,0.0922,0.094096,0.9785,0.9785
180,0.0875,0.084854,0.98,0.979999
200,0.0608,0.086438,0.9795,0.9795


EgyBERT_lstm, try:2


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
20,0.6857,0.657972,0.96,0.959962
40,0.6181,0.551343,0.968,0.967987
60,0.5014,0.414519,0.974,0.974
80,0.3591,0.275329,0.9755,0.9755
100,0.2324,0.178832,0.974,0.973981
120,0.1501,0.119738,0.9795,0.979495
140,0.0994,0.108311,0.9765,0.976499
160,0.0922,0.094096,0.9785,0.9785
180,0.0875,0.084854,0.98,0.979999
200,0.0608,0.086438,0.9795,0.9795


EgyBERT_cnn, try:0


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
20,0.6761,0.611213,0.944,0.943987
40,0.4887,0.371712,0.9645,0.964445
60,0.3168,0.241491,0.9755,0.975484
80,0.2275,0.190053,0.97,0.969965
100,0.1638,0.124549,0.9805,0.980493
120,0.1246,0.104097,0.981,0.980996
140,0.0881,0.09837,0.9775,0.9775
160,0.0743,0.086419,0.981,0.980999
180,0.0874,0.081435,0.979,0.978993
200,0.0534,0.073149,0.9825,0.982499


EgyBERT_cnn, try:1


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
20,0.676,0.610726,0.9445,0.94449
40,0.4893,0.377581,0.9595,0.95942
60,0.317,0.244979,0.9775,0.977486
80,0.2254,0.188678,0.9685,0.968458
100,0.1602,0.120824,0.981,0.980995
120,0.1262,0.09706,0.983,0.982998
140,0.0856,0.102757,0.976,0.976
160,0.077,0.088138,0.98,0.979992
180,0.0837,0.076354,0.9815,0.9815
200,0.0477,0.069004,0.983,0.982998


EgyBERT_cnn, try:2


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
20,0.6761,0.610875,0.945,0.944989
40,0.4887,0.372331,0.9635,0.96344
60,0.3154,0.240832,0.9765,0.976489
80,0.2281,0.186584,0.9715,0.971472
100,0.1654,0.124072,0.981,0.980993
120,0.1308,0.103625,0.9805,0.980493
140,0.0879,0.093102,0.98,0.979999
160,0.0756,0.081266,0.982,0.981999
180,0.0798,0.075216,0.9825,0.982497
200,0.0575,0.070633,0.9825,0.982497


Unnamed: 0,Model,Accuracy,F1
0,EgyBERT_cnn,0.989,0.988999
2,EgyBERT_lstm,0.988,0.987999
