In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1" 

import numpy as np
import tensorflow as tf
import pandas as pd
import pyarabic.araby as araby
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
import torch
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, Dataset, concatenate_datasets
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 1000)
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer, BertModel
from transformers.modeling_outputs import SequenceClassifierOutput
import numpy as np

log_file = 'EgyptianDialectGender_2.txt'
with open(log_file, 'w') as f:
    f.write('Model,Accuracy,F1\n')


directory = 'datasets/EgyptianDialectGender/Male'

data = []

for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            temp_data = {'text': '', 'Lang': ''}
            for line in file:
                if line.startswith('Tweet:'):
                    temp_data['text'] = line.split('Tweet:', 1)[1].strip()
                elif line.startswith('Lang:'):
                    temp_data['Lang'] = line.split('Lang:', 1)[1].strip()
                    data.append(temp_data.copy())
                    temp_data = {'text': '', 'Lang': ''}

dfm = pd.DataFrame(data)

dfm = dfm[dfm['Lang'] == 'ar']

display(dfm[:4])
display(len(dfm))

print()

directory = 'datasets/EgyptianDialectGender/Female'

data = []

for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            temp_data = {'text': '', 'Lang': ''}
            for line in file:
                if line.startswith('Tweet:'):
                    temp_data['text'] = line.split('Tweet:', 1)[1].strip()
                elif line.startswith('Lang:'):
                    temp_data['Lang'] = line.split('Lang:', 1)[1].strip()
                    data.append(temp_data.copy())
                    temp_data = {'text': '', 'Lang': ''}

dff = pd.DataFrame(data)

dff = dff[dff['Lang'] == 'ar']


display(dff[:4])
display(len(dff))


dfm['label'] = 0
dff['label'] = 1

dfm = dfm[:50000]
dff = dff[:50000]

df = pd.concat([dff, dfm], ignore_index=True)
display(len(df))
 
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

display(len(df))
display(df.columns)
display(df[:4])

classes_num = 2
display(classes_num)

ds = Dataset.from_pandas(df)

ds = ds.train_test_split(test_size=0.2)
display(ds)

max_sequence_length = 128


class BertLSTMModel(nn.Module):
    def __init__(self, model_name, num_labels, hidden_dim=128, num_layers=2):
        super(BertLSTMModel, self).__init__()
        
        self.bert = AutoModel.from_pretrained(model_name)
        self.lstm = nn.LSTM(input_size=self.bert.config.hidden_size, 
                            hidden_size=hidden_dim, 
                            num_layers=num_layers, 
                            batch_first=True, 
                            bidirectional=True)
        self.classifier = nn.Linear(hidden_dim * 2, num_labels)
        self.num_labels = num_labels
    
    def forward(self, input_ids, attention_mask, labels=None):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = bert_outputs.last_hidden_state
        lstm_output, _ = self.lstm(sequence_output)
        lstm_output = lstm_output[:, -1, :]  
        logits = self.classifier(lstm_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
        )



class BertCNNModel(nn.Module):
    def __init__(self, model_name, num_labels, num_filters=100, filter_sizes=[3, 4, 5]):
        super(BertCNNModel, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (fs, self.bert.config.hidden_size)) for fs in filter_sizes
        ])
        self.classifier = nn.Linear(len(filter_sizes) * num_filters, num_labels)
        self.num_labels = num_labels

    
    def forward(self, input_ids, attention_mask, labels=None):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = bert_outputs.last_hidden_state.unsqueeze(1)  
        
        conv_outputs = [torch.relu(conv(sequence_output)).squeeze(3) for conv in self.convs]
        pooled_outputs = [torch.max(output, 2)[0] for output in conv_outputs]
        cat_output = torch.cat(pooled_outputs, 1)
        
        logits = self.classifier(cat_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
        )








for j in range(2):
    if j == 0:
        model_name = 'EgyBERT_lstm'    
    else:
        model_name = 'EgyBERT_cnn'
    
    for i in range(3):
        print(f'{model_name}, try:{i}')
              
        tokenizer = AutoTokenizer.from_pretrained('faisalq/EgyBERT')
        # model = AutoModelForSequenceClassification.from_pretrained(model_name,
        #                                                       num_labels=classes_num).to('cuda')                                                 

        if j == 0:
            model = BertLSTMModel(model_name='faisalq/EgyBERT', num_labels=classes_num).to('cuda')       
        else:
            model = BertCNNModel(model_name='faisalq/EgyBERT', num_labels=classes_num).to('cuda')
                                                        
        dataset_train = ds['train']
        dataset_validation = ds['test']                                                    
        
      

        def preprocess_function(examples):
            return tokenizer(examples['text'], truncation=True, padding="max_length",
                            max_length=max_sequence_length)
        
        
        dataset_train = dataset_train.map(preprocess_function, batched=True)
        dataset_validation = dataset_validation.map(preprocess_function, batched=True)
        
       
        
        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            predictions = np.argmax(logits, axis=-1)    
            acc = accuracy_score(labels, predictions)        
            f1 = f1_score(labels, predictions, average='macro')   
            with open(log_file, 'a') as f:
                f.write(f'{model_name},{acc},{f1}\n')
            return {'accuracy': acc, 'f1_score': f1}


        
        
        epochs = 8
        save_steps = 10000 #save checkpoint every 10000 steps
        batch_size = 64
        
        training_args = TrainingArguments(
            output_dir = 'bert/',
            overwrite_output_dir=True,
            num_train_epochs = epochs,
            per_device_train_batch_size = batch_size,
            per_device_eval_batch_size = batch_size,
            save_steps = save_steps,
            save_total_limit = 1, #only save the last 5 checkpoints
            fp16=True,
            learning_rate = 5e-5,  # 5e-5 is the default
            logging_steps = 200, #50_000
            evaluation_strategy = 'steps',
            # evaluate_during_training = True,
            eval_steps = 200
            
        )
        
        trainer = Trainer(
            model = model,
            args = training_args,
            # data_collator=data_collator,
            train_dataset=dataset_train,
            eval_dataset=dataset_validation,
            compute_metrics = compute_metrics
        )
        
        
        # trainer.train(resume_from_checkpoint=True)
        trainer.train()


results = pd.read_csv(log_file)

best_results = results.groupby('Model', as_index=False)['F1'].max()

best_results = pd.merge(best_results, results, on=['Model', 'F1'])
best_results = best_results[['Model', 'Accuracy', 'F1']]
best_results = best_results.drop_duplicates()
best_results.to_csv('EgyptianDialectGender_results_2.csv')
display(best_results)



2024-09-02 05:55:39.383892: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-02 05:55:39.407943: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Unnamed: 0,text,Lang
0,تورينو مين يا عنيا ؟ https://t.co/ED6FH8lZkO,ar
2,RT @_Kamaljr: يا ماما هتصوريني وتزعقيلي ماشى ؟ https://t.co/28tgjwbJhZ,ar
5,RT @q8_da_vinci: تم رفض توظيفي كمعلم تربيه فنيه في التربيه الخاصه. من قبل اللجنة الطبيه في مستشفى البحر في وزاره الصحه لسبب اعاقتي البصريه.…,ar
8,ده أيه الأدب و الكسوف ده أنا لو عند أخت في سن جواز ادهالك يا صاحبي 😂 https://t.co/1GPBcYlGtl,ar


224817




Unnamed: 0,text,Lang
0,"""ادْفَعْ بِالَّتِي هِيَ أَحْسَنُ فَإِذَا الَّذِي بَيْنَكَ وَبَيْنَهُ عَدَاوَةٌ كَأَنَّهُ وَلِيٌّ حَمِيمٌ""",ar
1,RT @AKettana: قاعده رقم واحد علشان تعرف تعيش و تتعامل مع الناس لازم تتعامل و تعرف و تقتنع بإن الدنيا مصالح,ar
2,سلام على من رأى عينا كادت أن تبكي فأضحكها,ar
3,RT @esraa_elbraga: من الغباء انك تشوف نفسك صح ع طول,ar


197551

100000

100000

Index(['text', 'Lang', 'label'], dtype='object')

Unnamed: 0,text,Lang,label
0,RT @_3omario_: ال views مش كل حاجة,ar,0
1,@Assemism رجاء راجع التويتات اللي قبلها,ar,0
2,RT @KarimElDegwy: اقسم بالله، انا نص الاخبار بافتكرها في الاول افيه,ar,1
3,@kamaromar @m3adel مدختع,ar,0


2

DatasetDict({
    train: Dataset({
        features: ['text', 'Lang', 'label'],
        num_rows: 80000
    })
    test: Dataset({
        features: ['text', 'Lang', 'label'],
        num_rows: 20000
    })
})

EgyBERT_lstm, try:0


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.6295,0.548958,0.72875,0.727964
400,0.5252,0.472649,0.77295,0.77295
600,0.486,0.445883,0.78475,0.784742
800,0.4577,0.424509,0.79515,0.79445
1000,0.44,0.416561,0.79965,0.798381
1200,0.4331,0.404189,0.8047,0.804451
1400,0.3678,0.43194,0.8033,0.802214
1600,0.35,0.409798,0.80805,0.807504
1800,0.3278,0.434098,0.8095,0.808661
2000,0.3275,0.385129,0.82315,0.823142


EgyBERT_lstm, try:1


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.6286,0.542568,0.72965,0.729035
400,0.5245,0.474551,0.76725,0.767061
600,0.4825,0.449284,0.7846,0.784398
800,0.4509,0.427728,0.78945,0.788612
1000,0.4358,0.410411,0.8051,0.804417
1200,0.4255,0.402694,0.80965,0.80935
1400,0.3606,0.411934,0.80465,0.80391
1600,0.3422,0.386379,0.81915,0.819114
1800,0.3226,0.427739,0.8119,0.810928
2000,0.3282,0.381162,0.8214,0.821392


EgyBERT_lstm, try:2


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.6286,0.542568,0.72965,0.729035
400,0.5245,0.474551,0.76725,0.767061
600,0.4825,0.449284,0.7846,0.784398
800,0.4509,0.427728,0.78945,0.788612
1000,0.4358,0.410411,0.8051,0.804417
1200,0.4255,0.402694,0.80965,0.80935
1400,0.3606,0.411934,0.80465,0.80391
1600,0.3422,0.386379,0.81915,0.819114
1800,0.3226,0.427739,0.8119,0.810928
2000,0.3282,0.381162,0.8214,0.821392


EgyBERT_cnn, try:0


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.6253,0.545474,0.7303,0.72992
400,0.529,0.48556,0.76855,0.768372
600,0.4865,0.457874,0.7856,0.784298
800,0.4553,0.429353,0.7949,0.794843
1000,0.4406,0.418237,0.79465,0.793465
1200,0.4335,0.405328,0.80565,0.80526
1400,0.369,0.413845,0.8042,0.803427
1600,0.348,0.394556,0.81455,0.81454
1800,0.3247,0.413589,0.81135,0.810663
2000,0.3298,0.383317,0.82145,0.821432


EgyBERT_cnn, try:1


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.6236,0.55029,0.72775,0.726099
400,0.5283,0.48617,0.76565,0.765579
600,0.4909,0.485059,0.77435,0.773799
800,0.5051,0.44423,0.78245,0.780861
1000,0.461,0.422825,0.7975,0.797091
1200,0.4454,0.412075,0.8023,0.80227
1400,0.381,0.421249,0.7957,0.794344
1600,0.3661,0.402086,0.81295,0.812949
1800,0.3483,0.420481,0.80615,0.805364
2000,0.3496,0.379816,0.821,0.820922


EgyBERT_cnn, try:2


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.6236,0.54387,0.7323,0.731766
400,0.5251,0.489151,0.764,0.763943
600,0.491,0.45136,0.784,0.783996
800,0.4617,0.42965,0.7939,0.793737
1000,0.4448,0.410866,0.8012,0.801164
1200,0.44,0.40056,0.8075,0.80743
1400,0.3765,0.424466,0.7949,0.793552
1600,0.5014,0.491844,0.7538,0.753756
1800,0.3844,0.456464,0.78685,0.784988
2000,0.368,0.415621,0.8034,0.803364


Unnamed: 0,Model,Accuracy,F1
0,EgyBERT_cnn,0.83635,0.836312
1,EgyBERT_lstm,0.83675,0.836652
