In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0" 

import numpy as np
import tensorflow as tf
import pandas as pd
import pyarabic.araby as araby
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
import torch
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, Dataset, concatenate_datasets
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 1000)


log_file = 'EgyptianDialectGender_1.txt'
with open(log_file, 'w') as f:
    f.write('Model,Accuracy,F1\n')


directory = 'datasets/EgyptianDialectGender/Male'

data = []

for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            temp_data = {'text': '', 'Lang': ''}
            for line in file:
                if line.startswith('Tweet:'):
                    temp_data['text'] = line.split('Tweet:', 1)[1].strip()
                elif line.startswith('Lang:'):
                    temp_data['Lang'] = line.split('Lang:', 1)[1].strip()
                    data.append(temp_data.copy())
                    temp_data = {'text': '', 'Lang': ''}

dfm = pd.DataFrame(data)

dfm = dfm[dfm['Lang'] == 'ar']

display(dfm[:4])
display(len(dfm))

print()

directory = 'datasets/EgyptianDialectGender/Female'

data = []

for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            temp_data = {'text': '', 'Lang': ''}
            for line in file:
                if line.startswith('Tweet:'):
                    temp_data['text'] = line.split('Tweet:', 1)[1].strip()
                elif line.startswith('Lang:'):
                    temp_data['Lang'] = line.split('Lang:', 1)[1].strip()
                    data.append(temp_data.copy())
                    temp_data = {'text': '', 'Lang': ''}

dff = pd.DataFrame(data)

dff = dff[dff['Lang'] == 'ar']


display(dff[:4])
display(len(dff))

dfm = dfm[:50000]
dff = dff[:50000]

dfm['label'] = 0
dff['label'] = 1


df = pd.concat([dff, dfm], ignore_index=True)
display(len(df))
 
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

display(len(df))
display(df.columns)
display(df[:4])

classes_num = 2
display(classes_num)

ds = Dataset.from_pandas(df)

ds = ds.train_test_split(test_size=0.2)
display(ds)

max_sequence_length = 128


models = [ 
        'aubmindlab/bert-base-arabertv02-twitter',
        'CAMeL-Lab/bert-base-arabic-camelbert-da',
        'qarib/bert-base-qarib', 
]


for model_name in models:
    for i in range(3):
        print(f'{model_name}, try:{i}')
              
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                              num_labels=classes_num).to('cuda')                                                 
        dataset_train = ds['train']
        dataset_validation = ds['test']                                                    
        
      

        def preprocess_function(examples):
            return tokenizer(examples['text'], truncation=True, padding="max_length",
                            max_length=max_sequence_length)
        
        
        dataset_train = dataset_train.map(preprocess_function, batched=True)
        dataset_validation = dataset_validation.map(preprocess_function, batched=True)
        
       
        
        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            predictions = np.argmax(logits, axis=-1)    
            acc = accuracy_score(labels, predictions)        
            f1 = f1_score(labels, predictions, average='macro')   
            with open(log_file, 'a') as f:
                f.write(f'{model_name},{acc},{f1}\n')
            return {'accuracy': acc, 'f1_score': f1}


        
        
        epochs = 8
        save_steps = 10000 #save checkpoint every 10000 steps
        batch_size = 64
        
        training_args = TrainingArguments(
            output_dir = 'bert/',
            overwrite_output_dir=True,
            num_train_epochs = epochs,
            per_device_train_batch_size = batch_size,
            per_device_eval_batch_size = batch_size,
            save_steps = save_steps,
            save_total_limit = 1, #only save the last 5 checkpoints
            fp16=True,
            learning_rate = 5e-5,  # 5e-5 is the default
            logging_steps = 200, #50_000
            evaluation_strategy = 'steps',
            # evaluate_during_training = True,
            eval_steps = 200
            
        )
        
        trainer = Trainer(
            model = model,
            args = training_args,
            # data_collator=data_collator,
            train_dataset=dataset_train,
            eval_dataset=dataset_validation,
            compute_metrics = compute_metrics
        )
        
        
        # trainer.train(resume_from_checkpoint=True)
        trainer.train()


results = pd.read_csv(log_file)

best_results = results.groupby('Model', as_index=False)['F1'].max()

best_results = pd.merge(best_results, results, on=['Model', 'F1'])
best_results = best_results[['Model', 'Accuracy', 'F1']]
best_results = best_results.drop_duplicates()
best_results.to_csv('EgyptianDialectGender_results_1.csv')
display(best_results)



2024-07-27 02:20:00.915269: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-27 02:20:00.940700: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Unnamed: 0,text,Lang
0,تورينو مين يا عنيا ؟ https://t.co/ED6FH8lZkO,ar
2,RT @_Kamaljr: يا ماما هتصوريني وتزعقيلي ماشى ؟ https://t.co/28tgjwbJhZ,ar
5,RT @q8_da_vinci: تم رفض توظيفي كمعلم تربيه فنيه في التربيه الخاصه. من قبل اللجنة الطبيه في مستشفى البحر في وزاره الصحه لسبب اعاقتي البصريه.…,ar
8,ده أيه الأدب و الكسوف ده أنا لو عند أخت في سن جواز ادهالك يا صاحبي 😂 https://t.co/1GPBcYlGtl,ar


224817




Unnamed: 0,text,Lang
0,"""ادْفَعْ بِالَّتِي هِيَ أَحْسَنُ فَإِذَا الَّذِي بَيْنَكَ وَبَيْنَهُ عَدَاوَةٌ كَأَنَّهُ وَلِيٌّ حَمِيمٌ""",ar
1,RT @AKettana: قاعده رقم واحد علشان تعرف تعيش و تتعامل مع الناس لازم تتعامل و تعرف و تقتنع بإن الدنيا مصالح,ar
2,سلام على من رأى عينا كادت أن تبكي فأضحكها,ar
3,RT @esraa_elbraga: من الغباء انك تشوف نفسك صح ع طول,ar


197551

100000

100000

Index(['text', 'Lang', 'label'], dtype='object')

Unnamed: 0,text,Lang,label
0,RT @_3omario_: ال views مش كل حاجة,ar,0
1,@Assemism رجاء راجع التويتات اللي قبلها,ar,0
2,RT @KarimElDegwy: اقسم بالله، انا نص الاخبار بافتكرها في الاول افيه,ar,1
3,@kamaromar @m3adel مدختع,ar,0


2

DatasetDict({
    train: Dataset({
        features: ['text', 'Lang', 'label'],
        num_rows: 80000
    })
    test: Dataset({
        features: ['text', 'Lang', 'label'],
        num_rows: 20000
    })
})

aubmindlab/bert-base-arabertv02-twitter, try:0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02-twitter and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.5838,0.534939,0.7219,0.718982
400,0.5266,0.523657,0.7413,0.738321
600,0.5029,0.46678,0.7708,0.770481
800,0.47,0.442487,0.7848,0.784348
1000,0.454,0.457827,0.78135,0.779271
1200,0.4431,0.427722,0.7999,0.799868
1400,0.3639,0.429496,0.807,0.806883
1600,0.3347,0.417456,0.80175,0.801425
1800,0.3358,0.425959,0.81055,0.810487
2000,0.3269,0.398727,0.81565,0.815649


aubmindlab/bert-base-arabertv02-twitter, try:1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02-twitter and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.5864,0.547109,0.7215,0.716667
400,0.5251,0.503349,0.74685,0.743649
600,0.4944,0.453766,0.77935,0.779333
800,0.4609,0.437843,0.78875,0.78875
1000,0.4511,0.46967,0.77795,0.77457
1200,0.4358,0.410504,0.8049,0.804782
1400,0.3562,0.432753,0.80565,0.805584
1600,0.3279,0.428313,0.7973,0.796347
1800,0.3344,0.417867,0.80875,0.808686
2000,0.3297,0.412902,0.81485,0.814831


aubmindlab/bert-base-arabertv02-twitter, try:2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02-twitter and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.5864,0.547109,0.7215,0.716667
400,0.5251,0.503349,0.74685,0.743649
600,0.4944,0.453766,0.77935,0.779333
800,0.4609,0.437843,0.78875,0.78875
1000,0.4511,0.46967,0.77795,0.77457
1200,0.4358,0.410504,0.8049,0.804782
1400,0.3562,0.432753,0.80565,0.805584
1600,0.3279,0.428313,0.7973,0.796347
1800,0.3344,0.417867,0.80875,0.808686
2000,0.3297,0.412902,0.81485,0.814831


CAMeL-Lab/bert-base-arabic-camelbert-da, try:0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-da and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.6108,0.552346,0.709,0.708998
400,0.5522,0.522003,0.7332,0.732637
600,0.5236,0.495926,0.74685,0.746072
800,0.5019,0.474731,0.7647,0.76446
1000,0.4884,0.463951,0.7758,0.775654
1200,0.4692,0.455829,0.7754,0.773253
1400,0.3851,0.467764,0.7796,0.778814
1600,0.3479,0.460689,0.79175,0.791524
1800,0.353,0.453339,0.79515,0.794886
2000,0.3427,0.455508,0.7979,0.797851


CAMeL-Lab/bert-base-arabic-camelbert-da, try:1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-da and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.6108,0.552346,0.709,0.708998
400,0.5522,0.522003,0.7332,0.732637
600,0.5236,0.495926,0.74685,0.746072
800,0.5019,0.474731,0.7647,0.76446
1000,0.4884,0.463951,0.7758,0.775654
1200,0.4692,0.455829,0.7754,0.773253
1400,0.3851,0.467764,0.7796,0.778814
1600,0.3479,0.460689,0.79175,0.791524
1800,0.353,0.453339,0.79515,0.794886
2000,0.3427,0.455508,0.7979,0.797851


CAMeL-Lab/bert-base-arabic-camelbert-da, try:2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-da and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.6108,0.552346,0.709,0.708998
400,0.5522,0.522003,0.7332,0.732637
600,0.5236,0.495926,0.74685,0.746072
800,0.5019,0.474731,0.7647,0.76446
1000,0.4884,0.463951,0.7758,0.775654
1200,0.4692,0.455829,0.7754,0.773253
1400,0.3851,0.467764,0.7796,0.778814
1600,0.3479,0.460689,0.79175,0.791524
1800,0.353,0.453339,0.79515,0.794886
2000,0.3427,0.455508,0.7979,0.797851


qarib/bert-base-qarib, try:0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at qarib/bert-base-qarib and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.5709,0.506017,0.74325,0.743163
400,0.4964,0.483371,0.77055,0.76889
600,0.4652,0.438674,0.78635,0.785864
800,0.445,0.431804,0.7958,0.794991
1000,0.4262,0.449502,0.79075,0.789189
1200,0.4124,0.384676,0.8185,0.818446
1400,0.3069,0.440827,0.81905,0.819048
1600,0.2668,0.489965,0.79255,0.788412
1800,0.2682,0.461142,0.81935,0.819201
2000,0.273,0.417349,0.8217,0.8217


qarib/bert-base-qarib, try:1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at qarib/bert-base-qarib and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.5709,0.506017,0.74325,0.743163
400,0.4964,0.483371,0.77055,0.76889
600,0.4652,0.438674,0.78635,0.785864
800,0.445,0.431804,0.7958,0.794991
1000,0.4262,0.449502,0.79075,0.789189
1200,0.4124,0.384676,0.8185,0.818446
1400,0.3069,0.440827,0.81905,0.819048
1600,0.2668,0.489965,0.79255,0.788412
1800,0.2682,0.461142,0.81935,0.819201
2000,0.273,0.417349,0.8217,0.8217


qarib/bert-base-qarib, try:2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at qarib/bert-base-qarib and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.5709,0.506017,0.74325,0.743163
400,0.4964,0.483371,0.77055,0.76889
600,0.4652,0.438674,0.78635,0.785864
800,0.445,0.431804,0.7958,0.794991
1000,0.4262,0.449502,0.79075,0.789189
1200,0.4124,0.384676,0.8185,0.818446
1400,0.3069,0.440827,0.81905,0.819048
1600,0.2668,0.489965,0.79255,0.788412
1800,0.2682,0.461142,0.81935,0.819201
2000,0.273,0.417349,0.8217,0.8217


Unnamed: 0,Model,Accuracy,F1
0,CAMeL-Lab/bert-base-arabic-camelbert-da,0.8149,0.814842
3,aubmindlab/bert-base-arabertv02-twitter,0.83235,0.832342
4,qarib/bert-base-qarib,0.82915,0.829129
