In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0" 

import numpy as np
import tensorflow as tf
import pandas as pd
import pyarabic.araby as araby
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
import torch
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, Dataset, concatenate_datasets
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 1000)


log_file = 'EgyptianDialectGender_2.txt'
with open(log_file, 'w') as f:
    f.write('Model,Accuracy,F1\n')


directory = 'datasets/EgyptianDialectGender/Male'

data = []

for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            temp_data = {'text': '', 'Lang': ''}
            for line in file:
                if line.startswith('Tweet:'):
                    temp_data['text'] = line.split('Tweet:', 1)[1].strip()
                elif line.startswith('Lang:'):
                    temp_data['Lang'] = line.split('Lang:', 1)[1].strip()
                    data.append(temp_data.copy())
                    temp_data = {'text': '', 'Lang': ''}

dfm = pd.DataFrame(data)

dfm = dfm[dfm['Lang'] == 'ar']

display(dfm[:4])
display(len(dfm))

print()

directory = 'datasets/EgyptianDialectGender/Female'

data = []

for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            temp_data = {'text': '', 'Lang': ''}
            for line in file:
                if line.startswith('Tweet:'):
                    temp_data['text'] = line.split('Tweet:', 1)[1].strip()
                elif line.startswith('Lang:'):
                    temp_data['Lang'] = line.split('Lang:', 1)[1].strip()
                    data.append(temp_data.copy())
                    temp_data = {'text': '', 'Lang': ''}

dff = pd.DataFrame(data)

dff = dff[dff['Lang'] == 'ar']


display(dff[:4])
display(len(dff))


dfm['label'] = 0
dff['label'] = 1

dfm = dfm[:50000]
dff = dff[:50000]

df = pd.concat([dff, dfm], ignore_index=True)
display(len(df))
 
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

display(len(df))
display(df.columns)
display(df[:4])

classes_num = 2
display(classes_num)

ds = Dataset.from_pandas(df)

ds = ds.train_test_split(test_size=0.2)
display(ds)

max_sequence_length = 128


models = [ 
        'faisalq/EgyBERT',
        'UBC-NLP/MARBERT',
        'UBC-NLP/MARBERTv2',  
]


for model_name in models:
    for i in range(3):
        print(f'{model_name}, try:{i}')
              
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                              num_labels=classes_num).to('cuda')                                                 
        dataset_train = ds['train']
        dataset_validation = ds['test']                                                    
        
      

        def preprocess_function(examples):
            return tokenizer(examples['text'], truncation=True, padding="max_length",
                            max_length=max_sequence_length)
        
        
        dataset_train = dataset_train.map(preprocess_function, batched=True)
        dataset_validation = dataset_validation.map(preprocess_function, batched=True)
        
       
        
        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            predictions = np.argmax(logits, axis=-1)    
            acc = accuracy_score(labels, predictions)        
            f1 = f1_score(labels, predictions, average='macro')   
            with open(log_file, 'a') as f:
                f.write(f'{model_name},{acc},{f1}\n')
            return {'accuracy': acc, 'f1_score': f1}


        
        
        epochs = 8
        save_steps = 10000 #save checkpoint every 10000 steps
        batch_size = 64
        
        training_args = TrainingArguments(
            output_dir = 'bert/',
            overwrite_output_dir=True,
            num_train_epochs = epochs,
            per_device_train_batch_size = batch_size,
            per_device_eval_batch_size = batch_size,
            save_steps = save_steps,
            save_total_limit = 1, #only save the last 5 checkpoints
            fp16=True,
            learning_rate = 5e-5,  # 5e-5 is the default
            logging_steps = 200, #50_000
            evaluation_strategy = 'steps',
            # evaluate_during_training = True,
            eval_steps = 200
            
        )
        
        trainer = Trainer(
            model = model,
            args = training_args,
            # data_collator=data_collator,
            train_dataset=dataset_train,
            eval_dataset=dataset_validation,
            compute_metrics = compute_metrics
        )
        
        
        # trainer.train(resume_from_checkpoint=True)
        trainer.train()


results = pd.read_csv(log_file)

best_results = results.groupby('Model', as_index=False)['F1'].max()

best_results = pd.merge(best_results, results, on=['Model', 'F1'])
best_results = best_results[['Model', 'Accuracy', 'F1']]
best_results = best_results.drop_duplicates()
best_results.to_csv('EgyptianDialectGender_results_2.csv')
display(best_results)



2024-07-26 09:48:11.384795: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-26 09:48:11.407296: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Unnamed: 0,text,Lang
0,ÿ™Ÿàÿ±ŸäŸÜŸà ŸÖŸäŸÜ Ÿäÿß ÿπŸÜŸäÿß ÿü https://t.co/ED6FH8lZkO,ar
2,RT @_Kamaljr: Ÿäÿß ŸÖÿßŸÖÿß Ÿáÿ™ÿµŸàÿ±ŸäŸÜŸä Ÿàÿ™ÿ≤ÿπŸÇŸäŸÑŸä ŸÖÿßÿ¥Ÿâ ÿü https://t.co/28tgjwbJhZ,ar
5,RT @q8_da_vinci: ÿ™ŸÖ ÿ±ŸÅÿ∂ ÿ™Ÿàÿ∏ŸäŸÅŸä ŸÉŸÖÿπŸÑŸÖ ÿ™ÿ±ÿ®ŸäŸá ŸÅŸÜŸäŸá ŸÅŸä ÿßŸÑÿ™ÿ±ÿ®ŸäŸá ÿßŸÑÿÆÿßÿµŸá. ŸÖŸÜ ŸÇÿ®ŸÑ ÿßŸÑŸÑÿ¨ŸÜÿ© ÿßŸÑÿ∑ÿ®ŸäŸá ŸÅŸä ŸÖÿ≥ÿ™ÿ¥ŸÅŸâ ÿßŸÑÿ®ÿ≠ÿ± ŸÅŸä Ÿàÿ≤ÿßÿ±Ÿá ÿßŸÑÿµÿ≠Ÿá ŸÑÿ≥ÿ®ÿ® ÿßÿπÿßŸÇÿ™Ÿä ÿßŸÑÿ®ÿµÿ±ŸäŸá.‚Ä¶,ar
8,ÿØŸá ÿ£ŸäŸá ÿßŸÑÿ£ÿØÿ® Ÿà ÿßŸÑŸÉÿ≥ŸàŸÅ ÿØŸá ÿ£ŸÜÿß ŸÑŸà ÿπŸÜÿØ ÿ£ÿÆÿ™ ŸÅŸä ÿ≥ŸÜ ÿ¨Ÿàÿßÿ≤ ÿßÿØŸáÿßŸÑŸÉ Ÿäÿß ÿµÿßÿ≠ÿ®Ÿä üòÇ https://t.co/1GPBcYlGtl,ar


224817




Unnamed: 0,text,Lang
0,"""ÿßÿØŸíŸÅŸéÿπŸí ÿ®ŸêÿßŸÑŸéŸëÿ™ŸêŸä ŸáŸêŸäŸé ÿ£Ÿéÿ≠Ÿíÿ≥ŸéŸÜŸè ŸÅŸéÿ•Ÿêÿ∞Ÿéÿß ÿßŸÑŸéŸëÿ∞ŸêŸä ÿ®ŸéŸäŸíŸÜŸéŸÉŸé ŸàŸéÿ®ŸéŸäŸíŸÜŸéŸáŸè ÿπŸéÿØŸéÿßŸàŸéÿ©Ÿå ŸÉŸéÿ£ŸéŸÜŸéŸëŸáŸè ŸàŸéŸÑŸêŸäŸåŸë ÿ≠ŸéŸÖŸêŸäŸÖŸå""",ar
1,RT @AKettana: ŸÇÿßÿπÿØŸá ÿ±ŸÇŸÖ Ÿàÿßÿ≠ÿØ ÿπŸÑÿ¥ÿßŸÜ ÿ™ÿπÿ±ŸÅ ÿ™ÿπŸäÿ¥ Ÿà ÿ™ÿ™ÿπÿßŸÖŸÑ ŸÖÿπ ÿßŸÑŸÜÿßÿ≥ ŸÑÿßÿ≤ŸÖ ÿ™ÿ™ÿπÿßŸÖŸÑ Ÿà ÿ™ÿπÿ±ŸÅ Ÿà ÿ™ŸÇÿ™ŸÜÿπ ÿ®ÿ•ŸÜ ÿßŸÑÿØŸÜŸäÿß ŸÖÿµÿßŸÑÿ≠,ar
2,ÿ≥ŸÑÿßŸÖ ÿπŸÑŸâ ŸÖŸÜ ÿ±ÿ£Ÿâ ÿπŸäŸÜÿß ŸÉÿßÿØÿ™ ÿ£ŸÜ ÿ™ÿ®ŸÉŸä ŸÅÿ£ÿ∂ÿ≠ŸÉŸáÿß,ar
3,RT @esraa_elbraga: ŸÖŸÜ ÿßŸÑÿ∫ÿ®ÿßÿ° ÿßŸÜŸÉ ÿ™ÿ¥ŸàŸÅ ŸÜŸÅÿ≥ŸÉ ÿµÿ≠ ÿπ ÿ∑ŸàŸÑ,ar


197551

100000

100000

Index(['text', 'Lang', 'label'], dtype='object')

Unnamed: 0,text,Lang,label
0,RT @_3omario_: ÿßŸÑ views ŸÖÿ¥ ŸÉŸÑ ÿ≠ÿßÿ¨ÿ©,ar,0
1,@Assemism ÿ±ÿ¨ÿßÿ° ÿ±ÿßÿ¨ÿπ ÿßŸÑÿ™ŸàŸäÿ™ÿßÿ™ ÿßŸÑŸÑŸä ŸÇÿ®ŸÑŸáÿß,ar,0
2,RT @KarimElDegwy: ÿßŸÇÿ≥ŸÖ ÿ®ÿßŸÑŸÑŸáÿå ÿßŸÜÿß ŸÜÿµ ÿßŸÑÿßÿÆÿ®ÿßÿ± ÿ®ÿßŸÅÿ™ŸÉÿ±Ÿáÿß ŸÅŸä ÿßŸÑÿßŸàŸÑ ÿßŸÅŸäŸá,ar,1
3,@kamaromar @m3adel ŸÖÿØÿÆÿ™ÿπ,ar,0


2

DatasetDict({
    train: Dataset({
        features: ['text', 'Lang', 'label'],
        num_rows: 80000
    })
    test: Dataset({
        features: ['text', 'Lang', 'label'],
        num_rows: 20000
    })
})

faisalq/EgyBERT, try:0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at faisalq/EgyBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.6027,0.530546,0.73245,0.732156
400,0.5379,0.510605,0.7371,0.737037
600,0.4912,0.47381,0.76725,0.767102
800,0.5559,0.485367,0.75675,0.753289
1000,0.4897,0.469202,0.76915,0.76673
1200,0.4615,0.426027,0.7945,0.794293
1400,0.4025,0.430163,0.7996,0.799222
1600,0.3817,0.42959,0.8017,0.801181
1800,0.3679,0.41795,0.80785,0.80785
2000,0.361,0.407849,0.8128,0.812598


faisalq/EgyBERT, try:1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at faisalq/EgyBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.6052,0.545477,0.7291,0.729086
400,0.53,0.508176,0.7468,0.743895
600,0.5056,0.463896,0.774,0.773941
800,0.4802,0.479058,0.7636,0.76021
1000,0.461,0.425094,0.7978,0.797599
1200,0.4424,0.408207,0.80645,0.806388
1400,0.381,0.407871,0.8092,0.808991
1600,0.3501,0.411364,0.80525,0.803763
1800,0.3399,0.402836,0.8206,0.82049
2000,0.3376,0.386986,0.8242,0.82405


faisalq/EgyBERT, try:2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at faisalq/EgyBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.6052,0.545477,0.7291,0.729086
400,0.53,0.508176,0.7468,0.743895
600,0.5056,0.463896,0.774,0.773941
800,0.4802,0.479058,0.7636,0.76021
1000,0.461,0.425094,0.7978,0.797599
1200,0.4424,0.408207,0.80645,0.806388
1400,0.381,0.407871,0.8092,0.808991
1600,0.3501,0.411364,0.80525,0.803763
1800,0.3399,0.402836,0.8206,0.82049
2000,0.3376,0.386986,0.8242,0.82405


UBC-NLP/MARBERT, try:0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.5863,0.533545,0.7269,0.722131
400,0.5044,0.483641,0.75965,0.757514
600,0.4665,0.442712,0.7899,0.789483
800,0.4508,0.441387,0.79315,0.792639
1000,0.442,0.406256,0.81135,0.811333
1200,0.429,0.416762,0.7976,0.796361
1400,0.3106,0.518278,0.8086,0.808401
1600,0.2678,0.443245,0.8095,0.809458
1800,0.2641,0.411658,0.81655,0.816547
2000,0.268,0.505864,0.81045,0.810416


UBC-NLP/MARBERT, try:1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.5863,0.533545,0.7269,0.722131
400,0.5044,0.483641,0.75965,0.757514
600,0.4665,0.442712,0.7899,0.789483
800,0.4508,0.441387,0.79315,0.792639
1000,0.442,0.406256,0.81135,0.811333
1200,0.429,0.416762,0.7976,0.796361
1400,0.3106,0.518278,0.8086,0.808401
1600,0.2678,0.443245,0.8095,0.809458
1800,0.2641,0.411658,0.81655,0.816547
2000,0.268,0.505864,0.81045,0.810416


UBC-NLP/MARBERT, try:2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.5863,0.533545,0.7269,0.722131
400,0.5044,0.483641,0.75965,0.757514
600,0.4665,0.442712,0.7899,0.789483
800,0.4508,0.441387,0.79315,0.792639
1000,0.442,0.406256,0.81135,0.811333
1200,0.429,0.416762,0.7976,0.796361
1400,0.3106,0.518278,0.8086,0.808401
1600,0.2678,0.443245,0.8095,0.809458
1800,0.2641,0.411658,0.81655,0.816547
2000,0.268,0.505864,0.81045,0.810416


UBC-NLP/MARBERTv2, try:0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERTv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.5977,0.528125,0.7316,0.729344
400,0.5056,0.461834,0.77195,0.771871
600,0.4642,0.454065,0.7867,0.785277
800,0.4451,0.436785,0.7888,0.78714
1000,0.4221,0.394369,0.81405,0.813983
1200,0.4134,0.382318,0.81965,0.819391
1400,0.3138,0.408911,0.82715,0.82709
1600,0.2758,0.390582,0.8268,0.826686
1800,0.2659,0.404524,0.82855,0.828294
2000,0.2636,0.408914,0.83045,0.830429


UBC-NLP/MARBERTv2, try:1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERTv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.5977,0.528125,0.7316,0.729344
400,0.5056,0.461834,0.77195,0.771871
600,0.4642,0.454065,0.7867,0.785277
800,0.4451,0.436785,0.7888,0.78714
1000,0.4221,0.394369,0.81405,0.813983
1200,0.4134,0.382318,0.81965,0.819391
1400,0.3138,0.408911,0.82715,0.82709
1600,0.2758,0.390582,0.8268,0.826686
1800,0.2659,0.404524,0.82855,0.828294
2000,0.2636,0.408914,0.83045,0.830429


UBC-NLP/MARBERTv2, try:2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERTv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.5977,0.528125,0.7316,0.729344
400,0.5056,0.461834,0.77195,0.771871
600,0.4642,0.454065,0.7867,0.785277
800,0.4451,0.436785,0.7888,0.78714
1000,0.4221,0.394369,0.81405,0.813983
1200,0.4134,0.382318,0.81965,0.819391
1400,0.3138,0.408911,0.82715,0.82709
1600,0.2758,0.390582,0.8268,0.826686
1800,0.2659,0.404524,0.82855,0.828294
2000,0.2636,0.408914,0.83045,0.830429


Unnamed: 0,Model,Accuracy,F1
0,UBC-NLP/MARBERT,0.817,0.816987
3,UBC-NLP/MARBERTv2,0.83265,0.832648
6,faisalq/EgyBERT,0.83265,0.832648
