In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0" 

import numpy as np
import tensorflow as tf
import pandas as pd
import pyarabic.araby as araby
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
import torch
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, Dataset, concatenate_datasets
import glob
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 1000)


log_file = 'SDTwittC_gender_2.txt'
with open(log_file, 'w') as f:
    f.write('Model,Accuracy,F1\n')


files1 = glob.glob('benchmarks2/SDTwittC/Male/*')
files2 = glob.glob('benchmarks2/SDTwittC/Female/*')


# getting males comments
dfs = []
x = 0
for file in files1:    
    df = pd.read_csv(file, header=None, names=['text'], encoding='utf-8', sep='\t', quoting=3, engine='python')
    dfs.append(df)
    x += len(df)

dfm = pd.concat(dfs, ignore_index=True)
dfm.fillna('', inplace=True)
display(x)
display(len(dfm))
dfm = dfm[dfm['text'] != '']
display(len(dfm))
dfm['label'] = 0
display(dfm[:4])

# getting females comments
dfs = []
x = 0
for file in files2:    
    df = pd.read_csv(file, header=None, names=['text'], encoding='utf-8', sep='\t', quoting=3, engine='python')
    dfs.append(df)
    x += len(df)

dff = pd.concat(dfs, ignore_index=True)
dff.fillna('', inplace=True)
display(x)
display(len(dff))
dff = dff[dff['text'] != '']
display(len(dff))
dff['label'] = 1
display(dff[:4])


print()
df = pd.concat([dfm, dff], axis=0, ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
display(len(df))
display(df[:4])


classes_num = 2
display(classes_num)
display(len(df))

ds = Dataset.from_pandas(df)

ds = ds.train_test_split(test_size=0.2)
display(ds)

max_sequence_length = 128

models = [ 
        'faisalq/SaudiBERT',
        'UBC-NLP/MARBERT',
        'UBC-NLP/MARBERTv2',  
]

for model_name in models:
    for i in range(3):
        print(f'{model_name}, try:{i}')
              
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                              num_labels=classes_num).to('cuda')                                                 
        dataset_train = ds['train']
        dataset_validation = ds['test']                                                    
        
      
        def preprocess_function(examples):
            return tokenizer(examples['text'], truncation=True, padding="max_length",
                            max_length=max_sequence_length, add_special_tokens=True)
        
        
        dataset_train = dataset_train.map(preprocess_function, batched=True)
        dataset_validation = dataset_validation.map(preprocess_function, batched=True)
        
       
        
        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            predictions = np.argmax(logits, axis=-1)    
            acc = accuracy_score(labels, predictions)        
            f1 = f1_score(labels, predictions, average='macro')   
            with open(log_file, 'a') as f:
                f.write(f'{model_name},{acc},{f1}\n')
            return {'accuracy': acc, 'f1_score': f1}


        
        epochs = 4
        save_steps = 10000 #save checkpoint every 10000 steps
        batch_size = 64
        
        training_args = TrainingArguments(
            output_dir = 'bert/',
            overwrite_output_dir=True,
            num_train_epochs = epochs,
            per_device_train_batch_size = batch_size,
            per_device_eval_batch_size = batch_size,
            save_steps = save_steps,
            save_total_limit = 1, #only save the last 5 checkpoints
            fp16=True,
            learning_rate = 5e-5,  # 5e-5 is the default
            logging_steps = 1000, #50_000
            evaluation_strategy = 'steps',
            # evaluate_during_training = True,
            eval_steps = 1000
            
        )
        
        trainer = Trainer(
            model = model,
            args = training_args,
            # data_collator=data_collator,
            train_dataset=dataset_train,
            eval_dataset=dataset_validation,
            compute_metrics = compute_metrics
        )
        
        
        # trainer.train(resume_from_checkpoint=True)
        trainer.train()


results = pd.read_csv(log_file)

best_results = results.groupby('Model', as_index=False)['F1'].max()

best_results = pd.merge(best_results, results, on=['Model', 'F1'])
best_results = best_results[['Model', 'Accuracy', 'F1']]
best_results = best_results.drop_duplicates()
best_results.to_csv('SDTwittC_gender_results_2.csv')
display(best_results)




2024-04-02 06:48:42.958683: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-02 06:48:42.983582: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


233517

233517

233517

Unnamed: 0,text,label
0,حينما تخرج الحكمة من الحاكم الملك سلمان بن عبدالعزيز ❤,0
1,أعظم فيديو ترجمته في عام 2018 كان لهذا الرجل الي قال قصة حياته وتجربته في 8 دقائق فقط !! لكن كل دقيقة منها عظيمة و,0
2,اليوم في مباراة نهائي كأس خادم الحرمين الشريفين بين بيصير فيه طاقم كبير لتغطيته بأكثر من 580 كامير,0
3,نفس القوة هذا عرض خاص لمتابعين حسابي فقط,0


217313

217313

217313

Unnamed: 0,text,label
0,يا واحد عن كل احد يا جزء مني❤,1
1,مطر الليل💙🌧🌧🌧,1
2,الله يرزقني كثر ماشفت هالصوره بالتايم,1
3,هههههههههههههههههههههههههههههههههههههههههههههه,1





450830

Unnamed: 0,text,label
0,الله يبارك فيك,0
1,هههههههههههههههههههههههههههههه,1
2,اقتباسات من هنا وهناك,0
3,الله يخلي التكميم,1


2

450830

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 360664
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 90166
    })
})

faisalq/SaudiBERT, try:0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at faisalq/SaudiBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/360664 [00:00<?, ? examples/s]

Map:   0%|          | 0/90166 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
1000,0.5618,0.526826,0.728002,0.727713
2000,0.5328,0.514994,0.740756,0.739685
3000,0.5163,0.504124,0.744505,0.742953
4000,0.5069,0.490764,0.752124,0.751399
5000,0.5012,0.484646,0.755484,0.754502
6000,0.4612,0.516601,0.757614,0.755747
7000,0.4022,0.507433,0.756638,0.755314
8000,0.3966,0.499219,0.760209,0.759316
9000,0.3971,0.498632,0.758512,0.755789
10000,0.3993,0.515233,0.764235,0.762771


Checkpoint destination directory bert/checkpoint-20000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


faisalq/SaudiBERT, try:1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at faisalq/SaudiBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/360664 [00:00<?, ? examples/s]

Map:   0%|          | 0/90166 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
1000,0.5616,0.526876,0.728501,0.728418
2000,0.5337,0.517371,0.739625,0.738528
3000,0.5166,0.503121,0.746346,0.744573
4000,0.507,0.49134,0.752667,0.75207
5000,0.502,0.485572,0.754131,0.752773
6000,0.4609,0.508135,0.756715,0.755148
7000,0.4017,0.504556,0.755529,0.753783
8000,0.397,0.500441,0.757813,0.756942
9000,0.3978,0.494705,0.756982,0.753877
10000,0.4014,0.51247,0.760897,0.758242


Checkpoint destination directory bert/checkpoint-20000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


faisalq/SaudiBERT, try:2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at faisalq/SaudiBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/360664 [00:00<?, ? examples/s]

Map:   0%|          | 0/90166 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
1000,0.5616,0.526876,0.728501,0.728418
2000,0.5337,0.517371,0.739625,0.738528
3000,0.5166,0.503121,0.746346,0.744573
4000,0.507,0.49134,0.752667,0.75207
5000,0.502,0.485572,0.754131,0.752773
6000,0.4609,0.508135,0.756715,0.755148
7000,0.4017,0.504556,0.755529,0.753783
8000,0.397,0.500441,0.757813,0.756942
9000,0.3978,0.494705,0.756982,0.753877
10000,0.4014,0.51247,0.760897,0.758242


Checkpoint destination directory bert/checkpoint-20000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


UBC-NLP/MARBERT, try:0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/360664 [00:00<?, ? examples/s]

Map:   0%|          | 0/90166 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
1000,0.5825,0.555038,0.708948,0.708945
2000,0.5579,0.543228,0.718419,0.717259
3000,0.5424,0.526901,0.728412,0.726211
4000,0.5319,0.517006,0.735699,0.733911
5000,0.5269,0.514099,0.735444,0.735422
6000,0.4817,0.549443,0.734756,0.732644
7000,0.4123,0.551481,0.734079,0.732905
8000,0.411,0.527243,0.739092,0.738985
9000,0.4136,0.540525,0.740113,0.737992
10000,0.4154,0.553714,0.744017,0.741558


Checkpoint destination directory bert/checkpoint-20000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


UBC-NLP/MARBERT, try:1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/360664 [00:00<?, ? examples/s]

Map:   0%|          | 0/90166 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
1000,0.5825,0.555038,0.708948,0.708945
2000,0.5579,0.543228,0.718419,0.717259
3000,0.5424,0.526901,0.728412,0.726211
4000,0.5319,0.517006,0.735699,0.733911
5000,0.5269,0.514099,0.735444,0.735422
6000,0.4817,0.549443,0.734756,0.732644
7000,0.4123,0.551481,0.734079,0.732905
8000,0.411,0.527243,0.739092,0.738985
9000,0.4136,0.540525,0.740113,0.737992
10000,0.4154,0.553714,0.744017,0.741558


Checkpoint destination directory bert/checkpoint-20000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


UBC-NLP/MARBERT, try:2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/360664 [00:00<?, ? examples/s]

Map:   0%|          | 0/90166 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
1000,0.5825,0.555038,0.708948,0.708945
2000,0.5579,0.543228,0.718419,0.717259
3000,0.5424,0.526901,0.728412,0.726211
4000,0.5319,0.517006,0.735699,0.733911
5000,0.5269,0.514099,0.735444,0.735422
6000,0.4817,0.549443,0.734756,0.732644
7000,0.4123,0.551481,0.734079,0.732905
8000,0.411,0.527243,0.739092,0.738985
9000,0.4136,0.540525,0.740113,0.737992
10000,0.4154,0.553714,0.744017,0.741558


Checkpoint destination directory bert/checkpoint-20000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


UBC-NLP/MARBERTv2, try:0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERTv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/360664 [00:00<?, ? examples/s]

Map:   0%|          | 0/90166 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
1000,0.5736,0.544639,0.71456,0.714444
2000,0.5459,0.523467,0.728944,0.728196
3000,0.5263,0.511618,0.735499,0.733681
4000,0.5157,0.507346,0.742664,0.742301
5000,0.5121,0.500271,0.747466,0.747283
6000,0.4675,0.519285,0.751935,0.751506
7000,0.4034,0.527667,0.745525,0.744652
8000,0.401,0.509836,0.75097,0.750733
9000,0.3995,0.510737,0.751969,0.750058
10000,0.4034,0.533263,0.753444,0.750937


Checkpoint destination directory bert/checkpoint-20000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


UBC-NLP/MARBERTv2, try:1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERTv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/360664 [00:00<?, ? examples/s]

Map:   0%|          | 0/90166 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
1000,0.5736,0.544639,0.71456,0.714444
2000,0.5459,0.523467,0.728944,0.728196
3000,0.5263,0.511618,0.735499,0.733681
4000,0.5157,0.507346,0.742664,0.742301
5000,0.5121,0.500271,0.747466,0.747283
6000,0.4675,0.519285,0.751935,0.751506
7000,0.4034,0.527667,0.745525,0.744652
8000,0.401,0.509836,0.75097,0.750733
9000,0.3995,0.510737,0.751969,0.750058
10000,0.4034,0.533263,0.753444,0.750937


Checkpoint destination directory bert/checkpoint-20000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


UBC-NLP/MARBERTv2, try:2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERTv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/360664 [00:00<?, ? examples/s]

Map:   0%|          | 0/90166 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
1000,0.5736,0.544639,0.71456,0.714444
2000,0.5459,0.523467,0.728944,0.728196
3000,0.5263,0.511618,0.735499,0.733681
4000,0.5157,0.507346,0.742664,0.742301
5000,0.5121,0.500271,0.747466,0.747283
6000,0.4675,0.519285,0.751935,0.751506
7000,0.4034,0.527667,0.745525,0.744652
8000,0.401,0.509836,0.75097,0.750733
9000,0.3995,0.510737,0.751969,0.750058
10000,0.4034,0.533263,0.753444,0.750937


Checkpoint destination directory bert/checkpoint-20000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


Unnamed: 0,Model,Accuracy,F1
0,UBC-NLP/MARBERT,0.744017,0.741558
3,UBC-NLP/MARBERTv2,0.758202,0.757156
6,faisalq/SaudiBERT,0.766697,0.765807
