In [1]:

import os
os.environ["CUDA_VISIBLE_DEVICES"]="0" 

from sklearn.metrics import classification_report
from transformers import AutoTokenizer, BertForSequenceClassification
from transformers import AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset, concatenate_datasets
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

log_file = 'ml_nli.txt'

# ds1 = load_dataset('MoritzLaurer/multilingual-NLI-26lang-2mil7', split='ar_anli') 
# ds2 = load_dataset('MoritzLaurer/multilingual-NLI-26lang-2mil7', split='ar_fever')
# ds3 = load_dataset('MoritzLaurer/multilingual-NLI-26lang-2mil7', split='ar_ling')
# ds4 = load_dataset('MoritzLaurer/multilingual-NLI-26lang-2mil7', split='ar_mnli')
# ds5 = load_dataset('MoritzLaurer/multilingual-NLI-26lang-2mil7', split='ar_wanli')
# # display(ds)


ds1 = pd.read_parquet('multi_nli/ar_anli-00000-of-00001-d5ddcd7a96189c94.parquet')
ds2 = pd.read_parquet('multi_nli/ar_fever-00000-of-00001-75e864b8c1cf8d17.parquet')
ds3 = pd.read_parquet('multi_nli/ar_ling-00000-of-00001-f4e042f46b091cf7.parquet')
ds4 = pd.read_parquet('multi_nli/ar_mnli-00000-of-00001-13deaea9065575d9.parquet')
ds5 = pd.read_parquet('multi_nli/ar_wanli-00000-of-00001-7f580e9d2eff0880.parquet')

ds1 = Dataset.from_pandas(ds1)
ds2 = Dataset.from_pandas(ds2)
ds3 = Dataset.from_pandas(ds3)
ds4 = Dataset.from_pandas(ds4)
ds5 = Dataset.from_pandas(ds5)

ds1 = ds1.train_test_split(test_size=0.2)
ds2 = ds2.train_test_split(test_size=0.2)
ds3 = ds3.train_test_split(test_size=0.2)
ds4 = ds4.train_test_split(test_size=0.2)
ds5 = ds5.train_test_split(test_size=0.2)

# display(ds4)

dataset_t = concatenate_datasets([ds1['train'], ds2['train'], ds3['train'], ds4['train'], ds5['train']])
dataset_v = concatenate_datasets([ds1['test'], ds2['test'], ds3['test'], ds4['test'], ds5['test']])

# display(dataset_train)
display(len(dataset_t))
display(len(dataset_v))

unique_labels = set(dataset_t['label'])
classes_num = len(unique_labels)

print(f'Unique Labels: {unique_labels}')
print(f'Number of Classes: {classes_num}')

models = ['faisalq/bert-base-arabic-wordpiece', 'faisalq/bert-base-arabic-senpiece',
                'faisalq/bert-base-arabic-bbpe']

with open(log_file, 'w') as f:
    f.write('Model,Accuracy,F1\n')

for model_name in models:
    for i in range(3):
        print(f'{model_name}, try:{i}')
        dataset_train = dataset_t
        dataset_validation = dataset_v
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = BertForSequenceClassification.from_pretrained(model_name,
                                                              num_labels=classes_num).to('cuda')                                                 
                                                     
        
        max_length = 128
        
        def preprocess_function(examples):
            return tokenizer(examples['premise'], examples['hypothesis'], truncation=True, padding="max_length",
                            max_length=max_length)
        
        
        dataset_train = dataset_train.map(preprocess_function, batched=True)
        dataset_validation = dataset_validation.map(preprocess_function, batched=True)

                
        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            predictions = np.argmax(logits, axis=-1)    
            acc = accuracy_score(labels, predictions)        
            f1 = f1_score(labels, predictions, average='macro')   
            with open(log_file, 'a') as f:
                f.write(f'{model_name},{acc},{f1}\n')
            return {'accuracy': acc, 'f1_score': f1}
        

        epochs = 10
        save_steps = 10000 #save checkpoint every 10000 steps
        batch_size = 256
        
        
        training_args = TrainingArguments(
            output_dir = 'bert/',
            overwrite_output_dir=True,
            num_train_epochs = epochs,
            per_device_train_batch_size = batch_size,
            per_device_eval_batch_size = batch_size,
            save_steps = save_steps,
            save_total_limit = 1, 
            fp16=True,
            learning_rate = 5e-5,  # 5e-5 is the default
            # weight_decay=0.01,
            logging_steps = 120, #50_000
            evaluation_strategy = 'steps',
            eval_steps = 120
            
        )
        
        
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=dataset_train,
            eval_dataset=dataset_validation,
            compute_metrics=compute_metrics,
        )
        
        trainer.train()
        # trainer.evaluate()


results = pd.read_csv(log_file)

best_results = results.groupby('Model', as_index=False)['F1'].max()

best_results = pd.merge(best_results, results, on=['Model', 'F1'])
best_results = best_results[['Model', 'Accuracy', 'F1']]
best_results.to_csv('mlnli_results.csv')
display(best_results)

2024-02-11 06:39:16.257815: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-11 06:39:16.282131: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


84000

21000

Unique Labels: {0, 1, 2}
Number of Classes: 3
faisalq/bert-base-arabic-wordpiece, try:0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at faisalq/bert-base-arabic-wordpiece and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/84000 [00:00<?, ? examples/s]

Map:   0%|          | 0/21000 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss,Accuracy,F1 Score
120,1.0085,0.909323,0.575714,0.549143
240,0.884,0.826953,0.62681,0.611991
360,0.8168,0.836469,0.642857,0.628004
480,0.7318,0.795567,0.649571,0.634084
600,0.7279,0.794047,0.653143,0.634771
720,0.6486,0.862447,0.662286,0.65184
840,0.5764,0.853524,0.66681,0.654346
960,0.5736,0.839994,0.664571,0.658136
1080,0.4406,0.982343,0.661762,0.65385
1200,0.4233,0.953802,0.665667,0.657209


faisalq/bert-base-arabic-wordpiece, try:1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at faisalq/bert-base-arabic-wordpiece and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/84000 [00:00<?, ? examples/s]

Map:   0%|          | 0/21000 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss,Accuracy,F1 Score
120,1.0128,0.906327,0.567905,0.549694
240,0.8871,0.840768,0.620333,0.603527
360,0.8157,0.836209,0.64281,0.62697
480,0.7309,0.791345,0.652905,0.639252
600,0.7226,0.776877,0.660286,0.645746
720,0.6339,0.869664,0.662381,0.650303
840,0.5587,0.83969,0.661714,0.651973
960,0.5561,0.853379,0.66419,0.656413
1080,0.4276,1.016338,0.65981,0.650888
1200,0.4027,0.97403,0.666143,0.657315


faisalq/bert-base-arabic-wordpiece, try:2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at faisalq/bert-base-arabic-wordpiece and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/84000 [00:00<?, ? examples/s]

Map:   0%|          | 0/21000 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss,Accuracy,F1 Score
120,1.0128,0.906327,0.567905,0.549694
240,0.8871,0.840768,0.620333,0.603527
360,0.8157,0.836209,0.64281,0.62697
480,0.7309,0.791345,0.652905,0.639252
600,0.7226,0.776877,0.660286,0.645746
720,0.6339,0.869664,0.662381,0.650303
840,0.5587,0.83969,0.661714,0.651973
960,0.5561,0.853379,0.66419,0.656413
1080,0.4276,1.016338,0.65981,0.650888
1200,0.4027,0.97403,0.666143,0.657315


faisalq/bert-base-arabic-senpiece, try:0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at faisalq/bert-base-arabic-senpiece and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/84000 [00:00<?, ? examples/s]

Map:   0%|          | 0/21000 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss,Accuracy,F1 Score
120,0.9887,0.873665,0.59919,0.580753
240,0.8586,0.828511,0.63181,0.617675
360,0.7911,0.799252,0.652952,0.636988
480,0.6984,0.805312,0.65819,0.644195
600,0.6992,0.784036,0.662048,0.646201
720,0.6085,0.823329,0.670857,0.658548
840,0.5306,0.828429,0.668619,0.658897
960,0.5314,0.825902,0.671095,0.660694
1080,0.4031,0.982212,0.662667,0.655008
1200,0.378,1.01323,0.659238,0.652079


faisalq/bert-base-arabic-senpiece, try:1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at faisalq/bert-base-arabic-senpiece and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/84000 [00:00<?, ? examples/s]

Map:   0%|          | 0/21000 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss,Accuracy,F1 Score
120,0.9887,0.873665,0.59919,0.580753
240,0.8586,0.828511,0.63181,0.617675
360,0.7911,0.799252,0.652952,0.636988
480,0.6984,0.805312,0.65819,0.644195
600,0.6992,0.784036,0.662048,0.646201
720,0.6085,0.823329,0.670857,0.658548
840,0.5306,0.828429,0.668619,0.658897
960,0.5314,0.825902,0.671095,0.660694
1080,0.4031,0.982212,0.662667,0.655008
1200,0.378,1.01323,0.659238,0.652079


faisalq/bert-base-arabic-senpiece, try:2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at faisalq/bert-base-arabic-senpiece and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/84000 [00:00<?, ? examples/s]

Map:   0%|          | 0/21000 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss,Accuracy,F1 Score
120,0.9887,0.873665,0.59919,0.580753
240,0.8586,0.828511,0.63181,0.617675
360,0.7911,0.799252,0.652952,0.636988
480,0.6984,0.805312,0.65819,0.644195
600,0.6992,0.784036,0.662048,0.646201
720,0.6085,0.823329,0.670857,0.658548
840,0.5306,0.828429,0.668619,0.658897
960,0.5314,0.825902,0.671095,0.660694
1080,0.4031,0.982212,0.662667,0.655008
1200,0.378,1.01323,0.659238,0.652079


faisalq/bert-base-arabic-bbpe, try:0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at faisalq/bert-base-arabic-bbpe and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/84000 [00:00<?, ? examples/s]

Map:   0%|          | 0/21000 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss,Accuracy,F1 Score
120,0.9923,0.891256,0.580333,0.56709
240,0.8739,0.830963,0.619619,0.605763
360,0.8056,0.821265,0.636048,0.621276
480,0.7121,0.809452,0.64481,0.6336
600,0.7048,0.805529,0.647048,0.636318
720,0.6116,0.868566,0.651429,0.643421
840,0.5264,0.879853,0.647905,0.642287
960,0.5271,0.866407,0.651619,0.64291
1080,0.3988,1.044992,0.64581,0.639966
1200,0.3642,0.981877,0.654619,0.647436


faisalq/bert-base-arabic-bbpe, try:1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at faisalq/bert-base-arabic-bbpe and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/84000 [00:00<?, ? examples/s]

Map:   0%|          | 0/21000 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss,Accuracy,F1 Score
120,0.9923,0.891256,0.580333,0.56709
240,0.8739,0.830963,0.619619,0.605763
360,0.8056,0.821265,0.636048,0.621276
480,0.7121,0.809452,0.64481,0.6336
600,0.7048,0.805529,0.647048,0.636318
720,0.6116,0.868566,0.651429,0.643421
840,0.5264,0.879853,0.647905,0.642287
960,0.5271,0.866407,0.651619,0.64291
1080,0.3988,1.044992,0.64581,0.639966
1200,0.3642,0.981877,0.654619,0.647436


faisalq/bert-base-arabic-bbpe, try:2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at faisalq/bert-base-arabic-bbpe and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/84000 [00:00<?, ? examples/s]

Map:   0%|          | 0/21000 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss,Accuracy,F1 Score
120,0.9923,0.891256,0.580333,0.56709
240,0.8739,0.830963,0.619619,0.605763
360,0.8056,0.821265,0.636048,0.621276
480,0.7121,0.809452,0.64481,0.6336
600,0.7048,0.805529,0.647048,0.636318
720,0.6116,0.868566,0.651429,0.643421
840,0.5264,0.879853,0.647905,0.642287
960,0.5271,0.866407,0.651619,0.64291
1080,0.3988,1.044992,0.64581,0.639966
1200,0.3642,0.981877,0.654619,0.647436


Unnamed: 0,Model,Accuracy,F1
0,faisalq/bert-base-arabic-bbpe,0.654619,0.647436
1,faisalq/bert-base-arabic-bbpe,0.654619,0.647436
2,faisalq/bert-base-arabic-bbpe,0.654619,0.647436
3,faisalq/bert-base-arabic-senpiece,0.668667,0.661311
4,faisalq/bert-base-arabic-senpiece,0.668667,0.661311
5,faisalq/bert-base-arabic-senpiece,0.668667,0.661311
6,faisalq/bert-base-arabic-wordpiece,0.664571,0.658136
