In [1]:


import os
os.environ["CUDA_VISIBLE_DEVICES"]="1" 

import numpy as np
import tensorflow as tf
import pandas as pd
import pyarabic.araby as araby
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
import torch
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, BertForSequenceClassification
from datasets import load_dataset, Dataset, concatenate_datasets
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 1000)


log_file = 'AraStance.txt'
with open(log_file, 'w') as f:
    f.write('Model,Accuracy,F1\n')


ds = load_dataset('strombergnlp/ara-stance')  

display(ds)
# return

ds_t = concatenate_datasets([ds['train'], ds['test']])
ds_v = ds['validation']

ds_t = ds_t.rename_column('stance', 'label')
ds_v = ds_v.rename_column('stance', 'label')

display(ds_t)
display(ds_v)

unique_labels = set(ds_t['label'])
classes_num = len(unique_labels)

print(f'Unique Labels: {unique_labels}')
print(f'Number of Classes: {classes_num}')

# return

max_sequence_length = 128



models = ['faisalq/bert-base-arabic-wordpiece', 'faisalq/bert-base-arabic-senpiece',
          'faisalq/bert-base-arabic-bbpe']


for model_name in models:
    for i in range(3):
        print(f'{model_name}, try:{i}')
              
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = BertForSequenceClassification.from_pretrained(model_name,
                                                              num_labels=classes_num).to('cuda')                                                 
        dataset_train = ds_t
        dataset_validation = ds_v                                                     
        
      

        def preprocess_function(examples):
            return tokenizer(examples['claim'], examples['article'], truncation=True, padding="max_length",
                            max_length=max_sequence_length)
        
        
        dataset_train = dataset_train.map(preprocess_function, batched=True)
        dataset_validation = dataset_validation.map(preprocess_function, batched=True)
        
       
        
        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            predictions = np.argmax(logits, axis=-1)    
            acc = accuracy_score(labels, predictions)        
            f1 = f1_score(labels, predictions, average='macro')   
            with open(log_file, 'a') as f:
                f.write(f'{model_name},{acc},{f1}\n')
            return {'accuracy': acc, 'f1_score': f1}


        
        
        epochs = 30
        save_steps = 10000 #save checkpoint every 10000 steps
        batch_size = 256
        
        training_args = TrainingArguments(
            output_dir = 'bert/',
            overwrite_output_dir=True,
            num_train_epochs = epochs,
            per_device_train_batch_size = batch_size,
            per_device_eval_batch_size = batch_size,
            save_steps = save_steps,
            save_total_limit = 1, #only save the last 5 checkpoints
            fp16=True,
            learning_rate = 5e-5,  # 5e-5 is the default
            logging_steps = 10, #50_000
            evaluation_strategy = 'steps',
            # evaluate_during_training = True,
            eval_steps = 10
            
        )
        
        trainer = Trainer(
            model = model,
            args = training_args,
            # data_collator=data_collator,
            train_dataset=dataset_train,
            eval_dataset=dataset_validation,
            compute_metrics = compute_metrics
        )
        
        
        # trainer.train(resume_from_checkpoint=True)
        trainer.train()


results = pd.read_csv(log_file)

best_results = results.groupby('Model', as_index=False)['F1'].max()

best_results = pd.merge(best_results, results, on=['Model', 'F1'])
best_results = best_results[['Model', 'Accuracy', 'F1']]
best_results = best_results.drop_duplicates()
best_results.to_csv('AraStance_results.csv')
display(best_results)



2024-02-21 12:00:50.043627: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-21 12:00:50.066823: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Found cached dataset ara-stance (/home/ffq/.cache/huggingface/datasets/strombergnlp___ara-stance/stance/1.0.0/41bcabad1298f0b87ae022a14e9a13627dd91545245a530214769459af82360e)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'claim', 'article', 'stance'],
        num_rows: 2848
    })
    validation: Dataset({
        features: ['id', 'claim', 'article', 'stance'],
        num_rows: 569
    })
    test: Dataset({
        features: ['id', 'claim', 'article', 'stance'],
        num_rows: 646
    })
})

Dataset({
    features: ['id', 'claim', 'article', 'label'],
    num_rows: 3494
})

Dataset({
    features: ['id', 'claim', 'article', 'label'],
    num_rows: 569
})

Unique Labels: {0, 1, 2, 3}
Number of Classes: 4
faisalq/bert-base-arabic-wordpiece, try:0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at faisalq/bert-base-arabic-wordpiece and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3494 [00:00<?, ? examples/s]

Map:   0%|          | 0/569 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
10,1.1459,1.132625,0.534271,0.273922
20,1.0189,1.071313,0.55536,0.306742
30,0.9744,1.018927,0.578207,0.399769
40,0.904,1.001878,0.59051,0.406488
50,0.8799,1.104224,0.528998,0.330818
60,1.1205,1.24886,0.516696,0.170336
70,1.1636,1.284787,0.516696,0.170336
80,1.1701,1.215525,0.516696,0.170336
90,1.1466,1.229535,0.516696,0.170336
100,1.1478,1.228903,0.516696,0.170336


faisalq/bert-base-arabic-wordpiece, try:1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at faisalq/bert-base-arabic-wordpiece and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Loading cached processed dataset at /home/ffq/.cache/huggingface/datasets/strombergnlp___ara-stance/stance/1.0.0/41bcabad1298f0b87ae022a14e9a13627dd91545245a530214769459af82360e/cache-10f47b778a150774.arrow


Map:   0%|          | 0/569 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
10,1.15,1.129068,0.478032,0.272524
20,0.9845,0.993991,0.586995,0.395202
30,0.8561,0.941453,0.625659,0.473599
40,0.7166,0.931989,0.650264,0.506257
50,0.6287,0.907728,0.692443,0.521633
60,0.5287,0.797914,0.704745,0.601733
70,0.4156,0.767519,0.743409,0.639971
80,0.3191,0.733718,0.787346,0.699134
90,0.3326,0.729113,0.787346,0.682192
100,0.245,0.714793,0.797891,0.733745


faisalq/bert-base-arabic-wordpiece, try:2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at faisalq/bert-base-arabic-wordpiece and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Loading cached processed dataset at /home/ffq/.cache/huggingface/datasets/strombergnlp___ara-stance/stance/1.0.0/41bcabad1298f0b87ae022a14e9a13627dd91545245a530214769459af82360e/cache-10f47b778a150774.arrow
Loading cached processed dataset at /home/ffq/.cache/huggingface/datasets/strombergnlp___ara-stance/stance/1.0.0/41bcabad1298f0b87ae022a14e9a13627dd91545245a530214769459af82360e/cache-ac2d9af557f4b18e.arrow


Step,Training Loss,Validation Loss,Accuracy,F1 Score
10,1.15,1.129068,0.478032,0.272524
20,0.9845,0.993991,0.586995,0.395202
30,0.8561,0.941453,0.625659,0.473599
40,0.7166,0.931989,0.650264,0.506257
50,0.6287,0.907728,0.692443,0.521633
60,0.5287,0.797914,0.704745,0.601733
70,0.4156,0.767519,0.743409,0.639971
80,0.3191,0.733718,0.787346,0.699134
90,0.3326,0.729113,0.787346,0.682192
100,0.245,0.714793,0.797891,0.733745


faisalq/bert-base-arabic-senpiece, try:0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at faisalq/bert-base-arabic-senpiece and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3494 [00:00<?, ? examples/s]

Map:   0%|          | 0/569 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
10,1.1355,1.144229,0.537786,0.275244
20,1.0088,1.080054,0.56942,0.314145
30,0.8437,0.965542,0.634446,0.436236
40,0.6583,1.032293,0.588752,0.450905
50,0.6147,0.838846,0.688928,0.52488
60,0.4692,0.915153,0.681898,0.539987
70,0.3979,0.770251,0.764499,0.684024
80,0.2993,0.863495,0.706503,0.616477
90,0.2518,0.775436,0.759227,0.683965
100,0.212,0.907977,0.732865,0.680055


faisalq/bert-base-arabic-senpiece, try:1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at faisalq/bert-base-arabic-senpiece and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Loading cached processed dataset at /home/ffq/.cache/huggingface/datasets/strombergnlp___ara-stance/stance/1.0.0/41bcabad1298f0b87ae022a14e9a13627dd91545245a530214769459af82360e/cache-735b3146d470ee0e.arrow


Map:   0%|          | 0/569 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
10,1.1355,1.144229,0.537786,0.275244
20,1.0088,1.080054,0.56942,0.314145
30,0.8437,0.965542,0.634446,0.436236
40,0.6583,1.032293,0.588752,0.450905
50,0.6147,0.838846,0.688928,0.52488
60,0.4692,0.915153,0.681898,0.539987
70,0.3979,0.770251,0.764499,0.684024
80,0.2993,0.863495,0.706503,0.616477
90,0.2518,0.775436,0.759227,0.683965
100,0.212,0.907977,0.732865,0.680055


faisalq/bert-base-arabic-senpiece, try:2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at faisalq/bert-base-arabic-senpiece and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Loading cached processed dataset at /home/ffq/.cache/huggingface/datasets/strombergnlp___ara-stance/stance/1.0.0/41bcabad1298f0b87ae022a14e9a13627dd91545245a530214769459af82360e/cache-735b3146d470ee0e.arrow
Loading cached processed dataset at /home/ffq/.cache/huggingface/datasets/strombergnlp___ara-stance/stance/1.0.0/41bcabad1298f0b87ae022a14e9a13627dd91545245a530214769459af82360e/cache-991470f38bda30c2.arrow


Step,Training Loss,Validation Loss,Accuracy,F1 Score
10,1.1355,1.144229,0.537786,0.275244
20,1.0088,1.080054,0.56942,0.314145
30,0.8437,0.965542,0.634446,0.436236
40,0.6583,1.032293,0.588752,0.450905
50,0.6147,0.838846,0.688928,0.52488
60,0.4692,0.915153,0.681898,0.539987
70,0.3979,0.770251,0.764499,0.684024
80,0.2993,0.863495,0.706503,0.616477
90,0.2518,0.775436,0.759227,0.683965
100,0.212,0.907977,0.732865,0.680055


faisalq/bert-base-arabic-bbpe, try:0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at faisalq/bert-base-arabic-bbpe and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3494 [00:00<?, ? examples/s]

Map:   0%|          | 0/569 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
10,1.0507,1.021873,0.599297,0.419142
20,0.8441,0.928573,0.659051,0.483139
30,0.6668,0.819426,0.702988,0.55355
40,0.4364,0.880775,0.713533,0.611715
50,0.3606,0.704024,0.787346,0.704482
60,0.2597,0.686476,0.796134,0.725973
70,0.1769,0.749092,0.806678,0.746966
80,0.1237,0.73668,0.808436,0.75397
90,0.122,0.735035,0.826011,0.776831
100,0.0996,0.740844,0.827768,0.777323


faisalq/bert-base-arabic-bbpe, try:1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at faisalq/bert-base-arabic-bbpe and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Loading cached processed dataset at /home/ffq/.cache/huggingface/datasets/strombergnlp___ara-stance/stance/1.0.0/41bcabad1298f0b87ae022a14e9a13627dd91545245a530214769459af82360e/cache-fcad83f47c1c1a47.arrow


Map:   0%|          | 0/569 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
10,1.0507,1.021873,0.599297,0.419142
20,0.8441,0.928573,0.659051,0.483139
30,0.6668,0.819426,0.702988,0.55355
40,0.4364,0.880775,0.713533,0.611715
50,0.3606,0.704024,0.787346,0.704482
60,0.2597,0.686476,0.796134,0.725973
70,0.1769,0.749092,0.806678,0.746966
80,0.1237,0.73668,0.808436,0.75397
90,0.122,0.735035,0.826011,0.776831
100,0.0996,0.740844,0.827768,0.777323


faisalq/bert-base-arabic-bbpe, try:2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at faisalq/bert-base-arabic-bbpe and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Loading cached processed dataset at /home/ffq/.cache/huggingface/datasets/strombergnlp___ara-stance/stance/1.0.0/41bcabad1298f0b87ae022a14e9a13627dd91545245a530214769459af82360e/cache-fcad83f47c1c1a47.arrow
Loading cached processed dataset at /home/ffq/.cache/huggingface/datasets/strombergnlp___ara-stance/stance/1.0.0/41bcabad1298f0b87ae022a14e9a13627dd91545245a530214769459af82360e/cache-9bcf4b5815c15a77.arrow


Step,Training Loss,Validation Loss,Accuracy,F1 Score
10,1.0507,1.021873,0.599297,0.419142
20,0.8441,0.928573,0.659051,0.483139
30,0.6668,0.819426,0.702988,0.55355
40,0.4364,0.880775,0.713533,0.611715
50,0.3606,0.704024,0.787346,0.704482
60,0.2597,0.686476,0.796134,0.725973
70,0.1769,0.749092,0.806678,0.746966
80,0.1237,0.73668,0.808436,0.75397
90,0.122,0.735035,0.826011,0.776831
100,0.0996,0.740844,0.827768,0.777323


Unnamed: 0,Model,Accuracy,F1
0,faisalq/bert-base-arabic-bbpe,0.843585,0.797586
3,faisalq/bert-base-arabic-senpiece,0.822496,0.773895
6,faisalq/bert-base-arabic-wordpiece,0.845343,0.791614
