In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0" 

import numpy as np
import tensorflow as tf
import pandas as pd
import pyarabic.araby as araby
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
import torch
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, Dataset, concatenate_datasets
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 1000)



df = pd.read_csv('dataset.csv', encoding='utf-8', engine='python') #, sep='\t' , quotechar="'"  , quoting=3
display(df.columns)
df.fillna('', inplace=True)

display(df[:4])



df = df[df['text'] != '']  # the name of field contains the text (adjust accordingly) 

classes = set(df['original_label'].values) # the name of field contains the label (adjust accordingly)
display(classes)

df['original_label'] = df['original_label'].astype('category')
df['label'] = df['original_label'].cat.codes  # keep the name 'label' as it is (do not change)



df = df[['text', 'label']]


classes_num = len(classes)
display(classes_num)
display(len(df))


ds = Dataset.from_pandas(df)

ds = ds.train_test_split(test_size=0.2)
display(ds)

max_sequence_length = 128 # you can change to 64 if the text is short


model_name = 'bert-base-uncased'

              
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                              num_labels=classes_num).to('cuda')                                                 
dataset_train = ds['train']
dataset_validation = ds['test']                                                    
        
      

def preprocess_function(examples):
            return tokenizer(examples['text'], truncation=True, padding="max_length",  # change 'text' to the name of the text column if it's different
                            max_length=max_sequence_length, add_special_tokens=True)
        
        
dataset_train = dataset_train.map(preprocess_function, batched=True)
dataset_validation = dataset_validation.map(preprocess_function, batched=True)
        
       
        
def compute_metrics(eval_pred):
            logits, labels = eval_pred
            predictions = np.argmax(logits, axis=-1)    
            acc = accuracy_score(labels, predictions)        
            f1 = f1_score(labels, predictions, average='macro')               
            return {'accuracy': acc, 'f1_score': f1}

            
epochs = 5  # increase for more training time
save_steps = 10000 #save checkpoint every 10000 steps
batch_size = 16 # change to 32 if gpu's memory allows it
        
training_args = TrainingArguments(
            output_dir = 'bert/',
            overwrite_output_dir=True,
            num_train_epochs = epochs,
            per_device_train_batch_size = batch_size,
            per_device_eval_batch_size = batch_size,
            save_steps = save_steps,
            save_total_limit = 1, #only save the last 5 checkpoints
            fp16=True,
            learning_rate = 5e-5,  # 5e-5 is the default
            logging_steps = 10, #50_000
            evaluation_strategy = 'steps',
            eval_steps = 10
            
        )
        
trainer = Trainer(
            model = model,
            args = training_args,
            train_dataset=dataset_train,
            eval_dataset=dataset_validation,
            compute_metrics = compute_metrics
        )
        
        
trainer.train()





Index(['#', 'type', 'text', 'original_label', 'aggregatedAnnotationConfidence',
       'annotator1', 'annotator2', 'annotator3'],
      dtype='object')

Unnamed: 0,#,type,text,original_label,aggregatedAnnotationConfidence,annotator1,annotator2,annotator3
0,1,TWEET,مبروك و سامحونا لعجزنا التام. عقبال اللي جوه. اللي بره يا عاجز يا بيزايد على العاجز,0,0.6667,-1,0,0
1,2,C1,كلنا بره ومش هنبطل نزايد على العجايز الي جابونا ورى,-1,0.6667,-1,-1,0
2,3,C2,بدل ما انت قاعد بره كده تعالي ازرع الصحرا,0,1.0,0,0,0
3,4,C3,قذر اتفووو ماتيجى مصر وتورينا نفسك كدا ياجبان,-1,1.0,-1,-1,-1


{-2, -1, 0}

3

1100

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 880
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 220
    })
})

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/880 [00:00<?, ? examples/s]

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
10,1.0661,1.043688,0.427273,0.199575
20,1.0439,1.049138,0.427273,0.199575
30,1.0397,1.026412,0.463636,0.337077
40,1.0368,1.034759,0.436364,0.319221
50,1.0458,1.030766,0.468182,0.329023
60,1.0013,1.019175,0.495455,0.359691
70,1.0503,1.043233,0.468182,0.342471
80,1.0388,1.030905,0.45,0.304056
90,1.0045,1.032683,0.413636,0.197611
100,1.0372,1.012174,0.490909,0.358935


TrainOutput(global_step=275, training_loss=0.998851318359375, metrics={'train_runtime': 12.3631, 'train_samples_per_second': 355.899, 'train_steps_per_second': 22.244, 'total_flos': 289424759500800.0, 'train_loss': 0.998851318359375, 'epoch': 5.0})