In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0" 

import numpy as np
import tensorflow as tf
import pandas as pd
import pyarabic.araby as araby
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
import torch
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, Dataset, concatenate_datasets
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 1000)


log_file = 'companiesReviews_final_1.txt'
with open(log_file, 'w') as f:
    f.write('Model,Accuracy,F1\n')


df = pd.read_csv('datasets/EgyptianCompaniesReviewsSA/Final_Data.csv', encoding='utf-8', engine='python', sep='\t') #,  , quotechar="'"  , quoting=3
display(df.columns)
df.fillna('', inplace=True)

display(df[:4])

# 'review_description', 'rating', 'company'



df = df[df['review_description'] != '']

classes = set(df['company'].values)
display(classes)

print()

classes = set(df['rating'].values)
display(classes)



df['rating'] = df['rating'].astype('category')
df['label'] = df['rating'].cat.codes



df = df[['review_description', 'label']]


classes_num = len(classes)
display(classes_num)
display(len(df))


ds = Dataset.from_pandas(df)

ds = ds.train_test_split(test_size=0.2)
display(ds)

max_sequence_length = 128


models = [ 
        'aubmindlab/bert-base-arabertv02-twitter',
        'CAMeL-Lab/bert-base-arabic-camelbert-da',
        'qarib/bert-base-qarib', 
]


for model_name in models:
    for i in range(3):
        print(f'{model_name}, try:{i}')
              
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                              num_labels=classes_num).to('cuda')                                                 
        dataset_train = ds['train']
        dataset_validation = ds['test']                                                    
        
      

        def preprocess_function(examples):
            return tokenizer(examples['review_description'], truncation=True, padding="max_length",
                            max_length=max_sequence_length, add_special_tokens=True)
        
        
        dataset_train = dataset_train.map(preprocess_function, batched=True)
        dataset_validation = dataset_validation.map(preprocess_function, batched=True)
        
       
        
        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            predictions = np.argmax(logits, axis=-1)    
            acc = accuracy_score(labels, predictions)        
            f1 = f1_score(labels, predictions, average='macro')   
            with open(log_file, 'a') as f:
                f.write(f'{model_name},{acc},{f1}\n')
            return {'accuracy': acc, 'f1_score': f1}

            
        epochs = 10
        save_steps = 10000 #save checkpoint every 10000 steps
        batch_size = 64
        
        training_args = TrainingArguments(
            output_dir = 'bert/',
            overwrite_output_dir=True,
            num_train_epochs = epochs,
            per_device_train_batch_size = batch_size,
            per_device_eval_batch_size = batch_size,
            save_steps = save_steps,
            save_total_limit = 1, #only save the last 5 checkpoints
            fp16=True,
            learning_rate = 5e-5,  # 5e-5 is the default
            logging_steps = 200, #50_000
            evaluation_strategy = 'steps',
            # evaluate_during_training = True,
            eval_steps = 200
            
        )
        
        trainer = Trainer(
            model = model,
            args = training_args,
            # data_collator=data_collator,
            train_dataset=dataset_train,
            eval_dataset=dataset_validation,
            compute_metrics = compute_metrics
        )
        
        
        # trainer.train(resume_from_checkpoint=True)
        trainer.train()


results = pd.read_csv(log_file)

best_results = results.groupby('Model', as_index=False)['F1'].max()

best_results = pd.merge(best_results, results, on=['Model', 'F1'])
best_results = best_results[['Model', 'Accuracy', 'F1']]
best_results = best_results.drop_duplicates()
best_results.to_csv('companiesReviews_final_1.csv')
display(best_results)



2024-07-28 04:44:59.354240: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-28 04:44:59.379579: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Index(['review_description', 'rating', 'company'], dtype='object')

Unnamed: 0,review_description,rating,company
0,رائع,positive,talbat
1,برنامج رائع جدا يساعد على تلبيه الاحتياجات بشكل اسرع,positive,talbat
2,التطبيق لا يغتح دائما بيعطيني لا يوجد اتصال بالشبكة..مع انه النت عندي تمام شو الحل??,negative,talbat
3,لماذا لا يمكننا طلب من ماكدونالدز؟,negative,talbat


{'Ezz Steel',
 'Raya',
 'TMG',
 'capiter',
 'domty',
 'elsewedy',
 'hilton',
 'nestle',
 'swvl',
 'talbat',
 'telecom_egypt',
 'venus'}




{'negative', 'neutral', 'positive'}

3

40045

DatasetDict({
    train: Dataset({
        features: ['review_description', 'label', '__index_level_0__'],
        num_rows: 32036
    })
    test: Dataset({
        features: ['review_description', 'label', '__index_level_0__'],
        num_rows: 8009
    })
})

aubmindlab/bert-base-arabertv02-twitter, try:0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02-twitter and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/32036 [00:00<?, ? examples/s]

Map:   0%|          | 0/8009 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.4384,0.390448,0.865401,0.588966
400,0.4047,0.382208,0.870146,0.59053
600,0.3596,0.404166,0.861406,0.630518
800,0.3286,0.404792,0.867899,0.617208
1000,0.3265,0.41434,0.865152,0.645271
1200,0.2514,0.4392,0.859283,0.626693
1400,0.2509,0.465698,0.856162,0.637656
1600,0.219,0.514569,0.845174,0.645139
1800,0.1774,0.512981,0.851167,0.630685
2000,0.1809,0.511532,0.847796,0.647198


aubmindlab/bert-base-arabertv02-twitter, try:1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02-twitter and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/32036 [00:00<?, ? examples/s]

Map:   0%|          | 0/8009 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.4383,0.391038,0.864278,0.591335
400,0.4054,0.383009,0.867774,0.590645
600,0.3605,0.402159,0.862155,0.622214
800,0.3291,0.405096,0.8669,0.599662
1000,0.3289,0.418595,0.864652,0.649358
1200,0.2532,0.451627,0.861031,0.627093
1400,0.2532,0.465363,0.859783,0.632285
1600,0.2223,0.491431,0.855413,0.638362
1800,0.1774,0.504707,0.852041,0.630624
2000,0.1822,0.523698,0.84043,0.657298


aubmindlab/bert-base-arabertv02-twitter, try:2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02-twitter and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/32036 [00:00<?, ? examples/s]

Map:   0%|          | 0/8009 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.4383,0.391038,0.864278,0.591335
400,0.4054,0.383009,0.867774,0.590645
600,0.3605,0.402159,0.862155,0.622214
800,0.3291,0.405096,0.8669,0.599662
1000,0.3289,0.418595,0.864652,0.649358
1200,0.2532,0.451627,0.861031,0.627093
1400,0.2532,0.465363,0.859783,0.632285
1600,0.2223,0.491431,0.855413,0.638362
1800,0.1774,0.504707,0.852041,0.630624
2000,0.1822,0.523698,0.84043,0.657298


CAMeL-Lab/bert-base-arabic-camelbert-da, try:0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-da and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/32036 [00:00<?, ? examples/s]

Map:   0%|          | 0/8009 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.478,0.431658,0.852041,0.577338
400,0.4303,0.408524,0.857036,0.580333
600,0.3769,0.423543,0.854039,0.624335
800,0.3395,0.438977,0.85791,0.610704
1000,0.3357,0.445913,0.85329,0.609731
1200,0.2371,0.521704,0.84917,0.600896
1400,0.2398,0.510852,0.847921,0.652587
1600,0.1925,0.640822,0.838931,0.632751
1800,0.1526,0.594183,0.8448,0.620253
2000,0.1642,0.569784,0.841304,0.628774


CAMeL-Lab/bert-base-arabic-camelbert-da, try:1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-da and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/32036 [00:00<?, ? examples/s]

Map:   0%|          | 0/8009 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.478,0.431658,0.852041,0.577338
400,0.4303,0.408524,0.857036,0.580333
600,0.3769,0.423543,0.854039,0.624335
800,0.3395,0.438977,0.85791,0.610704
1000,0.3357,0.445913,0.85329,0.609731
1200,0.2371,0.521704,0.84917,0.600896
1400,0.2398,0.510852,0.847921,0.652587
1600,0.1925,0.640822,0.838931,0.632751
1800,0.1526,0.594183,0.8448,0.620253
2000,0.1642,0.569784,0.841304,0.628774


CAMeL-Lab/bert-base-arabic-camelbert-da, try:2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-da and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/32036 [00:00<?, ? examples/s]

Map:   0%|          | 0/8009 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.478,0.431658,0.852041,0.577338
400,0.4303,0.408524,0.857036,0.580333
600,0.3769,0.423543,0.854039,0.624335
800,0.3395,0.438977,0.85791,0.610704
1000,0.3357,0.445913,0.85329,0.609731
1200,0.2371,0.521704,0.84917,0.600896
1400,0.2398,0.510852,0.847921,0.652587
1600,0.1925,0.640822,0.838931,0.632751
1800,0.1526,0.594183,0.8448,0.620253
2000,0.1642,0.569784,0.841304,0.628774


qarib/bert-base-qarib, try:0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at qarib/bert-base-qarib and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/32036 [00:00<?, ? examples/s]

Map:   0%|          | 0/8009 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.4467,0.420492,0.859658,0.596979
400,0.41,0.386764,0.8664,0.605646
600,0.3573,0.400318,0.861156,0.626959
800,0.3249,0.416461,0.861156,0.600309
1000,0.3265,0.425425,0.862779,0.659683
1200,0.2326,0.478472,0.861531,0.643762
1400,0.2389,0.467766,0.858659,0.639273
1600,0.2068,0.549594,0.845424,0.654443
1800,0.1595,0.577002,0.845174,0.642455
2000,0.1644,0.553337,0.845424,0.646215


qarib/bert-base-qarib, try:1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at qarib/bert-base-qarib and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/32036 [00:00<?, ? examples/s]

Map:   0%|          | 0/8009 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.4467,0.420492,0.859658,0.596979
400,0.41,0.386764,0.8664,0.605646
600,0.3573,0.400318,0.861156,0.626959
800,0.3249,0.416461,0.861156,0.600309
1000,0.3265,0.425425,0.862779,0.659683
1200,0.2326,0.478472,0.861531,0.643762
1400,0.2389,0.467766,0.858659,0.639273
1600,0.2068,0.549594,0.845424,0.654443
1800,0.1595,0.577002,0.845174,0.642455
2000,0.1644,0.553337,0.845424,0.646215


qarib/bert-base-qarib, try:2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at qarib/bert-base-qarib and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/32036 [00:00<?, ? examples/s]

Map:   0%|          | 0/8009 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.4467,0.420492,0.859658,0.596979
400,0.41,0.386764,0.8664,0.605646
600,0.3573,0.400318,0.861156,0.626959
800,0.3249,0.416461,0.861156,0.600309
1000,0.3265,0.425425,0.862779,0.659683
1200,0.2326,0.478472,0.861531,0.643762
1400,0.2389,0.467766,0.858659,0.639273
1600,0.2068,0.549594,0.845424,0.654443
1800,0.1595,0.577002,0.845174,0.642455
2000,0.1644,0.553337,0.845424,0.646215


Unnamed: 0,Model,Accuracy,F1
0,CAMeL-Lab/bert-base-arabic-camelbert-da,0.847921,0.652587
3,aubmindlab/bert-base-arabertv02-twitter,0.842927,0.660584
5,qarib/bert-base-qarib,0.862779,0.659683
