In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1" 

import numpy as np
import tensorflow as tf
import pandas as pd
import pyarabic.araby as araby
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
import torch
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, Dataset, concatenate_datasets
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 1000)
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer, BertModel
from transformers.modeling_outputs import SequenceClassifierOutput
import numpy as np

log_file = 'companiesReviews_final_2.txt'
with open(log_file, 'w') as f:
    f.write('Model,Accuracy,F1\n')


df = pd.read_csv('datasets/EgyptianCompaniesReviewsSA/Final_Data.csv', encoding='utf-8', engine='python', sep='\t') #,  , quotechar="'"  , quoting=3
display(df.columns)
df.fillna('', inplace=True)

display(df[:4])

# 'review_description', 'rating', 'company'



df = df[df['review_description'] != '']

classes = set(df['company'].values)
display(classes)

print()

classes = set(df['rating'].values)
display(classes)



df['rating'] = df['rating'].astype('category')
df['label'] = df['rating'].cat.codes



df = df[['review_description', 'label']]


classes_num = len(classes)
display(classes_num)
display(len(df))


ds = Dataset.from_pandas(df)

ds = ds.train_test_split(test_size=0.2)
display(ds)

max_sequence_length = 128


class BertLSTMModel(nn.Module):
    def __init__(self, model_name, num_labels, hidden_dim=128, num_layers=2):
        super(BertLSTMModel, self).__init__()
        
        self.bert = AutoModel.from_pretrained(model_name)
        self.lstm = nn.LSTM(input_size=self.bert.config.hidden_size, 
                            hidden_size=hidden_dim, 
                            num_layers=num_layers, 
                            batch_first=True, 
                            bidirectional=True)
        self.classifier = nn.Linear(hidden_dim * 2, num_labels)
        self.num_labels = num_labels
    
    def forward(self, input_ids, attention_mask, labels=None):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = bert_outputs.last_hidden_state
        lstm_output, _ = self.lstm(sequence_output)
        lstm_output = lstm_output[:, -1, :]  
        logits = self.classifier(lstm_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
        )



class BertCNNModel(nn.Module):
    def __init__(self, model_name, num_labels, num_filters=100, filter_sizes=[3, 4, 5]):
        super(BertCNNModel, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (fs, self.bert.config.hidden_size)) for fs in filter_sizes
        ])
        self.classifier = nn.Linear(len(filter_sizes) * num_filters, num_labels)
        self.num_labels = num_labels

    
    def forward(self, input_ids, attention_mask, labels=None):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = bert_outputs.last_hidden_state.unsqueeze(1)  
        
        conv_outputs = [torch.relu(conv(sequence_output)).squeeze(3) for conv in self.convs]
        pooled_outputs = [torch.max(output, 2)[0] for output in conv_outputs]
        cat_output = torch.cat(pooled_outputs, 1)
        
        logits = self.classifier(cat_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
        )








for j in range(2):
    if j == 0:
        model_name = 'EgyBERT_lstm'    
    else:
        model_name = 'EgyBERT_cnn'
    
    for i in range(3):
        print(f'{model_name}, try:{i}')
              
        tokenizer = AutoTokenizer.from_pretrained('faisalq/EgyBERT')
        # model = AutoModelForSequenceClassification.from_pretrained(model_name,
        #                                                       num_labels=classes_num).to('cuda')                                                 

        if j == 0:
            model = BertLSTMModel(model_name='faisalq/EgyBERT', num_labels=classes_num).to('cuda')       
        else:
            model = BertCNNModel(model_name='faisalq/EgyBERT', num_labels=classes_num).to('cuda')
                                                
        dataset_train = ds['train']
        dataset_validation = ds['test']                                                    
        
      

        def preprocess_function(examples):
            return tokenizer(examples['review_description'], truncation=True, padding="max_length",
                            max_length=max_sequence_length, add_special_tokens=True)
        
        
        dataset_train = dataset_train.map(preprocess_function, batched=True)
        dataset_validation = dataset_validation.map(preprocess_function, batched=True)
        
       
        
        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            predictions = np.argmax(logits, axis=-1)    
            acc = accuracy_score(labels, predictions)        
            f1 = f1_score(labels, predictions, average='macro')   
            with open(log_file, 'a') as f:
                f.write(f'{model_name},{acc},{f1}\n')
            return {'accuracy': acc, 'f1_score': f1}

            
        epochs = 10
        save_steps = 10000 #save checkpoint every 10000 steps
        batch_size = 64
        
        training_args = TrainingArguments(
            output_dir = 'bert/',
            overwrite_output_dir=True,
            num_train_epochs = epochs,
            per_device_train_batch_size = batch_size,
            per_device_eval_batch_size = batch_size,
            save_steps = save_steps,
            save_total_limit = 1, #only save the last 5 checkpoints
            fp16=True,
            learning_rate = 5e-5,  # 5e-5 is the default
            logging_steps = 200, #50_000
            evaluation_strategy = 'steps',
            # evaluate_during_training = True,
            eval_steps = 200
            
        )
        
        trainer = Trainer(
            model = model,
            args = training_args,
            # data_collator=data_collator,
            train_dataset=dataset_train,
            eval_dataset=dataset_validation,
            compute_metrics = compute_metrics
        )
        
        
        # trainer.train(resume_from_checkpoint=True)
        trainer.train()


results = pd.read_csv(log_file)

best_results = results.groupby('Model', as_index=False)['F1'].max()

best_results = pd.merge(best_results, results, on=['Model', 'F1'])
best_results = best_results[['Model', 'Accuracy', 'F1']]
best_results = best_results.drop_duplicates()
best_results.to_csv('companiesReviews_final_2.csv')
display(best_results)



2024-09-02 04:46:35.497296: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-02 04:46:35.520587: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Index(['review_description', 'rating', 'company'], dtype='object')

Unnamed: 0,review_description,rating,company
0,رائع,positive,talbat
1,برنامج رائع جدا يساعد على تلبيه الاحتياجات بشكل اسرع,positive,talbat
2,التطبيق لا يغتح دائما بيعطيني لا يوجد اتصال بالشبكة..مع انه النت عندي تمام شو الحل??,negative,talbat
3,لماذا لا يمكننا طلب من ماكدونالدز؟,negative,talbat


{'Ezz Steel',
 'Raya',
 'TMG',
 'capiter',
 'domty',
 'elsewedy',
 'hilton',
 'nestle',
 'swvl',
 'talbat',
 'telecom_egypt',
 'venus'}




{'negative', 'neutral', 'positive'}

3

40045

DatasetDict({
    train: Dataset({
        features: ['review_description', 'label', '__index_level_0__'],
        num_rows: 32036
    })
    test: Dataset({
        features: ['review_description', 'label', '__index_level_0__'],
        num_rows: 8009
    })
})

EgyBERT_lstm, try:0


Map:   0%|          | 0/32036 [00:00<?, ? examples/s]

Map:   0%|          | 0/8009 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.694,0.428954,0.864278,0.585993
400,0.4195,0.397437,0.865027,0.586861
600,0.3738,0.4106,0.8664,0.588115
800,0.3463,0.388888,0.870271,0.59081
1000,0.3561,0.396905,0.867649,0.588408
1200,0.3003,0.415522,0.866275,0.589478
1400,0.2975,0.40807,0.866525,0.596127
1600,0.2659,0.433446,0.858035,0.595678
1800,0.2518,0.429826,0.863653,0.60359
2000,0.2589,0.436219,0.861156,0.62057


EgyBERT_lstm, try:1


Map:   0%|          | 0/32036 [00:00<?, ? examples/s]

Map:   0%|          | 0/8009 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.6687,0.417865,0.86253,0.584664
400,0.4967,0.511337,0.80909,0.539846
600,0.424,0.405968,0.865401,0.587454
800,0.367,0.393558,0.868398,0.589406
1000,0.3711,0.382655,0.87077,0.591092
1200,0.321,0.404404,0.8664,0.597644
1400,0.3267,0.404004,0.867774,0.601398
1600,0.2994,0.413261,0.8664,0.614452
1800,0.2794,0.411082,0.865401,0.614044
2000,0.281,0.424958,0.863529,0.614975


EgyBERT_lstm, try:2


Map:   0%|          | 0/32036 [00:00<?, ? examples/s]

Map:   0%|          | 0/8009 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.6687,0.417865,0.86253,0.584664
400,0.4967,0.511337,0.80909,0.539846
600,0.424,0.405968,0.865401,0.587454
800,0.367,0.393558,0.868398,0.589406
1000,0.3711,0.382655,0.87077,0.591092
1200,0.321,0.404404,0.8664,0.597644
1400,0.3267,0.404004,0.867774,0.601398
1600,0.2994,0.413261,0.8664,0.614452
1800,0.2794,0.411082,0.865401,0.614044
2000,0.281,0.424958,0.863529,0.614975


EgyBERT_cnn, try:0


Map:   0%|          | 0/32036 [00:00<?, ? examples/s]

Map:   0%|          | 0/8009 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.5886,0.418476,0.864278,0.586053
400,0.416,0.389923,0.868897,0.589934
600,0.3751,0.392862,0.869522,0.59033
800,0.3492,0.378219,0.873018,0.594573
1000,0.3554,0.381729,0.870271,0.609354
1200,0.3011,0.393483,0.865526,0.651341
1400,0.3039,0.406647,0.867025,0.636451
1600,0.2762,0.423165,0.862779,0.661289
1800,0.2595,0.404236,0.864777,0.640817
2000,0.2618,0.426643,0.865401,0.63354


EgyBERT_cnn, try:1


Map:   0%|          | 0/32036 [00:00<?, ? examples/s]

Map:   0%|          | 0/8009 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.5906,0.420486,0.863029,0.585027
400,0.416,0.391193,0.865027,0.586852
600,0.3685,0.3988,0.870521,0.594165
800,0.3411,0.377956,0.870521,0.599069
1000,0.3499,0.373387,0.87077,0.598141
1200,0.2923,0.39935,0.8669,0.651216
1400,0.2884,0.414575,0.868023,0.645504
1600,0.2566,0.43999,0.856661,0.667874
1800,0.2391,0.452309,0.859783,0.652476
2000,0.2405,0.43871,0.858909,0.656217


EgyBERT_cnn, try:2


Map:   0%|          | 0/32036 [00:00<?, ? examples/s]

Map:   0%|          | 0/8009 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
200,0.5902,0.427326,0.86203,0.584104
400,0.4174,0.388957,0.869647,0.590401
600,0.3681,0.396724,0.870895,0.593002
800,0.3421,0.376606,0.871894,0.606077
1000,0.3495,0.38225,0.869647,0.60599
1200,0.2914,0.407225,0.866026,0.64644
1400,0.2889,0.414365,0.869022,0.643518
1600,0.2617,0.42917,0.860532,0.66288
1800,0.2409,0.436701,0.862655,0.651736
2000,0.2497,0.44211,0.864777,0.654561


Unnamed: 0,Model,Accuracy,F1
0,EgyBERT_cnn,0.846548,0.668199
1,EgyBERT_lstm,0.845798,0.660073
