In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0" 

import numpy as np
import tensorflow as tf
import pandas as pd
import pyarabic.araby as araby
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
import torch
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, Dataset, concatenate_datasets
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 1000)
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer, BertModel
from transformers.modeling_outputs import SequenceClassifierOutput
import numpy as np

log_file = 'SDC_EDC_2.txt'
with open(log_file, 'w') as f:
    f.write('Model,Accuracy,F1\n')


dfs = pd.read_csv('datasets/SDC_EDC/SDC.txt', header=None, names=['text'], encoding='utf-8', engine='python')
dfe = pd.read_csv('datasets/SDC_EDC/EDC.txt', header=None, names=['text'], encoding='utf-8', engine='python')

dfs['label'] = 0
dfe['label'] = 1


df = pd.concat([dfs, dfe], ignore_index=True)
display(len(df))
 
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

display(len(df))
display(df.columns)
display(df[:4])

classes_num = 2
display(classes_num)

ds = Dataset.from_pandas(df)

ds = ds.train_test_split(test_size=0.2)
display(ds)

max_sequence_length = 128



class BertLSTMModel(nn.Module):
    def __init__(self, model_name, num_labels, hidden_dim=128, num_layers=2):
        super(BertLSTMModel, self).__init__()
        
        self.bert = AutoModel.from_pretrained(model_name)
        self.lstm = nn.LSTM(input_size=self.bert.config.hidden_size, 
                            hidden_size=hidden_dim, 
                            num_layers=num_layers, 
                            batch_first=True, 
                            bidirectional=True)
        self.classifier = nn.Linear(hidden_dim * 2, num_labels)
        self.num_labels = num_labels
    
    def forward(self, input_ids, attention_mask, labels=None):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = bert_outputs.last_hidden_state
        lstm_output, _ = self.lstm(sequence_output)
        lstm_output = lstm_output[:, -1, :]  
        logits = self.classifier(lstm_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
        )



class BertCNNModel(nn.Module):
    def __init__(self, model_name, num_labels, num_filters=100, filter_sizes=[3, 4, 5]):
        super(BertCNNModel, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (fs, self.bert.config.hidden_size)) for fs in filter_sizes
        ])
        self.classifier = nn.Linear(len(filter_sizes) * num_filters, num_labels)
        self.num_labels = num_labels

    
    def forward(self, input_ids, attention_mask, labels=None):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = bert_outputs.last_hidden_state.unsqueeze(1)  
        
        conv_outputs = [torch.relu(conv(sequence_output)).squeeze(3) for conv in self.convs]
        pooled_outputs = [torch.max(output, 2)[0] for output in conv_outputs]
        cat_output = torch.cat(pooled_outputs, 1)
        
        logits = self.classifier(cat_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
        )








for j in range(2):
    if j == 0:
        model_name = 'EgyBERT_lstm'    
    else:
        model_name = 'EgyBERT_cnn'
    
    for i in range(3):
        print(f'{model_name}, try:{i}')
              
        tokenizer = AutoTokenizer.from_pretrained('faisalq/EgyBERT')
        # model = AutoModelForSequenceClassification.from_pretrained(model_name,
        #                                                       num_labels=classes_num).to('cuda')                                                 

        if j == 0:
            model = BertLSTMModel(model_name='faisalq/EgyBERT', num_labels=classes_num).to('cuda')       
        else:
            model = BertCNNModel(model_name='faisalq/EgyBERT', num_labels=classes_num).to('cuda')
       
                                                         
        dataset_train = ds['train']
        dataset_validation = ds['test']                                                    
        
      

        def preprocess_function(examples):
            return tokenizer(examples['text'], truncation=True, padding="max_length",
                            max_length=max_sequence_length)
        
        
        dataset_train = dataset_train.map(preprocess_function, batched=True)
        dataset_validation = dataset_validation.map(preprocess_function, batched=True)
        
       
        
        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            predictions = np.argmax(logits, axis=-1)    
            acc = accuracy_score(labels, predictions)        
            f1 = f1_score(labels, predictions, average='macro')   
            with open(log_file, 'a') as f:
                f.write(f'{model_name},{acc},{f1}\n')
            return {'accuracy': acc, 'f1_score': f1}


        
        
        epochs = 5
        save_steps = 10000 #save checkpoint every 10000 steps
        batch_size = 64
        
        training_args = TrainingArguments(
            output_dir = 'bert/',
            overwrite_output_dir=True,
            num_train_epochs = epochs,
            per_device_train_batch_size = batch_size,
            per_device_eval_batch_size = batch_size,
            save_steps = save_steps,
            save_total_limit = 1, #only save the last 5 checkpoints
            fp16=True,
            learning_rate = 5e-5,  # 5e-5 is the default
            logging_steps = 50, #50_000
            evaluation_strategy = 'steps',
            # evaluate_during_training = True,
            eval_steps = 50
            
        )
        
        trainer = Trainer(
            model = model,
            args = training_args,
            # data_collator=data_collator,
            train_dataset=dataset_train,
            eval_dataset=dataset_validation,
            compute_metrics = compute_metrics
        )
        
        
        # trainer.train(resume_from_checkpoint=True)
        trainer.train()


results = pd.read_csv(log_file)

best_results = results.groupby('Model', as_index=False)['F1'].max()

best_results = pd.merge(best_results, results, on=['Model', 'F1'])
best_results = best_results[['Model', 'Accuracy', 'F1']]
best_results = best_results.drop_duplicates()
best_results.to_csv('SDC_EDC_results_2.csv')
display(best_results)



2024-09-02 21:26:00.087393: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-02 21:26:00.110817: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


28613

28613

Index(['text', 'label'], dtype='object')

Unnamed: 0,text,label
0,لما تـسيب الفلاني و المسلسل الفلاني والأغاني اللي بتحبها عشان ربنا وبس,1
1,وانت محل ما اروح ورايااا انا ايش قلت عنك دحين,0
2,فاذا حد مرت عليه نفس السالفة يفيدني بالله اديني خبر اول ما تدخل لأَنِّي انا عندي نفس المشكله ولي سمعتوا عادي لو باقي يوم حتا مين جرب تعرفون احد جاي الله يعافيكم الي يعرف ايش الحل مع العلم انو جاني لاكن يوم ارجع اشيك انا كمان اسأل رجع تاني ع يا جماعة الخير إنّو ما فيها اول شي انصحك تبعدين عن,0
3,وفي مجلس الحريم بعد ماطلعت كلمت وقالتلها على الموضوع فرحت ورحبت بالموضوع لانه واهله ناس طيبين ومعروفين بأخلاقهم واصلهم وقالتلها انها تستشير وترد لهم خبر,0


2

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 22890
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5723
    })
})

EgyBERT_lstm, try:0


Map:   0%|          | 0/22890 [00:00<?, ? examples/s]

Map:   0%|          | 0/5723 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
50,0.6429,0.53566,0.952298,0.952298
100,0.4066,0.251225,0.964529,0.964508
150,0.2101,0.154406,0.962258,0.962237
200,0.1513,0.110932,0.968198,0.968149
250,0.13,0.095798,0.971169,0.971155
300,0.1169,0.086764,0.970645,0.970607
350,0.1011,0.080338,0.972742,0.972737
400,0.0699,0.093915,0.972217,0.972215
450,0.0526,0.073518,0.97676,0.976744
500,0.0509,0.067771,0.977984,0.977968


EgyBERT_lstm, try:1


Map:   0%|          | 0/22890 [00:00<?, ? examples/s]

Map:   0%|          | 0/5723 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
50,0.6318,0.506065,0.956142,0.956133
100,0.3909,0.260553,0.95055,0.950373
150,0.1945,0.138061,0.966975,0.966963
200,0.1506,0.103468,0.969072,0.969047
250,0.1222,0.111336,0.968373,0.968372
300,0.1196,0.087415,0.968897,0.968857
350,0.1097,0.083696,0.971518,0.971511
400,0.0749,0.079783,0.970994,0.970982
450,0.0545,0.076208,0.973965,0.973961
500,0.0528,0.072709,0.978857,0.978848


EgyBERT_lstm, try:2


Map:   0%|          | 0/22890 [00:00<?, ? examples/s]

Map:   0%|          | 0/5723 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
50,0.6318,0.506065,0.956142,0.956133
100,0.3909,0.260553,0.95055,0.950373
150,0.1945,0.138061,0.966975,0.966963
200,0.1506,0.103468,0.969072,0.969047
250,0.1222,0.111336,0.968373,0.968372
300,0.1196,0.087415,0.968897,0.968857
350,0.1097,0.083696,0.971518,0.971511
400,0.0749,0.079783,0.970994,0.970982
450,0.0545,0.076208,0.973965,0.973961
500,0.0528,0.072709,0.978857,0.978848


EgyBERT_cnn, try:0


Map:   0%|          | 0/22890 [00:00<?, ? examples/s]

Map:   0%|          | 0/5723 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
50,0.5162,0.304107,0.958064,0.958041
100,0.2453,0.16442,0.965927,0.965893
150,0.1613,0.124326,0.967325,0.967309
200,0.145,0.126608,0.962432,0.962348
250,0.1258,0.088594,0.971868,0.971848
300,0.1053,0.081201,0.973615,0.973589
350,0.1008,0.072104,0.97379,0.973774
400,0.0689,0.110803,0.96383,0.963816
450,0.1018,0.096424,0.951774,0.951772
500,0.0522,0.075755,0.975363,0.975351


EgyBERT_cnn, try:1


Map:   0%|          | 0/22890 [00:00<?, ? examples/s]

Map:   0%|          | 0/5723 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
50,0.5182,0.305301,0.956491,0.956464
100,0.2561,0.183427,0.961733,0.961718
150,0.1861,0.140126,0.961209,0.961209
200,0.1419,0.111486,0.968024,0.967987
250,0.1139,0.090917,0.969771,0.969761
300,0.0988,0.082984,0.972043,0.972026
350,0.1036,0.076198,0.972217,0.972203
400,0.0697,0.078809,0.971868,0.97186
450,0.0606,0.079608,0.97379,0.973784
500,0.0496,0.073496,0.976411,0.976404


EgyBERT_cnn, try:2


Map:   0%|          | 0/22890 [00:00<?, ? examples/s]

Map:   0%|          | 0/5723 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy,F1 Score
50,0.5206,0.314757,0.954395,0.954325
100,0.2475,0.190659,0.954395,0.954244
150,0.1707,0.130811,0.965403,0.965373
200,0.136,0.102789,0.969771,0.969726
250,0.1141,0.089552,0.970295,0.970269
300,0.0961,0.077073,0.974139,0.974125
350,0.0924,0.076,0.972043,0.972036
400,0.0633,0.094878,0.969247,0.969245
450,0.0508,0.070082,0.975887,0.975881
500,0.0446,0.073459,0.978333,0.978323


Unnamed: 0,Model,Accuracy,F1
0,EgyBERT_cnn,0.981478,0.981469
1,EgyBERT_lstm,0.979906,0.979898
