### Installing required libraries

In [None]:
!pip install scikit-learn
!pip install torch
!pip install datasets
!pip install evaluate
!pip install transformers
!pip install sacremoses sentencepiece

### Suppressing warnings (any warnings raised will be ignored)

In [None]:
import warnings
# Install a blanket "ignore" filter: no warning category or module is excluded,
# so every warning raised after this cell runs is suppressed.
# NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

#### Preparing the dataset for training the model

In this class, we will use the AraBERTv02 tokenizer for Arabic and the RoBERTa tokenizer for English.

In [None]:
import csv
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

class EnsembleDataset(Dataset):
    """Dataset that tokenizes every text twice: language-specific and multilingual.

    Each item is encoded once with the tokenizer that matches ``lang`` (AraBERTv02
    for ``'ar'``, RoBERTa for ``'en'``) and once with the multilingual BERT
    tokenizer, so both branches of the ensemble model receive their own inputs.

    Args:
        text_data: indexable collection of raw text strings.
        labels: indexable collection of labels, aligned with ``text_data``.
        lang: ``'en'`` or ``'ar'``; selects the language-specific tokenizer.
        max_length: number of tokens every sequence is padded/truncated to.

    Raises:
        ValueError: if ``lang`` is not one of the supported language codes.
    """

    SUPPORTED_LANGS = ('ar', 'en')

    def __init__(self, text_data, labels, lang='en', max_length=256):
        # Fail before any tokenizer is loaded instead of raising a KeyError on
        # the first __getitem__ call.
        if lang not in self.SUPPORTED_LANGS:
            raise ValueError(f"Unsupported lang {lang!r}; expected one of {self.SUPPORTED_LANGS}")
        self.text_data = text_data
        self.labels = labels
        self.lang = lang
        self.max_length = max_length
        self.bertar = AutoTokenizer.from_pretrained('aubmindlab/bert-base-arabertv02') # AraBERT tokenizer for Arabic
        self.berten = AutoTokenizer.from_pretrained('roberta-base') # RoBERTa tokenizer for English
        # only the tokenizer matching `lang` is used for the language-specific branch
        self.lsm = {'ar': self.bertar,
                    'en': self.berten
                    }
        # the multilingual BERT tokenizer is used for both Arabic and English
        self.mbert = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased')

    def __len__(self):
        return len(self.labels)

    def _encode(self, tokenizer, text):
        """Tokenize ``text`` and return 1-D ``(input_ids, attention_mask)`` tensors."""
        # Calling the tokenizer directly is the supported replacement for encode_plus.
        # With return_tensors='pt' a leading batch axis of size 1 is returned, which
        # is squeezed away so the DataLoader can stack items itself.
        encoded = tokenizer(text, add_special_tokens=True,
                            max_length=self.max_length, padding='max_length',
                            return_attention_mask=True, truncation=True, return_tensors='pt')
        return encoded['input_ids'].squeeze(0), encoded['attention_mask'].squeeze(0)

    def __getitem__(self, index):
        """Return ``(lsm_ids, lsm_mask, mbert_ids, mbert_mask, label)`` for one example."""
        text = self.text_data[index] # text data based on the index
        label = self.labels[index] # corresponding label of the text data
        lsm_ids, lsm_mask = self._encode(self.lsm[self.lang], text)
        mbert_ids, mbert_mask = self._encode(self.mbert, text)
        # the label is converted to a tensor so it can be batched with the inputs
        return (lsm_ids, lsm_mask, mbert_ids, mbert_mask, torch.tensor(label))


### Defining the proposed ensemble model

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
from transformers import AutoModel

# Define the Ensemble classification model
class EnsembleClassifier(nn.Module):
    """Fuses a language-specific encoder with multilingual BERT for classification.

    The pooled output of the language-specific encoder (AraBERTv02 for ``'ar'``,
    RoBERTa for ``'en'``) is concatenated with the pooled output of multilingual
    BERT, passed through a linear fusion layer and projected to ``num_classes``.

    Args:
        num_classes: size of the output layer.
    """

    def __init__(self, num_classes=3):
        super(EnsembleClassifier, self).__init__()

        # Loading pretrained language models
        # AraBERTv02 is for Arabic language and RoBERTa for English
        self.bertar = AutoModel.from_pretrained('aubmindlab/bert-base-arabertv02') # ArabicBERT
        self.berten = AutoModel.from_pretrained('roberta-base') # Loading RoBERTa for English

        # Language code -> encoder lookup used by forward(). Both encoders are
        # also assigned as attributes above, which is what registers their
        # parameters with this module.
        self.lsm = {'ar': self.bertar,
                    'en': self.berten
                    }

        self.lsm_drop = nn.Dropout(0.1) # language specific dropout

        # multilingual BERT for both languages
        self.mbert = AutoModel.from_pretrained('bert-base-multilingual-uncased')
        self.mbert_drop = nn.Dropout(0.3)

        # Fusion layer: input is the two pooled outputs (768 features each) concatenated
        self.fusion_fc = nn.Linear(768*2, 128)

        # Output layer
        self.output_fc = nn.Linear(128, num_classes)
        # Not applied inside forward(); apply it to the returned logits when
        # class probabilities are needed.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, lang, in_1, attn_1, in_2, attn_2):
        """Return unnormalized class scores (logits), one row per example.

        Args:
            lang: ``'en'`` or ``'ar'``; selects the language-specific encoder.
            in_1, attn_1: input ids / attention mask for the language-specific encoder.
            in_2, attn_2: input ids / attention mask for multilingual BERT.

        Returns:
            Tensor with ``num_classes`` columns holding raw logits. Logits, not
            probabilities, are returned because ``nn.CrossEntropyLoss`` applies
            log-softmax itself; the argmax is the same either way.

        Raises:
            KeyError: if ``lang`` is not a key of ``self.lsm``.
        """
        # pooled representation from the language-specific encoder
        lsm_output = self.lsm[lang](input_ids=in_1, attention_mask=attn_1)
        lsm_output = self.lsm_drop(lsm_output.pooler_output)

        # pooled representation from multilingual BERT
        mbert_output = self.mbert(input_ids=in_2, attention_mask=attn_2)
        mbert_output = self.mbert_drop(mbert_output.pooler_output)

        # Concatenate the pooled outputs of BOTH encoders along the feature axis
        features = torch.cat((lsm_output, mbert_output), dim=1)

        # Fusion layer
        features = self.fusion_fc(features)

        return self.output_fc(features)

Each training step feeds 24 instances into the model, so we set the batch size to 24.

In [None]:
# Number of examples per mini-batch; passed as `batch_size` to every DataLoader
# built below and read by the progress print inside train().
BATCH_SIZE = 24

### Training, Validation, and Evaluation loop

In [None]:
import gc
# Define the training and testing and Evaluation functions
def train(model, train_loader, criterion, optimizer, device, lang='en'):
    """Run one training pass over ``train_loader`` and update ``model`` in place.

    Args:
        model: callable as ``model(lang, input_1, attn_1, input_2, attn_2)``.
        train_loader: iterable yielding ``(input_1, attn_1, input_2, attn_2, labels)``.
        criterion: loss function applied to ``(output, labels)``.
        optimizer: optimizer stepped once per batch.
        device: device every batch tensor is moved to.
        lang: language code forwarded to the model.

    Returns:
        ``(train_loss, correct)``: the loss summed over all processed examples
        (not averaged) and the number of correct predictions.
    """
    model.train()
    train_loss = 0.0
    correct = 0
    seen = 0  # examples processed so far; used to normalise the progress report
    for step, (input_1, attn_1, input_2, attn_2, labels) in enumerate(train_loader, start=1):
        optimizer.zero_grad()
        # copy the values of each batch to the available device (CPU or GPU)
        input_1 = input_1.to(device)
        input_2 = input_2.to(device)
        attn_1 = attn_1.to(device)
        attn_2 = attn_2.to(device)
        labels = labels.to(device)
        # passing to the model
        output = model(lang, input_1, attn_1, input_2, attn_2)
        # calculating the loss for a batch
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step() # stepping the optimizer
        n_batch = labels.size(0)
        # weight by the batch size so the running total is a per-example sum
        train_loss += loss.item() * n_batch
        _, predicted = torch.max(output, 1) # getting the prediction
        correct += (predicted == labels).sum().item()
        seen += n_batch
        if step % 100 == 0:
            # Normalise by the examples actually seen rather than a global batch
            # size, so the report stays right for any loader configuration.
            print(f'{seen}/{len(train_loader.dataset)}\tTrain Loss: {train_loss/seen:.3f} | Train Acc: {(correct/seen)*100:.2f}%')
    return train_loss, correct

def test(model, test_loader, criterion, device, lang='en'):
    """
    This function we'll use to evaluate the performance on development data
    """
    model.eval()
    test_loss = 0.0
    correct = 0
    with torch.no_grad():
        for input_1, attn_1, input_2, attn_2, labels in test_loader:
            #optimizer.zero_grad()
            # copy the values of each batch to the available device (CPU or GPU)
            input_1 = input_1.to(device)
            input_2 = input_2.to(device)
            attn_1 = attn_1.to(device)
            attn_2 = attn_2.to(device)
            labels = labels.to(device)
            # passing to the model
            output = model(lang, input_1, attn_1, input_2, attn_2)
            loss = criterion(output, labels)
            test_loss += loss.item() * labels.size(0)
            _, predicted = torch.max(output, 1)
            correct += (predicted == labels).sum().item()
    return test_loss, correct

def evaluate(model, test_loader, device, ep, g_labels, out_dir='/kaggle/working'):
    """Predict on the English and Arabic test loaders, save and score the result.

    Args:
        model: callable as ``model(lang, input_1, attn_1, input_2, attn_2)``.
        test_loader: pair ``[english_loader, arabic_loader]``; English batches
            are processed first, then Arabic.
        device: device every input tensor is moved to.
        ep: epoch tag used in the name of the predictions file.
        g_labels: gold labels, English followed by Arabic, in loader order.
        out_dir: directory the ``pred_ep_{ep}.txt`` file is written to.

    Returns:
        List with one tensor of predicted class indices per batch.
    """
    import os  # local import so this cell does not depend on a later cell's imports

    model.eval()
    en_test_loader, ar_test_loader = test_loader[0], test_loader[1]
    predictions = []
    with torch.no_grad():
        # identical loop for both languages; only the language code differs
        for lang, loader in (('en', en_test_loader), ('ar', ar_test_loader)):
            for input_1, attn_1, input_2, attn_2, _ in loader:
                output = model(lang, input_1.to(device), attn_1.to(device),
                               input_2.to(device), attn_2.to(device))
                _, predicted = torch.max(output, 1)
                predictions.append(predicted)

    # flatten the per-batch tensors into one list of Python ints
    y_test_pred = [label for batch in predictions for label in batch.tolist()]

    # Writing the predictions to a text file, one label per line
    with open(os.path.join(out_dir, f'pred_ep_{ep}.txt'), 'w') as f:
        f.writelines(f'{label}\n' for label in y_test_pred)

    print(f'Printing Results for ***Epoch: {ep}***.......')
    # Display names for the report must be strings, one per class that occurs in
    # either the gold or the predicted labels.
    target_labels = [str(label) for label in sorted(set(g_labels) | set(y_test_pred))]
    y_true = g_labels
    # calculating the performances on test data
    acc, precision, recall, F1, report = calculate_performance(y_true, y_test_pred, target_labels)
    result = f"{acc:.4f}\t{precision:.4f}\t{recall:.4f}\t{F1:.4f}\n"

    print("Test set:\t Acc\tPrecision\tRecall\tF1\n" + result)
    print(report)
    return predictions

### Reading the data from files

In [None]:
import os
def read_dataset(file_loc, delim=',', lang='en'):
    """Read texts and integer labels from a delimited file with a header row.

    The first row is skipped as a header. For every following non-empty row,
    column 1 is taken as the text and column 2 as the integer label.

    Args:
        file_loc: path of the file to read.
        delim: field delimiter.
        lang: unused; kept so existing calls keep working.

    Returns:
        ``(data, labels)``: list of text strings and list of ``int`` labels.

    Raises:
        IndexError: if a non-empty row has fewer than three columns.
        ValueError: if a label cannot be parsed as an integer.
    """
    data = []
    labels = []
    # Explicit UTF-8 keeps Arabic text readable regardless of the platform default;
    # newline='' is what the csv module expects so quoted newlines survive.
    with open(file_loc, encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter=delim)
        next(reader, None)  # skip the header; tolerate a completely empty file
        for row in reader:
            if not row:
                # csv yields [] for blank lines (e.g. a trailing newline)
                continue
            data.append(row[1])
            labels.append(int(row[2]))
    return data, labels

### Defining evaluation metric

In [None]:
from sklearn import metrics

def calculate_performance(y_true, y_pred, labels):
    """
    Calculating performances of our model
    :param y_true: actual labels in test set
    :param y_pred: predicted labels
    :param labels: display names for the classes in the report, in sorted class
        order; values are converted to ``str``. ``None`` lets the report derive
        the names from the data.
    :return: accuracy, weighted precision, weighted recall, macro F1 (all as
        percentages) and the classification report
    """
    acc = metrics.accuracy_score(y_true, y_pred)
    P = metrics.precision_score(y_true, y_pred, average='weighted')
    R = metrics.recall_score(y_true, y_pred, average='weighted')
    F1 = metrics.f1_score(y_true, y_pred, average='macro')
    # target_names must be strings; coerce so integer class ids can be passed too
    target_names = None if labels is None else [str(label) for label in labels]
    report = metrics.classification_report(y_true, y_pred, target_names=target_names, digits=4)

    return acc * 100, P * 100, R * 100, F1 * 100, report

### Reading local files and preparing the data for training

In [None]:
#%cd ../1a_en/
train_data_file = '/kaggle/input/semeval17/english/train.csv'
dev_data_file = '/kaggle/input/semeval17/english/dev.csv'
test_data_file = '/kaggle/input/semeval17/english/test.csv'
file_delim = ','
train_data_file_ar = '/kaggle/input/semeval17/arabic/train.csv'
dev_data_file_ar = '/kaggle/input/semeval17/arabic/dev.csv'
test_data_file_ar = '/kaggle/input/semeval17/arabic/test.csv'


def _load_split(file_loc, lang, shuffle=False, drop_last=False):
    """Read one CSV split and wrap it in a DataLoader.

    Returns ``(texts, labels, loader)`` where ``loader`` batches an
    ``EnsembleDataset`` built for ``lang`` with ``BATCH_SIZE`` examples per batch.
    """
    texts, labels = read_dataset(file_loc, file_delim)
    dataset = EnsembleDataset(texts, labels, lang)
    loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE,
                                         shuffle=shuffle, drop_last=drop_last)
    return texts, labels, loader


# NOTE: the `*_dataset` names below hold DataLoader objects; the names are kept
# because later cells refer to them.

# Training data: shuffled, and the last incomplete batch is dropped.
print(f"Reading Train file: {train_data_file}")
en_data, en_labels, en_train_dataset = _load_split(train_data_file, 'en', shuffle=True, drop_last=True)
ar_data, ar_labels, ar_train_dataset = _load_split(train_data_file_ar, 'ar', shuffle=True, drop_last=True)
print(f'Total examples in arabic train set: {len(ar_data)}')
print(f'Total examples in english train set: {len(en_data)}')

# Development data: every example is scored exactly once, so nothing is dropped
# and no shuffling is needed. Dropping the last batch here would make the
# dev loss/accuracy (which are divided by the full dataset size) too low.
print(f"Reading Dev file: {dev_data_file}")
en_data, en_labels, en_dev_dataset = _load_split(dev_data_file, 'en')
ar_data, ar_labels, ar_dev_dataset = _load_split(dev_data_file_ar, 'ar')

# Test data: kept in file order so predictions line up with the gold labels.
print(f"Reading Test file: {test_data_file}")
en_data, en_labels, en_test_dataset = _load_split(test_data_file, 'en')
ar_data, ar_labels, ar_test_dataset = _load_split(test_data_file_ar, 'ar')
# gold test labels, English first then Arabic
ts_labels = en_labels + ar_labels


### Defining model, loss function, and optimizers

In [None]:
import torch
from torch import nn
from torch import optim

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Three-class ensemble classifier, moved onto the selected device.
model = EnsembleClassifier(num_classes=3).to(device)

# Multi-class cross-entropy loss, optimised with Adam at a small fine-tuning rate.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=2e-5)

### Train and validate the model

In [None]:
def _samples_seen(loader):
    """Number of examples a DataLoader yields in one full pass."""
    # With drop_last=True the trailing incomplete batch is skipped, so fewer
    # examples than len(loader.dataset) are actually processed.
    if loader.drop_last:
        return len(loader) * loader.batch_size
    return len(loader.dataset)


num_epochs = 2
for epoch in range(num_epochs):
    print(f'Epoch {epoch+1}/{num_epochs}')
    # one pass over the English data followed by one pass over the Arabic data
    en_train_loss, en_corr = train(model, en_train_dataset, criterion, optimizer, device,'en')
    ar_train_loss, ar_corr = train(model, ar_train_dataset, criterion, optimizer, device,'ar')
    # train()/test() return per-example sums, so divide by the examples processed
    n_train = _samples_seen(en_train_dataset) + _samples_seen(ar_train_dataset)
    train_loss = (en_train_loss + ar_train_loss) / n_train
    train_acc = (en_corr + ar_corr) / n_train
    en_dev_loss, en_corr = test(model, en_dev_dataset, criterion, device, 'en')
    ar_dev_loss, ar_corr = test(model, ar_dev_dataset, criterion, device, 'ar')
    n_dev = _samples_seen(en_dev_dataset) + _samples_seen(ar_dev_dataset)
    dev_loss = (en_dev_loss + ar_dev_loss) / n_dev
    dev_acc = (en_corr + ar_corr) / n_dev
    print('Epoch {}/{}: Train Loss = {:.4f}, Accuracy = {:.4f}, Dev Loss = {:.4f}, Dev Accuracy = {:.4f}'.format(epoch+1, num_epochs, train_loss, train_acc, dev_loss, dev_acc))


### Evaluate the model

In [None]:
# `epoch` is the loop variable left over from the training loop, so the
# predictions file is tagged with the zero-based index of the last epoch run.
# The loaders are passed English first, Arabic second, matching `ts_labels`.
test_prediction = evaluate(model, [en_test_dataset, ar_test_dataset], device, epoch, ts_labels)