### Reading the gold test labels for Arabic and English

In [1]:
import csv


en_gold_filepath = './data/english/test.csv'
ar_gold_filepath = './data/arabic/test.csv'


def _read_gold_labels(filepath, label_column=2):
  """Return the ``label_column`` value of every data row in a CSV file.

  The first row is treated as a header and skipped; completely blank rows
  are ignored.

  :param filepath: path of the CSV file to read
  :param label_column: zero-based index of the column holding the label
  :return: list of label strings, in file order
  """
  # newline='' is what the csv module asks for so that quoted fields with
  # embedded line breaks are parsed correctly. The explicit encoding keeps
  # the read independent of the platform default.
  # NOTE(review): assumes the CSV files are UTF-8 encoded - confirm.
  with open(filepath, newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row (tolerates an empty file)
    return [row[label_column] for row in reader if row]


en_gold_labels = _read_gold_labels(en_gold_filepath)
ar_gold_labels = _read_gold_labels(ar_gold_filepath)

### Defining evaluation metrics

In [2]:
from sklearn import metrics

def calculate_performance(y_true, y_pred, labels):
    """
    Score a set of predictions against the gold labels.

    The four numeric scores are the sklearn values multiplied by 100.
    Precision and recall are computed with ``average='weighted'`` while
    F1 is computed with ``average='macro'``.

    :param y_true: actual labels in test set
    :param y_pred: predicted labels
    :param labels: names forwarded to ``classification_report`` as
                   ``target_names``
    :return: tuple ``(accuracy, precision, recall, f1, report)``; the
             report is whatever ``classification_report`` returns when
             called with ``digits=4``
    """
    accuracy = metrics.accuracy_score(y_true, y_pred)
    precision = metrics.precision_score(y_true, y_pred, average='weighted')
    recall = metrics.recall_score(y_true, y_pred, average='weighted')
    macro_f1 = metrics.f1_score(y_true, y_pred, average='macro')
    report = metrics.classification_report(
        y_true, y_pred, target_names=labels, digits=4
    )

    # Express the four scores as percentages; the report is passed through as is.
    percentages = tuple(
        score * 100 for score in (accuracy, precision, recall, macro_f1)
    )
    return (*percentages, report)

### Generate performance reports for the fine-tuned models

In [3]:
# Identifiers of the English models; each one is used as the directory
# prefix of '<name>/predict_results.txt' when predictions are read.
en_models = ['robert-en', 'mbert-en', 'xlm-en']
# Same, for the Arabic models.
ar_models = ['arabert-ar', 'mbert-ar', 'xlm-ar']

In [4]:

# Evaluate every fine-tuned model against the gold labels of its own language.
for models, gold_labels in ((en_models, en_gold_labels), (ar_models, ar_gold_labels)):
  for model in models:
    print(f'Reading predicted data for: {model}')
    # Use a context manager so the file handle is closed right after reading.
    with open(model + '/predict_results.txt') as f:
      lines = f.read().strip().split('\n')

    # The first line is skipped as a header; each remaining line is split on
    # a tab into two fields, of which only the label is kept. The id field is
    # discarded rather than bound to a name that shadows the builtin `id`.
    predicted_labels = []
    for line in lines[1:]:
      _, label = line.split('\t')
      predicted_labels.append(label.strip())

    acc, precision, recall, F1, report = calculate_performance(gold_labels, predicted_labels, ['0', '1', '2'])
    result = f"{acc:.4f}\t{precision:.4f}\t{recall:.4f}\t{F1:.4f}\n"

    print("Test set:\t Acc\tPrecision\tRecall\tF1\n" + result)
    print(report)
    print('-'*25)


Reading predicted data for: robert-en
Test set:	 Acc	Precision	Recall	F1
70.6936	71.3441	70.6936	70.8404

              precision    recall  f1-score   support

           0     0.7085    0.7681    0.7371      3972
           1     0.7489    0.6345    0.6870      5937
           2     0.6330    0.7857    0.7011      2375

    accuracy                         0.7069     12284
   macro avg     0.6968    0.7294    0.7084     12284
weighted avg     0.7134    0.7069    0.7059     12284

-------------------------
Reading predicted data for: mbert-en
Test set:	 Acc	Precision	Recall	F1
67.1605	67.4815	67.1605	67.0634

              precision    recall  f1-score   support

           0     0.6783    0.6981    0.6881      3972
           1     0.6988    0.6367    0.6663      5937
           2     0.6089    0.7145    0.6575      2375

    accuracy                         0.6716     12284
   macro avg     0.6620    0.6831    0.6706     12284
weighted avg     0.6748    0.6716    0.6717     12284

-

### Calculate majority voting performance

In [5]:
from collections import Counter
def get_majority_label(data):
  """Return the item that occurs most often in ``data``.

  Ties are resolved by ``Counter.most_common`` ordering, i.e. the item
  encountered first among those with the highest count is returned.
  Raises ``IndexError`` when ``data`` is empty.
  """
  tally = Counter(data)
  ranked = tally.most_common(1)
  top_label = ranked[0][0]
  return top_label

#### English

In [6]:
# Collect the vote of every English model per example id. The dict keeps ids
# in first-seen order, which is the order the final predictions are emitted in.
predicted_labels = {}
for model in en_models:
  print(f'Reading predicted data for: {model}')
  # Use a context manager so the file handle is closed right after reading.
  with open(model + '/predict_results.txt') as f:
    lines = f.read().strip().split('\n')
  # Skip the header line; the remaining lines are "<id><TAB><label>".
  for line in lines[1:]:
    example_id, label = line.split('\t')  # avoid shadowing the builtin `id`
    predicted_labels.setdefault(example_id, []).append(label.strip())

# One majority-voted label per example id.
final_pred_labels = [get_majority_label(votes) for votes in predicted_labels.values()]

acc, precision, recall, F1, report = calculate_performance(en_gold_labels, final_pred_labels, ['0', '1', '2'])
result = f"{acc:.4f}\t{precision:.4f}\t{recall:.4f}\t{F1:.4f}\n"

print("Test set:\t Acc\tPrecision\tRecall\tF1\n" + result)
print(report)

Reading predicted data for: robert-en
Reading predicted data for: mbert-en
Reading predicted data for: xlm-en
Test set:	 Acc	Precision	Recall	F1
70.9541	71.5480	70.9541	71.0322

              precision    recall  f1-score   support

           0     0.7121    0.7666    0.7384      3972
           1     0.7498    0.6434    0.6925      5937
           2     0.6354    0.7794    0.7001      2375

    accuracy                         0.7095     12284
   macro avg     0.6991    0.7298    0.7103     12284
weighted avg     0.7155    0.7095    0.7088     12284



#### Arabic

In [7]:
# Collect the vote of every Arabic model per example id. The dict keeps ids
# in first-seen order, which is the order the final predictions are emitted in.
predicted_labels = {}
for model in ar_models:
  print(f'Reading predicted data for: {model}')
  # Use a context manager so the file handle is closed right after reading.
  with open(model + '/predict_results.txt') as f:
    lines = f.read().strip().split('\n')
  # Skip the header line; the remaining lines are "<id><TAB><label>".
  for line in lines[1:]:
    example_id, label = line.split('\t')  # avoid shadowing the builtin `id`
    predicted_labels.setdefault(example_id, []).append(label.strip())

# One majority-voted label per example id.
final_pred_labels = [get_majority_label(votes) for votes in predicted_labels.values()]

acc, precision, recall, F1, report = calculate_performance(ar_gold_labels, final_pred_labels, ['0', '1', '2'])
result = f"{acc:.4f}\t{precision:.4f}\t{recall:.4f}\t{F1:.4f}\n"

print("Test set:\t Acc\tPrecision\tRecall\tF1\n" + result)
print(report)

Reading predicted data for: arabert-ar
Reading predicted data for: mbert-ar
Reading predicted data for: xlm-ar
Test set:	 Acc	Precision	Recall	F1
66.6885	66.3713	66.6885	66.4150

              precision    recall  f1-score   support

           0     0.7120    0.7921    0.7499      2222
           1     0.6354    0.5381    0.5827      2364
           2     0.6371    0.6843    0.6599      1514

    accuracy                         0.6669      6100
   macro avg     0.6615    0.6715    0.6642      6100
weighted avg     0.6637    0.6669    0.6628      6100



### Calculating the performance of BERTs-FF Ensemble

#### with attention mechanism

In [8]:
import glob
path = './ensemble_attn/'

# sorted() makes the report order deterministic; glob gives no ordering guarantee.
for filepath in sorted(glob.glob(f'{path}*.txt')):
  print(f'Reading predicted data from: {filepath}')
  # Use a context manager so the file handle is closed right after reading.
  with open(filepath) as f:
    lines = f.read().strip().split('\n')
  predicted_labels = [line.strip() for line in lines]

  # The first len(en_gold_labels) predictions are scored against the English
  # gold labels, everything after them against the Arabic gold labels.
  en_pred_labels = predicted_labels[:len(en_gold_labels)]
  ar_pred_labels = predicted_labels[len(en_gold_labels):]

  for title, gold_labels, pred_labels in (
      ('English Data: ', en_gold_labels, en_pred_labels),
      ('Arabic Data: ', ar_gold_labels, ar_pred_labels),
  ):
    print(title)
    acc, precision, recall, F1, report = calculate_performance(gold_labels, pred_labels, ['0', '1', '2'])
    result = f"{acc:.4f}\t{precision:.4f}\t{recall:.4f}\t{F1:.4f}\n"

    print("Test set:\t Acc\tPrecision\tRecall\tF1\n" + result)
    print(report)

Reading predicted data from: ./ensemble_attn/output-pred_ep_2.txt
English Data: 
Test set:	 Acc	Precision	Recall	F1
67.4373	69.1370	67.4373	67.3072

              precision    recall  f1-score   support

           0     0.6705    0.7548    0.7102      3972
           1     0.7562    0.5826    0.6582      5937
           2     0.5641    0.7693    0.6509      2375

    accuracy                         0.6744     12284
   macro avg     0.6636    0.7022    0.6731     12284
weighted avg     0.6914    0.6744    0.6736     12284

Arabic Data: 
Test set:	 Acc	Precision	Recall	F1
66.2951	67.8214	66.2951	66.4180

              precision    recall  f1-score   support

           0     0.8254    0.6787    0.7449      2222
           1     0.5997    0.6324    0.6156      2364
           2     0.5848    0.6876    0.6321      1514

    accuracy                         0.6630      6100
   macro avg     0.6700    0.6662    0.6642      6100
weighted avg     0.6782    0.6630    0.6668      6100



#### with Feed Forward

In [9]:
import glob
path = './ensemble-ff/'

# sorted() makes the report order deterministic; glob gives no ordering guarantee.
for filepath in sorted(glob.glob(f'{path}*.txt')):
  print(f'Reading predicted data from: {filepath}')
  # Use a context manager so the file handle is closed right after reading.
  with open(filepath) as f:
    lines = f.read().strip().split('\n')
  predicted_labels = [line.strip() for line in lines]

  # The first len(en_gold_labels) predictions are scored against the English
  # gold labels, everything after them against the Arabic gold labels.
  en_pred_labels = predicted_labels[:len(en_gold_labels)]
  ar_pred_labels = predicted_labels[len(en_gold_labels):]

  for title, gold_labels, pred_labels in (
      ('English Data: ', en_gold_labels, en_pred_labels),
      ('Arabic Data: ', ar_gold_labels, ar_pred_labels),
  ):
    print(title)
    acc, precision, recall, F1, report = calculate_performance(gold_labels, pred_labels, ['0', '1', '2'])
    result = f"{acc:.4f}\t{precision:.4f}\t{recall:.4f}\t{F1:.4f}\n"

    print("Test set:\t Acc\tPrecision\tRecall\tF1\n" + result)
    print(report)

Reading predicted data from: ./ensemble-ff/pred_ep_2.txt
English Data: 
Test set:	 Acc	Precision	Recall	F1
70.0261	70.4980	70.0261	69.8751

              precision    recall  f1-score   support

           0     0.6758    0.7815    0.7248      3972
           1     0.7435    0.6426    0.6894      5937
           2     0.6574    0.7086    0.6821      2375

    accuracy                         0.7003     12284
   macro avg     0.6923    0.7109    0.6988     12284
weighted avg     0.7050    0.7003    0.6994     12284

Arabic Data: 
Test set:	 Acc	Precision	Recall	F1
67.6066	68.0116	67.6066	67.1159

              precision    recall  f1-score   support

           0     0.7723    0.7678    0.7700      2222
           1     0.5928    0.6595    0.6243      2364
           2     0.6812    0.5674    0.6191      1514

    accuracy                         0.6761      6100
   macro avg     0.6821    0.6649    0.6712      6100
weighted avg     0.6801    0.6761    0.6761      6100



#### English as train data and English as test data

In [10]:
import glob
path = './test101/'

# sorted() makes the report order deterministic; glob gives no ordering guarantee.
for filepath in sorted(glob.glob(f'{path}*.txt')):
  print(f'Reading predicted data from: {filepath}')
  # Use a context manager so the file handle is closed right after reading.
  with open(filepath) as f:
    lines = f.read().strip().split('\n')
  # One predicted label per line, scored against the English gold labels.
  predicted_labels = [line.strip() for line in lines]
  print('English Data: ')
  acc, precision, recall, F1, report = calculate_performance(en_gold_labels, predicted_labels, ['0', '1', '2'])
  result = f"{acc:.4f}\t{precision:.4f}\t{recall:.4f}\t{F1:.4f}\n"

  print("Test set:\t Acc\tPrecision\tRecall\tF1\n" + result)
  print(report)

Reading predicted data from: ./test101/output-pred_ep_2.txt
English Data: 
Test set:	 Acc	Precision	Recall	F1
68.9108	69.2574	68.9108	68.5915

              precision    recall  f1-score   support

           0     0.7245    0.6548    0.6879      3972
           1     0.6995    0.6945    0.6970      5937
           2     0.6218    0.7331    0.6729      2375

    accuracy                         0.6891     12284
   macro avg     0.6819    0.6941    0.6859     12284
weighted avg     0.6926    0.6891    0.6894     12284



#### Arabic as train data and Arabic as test data

In [11]:
import glob
path = './test102/'

# sorted() makes the report order deterministic; glob gives no ordering guarantee.
for filepath in sorted(glob.glob(f'{path}*.txt')):
  print(f'Reading predicted data from: {filepath}')
  # Use a context manager so the file handle is closed right after reading.
  with open(filepath) as f:
    lines = f.read().strip().split('\n')
  # One predicted label per line, scored against the Arabic gold labels.
  predicted_labels = [line.strip() for line in lines]
  print('Arabic Data: ')
  acc, precision, recall, F1, report = calculate_performance(ar_gold_labels, predicted_labels, ['0', '1', '2'])
  result = f"{acc:.4f}\t{precision:.4f}\t{recall:.4f}\t{F1:.4f}\n"

  print("Test set:\t Acc\tPrecision\tRecall\tF1\n" + result)
  print(report)

Reading predicted data from: ./test102/output-pred_ep_2.txt
Arabic Data: 
Test set:	 Acc	Precision	Recall	F1
67.6721	69.0103	67.6721	67.8239

              precision    recall  f1-score   support

           0     0.8167    0.6796    0.7418      2222
           1     0.5954    0.6853    0.6372      2364
           2     0.6523    0.6592    0.6557      1514

    accuracy                         0.6767      6100
   macro avg     0.6881    0.6747    0.6782      6100
weighted avg     0.6901    0.6767    0.6799      6100

