In [None]:
!pip install datasets
!pip install evaluate
!pip install transformers
!pip install sacremoses sentencepiece

In [2]:
import csv
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, AutoTokenizer

class EnsembleDataset(Dataset):
    """Dataset that tokenizes each text twice: once for a language-specific
    encoder and once for multilingual BERT.

    Each item is a 5-tuple
    ``(lsm_input_ids, lsm_attention_mask, mbert_input_ids, mbert_attention_mask, label)``
    where the first four entries are 1-D tensors padded/truncated to 256 tokens
    and ``label`` is ``torch.tensor(labels[index])``.

    :param text_data: indexable collection of input texts
    :param labels: indexable collection of labels, aligned with ``text_data``
    :param lang: ``'en'`` (RoBERTa tokenizer) or ``'ar'`` (AraBERT tokenizer)
    :raises ValueError: if ``lang`` is not one of the supported languages
    """

    # Languages for which a language-specific tokenizer is configured.
    _SUPPORTED_LANGS = ('ar', 'en')

    # Tokenization settings shared by both tokenizers: fixed-length (256) padded
    # sequences returned as PyTorch tensors with a leading batch axis of size 1.
    _ENCODE_KWARGS = dict(add_special_tokens=True,
                          max_length=256, padding='max_length',
                          return_attention_mask=True, truncation=True, return_tensors='pt')

    def __init__(self, text_data, labels, lang='en'):
        # Fail fast, before any tokenizer is loaded, instead of raising a
        # KeyError on the first __getitem__ call.
        if lang not in self._SUPPORTED_LANGS:
            raise ValueError(f"Unsupported lang {lang!r}; expected one of {self._SUPPORTED_LANGS}")
        self.text_data = text_data
        self.labels = labels
        self.bertar = AutoTokenizer.from_pretrained('aubmindlab/bert-base-arabertv02')  # AraBERT tokenizer
        self.berten = AutoTokenizer.from_pretrained('roberta-base')  # RoBERTa tokenizer (English)
        self.lang = lang
        # lsm = language-specific model (tokenizer), keyed by language code
        self.lsm = {'ar': self.bertar,
                    'en': self.berten
                    }
        self.mbert = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased')
        # Loaded but not used by __getitem__; kept so the attribute stays available.
        self.xlm_r = AutoTokenizer.from_pretrained('xlm-roberta-base')

    def __len__(self):
        """Return the number of examples (the number of labels)."""
        return len(self.labels)

    def __getitem__(self, index):
        """Tokenize the text at ``index`` with both tokenizers and return the 5-tuple
        described in the class docstring."""
        text = self.text_data[index]
        label = self.labels[index]
        self_tok = self.lsm[self.lang].encode_plus(text, **self._ENCODE_KWARGS)
        mbert_tok = self.mbert.encode_plus(text, **self._ENCODE_KWARGS)
        # squeeze(0) drops the batch axis added by return_tensors='pt' so that the
        # DataLoader can stack examples into (batch, seq_len) tensors.
        return (self_tok['input_ids'].squeeze(0), self_tok['attention_mask'].squeeze(0),
                mbert_tok['input_ids'].squeeze(0), mbert_tok['attention_mask'].squeeze(0),
                torch.tensor(label))


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
from transformers import AutoModel

# Define the Ensemble classification model
# Define the Ensemble classification model
class EnsembleClassifier(nn.Module):
    """Two-encoder text classifier: a language-specific encoder fused with multilingual BERT.

    The pooled output of the encoder selected by ``lang`` (AraBERT for ``'ar'``,
    RoBERTa for ``'en'``) is concatenated with the pooled output of multilingual
    BERT, projected to 128 dimensions by ``fusion_fc`` and mapped to
    ``num_classes`` scores by ``output_fc``.

    ``forward`` returns raw, unnormalized logits. ``nn.CrossEntropyLoss`` applies
    log-softmax itself, so normalizing here would squash the loss signal. The
    arg-max prediction is the same for logits and probabilities. ``self.softmax``
    is kept as an attribute for callers that want probabilities
    (``model.softmax(logits)``).

    :param num_classes: number of output classes
    """

    def __init__(self, num_classes=3):
        super(EnsembleClassifier, self).__init__()

        self.bertar = AutoModel.from_pretrained('aubmindlab/bert-base-arabertv02')  # AraBERT
        self.berten = AutoModel.from_pretrained('roberta-base')  # RoBERTa for English

        # lsm = language-specific model, keyed by language code. This is a plain
        # dict; the encoders are registered as sub-modules through the attribute
        # assignments above.
        self.lsm = {'ar': self.bertar,
                    'en': self.berten
                    }
        self.lsm_drop = nn.Dropout(0.1)  # dropout on the language-specific features

        # Multilingual BERT
        self.mbert = AutoModel.from_pretrained('bert-base-multilingual-uncased')
        self.mbert_drop = nn.Dropout(0.3)

        # Fusion layer: two concatenated 768-dimensional pooled outputs -> 128
        self.fusion_fc = nn.Linear(768*2, 128)

        # Output layer
        self.output_fc = nn.Linear(128, num_classes)
        # Not applied in forward(); see the class docstring.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, lang, in_1, attn_1, in_2, attn_2):
        """Compute class logits for a batch.

        :param lang: key into ``self.lsm`` selecting the language-specific encoder
        :param in_1: input ids produced by the language-specific tokenizer
        :param attn_1: attention mask belonging to ``in_1``
        :param in_2: input ids produced by the multilingual BERT tokenizer
        :param attn_2: attention mask belonging to ``in_2``
        :return: tensor of unnormalized class scores, one row per example
        """
        lsm_output = self.lsm[lang](input_ids=in_1, attention_mask=attn_1)
        lsm_output = self.lsm_drop(lsm_output.pooler_output)

        mbert_output = self.mbert(input_ids=in_2, attention_mask=attn_2)
        mbert_output = self.mbert_drop(mbert_output.pooler_output)

        # Concatenate the language-specific and multilingual features. The
        # previous code concatenated lsm_output with itself, so the multilingual
        # encoder never contributed to the prediction.
        features = torch.cat((lsm_output, mbert_output), dim=1)

        # Fusion layer
        features = self.fusion_fc(features)

        # Output layer (logits; no softmax, see the class docstring)
        return self.output_fc(features)

In [4]:
import os
def read_dataset(file_loc, delim=',', lang='en'):
    """Read texts and integer labels from a delimited file with a header row.

    The first row is treated as a header and skipped. For every other non-empty
    row, column 1 is taken as the text and column 2 as the integer label.

    :param file_loc: path of the file to read
    :param delim: field delimiter
    :param lang: unused; kept for backward compatibility with existing callers
    :return: ``(data, labels)`` - list of texts and list of int labels
    :raises IndexError: if a non-empty row has fewer than three columns
    :raises ValueError: if a label cannot be converted to ``int``
    """
    data = []
    labels = []
    # Explicit UTF-8 so non-ASCII (e.g. Arabic) text is decoded the same way on
    # every platform; newline='' is what the csv module requires to handle
    # quoted fields containing line breaks correctly.
    with open(file_loc, encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter=delim)
        next(reader, None)  # skip the header; tolerate a completely empty file
        for row in reader:
            if not row:  # csv.reader yields [] for blank lines
                continue
            data.append(row[1])
            labels.append(int(row[2]))
    return data, labels

In [5]:
from sklearn import metrics

def calculate_performance(y_true, y_pred, labels):
    """
    Score predictions against the gold labels.

    :param y_true: gold labels
    :param y_pred: predicted labels
    :param labels: display names forwarded to ``classification_report`` as ``target_names``
    :return: tuple ``(accuracy, precision, recall, f1, report)``; the four scores
             are scaled to percentages. Precision and recall are weighted
             averages, F1 is the macro average, and ``report`` is the text of
             the classification report (4 digits).
    """
    accuracy = metrics.accuracy_score(y_true, y_pred)
    precision = metrics.precision_score(y_true, y_pred, average='weighted')
    recall = metrics.recall_score(y_true, y_pred, average='weighted')
    f1_macro = metrics.f1_score(y_true, y_pred, average='macro')
    report = metrics.classification_report(y_true, y_pred, target_names=labels, digits=4)

    percentages = tuple(score * 100 for score in (accuracy, precision, recall, f1_macro))
    return (*percentages, report)

In [6]:
# Mount Google Drive inside the Colab runtime so that files stored on Drive are
# reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Make the project folder the working directory; the relative paths used later
# in this notebook (./data/..., ./test101, ./test102) are resolved against it.
%cd /content/drive/My Drive/cs6765
# List the folder contents.
!ls

Mounted at /content/drive
/content/drive/My Drive/cs6765
 A3				       ensemble-ff		     Sentiment-6765.ipynb
 arabert-ar			      'ensemble of bert.ipynb'	     test101
 bin				       mbert-ar			     test102
'Calculate performace.ipynb'	       mbert-en			     xlm-ar
'Copy of new_ensemble of bert.ipynb'  'new_ensemble of bert.ipynb'   xlm-en
 data				       results
 ensemble_attn			       robert-en


In [7]:
# Number of examples per batch; passed as `batch_size` to the DataLoaders
# created in this notebook.
BATCH_SIZE = 20

In [None]:
# Locations of the English and Arabic splits, relative to the working directory.
train_data_file = './data/english/train.csv'
dev_data_file = './data/english/dev.csv'
test_data_file = './data/english/test.csv'
file_delim = ','
train_data_file_ar = './data/arabic/train.csv'
dev_data_file_ar = './data/arabic/dev.csv'
test_data_file_ar = './data/arabic/test.csv'


def _build_loader(csv_path, lang, **loader_kwargs):
    """Read ``csv_path`` and wrap it in a batched DataLoader over an EnsembleDataset.

    :param csv_path: file to read with ``read_dataset``
    :param lang: language code forwarded to ``EnsembleDataset``
    :param loader_kwargs: extra ``DataLoader`` options (e.g. ``shuffle``, ``drop_last``)
    :return: ``(texts, labels, loader)``
    """
    texts, gold = read_dataset(csv_path, file_delim)
    loader = torch.utils.data.DataLoader(EnsembleDataset(texts, gold, lang),
                                         batch_size=BATCH_SIZE, **loader_kwargs)
    return texts, gold, loader


# Training split: shuffled, with the last incomplete batch dropped.
print(f"Reading Train file: {train_data_file}")
en_data, en_labels, en_train_dataset = _build_loader(train_data_file, 'en', shuffle=True, drop_last=True)
ar_data, ar_labels, ar_train_dataset = _build_loader(train_data_file_ar, 'ar', shuffle=True, drop_last=True)
print(f'Total examples in arabic train set: {len(ar_data)}')
print(f'Total examples in english train set: {len(en_data)}')

# Dev split: evaluated in file order and in full. Previously these loaders used
# shuffle=True and drop_last=True, which silently discarded the last incomplete
# batch of dev examples from every evaluation.
print(f"Reading Dev file: {dev_data_file}")
en_data, en_labels, en_dev_dataset = _build_loader(dev_data_file, 'en')
ar_data, ar_labels, ar_dev_dataset = _build_loader(dev_data_file_ar, 'ar')

# Test split: file order, nothing dropped. After this point en_data/en_labels and
# ar_data/ar_labels hold the *test* texts and labels.
print(f"Reading Test file: {test_data_file}")
en_data, en_labels, en_test_dataset = _build_loader(test_data_file, 'en')
ar_data, ar_labels, ar_test_dataset = _build_loader(test_data_file_ar, 'ar')
ts_labels = en_labels + ar_labels
# NOTE(review): `n` is not used anywhere else in the visible notebook - confirm before removing.
n=1

In [None]:
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the ensemble classifier with three output classes.
model = EnsembleClassifier(num_classes=3)

# Move the model's parameters to the selected device. As the last expression of
# the cell, the returned module is also displayed.
model.to(device)

In [10]:
from torch import nn
from torch import optim

#criterion = nn.BCEWithLogitsLoss()
# Multi-class classification loss. nn.CrossEntropyLoss applies log-softmax
# internally and is documented to expect unnormalized scores (logits).
criterion = nn.CrossEntropyLoss()
# A single Adam optimizer over all model parameters with learning rate 2e-5.
optimizer = optim.Adam(model.parameters(), lr=2e-5)

In [11]:
import gc
# Define the training and testing functions
def train(model, train_loader, criterion, optimizer, device, lang='en'):
    """Run one training epoch over ``train_loader``.

    Progress (running mean loss and accuracy) is printed every 100 batches. The
    running figures are based on the number of examples actually processed, not
    on a global batch-size constant, so they stay correct for any loader.

    :param model: callable as ``model(lang, input_1, attn_1, input_2, attn_2)``
    :param train_loader: iterable of ``(input_1, attn_1, input_2, attn_2, labels)``
                         batches; must expose ``.dataset`` for the progress line
    :param criterion: loss function called as ``criterion(output, labels)``
    :param optimizer: optimizer stepping the model's parameters
    :param device: device the batch tensors are moved to
    :param lang: language key forwarded to the model
    :return: ``(train_loss, correct)`` - the sum over batches of
             ``loss * batch_size`` (not averaged) and the number of correct
             predictions
    """
    model.train()
    train_loss = 0.0
    correct = 0
    seen = 0  # examples processed so far in this epoch
    for step, (input_1, attn_1, input_2, attn_2, labels) in enumerate(train_loader, start=1):
        optimizer.zero_grad()
        input_1 = input_1.to(device)
        input_2 = input_2.to(device)
        attn_1 = attn_1.to(device)
        attn_2 = attn_2.to(device)
        labels = labels.to(device)
        output = model(lang, input_1, attn_1, input_2, attn_2)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        batch_n = labels.size(0)
        seen += batch_n
        # Weight the batch loss by the batch size so that callers can divide the
        # total by the number of examples.
        train_loss += loss.item() * batch_n
        _, predicted = torch.max(output, 1)
        correct += (predicted == labels).sum().item()
        if step % 100 == 0:
            print(f'{seen}/{len(train_loader.dataset)}\tTrain Loss: {train_loss/seen:.3f} | Train Acc: {(correct/seen)*100:.2f}%')
    return train_loss, correct

def test(model, test_loader, criterion, device, lang='en'):
    model.eval()
    test_loss = 0.0
    correct = 0
    with torch.no_grad():
        for input_1, attn_1, input_2, attn_2, labels in test_loader:
            #optimizer.zero_grad()
            input_1 = input_1.to(device)
            input_2 = input_2.to(device)
            attn_1 = attn_1.to(device)
            attn_2 = attn_2.to(device)
            labels = labels.to(device)
            output = model(lang, input_1, attn_1, input_2, attn_2)
            loss = criterion(output, labels)
            test_loss += loss.item() * labels.size(0)
            _, predicted = torch.max(output, 1)
            correct += (predicted == labels).sum().item()
    #test_loss /= len(test_loader.dataset)
    #accuracy = correct / len(test_loader.dataset)
    return test_loss, correct

def evaluate(model, test_loader, device, ep, g_labels, lang):
    """Predict on a test loader, save the predictions to disk and print metrics.

    Predictions are written one label per line to
    ``./test101/output-pred_ep_{ep}.txt`` when ``lang == 'en'`` and to
    ``./test102/output-pred_ep_{ep}.txt`` otherwise. Any ``lang`` other than
    ``'en'`` is evaluated with the ``'ar'`` branch of the model. The output
    directory is created if it does not exist.

    :param model: callable as ``model(lang, input_1, attn_1, input_2, attn_2)``
    :param test_loader: sequence whose first element is the loader to evaluate
    :param device: device the batch tensors are moved to
    :param ep: epoch tag used in the output file name and the printed header
    :param g_labels: gold labels, in the same order as the loader's examples
    :param lang: ``'en'`` for English; anything else is treated as Arabic
    :return: list with one tensor of predicted class indices per batch
    """
    model.eval()
    model_lang, out_dir = ('en', './test101') if lang == 'en' else ('ar', './test102')

    predictions = []
    with torch.no_grad():
        for input_1, attn_1, input_2, attn_2, _labels in test_loader[0]:
            input_1 = input_1.to(device)
            input_2 = input_2.to(device)
            attn_1 = attn_1.to(device)
            attn_2 = attn_2.to(device)
            output = model(model_lang, input_1, attn_1, input_2, attn_2)
            _, predicted = torch.max(output, 1)
            predictions.append(predicted)

    # Flatten the per-batch tensors into one list of plain Python ints.
    y_test_pred = [label for batch in predictions for label in batch.tolist()]

    os.makedirs(out_dir, exist_ok=True)
    with open(f'{out_dir}/output-pred_ep_{ep}.txt', 'w') as f:
        f.writelines(f'{label}\n' for label in y_test_pred)

    print(f'Printing Results for ***Epoch: {ep}***.......')
    # list.sort() returns None, so use sorted(). Names are taken from every class
    # present in the gold labels or the predictions, because
    # classification_report needs one name per class it encounters.
    target_labels = [str(label) for label in sorted(set(g_labels) | set(y_test_pred))]
    y_true = g_labels
    acc, precision, recall, F1, report = calculate_performance(y_true, y_test_pred, target_labels)
    result = str("{0:.4f}".format(acc)) + "\t" + str("{0:.4f}".format(precision)) + "\t" + str(
        "{0:.4f}".format(recall)) + "\t" + str("{0:.4f}".format(F1)) + "\n"

    print("Test set:\t Acc\tPrecision\tRecall\tF1\n" + result)
    print(report)
    return predictions

### English Data

In [None]:
# Train the model
num_epochs = 3
for epoch in range(0,num_epochs):
    print(f'Epoch {epoch+1}/{num_epochs}')
    en_train_loss, en_corr = train(model, en_train_dataset, criterion, optimizer, device,'en')
    #ar_train_loss, ar_corr = train(model, ar_train_dataset, criterion, optimizer, device,'ar')
    train_loss = (en_train_loss) / len(en_train_dataset.dataset)
    #train_loss = (en_train_loss + ar_train_loss) / (len(en_train_dataset.dataset) + len(ar_train_dataset.dataset))
    train_acc = (en_corr) / (len(en_train_dataset.dataset))
    #train_acc = (en_corr + ar_corr) / (len(en_train_dataset.dataset) + len(ar_train_dataset.dataset))
    en_dev_loss, en_corr = test(model, en_dev_dataset, criterion, device, 'en')
    #ar_dev_loss, ar_corr = test(model, ar_dev_dataset, criterion, device, 'ar')
    dev_loss = (en_dev_loss) / (len(en_dev_dataset.dataset))
    #dev_loss = (en_dev_loss + ar_dev_loss) / (len(en_dev_dataset.dataset) + len(ar_dev_dataset.dataset))
    dev_acc = (en_corr) / (len(en_dev_dataset.dataset))
    #dev_acc = (en_corr + ar_corr) / (len(en_dev_dataset.dataset) + len(ar_dev_dataset.dataset))
    print('Epoch {}/{}: Train Loss = {:.4f}, Accuracy = {:.4f}, Dev Loss = {:.4f}, Dev Accuracy = {:.4f}'.format(epoch+1, num_epochs, train_loss, train_acc, dev_loss, dev_acc))


In [None]:
# Score the English test set. `epoch` is whatever value the preceding training
# loop left behind; evaluate() uses it in the prediction file name and the
# printed header. `en_labels` must hold the English *test* labels, i.e. the
# result of the most recent read_dataset() call on the English test file.
test_prediction = evaluate(model, [en_test_dataset], device, epoch, en_labels, 'en')

### Arabic Data

In [None]:
# Train the model
num_epochs = 3
for epoch in range(0,num_epochs):
    print(f'Epoch {epoch+1}/{num_epochs}')
    ar_train_loss, ar_corr = train(model, ar_train_dataset, criterion, optimizer, device,'ar')
    train_loss = (ar_train_loss) / len(ar_train_dataset.dataset)
    #train_loss = (en_train_loss + ar_train_loss) / (len(en_train_dataset.dataset) + len(ar_train_dataset.dataset))
    train_acc = (ar_corr) / (len(ar_train_dataset.dataset))
    #train_acc = (en_corr + ar_corr) / (len(en_train_dataset.dataset) + len(ar_train_dataset.dataset))
    ar_dev_loss, ar_corr = test(model, ar_dev_dataset, criterion, device, 'en')
    #ar_dev_loss, ar_corr = test(model, ar_dev_dataset, criterion, device, 'ar')
    dev_loss = (ar_dev_loss) / (len(ar_dev_dataset.dataset))
    #dev_loss = (en_dev_loss + ar_dev_loss) / (len(en_dev_dataset.dataset) + len(ar_dev_dataset.dataset))
    dev_acc = (ar_corr) / (len(ar_dev_dataset.dataset))
    #dev_acc = (en_corr + ar_corr) / (len(en_dev_dataset.dataset) + len(ar_dev_dataset.dataset))
    print('Epoch {}/{}: Train Loss = {:.4f}, Accuracy = {:.4f}, Dev Loss = {:.4f}, Dev Accuracy = {:.4f}'.format(epoch+1, num_epochs, train_loss, train_acc, dev_loss, dev_acc))


In [None]:
# Score the Arabic test set. `epoch` is whatever value the preceding training
# loop left behind; evaluate() uses it in the prediction file name and the
# printed header. `ar_labels` must hold the Arabic *test* labels, i.e. the
# result of the most recent read_dataset() call on the Arabic test file.
test_prediction = evaluate(model, [ar_test_dataset], device, epoch, ar_labels, 'ar')