In [None]:
!pip install transformers -q
!pip install fasttext -q
from transformers import Trainer, TrainingArguments, RobertaForMultipleChoice, RobertaTokenizerFast, XLMRobertaForMultipleChoice, XLMRobertaTokenizerFast
import torch
from torch.utils.data import Dataset, DataLoader
import transformers
import numpy as np
import pandas as pd
import sys
import torch.nn as nn
import torch.nn.functional as f
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score 
import copy
from transformers.tokenization_utils_base import PaddingStrategy, TruncationStrategy
from prettytable import PrettyTable
import csv
from tqdm.notebook import tqdm_notebook as tqdm
import os
import fasttext
from collections import Counter
import random

In [None]:
# Load the labelled training examples and preview the first rows.
train_data = pd.read_csv(filepath_or_buffer='./coveo/train.csv')
train_data.head(5)

In [None]:
class MultiDatset(Dataset):

    """Dataset of tokenized (text, choice) pairs for a multiple-choice model.

    Each dataframe row is turned into one instance: the text (with its
    '[BLANK]' replaced by the tokenizer's mask token and trimmed around it)
    is paired with every available choice and tokenized into tensors of
    shape [number_of_choices, padded_length], in the following format

                    <s>text</s></s>choice<lang-identifier></s><pad>

    Arguments:

        dataframe: a pandas dataframe with a 'text' column, 'choice1'..'choice6'
            columns and, in 'train'/'validation' mode, a 'label' column
        tokenizer_name: 'Roberta' or 'XLM'. Default is 'Roberta'
        tokenizer_pth: Path/name to load the tokenizer from. Default is 'roberta-base'
        mode: 'train' or 'validation' (labels are read) or anything else,
            e.g. 'test' (no labels). Default is 'train'
        max_length: Longest allowed encoded pair; longer pairs are truncated.
            Default is 512.
        seq_length: Number of text tokens kept around the blank. Default is 200.
        lang_identifier: if True, a language-identifier token is appended to
            each choice. Default is False


    Return:

        A python dataset whose items hold input_ids, attention_mask and,
        when labels are available, label."""

    def __init__(self, dataframe: pd.DataFrame, tokenizer_name: str = 'Roberta',
                 tokenizer_pth: str = 'roberta-base', mode: str = 'train',
                 max_length: int = 512, seq_length: int = 200, lang_identifier: bool = False):
        """Read the dataframe row by row, tokenize each row and store the result.

        Arguments:

            dataframe: pandas dataframe
            tokenizer_name
            tokenizer_pth
            mode
            max_length
            seq_length
            lang_identifier

        Attributes:

            self.instances: a list storing every row after tokenization as
            a tuple (tokenized_input, label)
            self.total_num: tokenizer size including any added language tokens

        Raises:

            KeyError: if tokenizer_name is not 'XLM' or 'Roberta'"""

        tokenizer_models = {'XLM': XLMRobertaTokenizerFast,
                            'Roberta': RobertaTokenizerFast}

        tokenizer = tokenizer_models[tokenizer_name].from_pretrained(tokenizer_pth)
        orig_length = len(tokenizer)
        num_added_tokens = 0
        self.lang = lang_identifier
        if self.lang:
            self.fr_token = '<fr_lang>'
            self.en_token = '<en_lang>'
            self.other_token = '<other_lang>'
            special_tokens_dict = {'additional_special_tokens': [self.fr_token, self.en_token, self.other_token]}
            num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)
            self.lang_detector = fasttext.load_model('./coveo/lid.176.bin')

        # The model's embedding matrix must be resized to this value.
        self.total_num = orig_length + num_added_tokens

        self.instances = []
        self.seq_length = seq_length
        self.mode = mode
        has_labels = self.mode in ('train', 'validation')
        bar = tqdm(total=len(dataframe))
        for _, row in dataframe.iterrows():
            choices = self.extract_choices(row)
            text = self.trim_text(row['text'], tokenizer, seq_length=self.seq_length)
            # truncation=True is required for max_length to take effect;
            # without it over-long pairs are passed through unchanged.
            tokenized_input = tokenizer([text] * len(choices), choices, padding=True,
                                        truncation=True, max_length=max_length,
                                        return_tensors='pt')

            label = [self.get_label(row['label'])] if has_labels else ''
            self.instances.append((tokenized_input, label))
            bar.update(1)
        bar.close()

    def __getitem__(self, index: int):
        """Returns a specific item in the dataset

        Arguments:
            index: the item's (integer) index

        Returns:
            the tokenized input (input_ids, attention_mask) with a 'label'
            tensor added in 'train'/'validation' mode"""
        tokenized_texts, label = self.instances[index]
        if self.mode in ('train', 'validation'):
            tokenized_texts['label'] = torch.tensor(label)

        return tokenized_texts

    def __len__(self):
        '''returns length of the dataset'''
        return len(self.instances)

    def trim_text(self, text: str, tokenizer: "transformers.PreTrainedTokenizerBase",
                  seq_length: int = 100):
        '''Replace '[BLANK]' with the mask token and trim the text to at most
        seq_length tokens, keeping the window as balanced as possible around the blank.

        Arguments:
            text: the input string, containing '[BLANK]'
            tokenizer
            seq_length: the maximum number of text tokens to keep

        Return:
            the text itself when it already fits, otherwise the decoded window

        Raises:
            ValueError: if the text contains no blank'''

        text = text.replace('[BLANK]', tokenizer.mask_token)
        # Tokenize without <s>/</s>: the pair encoding adds them later, and
        # decoding them here would duplicate them in the model input.
        token_ids = tokenizer(text, add_special_tokens=False)['input_ids']
        if tokenizer.mask_token_id not in token_ids:
            raise ValueError("text does not contain a '[BLANK]' placeholder")

        text_len = len(token_ids)
        if text_len <= seq_length:
            return text

        left_space = token_ids.index(tokenizer.mask_token_id) + 1  # tokens up to and including the blank
        right_space = text_len - left_space
        half = seq_length // 2

        if left_space >= half and right_space >= half:
            # Enough context on both sides: half before (blank included), half after.
            start, end = left_space - half, left_space + half
        elif left_space > right_space:
            # Short right context: keep all of it and spend the rest on the left.
            start, end = text_len - seq_length, text_len
        else:
            # Short left context: keep all of it and spend the rest on the right.
            start, end = 0, seq_length

        return tokenizer.decode(token_ids[start:end])

    def extract_choices(self, instance: pd.Series):
        """Get a row from the pandas dataframe and return its available choices.

        Missing choices (NaN cells) are skipped. When lang_identifier was
        enabled, the language token is appended to every choice.

        Arguments:

            instance: a pandas dataframe row

        Returns:
            list: a list of possible choices
        """

        choices = []
        for i in range(1, 7):
            choice = instance['choice' + str(i)]
            if pd.isna(choice):
                continue
            choices.append(str(choice))

        # The detector and the language tokens only exist when enabled.
        if self.lang:
            choices = self.language_identifier(choices)
        return choices

    def _language_token(self, language: str):
        '''Map a fastText language label to the matching language-identifier token.'''
        if language == '__label__en':
            return self.en_token
        if language == '__label__fr':
            return self.fr_token
        return self.other_token

    def language_identifier(self, choices: list):
        '''Gets a list of choices, detects their majority language and appends
        the matching language identifier to each choice

        Arguments:
            choices: list

        Returns:
            choices with their language identifiers: list'''

        if not choices:
            return []
        option_language = [self.lang_detector.predict(word.lower())[0][0] for word in choices]
        # All choices of one question get the same (majority-vote) language token.
        language = Counter(option_language).most_common(1)[0][0]
        token = self._language_token(language)
        return [choice + ' ' + token for choice in choices]

    def get_label(self, label: str):
        '''Gets a label string and maps it to its numerical label

        Arguments:
            label: A string of the label, 'choice1' .. 'choice6'

        Return:
            int: the zero-based index of the labelled choice

        Raises:
            KeyError: if the label is not one of 'choice1' .. 'choice6' '''
        return {
            'choice1': 0,
            'choice2': 1,
            'choice3': 2,
            'choice4': 3,
            'choice5': 4,
            'choice6': 5,
        }[label]

    def length_tokenizer(self):
        '''Returns length of tokenizer (including added language tokens)'''
        return self.total_num
    

## Unit test: testing the MultiDatset class

In [None]:
# Smoke test: build a dataset from the first 10 training rows.
train_dataset_test = MultiDatset(train_data.iloc[:10], tokenizer_name='Roberta',
                                 tokenizer_pth='roberta-base', max_length=128,
                                 seq_length=100)
# MultiDatset.__getitem__ takes a single integer index (a slice would fail when
# it unpacks the stored (input, label) tuple), so fetch the items one by one.
[train_dataset_test[i]['input_ids'] for i in range(3)]

Shuffling the training data and then splitting it into 70% training and 30% validation


In [None]:
# Shuffle the rows, then hold out 30% of them for validation (fixed seeds
# keep the split reproducible).
train_data_shufffled = train_data.sample(frac=1, random_state=2021)
train_split, val_split = train_test_split(
    train_data_shufffled,
    test_size=0.3,
    random_state=2021,
)

# Both splits are tokenized with the same settings.
dataset_kwargs = {
    'tokenizer_name': 'Roberta',
    'tokenizer_pth': 'roberta-base',
    'max_length': 256,
    'seq_length': 200,
}
train_dataset = MultiDatset(train_split, **dataset_kwargs)
val_dataset = MultiDatset(val_split, **dataset_kwargs)

In [None]:
class MultiModel():

    """Wrapper around a transformers multiple-choice model (a language model with
    a classifier on top) that prepares it for training with a Trainer.

    Arguments:
        train_args: the transformers.TrainingArguments handed to the Trainer.
        total_num_embeddings: the length of the tokenizer; the embedding matrix is resized to it.
        model_name: A string specifying which base model to use. Options: 'XLM', 'Roberta'
        model_path: A string pointing to the location (or hub name) of the model.
        is_freezed: if True, every encoder layer and the pooler dense layer are
            frozen (requires_grad set to False). """

    def __init__(self,train_args: transformers.TrainingArguments, total_num_embeddings: int=50265, 
                 model_name: str = 'Roberta', model_path: str = 'roberta-base', is_freezed: bool=True):
        '''Load the pretrained model, resize its embeddings and optionally freeze it'''
        self.train_args = train_args

        model_cls = {'XLM': XLMRobertaForMultipleChoice,
                     'Roberta': RobertaForMultipleChoice}[model_name]
        self.model = model_cls.from_pretrained(model_path)
        self.model.resize_token_embeddings(total_num_embeddings)

        if not is_freezed:
            return

        backbone = self.model.roberta
        # Turn off gradients for every transformer block ...
        for block in backbone.encoder.layer:
            block.requires_grad_(False)
        # ... and for the pooler's dense projection.
        backbone.pooler.dense.weight.requires_grad_(False)
        backbone.pooler.dense.bias.requires_grad_(False)

    def train(self, train_dataset: MultiDatset, val_dataset: MultiDatset):

        """Build a transformers.Trainer for this model from the stored
        TrainingArguments and the given datasets. Training itself is not
        started here; the caller invokes it on the returned Trainer.

        Arguments: 
            train_dataset: A MultiDatset containing training instances
            val_dataset: A MultiDatset containing validation instances
        Returns:

            A Trainer instance. """

        return Trainer(
            model=self.model,
            args=self.train_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=self.compute_metrics,
        )

    def compute_metrics(self, pred):
        '''Compute accuracy, micro/macro F1 and macro precision/recall from
        the label ids and the argmax of the predictions'''
        y_true = pred.label_ids
        y_pred = pred.predictions.argmax(-1)

        return {
            'accuracy': accuracy_score(y_true, y_pred),
            'f1micro': f1_score(y_true, y_pred, average='micro'),
            'f1macro': f1_score(y_true, y_pred, average='macro'),
            'precision': precision_score(y_true, y_pred, average='macro'),
            'recall': recall_score(y_true, y_pred, average='macro'),
        }
    

In [None]:
# Fine-tune roberta-base on the training split and save the result.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,              # total number of training epochs
    per_device_train_batch_size=1,  # batch size per device during training
    per_device_eval_batch_size=1,   # batch size for evaluation
    warmup_steps=100,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=8,
    do_train=True,
    do_eval=True,
    fp16=True,
    evaluation_strategy="epoch",
    # load_best_model_at_end needs checkpoints saved on the same schedule as
    # evaluation, otherwise there is no per-epoch checkpoint to reload.
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model= 'f1macro',
    gradient_accumulation_steps=8,   # effective batch size of 8 with batch size 1
    seed = 12,   
)
# The embedding matrix must match the tokenizer size used to build the dataset.
total_num_embeddings = train_dataset.length_tokenizer()
model = MultiModel(training_args, total_num_embeddings, model_name='Roberta', model_path='roberta-base')
trainer = model.train(train_dataset, val_dataset)   # only builds the Trainer
trainer.train()
# NOTE(review): the output name says "withlanguageidentifier", but the datasets
# in this notebook are built with lang_identifier left at its default (False) - confirm.
trainer.save_model('./roberta_multiplechoice_onlylastlayer_withlanguageidentifier')

In [None]:
# Rebuild a Trainer around the previously saved fine-tuned model (no training
# is started here), so it can be used for prediction in the cells below.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,              # total number of training epochs
    per_device_train_batch_size=1,  # batch size per device during training
    per_device_eval_batch_size=1,   # batch size for evaluation
    warmup_steps=100,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=8,
    do_train=True,
    do_eval=True,
    fp16=True,
    evaluation_strategy="epoch",
    # load_best_model_at_end needs checkpoints saved on the same schedule as
    # evaluation, otherwise there is no per-epoch checkpoint to reload.
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model= 'f1macro',
    gradient_accumulation_steps=8,
    seed = 12,   
)
total_num_embeddings = train_dataset.length_tokenizer()
model = MultiModel(training_args,total_num_embeddings, model_name='Roberta',
                   model_path='./roberta_multiplechoice_onlylastlayer_withlanguageidentifier')
trainer = model.train(train_dataset, val_dataset)

In [None]:
def write_csv(results, path):
    '''Apply a softmax to every row of predictions and write the probabilities to a csv file.

    Arguments:
        results: an iterable of rows of logits (e.g. a 2-D numpy array), one row per example
        path: destination of the csv file; an existing file is overwritten

    The file starts with the header idx, choice1 .. choice6 and then holds one
    line per example: its position in `results` followed by its probabilities.'''
    columns = ['idx', 'choice1', 'choice2', 'choice3', 'choice4', 'choice5', 'choice6']
    rows = []
    for index, row in enumerate(results):
        # float64, as the original dtype=float, to keep the written precision.
        logits = torch.as_tensor(row, dtype=torch.float64)
        rows.append([index] + torch.softmax(logits, dim=0).tolist())

    # newline='' lets the csv module control line endings (no blank lines
    # between rows on Windows).
    with open(path, 'w', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(columns)
        writer.writerows(rows)

In [None]:
# Tokenize the unlabelled test set, run the model on it and export the
# per-choice probabilities.
test_data = pd.read_csv('./coveo/test.csv')
test_dataset = MultiDatset(
    test_data,
    tokenizer_name='Roberta',
    tokenizer_pth='roberta-base',
    max_length=256,
    seq_length=200,
    mode='test',          # no labels are read in this mode
)
results = trainer.predict(test_dataset)
# First field of the prediction output: the raw predictions (logits).
write_csv(results.predictions,
          './results_roberta_multiplechoice_onlylastlayer_withlanguageidentifier.csv')

In [None]:
def split_dataset_langauge(dataframe):
    '''Split a dataframe into English, French and "other" (mis-classified) rows.

    The language of a row is the majority vote of the fastText language
    predictions over its non-missing choices ('choice1' .. 'choice6').

    Arguments:
        dataframe: pandas dataframe holding the choice columns

    Returns:
        (en_dataset, fr_dataset, miss_dataset): three new dataframes that keep
        the original index and row order. Rows whose majority language is
        neither English nor French, or that have no choices at all, go to
        miss_dataset.'''
    model = fasttext.load_model('./coveo/lid.176.bin')
    choice_columns = ['choice1', 'choice2', 'choice3', 'choice4', 'choice5', 'choice6']

    languages = []
    pbar = tqdm(total=len(dataframe))
    for _, row in dataframe.iterrows():
        votes = Counter(model.predict(str(word).lower())[0][0]
                        for word in row[choice_columns] if pd.notna(word))
        # A row without any choice has no majority language.
        languages.append(votes.most_common(1)[0][0] if votes else None)
        pbar.update(1)
    pbar.close()

    # Select by boolean mask instead of dropping rows one at a time: linear
    # time, and safe when the index contains duplicate labels.
    is_en = np.array([lang == '__label__en' for lang in languages], dtype=bool)
    is_fr = np.array([lang == '__label__fr' for lang in languages], dtype=bool)
    en_dataset = dataframe[is_en].copy()
    fr_dataset = dataframe[is_fr].copy()
    miss_dataset = dataframe[~(is_en | is_fr)].copy()
    return en_dataset, fr_dataset, miss_dataset

In [None]:
# split_dataset_langauge requires the dataframe to split; the validation split
# is used here because the resulting datasets are evaluated with labels.
# NOTE(review): assumed val_split is the intended input - confirm.
en_dataset, fr_dataset, miss_dataset = split_dataset_langauge(val_split)

# Tokenize each language subset with the settings used for validation.
dataset_kwargs = {
    'tokenizer_name': 'Roberta',
    'tokenizer_pth': 'roberta-base',
    'max_length': 256,
    'seq_length': 200,
    'mode': 'validation',
}
en_multidataset = MultiDatset(en_dataset, **dataset_kwargs)
fr_multidataset = MultiDatset(fr_dataset, **dataset_kwargs)
miss_multidataset = MultiDatset(miss_dataset, **dataset_kwargs)

In [None]:
# Evaluate each language subset and print its metrics, so that every result
# is shown (a notebook cell only displays its last expression).
for subset_name, subset in (('en', en_multidataset),
                            ('fr', fr_multidataset),
                            ('miss', miss_multidataset)):
    print(subset_name, trainer.predict(subset).metrics)

In [None]:
def calculate_seq_length(dataframe, tokenizer):
    '''Split a dataframe into 4 bins based on the input sequence length.

    The length of a row is the number of tokens of its 'text' (with '[BLANK]'
    replaced by '<mask>', special tokens not counted). The bins are
    [0, 50), [50, 100), [100, 150) and 150 or more tokens.

    Arguments:
        dataframe: pandas dataframe with a 'text' column
        tokenizer: tokenizer used to count the tokens

    Returns:
        a tuple of 4 dataframes (bin1, bin2, bin3, bin4) with the columns of
        the input and a fresh index'''

    bins = ([], [], [], [])
    bar = tqdm(total=len(dataframe))
    for _, row in dataframe.iterrows():
        text = row['text'].replace('[BLANK]', '<mask>')
        # NOTE(review): measures the token count of the full text - confirm this
        # is the intended "sequence length" (the original called an undefined helper).
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        # 50-token wide bins; everything from 150 tokens up lands in the last one.
        bins[min(length // 50, 3)].append(row.tolist())
        bar.update(1)
    bar.close()
    return tuple(pd.DataFrame(rows, columns=dataframe.columns) for rows in bins)

In [None]:
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
# Bin the validation rows by sequence length (the function defined above is
# named calculate_seq_length).
bin1, bin2, bin3, bin4 = calculate_seq_length(val_split, tokenizer)

# Tokenize every bin with the validation settings. The names follow the
# *_multidataset spelling that the prediction cell expects.
dataset_kwargs = {
    'tokenizer_name': 'Roberta',
    'tokenizer_pth': 'roberta-base',
    'max_length': 256,
    'seq_length': 200,
    'mode': 'validation',
}
bin1_multidataset = MultiDatset(bin1, **dataset_kwargs)
bin2_multidataset = MultiDatset(bin2, **dataset_kwargs)
bin3_multidataset = MultiDatset(bin3, **dataset_kwargs)
bin4_multidataset = MultiDatset(bin4, **dataset_kwargs)

In [None]:
# Evaluate each sequence-length bin and print its metrics, so that every
# result is shown (a notebook cell only displays its last expression).
for bin_name, bin_dataset in (('bin1', bin1_multidataset),
                              ('bin2', bin2_multidataset),
                              ('bin3', bin3_multidataset),
                              ('bin4', bin4_multidataset)):
    print(bin_name, trainer.predict(bin_dataset).metrics)

In [None]:
def blank_position(dataframe, tokenizer):
    '''Find the position of the "BLANK" word in the input text and split the
    dataframe into 4 bins based on that position.

    The position is the index of the mask token in the tokenized 'text' (with
    '[BLANK]' replaced by '<mask>', special tokens not counted). The bins are
    [0, 50), [50, 100), [100, 150) and 150 or more.

    Arguments:
        dataframe: pandas dataframe with a 'text' column
        tokenizer: tokenizer providing mask_token_id

    Returns:
        a tuple of 4 dataframes (bin1, bin2, bin3, bin4) with the columns of
        the input and a fresh index

    Raises:
        ValueError: if a text contains no blank'''
    bins = ([], [], [], [])
    bar = tqdm(total=len(dataframe))

    for _, row in dataframe.iterrows():
        text = row['text'].replace('[BLANK]', '<mask>')
        # NOTE(review): the position is taken in the full, untrimmed text -
        # confirm (the original called an undefined helper).
        token_ids = tokenizer(text, add_special_tokens=False)['input_ids']
        position = token_ids.index(tokenizer.mask_token_id)
        # 50-token wide bins; positions from 150 up land in the last one.
        bins[min(position // 50, 3)].append(row.tolist())
        bar.update(1)
    bar.close()
    return tuple(pd.DataFrame(rows, columns=dataframe.columns) for rows in bins)

In [None]:
# Bin the validation rows by the position of the blank, then tokenize every
# bin with the validation settings.
balnkbin1, balnkbin2, balnkbin3, balnkbin4 = blank_position(val_split, tokenizer)

dataset_kwargs = {
    'tokenizer_name': 'Roberta',
    'tokenizer_pth': 'roberta-base',
    'max_length': 256,
    'seq_length': 200,
    'mode': 'validation',
}
blankbin1_multidataset = MultiDatset(balnkbin1, **dataset_kwargs)
blankbin2_multidatset = MultiDatset(balnkbin2, **dataset_kwargs)
blankbin3_multidatset = MultiDatset(balnkbin3, **dataset_kwargs)
blankbin4_multidatset = MultiDatset(balnkbin4, **dataset_kwargs)

In [None]:
# Evaluate each blank-position bin and print its metrics, so that every result
# is shown (a notebook cell only displays its last expression).
# The first bin was created as `blankbin1_multidataset`; the other three use
# the `_multidatset` spelling.
for bin_name, bin_dataset in (('blankbin1', blankbin1_multidataset),
                              ('blankbin2', blankbin2_multidatset),
                              ('blankbin3', blankbin3_multidatset),
                              ('blankbin4', blankbin4_multidatset)):
    print(bin_name, trainer.predict(bin_dataset).metrics)