In [None]:
!pip install transformers -q
from transformers import LongformerForMultipleChoice, LongformerTokenizerFast, AdamW, Trainer, TrainingArguments, RobertaForMultipleChoice, RobertaTokenizerFast
import torch
from torch.utils.data import Dataset, DataLoader
import transformers
import numpy as np
import pandas as pd
import sys
import torch.nn as nn
import torch.nn.functional as f
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score 
import copy
from transformers.tokenization_utils_base import PaddingStrategy, TruncationStrategy
from prettytable import PrettyTable
import csv


In [None]:
# Read the training examples from train.csv in the working directory and
# preview the first rows as the cell output.
train_data = pd.read_csv('./train.csv')
train_data.head()

In [None]:
def count_parameters(dataframe: pd.DataFrame):
    """Print how the values of the 'label' column are distributed.

    Input: a pandas dataframe that has a 'label' column
    Output: None. A two-column table (label, relative frequency) is printed,
    with every frequency rounded to four decimals."""

    # normalize=True turns the raw counts into fractions of the total.
    shares = dataframe['label'].value_counts(normalize=True)
    rows = [[label_name, round(share, 4)]
            for label_name, share in zip(shares.index, shares.values)]

    table = PrettyTable(["Labels", "Distribution"])
    for row in rows:
        table.add_row(row)

    print(table)


In [None]:
# Print the label distribution of the full training dataframe.
count_parameters(train_data)

In [None]:
class MultiDatset(Dataset):

    """Dataset that prepares examples for a multiple-choice model.

    Every dataframe row is expanded into one (text, choice) pair per available
    choice and tokenized up front, giving tensors of shape
    [number_of_choices, length_of_the_text] in the following format

                    <s>text</s></s>choice</s><pad>

    Arguments:

        dataframe: a pandas dataframe containing the examples. It must provide the
            columns 'text' and 'choice1' ... 'choice6', plus 'label' in train mode.
        tokenizer_name: Tokenizer model name, 'Longformer' or 'Roberta'. Default is 'Longformer'
        tokenizer_pth: Path to load the tokenizer from. Default is 'allenai/longformer-base-4096'
        mode: 'train' if the dataframe has labels; any other value (e.g. 'test')
            produces items without a label. Default is 'train'
        max_length: The longest possible sequence; longer inputs are truncated. Default is 512.


    Return:

        A python dataset whose items are the tokenizer output (input_ids,
        attention_mask, ...) with an extra 'label' entry in train mode."""

    def __init__(self, dataframe: pd.DataFrame, tokenizer_name: str='Longformer',
                 tokenizer_pth: str= 'allenai/longformer-base-4096', mode: str='train', max_length: int =512):
        """Read the dataframe row by row, tokenize each row and cache the result.

        Arguments:

            dataframe: pandas dataframe with the examples
            tokenizer_name: key selecting the tokenizer class ('Longformer' or 'Roberta')
            tokenizer_pth: name or path handed to the tokenizer's from_pretrained
            mode: 'train' to attach labels, anything else to skip them
            max_length: maximum tokenized length per (text, choice) pair

        Attributes:

            self.instances: a list holding, for every row, the tuple
                (tokenized_input, label). label is a one-item list in train mode
                and an empty string otherwise.
            self.mode: the mode the dataset was built with

        Raises:
            KeyError: if tokenizer_name is unknown, a required column is missing,
                or a label is not one of 'choice1' ... 'choice6'."""

        tokenizer_models = {'Longformer': LongformerTokenizerFast,
                           'Roberta': RobertaTokenizerFast
                           }

        tokenizer = tokenizer_models[tokenizer_name].from_pretrained(tokenizer_pth)

        self.instances = []
        self.mode = mode
        for _, row in dataframe.iterrows():
            choices = self.extract_choices(row)
            # The text is repeated once per choice so the tokenizer builds one
            # (text, choice) pair per row of the returned tensors; padding=True
            # pads only up to the longest pair of this example.
            tokenized_input = tokenizer([row['text']]*len(choices), choices, max_length=max_length,
                                        return_tensors='pt', padding=True, truncation=True)

            if self.mode == 'train':
                label = [self.get_label(row['label'])]
            else:
                label = ''
            self.instances.append((tokenized_input, label))

    def __getitem__(self, index: int):
        """Returns a specific item in the dataset

        Arguments:
            index: the item's index

        Returns:
            the cached tokenizer output (input_ids, attention_mask, ...); in
            train mode it additionally carries 'label' as a tensor of shape (1,)"""

        tokenized_texts, label = self.instances[index]
        if self.mode == 'train':
            # The label is written into the cached encoding itself, so repeated
            # access simply overwrites the same entry.
            tokenized_texts['label'] = torch.tensor(label)

        return tokenized_texts

    def __len__(self):
        '''returns length of the dataset'''
        return len(self.instances)

    def extract_choices(self, instance: pd.core.series.Series):
        """Get a row from the pandas dataframe and return its available choices.

        Missing values (NaN, None, pd.NA) are skipped. Every remaining value is
        returned as a string, so numeric choices are kept instead of being
        mistaken for missing values.

        Arguments:

            instance: a pandas dataframe row with the columns 'choice1' ... 'choice6'

        Returns:
            list: the non-missing choices as strings, in column order
        """
        # NOTE(review): get_label maps 'choiceN' to N-1 no matter which choices are
        # skipped here. If a missing choice precedes the labelled one, the label no
        # longer points at the right row of the encoding - confirm that missing
        # choices only ever occur at the end.
        choices = []
        for i in range(1, 7):
            choice = instance['choice' + str(i)]
            if pd.isna(choice):
                continue
            choices.append(str(choice))
        return choices

    def get_label(self, label: str):
        '''Gets a label string and maps it to its numerical label

        Arguments:
            label: the label name, one of 'choice1' ... 'choice6'

        Return:
            int: the zero-based index of the labelled choice

        Raises:
            KeyError: if label is not one of the six known names'''
        return{
            'choice1': 0,
            'choice2': 1,
            'choice3': 2,
            'choice4': 3,
            'choice5': 4,
            'choice6': 5,
        }[label]
    

In [None]:
# Smoke test: build MultiDatset from the first 10 rows and inspect the shape of
# one item's input_ids. A dedicated name is used so this tiny dataset can never
# be mistaken for the real training set when cells are re-run out of order.
sample_dataset = MultiDatset(train_data.iloc[:10], 'Longformer')
sample_dataset[5]['input_ids'].shape

Since the provided dataset is very large, we use only the first 10,000 instances and split that subset 70%/30% into training and validation sets.

In [None]:
# Hold out 30% of the first 10,000 rows for validation (fixed seed), then
# tokenize both parts with the default MultiDatset settings.
train_split, val_split = train_test_split(
    train_data[:10000],
    random_state=2021,
    test_size=0.3,
)
train_dataset = MultiDatset(train_split)
val_dataset = MultiDatset(val_split)


In [None]:
class MultiModel():

    """Wraps a pretrained transformers multiple-choice model (a language model
    with a classifier on top) together with the arguments used to train it.

    Arguments:
        train_args: the transformers.TrainingArguments handed to the Trainer
        model_name: which base model to use. Options: Longformer, Roberta
        model_path: name or location the pretrained weights are loaded from"""

    def __init__(self,train_args: transformers.TrainingArguments, model_name: str = 'Longformer',
                 model_path: str = 'allenai/longformer-base-4096'):
        '''Load the pretrained model and remember the training arguments'''
        architectures = dict(Longformer=LongformerForMultipleChoice,
                             Roberta=RobertaForMultipleChoice)
        model_class = architectures[model_name]
        self.model = model_class.from_pretrained(model_path)
        self.train_args = train_args

    def train(self, train_dataset: MultiDatset, val_dataset: MultiDatset):

        """Build a transformers Trainer for the wrapped model.

        Training is not started here; the caller invokes .train() on the
        returned object.

        Arguments:
            train_dataset: a MultiDatset containing the training instances
            val_dataset: a MultiDatset containing the validation instances
        Returns:

            A Trainer instance configured with the stored training arguments,
            both datasets and this object's compute_metrics."""

        trainer_settings = dict(
            model=self.model,
            args=self.train_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=self.compute_metrics,
        )
        return Trainer(**trainer_settings)


    def compute_metrics(self, pred):
        '''Calculate the evaluation metrics.

        Arguments:
            pred: object exposing label_ids and predictions; the predicted
                class is the argmax over the last axis of predictions

        Returns:
            dict with accuracy, micro/macro F1, macro precision, macro recall
            and MBE. A classification report is printed as a side effect.'''
        y_true = pred.label_ids
        y_hat = pred.predictions.argmax(-1)
        scores = {
            'accuracy': accuracy_score(y_true, y_hat),
            'f1micro': f1_score(y_true, y_hat, average='micro'),
            'f1macro': f1_score(y_true, y_hat, average='macro'),
            'precision': precision_score(y_true, y_hat, average='macro'),
            'recall': recall_score(y_true, y_hat, average='macro'),
        }
        print(classification_report(y_true, y_hat))
        scores['MBE'] = self.MBE(y_true, y_hat)
        return scores

    def MBE(self, y_true, y_pred):
        '''
        Mean of the element-wise difference (observed - predicted).

        Parameters:
            y_true (array): Array of observed values
            y_pred (array): Array of prediction values

        Returns:
            mbe (float): Bias score
        '''
        observed = np.array(y_true)
        predicted = np.array(y_pred)
        # Both inputs become column vectors so the subtraction is element-wise
        # even if one of them arrives as (n, 1) and the other as (n,).
        observed_col = observed.reshape(len(observed), 1)
        predicted_col = predicted.reshape(len(predicted), 1)
        return (observed_col - predicted_col).mean()

    

Now we create a `TrainingArguments` instance and pass it to `MultiModel`. We then build a `Trainer` object, start training, and finally predict on a test set.

In [None]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=1,  # batch size per device during training
    per_device_eval_batch_size=1,   # batch size for evaluation
    warmup_steps=100,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=7,
    do_train=True,
    do_eval=True,
    fp16=torch.cuda.is_available(),  # mixed precision needs a CUDA device
    evaluation_strategy="epoch",
    # load_best_model_at_end can only pick a checkpoint that was saved right
    # after an evaluation, so checkpoints are written on the same schedule.
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    gradient_accumulation_steps=8,   # effective train batch size of 8 per device
    seed = 12,
)

# MultiModel.train only builds the Trainer; the actual training starts with trainer.train().
model = MultiModel(training_args,model_name='Longformer')
trainer = model.train(train_dataset, val_dataset)
trainer.train()

In [None]:
# Persist the trained model through the trainer under the given output path.
trainer.save_model('longformer_multiplechoice10000_512.pth/')

In [None]:
# Run prediction on the validation split; the returned result is shown as the cell output.
trainer.predict(val_dataset)

In [None]:
# Held-out check on the 10,000 labelled rows that directly follow the subset used
# for training/validation (that subset ends at row 9999, so this one starts at 10000).
testset = MultiDatset(train_data[10000:20000])
trainer.predict(testset)

In [None]:
def write_csv(results, path):
    '''Apply a softmax to every row of logits and write the probabilities to a CSV file.

    Arguments:
        results: either the 2-D array of logits (one row per example), or a
            prediction output whose `predictions` attribute / first tuple
            element is that array
        path: destination of the CSV file; an existing file is overwritten

    The file starts with the header idx, choice1 ... choice6, followed by one
    line per example holding its running index and its probabilities.'''
    # Accept the raw logits as well as the complete prediction output, so the
    # caller does not have to know which of the two is expected.
    if hasattr(results, 'predictions'):
        predictions = results.predictions
    elif isinstance(results, tuple):
        predictions = results[0]
    else:
        predictions = results

    columns = ['idx', 'choice1', 'choice2', 'choice3', 'choice4', 'choice5', 'choice6']
    rows = []
    for index, row in enumerate(predictions):
        # float64 keeps the precision of the original implementation.
        probabilities = torch.softmax(torch.as_tensor(row, dtype=torch.float64), dim=0)
        rows.append([index] + probabilities.tolist())

    # newline='' lets the csv module control line endings itself (avoids blank
    # lines between records on Windows).
    with open(path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(columns)
        writer.writerows(rows)

In [None]:
# Predict on the unlabelled test file and export the per-choice probabilities.
test_data = pd.read_csv('./test.csv')
test_dataset = MultiDatset(test_data,mode='test', max_length= 4096)
results = trainer.predict(test_dataset)
# Hand over the whole prediction output: write_csv extracts the logits itself, so
# indexing with [0] here as well would leave it with a single row instead of the array.
write_csv(results, './results.csv')