In [2]:
!pip install seqeval
!pip install transformers
!pip install sentencepiece

Collecting seqeval
  Using cached seqeval-1.2.2.tar.gz (43 kB)
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16181 sha256=a74c7bb7cc842ea6f752eccc13dbd632d36cd4ae392e0f70a660fccf453aba58
  Stored in directory: /root/.cache/pip/wheels/39/29/36/1c4f7905c133e11748ca375960154964082d4fb03478323089
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Collecting transformers
  Using cached transformers-4.15.0-py3-none-any.whl (3.4 MB)
Collecting filelock
  Downloading filelock-3.4.1-py3-none-any.whl (9.9 kB)
Collecting huggingface-hub<1.0,>=0.1.0
  Using cached huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |██████████████████████

In [3]:
import torch
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
import config as config
from importlib import reload

import pandas as pd
import s3fs
from torch.utils.data import Dataset, DataLoader
import random
from transformers import AdamW
import gc
from torch.utils.data import Dataset, DataLoader



In [4]:
class loadDatafromFile:

    '''Reads four line-aligned text files through s3fs and combines them into one dataframe.

    The four paths (source sentences, target sentences, source tags, target
    tags) default to the values found in ``config`` when the class is defined.
    '''

    def __init__(self,filepath_src= config.filePath_src,filePath_tar=config.filePath_tar, filePath_srcTags=config.filePath_srcTags,
                 filePath_tarTags=config.filePath_tarTags):

        # Keep the four locations; nothing is read until createDf() is called.
        self.filePath_src, self.filePath_tar = filepath_src, filePath_tar
        self.filePath_srcTags, self.filePath_tarTags = filePath_srcTags, filePath_tarTags

    def load_data(self,file):
        '''Return the lines of ``file`` as a list of UTF-8 decoded strings.

        Lines are returned exactly as iterated from the file handle (no
        stripping is done here).
        '''
        s3 = s3fs.S3FileSystem()
        with s3.open(file, encoding = "utf-8") as handle:
            # Every item yielded by the handle is decoded explicitly; the
            # handle is closed when the ``with`` block exits.
            return [str(raw, encoding='utf-8') for raw in handle]

    def createDf(self):
        '''Build a dataframe with columns source, target, src_tokens, tar_tokens.

        Each column holds the whitespace-stripped lines of the corresponding
        file, so row ``i`` combines line ``i`` of all four files.
        '''
        locations = (self.filePath_src, self.filePath_tar,
                     self.filePath_srcTags, self.filePath_tarTags)
        # Files are read in the order: source, target, source tags, target tags.
        src_lines, tar_lines, src_tag_lines, tar_tag_lines = (
            [line.strip() for line in self.load_data(location)]
            for location in locations
        )

        frame = pd.DataFrame(columns=["source","target","src_tokens","tar_tokens"],dtype=object)
        return frame.assign(
            source=src_lines,
            target=tar_lines,
            src_tokens=src_tag_lines,
            tar_tokens=tar_tag_lines,
        )
        

In [18]:

class createTokenizedDf :
    '''Converts the sentence-level dataframe into a token-level dataframe.

    Every source word is split into subwords and each subword inherits the
    word's tag.  The target side is laid out as
    ``gap, word, gap, word, ..., gap``: a placeholder gap token is emitted
    before the first word and after every word, and the target tag line is
    consumed in that same alternating order (so ``2 * n_words + 1`` tags are
    read for ``n_words`` target words).

    Parameters
    ----------
    df : dataframe with string columns ``source``, ``src_tokens``, ``target``
        and ``tar_tokens``.
    model_type : ``'xlm'`` or ``'bert'``; selects the separator tokens and,
        unless ``tokenizer`` is given, the tokenizer taken from ``config``.
    tokenizer : optional object with a ``tokenize(word) -> list`` method.
        When omitted, ``config.TOKENIZER`` ('xlm') or
        ``config.TOKENIZER_BERT`` ('bert') is used, as before.
    '''

    # model_type -> (separator tokens after the source segment,
    #                separator tokens after the target segment)
    _SEPARATORS = {
        'xlm': (("</s>", "</s>"), ("</s>",)),
        'bert': (("[SEP]",), ("[SEP]",)),
    }

    # Placeholder written at every gap position of the target segment.
    # NOTE(review): in the recorded BERT run these positions encode to id 100,
    # presumably the unknown-token id -- confirm that is intended.
    _GAP_TOKEN = "का"

    # String tags -> integer labels; any other tag value is kept unchanged.
    _LABEL_IDS = {'OK': 1, 'BAD': 0, '-100': -100}

    def __init__(self,df,model_type,tokenizer=None):
        self.df = df
        self.model_type = model_type
        self.tokenizer = tokenizer

    def _resolve_tokenizer(self):
        '''Return the injected tokenizer, or the one ``config`` provides for this model type.'''
        if self.tokenizer is not None:
            return self.tokenizer
        return config.TOKENIZER if self.model_type == 'xlm' else config.TOKENIZER_BERT

    def convertDf(self):
        '''Return a dataframe with columns ``sentence_id``, ``words``, ``labels``.

        ``sentence_id`` is the 0-based position of the sentence pair in
        ``self.df``.  ``labels`` holds 1 for 'OK', 0 for 'BAD' and -100 for
        separator rows.

        Raises
        ------
        ValueError
            If ``model_type`` is neither 'xlm' nor 'bert'.
        IndexError
            If a target tag line has fewer tags than the gap/word layout needs.
        '''
        if self.model_type not in self._SEPARATORS:
            raise ValueError(
                "model_type must be 'xlm' or 'bert', got {!r}".format(self.model_type)
            )

        tokenizer = self._resolve_tokenizer()
        after_source, after_target = self._SEPARATORS[self.model_type]
        label_ids = self._LABEL_IDS
        rows = []

        def add_row(sentence_id, token, tag):
            # Map the tag while building the rows instead of calling
            # Series.replace afterwards, which relies on implicit downcasting.
            rows.append([sentence_id, token, label_ids.get(tag, tag)])

        sentence_pairs = zip(
            self.df["source"].tolist(),
            self.df["src_tokens"].tolist(),
            self.df["target"].tolist(),
            self.df["tar_tokens"].tolist(),
        )

        for sentence_id, (source_sentence, source_tag_line,
                          target_sentence, target_tag_line) in enumerate(sentence_pairs):

            # Source segment: every subword carries the tag of its word.
            for word, tag in zip(source_sentence.split(), source_tag_line.split()):
                for subword in tokenizer.tokenize(word):
                    add_row(sentence_id, subword, tag)

            for separator in after_source:
                add_row(sentence_id, separator, "-100")

            # Target segment: tags alternate gap, word, gap, word, ..., gap.
            sentence_tags = target_tag_line.split()
            add_row(sentence_id, self._GAP_TOKEN, sentence_tags[0])

            for position, word in enumerate(target_sentence.split()):
                word_tag = sentence_tags[2 * position + 1]
                for subword in tokenizer.tokenize(word):
                    add_row(sentence_id, subword, word_tag)
                # Gap that follows this word.
                add_row(sentence_id, self._GAP_TOKEN, sentence_tags[2 * position + 2])

            for separator in after_target:
                add_row(sentence_id, separator, "-100")

        return pd.DataFrame(rows, columns=['sentence_id', 'words', 'labels'])

In [38]:
class CompDataset(Dataset):
    '''Torch dataset that serves one padded, encoded sentence pair per item.

    Parameters
    ----------
    df : token-level dataframe with columns ``sentence_id``, ``words`` and
        ``labels`` (integer labels).
    model_type : ``'xlm'`` or ``'bert'``; selects the leading/padding ids and,
        unless ``tokenizer`` is given, the tokenizer taken from ``config``.
    tokenizer : optional object with ``convert_tokens_to_ids(list) -> list``.
        Defaults to ``config.TOKENIZER`` ('xlm') / ``config.TOKENIZER_BERT``
        ('bert').
    max_len : optional fixed sequence length; defaults to ``config.MAX_LEN``.
    '''

    # model_type -> (id placed in front of the sequence, id used for padding)
    _SPECIAL_IDS = {
        'xlm': (0, 1),      # 0 is the <s> token
        'bert': (101, 0),   # 101 is the '[CLS]' id reported by config.TOKENIZER_BERT in this notebook
    }

    def __init__(self, df,model_type,tokenizer=None,max_len=None):

        self.df_data = df
        self.model_type = model_type
        self.tokenizer = tokenizer
        self.max_len = max_len

    def _resolve_tokenizer(self):
        '''Return the injected tokenizer, or the one ``config`` provides for this model type.'''
        if self.tokenizer is not None:
            return self.tokenizer
        return config.TOKENIZER if self.model_type == 'xlm' else config.TOKENIZER_BERT

    def __getitem__(self, index):
        '''Return ``(input_ids, attention_mask, labels)`` for ``sentence_id == index``.

        All three are ``torch.long`` tensors of length ``max_len``.  The
        leading token and every padding position are labelled -100.

        Raises
        ------
        ValueError
            If ``model_type`` is neither 'xlm' nor 'bert'.
        AssertionError
            If the encoded sentence does not fit into ``max_len``.
        '''
        if self.model_type not in self._SPECIAL_IDS:
            raise ValueError(
                "model_type must be 'xlm' or 'bert', got {!r}".format(self.model_type)
            )
        first_id, pad_id = self._SPECIAL_IDS[self.model_type]
        max_len = config.MAX_LEN if self.max_len is None else self.max_len

        # Rows are selected by sentence_id *value*, while __len__ counts
        # distinct ids.
        # NOTE(review): the two only agree when ids are exactly 0..len-1;
        # confirm this holds for every dataframe passed in (e.g. fold subsets).
        sentence_rows = self.df_data.loc[self.df_data['sentence_id']==index]
        tokens = sentence_rows['words'].tolist()

        input_ids = [first_id] + self._resolve_tokenizer().convert_tokens_to_ids(tokens)
        labels = [-100] + sentence_rows['labels'].tolist()

        input_len = len(input_ids)
        n_padding = max_len - input_len  # negative when the sentence is too long

        input_ids.extend([pad_id] * n_padding)
        attention_mask = [1] * input_len + [0] * n_padding
        labels.extend([-100] * n_padding)

        too_long = "sentence_id {} encodes to {} tokens, max_len is {}".format(index, input_len, max_len)
        assert len(input_ids) == max_len, too_long
        assert len(attention_mask) == max_len, too_long
        assert len(labels) == max_len, too_long

        return (
            torch.tensor(input_ids, dtype=torch.long),
            torch.tensor(attention_mask, dtype=torch.long),
            torch.tensor(labels, dtype=torch.long),
        )

    def __len__(self):
        '''Number of distinct ``sentence_id`` values in the dataframe.'''
        return self.df_data['sentence_id'].nunique()

In [10]:
class createkfoldData():
    '''Splits a dataframe into ``config.FOLDS`` train/validation pairs with KFold.'''

    def __init__(self,dataframe):
        self.dataframe = dataframe

    def get_kfoldIndexes(self):
        '''Return ``(train_df_list, val_df_list)``, one dataframe per fold in each list.

        Entry ``i`` of both lists belongs to fold ``i``.  Every returned
        dataframe has its index reset to 0..n-1.

        NOTE(review): the split is over dataframe rows; if a token-level
        dataframe is passed, sentences would be split across folds -- confirm
        the caller passes the sentence-level dataframe.
        '''
        kf = KFold(n_splits=config.FOLDS)
        train_df_list = []
        val_df_list = []

        # KFold.split yields *positional* row indices, so rows are selected
        # with .iloc; label-based selection would only be correct for a
        # default 0..n-1 index.
        for train_positions, val_positions in kf.split(self.dataframe):
            train_df_list.append(self.dataframe.iloc[train_positions].reset_index(drop=True))
            val_df_list.append(self.dataframe.iloc[val_positions].reset_index(drop=True))

        return train_df_list,val_df_list
    

In [11]:
class createDataloaders():
    '''Wraps a dataset in a batched torch DataLoader.'''

    def __init__(self,dataset):
        self.dataset = dataset

    def createDataloaders(self, batch_size=None, shuffle=True, num_workers=2):
        '''Return a ``DataLoader`` over ``self.dataset``.

        Parameters
        ----------
        batch_size : items per batch; defaults to ``config.BATCH_SIZE``.
        shuffle : whether the loader shuffles the data (default True).
        num_workers : number of loader worker processes (default 2).
        '''
        if batch_size is None:
            batch_size = config.BATCH_SIZE
        # The loader yields the dataset in batches of ``batch_size`` items.
        return torch.utils.data.DataLoader(
            self.dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers,
        )

In [5]:
# Read the four files named in config and combine them into one
# sentence-level dataframe (columns: source, target, src_tokens, tar_tokens).
dataobj = loadDatafromFile(
    filepath_src=config.filePath_src,
    filePath_tar=config.filePath_tar,
    filePath_srcTags=config.filePath_srcTags,
    filePath_tarTags=config.filePath_tarTags,
)
dataset = dataobj.createDf()
dataset

Unnamed: 0,source,target,src_tokens,tar_tokens
0,José Ortega y Gasset visited Husserl at Freibu...,1934 besuchte José Ortega y Gasset Husserl in ...,OK OK OK OK OK OK OK OK OK OK OK,OK OK OK OK OK OK OK OK OK OK OK OK OK OK OK O...
1,"However , a disappointing ninth in China meant...",Eine enttäuschende Neunte in China bedeutete j...,OK OK OK BAD OK OK OK OK OK OK OK OK OK OK OK ...,OK BAD OK BAD OK BAD OK OK OK OK OK OK OK OK O...
2,"In his diary , Chase wrote that the release of...","In seinem Tagebuch , Chase schrieb , dass die ...",OK OK OK OK BAD BAD OK OK OK OK OK OK OK OK BA...,OK OK OK OK OK OK OK OK OK BAD OK BAD OK OK OK...
3,Heavy arquebuses mounted on wagons were called...,Schwere Arquebuses auf Waggons montiert wurden...,OK BAD BAD OK OK OK OK OK OK OK OK,OK OK OK BAD OK OK OK OK BAD BAD OK OK OK OK O...
4,Once North Pacific salmon die off after spawni...,Sobald der nordpazifische Lachs nach dem Laich...,OK OK OK OK BAD OK OK OK OK OK BAD OK BAD OK O...,OK OK OK BAD OK OK OK BAD OK OK OK OK OK OK OK...
...,...,...,...,...
6995,Some may also discourage or disallow unsanitar...,Einige können auch unhygienische Praktiken wie...,OK OK OK OK OK OK OK OK OK OK OK OK OK OK OK O...,OK OK OK OK OK OK OK OK OK OK OK OK OK OK OK O...
6996,"In the late 1860s , the crinolines disappeared...",In den späten 1860er Jahren verschwanden die K...,OK OK OK OK OK OK OK OK OK OK BAD BAD OK OK OK...,OK OK OK OK OK OK OK OK OK OK OK OK OK OK OK O...
6997,"Disco was criticized as mindless , consumerist...","Disco wurde als geistlos , konsumistisch , übe...",OK OK OK OK BAD OK BAD OK OK OK OK OK,OK OK OK OK OK OK OK BAD OK OK OK BAD OK OK OK...
6998,Planters would then fill large hogsheads with ...,Die Pflanzer würden dann große Heuschrecken mi...,OK OK OK BAD OK BAD OK OK BAD BAD OK OK OK OK OK,OK OK OK OK OK BAD OK OK OK OK OK BAD OK OK OK...


In [39]:
# Expand the sentence-level dataframe into one row per subword, using the
# 'bert' branch of createTokenizedDf.
tokenObj = createTokenizedDf(dataset,model_type = 'bert')
tokenized_data = tokenObj.convertDf()
tokenized_data

Unnamed: 0,sentence_id,words,labels
0,0,jose,1
1,0,ortega,1
2,0,y,1
3,0,gas,1
4,0,##set,1
...,...,...,...
494788,6999,##e,1
494789,6999,का,1
494790,6999,.,1
494791,6999,का,1


In [40]:
# Show the token rows that belong to sentence_id 0.
tokenized_data[tokenized_data.sentence_id == 0]

Unnamed: 0,sentence_id,words,labels
0,0,jose,1
1,0,ortega,1
2,0,y,1
3,0,gas,1
4,0,##set,1
5,0,visited,1
6,0,hu,1
7,0,##sser,1
8,0,##l,1
9,0,at,1


In [17]:
# Look up the id config.TOKENIZER_BERT assigns to '[CLS]' (CompDataset prepends
# the literal 101 in its 'bert' branch).
config.TOKENIZER_BERT.convert_tokens_to_ids('[CLS]')

101

In [41]:
# Wrap the token-level dataframe in CompDataset ('bert' branch) and fetch the
# item for sentence_id 0: a tuple (input_ids, attention_mask, labels).
train_data = CompDataset(tokenized_data,model_type = 'bert')
train_data[0]

(tensor([  101,  4560, 25859,  1061,  3806, 13462,  4716, 15876, 18116,  2140,
          2012, 22871,  1999,  4579,  1012,   102,   100,  4579,   100,  2022,
          6342, 10143,  2063,   100,  4560,   100, 25859,   100,  1061,   100,
          3806, 13462,   100, 15876, 18116,  2140,   100,  1999,   100, 22871,
           100,  1012,   100,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [44]:
# Ad-hoc probe: decode id 70077 with config.TOKENIZER (the tokenizer used by
# the 'xlm' branch).
config.TOKENIZER.convert_ids_to_tokens(70077)

'▁Eve'

In [37]:
# Ad-hoc probe: look up the id config.TOKENIZER assigns to the token 'gas'.
config.TOKENIZER.convert_tokens_to_ids('gas')

3923