In [1]:
import os
import torch
import pandas as pd
import transformers
from utils import text_to_dataloader, tokenize_word

In [2]:
# Locations of the three English ParTUT UD splits, all under the local "data" folder.
train_path, dev_path, test_path = (
    os.path.join("data", file_name)
    for file_name in (
        "en_partut-ud-train.conllu",
        "en_partut-ud-dev.conllu",
        "en_partut-ud-test.conllu",
    )
)

In [3]:
HEADER_CONST = "# sent_id = "  # sentence-id comment; no longer needed by the parser, kept for other cells
TEXT_CONST = "# text = "       # comment line that carries the raw sentence text
STOP_CONST = "\n"              # blank line that terminates a sentence block
COMMENT_PREFIX = "#"           # every CoNLL-U comment/metadata line starts with this
WORD_OFFSET = 1                # tab-separated column holding the word form
LABEL_OFFSET = 3               # tab-separated column holding the POS label


def txt_to_dataframe(data_path, skip_multiword_tokens=False):
    '''
    Read a UD (CoNLL-U) text file and convert it to a DataFrame, one row per word.

    Parameters
    ----------
    data_path : str or path-like
        Path of the CoNLL-U file. It is read as UTF-8.
    skip_multiword_tokens : bool, default False
        When True, token lines whose ID is not a plain integer (multiword
        ranges such as "2-3" and empty nodes such as "1.1") are dropped.
        The default keeps them, which matches the historical behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns "text", "word", "label" (in that order). "text" is the
        sentence exactly as it follows TEXT_CONST, including its trailing
        newline. The index restarts at 0 for every sentence.

    Raises
    ------
    ValueError
        If a token line appears before any text line of its sentence, or has
        too few tab-separated columns.
    '''
    texts, words, labels, positions = [], [], [], []
    text = None
    sent_words, sent_labels = [], []

    def _flush_sentence():
        # Move the buffered sentence into the flat output lists, then clear the
        # buffers so a repeated blank line cannot add the same sentence twice.
        texts.extend([text] * len(sent_words))
        words.extend(sent_words)
        labels.extend(sent_labels)
        positions.extend(range(len(sent_words)))
        sent_words.clear()
        sent_labels.clear()

    with open(data_path, "r", encoding="utf-8") as fp:
        for line_no, line in enumerate(fp, start=1):
            if line.startswith(TEXT_CONST):
                # A new sentence starts; keep the previous one even if its
                # terminating blank line was missing.
                _flush_sentence()
                text = line[len(TEXT_CONST):]
            elif line == STOP_CONST:
                # Value comparison: the old identity test (`is not`) only
                # worked through an interpreter implementation detail.
                _flush_sentence()
                text = None
            elif line.startswith(COMMENT_PREFIX):
                # sent_id, newdoc, newpar, ... carry no token information.
                continue
            else:
                fields = line.split("\t")
                if len(fields) <= max(WORD_OFFSET, LABEL_OFFSET):
                    raise ValueError(
                        f"{data_path}, line {line_no}: expected a tab-separated "
                        f"token line, got {line!r}"
                    )
                if text is None:
                    raise ValueError(
                        f"{data_path}, line {line_no}: token line found before "
                        f"any {TEXT_CONST!r} line"
                    )
                if skip_multiword_tokens and not fields[0].isdigit():
                    continue
                sent_words.append(fields[WORD_OFFSET])
                sent_labels.append(fields[LABEL_OFFSET])
        # The file may end without a trailing blank line.
        _flush_sentence()

    # Build the frame once (growing it with concat per sentence is quadratic).
    # A dict fixes the column order, which a set of column names does not.
    return pd.DataFrame(
        {"text": texts, "word": words, "label": labels},
        index=positions,
    )
            


In [4]:
# Parse every split with the same reader; results unpack in train/dev/test order.
df_train, df_dev, df_test = (
    txt_to_dataframe(split_path)
    for split_path in (train_path, dev_path, test_path)
)

In [5]:
# Label distribution of the training split.
df_train.loc[:, "label"].value_counts()

NOUN     9249
ADP      5220
PUNCT    5105
DET      4616
VERB     4126
ADJ      3410
AUX      2076
PROPN    2033
PRON     1734
ADV      1707
CCONJ    1472
PART     1168
NUM       787
SCONJ     627
X         140
SYM        42
_          27
INTJ        6
Name: label, dtype: int64

In [6]:
# Label distribution of the dev split.
df_dev.loc[:, "label"].value_counts()

NOUN     568
PUNCT    353
ADP      297
VERB     276
DET      266
ADJ      210
PRON     153
AUX      124
ADV      108
PROPN    107
CCONJ     88
NUM       60
PART      56
SCONJ     41
X         13
SYM        2
_          1
Name: label, dtype: int64

In [7]:
# Label distribution of the test split.
df_test.loc[:, "label"].value_counts()

NOUN     753
ADP      488
DET      439
PUNCT    339
VERB     326
AUX      234
ADJ      224
ADV      131
PRON     106
CCONJ     96
PROPN     90
PART      66
NUM       61
SCONJ     51
_          4
X          2
INTJ       2
Name: label, dtype: int64

In [8]:
# Preview the sentences lower-cased, stripped of newlines and prefixed with "[CLS] ".
df_train["text"].map(
    lambda sentence: "[CLS] " + sentence.lower().replace("\n", "")
)

0    [CLS] distribution of this license does not cr...
1    [CLS] distribution of this license does not cr...
2    [CLS] distribution of this license does not cr...
3    [CLS] distribution of this license does not cr...
4    [CLS] distribution of this license does not cr...
                           ...                        
5    [CLS] no shakespearean poems were included in ...
6    [CLS] no shakespearean poems were included in ...
7    [CLS] no shakespearean poems were included in ...
8    [CLS] no shakespearean poems were included in ...
9    [CLS] no shakespearean poems were included in ...
Name: text, Length: 43545, dtype: object

In [9]:
# Load the pretrained tokenizer for the "bert-base-uncased" checkpoint; it is
# passed to text_to_dataloader and tokenize_word in the cells below.
bert_tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")

In [10]:
# Fall back to the CPU when no GPU is visible so the notebook still runs end to
# end on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): 32 and 256 are presumably the batch size and the maximum
# sequence length -- confirm against utils.text_to_dataloader.
text_to_dataloader(df_train, device, 32, bert_tokenizer, 256)

<torch.utils.data.dataloader.DataLoader at 0x7f1b467a6ee0>

In [11]:
# Row-wise: call tokenize_word with each row's text_ids and word.
df_train.apply(
    lambda record: tokenize_word(record["text_ids"], record["word"], bert_tokenizer),
    axis="columns",
)

0    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1    [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2    [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3    [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4    [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                           ...                        
5    [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
6    [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
7    [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
8    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...
9    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...
Length: 43545, dtype: object

In [12]:
# Debugging aid: prints every column as a Series; the result itself is all None.
df_train.apply(print)

0    Distribution of this license does not create a...
1    Distribution of this license does not create a...
2    Distribution of this license does not create a...
3    Distribution of this license does not create a...
4    Distribution of this license does not create a...
                           ...                        
5    No Shakespearean poems were included in the Fi...
6    No Shakespearean poems were included in the Fi...
7    No Shakespearean poems were included in the Fi...
8    No Shakespearean poems were included in the Fi...
9    No Shakespearean poems were included in the Fi...
Name: text, Length: 43545, dtype: object
0    Distribution
1              of
2            this
3         license
4            does
         ...     
5              in
6             the
7           First
8           Folio
9               .
Name: word, Length: 43545, dtype: object
0     NOUN
1      ADP
2      DET
3     NOUN
4      AUX
     ...  
5      ADP
6      DET
7      ADJ
8     NOUN
9    

text          None
word          None
label         None
label_idx     None
text_ids      None
attn_mask     None
query_mask    None
dtype: object

In [13]:
# Display the training frame as it stands after the cells above
# (pandas truncates the middle rows in the rendered output).
df_train

Unnamed: 0,text,word,label,label_idx,text_ids,attn_mask,query_mask
0,Distribution of this license does not create a...,Distribution,NOUN,7,"[101, 4353, 1997, 2023, 6105, 2515, 2025, 3443...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Distribution of this license does not create a...,of,ADP,1,"[101, 4353, 1997, 2023, 6105, 2515, 2025, 3443...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Distribution of this license does not create a...,this,DET,5,"[101, 4353, 1997, 2023, 6105, 2515, 2025, 3443...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Distribution of this license does not create a...,license,NOUN,7,"[101, 4353, 1997, 2023, 6105, 2515, 2025, 3443...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Distribution of this license does not create a...,does,AUX,3,"[101, 4353, 1997, 2023, 6105, 2515, 2025, 3443...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...
5,No Shakespearean poems were included in the Fi...,in,ADP,1,"[101, 2053, 8101, 2319, 5878, 2020, 2443, 1999...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
6,No Shakespearean poems were included in the Fi...,the,DET,5,"[101, 2053, 8101, 2319, 5878, 2020, 2443, 1999...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
7,No Shakespearean poems were included in the Fi...,First,ADJ,0,"[101, 2053, 8101, 2319, 5878, 2020, 2443, 1999...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
8,No Shakespearean poems were included in the Fi...,Folio,NOUN,7,"[101, 2053, 8101, 2319, 5878, 2020, 2443, 1999...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."


In [17]:
# How many rows have each query_mask total (sum of the mask entries).
df_train["query_mask"].map(sum).value_counts()

1    41100
2     1421
3      757
4      164
0       72
5       25
6        6
Name: query_mask, dtype: int64

In [21]:
# Rows whose query_mask sums to zero.
df_train.loc[df_train["query_mask"].map(sum).eq(0)]

Unnamed: 0,text,word,label,label_idx,text_ids,attn_mask,query_mask
18,"Madam President, if the vote records correctly...",can,AUX,3,"[101, 21658, 2343, 1010, 2065, 1996, 3789, 263...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6,If your ruling is that I cannot give an explan...,cannot,_,17,"[101, 2065, 2115, 6996, 2003, 2008, 1045, 2064...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"Madam President, we cannot and must not accept...",cannot,_,17,"[101, 21658, 2343, 1010, 2057, 2064, 2025, 199...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
19,There will be major problems with enforcing th...,cannot,_,17,"[101, 2045, 2097, 2022, 2350, 3471, 2007, 2745...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"In particular, annexes cannot be adapted to ta...",cannot,_,17,"[101, 1999, 3327, 1010, 17827, 2229, 2064, 202...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...
3,"By the 1980s, Shakespeare studies were open to...",s,X,16,"[101, 2011, 1996, 3865, 1010, 8101, 2913, 2020...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
15,Proposed alternative candidates include Franci...,17,NUM,8,"[101, 3818, 4522, 5347, 2421, 4557, 11611, 101...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
16,Proposed alternative candidates include Franci...,th,ADJ,0,"[101, 3818, 4522, 5347, 2421, 4557, 11611, 101...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"From the 18th century, the desire for authenti...",18,NUM,8,"[101, 2013, 1996, 4985, 2301, 1010, 1996, 4792...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
