In [1]:
from transformers import BertTokenizer, BertTokenizerFast
import torch

from html import unescape
import re

from pprint import pprint
from tabulate import tabulate

from loadData import loadData
import const

In [2]:
# Directory that receives the tokenizer files written by save_pretrained() below.
save_directory = '.'

# Load the pretrained 'bert-base-cased' tokenizer and register the marker
# strings <E>, </E>, <URL> and @USER as additional special tokens.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
tokenizer.add_special_tokens(
    {"additional_special_tokens": ["<E>", "</E>", "<URL>", "@USER"]})
# NOTE(review): these are the same four strings that were just passed to
# add_special_tokens(); this second registration looks redundant — confirm
# against the installed transformers version before removing it.
tokenizer.add_tokens(["<E>", "</E>", "<URL>", "@USER"])
# Persist the extended tokenizer into save_directory.
tokenizer.save_pretrained(save_directory)

('./vocab.txt', './special_tokens_map.json', './added_tokens.json')

In [3]:
def parseHTMLTagInInputText(input_text):
    """Return ``input_text`` with HTML character references decoded.

    The work is delegated to ``html.unescape``, so named and numeric
    references (for example ``&amp;`` or ``&#39;``) are replaced by the
    characters they stand for. Text without such references is returned
    unchanged.
    """
    decoded_text = unescape(input_text)
    return decoded_text

In [4]:
# Credit: https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
# Compiled once when the cell runs instead of on every demoji() call.
_EMOJI_PATTERN = re.compile(
    "["
    # Every supplementary-plane code point. This already contains all emoji
    # blocks from U+1F000 upwards (emoticons, symbols & pictographs,
    # transport & map symbols, regional-indicator flags, U+1F251, ...).
    "\U00010000-\U0010FFFF"
    # Box drawing through miscellaneous symbols and arrows. This range also
    # contains the dingbats U+2702-U+27B0, the gender signs U+2640-U+2642
    # and U+2600-U+2B55.
    "\u2500-\u2BEF"
    # BMP emoji outside the range above are listed one by one. A single
    # U+24C2..U+1F251 range would also swallow CJK ideographs, kana, Hangul
    # and most other BMP scripts, which is not what "demoji" should do.
    "\u24C2"  # circled latin capital letter M
    "\u231a"  # watch
    "\u23cf"  # eject symbol
    "\u23e9"  # fast-forward
    "\u3030"  # wavy dash
    "\u303d"  # part alternation mark
    "\u3297"  # circled ideograph congratulation
    "\u3299"  # circled ideograph secret
    "\u200d"  # zero-width joiner (glues emoji sequences together)
    "\ufe0f"  # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def demoji(input_text):
    """Return ``input_text`` with emoji and emoji-related code points removed.

    Every run of characters matched by ``_EMOJI_PATTERN`` is replaced by the
    empty string; surrounding whitespace is left untouched. Ordinary text,
    including non-Latin scripts in the Basic Multilingual Plane such as CJK,
    is preserved.

    Args:
        input_text: The string to clean.

    Returns:
        A new string without the matched characters.
    """
    return _EMOJI_PATTERN.sub('', input_text)


In [5]:
# Text preprocessing functions handed to loadData() through its
# input_text_processing_func_list argument when the data loaders are built.
input_text_processing_func_list = [parseHTMLTagInInputText]
# Alternative: use an empty list so that the HTML check cell further down
# (guarded by `parseHTMLTagInInputText not in input_text_processing_func_list`)
# actually scans the text.
# input_text_processing_func_list = []

In [6]:
# Maps each event name in const.EVENT_LIST to the training data loader
# returned by loadData() for that event.
train_dataloader_for_event = {}

for event in const.EVENT_LIST:
    # train_ratio = 1: per the size report printed by this cell, every
    # instance ends up in the train split and dev/test are empty.
    train_dataloader, dev_dataloader, test_dataloader, subtask_list = loadData(
        event, tokenizer, input_text_processing_func_list=input_text_processing_func_list, train_ratio = 1)
    # Only the training loader is stored; the other returned values are
    # overwritten on the next iteration.
    train_dataloader_for_event[event] = train_dataloader

Dataset Size Report: 26621 / 0 / 0 (train/dev/test)
Dataset Size Report: 13581 / 0 / 0 (train/dev/test)
Dataset Size Report: 15165 / 0 / 0 (train/dev/test)
Dataset Size Report: 15487 / 0 / 0 (train/dev/test)
Dataset Size Report: 12189 / 0 / 0 (train/dev/test)


**General Notes**

- I should have used only the unique tweets, as different candidate locations shouldn't influence the special cases I checked.
- For the data cleaning discussed here, we did nothing on the golden_chunk extraction stage.

# Check HTML Tag

Note that I didn't check what kinds of tags we encountered, nor their locations. My assumption is that very few instances will be impacted when extracting the golden chunk; the influence is more likely to be on BERT.
Therefore, an easy remedy is to apply `html.unescape` to both the `gold_chunk_list` and the input text (`masked_tokenized_tweet_text`).

In [7]:
# Count, per event, the instances whose text changes under html.unescape().
# The scan only runs when parseHTMLTagInInputText was NOT already applied
# while loading the data.
if parseHTMLTagInInputText not in input_text_processing_func_list:
    # event name -> list of (unescaped text, original text) pairs
    parsed_instance_list_for_event = {}

    for event in const.EVENT_LIST:
        parsed_instance_list = []
        total_count = 0
        parsed_instance_count = 0
        for batch in train_dataloader_for_event[event]:
            for batch_data in batch['batch_data']:
                # presumably element 0 is the masked tokenized tweet text,
                # as stated by the comment in the UNK cell below — TODO confirm
                masked_tokenized_text = batch_data[0]

                total_count += 1
                # An instance is affected when unescaping alters its text.
                if unescape(masked_tokenized_text) != masked_tokenized_text:
                    parsed_instance_list.append(
                        (unescape(masked_tokenized_text), masked_tokenized_text))
                    parsed_instance_count += 1
        parsed_instance_list_for_event[event] = parsed_instance_list
        print(f'EVENT {event:<20s}: {total_count} instances loaded in total, {parsed_instance_count} of them contains HTML tags.')
else:
    # NOTE(review): the message names `http.parser`, but the preprocessing
    # function in this notebook uses `html.unescape` — confirm the wording.
    print("Input text has already been parsed by http.parser")

Input text has already been parsed by http.parser


# Check the UNK token

In [8]:
# For every event, collect the instances whose encoded input contains the
# tokenizer's unknown-token id.
# event name -> list of (BERT token list, BERT decoded text, input text)
unk_instance_list_for_event = {}

for event in const.EVENT_LIST:
    unk_instance_list = []
    total_count = 0
    unk_instance_count = 0
    for batch in train_dataloader_for_event[event]:
        # batch_idx is used to look up the matching entry of batch['batch_data'].
        for batch_idx, input_id in enumerate(batch['input_ids']):
            total_count += 1
            if tokenizer.unk_token_id in input_id:
                unk_instance_count += 1
                unk_instance_list.append(
                    (tokenizer.convert_ids_to_tokens(input_id),
                     tokenizer.decode(input_id),
                     batch['batch_data'][batch_idx][0], # 0 is for the masked_tokenized_tweet_text
                     ))

    unk_instance_list_for_event[event] = unk_instance_list
    print(f'EVENT {event:<20s}: {total_count} instances loaded in total, {unk_instance_count} of them contains UNK token.')

EVENT positive            : 26621 instances loaded in total, 2576 of them contains UNK token.
EVENT negative            : 13581 instances loaded in total, 1361 of them contains UNK token.
EVENT can_not_test        : 15165 instances loaded in total, 1076 of them contains UNK token.
EVENT death               : 15487 instances loaded in total, 1709 of them contains UNK token.
EVENT cure_and_prevention : 12189 instances loaded in total, 1335 of them contains UNK token.


## Extract the UNK-ORI pair

Some thoughts based on the results shown below:
NOTE: they may NOT necessarily be the common case.

- The emoji is the major source of UNK token
  + is it relevant to our purpose?
  + incorrect spacing also leads to UNK, e.g. 'health🙏🏽' -> [UNK]
- A few typos noticed:
  + double symbol such as don''t
  + typo like will -> wil : ['w', '##il']
- Non-english character
- In BERT's tokenizer, a hashtag is divided into two parts: # + <content>. In this case, the content part may be identified as UNK. An example is #covid19 -> ['#', '[UNK]']
  + high-frequency hashtag: as special token
  + low-frequency hashtag: remove
- Special character noticed:
  + ⁉️⁉️⁉️
  + (note the suffix: what is that? Try using the keyboard to navigate around it...) Home۔
  + 報道 (more than once)
  + \xad (soft hyphen), which occurred in the gold chunk: '\xadIndonesia’s'; see [this](https://stackoverflow.com/questions/51976328/best-way-to-remove-xad-in-python)
- Can we use the continuation token ##<content> to help us detect abnormal words in the **preprocessing** stage?
  + Two examples:
    - positive/Positive/POSITIVE: the whole word "positive" is in the vocab, and therefore converting it to lower case could give a more accurate embedding? (The embedding of positive should be better than the dynamic embedding of Po and ##sitive.)
    - transitive/Transitive/TRANSITIVE: the whole word "transitive" is NOT in the vocab and therefore keeping it in the original form may be better (case should cont)
  + Based on the above argument, we propose to convert the case of words that are NOT in the vocab and leave the others untouched.
  + Moreover, can we implement a two-pass-like process: use BERT's pretrained tokenizer to help us detect abnormal words/tokens in the preprocessing stage.
  + Lastly, this idea sounds like a layman's covid19-BERT, as we are tuning the vocabulary by hand. (Correct me if I am wrong.)

In [9]:
# Inspect the UNK instances of a single event; later cells reuse
# `unk_instance_list` as selected here.
event = 'positive'
unk_instance_list = unk_instance_list_for_event[event]

# Show only the first instance: token list, decoded text, original input text.
for bert_token_list, bert_decoded_text, input_text in unk_instance_list[:1]:
    print(bert_token_list)
    print(bert_decoded_text)
    print(input_text)
    # Visual separator between instances.
    print('\n', '-'*20, '\n')

['[CLS]', '#', 'co', '##ron', '##virus', '##ital', '##ia', 'I', 'have', 'Some', 'Po', '##sitive', 'news', 'for', 'Italian', 'citizens', ':', '*', '*', '13', 'out', 'of', '15', 'Italian', 'tourist', 'who', 'have', 'Co', '##vid', '##19', 'positive', 'in', 'India', 'are', 'tested', '<E>', 'negative', '</E>', 'after', '16', 'days', 'Medical', 'care', '!', '!', 'So', 'Be', 'positive', ',', 'N', '##d', 'Main', '##tain', 'distance', 'with', 'h', '##ygiene', '!', '!', 'ST', '##A', '##Y', 'ST', '##RO', '##NG', '[UNK]', '<URL>', '[SEP]', '[PAD]']
[CLS] # coronvirusitalia I have Some Positive news for Italian citizens : * * 13 out of 15 Italian tourist who have Covid19 positive in India are tested <E> negative </E> after 16 days Medical care!! So Be positive, Nd Maintain distance with hygiene!! STAY STRONG [UNK] <URL> [SEP] [PAD]
#coronvirusitalia I have Some Positive news for Italian citizens : **13 out of 15 Italian tourist who have Covid19 positive in India are tested <E> negative </E> after 1

In [10]:
# Display the tokenizer's special tokens, including the additional ones
# registered right after the tokenizer was loaded.
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]',
 'additional_special_tokens': "['<E>', '</E>', '<URL>', '@USER']"}

In [11]:
import difflib
import re

In [19]:
# Matches a single punctuation character together with any whitespace on
# either side of it; substituting r'\1' therefore removes the spacing around
# punctuation so that decoded text and input text can be compared word by word.
# A raw string is used because `\s`, `\(` and `\)` are not valid escapes in an
# ordinary string literal and trigger invalid-escape warnings; parentheses need
# no escaping inside a character class.
matcher = re.compile(r'\s*([-:!?,._\'’%+$*#/“”"…@()・—‘|])\s*')

In [22]:
# Count (and print) the instances whose decoded text and original input text
# split into a different number of whitespace-separated words after
# punctuation spacing has been normalised with `matcher`.
incorrect_length_count = 0
for _, bert_decoded_text, input_text in unk_instance_list:
    # Drop the BERT framing tokens from the decoded text.
    decoded_text = bert_decoded_text.replace('[CLS]', '').replace('[PAD]', '').replace('[SEP]', '').strip()

    # Remove whitespace around punctuation on both sides so spacing
    # differences introduced by decoding do not affect the word split.
    decoded_text = matcher.sub(r'\1', decoded_text)
    input_text = matcher.sub(r'\1', input_text)

    decoded_text_list = decoded_text.split()
    input_text_list = input_text.split()

    # A length mismatch means the two texts cannot be aligned position by
    # position; print both forms for manual inspection.
    if len(decoded_text_list) != len(input_text_list):
        print(decoded_text_list)
        print(input_text_list)
        print(decoded_text)
        print(input_text)
        incorrect_length_count += 1

['<E>', '</E>', 'The', 'Australian', '[UNK]', '道"An', '11-year-old', 'girl', 'has', 'become', '[UNK]’s', 'youngest', 'COVID-19', 'victim', 'after', 'a', 'posthumous', 'test', 'for', 'the', 'virus', 'returned', 'a', 'positive', '10', 'days', 'after', 'she', 'died', 'on', 'East', 'Java’s', 'Madura', 'Island', 'last', 'month."<URL>']
['<E>', '</E>', 'The', 'Australian報道"An', '11-year-old', 'girl', 'has', 'become', '\xadIndonesia’s', 'youngest', 'COVID-19', 'victim', 'after', 'a', 'posthumous', 'test', 'for', 'the', 'virus', 'returned', 'a', 'positive', '10', 'days', 'after', 'she', 'died', 'on', 'East', 'Java’s', 'Madura', 'Island', 'last', 'month."<URL>']
<E> </E> The Australian [UNK] 道"An 11-year-old girl has become [UNK]’s youngest COVID-19 victim after a posthumous test for the virus returned a positive 10 days after she died on East Java’s Madura Island last month."<URL>
<E> </E> The Australian報道"An 11-year-old girl has become ­Indonesia’s youngest COVID-19 victim after a posthumous 

In [24]:

# For every UNK instance whose decoded text aligns word-for-word with the
# original input, print the words that differ: decoded side -> input side.
for _, bert_decoded_text, input_text in unk_instance_list:
    # Strip the BERT framing tokens, then normalise spacing around punctuation.
    decoded_text = bert_decoded_text
    for special_token in ('[CLS]', '[PAD]', '[SEP]'):
        decoded_text = decoded_text.replace(special_token, '')
    decoded_text = matcher.sub(r'\1', decoded_text.strip())
    input_text = matcher.sub(r'\1', input_text)

    decoded_text_list = decoded_text.split()
    input_text_list = input_text.split()

    if len(decoded_text_list) == len(input_text_list):
        # Keep only the positions where the two sides disagree.
        mismatched_pairs = [
            (decoded_token, input_token)
            for decoded_token, input_token in zip(decoded_text_list, input_text_list)
            if decoded_token != input_token
        ]
        diff_list = [
            [decoded_token for decoded_token, _ in mismatched_pairs],
            [input_token for _, input_token in mismatched_pairs],
        ]
        print(diff_list[0], '->', diff_list[1])
    # An empty line is printed for every instance, aligned or not.
    print('')

['[UNK]'] -> ['ITALY🤗']

['[UNK]'] -> ['🦠']

['[UNK]'] -> ['🙄🤦🏽\u200d♂️']

['[UNK]'] -> ['🙄']

['[UNK]', '[UNK]'] -> ['😭', '💔💔']

['[UNK]', '[UNK]'] -> ['🚨Breaking', 'News🚨']

['[UNK]'] -> ['😂']

['[UNK]', '[UNK]'] -> ['🚨Breaking', 'News🚨']

['[UNK]'] -> ['of🐭']

['[UNK]'] -> ['❤️']

['alone”[UNK]'] -> ['alone”😢']

['blog:[UNK]'] -> ['blog:👉']

['[UNK]'] -> ['😢']

['[UNK]'] -> ['😂😂😂']

['[UNK]'] -> ['🤞🏻']

['[UNK]'] -> ['😭😭😭😭']

['</E>:[UNK]', '[UNK]', '[UNK]'] -> ['</E>:⚠️', '⚠️', '⚠️']

['[UNK]!#CFC'] -> ['😯!#CFC']

['[UNK]'] -> ['🙏']

['[UNK],or'] -> ['🦠,or']

['[UNK]@[UNK]', '[UNK]@[UNK]'] -> ['\u2066@CapitalHilton\u2069', '\u2066@fox5dc\u2069']

['[UNK].<E>', '[UNK]'] -> ['🥺.<E>', 'roommates😅❤️']

['latest:[UNK]', '[UNK]', '[UNK]'] -> ['latest:⚠️', '⚠️', '⚠️']

['[UNK])and'] -> ['😔)and']

['virus”[UNK]'] -> ['virus”😂']

['pet???[UNK].<URL>'] -> ['pet???😒.<URL>']


['wow.[UNK]'] -> ['wow.🙄']

['[UNK]'] -> ['buddy😃😃']

['COVID-19![UNK]#Kano#Lagos#CoronaVirus'] -> ['COVID-19!🤦\u200d♂

['too![UNK]'] -> ['too!❤️']

['[UNK]'] -> ['🥴']

['negative.[UNK]'] -> ['negative.۔']

['[UNK]'] -> ['🙏🏽']

['[UNK]"IBV'] -> ['👉"IBV']

['[UNK]', '[UNK]'] -> ['update➡️', '➡️']

['[UNK]'] -> ['😞']

['[UNK]'] -> ['😌']

['though.[UNK]'] -> ['though.😩']

['prayers.[UNK]'] -> ['prayers.😭😭']

['said.[UNK]'] -> ['said.😞21']

['[UNK]'] -> ['infection🤔or']

['[UNK]'] -> ['👀']

['blog:[UNK]'] -> ['blog:👉']

['[UNK](got', '[UNK]'] -> ['😷(got', '💗']

['[UNK](got', '[UNK]'] -> ['😷(got', '💗']

['[UNK]', 'Beach.[UNK]...figuratively', 'speaking...[UNK]'] -> ['🌊', 'Beach.🏖️...figuratively', 'speaking...😡']

['[UNK].I', '[UNK]'] -> ['🥺.I', 'roommates😅❤️']

['[UNK].<URL>'] -> ['😒.<URL>']

['COVID-19.[UNK]'] -> ['COVID-19.\u200b']

['wow.[UNK]'] -> ['wow.🙄']

['self-[UNK]'] -> ['self-isolate😳']

['[UNK]'] -> ['😭']

['COVID-19.[UNK]', '[UNK]', 'disease.[UNK]', '[UNK]'] -> ['COVID-19.\u2060', '\u2060', 'disease.\u2060', '\u2060']

['[UNK]'] -> ['😭']

['virus”[UNK]'] -> ['virus”😂']

['[UNK]'] -> ['😐']

['[U

['COVID-19.[UNK]', '[UNK]', '</E>.[UNK]', '[UNK]'] -> ['COVID-19.\u2060', '\u2060', '</E>.\u2060', '\u2060']

['coronavirus.[UNK]'] -> ['coronavirus.😂😂😭😭💔']

['[UNK]'] -> ['😎😎']

['poora..[UNK]#lockdown'] -> ['poora..😡#lockdown']

['poora..[UNK]#lockdown'] -> ['poora..😡#lockdown']

['labs.[UNK]'] -> ['labs.😬']

['[UNK]'] -> ['😎😎']

['[UNK]', '[UNK]'] -> ['🚨Breaking', 'News🚨']

['[UNK]'] -> ['😓']

['[UNK]'] -> ['🙏🏽']

['[UNK]'] -> ['here👇']

['[UNK]', '[UNK]#Corona19#pregnancy#cravings'] -> ['😩', '🤦🏻\u200d♀️🤰🏻#Corona19#pregnancy#cravings']


['for#COVID19![UNK]', 'healthy![UNK]'] -> ['for#COVID19!🙌', 'healthy!❤️']

['lie.[UNK]', 'Russia.[UNK]', 'Crimea.[UNK]'] -> ['lie.👍', 'Russia.😹😹😹', 'Crimea.✌']

['[UNK]'] -> ['😓']

['[UNK]'] -> ['⤵️']

['[UNK]'] -> ['⬇️']

['County.N:[UNK]'] -> ['County.N:🙄']

['[UNK]'] -> ['🥴']

['[UNK]:<E>'] -> ['📥Inbox:<E>']

['[UNK]'] -> ['😢']

['Covid-19.[UNK]', '[UNK]'] -> ['Covid-19.🙏🏽', '🧡🧡🧡']

['[UNK]#coronavirusuk'] -> ['🥺#coronavirusuk']

['[UNK]'] -> ['🦠

['visit.[UNK]'] -> ['visit.🤙🏽']

['[UNK]'] -> ['😂']

['[UNK]'] -> ['🥴']

['[UNK]'] -> ['🦠']

['tough![UNK]'] -> ['tough!😢😘Thank']

['that...[UNK]'] -> ['that...😮']

['[UNK]#COVID19#TheLockdown#Covid_19#StayAtHome'] -> ['pass🙏#COVID19#TheLockdown#Covid_19#StayAtHome']

['[UNK]', '[UNK]!Prince'] -> ['👑', '🦠!Prince']

['[UNK]'] -> ['🇮🇳']

['[UNK]'] -> ['ever❤️']

['[UNK]', '[UNK]'] -> ['🏸', '📰']

['[UNK]'] -> ['🤬🤬😡']

['</E>.[UNK]'] -> ['</E>.😱']

['[UNK]'] -> ['corona😭😭']

['[UNK]'] -> ['😫']

['[UNK]'] -> ['🤔']

['</E>:How?!Us:Really?[UNK]'] -> ['</E>:How?!Us:Really?🤨']

['[UNK]#coronavirus#NYPD#FirstResponder'] -> ['❤️#coronavirus#NYPD#FirstResponder']

['lives![UNK]'] -> ['lives!❤️❤️🌑🌎Idris']

['[UNK]'] -> ['here👇']

['lie.[UNK]', 'Russia.[UNK]', 'Crimea.[UNK]'] -> ['lie.👍', 'Russia.😹😹😹', 'Crimea.✌']

['[UNK]'] -> ['❤️']

['said.[UNK]'] -> ['said.😞21']

['[UNK]'] -> ['😡💵']

['said.[UNK]'] -> ['said.😞21']

['[UNK]'] -> ['🦠']

['[UNK]'] -> ['😭😭😭😭😭😭😭😭😭😭']

['carrier.[UNK]'] -> ['carrier.🤦

In [14]:
# Number of instances that could not be aligned word-for-word versus the total
# number of UNK instances. `incorrect_length_count` comes from the
# length-mismatch loop earlier in the notebook, so that cell must run first.
print(incorrect_length_count, len(unk_instance_list))

36 2576


## Check tokenizer's encoding

Below I check when the tokenizer segments a word into subwords. For example, the single word POSITIVE is divided into four pieces:

> 'P', '##OS', '##IT', '##IVE'

Check [Exploring BERT's Vocabulary](http://juditacs.github.io/2019/02/19/bert-tokenization-stats.html)


In [5]:
def wordTokenizeEncodCheck(word, tokenizer):
    """Print how ``tokenizer`` encodes and decodes ``word``.

    The report consists of a separator line, the input word, the list of
    tokens for the ids produced by ``tokenizer.encode_plus`` (with the number
    of ids in parentheses) and the text obtained by decoding those ids.

    Args:
        word: The text to encode.
        tokenizer: An object providing ``encode_plus``,
            ``convert_ids_to_tokens`` and ``decode``.

    Returns:
        None; the report is written to standard output.
    """
    token_ids = tokenizer.encode_plus(word)['input_ids']
    token_strings = tokenizer.convert_ids_to_tokens(token_ids)
    round_trip_text = tokenizer.decode(token_ids)

    # One print call with a newline separator emits the same four lines.
    print(
        '--------------------',
        'Input Word: ' + str(word),
        f'BERT Token List ({len(token_ids)}): ' + str(token_strings),
        'BERT Decode: ' + str(round_trip_text),
        sep='\n',
    )
    

If the whole word is in the vocabulary

In [6]:
# Same word in three casings; the lower-case form is a whole-word vocab entry.
for word in ('positive', 'POSITIVE', 'Positive'):
    wordTokenizeEncodCheck(word, tokenizer)


--------------------
Input Word: positive
BERT Token List (3): ['[CLS]', 'positive', '[SEP]']
BERT Decode: [CLS] positive [SEP]
--------------------
Input Word: POSITIVE
BERT Token List (6): ['[CLS]', 'P', '##OS', '##IT', '##IVE', '[SEP]']
BERT Decode: [CLS] POSITIVE [SEP]
--------------------
Input Word: Positive
BERT Token List (4): ['[CLS]', 'Po', '##sitive', '[SEP]']
BERT Decode: [CLS] Positive [SEP]


If the whole word is **NOT** in the vocabulary

In [7]:
# Same word in three casings; none of them is a whole-word vocab entry.
for word in ('transitive', 'Transitive', 'TRANSITIVE'):
    wordTokenizeEncodCheck(word, tokenizer)


--------------------
Input Word: transitive
BERT Token List (4): ['[CLS]', 'transit', '##ive', '[SEP]']
BERT Decode: [CLS] transitive [SEP]
--------------------
Input Word: Transitive
BERT Token List (4): ['[CLS]', 'Transit', '##ive', '[SEP]']
BERT Decode: [CLS] Transitive [SEP]
--------------------
Input Word: TRANSITIVE
BERT Token List (7): ['[CLS]', 'T', '##RA', '##NS', '##IT', '##IVE', '[SEP]']
BERT Decode: [CLS] TRANSITIVE [SEP]


In [5]:
# Check whether "trans" / "Trans" exist as whole-line entries in vocab.txt.
# `grep -E` replaces the obsolescent `egrep`, and the patterns are quoted so
# the shell passes them through unchanged.
!grep -E '^trans$' vocab.txt
!grep -E '^Trans$' vocab.txt

trans
Trans


In [13]:
# List the vocab entries that are neither continuation pieces (lines starting
# with "##") nor bracketed entries (lines starting with "[").
# `grep -Ev` replaces the obsolescent `egrep ... -v`.
!grep -Ev '^##' vocab.txt | grep -Ev '^\['


!
"
#
$
%
&
'
(
)
*
+
,
-
.
/
0
1
2
3
4
5
6
7
8
9
:
;
<
=
>
?
@
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
\
]
^
_
`
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
{
|
}
~
¡
¢
£
¥
§
¨
©
ª
«
¬
®
°
±
²
³
´
µ
¶
·
¹
º
»
¼
½
¾
¿
À
Á
Â
Ä
Å
Æ
Ç
È
É
Í
Î
Ñ
Ó
Ö
×
Ø
Ú
Ü
Þ
ß
à
á
â
ã
ä
å
æ
ç
è
é
ê
ë
ì
í
î
ï
ð
ñ
ò
ó
ô
õ
ö
÷
ø
ù
ú
û
ü
ý
þ
ÿ
Ā
ā
ă
ą
Ć
ć
Č
č
ď
Đ
đ
ē
ė
ę
ě
ğ
ġ
Ħ
ħ
ĩ
Ī
ī
İ
ı
ļ
Ľ
ľ
Ł
ł
ń
ņ
ň
ŋ
Ō
ō
ŏ
ő
Œ
œ
ř
Ś
ś
Ş
ş
Š
š
Ţ
ţ
ť
ũ
ū
ŭ
ů
ű
ų
ŵ
ŷ
ź
Ż
ż
Ž
ž
Ə
ƒ
ơ
ư
ǎ
ǐ
ǒ
ǔ
ǫ
Ș
ș
Ț
ț
ɐ
ɑ
ɔ
ɕ
ə
ɛ
ɡ
ɣ
ɨ
ɪ
ɲ
ɾ
ʀ
ʁ
ʂ
ʃ
ʊ
ʋ
ʌ
ʐ
ʑ
ʒ
ʔ
ʰ
ʲ
ʳ
ʷ
ʻ
ʼ
ʾ
ʿ
ˈ
ː
ˡ
ˢ
ˣ
́
̃
̍
̯
͡
Α
Β
Γ
Δ
Ε
Η
Θ
Ι
Κ
Λ
Μ
Ν
Ο
Π
Σ
Τ
Φ
Χ
Ψ
Ω
ά
έ
ή
ί
α
β
γ
δ
ε
ζ
η
θ
ι
κ
λ
μ
ν
ξ
ο
π
ρ
ς
σ
τ
υ
φ
χ