In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import os
import json
import pickle
import torch
import numpy as np
import re
from tqdm.notebook import tqdm
from sklearn.utils import shuffle
from transformers import AutoTokenizer
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Pretrained checkpoint whose tokenizer is used below.
# Other options noted by the author: albert-base-v1, bert-base-cased, bert-base-uncased
model_type = 'xlm-roberta-base'
data_path = "../dataset/en-fr/"


def _read_lines(file_name):
    """Return every line of `data_path + file_name` (UTF-8), line endings kept."""
    with open(data_path + file_name, 'r', encoding='utf-8') as handle:
        return handle.readlines()


train_text = _read_lines('train_texts.txt')
valid_text = _read_lines('dev_texts.txt')
test_text = _read_lines('test_texts_2012.txt')

In [3]:
# Fixed split order used by every later cell: index 0 = train, 1 = valid, 2 = test.
datasets = (train_text, valid_text, test_text)

In [4]:
list(map(len, datasets))  # number of texts in each split

[1029, 8, 11]

In [5]:
def clean_text(text):
    """Normalise punctuation and whitespace so only '.', '?' and ',' mark boundaries.

    Steps run in a fixed order because later rules rely on the output of
    earlier ones (e.g. spaces are collapsed before comma/period merging).

    Args:
        text: raw text (str).

    Returns:
        The cleaned text, stripped of leading/trailing whitespace and lower-cased.
    """
    # Map other punctuation onto the closest of the three kept marks.
    text = text.replace('!', '.')
    text = text.replace(':', ',')
    text = text.replace('--', ',')

    # Hyphens inside words are deliberately left alone (needed for Malay);
    # only a free-standing " - " is treated as a pause.
    text = re.sub(r'\s-\s', ' , ', text)

    text = text.replace(';', '.')
    text = text.replace(' ,', ',')
    text = text.replace('♫', '')
    text = text.replace('...', '')
    text = text.replace('."', ',')
    text = text.replace('"', ',')

    # Removing '...' or '♫' can leave dashes adjacent again: "-- --" -> ''
    text = re.sub(r'--\s?--', '', text)
    text = re.sub(r'\s+', ' ', text)     # collapse every whitespace run to one space

    text = re.sub(r',\s?,', ',', text)   # ", ," -> ","
    text = re.sub(r',\s?\.', '.', text)  # ", ." -> "."
    text = re.sub(r'(?<=[a-zA-Z0-9]),(?=[a-zA-Z0-9])', ', ', text)  # "say,you" -> "say, you"
    text = re.sub(r'\?\s?\.', '?', text)  # "? ." -> "?"
    text = re.sub(r'\s+', ' ', text)     # collapse whitespace introduced by the rules above

    # No space before a mark; runs of periods/whitespace become ". ".
    text = re.sub(r'\s+\?', '?', text)
    text = re.sub(r'\s+,', ',', text)
    # The class is [\s.]: the previous [\s+\.] also matched a literal '+',
    # which silently deleted plus signs that followed a period.
    text = re.sub(r'\.[\s.]+', '. ', text)
    text = re.sub(r'\s+\.', '.', text)

    return text.strip().lower()

In [6]:
# Clean every text of every split; the split order is unchanged.
datasets = [list(map(clean_text, ds)) for ds in datasets]

In [7]:
[sum(1 for t in ds if len(t) > 0) for ds in datasets]  # count the non-empty texts per split (nothing is removed here)

[1029, 8, 11]

In [8]:
# Space-separated word count per split: joining the texts with ' ' and splitting
# on ' ' yields one more piece than there are spaces in the joined string.
[' '.join(ds).count(' ') + 1 for ds in datasets]

[2347637, 17395, 18539]

In [9]:
# Load the tokenizer that matches the checkpoint named in `model_type`.
tokenizer = AutoTokenizer.from_pretrained(model_type)

In [10]:
# Exploratory probe: encode the three marks as ONE string and drop the first and
# last ids. The displayed result has four ids and differs from the per-mark ids
# computed in the next cell, so it is not a usable lookup; `target_ids` is
# rebuilt from `target_token2id` further down.
target_ids = tokenizer.encode(".?,")[1:-1]
target_ids

[6, 5, 32, 4]

In [11]:
# Encode each mark on its own and keep the second-to-last id of the result.
target_token2id = dict(
    (mark, tokenizer.encode(mark)[-2]) for mark in ".?,"
)
target_token2id

{'.': 5, '?': 705, ',': 4}

In [12]:
# Token ids of the marks, in the insertion order of target_token2id ('.', '?', ',').
target_ids = [*target_token2id.values()]
target_ids

[5, 705, 4]

In [13]:
# Token id -> label. 0 and -1 map to themselves; each punctuation token id gets
# the label 1, 2, 3, ... following its position in target_ids.
id2target = {0: 0, -1: -1}
id2target.update(
    (token_id, label) for label, token_id in enumerate(target_ids, start=1)
)
# Inverse lookup: label -> token id.
target2id = dict((label, token_id) for token_id, label in id2target.items())

def create_target(text, space_token_id=6):
    """Turn one cleaned text into aligned token ids and punctuation labels.

    The text is split on single spaces. For each word, a trailing mark found in
    ``target_token2id`` is stripped and remembered as the word's label (looked
    up through ``id2target``); the bare word is then encoded without special
    tokens. Every word contributes:

    * its sub-word ids, labelled ``-1`` except the last piece, labelled ``0``;
    * one extra slot right after the word holding either the mark's token id
      with the mark's label, or ``space_token_id`` with label ``0``.

    The sequence is finally wrapped in the tokenizer's start and end special
    ids, both labelled ``-1``.

    Args:
        text: cleaned text whose words are separated by single spaces.
        space_token_id: id placed in the slot after a word that carries no
            mark. Defaults to 6, which the demo output in this notebook shows
            decoding to '▁' for xlm-roberta-base.

    Returns:
        ``(encoded_words, targets)``: two lists of equal length.

    Raises:
        AssertionError: if a word encodes to no tokens once its trailing marks
            are stripped.
    """
    encoded_words, targets = [], []

    for word in text.split(' '):
        target = 0
        for target_token, target_id in target_token2id.items():
            if word.endswith(target_token):
                # rstrip removes every trailing repeat of the mark, not just one.
                word = word.rstrip(target_token)
                target = id2target[target_id]

        encoded_word = tokenizer.encode(word, add_special_tokens=False)
        # Fail before touching the output lists: an empty encoding would add a
        # label with no matching token and misalign the two sequences.
        if not encoded_word:
            raise AssertionError(f"word {word!r} produced no tokens")

        encoded_words.extend(encoded_word)
        targets.extend([-1] * (len(encoded_word) - 1))
        targets.append(0)

        # Slot after the word: the punctuation token itself, or the separator.
        encoded_words.append(target2id[target] if target != 0 else space_token_id)
        targets.append(target)

    # Explicit None checks: `cls_token_id or bos_token_id` would treat a valid
    # id of 0 as missing and silently fall through to the other attribute.
    start_id = tokenizer.cls_token_id
    if start_id is None:
        start_id = tokenizer.bos_token_id
    end_id = tokenizer.sep_token_id
    if end_id is None:
        end_id = tokenizer.eos_token_id

    encoded_words = [start_id] + encoded_words + [end_id]
    targets = [-1] + targets + [-1]

    return encoded_words, targets

In [14]:
# Sanity check: run one sample sentence through clean_text and create_target and
# show each token next to its label.
print(id2target)
# s = "Tyranosaurus: kill me? Not enough, rumplestilskin -- said the co-pilot -- ..."
s = "it  can  be  a  very  complicated  thing, the  ocean. and  it  can  be  a  very  complicated  thing, what  human  health  is."
print(s)
s = clean_text(s)
print(s)
data, tgts = create_target(s)
print(data)
print(tgts)
# Drop the first and last (special-token) positions before pairing tokens with labels.
# NOTE(review): _convert_id_to_token is a private tokenizer method.
[(tokenizer._convert_id_to_token(d), ta) for d,ta in zip(data[1:-1], tgts[1:-1])]

{0: 0, -1: -1, 5: 1, 705: 2, 4: 3}
it  can  be  a  very  complicated  thing, the  ocean. and  it  can  be  a  very  complicated  thing, what  human  health  is.
it can be a very complicated thing, the ocean. and it can be a very complicated thing, what human health is.
[0, 442, 6, 831, 6, 186, 6, 10, 6, 4552, 6, 96704, 297, 6, 13580, 4, 70, 6, 77904, 5, 136, 6, 442, 6, 831, 6, 186, 6, 10, 6, 4552, 6, 96704, 297, 6, 13580, 4, 2367, 6, 14135, 6, 16227, 6, 83, 5, 2]
[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 3, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, -1]


[('▁it', 0),
 ('▁', 0),
 ('▁can', 0),
 ('▁', 0),
 ('▁be', 0),
 ('▁', 0),
 ('▁a', 0),
 ('▁', 0),
 ('▁very', 0),
 ('▁', 0),
 ('▁complicat', -1),
 ('ed', 0),
 ('▁', 0),
 ('▁thing', 0),
 (',', 3),
 ('▁the', 0),
 ('▁', 0),
 ('▁ocean', 0),
 ('.', 1),
 ('▁and', 0),
 ('▁', 0),
 ('▁it', 0),
 ('▁', 0),
 ('▁can', 0),
 ('▁', 0),
 ('▁be', 0),
 ('▁', 0),
 ('▁a', 0),
 ('▁', 0),
 ('▁very', 0),
 ('▁', 0),
 ('▁complicat', -1),
 ('ed', 0),
 ('▁', 0),
 ('▁thing', 0),
 (',', 3),
 ('▁what', 0),
 ('▁', 0),
 ('▁human', 0),
 ('▁', 0),
 ('▁health', 0),
 ('▁', 0),
 ('▁is', 0),
 ('.', 1)]

In [14]:
# encoded_texts, targets = create_target(transcripts[164])

In [15]:
# print(datasets[0][0])

In [15]:
# One entry per split; each entry is a tuple with one list per text.
encoded_texts = []
targets = []

for split_texts in datasets:
    # create_target returns (ids, labels); transpose the per-text pairs into
    # an ids column and a labels column.
    columns = list(zip(*map(create_target, tqdm(split_texts))))
    encoded_texts.append(columns[0])
    targets.append(columns[1])

  0%|          | 0/1029 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

In [22]:
print(len(encoded_texts[1]))
# The splits hold different numbers of texts, so the nested list is ragged.
# np.array() on ragged input raises ValueError on NumPy >= 1.24, so the outer
# length is reported directly, as a 1-tuple to mirror the old `.shape` output.
print((len(encoded_texts),), len(targets))

4613
(3,) 3


In [16]:
# Corpus statistics over all splits together: label counts for the three marks,
# a word count, and the mean number of words per sentence.
# encoded_words, targets
comma_count = 0
word_count = 0
q_count = 0
p_count = 0

# Labels follow the id2target printout shown earlier: 1 = '.', 2 = '?', 3 = ','.
for target in targets:
    for tar in target:
        for ta in tar:
            comma_count += 1 if (ta == 3) else 0
            q_count += 1 if (ta == 2) else 0
            p_count += 1 if (ta == 1) else 0
   
sc = 0   # sentence-ending tokens ('.' or '?') seen so far
mwc = 0  # running mean of `en` over those sentence ends
for text,target in zip(encoded_texts, targets):
    for tex,tar  in zip(text,target):
        en = 0  # words since the last sentence end; reset at the start of every text
        for t,ta in zip(tex,tar):
            # Ids excluded from the word count, per outputs earlier in the notebook:
            # 6 = '▁' slot, 5 = '.', 705 = '?', 4 = ',', 0 and 2 = first/last special ids.
            # NOTE(review): the meaning of 1 and -1 in this list is not shown here — confirm.
            # `ta != -1` skips non-final sub-word pieces, so each word counts once.
            if t not in [6,5,0,-1,1,2,4,705] and ta != -1:
                word_count+=1
                en+=1
            elif t in [705, 5]:
                # Incremental mean: undo the previous division, add this
                # sentence's word count, divide by the new sentence count.
                mwc*=sc
                sc += 1
                mwc += en
                mwc /= sc
                en = 0
                
print(mwc)
     
print(comma_count, word_count, q_count, p_count)
            

# Disabled debugging snippet kept as a bare string literal; being the last
# expression of the cell, its repr is echoed as the cell output.
'''
for te, ta in zip(encoded_texts[0][0], targets[0][0]):
    print(f"{tokenizer._convert_id_to_token(te):15}\t{ta}")
'''

15.461595378723963
198800 2383282 10332 141628


'\nfor te, ta in zip(encoded_texts[0][0], targets[0][0]):\n    print(f"{tokenizer._convert_id_to_token(te):15}\t{ta}")\n'

In [17]:
def return_counts(encoded_texts, targets, space_token_id=6):
    """Count separator slots and punctuation labels over all splits.

    Args:
        encoded_texts: nested sequence ``[split][text][position]`` of token ids.
        targets: nested sequence of labels with the same layout.
        space_token_id: token id counted as a space slot (default 6).

    Returns:
        ``(space_count, p_count, q_count, comma_count)`` where

        * ``space_count`` is the number of positions whose token id equals
          ``space_token_id`` and whose label is not ``-1``;
        * ``p_count``, ``q_count`` and ``comma_count`` are the numbers of
          labels equal to 1, 2 and 3 respectively.
    """
    # Label totals come from `targets` alone, independent of `encoded_texts`.
    p_count = q_count = comma_count = 0
    for split_labels in targets:
        for labels in split_labels:
            for label in labels:
                if label == 1:
                    p_count += 1
                elif label == 2:
                    q_count += 1
                elif label == 3:
                    comma_count += 1

    # Space slots need token and label side by side; zip stops at the shorter
    # sequence on every nesting level.
    space_count = 0
    for split_ids, split_labels in zip(encoded_texts, targets):
        for ids, labels in zip(split_ids, split_labels):
            for token_id, label in zip(ids, labels):
                if token_id == space_token_id and label != -1:
                    space_count += 1

    return space_count, p_count, q_count, comma_count

In [18]:
# Totals are computed over all three splits at once.
counts = return_counts(encoded_texts, targets)
space_count, p_count, q_count, comma_count = counts

# Output folder sits next to the raw texts and is named after the checkpoint.
os.makedirs(f"{data_path}{model_type}", exist_ok=True)

In [19]:

# One pickle per split: (token ids, labels, space/period/question/comma counts).
split_names = ('train', 'valid', 'test')
for i in range(len(split_names)):
    name = split_names[i]
    with open(data_path + f'{model_type}/{name}_data.pkl', 'wb') as f:
        record = (encoded_texts[i], targets[i], space_count, p_count, q_count, comma_count)
        pickle.dump(record, f)

In [23]:
from collections import Counter

# Per split, print the label frequencies tab-separated in the order 1, 2, 3, 0, -1.
for ds_targets in targets:
    c = Counter()
    for labels in ds_targets:
        c.update(labels)
    print('\t'.join(str(c[label]) for label in (1, 2, 3, 0, -1)))

139619	10215	188165	2001462	267423
909	71	1225	15141	1899
1100	46	1120	16208	2072


In [None]:
# Debug view of one text (split index 1, text index 2): regroup token ids up to
# each labelled position and print the decoded group, the decoded label token
# and a raw word side by side.
e = []  # token ids collected since the last labelled position
i = 0   # index into raw_words, advanced once per labelled position

raw_words = datasets[1][2].split(' ')

for te, ta in zip(encoded_texts[1][2], targets[1][2]):
    if ta == -1:
        # Unlabelled position: keep collecting.
        e.append(te)
    else:
        e.append(te)
        # NOTE(review): create_target emits two labelled positions per word (the
        # word's last piece and the slot after it), so `i` looks like it advances
        # twice per raw word and may run past raw_words — confirm before relying on this.
        print(f"{tokenizer.decode(e):15}\t{tokenizer.decode(target2id[ta]):10}\t{raw_words[i]}")
        e = []
        i += 1
# Flush what is left after the loop; `ta` here is the label of the last position.
# NOTE(review): if that label is -1, target2id gives -1, which is presumably not
# a decodable token id — verify.
print(f"{tokenizer.decode(e):15}\t{tokenizer.decode(target2id[ta]):10}\t")

In [79]:
# Round-trip check on the first training text: decoded token ids first, then the
# cleaned source text they were built from.
first_train_ids = encoded_texts[0][0]
print(tokenizer.decode(first_train_ids))
print(datasets[0][0])

<s> it  can  be  a  very  complicated  thing, the  ocean. and  it  can  be  a  very  complicated  thing, what  human  health  is. and  bringing  those  two  together  might  seem  a  very  daunting  task, but  what  i'm  going  to  try  to  say  is  that  even  in  that  complexity, there's  some  simple  themes  that  i  think, if  we  understand, we  can  really  move  forward. and  those  simple  themes  aren't  really  themes  about  the  complex  science  of  what's  going  on, but  things  that  we  all  pretty  well  know. and  i'm  going  to  start  with  this  one, if  momma  ain't  happy, ain't  nobody  happy. we  know  that, right? we've  experienced  that. and  if  we  just  take  that  and  we  build  from  there, then  we  can  go  to  the  next  step, which  is  that  if  the  ocean  ain't  happy, ain't  nobody  happy. that's  the  theme  of  my  talk. and  we're  making  the  ocean  pretty  unhappy  in  a  lot  of  different  ways. this  is  a  shot  of  cannery  row  i