In [11]:
import matplotlib.pyplot as plt
%matplotlib inline
import os
import json
import pickle
import torch
import numpy as np
import re
from tqdm.notebook import tqdm
from sklearn.utils import shuffle
from transformers import AutoTokenizer
import warnings
warnings.filterwarnings("ignore")

In [12]:
model_type = 'bert-base-uncased' #albert-base-v1, bert-base-cased, bert-base-uncased
data_path = "../dataset/en-fr/"

with open(data_path + 'train_texts.txt', 'r', encoding='utf-8') as f:
    train_text = f.readlines()
with open(data_path + 'dev_texts.txt', 'r', encoding='utf-8') as f:
    valid_text = f.readlines()
with open(data_path + 'test_texts_2012.txt', 'r', encoding='utf-8') as f:
    test_text = f.readlines()

In [13]:
datasets = train_text, valid_text, test_text

In [14]:
[len(ds) for ds in datasets]

[1029, 8, 11]

In [15]:
def clean_text(text):
    text = text.replace('!', '.')
    text = text.replace(':', ',')
    text = text.replace('--', ',')
    
    reg = "(?<=[a-zA-Z])-(?=[a-zA-Z]{2,})"
    r = re.compile(reg, re.DOTALL)
    text = r.sub(' ', text)
    
    text = re.sub(r'\s-\s', ' , ', text)
    
#     text = text.replace('-', ',')
    text = text.replace(';', '.')    # replace symbols with the most relevant counterparts
    text = text.replace(' ,', ',')
    text = text.replace('♫', '')
    text = text.replace('...', '')
    text = text.replace('.\"', ',')
    text = text.replace('"', ',')

    text = re.sub(r'--\s?--', '', text) # replace --   -- to ''
    text = re.sub(r'\s+', ' ', text)    # strip all whitespaces
    
    text = re.sub(r',\s?,', ',', text)  # merge commas separating only whitespace
    text = re.sub(r',\s?\.', '.', text) # , . -> ,
    text = re.sub(r'\?\s?\.', '?', text)# ? . -> ?
    text = re.sub(r'\s+', ' ', text)    # strip all redundant whitespace that could have been caused by preprocessing
    
    text = re.sub(r'\s+\?', '?', text)
    text = re.sub(r'\s+,', ',', text)
    text = re.sub(r'\.[\s+\.]+', '. ', text)
    text = re.sub(r'\s+\.', '.', text)
    
    return text.strip().lower()

In [16]:
datasets = [[clean_text(text) for text in ds] for ds in datasets]

In [17]:
[len([t for t in ds if len(t)>0]) for ds in datasets] # remove all 0 word datasets

[1029, 8, 11]

In [18]:
[len(' '.join(ds).split(' ')) for ds in datasets] # make them sentences separated by a space for tokenizing

[2339461, 17346, 18474]

In [19]:
tokenizer = AutoTokenizer.from_pretrained(model_type)

In [20]:
target_ids = tokenizer.encode(".?,")[1:-1]
target_ids

[1012, 1029, 1010]

In [21]:
target_token2id = {t: tokenizer.encode(t)[-2] for t in ".?,"}
target_token2id

{'.': 1012, '?': 1029, ',': 1010}

In [22]:
target_ids = list(target_token2id.values())
target_ids

[1012, 1029, 1010]

In [23]:
id2target = {
    0: 0,
    -1: -1,
}
for i, ti in enumerate(target_ids):
    id2target[ti] = i+1
target2id = {value: key for key, value in id2target.items()}

def create_target(text):
    encoded_words, targets = [], []
    
    words = text.split(' ')

    for word in words:
        target = 0
        for target_token, target_id in target_token2id.items():
            if word.endswith(target_token):
                word = word.rstrip(target_token)
                target = id2target[target_id]

        encoded_word = tokenizer.encode(word, add_special_tokens=False)
        
        for w in encoded_word:
            encoded_words.append(w)
        for _ in range(len(encoded_word)-1):
            targets.append(-1)
        targets.append(target)
        
#         print([tokenizer._convert_id_to_token(ew) for ew in encoded_word], target)
        assert(len(encoded_word)>0)

    encoded_words = [tokenizer.cls_token_id or tokenizer.bos_token_id] +\
                    encoded_words +\
                    [tokenizer.sep_token_id or tokenizer.eos_token_id]
    targets = [-1] + targets + [-1]
    
    return encoded_words, targets

In [31]:
print(id2target)
s = "Tyranosaurus: kill me? Not enough, rumplestilskin -- said the co-pilot -- ..."
print(s)
s = clean_text(s)
print(s)
data, targets = create_target(s)
print(targets)
[(tokenizer._convert_id_to_token(d), ta) for d,ta in zip(data[1:-1], targets[1:-1])]

{0: 0, -1: -1, 1012: 1, 1029: 2, 1010: 3}
Tyranosaurus: kill me? Not enough, rumplestilskin -- said the co-pilot -- ...
tyranosaurus, kill me? not enough, rumplestilskin, said the co pilot,
[-1, -1, -1, 3, 0, 2, 0, 3, -1, -1, -1, -1, 3, 0, 0, 0, 3, -1]


[('ty', -1),
 ('##rano', -1),
 ('##saurus', 3),
 ('kill', 0),
 ('me', 2),
 ('not', 0),
 ('enough', 3),
 ('rum', -1),
 ('##ples', -1),
 ('##ti', -1),
 ('##ls', -1),
 ('##kin', 3),
 ('said', 0),
 ('the', 0),
 ('co', 0),
 ('pilot', 3)]

In [14]:
# encoded_texts, targets = create_target(transcripts[164])

In [15]:
# print(datasets[0][0])

In [33]:
encoded_texts, targets = [], []

for ds in datasets:
    x = list(zip(*(create_target(ts) for ts in tqdm(ds))))
    encoded_texts.append(x[0])
    targets.append(x[1])

  0%|          | 0/1029 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

In [37]:
# encoded_words, targets
comma_count = 0
word_count = 0
q_count = 0
p_count = 0

for tar in targets[0]:
    for ta in tar:
        comma_count += 1 if (ta == 3) else 0
        word_count += 1 if (ta != -1) else 0
        q_count += 1 if (ta == 2) else 0
        p_count += 1 if (ta == 1) else 0
        
print(comma_count, word_count, q_count, p_count)
            

'''
for te, ta in zip(encoded_texts[0][0], targets[0][0]):
    print(f"{tokenizer._convert_id_to_token(te):15}\t{ta}")
'''

188165 2339461 10215 139619


'\nfor te, ta in zip(encoded_texts[0][0], targets[0][0]):\n    print(f"{tokenizer._convert_id_to_token(te):15}\t{ta}")\n'

In [22]:
os.makedirs(data_path + model_type, exist_ok=True)

for i, name in enumerate(('train', 'valid', 'test')):
    with open(data_path + f'{model_type}/{name}_data.pkl', 'wb') as f:
        pickle.dump((encoded_texts[i], targets[i]), f)

In [23]:
from collections import Counter

for ds_targets in targets:
    c = Counter((target for t in ds_targets for target in t))
    print('\t'.join([str(c[i]) for i in (1,2,3,0,-1)]))

139619	10215	188165	2001462	267423
909	71	1225	15141	1899
1100	46	1120	16208	2072


In [24]:
e = []
i = 0

raw_words = datasets[1][2].split(' ')

for te, ta in zip(encoded_texts[1][2], targets[1][2]):
    if ta == -1:
        e.append(te)
    else:
        e.append(te)
        print(f"{tokenizer.decode(e):15}\t{tokenizer.decode(target2id[ta]):10}\t{raw_words[i]}")
        e = []
        i += 1
print(f"{tokenizer.decode(e):15}\t{tokenizer.decode(target2id[ta]):10}\t")

[CLS] you      	[PAD]     	you
know           	,         	know,
i've           	[PAD]     	i've
talked         	[PAD]     	talked
about          	[PAD]     	about
some           	[PAD]     	some
of             	[PAD]     	of
these          	[PAD]     	these
projects       	[PAD]     	projects
before         	,         	before,
about          	[PAD]     	about
the            	[PAD]     	the
human          	[PAD]     	human
genome         	[PAD]     	genome
and            	[PAD]     	and
what           	[PAD]     	what
that           	[PAD]     	that
might          	[PAD]     	might
mean           	,         	mean,
and            	[PAD]     	and
discovering    	[PAD]     	discovering
new            	[PAD]     	new
sets           	[PAD]     	sets
of             	[PAD]     	of
genes          	.         	genes.
we're          	[PAD]     	we're
actually       	[PAD]     	actually
starting       	[PAD]     	starting
at             	[PAD]     	at
a              	[PAD]     	a
new            	[P

OverflowError: out of range integral type conversion attempted

In [25]:
print(tokenizer.decode(encoded_texts[1][2]))

[CLS] you know i've talked about some of these projects before about the human genome and what that might mean and discovering new sets of genes we're actually starting at a new point we've been digitizing biology and now we're trying to go from that digital code into a new phase of biology with designing and synthesizing life so we've always been trying to ask big questions., what is life? is something that i think many biologists have been trying to understand at various levels we've tried various approaches paring it down to minimal components we've been digitizing it now for almost 20 years when we sequenced the human genome it was going from the analog world of biology into the digital world of the computer now we're trying to ask can we regenerate life or can we create new life out of this digital universe this is the map of a small organism mycoplasma genitalium that has the smallest genome for a species that can self replicate in the laboratory and we've been trying to just see