In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import os
import json
import pickle
import torch
import numpy as np
import re
from tqdm.notebook import tqdm
from sklearn.utils import shuffle
from transformers import AutoTokenizer
import warnings
warnings.filterwarnings("ignore")

In [2]:
model_type = 'xlm-roberta-base' #'bert-base-bahasa-uncased' #'bert-base-multilingual-uncased' #albert-base-v1, bert-base-cased, bert-base-uncased
data_path = "../dataset/malay-dataset/"

with open(data_path + 'train_malay.txt', 'r', encoding='utf-8') as f:
    train_text = f.readlines()
with open(data_path + 'valid_malay.txt', 'r', encoding='utf-8') as f:
    valid_text = f.readlines()
with open(data_path + 'test_malay.txt', 'r', encoding='utf-8') as f:
    test_text = f.readlines()

In [3]:
datasets = [train_text[0:len(train_text)//1], valid_text[0:len(valid_text)//1], test_text[0:len(test_text)//1]]

In [4]:
[len(ds) for ds in datasets]

[1, 1, 1]

In [4]:
def clean_text(text):
    text = text.replace('!', '.')
    text = text.replace(':', ',')
    text = text.replace('--', ',')
    
    #reg = "(?<=[a-zA-Z])-(?=[a-zA-Z]{2,})"  ## comment this out please! no replacing '-'s for malay
    #r = re.compile(reg, re.DOTALL)
    #text = r.sub(' ', text)
    
    text = re.sub(r'\s-\s', ' , ', text)
    text = re.sub(r'^[,.?]','',text) # remove all starting punctuations (they make zero sense)
#     text = text.replace('-', ',')
    text = text.replace(';', '.')    # replace symbols with the most relevant counterparts
    text = text.replace(' ,', ',')
    text = text.replace('♫', '')
    text = text.replace('...', '')
    text = text.replace('.\"', ',')
    text = text.replace('"', ',')

    text = re.sub(r'--\s?--', '', text) # replace --   -- to ''
    text = re.sub(r'\s+', ' ', text)    # strip all whitespaces
    
    text = re.sub(r',\s?,', ',', text)  # merge commas separating only whitespace
    text = re.sub(r',\s?\.', '.', text) # , . -> ,
    text = re.sub(r'\.\s?,', ',', text) # . , -> ,
    
    text = re.sub(r'(?<=[a-zA-Z0-9]),(?=[a-zA-Z0-9])',', ',text) # say,you -> say, you
    text = re.sub(r'\?\s?\.', '?', text)# ? . -> ?
    text = re.sub(r'…','.',text)
    text = re.sub(r'\,+',',',text)
    text = re.sub(r'\.+','.',text)
    text = re.sub(r'\?+','?',text)
    
    text = re.sub(r'\s+', ' ', text)    # strip all redundant whitespace that could have been caused by preprocessing
    
    text = re.sub(r'\s+\?', '?', text)
    text = re.sub(r'\s+,', ',', text)
    text = re.sub(r'\.[\s+\.]+', '. ', text)
    text = re.sub(r'\s+\.', '.', text)
    
    return text.strip().lower()

In [5]:
datasets = [[clean_text(text) for text in ds] for ds in datasets]

In [6]:
import re
from tqdm import tqdm
train = re.split(r'(\.|\?)',datasets[0][0]) #split with capture groups
print(len(datasets[0]))
sentences = [[] for i in range(len(datasets[0]))]
for j in tqdm(range(len(datasets[0]))):
    train = re.split(r'(\.|\?)',datasets[0][j])
    if train[0] == '.' or train[0] =='?':
        print("OHNO ",j)
        start = 1
    else:
        start = 0 
    for i in range(start,len(train),2):
        sentence = train[i]
        if i < len(train)-1:
            end = train[i+1]
        else: 
            end = '.'
        comma_count = sentence.count(',')
        if comma_count >=1:
            sentences[j].append(sentence+end)

sentences = [i for i in sentences if len(i) > 0]

for i in range(len(sentences)):
    sentences[i] = " ".join(sentences[i])

datasets[0] = sentences
## clean again
datasets = [[clean_text(text) for text in ds] for ds in datasets]



6798


100%|██████████| 6798/6798 [00:00<00:00, 29365.51it/s]


In [7]:
[len([t for t in ds if len(t)>0]) for ds in datasets] # remove all 0 word datasets

[6797, 850, 850]

In [8]:
[len(' '.join(ds).split(' ')) for ds in datasets] # make them sentences separated by a space for tokenizing

[2780143, 480281, 504158]

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_type)

In [10]:
target_ids = tokenizer.encode(".?,")[1:-1]
target_ids

[6, 5, 32, 4]

In [11]:
target_token2id = {t: tokenizer.encode(t)[-2] for t in ".?,"}
target_token2id

{'.': 5, '?': 705, ',': 4}

In [12]:
target_ids = list(target_token2id.values())
target_ids
tokenizer.decode([i for i in range(104,106)])

'dу'

In [13]:
id2target = {
    0: 0,
    -1: -1,
}
for i, ti in enumerate(target_ids):
    id2target[ti] = i+1
target2id = {value: key for key, value in id2target.items()}

def create_target(text):
    spaceint = 6
    encoded_words, targets = [], []
    
    words = text.split()

    i = 0
    for word in words:

        orig_word = word
        word = word.encode('ascii','ignore').decode().strip()
        
        if len(word) == 0 or word in ['\u200d','\ufeff']:
            i+=1
            continue
        target = 0
        for target_token, target_id in target_token2id.items():
            if word.endswith(target_token) and word != target_token:
                word = word.rstrip(target_token)
                target = id2target[target_id]
            elif word == target_token:
                target = id2target[target_id]

        encoded_word = tokenizer.encode(word, add_special_tokens=False)
        
        for w in encoded_word:
            encoded_words.append(w)
        for _ in range(len(encoded_word)-1):
            targets.append(-1)

        targets.append(0)
        
        if target != 0:
            encoded_words.append(target2id[target])
        else:
            encoded_words.append(spaceint)
        
        targets.append(target)
        
        
#         print([tokenizer._convert_id_to_token(ew) for ew in encoded_word], target)
        if len(encoded_word) == 0:
            print(f"Word:  {(i, words[i], orig_word)} word: {len(word)}, {encoded_word}")
        assert(len(encoded_word)>0)
        i+=1

    encoded_words = [tokenizer.cls_token_id or tokenizer.bos_token_id] +\
                    encoded_words +\
                    [tokenizer.sep_token_id or tokenizer.eos_token_id]
    targets = [-1] + targets + [-1]
    
    return encoded_words, targets

In [14]:
print(id2target)
# s = "Tyranosaurus: kill me? Not enough, rumplestilskin -- said the co-pilot -- ..."
#s = "it  can  be  a  very  complicated  thing , the  ocean . and  it  can  be  a  very  complicated  thing, what  human  health  is."
s = "aku cakap ok sahaja , time itu juga dia suruh start kerja . alhamdulillah akhirnya dapat juga kerja . "
print(s)
s = clean_text(s)
print(s)
data, tgts = create_target(s)
print(data)
print(tgts)
[(tokenizer._convert_id_to_token(d), ta) for d,ta in zip(data[1:-1], tgts[1:-1])]

{0: 0, -1: -1, 5: 1, 705: 2, 4: 3}
aku cakap ok sahaja , time itu juga dia suruh start kerja . alhamdulillah akhirnya dapat juga kerja . 
aku cakap ok sahaja, time itu juga dia suruh start kerja. alhamdulillah akhirnya dapat juga kerja.
[0, 2121, 6, 55081, 6, 3664, 6, 13990, 4, 1733, 6, 752, 6, 1220, 6, 879, 6, 168753, 6, 4034, 6, 9792, 5, 193087, 6, 25375, 6, 1802, 6, 1220, 6, 9792, 5, 2]
[-1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, -1]


[('▁aku', 0),
 ('▁', 0),
 ('▁cakap', 0),
 ('▁', 0),
 ('▁ok', 0),
 ('▁', 0),
 ('▁sahaja', 0),
 (',', 3),
 ('▁time', 0),
 ('▁', 0),
 ('▁itu', 0),
 ('▁', 0),
 ('▁juga', 0),
 ('▁', 0),
 ('▁dia', 0),
 ('▁', 0),
 ('▁suruh', 0),
 ('▁', 0),
 ('▁start', 0),
 ('▁', 0),
 ('▁kerja', 0),
 ('.', 1),
 ('▁alhamdulillah', 0),
 ('▁', 0),
 ('▁akhirnya', 0),
 ('▁', 0),
 ('▁dapat', 0),
 ('▁', 0),
 ('▁juga', 0),
 ('▁', 0),
 ('▁kerja', 0),
 ('.', 1)]

In [None]:
# encoded_texts, targets = create_target(transcripts[164])

In [None]:
# print(datasets[0][0])

In [15]:
encoded_texts, targets = [], []

for ds in datasets:
    x = list(zip(*(create_target(ts) for ts in tqdm(ds))))
    encoded_texts.append(x[0])
    targets.append(x[1])

100%|██████████| 6797/6797 [01:14<00:00, 91.80it/s] 
100%|██████████| 850/850 [00:12<00:00, 65.93it/s]
100%|██████████| 850/850 [00:13<00:00, 63.10it/s]


In [16]:
print(len(encoded_texts[1]))
print(np.array(encoded_texts).shape, len(targets))

850
(3,) 3


In [17]:
# encoded_words, targets
comma_count = 0
word_count = 0
q_count = 0
p_count = 0

for target in targets:
    for tar in target:
        for ta in tar:
            comma_count += 1 if (ta == 3) else 0
            q_count += 1 if (ta == 2) else 0
            p_count += 1 if (ta == 1) else 0
   
sc = 0
mwc = 0
for text,target in zip(encoded_texts, targets):
    for tex,tar  in zip(text,target):
        en = 0
        for t,ta in zip(tex,tar):
            if t not in [6,5,0,-1,1,2,4,705] and ta != -1:
                word_count+=1
                en+=1
            elif t in [705, 5]:
                mwc*=sc
                sc += 1
                mwc += en
                mwc /= sc
                en = 0
                
print(mwc)
     
print(comma_count, word_count, q_count, p_count)
            

'''
for te, ta in zip(encoded_texts[0][0], targets[0][0]):
    print(f"{tokenizer._convert_id_to_token(te):15}\t{ta}")
'''

21.318509574504603
265309 3764580 4235 172160


'\nfor te, ta in zip(encoded_texts[0][0], targets[0][0]):\n    print(f"{tokenizer._convert_id_to_token(te):15}\t{ta}")\n'

In [18]:
def return_counts(encoded_texts, targets):
    # encoded_words, targets
    comma_count = 0
    word_count = 0
    q_count = 0
    p_count = 0
    space_count = 0
    for target in targets:
        for tar in target:
            for ta in tar:
                comma_count += 1 if (ta == 3) else 0
                q_count += 1 if (ta == 2) else 0
                p_count += 1 if (ta == 1) else 0
    sc = 0
    mwc = 0
    for text,target in zip(encoded_texts, targets):
        for tex,tar  in zip(text,target):
            en = 0
            for t,ta in zip(tex,tar):
                if t not in [6,5,0,-1,1,2,4,705] and ta != -1:
                    word_count+=1
                    en+=1
                elif t == 6 and ta != -1: # space
                    space_count+=1
                elif t in [705, 5]:
                    mwc*=sc
                    sc += 1
                    mwc += en
                    mwc /= sc
                    en = 0
    print(mwc)
    return space_count, p_count, q_count, comma_count

In [19]:
os.makedirs(data_path + model_type, exist_ok=True)
space_count, p_count, q_count, comma_count = return_counts(encoded_texts,targets)

21.318509574504603


In [20]:

for i, name in enumerate(('train', 'valid', 'test')):
    with open(data_path + f'{model_type}/{name}_data.pkl', 'wb') as f:
        pickle.dump((encoded_texts[i], targets[i], space_count, p_count, q_count, comma_count), f)

In [21]:
from collections import Counter

for ds_targets in targets:
    c = Counter((target for t in ds_targets for target in t))
    print('\t'.join([str(c[i]) for i in (1,2,3,0,-1)]))

90470	2506	163939	3856079	622536
18138	915	20750	670571	106877
18103	782	20527	668320	105722


In [22]:
e = []
i = 0

raw_words = datasets[1][2].split(' ')

for te, ta in zip(encoded_texts[1][2], targets[1][2]):
    if ta == -1:
        e.append(te)
    else:
        e.append(te)
        print(f"{tokenizer.decode(e):15}\t{tokenizer.decode(target2id[ta]):10}\t{raw_words[i]}")
        e = []
        i += 1
print(f"{tokenizer.decode(e):15}\t{tokenizer.decode(target2id[ta]):10}\t")

IndexError: list index out of range

In [None]:
#print(tokenizer.decode(encoded_texts[0][1]))
[(tokenizer.convert_ids_to_tokens(code), trgt) for code,trgt in zip(encoded_texts[0][0], targets[0][0]) if trgt not in [0,-1] ]
#print(encoded_texts[0][0])
# print(datasets[0][0])
# print(encoded_texts[0][1])

In [None]:
import pickle
model_type = 'xlm-roberta-base' #albert-base-v1, bert-base-cased, bert-base-uncased
data_path = "../dataset/malay-dataset/"
data_short = '../dataset/malay-short/'

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

encd = []
tgts = []
with open(f"{data_path + model_type}/train_data.pkl",'rb') as f:
    enc, target, _,_,_,_ = pickle.load(f)
    encd.append(enc)
    tgts.append(target)
with open(f"{data_path + model_type}/valid_data.pkl",'rb') as f:
    enc, target, _,_,_,_ = pickle.load(f)
    encd.append(enc)
    tgts.append(target)
with open(f"{data_path + model_type}/test_data.pkl",'rb') as f:
    enc, target, _,_,_,_ = pickle.load(f)
    encd.append(enc)
    tgts.append(target)

In [None]:
import random
s,p,q,c = return_counts(encd,tgts)
for i,name in enumerate(['train','valid','test']):
    x = list(zip(encd[i],tgts[i]))
    enc = random.sample(x,len(x)//4)
    enc,tgt = list(zip(*enc))
    with open(data_short+ f'{model_type}/{name}_data.pkl', 'wb') as f:
        pickle.dump((enc, tgt, s, p, q, c), f)
    




In [37]:
#return_counts(encd,tgts)
#print(encoded_texts[0][0])
#print(tokenizer.decode(encoded_texts[0][0]))
'''
for word in encoded_texts[0][0]:
    print(tokenizer.convert_ids_to_tokens(word))
'''
tokenizer.encode("<mask>",add_special_tokens=False)

[250001]