In [1]:
import transformers
from transformers import TFAutoModel, AutoTokenizer, PreTrainedTokenizerFast, TFAutoModelForMaskedLM, TFAutoModelForTokenClassification, TFMT5ForConditionalGeneration

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers as kl
import tensorflow_addons as tfa

import re
import nltk
from nltk.tokenize import word_tokenize
import sentencepiece

import os
import gzip
import tarfile
import glob
import random
import csv

import pandas as pd
import numpy as np

 The versions of TensorFlow you are currently using is 2.7.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


Tasks:
1. import parallel corpus file(s)
2. combine sequences with <\s> in between 
3. tokenize and pad sequences
4. get list of chinese tokens (use .map(regex) to get list; then tokenize it)
5. make random token replacements for chinese half (ignoring pad, English, start and end tokens)
6. create labels for chinese mistranslation
7. write tokenized sequence, labels to csv file

### UM-Corpus

1. import parallel corpus files

In [2]:
um_train_files = [file for file in glob.glob('./umcorpus-v1/UM-Corpus/data/*/*/*')]
um_test_file = [file for file in glob.glob('./umcorpus-v1/UM-Corpus/data/Testing/*')]
um_train_files.append(um_test_file[0])

um_en = []
um_zh = []

for file in um_train_files:
    with open(file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for i in range(len(lines)):
            if i % 2 == 0:
                um_en.append(lines[i])
            else: 
                um_zh.append(lines[i])
                
                
um_en = list(map(lambda x: x.rstrip('\n'), um_en))
um_zh = list(map(lambda x: x.rstrip('\n'), um_zh))
um_en = list(map(lambda x: x.rstrip(), um_en))
um_zh = list(map(lambda x: x.rstrip(), um_zh))

In [3]:
zh_vocab = list(map(lambda x: re.findall(r'[\u4e00-\u9fff]', x), um_zh))
en_vocab = list(map(lambda x: re.findall("\S+", x), um_en))

In [6]:
zh_vocab = [char for elem in zh_vocab for char in elem]
en_vocab = [char for elem in en_vocab for char in elem]

### XLM-RoBERTa version

In [122]:
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
tokens = tokenizer(um_en, um_zh)

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (524 > 512). Running this sequence through the model will result in indexing errors


In [133]:
tokenizer.decode(2)

'</s>'

In [145]:
tokens['input_ids'][9]

[0,
 87,
 174920,
 39395,
 23,
 12989,
 450,
 87,
 13648,
 959,
 18025,
 23937,
 5,
 2,
 2,
 13129,
 789,
 91378,
 7629,
 4,
 129164,
 175175,
 6711,
 30,
 2]

In [141]:
tokenizer(random.choice(en_vocab))

{'input_ids': [0, 88203, 2], 'attention_mask': [1, 1, 1]}

In [146]:
def generate_mlm(sequences, en_vocab, zh_vocab):
    
    ##sequences ARE tokenized
    
    k=24 ##count to change seed for each sequence
    
    masked=[]
    labels=[]
        
    for sequence in sequences:
        random.seed(k)
        seq = sequence
        split = seq.index(2)
        seqlen = len(seq)
        label = [0]*seqlen

        for i in range(1, split-1):
            cur_prob = random.random()
            if cur_prob < 0.3333:
                faketok = tokenizer(random.choice(en_vocab))['input_ids'][1]     
                while seq[i] == faketok or faketok==259:
                    faketok = tokenizer(random.choice(en_vocab))['input_ids'][1]                          
                seq1 = seq[:i]
                seq2 = seq[i+1:]
                seq = seq1+[faketok]+seq2
                label[i] = 1
        
        for j in range(split+2, seqlen-1):
            cur_prob = random.random()
            if cur_prob < 0.3333:
                faketok = tokenizer(random.choice(zh_vocab))['input_ids'][2]
                if seq[j] == faketok:
                    faketok = tokenizer(random.choice(zh_vocab))['input_ids'][2]                          
                seq1 = seq[:j]
                seq2 = seq[j+1:]
                seq = seq1+[faketok]+seq2
                label[j] = 1
        
        masked.append(seq)
        labels.append(label)
                
        k+=1
    
    return masked, labels

In [147]:
masked_tokens, labels = generate_mlm(tokens['input_ids'], en_vocab, zh_vocab)

In [148]:
print(masked_tokens[34])
print(labels[34])

[0, 1401, 14922, 122395, 186, 70, 38352, 1672, 1632, 84765, 99, 56816, 5, 2, 2, 21404, 274, 6824, 28673, 684, 2003, 4058, 2003, 4671, 30, 2]
[0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0]


In [150]:
for i in range(len(masked_tokens)//2):
    with open(f'./MLM xlm-roberta files/tokens1/um-mlm_tokens_{format(i, "07d")}.txt', 'w', encoding='utf-8') as f:
        with open(f'./MLM xlm-roberta files/labels1/um-mlm_labels_{format(i, "07d")}.txt', 'w', encoding='utf-8') as g:
            f.write(stringify(masked_tokens[i]))
            g.write(stringify(labels[i]))

In [151]:
for i in range(len(masked_tokens)//2, len(masked_tokens)):
    with open(f'./MLM xlm-roberta files/tokens2/um-mlm_tokens_{format(i, "07d")}.txt', 'w', encoding='utf-8') as f:
        with open(f'./MLM xlm-roberta files/labels2/um-mlm_labels_{format(i, "07d")}.txt', 'w', encoding='utf-8') as g:
            f.write(stringify(masked_tokens[i]))
            g.write(stringify(labels[i]))