In [1]:
import transformers
from transformers import TFAutoModel, AutoTokenizer, PreTrainedTokenizerFast, TFAutoModelForMaskedLM, TFAutoModelForTokenClassification, TFMT5ForConditionalGeneration

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers as kl
import tensorflow_addons as tfa

import re
import nltk
from nltk.tokenize import word_tokenize

import os
import gzip
import tarfile
import glob
import random
import csv
import statistics

import sklearn
import pandas as pd
import numpy as np

 The versions of TensorFlow you are currently using is 2.7.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [2]:
def destringify(string):
    """Turn a whitespace-separated string of integers into an int32 tensor.

    Args:
        string: a string (or string tensor) such as ``'0 581 2'``.

    Returns:
        The result of ``tf.strings.to_number`` applied to the split pieces,
        with dtype ``tf.int32``.
    """
    pieces = tf.strings.split(string)
    return tf.strings.to_number(pieces, out_type=tf.dtypes.int32)

In [3]:
def stringify(numlist):
    """Join a sequence of numbers into one space-separated string.

    Args:
        numlist: any iterable of numbers (list, NumPy row, ...).

    Returns:
        ``str(num)`` for every element, separated by single spaces, with no
        trailing space. An empty input yields ``''`` (the previous version
        raised ``UnboundLocalError`` in that case).
    """
    # str.join is linear; repeated string concatenation in a loop is quadratic.
    return ' '.join(str(num) for num in numlist)

In [4]:
def attention_maskify(string):
    """Derive a 0/1 mask from a whitespace-separated string of token ids.

    Args:
        string: a string (or string tensor) of integer ids separated by spaces.

    Returns:
        An int32 tensor holding 1 where the id differs from 1 and 0 where it
        equals 1.
    """
    # NOTE(review): id 1 is presumably the padding id of the tokenizer — confirm.
    ids = tf.strings.to_number(tf.strings.split(string), out_type=tf.dtypes.int32)
    keep = tf.math.not_equal(ids, tf.constant([1]))
    return tf.cast(keep, tf.int32)

### Build Model

In [5]:
# Build the XLM-RoBERTa token classifier with one output per token (num_labels=1).
# from_pt=True initialises the TensorFlow model from the PyTorch checkpoint; the
# classifier head is newly initialised at this point (see the message below).
base_model = transformers.TFXLMRobertaForTokenClassification.from_pretrained('xlm-roberta-base', num_labels=1, from_pt=True)

All PyTorch model weights were used when initializing TFXLMRobertaForTokenClassification.

Some weights or buffers of the TF 2.0 model TFXLMRobertaForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Print the layer/parameter overview of the freshly built model.
base_model.summary()

Model: "tfxlm_roberta_for_token_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFRobertaMainLayer  multiple                 277453056 
 )                                                               
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  769       
                                                                 
Total params: 277,453,825
Trainable params: 277,453,825
Non-trainable params: 0
_________________________________________________________________


In [7]:
# Restore previously saved weights from a checkpoint path relative to the
# working directory; this overwrites the weights loaded by from_pretrained.
base_model.load_weights('XLM-Roberta_take_3_WEIGHTS_trained')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x21f6480cf10>

# Predict WMT data

### Build WMT dataset

In [8]:
def get_labels(string):
    """Convert a whitespace-separated tag line into a 0/1 integer array.

    Args:
        string: tag line, e.g. ``'OK BAD OK'``.

    Returns:
        NumPy integer array with 1 for every tag equal to ``'BAD'`` and 0 for
        every other tag, in the original order.
    """
    is_bad = [tag == 'BAD' for tag in string.split()]
    return np.asarray(is_bad).astype(int)

def get_zh_word_labels(string):
    """Convert a tag line into 0/1 labels, keeping only odd-positioned tags.

    Only the tags at (0-based) positions 1, 3, 5, ... are kept; the tags at
    even positions are discarded.

    Args:
        string: whitespace-separated tag line.

    Returns:
        NumPy integer array with 1 where a kept tag equals ``'BAD'``, else 0.
    """
    # NOTE(review): even positions are presumably WMT gap tags — confirm.
    word_tags = string.split()[1::2]
    return np.asarray([tag == 'BAD' for tag in word_tags]).astype(int)

In [9]:
def expand_labels(labels, tokens):
    """Convert WMT's word-level labels into token-level labels.

    Args:
        labels: sequence with one entry per example; each entry is indexable
            by word position and holds that word's label.
        tokens: tokenizer output supporting ``tokens['input_ids']`` and
            ``tokens.word_ids(i)``, where ``word_ids(i)`` yields, per token,
            the index of the word it came from or ``None``.

    Returns:
        Array shaped like ``tokens['input_ids']``. Every token that maps to a
        word receives that word's label; tokens whose word id is ``None``
        stay 0.
    """
    new_labels = np.zeros_like(tokens['input_ids'])

    for i, cur_labels in enumerate(labels):
        for j, word_id in enumerate(tokens.word_ids(i)):
            # Identity check: word id 0 is valid and must not be confused with None.
            if word_id is not None:
                new_labels[i, j] = cur_labels[word_id]

    return new_labels


In [10]:
# Load the WMT2021 en-zh train/dev files, which hold one record per line.
#
# The files are read as plain text instead of with pd.read_csv(sep="/n"):
# "/n" is a two-character regex separator (not a newline), so it forced the
# slow python parser engine, would split any line containing "/n" into extra
# columns, and read_csv additionally skips blank lines, which can silently
# misalign the parallel files.
WMT_DATA_DIR = './WMT2021 Data/Extracted_WMT2021_data'
WMT_FILE_SUFFIXES = {
    'Source': 'src',
    'Target': 'mt',
    'Post Edits': 'pe',
    'Source Tags': 'source_tags',
    'Target Tags': 'tags',
}

wmt_frames = {}
for split in ('train', 'dev'):
    split_columns = {}
    for column, suffix in WMT_FILE_SUFFIXES.items():
        with open(f'{WMT_DATA_DIR}/en-zh-{split}/{split}.{suffix}', encoding='utf-8') as f:
            split_columns[column] = [line.rstrip('\n') for line in f]
    # pd.DataFrame raises ValueError if the files of a split differ in line count.
    wmt_frames[split] = pd.DataFrame(split_columns)

train_df = wmt_frames['train']
dev_df = df = wmt_frames['dev']  # `df` alias kept because the original cell defined it

multi_tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

# Plain-string "source<sep>target" inputs.
train_input = [src + multi_tokenizer.sep_token + tgt for src, tgt in zip(train_df['Source'], train_df['Target'])]
dev_input = [src + multi_tokenizer.sep_token + tgt for src, tgt in zip(dev_df['Source'], dev_df['Target'])]

# Word-level 0/1 labels parsed from the tag columns.
train_en_labels = train_df["Source Tags"].map(get_labels)
train_zh_labels = train_df["Target Tags"].map(get_zh_word_labels)

dev_en_labels = dev_df["Source Tags"].map(get_labels)
dev_zh_labels = dev_df["Target Tags"].map(get_zh_word_labels)

  return func(*args, **kwargs)


In [11]:
# Whitespace-split every source/target sentence into a list of words.
train_en_split, train_zh_split = (
    train_df[column].map(lambda sentence: sentence.split())
    for column in ("Source", "Target")
)

dev_en_split, dev_zh_split = (
    dev_df[column].map(lambda sentence: sentence.split())
    for column in ("Source", "Target")
)

In [12]:
# Sanity check: every sentence must have exactly one label per whitespace word.
# The assertion messages name the split and row (the previous `print(i)` message
# evaluated to None, so the AssertionError itself carried no information).
for split_name, n_rows, en_words, en_tags, zh_words, zh_tags in (
    ('train', train_df.shape[0], train_en_split, train_en_labels, train_zh_split, train_zh_labels),
    ('dev', dev_df.shape[0], dev_en_split, dev_en_labels, dev_zh_split, dev_zh_labels),
):
    for i in range(n_rows):
        assert len(en_words[i]) == len(en_tags[i]), (
            f'{split_name} row {i}: {len(en_words[i])} source words vs {len(en_tags[i])} source labels'
        )
        assert len(zh_words[i]) == len(zh_tags[i]), (
            f'{split_name} row {i}: {len(zh_words[i])} target words vs {len(zh_tags[i])} target labels'
        )

In [13]:
# Word-level model inputs: source words, the tokenizer's EOS token as one
# separator "word", then the target words.
eos_word = [multi_tokenizer.eos_token]
train_input = [en + eos_word + zh for en, zh in zip(train_en_split, train_zh_split)]
dev_input = [en + eos_word + zh for en, zh in zip(dev_en_split, dev_zh_split)]

# Both splits are padded/truncated to 256 tokens and returned as TF tensors.
tokenizer_kwargs = dict(
    max_length=256,
    padding='max_length',
    truncation=True,
    is_split_into_words=True,
    return_tensors='tf',
)
train_toks = multi_tokenizer(train_input, **tokenizer_kwargs)
dev_toks = multi_tokenizer(dev_input, **tokenizer_kwargs)

# Word-level labels laid out like the inputs; the separator word gets label 0.
train_labels = [list(en) + [0] + list(zh) for en, zh in zip(train_en_labels, train_zh_labels)]
dev_labels = [list(en) + [0] + list(zh) for en, zh in zip(dev_en_labels, dev_zh_labels)]

In [14]:
# Spread the word-level labels over the sub-word tokens of each split.
train_expanded_labels, dev_expanded_labels = (
    expand_labels(word_labels, encoding)
    for word_labels, encoding in ((train_labels, train_toks), (dev_labels, dev_toks))
)

In [15]:
# One tf.data.Dataset per training component: ids, attention mask, labels.
wmt_train_ids, wmt_train_attention = (
    tf.data.Dataset.from_tensor_slices(train_toks[field])
    for field in ('input_ids', 'attention_mask')
)
wmt_train_labels = tf.data.Dataset.from_tensor_slices(train_expanded_labels)

In [16]:
# Final element structure: ((input_ids, attention_mask), labels).
wmt_train_features = tf.data.Dataset.zip((wmt_train_ids, wmt_train_attention))
wmt_train_ds = tf.data.Dataset.zip((wmt_train_features, wmt_train_labels))

In [17]:
# Dev split: per-component datasets first ...
wmt_dev_ids, wmt_dev_attention = (
    tf.data.Dataset.from_tensor_slices(dev_toks[field])
    for field in ('input_ids', 'attention_mask')
)
wmt_dev_labels = tf.data.Dataset.from_tensor_slices(dev_expanded_labels)

# ... then zipped into ((input_ids, attention_mask), labels) elements.
wmt_dev_features = tf.data.Dataset.zip((wmt_dev_ids, wmt_dev_attention))
wmt_dev_ds = tf.data.Dataset.zip((wmt_dev_features, wmt_dev_labels))

In [18]:
# Group examples into batches of 12.
# NOTE: this rebinds the dataset names, so re-running the cell batches the
# already-batched datasets again; re-run the dataset cells above first.
PREDICT_BATCH_SIZE = 12
wmt_train_ds, wmt_dev_ds = (
    dataset.batch(PREDICT_BATCH_SIZE) for dataset in (wmt_train_ds, wmt_dev_ds)
)

### Import Test set data

In [19]:
# Load the WMT en-zh test20 files, which hold one record per line.
#
# The files are read as plain text instead of with pd.read_csv(sep="/n"):
# "/n" is a two-character regex separator (not a newline), so any line
# containing "/n" would be split into extra columns, and read_csv also skips
# blank lines, which can silently misalign the parallel files.
TEST20_DIR = './WMT2021 Data/Extracted_WMT2021_data/en-zh-test20'
test20_columns = {}
for column, suffix in (
    ('Source', 'src'),
    ('Target', 'mt'),
    ('Post Edits', 'pe'),
    ('Source Tags', 'source_tags'),
    ('Target Tags', 'tags'),
):
    with open(f'{TEST20_DIR}/test20.{suffix}', encoding='utf-8') as f:
        test20_columns[column] = [line.rstrip('\n') for line in f]
# pd.DataFrame raises ValueError if the files differ in line count.
test20_df = pd.DataFrame(test20_columns)

multi_tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

# Plain-string "source<sep>target" inputs.
test20_input = [src + multi_tokenizer.sep_token + tgt for src, tgt in zip(test20_df['Source'], test20_df['Target'])]

# Word-level 0/1 labels parsed from the tag columns.
test20_en_labels = test20_df["Source Tags"].map(get_labels)
test20_zh_labels = test20_df["Target Tags"].map(get_zh_word_labels)

In [20]:
# Whitespace-split every test20 source/target sentence into a list of words.
test20_en_split, test20_zh_split = (
    test20_df[column].map(lambda sentence: sentence.split())
    for column in ("Source", "Target")
)

In [21]:
# Sanity check: every test20 sentence must have exactly one label per word.
# The messages name the failing row (the previous `print(i)` message evaluated
# to None, so the AssertionError itself carried no information).
for i in range(test20_df.shape[0]):
    assert len(test20_en_split[i]) == len(test20_en_labels[i]), (
        f'test20 row {i}: {len(test20_en_split[i])} source words vs '
        f'{len(test20_en_labels[i])} source labels'
    )
    assert len(test20_zh_split[i]) == len(test20_zh_labels[i]), (
        f'test20 row {i}: {len(test20_zh_split[i])} target words vs '
        f'{len(test20_zh_labels[i])} target labels'
    )

In [22]:
# Word-level inputs: source words, the EOS token as separator word, target words.
eos_word = [multi_tokenizer.eos_token]
test20_input = [en + eos_word + zh for en, zh in zip(test20_en_split, test20_zh_split)]

In [23]:
# Tokenise the pre-split words, padded/truncated to 256 tokens, as TF tensors.
test20_toks = multi_tokenizer(
    test20_input,
    max_length=256,
    padding='max_length',
    truncation=True,
    is_split_into_words=True,
    return_tensors='tf',
)

# Word-level labels laid out like the inputs; the separator word gets label 0.
test20_labels = [list(en) + [0] + list(zh) for en, zh in zip(test20_en_labels, test20_zh_labels)]

In [24]:
# Spread the test20 word-level labels over the sub-word tokens.
test20_expanded_labels = expand_labels(test20_labels, test20_toks)

In [25]:
# One tf.data.Dataset per test20 component: ids, attention mask, labels.
wmt_test20_ids, wmt_test20_attention = (
    tf.data.Dataset.from_tensor_slices(test20_toks[field])
    for field in ('input_ids', 'attention_mask')
)
wmt_test20_labels = tf.data.Dataset.from_tensor_slices(test20_expanded_labels)

In [26]:
# Zip into ((input_ids, attention_mask), labels) elements, then batch by 12.
wmt_test20_features = tf.data.Dataset.zip((wmt_test20_ids, wmt_test20_attention))
wmt_test20_ds = tf.data.Dataset.zip((wmt_test20_features, wmt_test20_labels)).batch(12)

### Get Predictions

In [27]:
# Run the loaded model over the batched train and dev datasets (train first).
train_preds, dev_preds = (
    base_model.predict(dataset) for dataset in (wmt_train_ds, wmt_dev_ds)
)

In [28]:
# Run the loaded model over the batched test20 dataset.
test20_preds = base_model.predict(wmt_test20_ds)

In [21]:
# Inspect the per-token outputs of the first dev example (one value per token).
dev_preds.logits[0]

array([[-0.83311844],
       [ 0.1735177 ],
       [ 0.2638688 ],
       [ 0.2962104 ],
       [ 0.4133141 ],
       [ 0.06487624],
       [ 0.21404845],
       [ 0.07419204],
       [ 0.0586598 ],
       [ 0.24233077],
       [ 0.28665268],
       [ 0.1416643 ],
       [ 0.12636887],
       [ 0.02781042],
       [ 0.10851131],
       [ 0.1414358 ],
       [ 0.9385923 ],
       [ 0.9500521 ],
       [-0.9694003 ],
       [ 0.06399247],
       [ 0.05730013],
       [ 0.06293802],
       [ 0.06669658],
       [ 0.06876331],
       [ 0.09375519],
       [ 0.08492374],
       [ 0.06584972],
       [ 0.0966587 ],
       [ 0.08480928],
       [ 0.06998149],
       [ 0.07517445],
       [ 0.08481773],
       [ 0.15928108],
       [ 0.37591457],
       [ 0.28743386],
       [ 0.10822016],
       [ 0.1453422 ],
       [ 0.10779797],
       [ 0.10894126],
       [ 0.25936478],
       [ 0.2758379 ],
       [ 0.06275497],
       [ 0.0800153 ],
       [ 0.07172898],
       [ 0.1218376 ],
       [ 0

## Process Predictions into Masked Strings

In [29]:
# Tokenise the English side on its own (no padding or truncation requested).
train_en_input = list(train_en_split)
dev_en_input = list(dev_en_split)
train_en_tokens, dev_en_tokens = (
    multi_tokenizer(words, is_split_into_words=True)
    for words in (train_en_input, dev_en_input)
)

In [30]:
# Tokenise the English side of test20 on its own (no padding or truncation requested).
test20_en_input = list(test20_en_split)
test20_en_tokens = multi_tokenizer(test20_en_input, is_split_into_words=True)

In [31]:
# Show the tokenizer's mask token string.
multi_tokenizer.mask_token

'<mask>'

In [32]:
# Encode the mask token to look up its id (the middle value of the output).
multi_tokenizer.encode('<mask>')

[0, 250001, 2]

In [33]:
# Inspect the token ids of the first English training sentence.
# NOTE(review): the displayed output ends in two 2s, which suggests the cell
# that appends an extra 2 had already been executed — confirm the run order.
train_en_tokens['input_ids'][0]

[0,
 581,
 4568,
 64718,
 1846,
 7068,
 51894,
 7,
 98,
 678,
 1919,
 91,
 47416,
 79442,
 19,
 6,
 5,
 2,
 2]

In [31]:
# Append id 2 to every English-only id list, in place.
# NOTE: not idempotent — every re-run of this cell appends another 2.
for en_encoding in (train_en_tokens, dev_en_tokens):
    for ids in en_encoding['input_ids']:
        ids.append(2)

In [32]:
# Append id 2 to every English-only test20 id list, in place.
# NOTE: not idempotent — every re-run of this cell appends another 2.
for ids in test20_en_tokens['input_ids']:
    ids.append(2)

In [34]:
# Pull the raw model outputs and flag every token whose value exceeds 0.5.
# NOTE(review): the threshold is compared against raw logits, not sigmoid
# probabilities — confirm this matches how the model was trained.
train_logits, dev_logits = train_preds.logits, dev_preds.logits

threshold_50 = [0.5]
train_mask = np.greater(train_logits, threshold_50)
dev_mask = np.greater(dev_logits, threshold_50)

In [35]:
# Drop only the trailing label axis, keeping the first two axes.
# A bare np.squeeze would also remove the batch axis when a split contains
# exactly one sentence, breaking the per-row loops below. Reshaping to the
# first two dimensions is also safe to re-run on an already 2-D mask.
train_mask = train_mask.reshape(train_mask.shape[:2])
dev_mask = dev_mask.reshape(dev_mask.shape[:2])

In [36]:
# Flag every test20 token whose raw output exceeds 0.5.
# NOTE(review): the threshold is compared against raw logits, not sigmoid
# probabilities — confirm this is intended.
test20_logits = test20_preds.logits
test20_mask = np.greater(test20_logits, [0.5])
# Remove only the trailing label axis; a bare np.squeeze would also drop the
# batch axis if the split held a single sentence.
test20_mask = np.squeeze(test20_mask, axis=-1)

In [44]:
# Masks at thresholds 0.6 and 0.75. Only the trailing label axis is squeezed:
# a bare np.squeeze would also drop the batch axis of a single-sentence split.
train_60_mask = np.squeeze(np.greater(train_logits, [0.6]), axis=-1)
dev_60_mask = np.squeeze(np.greater(dev_logits, [0.6]), axis=-1)
test20_60_mask = np.squeeze(np.greater(test20_logits, [0.6]), axis=-1)

train_75_mask = np.squeeze(np.greater(train_logits, [0.75]), axis=-1)
dev_75_mask = np.squeeze(np.greater(dev_logits, [0.75]), axis=-1)
test20_75_mask = np.squeeze(np.greater(test20_logits, [0.75]), axis=-1)

In [37]:
# Masks at threshold 0.4. Only the trailing label axis is squeezed: a bare
# np.squeeze would also drop the batch axis of a single-sentence split.
train_40_mask = np.squeeze(np.greater(train_logits, [0.4]), axis=-1)
dev_40_mask = np.squeeze(np.greater(dev_logits, [0.4]), axis=-1)
test20_40_mask = np.squeeze(np.greater(test20_logits, [0.4]), axis=-1)

In [38]:
# Clear the leading part of every mask row so English positions are never masked.
# The prefix length is len(<split>_en_tokens[i]).
# NOTE(review): presumably the token count of the English-only encoding — confirm.
# Slice assignment is clipped to the row width, so a source sentence longer than
# the 256-token row no longer raises IndexError as the per-element loop did.
for split_mask, en_encodings in (
    (train_mask, train_en_tokens),
    (dev_mask, dev_en_tokens),
):
    for i in range(len(split_mask)):
        split_mask[i, :len(en_encodings[i])] = False

In [39]:
# Index arrays of the positions that remain flagged.
mask_train_indices, mask_dev_indices = map(np.nonzero, (train_mask, dev_mask))

In [40]:
# NumPy copies of the joint source+target token ids.
train_tokens, dev_tokens = (
    encoding['input_ids'].numpy() for encoding in (train_toks, dev_toks)
)

In [41]:
# Replace every flagged position with the tokenizer's mask id.
# multi_tokenizer.mask_token_id is the value previously hard-coded as 250001
# (see the encode('<mask>') output above).
masked_train = np.where(train_mask, multi_tokenizer.mask_token_id, train_tokens)
masked_dev = np.where(dev_mask, multi_tokenizer.mask_token_id, dev_tokens)

In [42]:
# Clear the leading part of every row so English positions are never masked.
# NOTE(review): len(test20_en_tokens[i]) is presumably the token count of the
# English-only encoding — confirm.
# Slice assignment is clipped to the row width, so an over-long source sentence
# no longer raises IndexError as the per-element loop did.
for i in range(len(test20_mask)):
    test20_mask[i, :len(test20_en_tokens[i])] = False

mask_test20_indices = np.nonzero(test20_mask)

test20_tokens = test20_toks['input_ids'].numpy()

# mask_token_id is the value previously hard-coded as 250001.
masked_test20 = np.where(test20_mask, multi_tokenizer.mask_token_id, test20_tokens)

In [45]:
# Clear the leading part of every row of the 0.6 and 0.75 masks so English
# positions are never masked.
# NOTE(review): the prefix length len(<split>_en_tokens[i]) is presumably the
# token count of the English-only encoding — confirm.
# One loop replaces six copy-pasted ones; slice assignment is clipped to the
# row width, so an over-long source sentence no longer raises IndexError.
for split_mask, en_encodings in (
    (train_60_mask, train_en_tokens),
    (dev_60_mask, dev_en_tokens),
    (test20_60_mask, test20_en_tokens),
    (train_75_mask, train_en_tokens),
    (dev_75_mask, dev_en_tokens),
    (test20_75_mask, test20_en_tokens),
):
    for i in range(len(split_mask)):
        split_mask[i, :len(en_encodings[i])] = False

In [38]:
# Clear the leading part of every row of the 0.4 masks so English positions
# are never masked.
# NOTE(review): the prefix length len(<split>_en_tokens[i]) is presumably the
# token count of the English-only encoding — confirm.
# Slice assignment is clipped to the row width, so an over-long source sentence
# no longer raises IndexError as the per-element loop did.
for split_mask, en_encodings in (
    (train_40_mask, train_en_tokens),
    (dev_40_mask, dev_en_tokens),
    (test20_40_mask, test20_en_tokens),
):
    for i in range(len(split_mask)):
        split_mask[i, :len(en_encodings[i])] = False

In [None]:
# Index arrays of the positions still flagged at threshold 0.6.
mask_train_indices_60, mask_dev_indices_60, mask_test20_indices_60 = map(
    np.nonzero, (train_60_mask, dev_60_mask, test20_60_mask)
)


In [39]:
# NumPy copies of the joint source+target token ids for all three splits.
train_tokens, dev_tokens, test20_tokens = (
    encoding['input_ids'].numpy()
    for encoding in (train_toks, dev_toks, test20_toks)
)

In [47]:
# Replace every flagged position with the tokenizer's mask id.
# multi_tokenizer.mask_token_id is the value previously hard-coded as 250001
# (see the encode('<mask>') output above).
MASK_ID = multi_tokenizer.mask_token_id

masked_60_train = np.where(train_60_mask, MASK_ID, train_tokens)
masked_60_dev = np.where(dev_60_mask, MASK_ID, dev_tokens)
masked_60_test20 = np.where(test20_60_mask, MASK_ID, test20_tokens)

masked_75_train = np.where(train_75_mask, MASK_ID, train_tokens)
masked_75_dev = np.where(dev_75_mask, MASK_ID, dev_tokens)
masked_75_test20 = np.where(test20_75_mask, MASK_ID, test20_tokens)

In [40]:
# Replace every flagged position with the tokenizer's mask id.
# multi_tokenizer.mask_token_id is the value previously hard-coded as 250001
# (see the encode('<mask>') output above).
masked_40_train = np.where(train_40_mask, multi_tokenizer.mask_token_id, train_tokens)
masked_40_dev = np.where(dev_40_mask, multi_tokenizer.mask_token_id, dev_tokens)
masked_40_test20 = np.where(test20_40_mask, multi_tokenizer.mask_token_id, test20_tokens)

In [49]:
# Spot-check the second dev row after masking at threshold 0.6
# (250001 entries in the output are masked positions).
masked_60_dev[1]

array([     0,   2161,   1702,  14487,   1210,      6,      4,  12439,
         2069,  34377,    297,   1919,   3525,  28560,     44,  41945,
          214,    111,   1215,     44,      6,      4, 241599, 108171,
            7,   1295,   5655,   3041,   2945,      6,      4,   1829,
        50094,    953,    111,   1919,  17428,    966,  16398,    294,
        99653,      6,      5,      2,   1210,      6,    470,    427,
            6,    630,   1702,      6,    635,      6,      4,      6,
        80884,      6,   6347,   3942,      6,   4511,  47437,      6,
           43,  17428,    966,  16398,    294, 250001, 250001, 250001,
        23830,    953,      6,   5525,      6,      4,      6,  18325,
            6,    274,  47437,      6,     43,  59515,      6, 250001,
           44,  13129,      6,     43, 250001, 250001, 250001,      6,
            4,      6,   6628,      6,    465,      6,  17206, 210893,
         3300,   1275,      6,  19506,      6,     43,      6,  11480,
      

In [66]:
# Serialise every masked row as one space-separated string of ids.
train_mask_list = [stringify(row) for row in masked_train]
dev_mask_list = [stringify(row) for row in masked_dev]

In [24]:
# Serialise every masked test20 row as one space-separated string of ids.
test20_mask_list = [stringify(row) for row in masked_test20]

In [50]:
# Serialise the rows masked at threshold 0.6, one string per row.
train_mask_60_list = [stringify(row) for row in masked_60_train]
dev_mask_60_list = [stringify(row) for row in masked_60_dev]
test20_mask_60_list = [stringify(row) for row in masked_60_test20]

In [51]:
# Serialise the rows masked at threshold 0.75, one string per row.
train_mask_75_list = [stringify(row) for row in masked_75_train]
dev_mask_75_list = [stringify(row) for row in masked_75_dev]
test20_mask_75_list = [stringify(row) for row in masked_75_test20]

In [41]:
# Serialise the rows masked at threshold 0.4, one string per row.
train_mask_40_list = [stringify(row) for row in masked_40_train]
dev_mask_40_list = [stringify(row) for row in masked_40_dev]
test20_mask_40_list = [stringify(row) for row in masked_40_test20]

In [53]:
# Spot-check one serialised dev row masked at threshold 0.75.
dev_mask_75_list[47]

'0 31384 6032 14432 6 4 107154 118066 17368 136247 102 23 10862 111 13625 39 1681 6 5 2 6 3074 6 274 6 170772 250001 250001 6 217398 6 43793 6 274 6 2229 6 62029 43024 6 1589 6 7499 6 3987 2008 26027 6 15498 250001 250001 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'

## Write Masked Strings to Files

In [69]:
# Write the masked training strings, one per line (no trailing newline).
train_payload = '\n'.join(train_mask_list)
with open('./QE Outputs/masked_train_strings.txt', 'w', encoding='utf-8') as out_file:
    out_file.write(train_payload)

In [70]:
# Write the masked dev strings, one per line (no trailing newline).
dev_payload = '\n'.join(dev_mask_list)
with open('./QE Outputs/masked_dev_strings.txt', 'w', encoding='utf-8') as out_file:
    out_file.write(dev_payload)

In [25]:
# Write the masked test20 strings, one per line (no trailing newline).
test20_payload = '\n'.join(test20_mask_list)
with open('./QE Outputs/masked_test20_strings.txt', 'w', encoding='utf-8') as out_file:
    out_file.write(test20_payload)

In [54]:
# Write the threshold-0.6 strings for each split, one string per line.
for out_path, masked_strings in (
    ('./QE Outputs/masked_train_60_strings.txt', train_mask_60_list),
    ('./QE Outputs/masked_dev_60_strings.txt', dev_mask_60_list),
    ('./QE Outputs/masked_test20_60_strings.txt', test20_mask_60_list),
):
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(masked_strings))

In [55]:
# Write the threshold-0.75 strings for each split, one string per line.
for out_path, masked_strings in (
    ('./QE Outputs/masked_train_75_strings.txt', train_mask_75_list),
    ('./QE Outputs/masked_dev_75_strings.txt', dev_mask_75_list),
    ('./QE Outputs/masked_test20_75_strings.txt', test20_mask_75_list),
):
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(masked_strings))

In [42]:
# Write the threshold-0.4 strings for each split, one string per line.
for out_path, masked_strings in (
    ('./QE Outputs/masked_train_40_strings.txt', train_mask_40_list),
    ('./QE Outputs/masked_dev_40_strings.txt', dev_mask_40_list),
    ('./QE Outputs/masked_test20_40_strings.txt', test20_mask_40_list),
):
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(masked_strings))