In [1]:
import transformers
from transformers import TFAutoModel, AutoTokenizer, PreTrainedTokenizerFast, TFAutoModelForMaskedLM, TFAutoModelForTokenClassification, TFMT5ForConditionalGeneration

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers as kl
import tensorflow_addons as tfa

import re
import nltk
from nltk.tokenize import word_tokenize

import os
import gzip
import tarfile
import glob
import random
import csv
import statistics

import sklearn
import pandas as pd
import numpy as np

 The versions of TensorFlow you are currently using is 2.7.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [2]:
def destringify(string):
    """Parse one text line of whitespace-separated integers into an int32 tensor.

    Args:
        string: a string tensor holding integers separated by whitespace
            (presumably one line of a token-id or label file — confirm
            against the files being read).

    Returns:
        The parsed numbers as an int32 tensor.
    """
    return tf.strings.to_number(
        tf.strings.split(string),
        out_type=tf.int32,
    )

In [3]:
def attention_maskify(string, pad_token_id=1):
    """Build an attention mask from a text line of whitespace-separated token ids.

    Args:
        string: a string tensor holding integer token ids separated by
            whitespace.
        pad_token_id: id that marks padding. Defaults to 1, the value that
            was previously hard-coded here (and the value the batching cell
            pads token ids with).

    Returns:
        An int32 tensor with 1 where the token id differs from
        ``pad_token_id`` and 0 where it equals it.
    """
    token_ids = tf.strings.to_number(
        tf.strings.split(string), out_type=tf.dtypes.int32
    )
    is_real_token = tf.math.not_equal(token_ids, pad_token_id)
    return tf.cast(is_real_token, tf.int32)

In [4]:
import keras.backend as K
def matthews_correlation(y_true, y_pred):
    """Matthews correlation coefficient computed over all elements of a batch.

    Both inputs are clipped to [0, 1] and rounded before counting, so the
    confusion-matrix cells are computed on hard 0/1 values. Note that this
    means raw (unbounded) scores are effectively thresholded at 0.5.

    Args:
        y_true: ground-truth tensor.
        y_pred: prediction tensor.

    Returns:
        Scalar tensor: (tp*tn - fp*fn) / (sqrt((tp+fp)(tp+fn)(tn+fp)(tn+fn)) + epsilon).
    """
    def _binarize(values):
        # Clamp into [0, 1], then snap to 0 or 1.
        return K.round(K.clip(values, 0, 1))

    predicted_pos = _binarize(y_pred)
    predicted_neg = 1 - predicted_pos

    actual_pos = _binarize(y_true)
    actual_neg = 1 - actual_pos

    # Confusion-matrix counts, summed over every element.
    tp = K.sum(actual_pos * predicted_pos)
    tn = K.sum(actual_neg * predicted_neg)
    fp = K.sum(actual_neg * predicted_pos)
    fn = K.sum(actual_pos * predicted_neg)

    covariance = (tp * tn - fp * fn)
    scale = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

    # epsilon keeps the ratio finite when a row or column of the matrix is empty.
    return covariance / (scale + K.epsilon())

In [19]:
def _binary_accuracy_at(y_true, y_pred, threshold):
    """Scalar fraction of elements where ``(y_pred > threshold)`` equals ``y_true``.

    Stateless replacement for constructing a ``BinaryAccuracy`` metric object
    on every call, which allocated fresh metric state each time.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    if not y_pred.dtype.is_floating:
        y_pred = tf.cast(y_pred, tf.float32)
    y_true = tf.cast(y_true, y_pred.dtype)

    # Reconcile a trailing singleton axis when the ranks differ by exactly one,
    # e.g. labels of shape (batch, seq) against scores of shape (batch, seq, 1).
    true_rank, pred_rank = y_true.shape.rank, y_pred.shape.rank
    if true_rank is not None and pred_rank is not None:
        if pred_rank == true_rank + 1 and y_pred.shape[-1] == 1:
            y_pred = tf.squeeze(y_pred, axis=-1)
        elif true_rank == pred_rank + 1 and y_true.shape[-1] == 1:
            y_true = tf.squeeze(y_true, axis=-1)

    predicted = tf.cast(y_pred > threshold, y_pred.dtype)
    hits = tf.cast(tf.equal(y_true, predicted), y_pred.dtype)
    # divide_no_nan yields 0 (not NaN) for an empty input.
    return tf.math.divide_no_nan(
        tf.reduce_sum(hits), tf.cast(tf.size(hits), hits.dtype)
    )


def point5_accuracy(y_true, y_pred):
    """Binary accuracy with predictions thresholded at 0.5."""
    return _binary_accuracy_at(y_true, y_pred, 0.5)


def zero_accuracy(y_true, y_pred):
    """Binary accuracy with predictions thresholded at 0."""
    return _binary_accuracy_at(y_true, y_pred, 0)


def point25_accuracy(y_true, y_pred):
    """Binary accuracy with predictions thresholded at 0.25."""
    return _binary_accuracy_at(y_true, y_pred, 0.25)


def point75_accuracy(y_true, y_pred):
    """Binary accuracy with predictions thresholded at 0.75."""
    return _binary_accuracy_at(y_true, y_pred, 0.75)


# Pretrain on UM Dataset with MLM

### Prepare Dataset

In [5]:
# glob.glob returns matches in arbitrary (filesystem) order. Sorting makes both
# lists reproducible across machines and keeps the token and label file lists
# in the same relative order.
token_files = sorted(glob.glob('./MLM xlm-roberta files/tokens*/*'))
label_files = sorted(glob.glob('./MLM xlm-roberta files/labels*/*'))

In [6]:
# Three file-name datasets, all built with the same seed. The token files are
# listed twice because they feed both the token-id pipeline and the
# attention-mask pipeline.
# NOTE(review): token/label pairing downstream appears to rely on the identical
# seed producing the same file order in every one of these — confirm.
token_file_ds = tf.data.Dataset.list_files(file_pattern=token_files, seed=35)
attention_file_ds = tf.data.Dataset.list_files(file_pattern=token_files, seed=35)
label_file_ds = tf.data.Dataset.list_files(file_pattern=label_files, seed=35)

In [7]:
# Read every listed file line by line; each dataset element is one text line.
token_ds = tf.data.TextLineDataset(
    filenames=token_file_ds,
    num_parallel_reads=tf.data.AUTOTUNE,
)
attention_ds = tf.data.TextLineDataset(
    filenames=attention_file_ds,
    num_parallel_reads=tf.data.AUTOTUNE,
)
label_ds = tf.data.TextLineDataset(
    filenames=label_file_ds,
    num_parallel_reads=tf.data.AUTOTUNE,
)

In [8]:
# Turn each text line into a tensor: integer ids for tokens and labels, and a
# 0/1 mask (derived from the token ids) for attention.
token_ds = token_ds.map(map_func=destringify)
attention_ds = attention_ds.map(map_func=attention_maskify)
label_ds = label_ds.map(map_func=destringify)

In [9]:
# Model inputs are the (token_ids, attention_mask) pair ...
x_dataset = tf.data.Dataset.zip(
    (token_ds, attention_ds)
)
# ... and each training example is ((token_ids, attention_mask), labels).
dataset = tf.data.Dataset.zip(
    (x_dataset, label_ds)
)

In [10]:
# Keep only examples of at most 256 tokens.
# `x` is the (token_ids, attention_mask) pair, so tf.size(x) measured both
# tensors together (twice the sequence length) and the effective cut-off was
# 128 tokens. Measure the token ids alone so the limit really is 256.
# NOTE(review): this admits longer sequences than before; padded batches get
# wider accordingly.
filtered_dataset = dataset.filter(lambda x, y: tf.size(x[0]) <= 256)

In [11]:
# Shuffle examples through a 10000-element buffer.
filtered_dataset = filtered_dataset.shuffle(buffer_size=10000)

In [12]:
# Batches of 24, padded to the longest example in the batch: token ids are
# padded with 1, attention masks with 0 and labels with 0. Up to 3 batches are
# prefetched.
pad_dataset = (
    filtered_dataset
    .padded_batch(batch_size=24, padding_values=((1, 0), 0))
    .prefetch(buffer_size=3)
)

### Build Model

In [13]:
# XLM-RoBERTa base with a token-classification head configured for a single
# label; the TensorFlow model is initialised from the PyTorch checkpoint.
base_model = transformers.TFXLMRobertaForTokenClassification.from_pretrained(
    'xlm-roberta-base',
    from_pt=True,
    num_labels=1,
)

All PyTorch model weights were used when initializing TFXLMRobertaForTokenClassification.

Some weights or buffers of the TF 2.0 model TFXLMRobertaForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Print the layer list and parameter counts of the freshly loaded model.
base_model.summary()

Model: "tfxlm_roberta_for_token_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFRobertaMainLayer  multiple                 277453056 
 )                                                               
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  769       
                                                                 
Total params: 277,453,825
Trainable params: 277,453,825
Non-trainable params: 0
_________________________________________________________________


### Pretrain Model

Step 1: Train classifier layer

In [24]:
# Step 1: freeze the first two layers (the encoder and the dropout, per the
# model summary) so that only the classifier head is trained.
base_model.layers[0].trainable = False
base_model.layers[1].trainable = False

base_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    # The loss is configured with from_logits=True, i.e. the model outputs raw
    # logits. A probability of 0.5 corresponds to a logit of 0, so accuracy
    # must threshold at 0.0; the 'binary_accuracy' string shortcut thresholds
    # at 0.5. The metric keeps the same name so logged keys are unchanged.
    metrics=[tf.keras.metrics.BinaryAccuracy(name='binary_accuracy', threshold=0.0)],
)

In [25]:
# One pass over the pretraining data with only the classifier head trainable.
base_model.fit(
    x=pad_dataset,
    epochs=1,
    verbose=1,
)



<keras.callbacks.History at 0x225b61c95e0>

In [None]:
## Note: a second epoch appears to be unnecessary. The loss started at 0.25 and looks like it will end up close to that.

Step 2: Fine-tune primary model

In [26]:
# Step 2: unfreeze the first two layers so the whole model is fine-tuned.
base_model.layers[0].trainable = True
base_model.layers[1].trainable = True

base_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[
        # The loss uses from_logits=True, so predictions are raw logits and the
        # decision boundary is 0.0 (probability 0.5), not the 0.5 used by the
        # 'binary_accuracy' string shortcut. The metric name is kept.
        tf.keras.metrics.BinaryAccuracy(name='binary_accuracy', threshold=0.0),
        matthews_correlation,
    ],
)

In [27]:
# One pass over the pretraining data with every layer trainable.
base_model.fit(
    x=pad_dataset,
    epochs=1,
    verbose=1,
)



<keras.callbacks.History at 0x222fa47ef70>

In [28]:
# Another epoch at a lower learning rate (5e-6). Recompiling creates a fresh
# optimizer.
base_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-6),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[
        # Predictions are raw logits (from_logits=True), so accuracy thresholds
        # at 0.0 instead of the 0.5 used by the 'binary_accuracy' shortcut.
        tf.keras.metrics.BinaryAccuracy(name='binary_accuracy', threshold=0.0),
        matthews_correlation,
    ],
)
base_model.fit(pad_dataset, epochs=1, verbose=1)



<keras.callbacks.History at 0x22628df4640>

In [29]:
# Final pretraining epoch at the lowest learning rate (1e-6). Recompiling
# creates a fresh optimizer.
base_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-6),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[
        # Predictions are raw logits (from_logits=True), so accuracy thresholds
        # at 0.0 instead of the 0.5 used by the 'binary_accuracy' shortcut.
        tf.keras.metrics.BinaryAccuracy(name='binary_accuracy', threshold=0.0),
        matthews_correlation,
    ],
)
base_model.fit(pad_dataset, epochs=1, verbose=1)



<keras.callbacks.History at 0x2268fc9fdf0>

In [30]:
# Persist the pretrained state twice: weights only, then the whole model.
base_model.save_weights(
    filepath='XLM-Roberta_take_3_WEIGHTS_pretrained_only',
)
# Recompile with a stock optimizer and no loss or metrics before the full save.
# NOTE(review): presumably so the saved model does not reference the custom
# metric function — confirm.
base_model.compile(loss=None, optimizer='adam')
base_model.save(
    filepath='XLM-Roberta_take_3_WHOLE_MODEL_pretrained_only',
)



INFO:tensorflow:Assets written to: XLM-Roberta_take_3_WHOLE_MODEL_pretrained_only\assets


INFO:tensorflow:Assets written to: XLM-Roberta_take_3_WHOLE_MODEL_pretrained_only\assets


# Train model on WMT data

### Build WMT dataset

In [31]:
def get_labels(string):
    """Convert a whitespace-separated tag line into a 0/1 integer array.

    Each tag equal to 'BAD' becomes 1; every other tag becomes 0.
    """
    is_bad = [tag == 'BAD' for tag in string.split()]
    flags = np.asarray(is_bad)
    return flags.astype(int)

def get_zh_word_labels(string):
    """Convert a tag line into a 0/1 integer array, keeping odd positions only.

    Tags are split on whitespace and mapped to 1 for 'BAD' and 0 otherwise;
    only the tags at odd indices (1, 3, 5, ...) are returned.
    NOTE(review): presumably the even positions are gap tags and the odd ones
    are word tags — confirm against the tag file format.
    """
    is_bad = [tag == 'BAD' for tag in string.split()]
    odd_positions = is_bad[1::2]
    return np.asarray(odd_positions).astype(int)

In [32]:
def _read_lines(path):
    """Return the lines of a UTF-8 text file as a list of strings.

    Newlines are stripped and blank lines at the very end of the file are
    dropped. Blank lines elsewhere are kept so parallel files stay aligned.
    """
    with open(path, encoding='utf-8') as handle:
        lines = [line.rstrip('\n') for line in handle]
    while lines and lines[-1] == '':
        lines.pop()
    return lines


def _load_wmt_split(directory, prefix):
    """Load the five parallel files of one WMT split into a DataFrame.

    The files are read as plain text, one row per line. The previous
    ``pd.read_csv(..., sep="/n")`` treated "/n" as a separator and applied CSV
    quoting, blank-line skipping and NA detection to sentence text, any of
    which can silently shift rows between the parallel files.

    Raises:
        ValueError: if the files do not all have the same number of lines.
    """
    extensions = {
        'Source': 'src',
        'Target': 'mt',
        'Post Edits': 'pe',
        'Source Tags': 'source_tags',
        'Target Tags': 'tags',
    }
    columns = {
        name: _read_lines(f'{directory}/{prefix}.{ext}')
        for name, ext in extensions.items()
    }
    line_counts = {name: len(lines) for name, lines in columns.items()}
    if len(set(line_counts.values())) > 1:
        raise ValueError(
            f'Parallel files for {prefix!r} differ in length: {line_counts}'
        )
    return pd.DataFrame(columns)


train_df = _load_wmt_split('./WMT2021 Data/Extracted_WMT2021_data/en-zh-train', 'train')

# `df` is kept as an alias of dev_df, as in the original cell.
dev_df = df = _load_wmt_split('./WMT2021 Data/Extracted_WMT2021_data/en-zh-dev', 'dev')

multi_tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

# String form of the inputs: source + separator token + target.
train_input = [
    source + multi_tokenizer.sep_token + target
    for source, target in zip(train_df['Source'], train_df['Target'])
]
dev_input = [
    source + multi_tokenizer.sep_token + target
    for source, target in zip(dev_df['Source'], dev_df['Target'])
]

# Tag lines -> 0/1 arrays (1 == 'BAD').
train_en_labels = train_df["Source Tags"].map(get_labels)
train_zh_labels = train_df["Target Tags"].map(get_zh_word_labels)

dev_en_labels = dev_df["Source Tags"].map(get_labels)
dev_zh_labels = dev_df["Target Tags"].map(get_zh_word_labels)

  return func(*args, **kwargs)


In [33]:
# Split every sentence into a list of whitespace-separated words.
train_en_split = train_df["Source"].map(str.split)
train_zh_split = train_df["Target"].map(str.split)

dev_en_split = dev_df["Source"].map(str.split)
dev_zh_split = dev_df["Target"].map(str.split)

In [34]:
# Sanity check: every sentence must have exactly one label per word. The
# failing row and both lengths go into the assertion message.
for i in range(train_df.shape[0]):
    assert len(train_en_split[i]) == len(train_en_labels[i]), (
        f'train row {i}: {len(train_en_split[i])} source words vs '
        f'{len(train_en_labels[i])} source labels'
    )
    assert len(train_zh_split[i]) == len(train_zh_labels[i]), (
        f'train row {i}: {len(train_zh_split[i])} target words vs '
        f'{len(train_zh_labels[i])} target labels'
    )

for i in range(dev_df.shape[0]):
    assert len(dev_en_split[i]) == len(dev_en_labels[i]), (
        f'dev row {i}: {len(dev_en_split[i])} source words vs '
        f'{len(dev_en_labels[i])} source labels'
    )
    assert len(dev_zh_split[i]) == len(dev_zh_labels[i]), (
        f'dev row {i}: {len(dev_zh_split[i])} target words vs '
        f'{len(dev_zh_labels[i])} target labels'
    )

In [35]:
# Word-level inputs: source words, the tokenizer's EOS token as a separator
# word, then target words.
train_input = [
    en_words + [multi_tokenizer.eos_token] + zh_words
    for en_words, zh_words in zip(train_en_split, train_zh_split)
]
dev_input = [
    en_words + [multi_tokenizer.eos_token] + zh_words
    for en_words, zh_words in zip(dev_en_split, dev_zh_split)
]

# Tokenize the pre-split words, padding/truncating every example to 256 tokens.
train_toks = multi_tokenizer(
    train_input,
    is_split_into_words=True,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_tensors='tf',
)
dev_toks = multi_tokenizer(
    dev_input,
    is_split_into_words=True,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_tensors='tf',
)

# Word-level labels in the same layout: source labels, 0 for the separator
# word, then target labels.
train_labels = [
    [*en_tags, 0, *zh_tags]
    for en_tags, zh_tags in zip(train_en_labels, train_zh_labels)
]
dev_labels = [
    [*en_tags, 0, *zh_tags]
    for en_tags, zh_tags in zip(dev_en_labels, dev_zh_labels)
]

In [36]:
def expand_labels(labels, tokens):
    """Convert WMT's word-level labels into token-level labels.

    Args:
        labels: sequence of per-example label sequences, indexed by word.
        tokens: tokenizer output supporting ``tokens['input_ids']`` and
            ``tokens.word_ids(i)``, where ``word_ids(i)`` gives, for each
            token of example ``i``, the index of its word or None.

    Returns:
        An array shaped like ``tokens['input_ids']`` in which every token
        carries the label of the word it belongs to. Tokens whose word id is
        None keep the initial value 0.
    """
    new_labels = np.zeros_like(tokens['input_ids'])

    for row, word_labels in enumerate(labels):
        for col, word_id in enumerate(tokens.word_ids(row)):
            # Identity check: a word id of 0 is a valid index, only None means
            # "no word".
            if word_id is not None:
                new_labels[row, col] = word_labels[word_id]

    return new_labels


In [37]:
# Spread the word-level labels over the sub-word tokens of each example.
train_expanded_labels = expand_labels(labels=train_labels, tokens=train_toks)
dev_expanded_labels = expand_labels(labels=dev_labels, tokens=dev_toks)

In [38]:
# One dataset per training tensor; each element is one example.
wmt_train_ids = tf.data.Dataset.from_tensor_slices(
    tensors=train_toks['input_ids']
)
wmt_train_attention = tf.data.Dataset.from_tensor_slices(
    tensors=train_toks['attention_mask']
)
wmt_train_labels = tf.data.Dataset.from_tensor_slices(
    tensors=train_expanded_labels
)

In [39]:
# Assemble training examples as ((input_ids, attention_mask), labels).
wmt_train_ds = tf.data.Dataset.zip(
    (wmt_train_ids, wmt_train_attention)
)
wmt_train_ds = tf.data.Dataset.zip(
    (wmt_train_ds, wmt_train_labels)
)

In [40]:
# Same construction for the dev split: one dataset per tensor ...
wmt_dev_ids = tf.data.Dataset.from_tensor_slices(
    tensors=dev_toks['input_ids']
)
wmt_dev_attention = tf.data.Dataset.from_tensor_slices(
    tensors=dev_toks['attention_mask']
)
wmt_dev_labels = tf.data.Dataset.from_tensor_slices(
    tensors=dev_expanded_labels
)

# ... zipped into ((input_ids, attention_mask), labels).
wmt_dev_ds = tf.data.Dataset.zip(
    (wmt_dev_ids, wmt_dev_attention)
)
wmt_dev_ds = tf.data.Dataset.zip(
    (wmt_dev_ds, wmt_dev_labels)
)

In [41]:
# Training data: shuffle through a 1000-element buffer, then batch by 12.
wmt_train_ds = wmt_train_ds.shuffle(1000).batch(12)
# Validation data: batch only. Shuffling it changed the batch composition on
# every evaluation, which makes metrics that are computed per batch and then
# averaged (such as matthews_correlation) vary between otherwise identical
# evaluations.
wmt_dev_ds = wmt_dev_ds.batch(12)

### Train Model

In [42]:
# Save weights whenever the validation Matthews correlation reaches a new
# maximum.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='model_take2.{epoch:02d}-{val_loss:.2f}.h5',
    monitor='val_matthews_correlation',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)

# Multiply the learning rate by 0.2 after 2 epochs without val_loss
# improvement, never going below 1e-8.
reduce_lr_callback = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=2,
    min_lr=1e-8,
)

# Stop after 10 epochs without val_loss improvement and restore the best
# weights seen.
early_stop_callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    mode="auto",
    min_delta=0,
    baseline=None,
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

In [43]:
# Fine-tune on the WMT data with AdamW.
base_model.compile(
    optimizer=tfa.optimizers.AdamW(weight_decay=1e-6, learning_rate=6e-6),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[
        # Predictions are raw logits (from_logits=True), so accuracy thresholds
        # at 0.0 instead of the 0.5 used by the 'binary_accuracy' shortcut.
        # The metric name is kept, so logged keys are unchanged.
        tf.keras.metrics.BinaryAccuracy(name='binary_accuracy', threshold=0.0),
        matthews_correlation,
    ],
)

# Up to 48 epochs; the callbacks checkpoint the best weights, lower the
# learning rate on plateaus and stop early.
history = base_model.fit(
    wmt_train_ds,
    validation_data=wmt_dev_ds,
    epochs=48,
    verbose=1,
    callbacks=[model_checkpoint_callback, reduce_lr_callback, early_stop_callback],
)

Epoch 1/48
Epoch 2/48
Epoch 3/48
Epoch 4/48
Epoch 5/48
Epoch 6/48
Epoch 7/48
Epoch 8/48
Epoch 9/48
Epoch 10/48
Epoch 11/48
Epoch 12/48
Epoch 13/48
Epoch 00013: early stopping


In [44]:
# Persist the fine-tuned state twice: weights only, then the whole model.
base_model.save_weights(
    filepath='XLM-Roberta_take_3_WEIGHTS_trained',
)
# Recompile with a stock optimizer and no loss or metrics before the full save.
# NOTE(review): presumably so the saved model does not reference the custom
# metric function — confirm.
base_model.compile(loss=None, optimizer='adam')
base_model.save(
    filepath='XLM-Roberta_take_3_WHOLE_MODEL_trained',
)



INFO:tensorflow:Assets written to: XLM-Roberta_take_3_WHOLE_MODEL_trained\assets


INFO:tensorflow:Assets written to: XLM-Roberta_take_3_WHOLE_MODEL_trained\assets
