# MRPC

## Preparing the Environment

In [None]:
# Google Colab only: mount the user's Google Drive at /content/gdrive
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Install the keras-nlp library (quiet mode); it is imported below as keras_nlp
!pip install -q keras-nlp

In [None]:
#Import the libraries
import os
import keras_nlp
import tensorflow as tf
from tensorflow import keras
import numpy as np
import urllib.request
import pandas as pd

## Data Preprocessing and Parameters Initialization

In [None]:
# Fine-tuning hyperparameters
# Number of samples per batch; read by DataSequence
FINETUNING_BATCH_SIZE = 32
# Token sequence length used by the tokenizer and by the model input layer
SEQ_LENGTH = 128
# NOTE(review): not referenced by the cells shown in this notebook; the
# optimizer is built with a literal 3e-5 — confirm which value is intended
FINETUNING_LEARNING_RATE = 5e-5
# Number of training epochs
FINETUNING_EPOCHS = 3

In [None]:
# Source URLs of the MRPC data. The train and test files are distributed in
# the raw 'msr' format; the dev file lists the tab-separated ID pairs that
# are held out of the training file as the dev split.

MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt'
MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'
MRPC_DEV = 'https://raw.githubusercontent.com/MegEngine/Models/master/official/nlp/bert/glue_data/MRPC/dev_ids.tsv'

# Local paths of the three files (placeholders: replace 'path_to_mrpc')
mrpc_train_file ='path_to_mrpc/msr_paraphrase_train.txt'
mrpc_test_file ='path_to_mrpc/msr_paraphrase_test.txt'
mrpc_dev_file ='path_to_mrpc/msr_dev_ids.txt'

# Uncomment the three lines below to download the files to the paths above
# urllib.request.urlretrieve(MRPC_TRAIN, mrpc_train_file)
# urllib.request.urlretrieve(MRPC_TEST, mrpc_test_file)
# urllib.request.urlretrieve(MRPC_DEV, mrpc_dev_file)

In [None]:
# Download the uncased BERT vocabulary file; vocab_file is what get_file returns
# and is handed to the tokenizer as its vocabulary
vocab_file = keras.utils.get_file(
    origin="https://storage.googleapis.com/tensorflow/keras-nlp/examples/bert/bert_vocab_uncased.txt",
)
# Build the WordPiece tokenizer from that vocabulary, with sequence_length set
# to SEQ_LENGTH
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab_file, sequence_length=SEQ_LENGTH,
)

In [None]:
# Definition of the Class Data using Keras Sequence format
class DataSequence(keras.utils.Sequence):
    '''
    Batched view over encoded inputs and their labels.

    Overrides keras.utils.Sequence: len(seq) is the number of batches and
    seq[i] is the i-th batch as (list of input slices, label slice).

    Args:
        x: list of sliceable inputs (one entry per model input).
        y: sliceable labels, aligned with every entry of x.
        batch_size: samples per batch. When None (the default), the
            notebook-level FINETUNING_BATCH_SIZE is used.
    '''

    def __init__(self, x, y, batch_size=None):
        # Let the Keras base class set up its own state; newer Keras versions
        # warn when a subclass skips this call
        super().__init__()
        self.x = x
        self.y = y
        self._batch_size = batch_size

    def _resolve_batch_size(self):
        # Fall back to the notebook-wide constant when no size was given
        if self._batch_size is None:
            return FINETUNING_BATCH_SIZE
        return self._batch_size

    def __len__(self):
        # Ceiling division so a final, partially filled batch is counted
        size = self._resolve_batch_size()
        return (len(self.y) + size - 1) // size

    def __getitem__(self, index):
        size = self._resolve_batch_size()
        s = slice(index * size, (index + 1) * size)
        return [item[s] for item in self.x], self.y[s]

def generate_sequence(path, dev_ids=None, select_dev=False):
    '''
    Read a tab-separated MRPC file and build a tokenized DataSequence.

    The first line of the file is skipped as a header. For every other row,
    column 0 is the integer label, columns 1 and 2 form the ID pair, and
    columns 3 and 4 are the two sentences, which are joined as
    "<s1>[SEP]<s2>[SEP]" and encoded with the notebook-level `tokenizer`.

    Args:
        path: path of the file to read (decoded as UTF-8).
        dev_ids: optional collection of (id1, id2) string tuples. When None,
            every row is kept.
        select_dev: when dev_ids is given, keep only the rows whose ID pair
            is in dev_ids if True, or only the rows whose pair is not in
            dev_ids if False.

    Returns:
        DataSequence wrapping [token array] and the label array.
    '''
    keep_members = bool(select_dev)
    tokens, classes = [], []
    # Explicit encoding so parsing does not depend on the platform locale
    with open(path, encoding='utf-8') as reader:
        reader.readline()  # skip the header row
        for line in reader:
            line = line.strip()
            if not line:
                # A blank line would otherwise raise IndexError below
                continue
            parts = line.split('\t')
            ids = (parts[1], parts[2])
            # Drop the row when its dev membership is not the one requested
            if dev_ids is not None and (ids in dev_ids) != keep_members:
                continue
            text = (parts[3]) + '[SEP]' + (parts[4]) + '[SEP]'
            tokens.append(tokenizer(text))
            classes.append(int(parts[0], 10))
    tokens, classes = np.array(tokens), np.array(classes)
    return DataSequence([tokens], classes)

In [None]:
# Read the dev split: each line becomes a tuple of its tab-separated fields
with open(mrpc_dev_file, encoding='utf-8') as dev_reader:
    dev_ids = {tuple(line.strip().split('\t')) for line in dev_reader}

In [None]:
# Split the training file: rows whose ID pair is in dev_ids go to dev_seq,
# all remaining rows go to train_seq
train_seq = generate_sequence(mrpc_train_file, dev_ids=dev_ids, select_dev=False)
dev_seq = generate_sequence(mrpc_train_file, dev_ids=dev_ids, select_dev=True)
# NOTE(review): test_seq is rebound by generate_test() in the Testing section
# before any visible use, so this tokenized copy looks redundant — confirm
test_seq = generate_sequence(mrpc_test_file)

## Load the model and change the head

In [None]:
# Load the pretrained model without compiling it (compile=False) and display
# its structure; replace the placeholder path with the real model location
model = keras.models.load_model('path_to_the_pretrained_model',compile=False)
model.summary()

In [None]:
# Wrap the pretrained model with a two-class classification head

# One row of SEQ_LENGTH int32 token ids per sample
inputs = keras.Input(shape=(SEQ_LENGTH,), dtype=tf.int32)
encoded_tokens = model(inputs)
# Head adapted from https://keras.io/examples/nlp/text_classification_with_transformer/
# NOTE(review): the pooling assumes the pretrained model outputs one vector
# per token, i.e. (batch, sequence, features) — confirm
x = keras.layers.GlobalAveragePooling1D()(encoded_tokens)
x = keras.layers.Dropout(0.1)(x)
x = keras.layers.Dense(768, activation="tanh")(x)
x = keras.layers.Dropout(0.1)(x)
# Softmax over two output units
outputs = keras.layers.Dense(2, activation="softmax")(x)

finetuning_model = keras.models.Model(inputs=inputs, outputs=outputs)
finetuning_model.summary()

## Model Training

In [None]:
# Create the TensorBoard callback; logs are written under `logdir`
logdir = "path_to_save_execution_information" #+ datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

# Compile the model.
# `learning_rate` is the supported keyword; the `lr` alias is deprecated and
# rejected by newer Keras optimizers.
# NOTE(review): FINETUNING_LEARNING_RATE (5e-5) is defined in the parameters
# cell but the literal 3e-5 is used here — confirm which value is intended.
finetuning_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=3e-5),
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)

In [None]:
# Train the model. Model.fit accepts keras.utils.Sequence objects directly;
# fit_generator is deprecated and absent from newer Keras releases.
finetuning_model.fit(
    train_seq,
    validation_data=dev_seq,
    epochs=FINETUNING_EPOCHS,
    callbacks=[tensorboard_callback],
)

In [None]:
# Put the tokenizer in front of the fine-tuned model so that the resulting
# model takes raw strings as input
inputs = keras.Input(shape=(), dtype=tf.string)
tokens = tokenizer(inputs)
outputs = finetuning_model(tokens)

# Build and save the string-input model (placeholder path: replace it)
final_model = keras.Model(inputs, outputs)
final_model.save("path_to_save_model")

In [None]:
# Reload the saved string-input model from disk, without compiling it
restored_model = keras.models.load_model("path_to_save_model", compile=False)

## Testing

In [None]:
def generate_test(path, dev_ids=None, select_dev=False):
    '''
    Read a tab-separated MRPC file and build a DataSequence of raw strings.

    Same parsing as generate_sequence, except that the sentence pairs are NOT
    tokenized: each input is the plain string "<s1>[SEP]<s2>[SEP]", intended
    for a model that takes strings as input.

    Args:
        path: path of the file to read (decoded as UTF-8). The first line is
            skipped as a header.
        dev_ids: optional collection of (id1, id2) string tuples. When None,
            every row is kept.
        select_dev: when dev_ids is given, keep only the rows whose ID pair
            is in dev_ids if True, or only the rows whose pair is not in
            dev_ids if False.

    Returns:
        DataSequence wrapping [string array] and the label array read from
        column 0.
    '''
    keep_members = bool(select_dev)
    sentences, classes = [], []
    # Explicit encoding so parsing does not depend on the platform locale
    with open(path, encoding='utf-8') as reader:
        reader.readline()  # skip the header row
        for line in reader:
            line = line.strip()
            if not line:
                # A blank line would otherwise raise IndexError below
                continue
            parts = line.split('\t')
            ids = (parts[1], parts[2])
            # Drop the row when its dev membership is not the one requested
            if dev_ids is not None and (ids in dev_ids) != keep_members:
                continue
            sentences.append((parts[3]) + '[SEP]' + (parts[4]) + '[SEP]')
            classes.append(int(parts[0], 10))
    sentences, classes = np.array(sentences), np.array(classes)
    return DataSequence([sentences], classes)

# Load the test data as raw strings (generate_test does not tokenize)
test_seq = generate_test(mrpc_test_file)

In [None]:
# Run the restored model on the test sequence and keep, for each sample,
# the index of the highest-scoring output as the predicted class
results = restored_model.predict(test_seq, verbose=True).argmax(axis=-1)

In [None]:
# Format the predictions for upload to gluebenchmark.com: a TSV whose header
# is "index<TAB>prediction" (the default export would write an unnamed index
# column and a column called "0")
df = pd.DataFrame(results, columns=["prediction"])

df.to_csv("MRPC.tsv", sep='\t', encoding='utf-8', index_label="index")

In [None]:
# Load the TensorBoard extension and open it on the training log directory
%reload_ext tensorboard
%tensorboard --logdir="path_to_save_execution_information"

In [None]:
# Google Colab only: wait 60 seconds, then release the runtime so that the
# session stops automatically
import time
time.sleep(60)
from google.colab import runtime
runtime.unassign()