# QQP

## Preparing the Environment

In [None]:
# Google Colab: mount Google Drive inside the runtime's file system.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/gdrive'
# force_remount=True asks Colab to mount again even when a mount already exists.
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

In [None]:
# Install the keras-nlp package, which provides the keras_nlp module imported
# further down in this notebook (-q reduces pip's output).
!pip install -q keras-nlp

In [None]:
#Import the libraries
import tensorflow as tf
import keras_nlp
from tensorflow import keras
import numpy as np
import os
import re
import pandas as pd
import string
import random

## Data Preprocessing and Parameters Initialization

In [None]:
# Fine-tuning hyperparameters
FINETUNING_LEARNING_RATE = 5e-5  # learning rate handed to the Adam optimizer
FINETUNING_EPOCHS = 3  # number of epochs passed to fit()
FINETUNING_BATCH_SIZE = 32  # batch size applied to the train/validation/test datasets
SEQ_LENGTH = 128  # tokenizer sequence_length and length of the classifier's input

In [None]:
# Fetch the uncased BERT vocabulary file via keras.utils.get_file.
_vocab_url = "https://storage.googleapis.com/tensorflow/keras-nlp/examples/bert/bert_vocab_uncased.txt"
vocab_file = keras.utils.get_file(origin=_vocab_url)

# Build the WordPiece tokenizer from that vocabulary with a sequence length
# of SEQ_LENGTH.
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab_file,
    sequence_length=SEQ_LENGTH,
)

In [None]:
# Load the QQP train/validation/test splits from their tab-separated files.
def _read_qqp_split(file_name, column_types):
    """Read one QQP TSV file (header row skipped) and batch its records."""
    records = tf.data.experimental.CsvDataset(
        "path_to_GLUE/QQP/" + file_name,
        column_types,
        header=True,
        field_delim="\t",
    )
    return records.batch(FINETUNING_BATCH_SIZE)


# train.tsv / dev.tsv: three integer columns, the two questions, then the label.
_labelled_columns = [tf.int32, tf.int32, tf.int32, tf.string, tf.string, tf.int32]
# test.tsv: one integer column followed by the two questions (no label).
_unlabelled_columns = [tf.int32, tf.string, tf.string]

qqp_train_ds = _read_qqp_split("train.tsv", _labelled_columns)
qqp_val_ds = _read_qqp_split("dev.tsv", _labelled_columns)
qqp_test_ds = _read_qqp_split("test.tsv", _unlabelled_columns)

In [None]:
def preprocess(index, src1, src2, sentence1, sentence2, label):
    """
    Turn one batch of labelled QQP records into (token ids, label).

    The two sentences are concatenated with the literal string '[SEP]' in
    between and the result is passed through the notebook's tokenizer.
    `index`, `src1` and `src2` are accepted only because the dataset yields
    six columns per record; they are not used.

    NOTE(review): whether the tokenizer maps '[SEP]' to a single separator
    token depends on how it was configured -- confirm.
    """
    pair_text = sentence1 + '[SEP]' + sentence2
    token_ids = tokenizer(pair_text)
    return token_ids, label


# Tokenize the training and validation splits with preprocess(), letting
# tf.data pick the parallelism and prefetch depth (AUTOTUNE).
_map_options = {"num_parallel_calls": tf.data.AUTOTUNE}

finetune_ds = qqp_train_ds.map(preprocess, **_map_options)
finetune_ds = finetune_ds.prefetch(tf.data.AUTOTUNE)

finetune_val_ds = qqp_val_ds.map(preprocess, **_map_options)
finetune_val_ds = finetune_val_ds.prefetch(tf.data.AUTOTUNE)

In [None]:
# Load the pretrained model from disk without compiling it, then print its
# layer structure.
_pretrained_model_path = 'path_to_the_pretrained_model'
model = keras.models.load_model(_pretrained_model_path, compile=False)
model.summary()

In [None]:
# Put a classification head on top of the pretrained model:
# average pooling -> dropout -> 768-unit tanh layer -> dropout -> 2-way softmax.
inputs = keras.Input(shape=(SEQ_LENGTH,), dtype=tf.int32)
encoded_tokens = model(inputs)

pooled = keras.layers.GlobalAveragePooling1D()(encoded_tokens)
pooled = keras.layers.Dropout(0.1)(pooled)
hidden = keras.layers.Dense(768, activation="tanh")(pooled)
hidden = keras.layers.Dropout(0.1)(hidden)
outputs = keras.layers.Dense(2, activation="softmax")(hidden)

finetuning_model = keras.models.Model(inputs=inputs, outputs=outputs)
finetuning_model.summary()

## Model Training

In [None]:
# TensorBoard callback: training logs are written under `logdir`.
# (A timestamp suffix, e.g. datetime.now().strftime("%Y%m%d-%H%M%S"), could be
# appended to keep runs in separate folders.)
logdir = "path_to_save_execution_information"
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

# Compile the classifier: Adam optimizer, sparse categorical cross-entropy
# loss, and sparse categorical accuracy as the reported metric.
_adam = keras.optimizers.Adam(learning_rate=FINETUNING_LEARNING_RATE)
finetuning_model.compile(
    optimizer=_adam,
    loss="sparse_categorical_crossentropy",
    metrics=["sparse_categorical_accuracy"],
)

In [None]:
# Fine-tune the classifier on the tokenized training split, evaluating on the
# validation split after every epoch and logging to TensorBoard.
finetuning_model.fit(
    finetune_ds,
    validation_data=finetune_val_ds,
    epochs=FINETUNING_EPOCHS,
    # fit() documents `callbacks` as a list of callback instances, so the
    # single TensorBoard callback is wrapped in a list.
    callbacks=[tensorboard_callback],
)

In [None]:
# Wrap the fine-tuned classifier in a model that takes raw strings:
# the tokenizer runs first and its output feeds the classifier.
inputs = keras.Input(shape=(), dtype=tf.string)
tokens = tokenizer(inputs)
outputs = finetuning_model(tokens)
final_model = keras.Model(inputs, outputs)

# Persist the string-input model to disk.
_saved_model_path = "path_to_save_model"
final_model.save(_saved_model_path)

In [None]:
# Restore the model saved in the previous cell (tokenizer + classifier, so it
# takes raw strings as input) without compiling it.
restored_model = keras.models.load_model("path_to_save_model", compile=False)

## Testing

In [None]:
def preprocess_test(index, sentence1, sentence2):
    """
    Turn one batch of unlabelled QQP test records into token ids.

    The two sentences are concatenated with the literal string '[SEP]' in
    between and the result is passed through the notebook's tokenizer.
    `index` is accepted only because the test dataset yields three columns
    per record; it is not used.
    """
    pair_text = sentence1 + '[SEP]' + sentence2
    token_ids = tokenizer(pair_text)
    return token_ids

In [None]:
# Tokenize the QQP test split with preprocess_test().
# The source dataset is qqp_test_ds (defined where the TSV files are loaded);
# the previously referenced sst_test_ds does not exist in this notebook.
finetune_test_ds = qqp_test_ds.map(
    preprocess_test, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)

In [None]:
# Generate predictions: run the fine-tuned classifier on the tokenized test
# set and keep, for each example, the index of the highest-probability class.
# - finetuning_model is used because finetune_test_ds already contains token
#   ids (the previously referenced fine_model is not defined in this notebook).
# - batch_size is not passed: the dataset is already batched, and Keras
#   documents that batch_size must not be given for dataset inputs.
result = finetuning_model.predict(finetune_test_ds, verbose=1).argmax(axis=-1)

In [None]:
# Write the predictions as a tab-separated file for upload to gluebenchmark.com.
# The file gets an explicit "index<TAB>prediction" header: the row number
# (0..n-1, in test-set order) is labelled "index" and the predicted class
# "prediction", instead of the unnamed index column and "0" column that a
# bare DataFrame would produce.
df = pd.DataFrame({"prediction": result})

df.to_csv("QQP.tsv", sep='\t', encoding='utf-8', index_label="index")

In [None]:
# Load (or reload) the TensorBoard notebook extension and open TensorBoard on
# the log directory that the TensorBoard callback was configured to write to.
%reload_ext tensorboard
%tensorboard --logdir="path_to_save_execution_information"

In [None]:
# Release the Google Colab runtime automatically once the notebook is done.
import time
from google.colab import runtime

# Wait a minute before shutting down.
# NOTE(review): presumably a grace period so pending output/file writes can
# finish -- confirm.
SHUTDOWN_DELAY_SECONDS = 60
time.sleep(SHUTDOWN_DELAY_SECONDS)
runtime.unassign()