# STS-B

## Preparing the Environment

In [None]:
#Google Colab - Drive Mounting
# Makes the user's Google Drive available under /content/gdrive.
# NOTE: google.colab is only importable inside a Colab runtime.
# force_remount=True requests a fresh mount even when a mount already exists.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
#Install missing library keras-nlp
# IPython shell escape (not plain Python): runs pip from the notebook; -q reduces pip's output.
!pip install -q keras-nlp

In [None]:
#Import the libraries
import tensorflow as tf
import keras_nlp
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import os
import re
import string
import random

## Data Preprocessing and Parameters Initialization

In [None]:
#Finetuning Parameters
# Learning rate handed to the Adam optimizer when the finetuning model is compiled.
FINETUNING_LEARNING_RATE = 5e-5
# Number of epochs passed to fit().
FINETUNING_EPOCHS = 3
# Number of examples per batch served by DataSequence.
FINETUNING_BATCH_SIZE = 32
# Sequence length given to the tokenizer and used as the model's input shape.
SEQ_LENGTH = 128

In [None]:
# Download of the vocabulary from BERT: Bert-uncased
# get_file fetches the file at `origin` and returns a local path to it.
vocab_file = keras.utils.get_file(
    origin="https://storage.googleapis.com/tensorflow/keras-nlp/examples/bert/bert_vocab_uncased.txt",
)
#Initialization of the Word Tokenizer, with a given vocabulary and a sequence length
# The same tokenizer object is reused for training data, test data and the saved end-to-end model.
# NOTE(review): sequence_length presumably pads/truncates every output to SEQ_LENGTH ids — confirm in the keras-nlp docs.
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab_file, sequence_length=SEQ_LENGTH,
)

In [None]:
# Placeholder: replace with the directory that holds the GLUE STS-B tsv files.
current_path = "path_to_GLUE/STS-B/"
# One tsv file per split, all located directly under current_path.
train_path = os.path.join(current_path, 'train.tsv')
dev_path = os.path.join(current_path, 'dev.tsv')
test_path = os.path.join(current_path, 'test.tsv')

In [None]:
# Definition of the Class Data using keras Sequence format
class DataSequence(keras.utils.Sequence):
    '''
    Serve model inputs and labels to Keras one batch at a time.

    Subclasses keras.utils.Sequence and implements its __len__ / __getitem__
    protocol.

    Args:
        x: list of input arrays; every array is sliced along its first axis
            with the same batch slice.
        y: labels, sliceable along the first axis; len(y) defines the number
            of examples.
        batch_size: examples per batch. When None (the default) the
            module-level FINETUNING_BATCH_SIZE is used.
    '''
    def __init__(self, x, y, batch_size=None):
        # Let the Keras base class initialise its own state; newer Keras
        # versions expect subclasses to call this.
        super().__init__()
        self.x = x
        self.y = y
        self.batch_size = FINETUNING_BATCH_SIZE if batch_size is None else batch_size

    def __len__(self):
        '''Return the number of batches, counting a final partial batch.'''
        # Ceiling division without floats.
        return (len(self.y) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        '''Return batch `index` as ([inputs sliced per array], labels slice).'''
        s = slice(index * self.batch_size, (index + 1) * self.batch_size)
        return [item[s] for item in self.x], self.y[s]


def generate_sequence(path):
    '''
    Read a tab-separated STS-B file and prepare it for training.

    The first line of the file is discarded (assumed to be the column header).
    For every remaining non-empty line, columns 7 and 8 are joined into one
    text, tokenized with the module-level `tokenizer`, and column 9 is parsed
    as the similarity score.

    Args:
        path: path of the tsv file to read.

    Returns:
        A tuple (sequence, scores):
          - sequence: DataSequence whose inputs are the tokenized texts and
            whose labels are the scores rounded to the nearest integer class.
          - scores: list with the original float scores, one per example.

    Raises:
        IndexError: if a data line has fewer than 10 tab-separated columns.
        ValueError: if column 9 cannot be parsed as a float.
    '''
    tokens, classes, scores = [], [], []
    with open(path, encoding='utf-8') as reader:
        # Discard the first line (header).
        reader.readline()
        for line in reader:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            parts = line.split('\t')
            # NOTE(review): '[SEP]' is inserted as plain text; verify that the
            # tokenizer maps it to the single separator token.
            text = (parts[7]) + '[SEP]' + (parts[8]) + '[SEP]'
            tokens.append(tokenizer(text))
            score = float(parts[9])
            # Classification Label (built-in round: ties go to the even integer)
            classes.append(round(score))
            # True Label
            scores.append(score)
    tokens, classes = np.array(tokens), np.array(classes)
    return DataSequence([tokens], classes), scores

In [None]:
# Load and Generate training data
# Each call returns (DataSequence for Keras, list of the original float scores).
train_data, y_train = generate_sequence(train_path)
dev_data, y_dev = generate_sequence(dev_path)

## Load the model and change the head

In [None]:
# Load the pretrained model and display its structure
# The path is a placeholder to replace. compile=False loads the model without
# compiling it; the finetuning model built from it is compiled later with its own loss/optimizer.
model = keras.models.load_model('path_to_the_pretrained_model',compile=False)
model.summary()

In [None]:
# The model's head is modified for classification
# The pretrained model is used as the encoder; a pooling + dense head is stacked on top of it.

# Token ids, one row of SEQ_LENGTH integers per example.
inputs = keras.Input(shape=(SEQ_LENGTH,), dtype=tf.int32)
encoded_tokens = model(inputs)
# NOTE(review): assumes the encoder output is (batch, sequence, features) so that
# pooling averages over the sequence axis — confirm against model.summary().
x = keras.layers.GlobalAveragePooling1D()(encoded_tokens)
x = keras.layers.Dropout(0.1)(x)
x = keras.layers.Dense(768, activation="tanh")(x)  
x = keras.layers.Dropout(0.1)(x)
# Six output units: one per integer class 0..5 used when building predictions from the softmax.
outputs = keras.layers.Dense(units=6, activation='softmax', name='Softmax')(x)


finetuning_model = keras.models.Model(inputs=inputs, outputs=outputs)
finetuning_model.summary()

## Model Training

In [None]:
#Create tensorboard callback
# Placeholder log directory; the same path is given to %tensorboard at the end of the notebook.
# A timestamp suffix (datetime.now().strftime("%Y%m%d-%H%M%S")) could be appended to keep
# runs separate, but that needs `from datetime import datetime`, which is not imported here.
logdir = "path_to_save_execution_information"
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

#Compile Model
# The sparse loss/metric take integer class labels, matching the rounded scores
# that generate_sequence stores as labels.
finetuning_model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=keras.optimizers.Adam(learning_rate=FINETUNING_LEARNING_RATE),
    metrics=["sparse_categorical_accuracy"])

In [None]:
#Model training
# train_data and dev_data are DataSequence objects, so batching is done by them
# and no batch_size is passed to fit().
finetuning_model.fit(
    train_data, 
    validation_data=dev_data, 
    epochs=FINETUNING_EPOCHS,
    callbacks=[tensorboard_callback])

In [None]:
# Add Tokenization layer to the model
# Builds an end-to-end model that takes raw strings: tokenizer first, then the finetuned model.
# NOTE: this rebinds the notebook-level names `inputs`, `tokens` and `outputs`.
inputs = keras.Input(shape=(), dtype=tf.string)
tokens = tokenizer(inputs)
outputs = finetuning_model(tokens)

#Save model
# The save path is a placeholder to replace.
final_model = keras.Model(inputs, outputs)
final_model.save("path_to_save_model")

In [None]:
# Restore the saved model
# Reloads the end-to-end (string input) model from the save path, without compiling it.
# NOTE(review): the prediction cell in this notebook calls finetuning_model, not
# restored_model — confirm which of the two is intended for testing.
restored_model = keras.models.load_model("path_to_save_model", compile=False)

## Testing

In [None]:
def generate_test(path):
    '''
    Read a tab-separated STS-B test file and tokenize its sentence pairs.

    The first line of the file is discarded (assumed to be the column header).
    For every remaining non-empty line, columns 7 and 8 are joined into one
    text and tokenized with the module-level `tokenizer`. No label column is
    read.

    Args:
        path: path of the tsv file to read.

    Returns:
        A numpy array holding the tokenized text of every example, in file
        order.

    Raises:
        IndexError: if a data line has fewer than 9 tab-separated columns.
    '''
    tokens = []
    with open(path, encoding='utf-8') as reader:
        # Discard the first line (header).
        reader.readline()
        for line in reader:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            parts = line.split('\t')
            # Same text layout as the training data so the model sees the same format.
            text = (parts[7]) + '[SEP]' + (parts[8]) + '[SEP]'
            tokens.append(tokenizer(text))
    return np.array(tokens)

#Load and generate test data
# test_data is a numpy array of tokenized sentence pairs (no labels are read from the test file).
test_data = generate_test(test_path)

In [None]:
#Generate predictions
# Column vector holding the class values 0..5, one row per softmax output unit.
classes = np.arange(6).reshape(-1, 1)
# Per-example class probabilities produced by the finetuned model.
hot_encod = finetuning_model.predict(test_data, verbose=True)
# Probability-weighted sum of the class values: one continuous score per example.
results = (hot_encod @ classes).squeeze(axis=-1)

In [None]:
#Format results into dataframe, ready to be uploaded on gluebenchmark.com
# pandas is not imported anywhere else in this notebook, so import it here
# (otherwise `pd` raises NameError).
import pandas as pd

# One row per test example; the DataFrame's default integer index is written as the first column.
# NOTE(review): check the header row against the format gluebenchmark.com expects before uploading.
df = pd.DataFrame(results)
df.to_csv("STS-B.tsv",sep='\t', encoding='utf-8')

In [None]:
#Load Tensorboard
# IPython magics (not plain Python): (re)load the TensorBoard notebook extension,
# then open it on the same directory that the TensorBoard callback was given as log_dir.
%reload_ext tensorboard
%tensorboard --logdir="path_to_save_execution_information"

In [None]:
#Code to automatically stop the run time for Google Colab
# Wait 60 seconds, then ask Colab to release this runtime.
# NOTE: google.colab is only importable inside a Colab runtime.
import time
time.sleep(60)
from google.colab import runtime
runtime.unassign()