In [1]:
!pip install tensorflow_ranking
!pip install fuzzywuzzy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_ranking
  Downloading tensorflow_ranking-0.5.0-py2.py3-none-any.whl (141 kB)
[K     |████████████████████████████████| 141 kB 7.3 MB/s 
Collecting tensorflow-serving-api<3.0.0,>=2.0.0
  Downloading tensorflow_serving_api-2.9.1-py2.py3-none-any.whl (37 kB)
Collecting tensorflow<3,>=2.9.1
  Downloading tensorflow-2.9.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (511.7 MB)
[K     |████████████████████████████████| 511.7 MB 5.8 kB/s 
[?25hCollecting flatbuffers<2,>=1.12
  Downloading flatbuffers-1.12-py2.py3-none-any.whl (15 kB)
Collecting tensorflow-estimator<2.10.0,>=2.9.0rc0
  Downloading tensorflow_estimator-2.9.0-py2.py3-none-any.whl (438 kB)
[K     |████████████████████████████████| 438 kB 49.2 MB/s 
[?25hCollecting tensorboard<2.10,>=2.9
  Downloading tensorboard-2.9.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 

In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd
import time
import tensorflow_ranking as tfr
from tensorflow_serving.apis import input_pb2 
from tqdm import tqdm
import tensorflow_hub as hub
from fuzzywuzzy import fuzz



In [3]:
# Loading model
module=hub.load('https://tfhub.dev/google/universal-sentence-encoder-qa/3')

In [4]:
# init encoders 
question_encoder = module.signatures['question_encoder']
response_encoder = module.signatures['response_encoder']
neg_response_encoder = module.signatures['response_encoder']

In [5]:
#for x in module.variables:
  # print(x)
  # print(x.name)
  # break
# v = ['QA/Final/Response_tuning/ResidualHidden_1/AdjustDepth/projection/kernel']
# for vv in v:
#   print(vv)
#   break

In [6]:
# layers to be fine-tuned
v = ['QA/Final/Response_tuning/ResidualHidden_1/AdjustDepth/projection/kernel']
var_finetune = [x for x in module.variables for vv in v if vv in x.name]

In [7]:
adam_optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001,
                beta_1 = 0.9,
                beta_2 = 0.999,
                epsilon = 1e-07)

In [8]:
def triplet_loss(anchor_vector, positive_vector, negative_vector, margin):
    """Computes the triplet loss with semi-hard negative mining.
    The loss encourages the positive distances (between a pair of embeddings with
    the same labels) to be smaller than the minimum negative distance among
    which are at least greater than the positive distance plus the margin constant
    (called semi-hard negative) in the mini-batch. If no such negative exists,
    uses the largest negative distance instead.
    See: https://arxiv.org/abs/1503.03832.

    :type anchor_vector: tf.Tensor
    :type positive_vector: tf.Tensor
    :type negative_vector: tf.Tensor
    :type metric: str
    :type margin: float
    :param anchor_vector: The anchor vector in this use case should be the encoded query. 
    :param positive_vector: The positive vector in this use case should be the encoded response. 
    :param negative_vector: The negative vector in this use case should be the wrong encoded response. 
    :param metric: Specify loss function
    :param margin: Margin parameter in loss function. See link above. 
    :return: the triplet loss value, as a tf.float32 scalar.
    """
    cosine_distance = tf.keras.losses.CosineSimilarity(axis=1)
    d_pos = cosine_distance(anchor_vector, positive_vector)
    d_neg = cosine_distance(anchor_vector, negative_vector)
    # print("d_pos: ", d_pos)
    # print("d_neg: ", d_neg)
    loss = tf.maximum(0., margin - d_pos + d_neg)  # 0 indicates orthogonality and values closer to -1 indicate greater similarity. The values closer to 1 indicate greater dissimilarity.
    # print("triplet_loss: ", loss)
    loss = tf.reduce_mean(loss)
    # print("triplet_loss_reduce_mean: ", loss)
    return loss

In [9]:
import statistics

In [10]:
def finetune_weights(question, 
                     answer,
                     neg_answer,
                     question_encoder,
                     response_encoder,
                     neg_response_encoder,
                     var_finetune,
                     optimizer,
                     batch_size,
                     epoches,
                     margin=0.3,
                     loss='triplet'):
                         #context=[], 
                         #neg_answer=[],
                         #neg_answer_context=[], 
                         #label=[]):
        """
        Finetune the model with GradientTape

        :type question: list of str
        :type answer: list of str
        :type context: list of str
        :type neg_answer: list of str
        :type neg_answer_context: list of str
        :type margin: float
        :type label: list of int
        :type loss: str
        :param question: List of string queries
        :param answer: List of string responses
        :param context: List of string response contexts, this is applicable to the USE model
        :param neg_answer: List of string responses that do not match with the queries. This is applicable for triplet / contrastive loss.
        :param neg_answer_context: Similar to neg_answer for the USE model to ingest
        :param label: List of int
        :param margin: Marrgin tuning parameter for triplet / contrastive loss
        :param loss: Specify loss function
        :return:  numpy array of mean loss value
        """
        cost_history = []
        epoch=0
        while epoch<epoches:
          for idx in range(0, len(question), batch_size):

            with tf.GradientTape() as tape:
                # tape.watch(var_finetune)
                # get encodings
                question_embeddings = question_encoder(
                    tf.constant(question[idx:idx+batch_size])
                )['outputs']

                response_embeddings = response_encoder(
                    input=tf.constant(answer[idx:idx+batch_size]),
                    context=tf.constant(answer[idx:idx+batch_size])
                )['outputs']

                #print(question_embeddings)
                #print(response_embeddings)

                
                if loss == 'cosine':
                    """
                    # https://www.tensorflow.org/api_docs/python/tf/keras/losses/CosineSimilarity

                    """
                    cost = tf.keras.losses.CosineSimilarity(axis=1)
                    cost_value = cost(question_embeddings, response_embeddings)
                    
                elif loss == 'triplet':
                    """
                    Triplet loss uses a non-official self-implementated loss function outside of TF based on cosine distance

                    """
                    neg_response_embeddings = neg_response_encoder(
                        input=tf.constant(neg_answer[idx:idx+batch_size]),
                        context=tf.constant(neg_answer[idx:idx+batch_size])
                    )['outputs']

                    cost_value = triplet_loss(
                        question_embeddings,
                        response_embeddings,
                        neg_response_embeddings,
                        margin=margin
                    )


            # record loss
            cost_history.append(cost_value.numpy().mean())
            #print("cost_value: ", cost_value)
            #print(var_finetune)
            # apply gradient
            grads = tape.gradient(cost_value, var_finetune)
            #print(grads)
            #print(type(grads))
            optimizer.apply_gradients(zip(grads, var_finetune))

          print (statistics.mean(cost_history))
          print ("epoch: ",epoch+1)
          epoch+=1

In [11]:
# mount the Google Drive 
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [12]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

In [13]:
path_csv  = "/content/drive/MyDrive/NUHS/"
df = pd.read_csv(os.path.join(path_csv, "FINAL_TL_Source.csv")).dropna()

In [14]:
questions_list = df['anchor']
responses_list = df['positive']
neg_response_list = df['negative']

In [15]:
finetune_weights(question = questions_list, 
                 answer = responses_list,
                 neg_answer = neg_response_list,
                 question_encoder = question_encoder,
                 response_encoder = response_encoder,
                 neg_response_encoder = neg_response_encoder,
                 var_finetune = var_finetune,
                 optimizer = adam_optimizer,
                 batch_size = 64,
                 epoches = 8)

0.19948919
epoch:  1
0.12186948
epoch:  2
0.087740794
epoch:  3
0.06823169
epoch:  4
0.055498417
epoch:  5
0.046484556
epoch:  6
0.039877873
epoch:  7
0.03490441
epoch:  8


In [16]:
tf.saved_model.save(module,"/content/drive/MyDrive/NUHS/USE_triplet",signatures={'default':module.signatures['default'],
                                         'question_encoder':module.signatures['question_encoder'],
                                         'response_encoder':module.signatures['response_encoder']})

INFO:tensorflow:Assets written to: /content/drive/MyDrive/NUHS/USE_triplet/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/NUHS/USE_triplet/assets


In [17]:
model=tf.saved_model.load('/content/drive/MyDrive/NUHS/USE_triplet')


In [18]:
model.signatures

_SignatureMap({'default': <ConcreteFunction signature_wrapper(*, input) at 0x7F9AE4EBA6D0>, 'question_encoder': <ConcreteFunction signature_wrapper(*, input) at 0x7F9AE67DEF90>, 'response_encoder': <ConcreteFunction signature_wrapper(*, input, context) at 0x7F9AECEE6610>})

In [19]:
for sig in model.signatures:
  print(sig)

default
question_encoder
response_encoder


In [20]:
x = tf.keras.layers.Input(shape=[], dtype=tf.string)
y = hub.KerasLayer(model, 
                    trainable=False)(x)
z1 = tf.keras.layers.Dense(128,activation='relu')(y)
z2 = tf.keras.layers.Dense(32,activation='relu')(z1)
z3 = tf.keras.layers.Dense(8,activation='relu')(z2)
z = tf.keras.layers.Dense(1, activation='sigmoid')(z3)
model1 = tf.keras.models.Model(x, z)

In [21]:
model1.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None,)]                 0         
                                                                 
 keras_layer (KerasLayer)    (None, 512)               148698369 
                                                                 
 dense (Dense)               (None, 128)               65664     
                                                                 
 dense_1 (Dense)             (None, 32)                4128      
                                                                 
 dense_2 (Dense)             (None, 8)                 264       
                                                                 
 dense_3 (Dense)             (None, 1)                 9         
                                                                 
Total params: 148,768,434
Trainable params: 70,065
Non-traina

In [22]:
model1.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

In [23]:
path_csv  = "/content/drive/MyDrive/NUHS/"
df = pd.read_csv(os.path.join(path_csv, "query_question.csv")).dropna()

In [24]:
df['pair'] = df['query'].map(str) + ' [SEP] ' + df['question'].map(str)

In [25]:
train, test= train_test_split(df, test_size=0.2, random_state=1, stratify=df['count'])

In [26]:
X_train=train.drop(['count'], axis=1)

In [27]:
test,val= train_test_split(test,test_size=0.5,random_state=1, stratify=test['count'])

In [28]:
from imblearn.under_sampling import RandomUnderSampler 

In [29]:
positive = train[train["count"] == 1]
negative = train[train["count"] == 0]

In [30]:
from sklearn.utils import resample
negative_downsample = resample(negative,
             replace=False,
             n_samples=2*len(positive),
             random_state=42)

In [31]:
train_downsample = pd.concat([negative_downsample, positive])

In [32]:
train_downsample=train_downsample.sample(frac=1)

In [33]:
train_downsample

Unnamed: 0,query,question,count,pair
12500,fee for citizens,are singapore citizen and permanent resident n...,0,fee for citizens [SEP] are singapore citizen a...
15628,i do not have the bill reference numbers,what is mobile registration e-service on the o...,0,i do not have the bill reference numbers [SEP]...
14844,how to take que# thru the apps. i can't find it.,is there any payment limit in the onenuhs app?,0,how to take que# thru the apps. i can't find i...
11226,i need to change appointment,how do i book a nup swab test appointment?,0,i need to change appointment [SEP] how do i bo...
1732,i would like to speak to a representative,speak with a call operator,1,i would like to speak to a representative [SEP...
...,...,...,...,...
6019,i missed my appoinment on 11 of february @ cck...,how do i book a nup swab test appointment?,0,i missed my appoinment on 11 of february @ cck...
23889,what appointment did i miss today?,how are my appointments displayed in the onenu...,0,what appointment did i miss today? [SEP] how a...
11247,why i can’t see my child name in my list,what is medication delivery service?,0,why i can’t see my child name in my list [SEP]...
11237,make appointment in 5weeks time how?,how do i reduce my waiting time at the clinic?,0,make appointment in 5weeks time how? [SEP] how...


In [34]:
model1.fit(train_downsample['pair'], 
          train_downsample['count'], 
          epochs=8,
          batch_size=64, 
          validation_data=(val['pair'], val['count']))

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f9ae130d150>

In [35]:
pred=model1.predict(test['pair'])



In [36]:
for i, x in enumerate(pred):
    if x <0.5: pred[i] = 0
    else: pred[i] = 1

In [37]:
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(test['count'],pred).ravel()

In [38]:
tn, fp, fn, tp

(2365, 249, 280, 276)

In [39]:
from sklearn.metrics import f1_score
f1_score(test['count'],pred)

0.5106382978723403

In [40]:
from sklearn.metrics import roc_auc_score

In [41]:
roc_auc_score(test['count'], pred)

0.7005732827662889