In [1]:
# mount the Google Drive 
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

In [3]:
path_csv  = "/content/drive/MyDrive/NUHS/"
df = pd.read_csv(os.path.join(path_csv, "query_question.csv")).dropna()

In [4]:
df['pair'] = df['query'].map(str) + ' [SEP] ' + df['question'].map(str)

In [5]:
train, test= train_test_split(df, test_size=0.2, random_state=1, stratify=df['count'])

In [6]:
X_train=train.drop(['count'], axis=1)

In [7]:
test,val= train_test_split(test,test_size=0.5,random_state=1, stratify=test['count'])

In [8]:
from imblearn.under_sampling import RandomUnderSampler 

In [9]:
positive = train[train["count"] == 1]
negative = train[train["count"] == 0]

In [10]:
from sklearn.utils import resample
negative_downsample = resample(negative,
             replace=False,
             n_samples=2*len(positive),
             random_state=42)

In [11]:
train_downsample = pd.concat([negative_downsample, positive])

In [12]:
train_downsample=train_downsample.sample(frac=1)

In [13]:
train_downsample

Unnamed: 0,query,question,count,pair
13283,speak to doctor,do i need to be referred by doctors to use med...,0,speak to doctor [SEP] do i need to be referred...
1883,how do i take queue number frim this app,is the clinic queue updated real-time in onenu...,1,how do i take queue number frim this app [SEP]...
28700,mmg appt at bukit batok polyclinic,how do i arrange for an appointment to change ...,0,mmg appt at bukit batok polyclinic [SEP] how d...
1172,i want my medicine to be deliver to home,i want to request for home delivery for my med...,1,i want my medicine to be deliver to home [SEP]...
5169,how to get appointment for meningitis vaccine,how do i book a nup swab test appointment?,1,how to get appointment for meningitis vaccine ...
...,...,...,...,...
30027,how to do on line registration for my medical ...,what is shown under appointments in onenuhs app?,0,how to do on line registration for my medical ...
29056,appointment related,how are my appointments displayed in the onenu...,0,appointment related [SEP] how are my appointme...
25049,collection of medications,medical certificate,0,collection of medications [SEP] medical certif...
18563,"i have book appointment , now how to get quene...",can i walk in to see doctor instead of having ...,0,"i have book appointment , now how to get quene..."


In [14]:
# Install the latest Tensorflow version.
!pip3 install --quiet "tensorflow>=1.7"
# Install TF-Hub.
!pip3 install --quiet tensorflow-hub
!pip3 install seaborn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [15]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
import keras.layers as layers
from keras.models import Model
from keras import backend as K
np.random.seed(10)

In [16]:
encoder = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')

In [17]:
import tensorflow as tf
import tensorflow_hub as hub

In [18]:
x = tf.keras.layers.Input(shape=[], dtype=tf.string)
y = hub.KerasLayer('https://tfhub.dev/google/universal-sentence-encoder/4', 
                    trainable=True)(x)
z1 = tf.keras.layers.Dense(128, activation='relu')(y)
z2 = tf.keras.layers.Dense(32, activation='relu')(z1)
z3 = tf.keras.layers.Dense(8, activation='relu')(z2)
z = tf.keras.layers.Dense(1, activation='sigmoid')(z3)
model = tf.keras.models.Model(x, z)

In [19]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None,)]                 0         
                                                                 
 keras_layer (KerasLayer)    (None, 512)               256797824 
                                                                 
 dense (Dense)               (None, 128)               65664     
                                                                 
 dense_1 (Dense)             (None, 32)                4128      
                                                                 
 dense_2 (Dense)             (None, 8)                 264       
                                                                 
 dense_3 (Dense)             (None, 1)                 9         
                                                                 
Total params: 256,867,889
Trainable params: 256,867,889
Non-t

In [20]:
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

In [21]:
model.fit(train_downsample['pair'], 
          train_downsample['count'], 
          epochs=4,
          batch_size=16, 
          validation_data=(val['pair'], val['count']))

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7fae34db7a10>

In [22]:
pred=model.predict(test['pair'])

In [23]:
for i, x in enumerate(pred):
    if x <0.5: pred[i] = 0
    else: pred[i] = 1

In [24]:
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(test['count'],pred).ravel()

In [25]:
tn, fp, fn, tp

(2249, 365, 174, 382)

In [26]:
from sklearn.metrics import f1_score
f1_score(test['count'],pred)

0.5863392171910975

In [27]:
from sklearn.metrics import roc_auc_score

In [28]:
roc_auc_score(test['count'], pred)

0.7737088064819758