In [1]:
import os
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing
import sparknlp # nlp processing
from sklearn.model_selection import train_test_split # splitting data
import keras
import matplotlib.pyplot as plt # visualisation
import seaborn as sns # visualisation 
%matplotlib inline

Using plaidml.keras.backend backend.


In [3]:
randomState = np.random.RandomState(seed=42) # for creating same randomness in each time

In [4]:
spark = sparknlp.start()

print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  2.5.0
Apache Spark version:  2.4.5


In [5]:
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler

In [6]:
from pyspark.sql import SQLContext

sql = SQLContext(spark)

In [7]:
df = spark.read.load("questionBertEmbeddings.parquet")

In [8]:
df.show(5)

+------+------+------+--------------------------------------+--------------------------------------+-----+
|    id|  qid1|  qid2|question1_finished_sentence_embeddings|question2_finished_sentence_embeddings|label|
+------+------+------+--------------------------------------+--------------------------------------+-----+
|119994|194709|194710|                  [[0.3533832728862...|                  [[0.1582194119691...|  1.0|
|119995|194711|171467|                  [[0.3274492025375...|                  [[0.8858540058135...|  0.0|
|119996|194712|194713|                  [[-0.307045459747...|                  [[0.4329249262809...|  0.0|
|119997|111655| 64454|                  [[-0.437017947435...|                  [[-0.389505982398...|  0.0|
|119998| 67996|194714|                  [[0.6548286676406...|                  [[0.4485912322998...|  1.0|
+------+------+------+--------------------------------------+--------------------------------------+-----+
only showing top 5 rows



In [9]:
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- qid1: long (nullable = true)
 |-- qid2: long (nullable = true)
 |-- question1_finished_sentence_embeddings: array (nullable = true)
 |    |-- element: vector (containsNull = true)
 |-- question2_finished_sentence_embeddings: array (nullable = true)
 |    |-- element: vector (containsNull = true)
 |-- label: double (nullable = true)



In [10]:
df_renamed = df.withColumnRenamed("question1_finished_sentence_embeddings","features1")\
.withColumnRenamed("question2_finished_sentence_embeddings","features2")

In [11]:
df_renamed.printSchema()

root
 |-- id: long (nullable = true)
 |-- qid1: long (nullable = true)
 |-- qid2: long (nullable = true)
 |-- features1: array (nullable = true)
 |    |-- element: vector (containsNull = true)
 |-- features2: array (nullable = true)
 |    |-- element: vector (containsNull = true)
 |-- label: double (nullable = true)



In [12]:
train_data, test_data = df_renamed.randomSplit([.8, .2], seed=1234)

In [13]:
def build_data(df, chunks=10):
    x_train_1 = []
    x_train_2 = []
    y_train = []

    row_count = df.count()
    i = 0
    
    chunks = df.randomSplit(weights=[1/chunks] * chunks)

    for chunk in chunks:
        rows = chunk.collect()
        for row in rows:
            if i % 100000 == 0:
                print('row {} / {} ({:.1f} %)'.format(i, row_count, 100 * i / row_count))
            f1 = np.array(row['features1'][0]).reshape(-1,1)
            f2 = np.array(row['features2'][0]).reshape(-1,1)
            label = row['label']
            x_train_1.append(f1)
            x_train_2.append(f2)
            y_train.append(label)
            i += 1

    #x_train = np.array([np.array(x_train_1), np.array(x_train_2)])
    y_train = np.array(y_train)
    return x_train_1, x_train_2, y_train

In [14]:
from keras.preprocessing import sequence

In [15]:
x_train_1, x_train_2, y_train = build_data(train_data)

row 0 / 323344 (0.0 %)
row 100000 / 323344 (30.9 %)
row 200000 / 323344 (61.9 %)
row 300000 / 323344 (92.8 %)


In [16]:
x_train_1 = sequence.pad_sequences(x_train_1, maxlen=768)
x_train_2 = sequence.pad_sequences(x_train_2, maxlen=768)

In [17]:
x_test_1, x_test_2, y_test = build_data(test_data)

row 0 / 80943 (0.0 %)


In [18]:
x_test_1 = sequence.pad_sequences(x_test_1, maxlen=768)
x_test_2 = sequence.pad_sequences(x_test_2, maxlen=768)

In [19]:
print(x_train_1.shape, x_train_2.shape)
print(x_test_1.shape, x_test_2.shape)

(323344, 768, 1) (323344, 768, 1)
(80943, 768, 1) (80943, 768, 1)


In [58]:
spark.stop()

In [20]:
print('Train Labels:\n', pd.Series(y_train).value_counts())
print('Test Labels:\n', pd.Series(y_test).value_counts())

Train Labels:
 0.0    204157
1.0    119187
dtype: int64
Test Labels:
 0.0    50867
1.0    30076
dtype: int64


In [40]:
from keras.models import Model
from keras.layers import Input, Flatten, Dense, Dropout, Lambda, Dot
from keras.optimizers import RMSprop
from keras import backend as K

#### Siamese Approach (Distance Based)

In [22]:
def create_base_network(input_shape):
    '''Base network to be shared (eq. to feature extraction).
    '''
    _input = Input(shape=input_shape)
    x = Flatten()(_input)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(128, activation='relu')(x)
    return Model(_input, x)

In [23]:
def euclidean_distance(vects):
    x, y = vects
    sum_square = K.sum(K.square(x - y), axis=1, keepdims=True)
    return K.sqrt(K.maximum(sum_square, K.epsilon()))

def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)

def accuracy(y_true, y_pred):
    '''Compute classification accuracy with a fixed threshold on distances.
    '''
    return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, 'float64')))

def absolute_loss(y_true, y_pred):
    return K.mean(K.abs(y_pred - y_true))

In [24]:
# network definition
base_network = create_base_network((768, 1))

input_a = Input(shape=(768, 1))
input_b = Input(shape=(768, 1))

# because we re-use the same instance `base_network`,
# the weights of the network
# will be shared across the two branches
processed_a = base_network(input_a)
processed_b = base_network(input_b)

distance = Lambda(euclidean_distance,
                  output_shape=eucl_dist_output_shape)([processed_a, processed_b])

model = Model([input_a, input_b], distance)
rms = RMSprop()
model.compile(loss='binary_crossentropy', optimizer=rms, metrics=['accuracy'])

INFO:plaidml:Opening device "opencl_amd_radeon_pro_5500m_compute_engine.0"


In [236]:
model.fit([x_train_1, x_train_2], y_train,
          batch_size=128,
          epochs=epochs,
          validation_data=([x_test_1, x_test_2], y_test))

Train on 323344 samples, validate on 80943 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x13f4f6510>

In [47]:
from sklearn.metrics import classification_report, accuracy_score

In [238]:
y_pred = model.predict([x_test_1, x_test_2])

In [239]:
y_p = np.array([0.0 if x<0.5 else 1.0 for x in y_pred])

In [240]:
y_p

array([0., 0., 0., ..., 0., 1., 1.])

In [241]:
print(classification_report(y_test, y_p))

              precision    recall  f1-score   support

         0.0       0.67      0.97      0.79     50867
         1.0       0.80      0.20      0.32     30076

    accuracy                           0.68     80943
   macro avg       0.73      0.59      0.56     80943
weighted avg       0.72      0.68      0.62     80943



### Cosine Similarity

In [60]:
# Keras dot layer provides l2 normalized dot production, which gives cosine proximity
out = Dot(axes=1, normalize=True)([processed_a, processed_b])

model = Model([input_a, input_b], out)
opt = keras.optimizers.Adam(lr=0.001)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[accuracy])

In [61]:
model.fit([x_train_1, x_train_2], y_train,
          batch_size=128,
          epochs=5,
          validation_data=([x_test_1, x_test_2], y_test))

Train on 323344 samples, validate on 80943 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x14a4a68d0>

In [62]:
y_pred = model.predict([x_test_1, x_test_2])
y_p = np.array([0.0 if x<0.5 else 1.0 for x in y_pred])
print(classification_report(y_test, y_p))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00     50867
         1.0       0.37      1.00      0.54     30076

    accuracy                           0.37     80943
   macro avg       0.19      0.50      0.27     80943
weighted avg       0.14      0.37      0.20     80943



  _warn_prf(average, modifier, msg_start, len(result))


### RNN

In [27]:
from keras.layers import recurrent, concatenate, Embedding

RNN = recurrent.LSTM
EMBED_HIDDEN_SIZE = 50
HIDDEN_SIZE = 50
BATCH_SIZE = 64
EPOCHS = 1

In [28]:
question1 = Input(shape=(768,1))
encoded_question1 = RNN(HIDDEN_SIZE)(question1)

question2 = Input(shape=(768,1))
encoded_question2 = RNN(HIDDEN_SIZE)(question2)

merged = concatenate([encoded_question1, encoded_question2])
preds = Dense(1, activation='softmax')(merged)

model = Model([question1, question2], preds)
rms = RMSprop()
model.compile(optimizer=rms,
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
model.fit([x_train_1, x_train_2], y_train,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_data=([x_test_1, x_test_2], y_test))

In [52]:
from keras import layers, models
from keras.optimizers import Nadam


In [67]:
def build_modelBase(num_units, dropout_rate=0.2, activation='relu'):
    return models.Sequential([
        layers.Dense(num_units, activation=activation, use_bias=True),
        layers.Dropout(dropout_rate),
        layers.Dense(num_units, activation=activation, use_bias=True),
        layers.Dropout(dropout_rate)
    ])

def normalizer(axis):
    def _normalize(att_weights):
        exp_weights = K.exp(att_weights)
        sum_weights = K.sum(exp_weights, axis=axis, keepdims=True)
        return exp_weights/sum_weights
    return _normalize

# function to sum a vector
def sum_word(x):
    return K.sum(x, axis=1) 

def build_model(max_length=250, num_hidden=200, dropout_rate=0.2, learn_rate=0.0001, optimizer='rmsprop'):
  
    # clear Keras session to free up GPU memory
    K.clear_session()
    
    # input_a -> question1
    # input_b -> question2
    input_a = layers.Input(shape=(768,1))
    input_b = layers.Input(shape=(768,1))
    
    ## step 2: encode
    # compute attention weights
    Q1 = build_modelBase(num_hidden, dropout_rate=dropout_rate)
    
    ## step 3: attend 
    # combine the soft-aligned vectors with the corresponding word vectors 
    Q2 = build_modelBase(num_hidden, dropout_rate=dropout_rate)   
    
    a = Q1(input_a)
    b = Q2(input_b)
    
    #att = tf.keras.layers.Attention()([a, b])
    att_weights = layers.dot([a, b], axes=-1, normalize=True) 
    
    # normalize the attention weights
    norm_weights_a = layers.Lambda(normalizer(1))(att_weights)
    norm_weights_b = layers.Lambda(normalizer(2))(att_weights)
    
    # compute version of question a that is soft-aligned with every word of b
    alpha = layers.dot([norm_weights_a, a], axes=1)
    # compute version of question b that is soft-aligned with every word of a
    beta  = layers.dot([norm_weights_b, b], axes=1)
    
    comp1 = layers.concatenate([a, beta])
    comp2 = layers.concatenate([b, alpha])
    
    G = build_modelBase(num_hidden, dropout_rate=dropout_rate)  
    v1 = layers.TimeDistributed(G)(comp1)
    v2 = layers.TimeDistributed(G)(comp2)
    
    # and reduce the vectors computed above to a single vector per question
    v1_sum = layers.Lambda(sum_word)(v1)
    v2_sum = layers.Lambda(sum_word)(v2)
    
    concat = layers.concatenate([v1_sum, v2_sum])
        
    ## step 4: predict 
    H = build_modelBase(num_hidden, dropout_rate=dropout_rate)
    out = H(concat)
    out = layers.Dense(1, activation='sigmoid', use_bias=True)(out)
    
    # optimizer for gradient descent
    model = Model([input_a, input_b], out)
    if optimizer == 'sgd':
        opt = SGD(lr=learn_rate)
    elif optimizer == 'adam':
        opt = Adam(lr=learn_rate)
    elif optimizer == 'rmsprop':
        opt = RMSprop(lr=learn_rate)
    else:
        opt = Nadam(lr=learn_rate)
    
    model.compile(optimizer=opt,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [68]:
model = build_model()

INFO:plaidml:Opening device "opencl_amd_radeon_pro_5500m_compute_engine.0"
INFO:plaidml:Opening device "opencl_amd_radeon_pro_5500m_compute_engine.0"
  .format(self.name, input_shape))
  .format(self.name, input_shape))
  .format(self.name, input_shape))
  .format(self.name, input_shape))


In [None]:
model.fit([x_train_1, x_train_2], y_train,
          batch_size=32,
          epochs=10)