In [0]:
## Code adapted from https://github.com/likejazz/Siamese-LSTM

In [1]:
import nltk
import spacy
import numpy as np
import pandas as pd
import os
from time import time
import matplotlib
import re

from nltk.corpus import stopwords
from gensim.models import KeyedVectors

import gensim

import itertools

matplotlib.use('Agg')
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import tensorflow as tf

from tensorflow.python.keras.models import Model, Sequential
from tensorflow.python.keras.layers import Input, Embedding, LSTM, GRU, Conv1D, Conv2D, GlobalMaxPool1D, Dense, Dropout
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.layers import Layer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
#!wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz

In [3]:
!wget https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz

--2019-12-15 06:38:31--  https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz
Resolving conceptnet.s3.amazonaws.com (conceptnet.s3.amazonaws.com)... 52.216.84.8
Connecting to conceptnet.s3.amazonaws.com (conceptnet.s3.amazonaws.com)|52.216.84.8|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 325403502 (310M) [application/x-gzip]
Saving to: ‘numberbatch-en-19.08.txt.gz’


2019-12-15 06:38:52 (15.1 MB/s) - ‘numberbatch-en-19.08.txt.gz’ saved [325403502/325403502]



In [0]:
!gunzip numberbatch-en-19.08.txt.gz

In [0]:
!ls

numberbatch-en-19.08.txt  sample_data  train_masked_new.csv


In [13]:
!pwd

/content


In [0]:
train_df = pd.read_csv("train_masked_new.csv")

In [6]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,id,special,whole,label,masked
0,0,0,orange juice,he pour orange juice on his cereal,0,he pour [MASK] on his cereal
1,1,0,milk,he pour milk on his cereal,1,he pour [MASK] on his cereal
2,2,1,apple,he drink apple,0,he drink [MASK]
3,3,1,milk,he drink milk,1,he drink [MASK]
4,4,2,a,jeff run a mile today,1,jeff run [MASK] mile tod[MASK]y


In [7]:
# Stopwords
stop_list = set(stopwords.words('english'))
numberbatch = KeyedVectors.load_word2vec_format("/content/numberbatch-en-19.08.txt", binary=False, unicode_errors='ignore')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
from gensim.utils import simple_preprocess
def preprocess(text):
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    return simple_preprocess(text)

In [9]:
train_df = pd.read_csv('/content/train_masked_new.csv')
train_df['whole_bow'] = train_df['whole']
train_df['masked_bow'] = train_df['masked']

word2id = {}
id2word = {}
word_idx = 0
oov = {}

# Make word2vec embeddings
embedding_dim = 300
max_seq_length = 20

for index, row in train_df.iterrows():
    if index != 0 and index % 1000 == 0:
        print("{:,} sentences embedded.".format(index), flush=True)

    for col in ['whole_bow', 'masked_bow']:
        bow = []  # bag of words of this sentence
        for word in preprocess(row[col]):
            # Check for unwanted words
            if word in stop_list:
                continue

            # If a word is missing from word2vec model.
            if word not in numberbatch.vocab:
                if word not in oov:
                    oov[word] = 1
                else:
                    oov[word] += 1
                continue # simply ignore this word as it is OOV in our model

            # put the word into vocab dictionary
            if word not in word2id:
                word_idx += 1
                word2id[word] = word_idx
                id2word[word_idx] = word
                bow.append(word_idx)
            else:
                bow.append(word2id[word])

        # Generate bag of words representation
        train_df.at[index, col] = bow

1,000 sentences embedded.
2,000 sentences embedded.
3,000 sentences embedded.
4,000 sentences embedded.
5,000 sentences embedded.
6,000 sentences embedded.
7,000 sentences embedded.
8,000 sentences embedded.
9,000 sentences embedded.
10,000 sentences embedded.
11,000 sentences embedded.
12,000 sentences embedded.
13,000 sentences embedded.
14,000 sentences embedded.
15,000 sentences embedded.
16,000 sentences embedded.
17,000 sentences embedded.
18,000 sentences embedded.
19,000 sentences embedded.
20,000 sentences embedded.
21,000 sentences embedded.


In [0]:
embeddings = 1 * np.random.randn(len(word2id) + 1, embedding_dim)  # This will be the embedding matrix
embeddings[0] = 0  # So that the padding will be ignored

# Build the embedding matrix
for word, idx in word2id.items():
    if word in numberbatch.vocab:
        embeddings[idx] = numberbatch.word_vec(word) # retrieve numberbatch embeddings for each word

In [20]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,id,special,whole,label,masked,whole_bow,masked_bow
0,0,0,orange juice,he pour orange juice on his cereal,0,he pour [MASK] on his cereal,"[1, 2, 3, 4]","[1, 5, 4]"
1,1,0,milk,he pour milk on his cereal,1,he pour [MASK] on his cereal,"[1, 6, 4]","[1, 5, 4]"
2,2,1,apple,he drink apple,0,he drink [MASK],"[7, 8]","[7, 5]"
3,3,1,milk,he drink milk,1,he drink [MASK],"[7, 6]","[7, 5]"
4,4,2,a,jeff run a mile today,1,jeff run [MASK] mile tod[MASK]y,"[9, 10, 11, 12]","[9, 10, 5, 11, 13, 5]"


In [12]:
embeddings.shape

(5893, 300)

In [30]:
len(train_df['id'].unique())

9992

In [0]:
hold_out_test = train_df[train_df.id < 2000]

In [36]:
hold_out_test.head()

Unnamed: 0.1,Unnamed: 0,id,special,whole,label,masked,whole_bow,masked_bow
0,0,0,orange juice,he pour orange juice on his cereal,0,he pour [MASK] on his cereal,"[1, 2, 3, 4]","[1, 5, 4]"
1,1,0,milk,he pour milk on his cereal,1,he pour [MASK] on his cereal,"[1, 6, 4]","[1, 5, 4]"
2,2,1,apple,he drink apple,0,he drink [MASK],"[7, 8]","[7, 5]"
3,3,1,milk,he drink milk,1,he drink [MASK],"[7, 6]","[7, 5]"
4,4,2,a,jeff run a mile today,1,jeff run [MASK] mile tod[MASK]y,"[9, 10, 11, 12]","[9, 10, 5, 11, 13, 5]"


In [38]:
for i in range(2000):
    if i not in hold_out_test['id'].unique():
        print(i)

248
1539


In [0]:
train_data = train_df[train_df.id >= 2000]

In [0]:
validation_size = int(len(train_data) * 0.2)
training_size = len(train_data) - validation_size

X = train_data[['whole_bow', 'masked_bow']]
Y = train_data['label']

X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=validation_size)

In [45]:
X_train

Unnamed: 0,whole_bow,masked_bow
8135,"[185, 459, 2599, 166, 300, 3313]","[185, 459, 2599, 5, 166, 300, 3313]"
20827,"[3326, 2820, 136, 46, 529, 491]","[5, 2820, 136, 46, 529, 491]"
13925,"[1871, 404, 590, 363]","[1871, 5, 590, 363]"
15980,"[1342, 4767, 382]","[1342, 5, 382]"
15908,"[722, 299, 12]","[5, 299, 12]"
...,...,...
19722,"[2978, 877, 67, 3907, 363]","[2978, 5, 67, 3907, 363]"
9504,"[928, 905]","[928, 5, 905]"
6109,"[166, 612, 64]","[166, 612, 5]"
11731,"[1126, 3280, 1775, 988, 174]","[1126, 3280, 5, 988, 174]"


In [0]:
def split_and_zero_padding(df, max_seq_length):
    # Split to dicts
    X = {'left': df['whole_bow'], 'right': df['masked_bow']}

    # Zero padding
    for dataset, side in itertools.product([X], ['left', 'right']):
        dataset[side] = pad_sequences(dataset[side], padding='pre', truncating='post', maxlen=max_seq_length)

    return dataset

#X_temp = {'left': X_train['whole_bow'], 'right': X_train['masked_bow']}
#X_train['left'] = pad_sequences(X_temp['left'], padding='pre', truncating='post', maxlen=max_seq_length)
#X_train['right'] = pad_sequences(X_temp['right'], padding='pre', truncating='post', maxlen=max_seq_length)

class ManDist(Layer):
    """
    Keras Custom Layer that calculates Manhattan Distance.
    """

    # initialize the layer, No need to include inputs parameter!
    def __init__(self, **kwargs):
        self.result = None
        super(ManDist, self).__init__(**kwargs)

    # input_shape will automatic collect input shapes to build layer
    def build(self, input_shape):
        super(ManDist, self).build(input_shape)

    # This is where the layer's logic lives.
    def call(self, x, **kwargs):
        self.result = K.exp(-K.sum(K.abs(x[0] - x[1]), axis=1, keepdims=True))
        #self.result = K.pow(1+K.exp(-K.sum(K.abs(x[0] - x[1]), axis=1, keepdims=True)),-1)
        return self.result

    # return output shape
    def compute_output_shape(self, input_shape):
        return K.int_shape(self.result)

In [0]:

X_train = split_and_zero_padding(X_train, max_seq_length)
X_validation = split_and_zero_padding(X_validation, max_seq_length)

Y_train = Y_train.values
Y_validation = Y_validation.values

# Make sure everything is ok
assert X_train['left'].shape == X_train['right'].shape
assert len(X_train['left']) == len(Y_train)

In [47]:
# --

# Model variables
gpus = 1
batch_size = 1024 * gpus
n_epoch = 50
n_hidden = 30

# Define the shared model
x = Sequential()
x.add(Embedding(len(embeddings), embedding_dim,
                weights=[embeddings], input_shape=(max_seq_length,), trainable=False))
# CNN
# x.add(Conv1D(250, kernel_size=5, activation='relu'))
# x.add(GlobalMaxPool1D())
# x.add(Dense(250, activation='relu'))
# x.add(Dropout(0.3))
# x.add(Dense(1, activation='sigmoid'))
# LSTM
x.add(LSTM(n_hidden))

shared_model = x

# The visible layer
left_input = Input(shape=(max_seq_length,), dtype='int32')
right_input = Input(shape=(max_seq_length,), dtype='int32')

# Pack it all up into a Manhattan Distance model
malstm_distance = ManDist()([shared_model(left_input), shared_model(right_input)])
model = Model(inputs=[left_input, right_input], outputs=[malstm_distance])

if gpus >= 2:
    # `multi_gpu_model()` is a so quite buggy. it breaks the saved model.
    model = tf.keras.utils.multi_gpu_model(model, gpus=gpus)
model.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.Adam(), metrics=['accuracy'])
model.summary()
shared_model.summary()

# Start trainings
training_start_time = time()
malstm_trained = model.fit([X_train['left'], X_train['right']], Y_train,
                           batch_size=batch_size, epochs=n_epoch,
                           validation_data=([X_validation['left'], X_validation['right']], Y_validation))
training_end_time = time()
print("Training time finished.\n%d epochs in %12.2f" % (n_epoch,
                                                        training_end_time - training_start_time))

model.save('/content/SiameseLSTM.h5')

# Plot accuracy
plt.subplot(211)
plt.plot(malstm_trained.history['acc'])
plt.plot(malstm_trained.history['val_acc'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')

# Plot loss
plt.subplot(212)
plt.plot(malstm_trained.history['loss'])
plt.plot(malstm_trained.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')

plt.tight_layout(h_pad=1.0)
plt.savefig('/content/history-graph.png')

print(str(malstm_trained.history['val_acc'][-1])[:6] +
      "(max: " + str(max(malstm_trained.history['val_acc']))[:6] + ")")
print("Done.")

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 30)           1807620     input_3[0][0]                    
                                                                 input_4[0][0]                    
__________________________________________________________________________________________________
man_dist_1 (ManDist)            (None, 1)            0           sequential_1[1][0]         

In [0]:
X_test = hold_out_test[['whole_bow', 'masked_bow']]
Y_test = hold_out_test['label']

X_test = split_and_zero_padding(X_test, max_seq_length)
#X_validation = split_and_zero_padding(X_validation, max_seq_length)

Y_test = Y_test.values
#Y_validation = Y_validation.values

# Make sure everything is ok
assert X_test['left'].shape == X_test['right'].shape
assert len(X_test['left']) == len(Y_test)

In [50]:
model = tf.keras.models.load_model('/content/SiameseLSTM.h5', custom_objects={'ManDist': ManDist})
model.summary()

prediction = model.predict([X_test['left'], X_test['right']])
print(prediction)

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 30)           1807620     input_3[0][0]                    
                                                                 input_4[0][0]                    
__________________________________________________________________________________________________
man_dist_1 (ManDist)            (None, 1)            0           sequential_1[1][0]         

In [58]:
X_test['left'].shape

(4424, 20)

In [59]:
len(X_test['left'])

4424

In [60]:
len(Y_test)

4424

In [61]:
len(hold_out_test)

4424

In [62]:
hold_out_test['predicted_label'] = prediction

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [64]:
hold_out_test[hold_out_test['id']==5]

Unnamed: 0.1,Unnamed: 0,id,special,whole,label,masked,whole_bow,masked_bow,predicted_label
10,10,5,walk in,a walk in closet be large than a normal closet,1,a [MASK] closet be large than a normal closet,"[19, 20, 21, 22, 20]","[5, 20, 21, 22, 20]",0.594448
11,11,5,normal,a walk in closet be large than a normal closet,1,a walk in closet be large than a [MASK] closet,"[19, 20, 21, 22, 20]","[19, 20, 21, 5, 20]",0.331226
12,12,5,normal,a normal closet be large than a walk in closet,0,a [MASK] closet be large than a walk in closet,"[22, 20, 21, 19, 20]","[5, 20, 21, 19, 20]",0.318269
13,13,5,walk in,a normal closet be large than a walk in closet,0,a normal closet be large than a [MASK] closet,"[22, 20, 21, 19, 20]","[22, 20, 21, 5, 20]",0.51408


In [65]:
(0.594448+0.331226)/2

0.462837

In [66]:
(0.514080+0.318269)/2

0.4161745

In [0]:
try_data = hold_out_test[hold_out_test.id.values == 5]
score_1 = try_data.predicted_label[try_data["label"]==1].values


In [80]:
np.mean(score_1)

0.46283728

In [81]:
predict_result = {}
for i in range(0,2000): 
    #if i in hold_out_test['id'].unique():
        try_data = hold_out_test[hold_out_test.id.values == i]
        score_1 = np.mean(try_data.predicted_label[try_data["label"]==1].values)
        score_0 = np.mean(try_data.predicted_label[try_data["label"]==0].values)
        score = [score_0,score_1]
        #predicted_result_label = score.index(max(score))
        predict_result[i] = score.index(max(score))
    #else:
       # predict_result[i] = "NA"




  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [86]:
sum(predict_result.values())/1998

0.6656656656656657