In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
import numpy as np 
import pandas as pd 
import os

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.callbacks import EarlyStopping,ModelCheckpoint,ReduceLROnPlateau


from keras.layers import Dense, Embedding, LSTM, Input, Lambda
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import keras.backend as K
from keras.optimizers import Adadelta
import re

In [None]:
data = pd.read_csv('/kaggle/input/nnfl-lab-4/train.csv')
pd.set_option('display.max_colwidth',-1)
data.head()

In [None]:
df = pd.read_csv('/kaggle/input/nnfl-lab-4/test.csv')
pd.set_option('display.max_colwidth',-1)
df.head()

In [None]:
data.shape

In [None]:
data

In [None]:
train_1 = data.iloc[:,1]
train_1 = list(train_1)
print(train_1)

In [None]:
train_2 = data.iloc[:,2]
train_2 = list(train_2)
print(train_2)

In [None]:
full_train = train_1 + train_2
print(full_train)

In [None]:
num_words = 5000
tokenizer = Tokenizer(num_words=num_words, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                                   lower=True,split=' ')


In [None]:
tokenizer.fit_on_texts(full_train)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
print(word_index)

In [None]:
X_1 = tokenizer.texts_to_sequences(data['Sentence1'].values)
print(X_1[0])
maxlen = 60
X_1 = pad_sequences(X_1, maxlen=maxlen)
print("Padded Sequences: ")
print(X_1)
print(X_1[0])

In [None]:
X_1.shape

In [None]:
X_2 = tokenizer.texts_to_sequences(data['Sentence2'].values)
print(X_2[0])
maxlen = 60
X_2 = pad_sequences(X_2, maxlen=maxlen)
print("Padded Sequences: ")
print(X_2)
print(X_2[0])

In [None]:
X_2.shape

In [None]:
training_portion = 0.8
y = list(data.iloc[:,3])

In [None]:
X_t1 = tokenizer.texts_to_sequences(df['Sentence1'].values)
print(X_t1[0])
maxlen = 60
X_t1 = pad_sequences(X_t1, maxlen=maxlen)
print("Padded Sequences: ")
print(X_t1)
print(X_t1[0])

In [None]:
X_t1.shape

In [None]:
X_t2 = tokenizer.texts_to_sequences(df['Sentence2'].values)
print(X_t2[0])
maxlen = 60
X_t2 = pad_sequences(X_t2, maxlen=maxlen)
print("Padded Sequences: ")
print(X_t2)
print(X_t2[0])

In [None]:
X_t2.shape

In [None]:
training_size = int(len(X_1)*training_portion)

X_train1 = X_1[:training_size,:]
X_train2 = X_2[:training_size,:]
y_train  = y[:training_size]
X_val1   = X_1[training_size:,:]
X_val2   = X_2[training_size:,:]
y_val    = y[training_size:]


In [None]:
X_train1.shape
X_train2.shape
len(y_train)

In [None]:
embedding_dim = 40 #Change to observe effects
lstm_out = 256
gradient_clipping_norm = 2.50
batch_size = 128
n_epoch = 40


In [None]:
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=5, min_lr=0.001)

earlystop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

modelcheckpoint=ModelCheckpoint("weights.{epoch:02d}-{val_loss:.2f}.hdf5", monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1)

callbacks = [earlystop,modelcheckpoint,reduce_lr]

In [None]:
def exponent_neg_manhattan_distance(left, right):
    ''' Helper function for the similarity estimate of the LSTMs outputs'''
    return K.exp(-K.sum(K.abs(left-right), axis=1, keepdims=True))



left_input = Input(shape=(maxlen,), dtype='int32')
right_input = Input(shape=(maxlen,), dtype='int32')

embedding_layer = Embedding(num_words, embedding_dim, input_length=maxlen, trainable=False)

# Embedded version of the inputs
encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)

# Since this is a siamese network, both sides share the same LSTM
shared_lstm = LSTM(lstm_out)

left_output = shared_lstm(encoded_left)
right_output = shared_lstm(encoded_right)

malstm_distance = Lambda(function=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),output_shape=lambda x: (x[0][0], 1))([left_output, right_output])


malstm = Model([left_input, right_input], [malstm_distance])

# Adadelta optimizer, with gradient clipping by norm
optimizer = Adadelta(clipnorm=gradient_clipping_norm,learning_rate=1.0,rho=0.95)

malstm.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])

print(malstm.summary())



In [None]:


malstm_trained = malstm.fit([X_train1,X_train2], y_train, batch_size=batch_size, nb_epoch=n_epoch,
                            validation_data=([X_val1,X_val2], y_val), callbacks = callbacks)



In [29]:
malstm.load_weights("weights.32-0.23.hdf5")
print("Loaded model from disk")

Loaded model from disk


In [30]:
loss = malstm.evaluate([X_val1,X_val2], y_val,batch_size = batch_size, callbacks=[earlystop])
print(loss)

[0.22710131168365477, 0.6571875214576721]


In [31]:
y_pred = malstm.predict([X_t1,X_t2])

In [32]:
y_pred

array([[0.23524886],
       [0.20745057],
       [0.21609601],
       ...,
       [0.24221262],
       [0.2869611 ],
       [0.41483414]], dtype=float32)

In [33]:
y_out = y_pred>0.5

In [34]:
y_out

array([[False],
       [False],
       [False],
       ...,
       [False],
       [False],
       [False]])

In [35]:
y_out = []
for i in range(len(y_pred)):
    val = y_pred[i]
    if val >= 0.5:
        y_out.append(1)
    else:
        y_out.append(0)

In [36]:
y_out

[0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,


In [37]:
np.max(y_pred)

1.0

In [38]:
np.min(y_pred)

0.049453802

In [None]:

len(y_pred)

In [None]:
df.shape

In [None]:



df['Class'] = y_out

In [None]:
df_2 = df[['ID', 'Class']]

In [None]:




df_2.index = df_2.index + 1

In [None]:
df_2.to_csv("submission_1.csv",index=False)

In [None]:
df_2.head()

In [None]:
from IPython.display import HTML
import pandas as pd
import numpy as np
import base64
def create_download_link(df, title = "Download CSV file", filename = "data.csv"):
  csv = df.to_csv(index=False)
  b64 = base64.b64encode(csv.encode())
  payload = b64.decode()
  html = '<a download="{filename}" href="data:text/csv;base64,{payload}"target="_blank">{title}</a>'
  html = html.format(payload=payload,title=title,filename=filename)
  return HTML(html)
create_download_link(df_2)