In [25]:
import numpy as np 
import pandas as pd 
import re
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


In [29]:
df = pd.read_csv('smai_project/train.csv')
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [30]:
MAX_NB_WORDS = 200000
tokenizer = Tokenizer(num_words = MAX_NB_WORDS)
tokenizer.fit_on_texts(list(df['question1'].values.astype(str))+list(df['question2'].values.astype(str)))

In [31]:
word_index = tokenizer.word_index

Embedding Matrix

In [32]:
embedding_index = {}
with open('smai_project/glove.840B.300d.txt','r') as f:
    for line in f:
        values = line.split()
        word = ''.join(values[:-300])   
        vectors = np.asarray(values[-300:], dtype='float32')
        embedding_index[word] = vectors
    f.close()
embedding_matrix = np.random.random((len(word_index)+1, 300))
for word, i in word_index.items():
    if embedding_index.get(word) is not None:
        embedding_matrix[i] = embedding_index.get(word)

Processing the data

In [33]:
from sklearn.model_selection import train_test_split
y = df['is_duplicate']
X_train,X_test,y_train,y_test = train_test_split(df,y, test_size = float(3/10), random_state =0)
X_valid,X_test,y_valid,y_test = train_test_split(X_test,y_test, test_size = float(1/3), random_state = 0)

In [35]:
X_train_q1 = tokenizer.texts_to_sequences(X_train['question1'].values.astype(str))
X_train_q2 = tokenizer.texts_to_sequences(X_train['question2'].values.astype(str))

X_train_q1 = pad_sequences(X_train_q1, maxlen = 30, padding='post')
X_train_q2 = pad_sequences(X_train_q2, maxlen = 30, padding='post')

In [36]:
X_valid_q1 = tokenizer.texts_to_sequences(X_valid['question1'].values.astype(str))
X_valid_q2 = tokenizer.texts_to_sequences(X_valid['question2'].values.astype(str))

X_valid_q1 = pad_sequences(X_valid_q1, maxlen = 30, padding='post')
X_valid_q2 = pad_sequences(X_valid_q2, maxlen = 30, padding='post')

In [37]:
X_test_q1 = tokenizer.texts_to_sequences(X_test['question1'].values.astype(str))
X_test_q2 = tokenizer.texts_to_sequences(X_test['question2'].values.astype(str))

X_test_q1 = pad_sequences(X_test_q1, maxlen = 30, padding='post')
X_test_q2 = pad_sequences(X_test_q2, maxlen = 30, padding='post')

LSTM

In [38]:
# Model for Q1
q1_model = tf.keras.Sequential()
q1_model.add(Embedding(input_dim = len(word_index)+1,output_dim = 300,input_length = 30,weights = [embedding_matrix]))
q1_model.add(LSTM(128, activation = 'tanh', return_sequences = True))
q1_model.add(Dropout(0.1))
q1_model.add(LSTM(128, return_sequences = True))
q1_model.add(LSTM(128))
q1_model.add(Dense(60, activation = 'tanh'))
q1_model.add(Dense(2, activation = 'sigmoid'))


In [39]:
# Model for Q2
q2_model = tf.keras.Sequential()
q2_model.add(Embedding(input_dim = len(word_index)+1,output_dim = 300,input_length = 30,weights = [embedding_matrix]))
q2_model.add(LSTM(128, activation = 'tanh', return_sequences = True))
q2_model.add(Dropout(0.1))
q2_model.add(LSTM(128, return_sequences = True))
q2_model.add(LSTM(128))
q2_model.add(Dense(60, activation = 'tanh'))
q2_model.add(Dense(2, activation = 'sigmoid'))

In [40]:
# Merging the output of the two models
merged = concatenate([q1_model.output, q2_model.output])
merged = Flatten()(merged)
merged = Dense(60, activation = 'tanh')(merged)
merged = Dropout(0.1)(merged)
merged = Dense(50, activation = 'relu')(merged)
merged = Dropout(0.1)(merged)
merged = Dense(2, activation = 'sigmoid')(merged)

In [41]:
lstm = Model([q1_model.input, q2_model.input], merged)
lstm.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy',metrics = ['accuracy'])
history = lstm.fit([X_train_q1,X_train_q2],y_train, batch_size = 2000, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Accuracy

In [42]:
score = lstm.evaluate([X_valid_q1, X_valid_q2],y_valid,batch_size=2000,verbose=1)
score[1]



0.7732691764831543

In [43]:
score = lstm.evaluate([X_test_q1, X_test_q2],y_test,batch_size=2000,verbose=1)
score[1]



0.7701154947280884

F-score

In [46]:
from sklearn.metrics import f1_score
y_pred = lstm.predict([X_valid_q1, X_valid_q2], batch_size=2000, verbose=1)
y_pred_class=np.argmax(y_pred,axis=1)
f1 = f1_score(y_valid, y_pred_class)
print(f1)

0.6861271379410707


In [47]:
y_pred = lstm.predict([X_test_q1, X_test_q2], batch_size=2000, verbose=1)
y_pred_class=np.argmax(y_pred,axis=1)
f1 = f1_score(y_test, y_pred_class)
print(f1)

0.6809474768280123
