In [1]:
import pandas as pd
import numpy as np
import re
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense , Input , LSTM , Embedding, Dropout , Bidirectional, GlobalMaxPool1D, dot
from keras.models import Model

## Loading and Preprocessing data

In [2]:
# Read the question-pair CSV and discard the id, qid1 and qid2 columns
# in a single chained expression.
df = (
    pd.read_csv('../input/question-pairs-dataset/questions.csv')
    .drop(columns=['id', 'qid1', 'qid2'])
)
df.head()

Unnamed: 0,question1,question2,is_duplicate
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [3]:
# English stop-word set from NLTK.
# NOTE(review): stop_words is not referenced anywhere else in the visible
# notebook -- confirm whether stop-word removal was intended.
stop_words = set(stopwords.words("english")) 
# Shared lemmatizer instance used by clean_text.
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize a question string before tokenization.

    Removes every character that is neither a word character nor
    whitespace, lowercases the result, and lemmatizes each
    whitespace-separated token twice with the module-level ``lemmatizer``:
    once with its default settings and once as a verb (``"v"``).

    Args:
        text: The raw question as a ``str``.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # The flag must be passed by keyword: the fourth positional argument of
    # re.sub is `count`, so a positional re.UNICODE would cap the number of
    # substitutions at the flag's integer value instead of setting a flag.
    text = re.sub(r'[^\w\s]', '', text, flags=re.UNICODE)
    text = text.lower()
    # split() with no argument splits on any run of whitespace, so tabs and
    # newlines also separate tokens and no empty tokens are produced.
    tokens = [lemmatizer.lemmatize(token) for token in text.split()]
    tokens = [lemmatizer.lemmatize(token, "v") for token in tokens]
    return " ".join(tokens)

In [4]:
# Clean both question columns. Missing values are replaced with an empty
# string first; converting them with str() would otherwise turn NaN into the
# literal word "nan", which would then be tokenized like a real word.
df['question1'] = df['question1'].fillna('').astype(str).apply(clean_text)
df['question2'] = df['question2'].fillna('').astype(str).apply(clean_text)
df.head()

Unnamed: 0,question1,question2,is_duplicate
0,what be the step by step guide to invest in sh...,what be the step by step guide to invest in sh...,0
1,what be the story of kohinoor kohinoor diamond,what would happen if the indian government ste...,0
2,how can i increase the speed of my internet co...,how can internet speed be increase by hack thr...,0
3,why be i mentally very lonely how can i solve it,find the remainder when math2324math be divide...,0
4,which one dissolve in water quikly sugar salt ...,which fish would survive in salt water,0


In [5]:
# Target labels for training: the is_duplicate column of df.
y = df['is_duplicate']

In [6]:
# Stack both question columns into one Series (fresh 0..n-1 index) so the
# tokenizer is fitted on every question in the dataset.
total_text = pd.concat([df['question1'], df['question2']], ignore_index=True)

# Value handed to the Tokenizer as num_words and later to the Embedding layers.
max_features = 6000

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(total_text)

# Run each question column through the fitted tokenizer.
question1_sequenced, question2_sequenced = (
    tokenizer.texts_to_sequences(df[column])
    for column in ('question1', 'question2')
)

In [7]:
# Sequence length passed to pad_sequences for both questions.
maxlen = 100
question1_padded, question2_padded = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (question1_sequenced, question2_sequenced)
)

## Training the Model

In [8]:
# Training hyperparameters used by the model-definition and fit cells.
embedding_size = 128  # output dimension passed to both Embedding layers
batch_size = 100  # batch_size argument of model.fit
epochs = 3  # epochs argument of model.fit

In [9]:
# One input per padded question. The length comes from `maxlen` so the model
# stays in sync with pad_sequences instead of repeating the literal 100.
inp1 = Input(shape=(maxlen,))
inp2 = Input(shape=(maxlen,))

# NOTE(review): each branch constructs its own Embedding and LSTM layers, so
# the two questions are encoded with separate weights -- confirm that this
# (rather than a shared encoder) is intended.
x1 = Embedding(max_features, embedding_size)(inp1)
x2 = Embedding(max_features, embedding_size)(inp2)

x1 = Bidirectional(LSTM(32, return_sequences=True))(x1)
x2 = Bidirectional(LSTM(32, return_sequences=True))(x2)

# Collapse the sequence outputs of each branch with a global max pool.
x1 = GlobalMaxPool1D()(x1)
x2 = GlobalMaxPool1D()(x2)

# Combine the two branch encodings with a dot product along axis 1.
x = dot([x1, x2], axes=1)

x = Dense(40, activation='relu')(x)
x = Dropout(0.05)(x)
x = Dense(10, activation='relu')(x)

# Single sigmoid unit, trained with binary cross-entropy below.
output = Dense(1, activation='sigmoid')(x)

model = Model(inputs=[inp1, inp2], outputs=output)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [10]:
# Fit the two-input model on the padded questions; validation_split=0.2 is
# passed so Keras reports validation metrics alongside training metrics.
history = model.fit(
    x=[question1_padded, question2_padded],
    y=y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


# Find the most similar question

For each sentence in `q1`, find the most similar sentence in `q2` using cosine similarity.

In [11]:
# Query sentences: each one is compared against every sentence in q2.
q1 = ['Donald trump is the president of USA', 
      'I love samsung phones but they are lagging', 
      'I know fast food is not healthy but tasty']

# Candidate sentences: these are vectorized once and searched for the best match.
q2 = ['Tell me more about apple mobile accessories', 
      'Barack obama is a black by birth', 
      ' Pizza is great when served in hot']

In [12]:
def cosine_similarity(a, b):
    """Return the cosine similarity between two 1-D vectors.

    Args:
        a: First vector (NumPy array or any sequence ``np.asarray`` accepts).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / sqrt(dot(a, a) * dot(b, b))`` as a NumPy scalar, or
        zero when either vector has zero norm (the similarity is undefined
        there and the plain formula would yield NaN).
    """
    # Accept plain lists/tuples as well as arrays.
    a = np.asarray(a)
    b = np.asarray(b)
    denominator = np.sqrt(np.dot(a, a) * np.dot(b, b))
    if denominator == 0:
        # Avoid 0/0 -> NaN; keep the same scalar type as the normal path.
        return type(denominator)(0.0)
    return np.dot(a, b) / denominator

In [13]:
# Loading the large model to get the vectors
nlp = spacy.load('en_core_web_lg')

questions = q2
# Only the document vectors are needed here, so every pipeline component is
# disabled while vectorizing. disable_pipes() has to be given the component
# names; called with no arguments it disables nothing.
with nlp.disable_pipes(*nlp.pipe_names):
    vectors = np.array([nlp(question).vector for question in questions])

vectors.shape

(3, 300)

In [14]:
similarity = []
similar = []

## Center the document vectors
# The mean and the centered candidate vectors depend only on `vectors`, so
# they are computed once instead of being rebuilt for every query.
vec_mean = vectors.mean(axis=0)
centered = vectors - vec_mean

for question in q1:
    # Vectorize the query and center it with the same mean as the candidates
    question_vec = nlp(question).vector
    centered_question = question_vec - vec_mean

    # Calculating similarities for each document in the dataset
    sims = np.array([cosine_similarity(centered_question, vec) for vec in centered])

    # Get the index for the most similar document
    most_similar = sims.argmax()

    # List the results
    similarity.append(sims)
    similar.append(most_similar)

In [15]:
# Rich display of the per-query similarity arrays, then the best-match indices.
display(similarity, similar)

[array([-0.30900288,  0.32573262, -0.04206669], dtype=float32),
 array([ 0.43283004, -0.32563302, -0.09692521], dtype=float32),
 array([ 0.05160734, -0.36558956,  0.37828922], dtype=float32)]

[1, 0, 2]

In [16]:
# Plain-text version of the same two results, one per line.
print(similarity, similar, sep="\n")

[array([-0.30900288,  0.32573262, -0.04206669], dtype=float32), array([ 0.43283004, -0.32563302, -0.09692521], dtype=float32), array([ 0.05160734, -0.36558956,  0.37828922], dtype=float32)]
[1, 0, 2]
