In [1]:
import pandas as pd
import numpy as np
import os

import scipy
import string
import csv

#import nltk
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

#immport tokenize, stopwords 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

#import warnings
import warnings

#import sklearn and matplotlib
from sklearn import preprocessing
import spacy
import matplotlib.pyplot as plt 
import plotly.graph_objects as go

#import warning
warnings.filterwarnings('ignore')
import re


#import the data
train=pd.read_csv('Quora.csv')
train1=train.copy()

train.head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [2]:
#append the both set of questions in dataset
Q1=train1.iloc[:,[2,4]]
Q2=train1.iloc[:,[1,3]]

df = pd.DataFrame( np.concatenate( (Q2.values, Q1.values), axis=0 ) )
df.columns = ['id','question' ]
df


Unnamed: 0,id,question
0,1,What is the step by step guide to invest in sh...
1,3,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,5,How can I increase the speed of my internet co...
3,7,Why am I mentally very lonely? How can I solve...
4,9,"Which one dissolve in water quikly sugar, salt..."
...,...,...
9995,9851,How does a pirate radio station work?
9996,9853,Which mobile is good within 20k?
9997,9855,What actually happened in predestination?
9998,9857,Are Near Death Experiences (NDEs) real?


# A.	 Building vectors using **Doc2Vec**

In [3]:
# importing doc2vec from gensim 
from gensim.models.doc2vec import Doc2Vec, TaggedDocument 

# tokenizing the sentences
tok_quora=[word_tokenize(wrd) for wrd in df.question]

#creating training data
Quora_training_data=[TaggedDocument(d, [i]) for i, d in enumerate(tok_quora)]   


In [4]:
# trainin doc2vec model
doc_model = Doc2Vec(Quora_training_data, vector_size = 100, window = 5, min_count = 3, epochs = 25)

In [5]:
#function to get vectors from model

def fetch_embeddings(model,tokens):
  tokens = [x for x in word_tokenize(tokens) if x in list(doc_model.wv.vocab)]
  #if words is not present then vector becomes zero
  if len(tokens)>=1:
    return doc_model.infer_vector(tokens)
  else:
    return np.array([0]*100)  


In [6]:
#Storing all embedded sentence vectors in a list 

#defining empty list and iterating through all the questions

doc_embeddings=[]                                     
for w in df.question:
    doc_embeddings.append(list(fetch_embeddings(doc_model, w)))  
#conveting it into array
doc_embeddings=np.asarray(doc_embeddings)


# B.	 Sentence Transformers using BERT model

In [20]:
#install SBERT
!pip install sentence-transformers

#import the SBERT
from sentence_transformers import SentenceTransformer

#let use paraphrase-MiniLM-L12-v2 pre trained model
sbert_model = SentenceTransformer('paraphrase-MiniLM-L12-v2')

x=[i for i in df.question]
#lets get embeddings for each question
sentence_embeddings_BERT= sbert_model.encode(x)

#lets see the shape
sentence_embeddings_BERT.shape

sentence_embeddings_BERT



array([[-0.15299696, -0.30485195,  0.00183832, ..., -0.27034327,
        -0.4260835 ,  0.31928647],
       [-0.16776392,  0.67119426, -0.51778895, ..., -0.08420195,
         0.00470432,  0.3200466 ],
       [ 0.09468909, -0.00629827,  0.06894321, ..., -0.18716985,
        -0.2556718 ,  0.04188535],
       ...,
       [-0.03576591,  0.24553823,  0.10434522, ...,  0.1781153 ,
        -0.00719658, -0.19804102],
       [ 0.15157805, -0.32189795,  0.01304498, ..., -0.05349696,
        -0.33300248,  0.18506213],
       [ 0.2177582 , -0.07103815,  0.11279581, ..., -0.5100467 ,
         0.05511575, -0.1291719 ]], dtype=float32)

# C.	GPT

In [8]:
#Installing the GPT
!pip install pytorch_pretrained_bert 

#Importing required tokenizer, OpenAiGPT model
import torch
from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTModel

#initializing the tokenizer
tok_gpt= OpenAIGPTTokenizer.from_pretrained('openai-gpt')  

#Initializing the gpt Model
model_gpt= OpenAIGPTModel.from_pretrained('openai-gpt')
model_gpt.eval()


Collecting pytorch_pretrained_bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[?25l[K     |██▋                             | 10 kB 21.3 MB/s eta 0:00:01[K     |█████▎                          | 20 kB 25.4 MB/s eta 0:00:01[K     |████████                        | 30 kB 26.3 MB/s eta 0:00:01[K     |██████████▋                     | 40 kB 19.6 MB/s eta 0:00:01[K     |█████████████▎                  | 51 kB 8.3 MB/s eta 0:00:01[K     |███████████████▉                | 61 kB 9.5 MB/s eta 0:00:01[K     |██████████████████▌             | 71 kB 9.1 MB/s eta 0:00:01[K     |█████████████████████▏          | 81 kB 10.1 MB/s eta 0:00:01[K     |███████████████████████▉        | 92 kB 10.4 MB/s eta 0:00:01[K     |██████████████████████████▌     | 102 kB 8.0 MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112 kB 8.0 MB/s eta 0:00:01[K     |███████████████████████████████▊| 122 kB 8.0 MB/s eta 0:00:01[K     |██████████████████████████████

100%|██████████| 815973/815973 [00:00<00:00, 2197331.83B/s]
100%|██████████| 458495/458495 [00:00<00:00, 1501209.52B/s]
ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.
100%|██████████| 478750579/478750579 [00:13<00:00, 35642100.42B/s]
100%|██████████| 656/656 [00:00<00:00, 281278.21B/s]


OpenAIGPTModel(
  (tokens_embed): Embedding(40478, 768)
  (positions_embed): Embedding(512, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0): Block(
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_1): BertLayerNorm()
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): BertLayerNorm()
    )
    (1): Block(
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_1): BertLayerNorm()
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): BertLayerNorm()
    )
    (2): Block(
      (attn): Atte

In [None]:
def Fetch_gpt_vectors(question):
  
  #tokenize words
  words = word_tokenize(question)
  emb = np.zeros((1,768))

  #get vectore for each word
  for word in words:
      w= tok_gpt.tokenize(word)
      indexed_words = tok_gpt.convert_tokens_to_ids(w)
      tns_word = torch.tensor([indexed_words])

      with torch.no_grad():
          try:
     #get mean vector
            emb += np.array(torch.mean(model_gpt(tns_word),1))
          except Exception as e:
            continue
  
  emb /= len(words)
  return emb

gpt_emb = np.zeros((1000, 768))

# get vectors

for v in range(1000):
    txt = df.loc[v,'question']
    
    gpt_emb[v] = Fetch_gpt_vectors(txt)

gpt_emb


# Finding Similar questions

In [10]:
#defining function to derive cosine similarity

#import 
from sklearn.metrics.pairwise import cosine_similarity
from numpy import dot
from numpy.linalg import norm

def cosine_similarity(vec1,vec2):
    #find the score
    return dot(vec1, vec2)/(norm(vec1)*norm(vec2)) 


In [13]:
#Function which gts Top N similar questions from data 

def top_n_questions(user,embeddings,df):    
    
    #getting cosine similarities of overall data set with input queries from user
    x=cosine_similarity(user,embeddings).tolist()[0]
    temp_list=list(x)

    #sorting
    sort_res = sorted(range(len(x)), key = lambda sub: x[sub])[:]
    sim_score=[temp_list[i] for i in reversed(sort_res)]

    #print
    print(sort_res[0:5])
 
    #index fetching
    L=[]
    for i in reversed(sort_res):
        L.append(i)

    #get the index from dataframe 
    return df.iloc[L[0:5], [0,1]]

#function to fetch the results based on the model selected

def get_input_vector(query,model):    
    
    print(query)

    #Doc2vec model
    if model=='Doc2Vec':
      k=fetch_embeddings(doc_model,query)
      k=k.reshape(1, -1)

    # sbert  model
    elif model=='BERT':
      k=sbert_model.encode(str(query))
      k=k.reshape(1, -1)

    # gpt model
    elif model=='GPT':
      k=Fetch_gpt_vectors(query)  

    return k


In [16]:
# Example 1 - Doc2vec model

top_n_questions(get_input_vector('How is Narendra Modi as a person?','Doc2Vec'),doc_embeddings,df) 


In [25]:
# Example 2 - GPT model


top_n_questions(get_input_vector('How is Narendra Modi as a person?','GPT'),gpt_emb,df) 


In [23]:
# Example 3 - BERT

top_n_questions(get_input_vector('How is Narendra Modi as a person?','BERT'),sentence_embeddings_BERT,df) 


# **Implementation: Supervised Learning**

In [26]:
# import packages required.
import pandas as pd
import numpy as np
import scipy
import os
import string
import csv

#import nltk
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

#import tokenizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 

#import warnings
import warnings

#import sklearn and matplotlib
from sklearn import preprocessing
import spacy
import matplotlib.pyplot as plt 
import plotly.graph_objects as go

#import warning
warnings.filterwarnings('ignore')
import re

from string import punctuation
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

#import Tokenizer from keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split

#importing Keras necessary libraries
from keras.models import Sequential, Model
from keras.layers import Input, Embedding, Dense, Dropout, LSTM


#importing train data - Import the full data
quora_questions=pd.read_csv('Quora.csv') 


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [27]:
#function for data cleaning
def txt_process(input_text):

    # Removing punctuation from input text
    input_text = ''.join([x for x in input_text if x not in punctuation])
    
    # Cleaning the text
    input_text = re.sub(r"[^A-Za-z0-9]", " ", input_text)
    input_text = re.sub(r"\'s", " ", input_text)
      
    # remove stop words
    input_text = input_text.split()
    input_text = [x for x in input_text if not x in stop_words]
    input_text = " ".join(input_text)
    
    # Return a list of words
    return(input_text)


#applying above function on both question ids
quora_questions['question1_cleaned'] = quora_questions.apply(lambda x: txt_process(x['question1']), axis = 1)  
quora_questions['question2_cleaned'] = quora_questions.apply(lambda x: txt_process(x['question2']), axis = 1)



In [28]:
#stacking
question_text = np.hstack([quora_questions.question1_cleaned, quora_questions.question2_cleaned])

#tokenizing
tokenizer = Tokenizer()
tokenizer.fit_on_texts(question_text)

#creating new columns for both ids where tokenized form of sentence is created 
quora_questions['tokenizer_1'] = tokenizer.texts_to_sequences(quora_questions.question1_cleaned)
quora_questions['tokenizer_2'] = tokenizer.texts_to_sequences(quora_questions.question2_cleaned)

quora_questions.head(5)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_cleaned,question2_cleaned,tokenizer_1,tokenizer_2
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,What step step guide invest share market india,What step step guide invest share market,"[1, 1054, 1054, 3819, 577, 431, 369, 9]","[1, 1054, 1054, 3819, 577, 431, 369]"
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,What story Kohinoor KohiNoor Diamond,What would happen Indian government stole Kohi...,"[1, 325, 2313, 2313, 3820]","[1, 14, 132, 42, 133, 4595, 2313, 2313, 3820, ..."
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,How I increase speed internet connection using...,How Internet speed increased hacking DNS,"[3, 2, 109, 432, 237, 1461, 84, 2960]","[3, 237, 432, 2037, 1319, 8527]"
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,Why I mentally lonely How I solve,Find remainder math2324math divided 2423,"[4, 2, 1462, 3821, 3, 2, 578]","[37, 8528, 8529, 8530, 8531]"
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,Which one dissolve water quikly sugar salt met...,Which fish would survive salt water,"[8, 15, 2961, 161, 5948, 1304, 1305, 5949, 130...","[8, 4258, 14, 1928, 1305, 161]"


In [29]:
#combining both tokens in one list question1 followed by question2
quora_questions['tokenizer'] = quora_questions['tokenizer_1'] + quora_questions['tokenizer_2']

#defining max length 
m_len = 500

#max tokens
max_token = np.max(quora_questions.tokenizer.max())


In [30]:
#defining X and target data 
y = quora_questions[['is_duplicate']]
X = quora_questions[['tokenizer']]

#padding X with a maximum length
X = sequence.pad_sequences(X.tokenizer, maxlen = m_len)

#splitting data into train and test
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.25, random_state=10)


#defining the LSTM model
quora_model = Sequential()

#adding embeedding layer
quora_model.add(Embedding(70000, 64))

#adding drop out layer
quora_model.add(Dropout(0.15))

#LSTM layer
quora_model.add(LSTM(16))

#adding sigmoid layer
quora_model.add(Dense(1, activation = 'sigmoid'))

#defining loss and optimizer
quora_model.compile(loss='binary_crossentropy', optimizer='SGD', metrics=['accuracy'])


quora_model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          4480000   
_________________________________________________________________
dropout (Dropout)            (None, None, 64)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 16)                5184      
_________________________________________________________________
dense (Dense)                (None, 1)                 17        
Total params: 4,485,201
Trainable params: 4,485,201
Non-trainable params: 0
_________________________________________________________________


In [31]:
#training the model and validating on test data
quora_model.fit(X_train, y_train, epochs = 2, batch_size=64,validation_data=(X_test,y_test))


Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fa2dd5a3490>

In [32]:
# evaluation of the model
import sklearn
from sklearn.metrics import classification_report

#prediction on train data
tr_prediction=quora_model.predict(X_train) 

#replacing probabilities >0.5 with 1 and other 0
tr_prediction[tr_prediction>0.5]=1 
tr_prediction[tr_prediction<0.5]=0
tr_prediction

#true values of train data
tr_true=y_train.values

#accuracy
Accuracy=sklearn.metrics.accuracy_score(np.array(tr_true), 
                                     np.array(tr_prediction))

print(Accuracy)
0.7811906400332337

#classification report with f1 score

print(classification_report(tr_true, tr_prediction, target_names=['Not similar','similar']))


0.6165333333333334
              precision    recall  f1-score   support

 Not similar       0.62      1.00      0.76      2312
     similar       0.00      0.00      0.00      1438

    accuracy                           0.62      3750
   macro avg       0.31      0.50      0.38      3750
weighted avg       0.38      0.62      0.47      3750



In [34]:
#predicting on test data
test_prediction=quora_model.predict(X_test)

#generating classes
test_prediction[test_prediction>0.5]=1 
test_prediction[test_prediction<0.5]=0
test_prediction

#true values for test
test_true=y_test.values

# accuracy on test data
Accuracy=sklearn.metrics.accuracy_score(np.array(test_true), 
                                     np.array(test_prediction))

print('Accuracy is %f'%(Accuracy*100)+' %')

print(classification_report(test_true, test_prediction, target_names=['Not similar','similar']))


Accuracy is 62.160000 %
              precision    recall  f1-score   support

 Not similar       0.62      1.00      0.77       777
     similar       0.00      0.00      0.00       473

    accuracy                           0.62      1250
   macro avg       0.31      0.50      0.38      1250
weighted avg       0.39      0.62      0.48      1250



In [35]:
def find_similarity_score(q1,q2):

  #clean first question
  Q1_C=  txt_process(q1) 
  #print(q1)

  #clean first question
  Q2_C = txt_process(q2)    
  #print(q2)

  #converting 1st question into tokens
  Q1_C = tokenizer.texts_to_sequences([Q1_C])  

  #converting 2nd question into token
  Q2_C = tokenizer.texts_to_sequences([Q2_C])

  #combining both tokens as we did for train data
  Q_final = Q1_C[0] + Q2_C[0]                      
  
  #padding combined sequence to max length
  Q_Test = sequence.pad_sequences([Q_final], maxlen = 500)  
  
  #predicting probability of given pair
  Prob=quora_model.predict(Q_Test)         
  print(Prob)

  #if p>0.5 then similar
  if Prob[0]>0.5:                  
    return 'Quora Questions are similar'
  else:
    return 'Quora Questions are Not similar'


In [39]:
#example 1
find_similarity_score('Who is Narendra Modi?','What is identity of Narendra Modi?')

In [40]:
#example 2
find_similarity_score('is there life after death?','Do people belive in afterlife')


In [41]:
#example 3
find_similarity_score('Should I have a hair transplant at age 24? How much would it cost?','How much cost does hair transplant require?')
