In [1]:
import os
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import re
import tqdm
import time
import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity



In [2]:
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60.
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

In [3]:
path = os.path.join('..','data','train_data_v2.csv')
train_data = pd.read_csv(path)

In [4]:
train_data.head()

Unnamed: 0,id,question1,question2,is_duplicate
0,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [5]:
stemmer=SnowballStemmer("english")
#example: stemmer.stem("having") -> have

In [6]:
# Create a set of stopwords
stop = set(stopwords.words('english'))

In [7]:
# This is the function makeing the lemmatization
lemma = WordNetLemmatizer()

In [8]:
lemma.lemmatize('bought','v')

'buy'

In [9]:
lemma.lemmatize('you')

'you'

In [10]:
stemmer.stem('women')

'women'

In [11]:
# pre processing data
def cleanData(sentence):
    processedList = ""
    
    # convert to lowercase, ignore all special characters - keep only alpha-numericals and spaces (not removing full-stop here)
    sentence = re.sub(r'[^A-Za-z0-9\s.]',r'',str(sentence).lower())
    #sentence = re.sub(r'\n',r' ',sentence)
    
    # remove stop words
    sentence = " ".join([word for word in sentence.split() if word not in stop])
    # lemmatize
    #sentence = " ".join(lemma.lemmatize(word) for word in sentence.split())
    # stemming 
    sentence = " ".join(stemmer.stem(word) for word in sentence.split())
    return sentence

In [12]:
cleanData("I'm a women and you?")

'im women'

In [13]:
train_data['question1_cleaned'] = train_data['question1'].map(lambda x: cleanData(x))
train_data['question2_cleaned'] = train_data['question2'].map(lambda x: cleanData(x))

In [14]:
train, test = train_test_split(train_data, test_size=0.33)
print(train.shape)
print(test.shape)

(216517, 6)
(106643, 6)


In [15]:
train['merged_cleaned'] = train['question1_cleaned'] + train['question2_cleaned']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [16]:
train_data.head()

Unnamed: 0,id,question1,question2,is_duplicate,question1_cleaned,question2_cleaned
0,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,step step guid invest share market india,step step guid invest share market
1,1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,stori kohinoor kohinoor diamond,would happen indian govern stole kohinoor kohi...
2,2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,increas speed internet connect use vpn,internet speed increas hack dns
3,3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,mental lone solv,find remaind math2324math divid 2423
4,4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,one dissolv water quik sugar salt methan carbo...,fish would surviv salt water


In [17]:
cleaned_sentences = train['merged_cleaned'].map(lambda x: x.split('.'))

In [18]:
# corpus [[w1,w2,w3..],[..]]

corpus = []
col = cleaned_sentences

for line in col.tolist():
    for list_i in line:
        words = [x for x in list_i.split()]
        corpus.append(words)

In [19]:
len(corpus)

256445

In [28]:
size = 100 # default 100
window_size = 5 # default 5
epochs = 100
min_count = 5 # default 5
workers = 4

start_time = datetime.datetime.now()
# train word2vec model using gensim
model = Word2Vec(corpus, sg=1,window=window_size,size=size,
                 min_count=min_count,workers=workers,iter=epochs,sample=0.01)

end_time = datetime.datetime.now()
seconds_elapsed = (end_time - start_time).total_seconds()
print("It took {} to execute this".format(hms_string(seconds_elapsed)))

It took 0:10:43.64 to execute this


In [29]:
sentence_1 = 'This room is dirty'
sentence_2 = 'dirty and disgusting room'

distance = model.wv.n_similarity(sentence_1.lower().split(), sentence_2.lower().split())
print(distance)

KeyError: "word 'dirty' not in vocabulary"

In [30]:
sentence_1 = 'I like cats'
sentence_2 = 'I do not like cats'

distance = model.wv.n_similarity(sentence_1.lower().split(), sentence_2.lower().split())
print(distance)

KeyError: "word 'cats' not in vocabulary"

In [31]:
def avg_sentence_vector(words, model, num_features, index2word_set):
    #function to average all words vectors in a given paragraph
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in words:
        if word in index2word_set:
            nwords = nwords+1
            featureVec = np.add(featureVec, model[word])

    if nwords>0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec

index2word_set = set(model.wv.index2word)
#get average vector for sentence 1
sentence_1_avg_vector = avg_sentence_vector(sentence_1.split(), model=model, num_features=100, index2word_set=index2word_set)

#get average vector for sentence 2
sentence_2_avg_vector = avg_sentence_vector(sentence_2.split(), model=model, num_features=100, index2word_set=index2word_set)

sen1_sen2_similarity =  cosine_similarity(sentence_1_avg_vector.reshape(1, -1),sentence_2_avg_vector.reshape(1, -1))
value = sen1_sen2_similarity[0][0]

  if __name__ == '__main__':


In [None]:
#def avg_sentence_vector(words, model, num_features, index2word_set):
#    #function to average all words vectors in a given paragraph
#    
#    nwords = 0
#    for word in words:
#        if word in index2word_set:
#            nwords = nwords+1
#    featureVec = np.zeros((nwords,), dtype="float32")
#    for word in words:
#        if word in index2word_set:
#            featureVec = np.add(featureVec, model[word])
#
#    if nwords>0:
#        featureVec = np.divide(featureVec, nwords)
#    return featureVec
#
#index2word_set = set(model.wv.index2word)
##get average vector for sentence 1
#sentence_1_avg_vector = avg_sentence_vector(sentence_1.split(), model=model, num_features=100, index2word_set=index2word_set)
#
##get average vector for sentence 2
#sentence_2_avg_vector = avg_sentence_vector(sentence_2.split(), model=model, num_features=100, index2word_set=index2word_set)
#
#sen1_sen2_similarity =  cosine_similarity(sentence_1_avg_vector.reshape(1, -1),sentence_2_avg_vector.reshape(1, -1))
#value = sen1_sen2_similarity[0][0]

In [32]:
value

0.4831862

In [45]:
def print_accuracy(model, test, threshold=0.5):
    index2word_set = set(model.wv.index2word)
    
    calculate_avg_vector = lambda cleaned_question: avg_sentence_vector(cleaned_question.split(), model=model, num_features=100, index2word_set=index2word_set)
    cosine_similarity_func = lambda s1_avg_vec, s2_avg_vec: cosine_similarity(s1_avg_vec.reshape(1, -1),s2_avg_vec.reshape(1, -1))
    
    def pred(row):
        avg_vec_1 = calculate_avg_vector(row['question1_cleaned'])
        avg_vec_2 = calculate_avg_vector(row['question2_cleaned'])
        
        similarity = cosine_similarity_func(avg_vec_1, avg_vec_2)
        value = similarity[0][0]
        return 1 if value >= threshold else 0 # is duplicated
    
    
    test['predicted'] = test.apply(pred, axis=1)

    test['equal'] = test['is_duplicate'] == test['predicted']
    print('Accuracy:',np.sum(test['equal'])/len(test['equal']), 'Threshold: ',threshold)
    
start = 0.4
end = 0.9
step = 0.04
for thres in np.linspace(start,end,(end-start)/step):
    print_accuracy(model, test, thres)

  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Accuracy: 0.4191367459655111 Threshold:  0.4
Accuracy: 0.43372748328535393 Threshold:  0.4454545454545455
Accuracy: 0.44670536275236067 Threshold:  0.49090909090909096
Accuracy: 0.45987078383016233 Threshold:  0.5363636363636364
Accuracy: 0.4723891863507216 Threshold:  0.5818181818181818
Accuracy: 0.4890710126309275 Threshold:  0.6272727272727273
Accuracy: 0.5124855827386702 Threshold:  0.6727272727272727
Accuracy: 0.5451178230169819 Threshold:  0.7181818181818183
Accuracy: 0.5875209812177077 Threshold:  0.7636363636363637
Accuracy: 0.6346970734131635 Threshold:  0.8090909090909091
Accuracy: 0.6748028468816518 Threshold:  0.8545454545454546
Accuracy: 0.6907532608797577 Threshold:  0.9


In [46]:
test.head()

Unnamed: 0,id,question1,question2,is_duplicate,question1_cleaned,question2_cleaned,predicted,equal
147293,184208,Which has a stronger impact in determining the...,How do coupons affect the demand curve?,0,stronger impact determin market price suppli c...,coupon affect demand curv,0,True
201300,252095,What are the advantages and disadvantages of t...,What are the advantages of the Green Revolutio...,1,advantag disadvantag green revolut,advantag green revolut disadvantag,1,True
313340,392053,How did Germany defeat France so easily during...,Why did the Italian military do so poorly in W...,0,germani defeat franc easili world war ii,italian militari poor wwii,0,True
111357,139183,Is Reliance Jio tariff really cheap?,Why are Reliance Jio tariff plans so costly in...,0,relianc jio tariff realli cheap,relianc jio tariff plan cost india relianc jio...,1,False
250036,313085,What are the things that first year computer s...,What are some things that second year computer...,0,thing first year comput scienc bsc student,thing second year comput scienc student know,1,False
