In [62]:
import os
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import re
import tqdm
import time
import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import sys
sys.path.insert(0, '../common/')
import csv_utils
import importlib
importlib.reload(csv_utils)

<module 'csv_utils' from '../common\\csv_utils.py'>

In [41]:
isRealTest = True

In [30]:
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60.
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

In [31]:
path = os.path.join('..','data','train_data_v2.csv')
train_data = pd.read_csv(path)

In [32]:
path = os.path.join('..','data','test_data.csv')
test_data = pd.read_csv(path)

In [33]:
train_data.head()

Unnamed: 0,id,question1,question2,is_duplicate
0,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [34]:
stemmer=SnowballStemmer("english")
#example: stemmer.stem("having") -> have

In [35]:
# Create a set of stopwords
stop = set(stopwords.words('english'))

In [36]:
# This is the function makeing the lemmatization
lemma = WordNetLemmatizer()

In [37]:
# pre processing data
def cleanData(sentence):
    processedList = ""
    
    # convert to lowercase, ignore all special characters - keep only alpha-numericals and spaces (not removing full-stop here)
    sentence = re.sub(r'[^A-Za-z0-9\s.]',r'',str(sentence).lower())
    #sentence = re.sub(r'\n',r' ',sentence)
    
    # remove stop words
    sentence = " ".join([word for word in sentence.split() if word not in stop])
    # lemmatize
    #sentence = " ".join(lemma.lemmatize(word) for word in sentence.split())
    # stemming 
    sentence = " ".join(stemmer.stem(word) for word in sentence.split())
    return sentence

In [38]:
cleanData("I'm a women and you?")

'im women'

In [40]:
def cleandf(df):
    df['question1_cleaned'] = df['question1'].map(lambda x: cleanData(x))
    df['question2_cleaned'] = df['question2'].map(lambda x: cleanData(x))
    
cleandf(train_data)
cleandf(test_data)

In [42]:
if isRealTest:
    train, test = train_data, test_data
else:
    train, test = train_test_split(train_data, test_size=0.33)
print(train.shape)
print(test.shape)

(323160, 6)
(81126, 5)


In [43]:
train['merged_cleaned'] = train['question1_cleaned'] + train['question2_cleaned']

In [44]:
train_data.head()

Unnamed: 0,id,question1,question2,is_duplicate,question1_cleaned,question2_cleaned,merged_cleaned
0,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,step step guid invest share market india,step step guid invest share market,step step guid invest share market indiastep s...
1,1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,stori kohinoor kohinoor diamond,would happen indian govern stole kohinoor kohi...,stori kohinoor kohinoor diamondwould happen in...
2,2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,increas speed internet connect use vpn,internet speed increas hack dns,increas speed internet connect use vpninternet...
3,3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,mental lone solv,find remaind math2324math divid 2423,mental lone solvfind remaind math2324math divi...
4,4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,one dissolv water quik sugar salt methan carbo...,fish would surviv salt water,one dissolv water quik sugar salt methan carbo...


In [45]:
cleaned_sentences = train['merged_cleaned'].map(lambda x: x.split('.'))

In [46]:
# corpus [[w1,w2,w3..],[..]]

corpus = []
col = cleaned_sentences

for line in col.tolist():
    for list_i in line:
        words = [x for x in list_i.split()]
        corpus.append(words)

In [47]:
len(corpus)

382519

In [48]:
size = 100 # default 100
window_size = 5 # default 5
epochs = 100
min_count = 5 # default 5
workers = 4

start_time = datetime.datetime.now()
# train word2vec model using gensim
model = Word2Vec(corpus, sg=1,window=window_size,size=size,
                 min_count=min_count,workers=workers,iter=epochs,sample=0.01)

end_time = datetime.datetime.now()
seconds_elapsed = (end_time - start_time).total_seconds()
print("It took {} to execute this".format(hms_string(seconds_elapsed)))

It took 0:16:16.38 to execute this


In [49]:
#sentence_1 = 'This room is dirty'
#sentence_2 = 'dirty and disgusting room'
#
#distance = model.wv.n_similarity(sentence_1.lower().split(), sentence_2.lower().split())
#print(distance)

In [50]:
#sentence_1 = 'I like cats'
#sentence_2 = 'I do not like cats'
#
#distance = model.wv.n_similarity(sentence_1.lower().split(), sentence_2.lower().split())
#print(distance)

In [51]:
def avg_sentence_vector(words, model, num_features, index2word_set):
    #function to average all words vectors in a given paragraph
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in words:
        if word in index2word_set:
            nwords = nwords+1
            featureVec = np.add(featureVec, model[word])

    if nwords>0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec

index2word_set = set(model.wv.index2word)
#get average vector for sentence 1
sentence_1_avg_vector = avg_sentence_vector(sentence_1.split(), model=model, num_features=100, index2word_set=index2word_set)

#get average vector for sentence 2
sentence_2_avg_vector = avg_sentence_vector(sentence_2.split(), model=model, num_features=100, index2word_set=index2word_set)

sen1_sen2_similarity =  cosine_similarity(sentence_1_avg_vector.reshape(1, -1),sentence_2_avg_vector.reshape(1, -1))
value = sen1_sen2_similarity[0][0]

  if __name__ == '__main__':


In [52]:
#def avg_sentence_vector(words, model, num_features, index2word_set):
#    #function to average all words vectors in a given paragraph
#    
#    nwords = 0
#    for word in words:
#        if word in index2word_set:
#            nwords = nwords+1
#    featureVec = np.zeros((nwords,), dtype="float32")
#    for word in words:
#        if word in index2word_set:
#            featureVec = np.add(featureVec, model[word])
#
#    if nwords>0:
#        featureVec = np.divide(featureVec, nwords)
#    return featureVec
#
#index2word_set = set(model.wv.index2word)
##get average vector for sentence 1
#sentence_1_avg_vector = avg_sentence_vector(sentence_1.split(), model=model, num_features=100, index2word_set=index2word_set)
#
##get average vector for sentence 2
#sentence_2_avg_vector = avg_sentence_vector(sentence_2.split(), model=model, num_features=100, index2word_set=index2word_set)
#
#sen1_sen2_similarity =  cosine_similarity(sentence_1_avg_vector.reshape(1, -1),sentence_2_avg_vector.reshape(1, -1))
#value = sen1_sen2_similarity[0][0]

In [53]:
value

0.42730284

In [54]:
def print_accuracy(model, test, isRealTest, threshold=0.5):
    index2word_set = set(model.wv.index2word)
    
    calculate_avg_vector = lambda cleaned_question: avg_sentence_vector(cleaned_question.split(), model=model, num_features=100, index2word_set=index2word_set)
    cosine_similarity_func = lambda s1_avg_vec, s2_avg_vec: cosine_similarity(s1_avg_vec.reshape(1, -1),s2_avg_vec.reshape(1, -1))
    
    def pred(row):
        avg_vec_1 = calculate_avg_vector(row['question1_cleaned'])
        avg_vec_2 = calculate_avg_vector(row['question2_cleaned'])
        
        similarity = cosine_similarity_func(avg_vec_1, avg_vec_2)
        value = similarity[0][0]
        return 1 if value >= threshold else 0 # is duplicated
    
    
    test['predicted'] = test.apply(pred, axis=1)
    
    if not isRealTest:
        test['equal'] = test['is_duplicate'] == test['predicted']
        acc = np.sum(test['equal'])/len(test['equal'])
        print('Accuracy:',acc, 'Threshold: ',threshold)
        return acc
start = 0.4
end = 0.9
step = 0.04

#bestThreshold = 0
#acc = 0
#for thres in np.linspace(start,end,(end-start)/step):
#    acc_tmp = print_accuracy(model, train, thres)
#    if acc < acc_tmp:
#        bestThreshold = thres
        
#print_accuracy(model, test, isRealTest=False, threshold=0.9)
print_accuracy(model, test_data, isRealTest=isRealTest, threshold=0.9)

  if __name__ == '__main__':


In [58]:
test

Unnamed: 0,test_id,question1,question2,question1_cleaned,question2_cleaned,predicted
0,15,What would a Trump presidency mean for current...,How will a Trump presidency affect the student...,would trump presid mean current intern master ...,trump presid affect student present us plan st...,0
1,20,Why do rockets look white?,Why are rockets and boosters painted white?,rocket look white,rocket booster paint white,0
2,21,What's causing someone to be jealous?,What can I do to avoid being jealous of someone?,what caus someon jealous,avoid jealous someon,0
3,23,How much is 30 kV in HP?,Where can I find a conversion chart for CC to ...,much 30 kv hp,find convers chart cc horsepow,0
4,34,What is the best travel website in spain?,What is the best travel website?,best travel websit spain,best travel websit,0
5,46,How did Darth Vader fought Darth Maul in Star ...,Does Quora have a character limit for profile ...,darth vader fought darth maul star war legend,quora charact limit profil descript,0
6,48,What are some examples of products that can be...,What are some of the products made from crude ...,exampl product make crude oil,product made crude oil,1
7,51,Will a Blu Ray play on a regular DVD player? I...,How can you play a Blu Ray DVD on a regular DV...,blu ray play regular dvd player,play blu ray dvd regular dvd player,1
8,54,How GST affects the CAs and tax officers?,Why can't I do my homework?,gst affect cas tax offic,cant homework,0
9,56,Who is israil friend?,Is my boyfriend lying about his true feelings ...,israil friend,boyfriend lie true feel friend secret attract,0


In [64]:
csv_utils.create_csvs(test_data.predicted, test_data.test_id.values)

saved in:  E:\Dropbox\folch\UVA\Applied Machine Learning\Project_quora\models\..\data\submissions\submission_0604PM-November-23-2018.csv


score: 0.69281