# PyData-BH 4 - Kaggle + Quora challenge

## Intro to Word2Vec 

### Import pandas and gensim, load data (this could take a while and will probably need more memory than my computer has available)

A word2vec model, trained on Wikipedia, is available at https://ibm.box.com/s/cnw0975zzpbdpndm8hmw9d0umhp63yef

In [134]:
%matplotlib inline
import pandas as pd
import gensim
import matplotlib
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import numpy as np
from scipy import spatial

In [2]:
# Load the pre-trained Word2Vec model from disk.
model = gensim.models.Word2Vec.load('data/word2vec.bin')
# Keep only the word vectors. No further training is done on the model here,
# so dropping the rest of it saves memory.
wv = model.wv 
del model

In [142]:
# Sanity check: list the vocabulary entries closest to one term.
wv.most_similar("machine-learning")

[(u'algorithms', 0.590934157371521),
 (u'optimization', 0.5430251955986023),
 (u'computational', 0.5244624614715576),
 (u'data-driven', 0.5172451734542847),
 (u'algorithm', 0.5059086084365845),
 (u'bayesian', 0.50047367811203),
 (u'analytics', 0.4976360499858856),
 (u'stochastic', 0.4931342303752899),
 (u'algorithmic', 0.48803848028182983),
 (u'preprocessing', 0.4855253994464874)]

## Loading quora datasets

In [4]:
# Read the Quora train and test CSV files into DataFrames
# (paths are relative to the working directory).
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

In [5]:
# Peek at the first rows to see which columns the training set provides.
train_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


Let's build a function that **calculates the average of the word embeddings of every word in the question**

In [159]:
def pre_processing(string):
    """Lowercase a question, split it into word tokens and drop English stopwords.

    Parameters
    ----------
    string : str
        Raw question text.

    Returns
    -------
    list of str
        The lowercased tokens produced by the regexp tokenizer, in their
        original order, excluding every token that appears in NLTK's
        English stopword list.
    """
    # Build the stopword collection once per call rather than once per token;
    # a set also makes each membership test constant-time instead of a list scan.
    english_stopwords = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')
    word_list = tokenizer.tokenize(string.lower())
    return [word for word in word_list if word not in english_stopwords]

def avg_we(question):
    """Return the mean word embedding of a question.

    The question is tokenised with ``pre_processing`` and every remaining
    token is looked up in the module-level ``wv`` word vectors. Tokens that
    have no vector are skipped.

    Parameters
    ----------
    question : str
        Raw question text.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors that were found. If no token has a
        vector, the mean is taken over an empty array, which NumPy evaluates
        to ``nan`` (and reports with a RuntimeWarning).
    """
    embeddings = []
    for word in pre_processing(question):
        try:
            embeddings.append(wv[word])
        except KeyError:
            # Out-of-vocabulary token: leave it out of the average. Only
            # KeyError is swallowed so that unrelated failures (a misspelled
            # name, a keyboard interrupt, ...) still surface.
            continue
    return np.array(embeddings).mean(axis=0)

# Running sum of cosine distances between the two averaged question
# embeddings, keyed by the is_duplicate label (0 or 1), plus a row count
# for each label.
average_distances = {0:0.0, 1:0.0}
count_positive = 0
count_negative = 0

# Only the first 50 training rows are scored here.
for row_index, row in train_data.head(50).iterrows():
    question1_embedding = avg_we(row.question1)
    question2_embedding = avg_we(row.question2)
    # Despite the variable name, this is a cosine *distance*:
    # smaller values mean the two questions are more alike.
    similarity = spatial.distance.cosine(question1_embedding, question2_embedding)
    average_distances[row.is_duplicate] += similarity
    if row.is_duplicate == 1:
        count_positive += 1
    else:
        count_negative += 1

# Guard the averages: a sample that contains only one label would otherwise
# raise ZeroDivisionError. An empty class is reported as nan.
negative_mean = average_distances[0] / count_negative if count_negative else float("nan")
positive_mean = average_distances[1] / count_positive if count_positive else float("nan")

print("average negative distance: {}".format(negative_mean))
print("average positive distance: {}".format(positive_mean))



average negative distance: 0.229533290342
average positive distance: 0.124747100846


In [158]:
# Show the raw embedding stored for a single vocabulary entry.
wv["machine-learning"]

array([ -8.75649080e-02,   6.76306844e-01,   1.61541015e-01,
        -4.55041304e-02,   3.88373077e-01,  -2.86693156e-01,
        -7.50976324e-01,  -1.92044824e-02,  -1.38332412e-01,
         7.82331079e-02,  -1.27561390e-01,  -9.28292274e-01,
        -1.23954266e-01,  -2.48100441e-02,   5.75234711e-01,
         6.60666704e-01,  -2.12539583e-01,   3.31095010e-02,
        -6.77211821e-01,   1.35473981e-01,   1.00618124e-01,
        -2.01440067e-03,   3.65662038e-01,  -1.65189490e-01,
         6.03020906e-01,   3.07306319e-01,  -8.81805599e-01,
        -9.39803943e-02,  -4.65456754e-01,   4.39432114e-02,
        -4.21882778e-01,   2.08005056e-01,   1.50679782e-01,
        -3.96851480e-01,   9.45733767e-03,  -4.94944453e-01,
         8.24004948e-01,  -4.70810622e-01,  -1.90350980e-01,
        -2.68070936e-01,  -2.31635705e-01,   6.49417862e-02,
         1.03897847e-01,   1.79845959e-01,  -4.28606659e-01,
        -1.27368122e-01,  -4.22698021e-01,   5.54938853e-01,
        -4.64785129e-01,

In [154]:
# Walk through the averaging by hand for one example question.
question = "What is the step by step guide to invest in share market in india?"
words = pre_processing(question)
# One row per remaining token: stack the looked-up vectors into a 2-D array.
embeddings = np.array([wv[token] for token in words])
print(embeddings.shape)
# Collapse the token axis to get a single vector for the whole question.
mean_embeddings = embeddings.mean(axis=0)
mean_embeddings

(7, 300)


array([ -7.38992617e-02,   1.27554685e-01,  -8.52156954e-04,
        -1.19432449e-01,   3.69795738e-03,  -6.41243672e-03,
         8.69216677e-03,   1.29379287e-01,   3.45472619e-02,
         7.98729658e-02,  -8.37706998e-02,   1.44541515e-02,
         1.33033589e-01,  -5.17602377e-02,   1.22424245e-01,
        -3.83252688e-02,  -2.54639208e-01,   1.23551421e-01,
        -2.05630168e-01,   3.06125972e-02,   1.23958364e-01,
         7.69437030e-02,  -4.22669128e-02,   3.19810025e-02,
         5.47905043e-02,   4.38466333e-02,  -2.68614560e-01,
        -2.58008149e-02,  -1.39603347e-01,  -2.53818301e-03,
         3.94764245e-02,   7.82561451e-02,  -1.91683546e-01,
        -3.11534610e-02,   1.64337859e-01,   1.28459200e-01,
         1.41691342e-01,   1.94721632e-02,   4.60987203e-02,
        -6.95347413e-02,   8.19235072e-02,   9.84332040e-02,
        -1.57878876e-01,  -1.27554342e-01,   9.62521657e-02,
        -6.21241964e-02,   1.41862303e-01,   9.41285044e-02,
        -7.38203526e-05,

In [150]:
# Length of the first token's vector, i.e. the embedding dimensionality.
len(embeddings[0])
# len(words)

300

In [100]:
# Vectors for two individual words; only the commented-out comparison below
# would use them.
i = wv["india"]
b = wv["brazil"]
# spatial.distance.cosine(i,b)
# NOTE(review): question1_embedding is not defined in this cell. It holds
# whatever the scoring loop over train_data assigned last, so this cell only
# works after that loop has run -- confirm this is intended.
wv.similar_by_vector(question1_embedding)

[(u'word', 1.0),
 (u'meaning', 0.7980709075927734),
 (u'words', 0.7763534188270569),
 (u'phrase', 0.737951397895813),
 (u'etymologically', 0.7074786424636841),
 (u'referring', 0.6826249361038208),
 (u'loanword', 0.6816995143890381),
 (u'means', 0.6796748042106628),
 (u'etymology', 0.670539915561676),
 (u'adjective', 0.662657618522644)]