In [1]:
import os

import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split

http://vene.ro/blog/word-movers-distance-in-python.html
http://sujitpal.blogspot.com/2015/09/sentence-similarity-using-word2vec-and.html
https://github.com/wmayner/pyemd
http://www.ariel.ac.il/sites/ofirpele/fastemd/
http://nbviewer.jupyter.org/github/vene/vene.github.io/blob/pelican/content/blog/word-movers-distance-in-python.ipynb
https://github.com/mkusner/wmd/blob/master/compute_rwmd.m

In [None]:
# Directory that holds the word2vec files; "%s" is replaced by a file name.
loc = '/Users/rbekbolatov/data/Word2Vec/%s'

# Build the cache unless BOTH the matrix and its vocabulary are already on
# disk: a leftover embed.dat without embed.vocab would otherwise make the
# load step below fail.
if not (os.path.exists(loc % "embed.dat") and os.path.exists(loc % "embed.vocab")):
    print("Caching word embeddings in memmapped format...")
    from gensim.models.word2vec import Word2Vec
    wv = Word2Vec.load_word2vec_format(loc % "GoogleNews-vectors-negative300.bin.gz", binary=True)
    # NOTE(review): relies on wv.syn0norm being populated by the loader of the
    # installed gensim version -- confirm.
    fp = np.memmap(loc % "embed.dat", dtype=np.double, mode='w+', shape=wv.syn0norm.shape)
    fp[:] = wv.syn0norm[:]
    fp.flush()
    # One word per line, ordered by voc.index (presumably the word's row in
    # syn0norm), so that line number and matrix row line up.  Written as
    # explicit UTF-8 bytes so the same code runs on Python 2 and Python 3.
    with open(loc % "embed.vocab", "wb") as f:
        for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
            f.write(w.encode('utf-8') + b"\n")
    del fp, wv

# Read-only view of the cached matrix.  The shape is hard-coded and has to
# match what was written above -- TODO confirm for other embedding files.
W = np.memmap(loc % "embed.dat", dtype=np.double, mode="r", shape=(3000000, 300))
# A real list (not a lazy map object) so that it can be consumed more than
# once; entries are decoded from UTF-8 to text.
with open(loc % "embed.vocab", "rb") as f:
    vocab_list = [line.strip().decode('utf-8') for line in f]

In [None]:
# Reverse lookup: word -> its position in vocab_list.
vocab_dict = dict((word, position) for position, word in enumerate(vocab_list))

In [None]:
# Two example sentences to compare.
d1 = "Obama speaks to the media in Illinois"
d2 = "The President addresses the press in Chicago"

# Joint bag-of-words vocabulary of both sentences, English stop words removed.
vect = CountVectorizer(stop_words="english")
vect.fit([d1, d2])
print("Features:",  ", ".join(vect.get_feature_names()))

from scipy.spatial.distance import cosine
# One dense count vector per sentence (rows of the sparse transform result).
v_1, v_2 = (row.toarray().ravel() for row in vect.transform([d1, d2]))
print(v_1, v_2)
# scipy's cosine() is a distance, not a similarity.
print("cosine(doc_1, doc_2) = {:.2f}".format(cosine(v_1, v_2)))

In [None]:
from sklearn.metrics import euclidean_distances
# Embedding row of every vectorizer feature, in feature order.
feature_rows = [vocab_dict[w] for w in vect.get_feature_names()]
W_ = W[feature_rows]
# Pairwise Euclidean distances between the selected word vectors.
D_ = euclidean_distances(W_)
# The indices below refer to positions in the feature list printed earlier.
print("d(addresses, speaks) = {:.2f}".format(D_[0, 7]))
print("d(addresses, chicago) = {:.2f}".format(D_[0, 1]))


In [None]:
from pyemd import emd

# pyemd needs double precision input
# Cast each count vector to float64 and rescale it to sum to one.
v_1 = np.asarray(v_1, dtype=np.double)
v_2 = np.asarray(v_2, dtype=np.double)
v_1 = v_1 / v_1.sum()
v_2 = v_2 / v_2.sum()
# Same cast for the word-to-word distances, scaled by their largest value.
D_ = np.asarray(D_, dtype=np.double)
D_ = D_ / D_.max()  # just for comparison purposes
print("d(doc_1, doc_2) = {:.2f}".format(emd(v_1, v_2, D_)))


In [None]:
# Value returned when a distance cannot be computed.
DEFAULT_WMD = 0.0


def wmd(d1, d2):
    """Word Mover's Distance between two raw text documents.

    Both documents are vectorized together with ``CountVectorizer`` (English
    stop words removed).  Words that are missing from ``vocab_dict`` are
    dropped from the count vectors and from the distance matrix alike, and
    the earth mover's distance between the two normalized count histograms
    is computed over the Euclidean distances of the matching rows of ``W``.

    As a side effect, the cosine distance of the raw count vectors is printed.

    Parameters
    ----------
    d1, d2 : str
        Documents to compare.

    Returns
    -------
    float
        The value returned by ``emd``, or ``DEFAULT_WMD`` when either
        document has no word left after out-of-vocabulary words are dropped.
    """
    vect = CountVectorizer(stop_words="english").fit([d1, d2])
    v_1, v_2 = vect.transform([d1, d2])
    v_1 = v_1.toarray().ravel()
    v_2 = v_2.toarray().ravel()
    print("cosine(doc_1, doc_2) = {:.2f}".format(cosine(v_1, v_2)))

    features = vect.get_feature_names()
    # The histograms and the distance matrix must describe exactly the same
    # words, so a single mask is applied to both.
    known = np.array([w in vocab_dict for w in features], dtype=bool)
    v_1 = v_1[known].astype(np.double)
    v_2 = v_2[known].astype(np.double)
    # Covers "no known word at all" as well as "one document lost all words";
    # either would make the normalization below divide by zero.
    if v_1.sum() == 0 or v_2.sum() == 0:
        return DEFAULT_WMD

    W_ = W[[vocab_dict[w] for w, keep in zip(features, known) if keep]]
    D_ = euclidean_distances(W_).astype(np.double)
    # With a single remaining word all distances are 0; skip the scaling
    # instead of producing NaNs.
    d_max = D_.max()
    if d_max > 0:
        D_ /= d_max  # just for comparison purposes

    v_1 /= v_1.sum()
    v_2 /= v_2.sum()
    return emd(v_1, v_2, D_)


In [None]:
# Location of the word2vec files ("%s" is replaced by a file name).
# Alternative location used on another machine:
#   '/Users/rbekbolatov/data/Word2Vec/%s'
loc = '/home/ec2-user/data/word2vec/%s'
from gensim.models.word2vec import Word2Vec
# Load the pre-trained GoogleNews vectors from the binary word2vec file.
wv = Word2Vec.load_word2vec_format(loc % "GoogleNews-vectors-negative300.bin.gz", binary=True)

import numpy as np
from scipy.spatial.distance import cosine
from sklearn.metrics import euclidean_distances
from pyemd import emd
from sklearn.feature_extraction.text import CountVectorizer


# Value returned when a distance cannot be computed.
DEFAULT_WMD = 0.0


def wmd(d1, d2):
    """Word Mover's Distance between two raw text documents, using ``wv``.

    Both documents are vectorized together with ``CountVectorizer`` (English
    stop words removed).  Words that ``wv`` does not contain are removed from
    the count vectors and from the distance matrix with one shared mask, and
    the earth mover's distance between the two normalized count histograms
    is computed over the Euclidean distances of the remaining word vectors.

    As a side effect, the cosine distance of the raw count vectors is printed.

    Parameters
    ----------
    d1, d2 : str
        Documents to compare.

    Returns
    -------
    float
        The value returned by ``emd``, or ``DEFAULT_WMD`` when either
        document has no word left after out-of-vocabulary words are dropped.
    """
    vect = CountVectorizer(stop_words="english").fit([d1, d2])
    v_1, v_2 = vect.transform([d1, d2])
    v_1 = v_1.toarray().ravel()
    v_2 = v_2.toarray().ravel()
    print("cosine(doc_1, doc_2) = {:.2f}".format(cosine(v_1, v_2)))

    features = vect.get_feature_names()
    # A single mask keeps the histograms and the distance matrix aligned;
    # a size mismatch between them is what crashes emd.
    known = np.array([w in wv for w in features], dtype=bool)
    v_1 = v_1[known].astype(np.double)
    v_2 = v_2[known].astype(np.double)
    # Covers "no known word at all" as well as "one document lost all words";
    # either would make the normalization below divide by zero.
    if v_1.sum() == 0 or v_2.sum() == 0:
        return DEFAULT_WMD

    W_ = [wv[w] for w, keep in zip(features, known) if keep]
    D_ = euclidean_distances(W_).astype(np.double)
    # With a single remaining word all distances are 0; skip the scaling
    # instead of producing NaNs.
    d_max = D_.max()
    if d_max > 0:
        D_ /= d_max  # just for comparison purposes

    v_1 /= v_1.sum()
    v_2 /= v_2.sum()
    return emd(v_1, v_2, D_)

In [None]:
# Example pairs for wmd(): one sentence pair, then two product titles, each
# compared against the same three search queries.
simpson_title = 'Simpson Strong-Tie 12-Gauge Angle'
whirlpool_title = ('Whirlpool 1.9 cu. ft. Over the Range Convection Microwave '
                   'in Stainless Steel with Sensor Cooking')
example_queries = ['angle bracket', 'convection otr', 'microwave over stove']

wmd_examples = [('Obama speaks to the media in Illinois',
                 'The president greets the press in Chicago')]
wmd_examples += [(title, query)
                 for title in (simpson_title, whirlpool_title)
                 for query in example_queries]

# Print every result explicitly: a notebook cell only displays the value of
# its last expression, so bare wmd(...) calls in the middle would be lost.
for d1, d2 in wmd_examples:
    print("%s | %s -> %s" % (d1, d2, wmd(d1, d2)))




In [None]:
# Training a fresh model would look like the line below.  It is left as a
# comment because neither `word2vec` nor `sentences` is defined in this
# notebook and the result was replaced by the pre-trained vectors right away.
# model = word2vec.Word2Vec(sentences, min_count=2, seed=42, workers=1)

# Use the pre-trained vectors as the model for all following cells.
model = wv
wv['book']  # lookup only; raises if the word is missing, value is not shown

sentence1 = ['human', 'interface', 'computer']
sentence2 = ['survey', 'user', 'computer', 'system', 'response', 'time']
model.wmdistance(sentence1, sentence2)





In [None]:
# Imported under an alias so the corpus reader is not shadowed by the word
# list below (and so no bare `nltk` name is needed).
from nltk.corpus import stopwords as stopwords_corpus
stopwords = stopwords_corpus.words('english')

# 1) Lower-cased tokens.
sentence_obama = 'Obama speaks to the media in Illinois'.lower().split()
sentence_president = 'The president greets the press in Chicago'.lower().split()
distance = model.wmdistance(sentence_obama, sentence_president)
print("lowercased: %s" % distance)

# 2) Original capitalization.
sentence_obama = 'Obama speaks to the media in Illinois'.split()
sentence_president = 'The president greets the press in Chicago'.split()
distance = model.wmdistance(sentence_obama, sentence_president)
print("original case: %s" % distance)

# 3) Original capitalization with stop words removed.  The comparison is done
# on the lower-cased token, otherwise capitalized stop words such as "The"
# would slip through.
sentence_obama = [w for w in sentence_obama if w.lower() not in stopwords]
sentence_president = [w for w in sentence_president if w.lower() not in stopwords]
distance = model.wmdistance(sentence_obama, sentence_president)
print("original case, stop words removed: %s" % distance)


# Product titles against search queries.
d1s = ['Simpson Strong-Tie 12-Gauge Angle', 
      'BEHR Premium Textured DeckOver 1-gal. #SC-141 Tugboat Wood and Concrete Coating', 
      'Delta Vero 1-Handle Shower Only Faucet Trim Kit in Chrome (Valve Not Included)']
d2s = ['angle bracket', 'deck over', 'convection otr']

for d1 in d1s:
    for d2 in d2s:
        print("%s:  %s" % (d1, d2))
        # wmdistance works on token lists; a raw string would be iterated
        # character by character.
        print(model.wmdistance(d1.split(), d2.split()))
        
        
        
        

In [None]:
# pandas is used below but imported nowhere else in the notebook.
import pandas as pd
from google_spell import correct_spelling

# Directory with the unpacked competition data ("%s" is the file name).
loc = '/home/ec2-user/data/hd/unpacked/%s'

train = pd.read_csv(loc % 'train.csv')
test = pd.read_csv(loc % 'test.csv')

def f(row):
    """Distance between a row's spell-corrected search term and product title.

    Both texts are split on whitespace.  Identical token lists give 0.0
    without consulting the model; otherwise the result of
    ``model.wmdistance`` on the two token lists is returned.
    """
    query_tokens = correct_spelling(row['search_term']).split()
    title_tokens = row['product_title'].split()
    if query_tokens != title_tokens:
        return model.wmdistance(query_tokens, title_tokens)
    return 0.0



# Quick check on a random subset: does the distance track relevance?
# random_state pins the subset so the printed means can be reproduced, and
# the size is capped so that a smaller training set does not raise.
tt = train.sample(min(10000, len(train)), random_state=42)
tt['w'] = tt.apply(f, axis=1)

# Rows whose distance is infinite are left out of the averages.
tt_finite = tt[tt.w != float('inf')]
for relevance in (3.0, 1.0, 2.0):
    mean_w = tt_finite.loc[tt_finite.relevance == relevance, 'w'].mean()
    print("relevance %s: mean w = %s" % (relevance, mean_w))

# Means recorded from an earlier, unseeded run:
#   relevance 3.0 -> 1.2317953205871341
#   relevance 1.0 -> 1.2805398115662407
#   relevance 2.0 -> 1.2465750573587313
    

# Compute the distance feature for every row of both data sets.  The prints
# are progress markers; the call form works on Python 2 and Python 3.
train['wmd_goognews'] = train.apply(f, axis=1)
print("good")
test['wmd_goognews'] = test.apply(f, axis=1)
print("good")


# Short alias column, matching the name used for the sampled frame.
train['w'] = train['wmd_goognews']
test['w'] = test['wmd_goognews']


# From here on `tt` refers to the full training frame (not a copy).
tt = train


# Persist id + feature so the values can be reused without recomputation.
train[['id', 'wmd_goognews']].to_csv('wmd_gn_train.csv', index=False)
test[['id', 'wmd_goognews']].to_csv('wmd_gn_test.csv', index=False)
    

    
def f_clean_query(row):
    """Return the spell-corrected version of the row's ``search_term``."""
    raw_query = row['search_term']
    return correct_spelling(raw_query)
    
# Store the spell-corrected query as a new column, first on train, then test.
for _frame in (train, test):
    _frame['cleaned_query'] = _frame.apply(f_clean_query, axis=1)


