In [108]:
import itertools
from glove import Corpus, Glove
%matplotlib inline
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
from sklearn.metrics import pairwise

In [75]:
corpus = Corpus()
question_path = '../input/question.csv'
char_embed = "../input/char_embed.txt"
word_embed = "../input/word_embed.txt"

In [76]:
question_df = pd.read_csv(question_path)

In [26]:
question_df.head(5)

Unnamed: 0,qid,words,chars
0,Q000000,W05733 W05284 W09158 W14968 W07863,L1128 L1861 L2218 L1796 L1055 L0847 L2927
1,Q000001,W17378 W17534 W03249 W01490 W18802,L2214 L1980 L0156 L1554 L2218 L1861 L3019 L010...
2,Q000002,W17378 W08158 W20171 W11246 W14759,L2214 L2350 L2568 L1969 L2168 L0694 L3012 L256...
3,Q000003,W11385 W14103 W02556 W13157 W09749,L0762 L2172 L1971 L2034 L2039 L2271 L1346 L223...
4,Q000004,W17508 W18238 W02952 W18103,L0018 L2321 L1346 L3019 L0104 L0902 L0354 L0362


In [27]:
# sentences = list(question_df)

In [89]:
def read_corpus(words_or_chars):

    delchars = [chr(c) for c in range(256)]
    delchars = [x for x in delchars if not x.isalnum()]
    delchars.remove(' ')
    delchars = ''.join(delchars)
    
    datafile = question_df[words_or_chars]

    for line in datafile:
            yield line.translate(None, delchars).split(' ')

In [245]:
test_corpus = read_corpus('words')
for i in test_corpus[:10]:
    print i

TypeError: 'generator' object has no attribute '__getitem__'

In [90]:
def train_glove(target_group, glove_para):
    corpus_model = Corpus()
    corpus_model.fit(read_corpus(words_or_chars=target_group), window=glove_para['window_size']) #avg word size is 6 for each sentence
    corpus_model.save('corpus_model_{}.model'.format(target_group))
    print target_group
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)
    print('Training the GloVe model')

    glove = Glove(no_components=glove_para['no_components'], learning_rate=glove_para['learning_rate'])
    glove.fit(corpus_model.matrix, epochs=glove_para['no_epochs'],
                  no_threads=glove_para['parallelism'], verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    glove.save('glove_{}.model'.format(target_group))

In [91]:
glove_para_word = {'window_size':4, 'no_components':300, 'learning_rate':0.05, 'no_epochs':2, 'parallelism':4}

In [92]:
train_glove(target_group='words', glove_para=glove_para_word)

words
Dict size: 20890
Collocations: 621606
Training the GloVe model
Performing 2 training epochs with 4 threads
Epoch 0
Epoch 1


In [93]:
glove_ana = Glove.load('glove_words.model')

In [94]:
glove_ana.word_vectors

array([[  1.84027907e-01,  -2.14398932e-01,  -2.27377927e-01, ...,
          9.53003905e-02,  -8.31477740e-02,  -5.32879000e-02],
       [  1.51182363e-01,  -1.79867566e-01,  -2.19118887e-01, ...,
          9.47845985e-02,  -2.49266753e-02,  -4.33976716e-02],
       [  1.27913274e-01,  -1.49388389e-01,  -1.87693582e-01, ...,
          1.17320954e-01,  -6.52537049e-02,  -8.25651558e-02],
       ..., 
       [  5.66549945e-04,   1.53818538e-03,  -5.32807971e-04, ...,
          8.03831132e-04,   7.53789587e-04,   1.56021817e-03],
       [ -2.53074329e-03,   2.40074984e-03,   2.29390886e-03, ...,
         -1.86584595e-03,  -9.30238399e-04,   1.24397741e-03],
       [ -1.82180187e-03,   2.02159750e-03,   2.82539914e-03, ...,
         -1.39743396e-03,   1.84205700e-04,   2.06937313e-03]])

In [96]:
glove_ana.dictionary['W01490']

8

In [125]:
glove_ana.most_similar('W01490', number=10)

[('W10847', 0.99561416483143483),
 ('W04244', 0.99259027832717861),
 ('W04999', 0.98953934651160502),
 ('W03982', 0.98940693826906168),
 ('W08395', 0.98827300440828891),
 ('W01680', 0.98753596033642266),
 ('W08084', 0.98682693866383986),
 ('W15441', 0.98497056770214642),
 ('W03272', 0.98355782975228989)]

In [104]:
def load_word_embed(word_embed):
        word_embed_df = pd.read_csv(word_embed, delim_whitespace=True, index_col=0, header=None)
        cnt_column = word_embed_df.shape[1]
        columns = map(lambda x:"word2vec_w_"+str(x),range(cnt_column))
        word_embed_df.columns = columns
        return word_embed_df

In [116]:
df_word_emb_load = load_word_embed(word_embed=word_embed)

In [226]:
df_word_emb_load.head(5)

Unnamed: 0_level_0,word2vec_w_0,word2vec_w_1,word2vec_w_2,word2vec_w_3,word2vec_w_4,word2vec_w_5,word2vec_w_6,word2vec_w_7,word2vec_w_8,word2vec_w_9,...,word2vec_w_290,word2vec_w_291,word2vec_w_292,word2vec_w_293,word2vec_w_294,word2vec_w_295,word2vec_w_296,word2vec_w_297,word2vec_w_298,word2vec_w_299
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
W00000,0.169316,-0.063898,0.115286,-0.077671,0.067184,0.019339,0.039596,-0.026229,-0.160078,-0.054329,...,0.061151,0.044519,-0.194827,0.122456,0.122785,-0.154153,-0.116578,-0.127786,0.110593,-0.171084
W00001,1.548212,-1.052776,1.192632,0.760363,1.594398,1.478917,-1.555349,0.401968,1.588316,2.460358,...,-1.898932,0.129864,-2.062325,0.068316,0.540282,-1.68262,-0.81629,-1.464458,-0.361792,0.943322
W00002,0.934084,0.106135,-0.391749,-0.209661,-0.558696,-0.942362,-0.274353,-0.232077,-1.024267,0.028783,...,-0.357264,-0.451105,-0.724659,0.525233,0.290343,0.357838,-0.04275,1.315442,-0.167775,-0.393665
W00003,-1.407966,0.034697,-2.33398,1.991531,-0.073803,-0.966899,-0.926573,-2.161519,0.961513,0.382619,...,0.547399,1.300992,1.893234,0.833304,-0.531153,1.24911,-0.959852,2.486905,0.925652,-0.676427
W00004,-0.213362,0.478031,0.03314,-0.096893,0.077964,-0.149793,-0.163284,0.470706,0.138422,0.237029,...,-0.185111,-0.171314,0.166296,0.291282,-0.202618,0.192025,-0.145072,0.151652,0.35313,-0.073013


In [233]:
df_word_emb_load.head(0)

Unnamed: 0_level_0,word2vec_w_0,word2vec_w_1,word2vec_w_2,word2vec_w_3,word2vec_w_4,word2vec_w_5,word2vec_w_6,word2vec_w_7,word2vec_w_8,word2vec_w_9,...,word2vec_w_290,word2vec_w_291,word2vec_w_292,word2vec_w_293,word2vec_w_294,word2vec_w_295,word2vec_w_296,word2vec_w_297,word2vec_w_298,word2vec_w_299
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [146]:
word_emd_index=df_word_emb_load.index.values
i2w = {}
w2i = {}
for i,w in enumerate(word_emd_index):
    i2w[i]=w
    w2i[w]=i
    

In [236]:
len(i2w)

20891

In [109]:
# test sklearn pairwise function:
word_emb_distance = pairwise.pairwise_distances(df_word_emb_load)

In [234]:
word_emb_distance[0]

array([  0.        ,  19.31249096,   9.60848973, ...,   7.03316166,
        41.5048079 ,  38.10905227])

In [182]:
def eval_glove(default_emd, glove_ana, word_str, number = 200):
    """
    default_emd: the loaded default embedding model
    glove_ana: the loaded glove embedding model
    word_str: the word to search
    """
    # Evaluation of glove result with provided word embedding
    top_list_glove = set(glove_ana.most_similar(word_str, number=number))
    top_list_default = set(np.argsort(word_emb_distance[i2w[word_str]], axis=0)[1:number+1])
    

In [189]:
distance_martix = word_emb_distance[w2i['W00000']]

In [218]:
set_default_emd = set(np.argsort(distance_martix, axis=0)[:20000])
set_default_emd_word = set()
for index in set_default_emd:
    set_default_emd_word.add(i2w[index])
# set_default_emd_word

In [219]:
glove_test_simi = glove_ana.most_similar('W00000', number=20000)
# print glove_test_simi

In [220]:
# set_test = set()
glove_test_set = set(map(lambda x: x[0], glove_test_simi))
# glove_test_set

In [225]:
"""
set_default_emd_word
glove_test_set
"""
from __future__ import division
aa = set_default_emd_word.intersection(glove_test_set)
aa_union = set_default_emd_word.union(glove_test_set)
print len(aa_union)
print len(aa)
jacc = len(aa)/len(aa_union)
jacc

20705
19294


0.9318522096112051

In [237]:
list_example = [22,3,1]
np.average(list_example)

8.6666666666666661

In [239]:
i2w[9714]
import time
a = time.time()

In [240]:
b = time.time()
b - a

14.140982151031494

In [50]:
print('Dict size: %s' % len(corpus_word_model.dictionary))
print('Collocations: %s' % corpus_word_model.matrix.nnz)

Dict size: 20890
Collocations: 621606


In [251]:
glove_ana.word_vectors[:10]

array([[ 0.18402791, -0.21439893, -0.22737793, ...,  0.09530039,
        -0.08314777, -0.0532879 ],
       [ 0.15118236, -0.17986757, -0.21911889, ...,  0.0947846 ,
        -0.02492668, -0.04339767],
       [ 0.12791327, -0.14938839, -0.18769358, ...,  0.11732095,
        -0.0652537 , -0.08256516],
       ..., 
       [ 0.11140705, -0.11735928, -0.13390457, ..., -0.02423547,
        -0.04102832, -0.03057076],
       [ 0.15897289, -0.1862586 , -0.22906425, ...,  0.0140669 ,
        -0.04472972, -0.07458474],
       [ 0.10448041, -0.10471308, -0.15509712, ...,  0.11384611,
        -0.05147546, -0.10515619]])

In [252]:
glove_ana.dictionary['W00000']

12515

In [260]:
glove_ana.most_similar('W00000', 3)

[('W15404', 0.92140551054445341), ('W07819', 0.92124957721456568)]

In [256]:
w0 = glove_ana.word_vectors[12515]
w15404 = glove_ana.word_vectors[glove_ana.dictionary['W15404']]

In [261]:
w07819 = glove_ana.word_vectors[glove_ana.dictionary['W07819']]
(np.sum(w0 - w07819))/300

0.00040612257463097889

In [259]:
(np.sum(w0 - w15404))/300

0.00039481297297046535

In [263]:
i = 0
for item in glove_ana.dictionary:
    print item, glove_ana.dictionary[item]
    i += 1
    if i > 5:
        break 

W06252 16033
W14512 19964
W13362 7998
W13363 11622
W03989 4986
W13361 4098


In [273]:
i = 1
with open('glove_words_embed.txt', 'a+') as dictfile:
      for item in glove_ana.dictionary:
        i += 1
        dictfile.write(item + ' ')
        dictfile.write(' '.join(str(i) for i in glove_ana.word_vectors[glove_ana.dictionary[item]]))
        dictfile.write('\n')
        if i > 100:
            break