# t-SNE of a pretrained GloVe model on wiki

1. pip install gensim, tsne, bokeh
2. Download a pretrained GloVe model from https://nlp.stanford.edu/projects/glove/
3. Load the model in gensim (faster loading)
4. Run t-SNE
5. Plot with bokeh

In [1]:
# imports
from tsne import bh_sne
from bokeh.plotting import figure, output_file, show, ColumnDataSource
from bokeh.models import HoverTool, BoxSelectTool
import numpy as np
from gensim.models import KeyedVectors
import datetime

In [11]:
# Load the pre-trained GloVe vectors from the plain-text file and time the load.
# gensim expects a word2vec-style header, so make sure you add "400000 300"
# (vocabulary size, vector dimension) as the first line of the .txt file.
# You can also pass limit=N to load only the first N words in case you don't need all.
st = datetime.datetime.now()
word_vectors = KeyedVectors.load_word2vec_format('glove/glove.6B.300d.txt', binary=False)
# print() call form is valid on both Python 2 and Python 3
print(datetime.datetime.now() - st)

0:03:35.795689


In [5]:
# Prepare a copy of the model that loads much more quickly next time.
# Unit-normalise the vectors; replace=True does this destructively, in place.
word_vectors.init_sims(replace=True)
# Write the normalised model out in binary format.
gensim_binary_path = 'glove/glove.6B.300d-gensim.bin'
word_vectors.save(gensim_binary_path)

In [16]:
# Load the previously saved binary model and time the load.
st = datetime.datetime.now()
glove_model = KeyedVectors.load('glove/glove.6B.300d-gensim.bin', mmap='r')  # memory-mapped from disk
# The vectors were unit-normalised before saving, so alias them as the normed
# vectors to prevent gensim from recalculating them.
glove_model.syn0norm = glove_model.syn0
# print() call form is valid on both Python 2 and Python 3
print(datetime.datetime.now() - st)
# for more details check this post https://stackoverflow.com/questions/42986405/how-to-speed-up-gensim-word2vec-model-load-time

0:00:04.069330


In [14]:
# sanity check: list the words closest to 'queen' together with their
# similarity scores; the neighbours should be semantically related
glove_model.most_similar('queen')

[(u'elizabeth', 0.6771447062492371),
 (u'princess', 0.6356764435768127),
 (u'king', 0.6336469650268555),
 (u'monarch', 0.5814188122749329),
 (u'royal', 0.543052613735199),
 (u'majesty', 0.5350356698036194),
 (u'victoria', 0.5239557027816772),
 (u'throne', 0.5097099542617798),
 (u'lady', 0.5045416951179504),
 (u'crown', 0.49980059266090393)]

In [9]:
# same sanity check with a technical term
glove_model.most_similar('php')

[(u'javascript', 0.6667327880859375),
 (u'scripting', 0.598360002040863),
 (u'perl', 0.5901949405670166),
 (u'mysql', 0.5828665494918823),
 (u'server-side', 0.5826087594032288),
 (u'c++', 0.5619826316833496),
 (u'runtime', 0.5475314855575562),
 (u'open-source', 0.529973030090332),
 (u'sql', 0.521233320236206),
 (u'backend', 0.5180888772010803)]

In [10]:
# Copy the normalised embedding matrix into a float64 numpy array
# (one row per vocabulary word) and display its dimensions.
vectors = np.array(glove_model.syn0norm, dtype=np.float64)
vectors.shape

(400000, 300)

In [15]:
# Embed only a window of the large vocabulary with t-SNE.
start, end = 1500, 2000
window = slice(start, end)
# Labels and vectors are taken with the same window so they stay aligned.
words = glove_model.index2word[window]
vis_data = bh_sne(vectors[window])

In [16]:
# Draw the 2-D embedding as a bokeh scatter plot written to an HTML file.
vis_x, vis_y = vis_data[:, 0], vis_data[:, 1]

output_file("glove_tsne.html")

# One entry per point: coordinates plus the word used for the hover label.
source = ColumnDataSource(data={'x': vis_x, 'y': vis_y, 'desc': words})

# Tooltip showing the word stored in the 'desc' column.
hover = HoverTool(tooltips=[("word", "@desc")])

p = figure(
    title="Glove Tsne",
    plot_width=800,
    plot_height=800,
    tools=[hover, "wheel_zoom"],
)
p.circle('x', 'y', size=20, source=source)
show(p)