In [2]:
import gensim
import pandas as pd
import numpy
import plotly.offline as py
import plotly.graph_objs as go

In [5]:
# Load the pre-trained word2vec vectors (stored in the binary word2vec format).
WORD2VEC_PATH = '~/Documents/GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

In [4]:
# Load the search-keyword table from the system clipboard.
# Columns used later in this notebook:
#   Key     - the keyword that was searched by users
#   search  - how many searches were performed with that keyword (popularity)
#   Results - how many results the user saw on screen on average
#             (not the total number of search results)
my_csv = pd.read_clipboard()
# Index the rows by keyword. Passing the Series itself (not the column label)
# keeps `Key` available as a regular column as well.
my_csv = my_csv.set_index(my_csv.Key, drop=True)
words = my_csv.index.tolist()

In [6]:
# Three independent, initially empty accumulators:
#   vectors        - word vectors of the recognised search keywords
#   labels         - the search keywords the model recognised
#   non_classified - the search keywords the model did not recognise
vectors, labels, non_classified = [], [], []

In [7]:
# Look up every search keyword in the word2vec model and split the keywords
# into recognised ones (kept together with their vectors) and unrecognised ones.
# The lists are reset first so that re-running this cell does not append
# every keyword a second time.
vectors = []
labels = []
non_classified = []
for word in words:
    try:
        vector = model.word_vec(word)  # vector representation of the keyword
    except KeyError:
        # Only a vocabulary miss counts as "non-classified"; any other error
        # (including a keyboard interrupt) is allowed to surface.
        non_classified.append(word)
    else:
        vectors.append(vector)
        labels.append(word)

# summary
print('Total words: ' + str(len(words)))
print('Classified Words: ' + str(len(labels)))
print('Non-classified Words: ' + str(len(non_classified)))


Total words: 2169
Classified Words: 1897
Non-classified Words: 272


In [8]:
# Run t-SNE to reduce the word vectors from 300 dimensions to 2.
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# t-SNE is stochastic; fixing the seed makes the layout reproducible between runs.
TSNE_SEED = 42
plot_only = 1500  # embed at most this many of the recognised keywords
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` -
# confirm against the installed version before upgrading.
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000,
            method='exact', random_state=TSNE_SEED)
low_dim_embs = tsne.fit_transform(vectors[:plot_only])

In [10]:
# Popularity varies a lot between keywords, and dot size (which visualises
# popularity) is sensitive to that variation. The search counts are therefore
# rescaled linearly: the most popular keyword gets size 42.5 and a keyword with
# zero searches would get size 2.5.
# `plot_only` keeps this selection aligned with the rows passed to t-SNE;
# `.copy()` makes it explicit that a new column is added to an independent frame.
matched_keywords = my_csv.loc[labels[:plot_only]].copy()
# Named `max_search` so the builtin `max` is not shadowed for later cells.
max_search = matched_keywords['search'].max()
matched_keywords['search_rescaled'] = (matched_keywords['search'] / max_search) * 40 + 2.5
sizes = matched_keywords['search_rescaled'].tolist()

In [11]:
# Function which will draw the dot plot.
py.init_notebook_mode(connected=True)
def plot_with_plotly(low_dim_embs, labels, sizez, color):
    """Render an interactive scatter plot of 2-D embeddings in the notebook.

    Parameters:
        low_dim_embs: 2-D array; column 0 is used as x and column 1 as y.
        labels: text attached to each point (one entry per point).
        sizez: marker size for each point.
        color: per-point values mapped onto the 'YlGnBu' colour scale.

    Returns:
        None. The figure is displayed via `py.iplot`.

    Raises:
        AssertionError: if there are more labels than embedding rows.
    """
    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
    # Both axes are purely decorative here, so grid, zero line, axis line,
    # ticks and tick labels are all hidden. The obsolete `autotick` property is
    # intentionally not set: current plotly versions reject it, and automatic
    # tick placement is the default anyway.
    hidden_axis = dict(
        autorange=True,
        showgrid=False,
        zeroline=False,
        showline=False,
        ticks='',
        showticklabels=False,
    )
    layout = go.Layout(xaxis=dict(hidden_axis), yaxis=dict(hidden_axis))
    trace = go.Scatter(
        x=low_dim_embs[:, 0],
        y=low_dim_embs[:, 1],
        text=labels,
        mode='markers',
        marker=dict(size=sizez, color=color, colorscale='YlGnBu'),
    )
    fig = go.Figure(data=[trace], layout=layout)
    py.iplot(fig, filename = '/plot')

In [14]:
# Draw the dot plot: dot size encodes popularity, dot colour encodes `Results`.
# `plot_only` (rather than a repeated literal) keeps the labels in step with
# the number of rows that were embedded.
plot_with_plotly(low_dim_embs, labels[:plot_only], sizes, matched_keywords.Results.tolist())

In [20]:
# Build the table intended for the Tableau viz: the matched keywords plus their
# 2-D t-SNE coordinates. `.assign` returns a new frame, so `matched_keywords`
# itself is left untouched (a plain `final_csv = matched_keywords` would only
# create an alias). Writing the CSV file is not done in this cell.
final_csv = matched_keywords.assign(x=low_dim_embs[:, 0], y=low_dim_embs[:, 1])
final_csv.head()

Unnamed: 0_level_0,Key,search,Results,search_rescaled,x,y
Key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
hearts,hearts,37893,85.971478,42.5,41.587002,39.519398
christmas,christmas,24402,191.22832,28.258847,-53.159241,10.016877
heart,heart,15517,116.183752,18.879806,42.42075,39.053967
thanksgiving,thanksgiving,12588,112.246734,15.787942,-1.051168,64.658424
aesthetic,aesthetic,10646,401.576935,13.73796,17.36491,-30.634157


Now we can much more easily see and analyse each keyword by cluster, popularity and the average number of results seen by users, and gather more insights about user search behavior.