**Loading Requirements**

In [None]:
!pip install 'umap-learn==0.3.10'

In [15]:
import pandas as pd
import os

In [None]:
from google.colab import drive 
# Mount Google Drive at /content/gdrive; the data path configured below lives under this mount point.
drive.mount('/content/gdrive')

**Loading Clean Text File From Data Folder**

In [17]:
# Location of the data folder from the provided zip.
# Set the SDM_DATA_DIR environment variable to point somewhere else; the Google Drive
# location below is only the fallback, so the notebook does not have to be edited per machine.
path = os.environ.get(
    'SDM_DATA_DIR',
    r'/content/gdrive/MyDrive/3. IU Courses/Courses/2. Social Data Mining/Social Media Project/ILS-Z639 Final Project Deliverable - Abhinav Bajpai/data',
)

In [18]:
# Load CleanText.pkl from the data folder (presumably a DataFrame: later cells read its `lemmas` column).
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
Top5BrandSample = pd.read_pickle(os.path.join(path,'CleanText.pkl'))

In [19]:
# Raw array of the `lemmas` column (presumably one token list per row); fed to the phrase model below.
sentences = Top5BrandSample['lemmas'].values

In [20]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser
# Train a collocation (bigram) model on the lemma sentences; detected pairs are joined with '_',
# which is the form of search terms such as 'customer_service' used in the plotting cells.
bigram = Phrases(sentences, min_count=1, threshold=3, delimiter=b'_')
# Wrap the trained Phrases model in a Phraser, which is what gets applied to each sentence.
bigram_phraser = Phraser(bigram)

In [21]:
# Apply the bigram phraser to every sentence and store the results as a new column.
# Assigning a list is positional, so rows stay aligned whatever the DataFrame index is;
# label-based writes such as `.at[i, ...]` with a manual counter only line up when the
# index is exactly 0..n-1 and would otherwise create or overwrite the wrong rows.
Top5BrandSample['lemmasNgrams'] = [bigram_phraser[sent] for sent in sentences]

In [None]:
# Side-by-side look at the original lemmas and their phrase-merged version (first five rows).
Top5BrandSample.loc[:, ['lemmas', 'lemmasNgrams']].head()

In [23]:
# Phrase-merged sentences as a raw array; this is the corpus the embedding models are trained on.
lemmaSenetence = Top5BrandSample['lemmasNgrams'].values

**Topic Models Evaluation**

In [24]:
from gensim.models import word2vec, FastText
from gensim.models import KeyedVectors

In [25]:
# Baseline embedding: Word2Vec with 50-dimensional vectors, min_count=3 and 20 training iterations.
# No `sg` argument is passed, so gensim's default training algorithm is used.
# NOTE(review): neither `workers=1` nor an explicit `seed` is set — gensim documents that
# multi-worker training is not fully reproducible; confirm whether run-to-run stability matters here.
model = word2vec.Word2Vec(lemmaSenetence, size=50, min_count=3, iter=20) # Original Used Model

In [26]:
# Benchmark embedding: same corpus and hyper-parameters as `model`, but trained with sg=1 (skip-gram).
# NOTE(review): despite the variable name this is a Word2Vec model, not gensim's FastText class
# (which is imported but never used) — confirm whether a real FastText benchmark was intended.
modelFastText = word2vec.Word2Vec(lemmaSenetence, size=50, min_count=3, iter=20, sg=1) # Benchmarked Model

In [None]:
# Check 1: ten nearest neighbours of 'battery' in the baseline model
model.wv.most_similar('battery', topn=10)

In [None]:
# Check 2: ten nearest neighbours of 'battery' in the benchmark model.
# Query through `.wv`, as Check 1 does: calling most_similar on the model object itself is
# the deprecated spelling in gensim 3.x and no longer exists in gensim 4.
modelFastText.wv.most_similar('battery', topn=10)

In [29]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from umap import UMAP
import plotly.express as px
import plotly.graph_objs as go

In [None]:
# Stand-alone 2-D UMAP projection (cosine metric) of the baseline model's vocabulary.
reducer = UMAP(n_components=2, metric='cosine', n_neighbors=5, min_dist=0.1)
words = model.wv.vocab
# Look vectors up on the KeyedVectors (`model.wv`): indexing the model object directly
# (`model[word]`) is deprecated in gensim 3.x and removed in gensim 4.
wv_ = [model.wv[word] for word in words]

In [31]:
"""Find the closest word in a given list of search words, if in top-n."""
def closest(word, model, search, topn):
    """Map ``word`` to its closest search word, or to ``'other'``.

    Args:
        word: The word to classify.
        model: Object exposing ``model.wv.most_similar_to_given`` and
            ``model.wv.most_similar`` (e.g. a trained gensim Word2Vec model).
        search: List of candidate search words.
        topn: Size of the neighbourhood of the closest search word in which
            ``word`` must appear to be assigned to it.

    Returns:
        The search word closest to ``word`` if ``word`` is that search word
        itself or one of its ``topn`` most similar words; otherwise ``'other'``.
    """
    closest_word = model.wv.most_similar_to_given(word, search)
    if word == closest_word:
        return closest_word
    # Honour the caller's `topn`; a fixed neighbourhood size of 5 would silently
    # ignore the parameter.
    neighbours = {w for w, _ in model.wv.most_similar(closest_word, topn=topn)}
    return closest_word if word in neighbours else 'other'

In [None]:
 # Fit the UMAP reducer on the collected word vectors and get their low-dimensional coordinates.
 reduced_wv = reducer.fit_transform(wv_)

In [33]:
 # Wrap the projected coordinates in a DataFrame with one column per axis ('x', 'y').
 df = pd.DataFrame.from_records(reduced_wv, columns=['x', 'y'])

In [35]:
def plot_embeddings(model, search=None, topn=0, show_all=False, train_all=False,
                    labels=False, colors=True, n_dims=2, algo='pca', **kwargs):
    """Create a plotly scatter plot of dimensionality-reduced word embeddings.

    Args:
        model: Trained gensim (3.x API) model; only ``model.wv`` is used
            (``vocab``, ``vectors``, item lookup, ``most_similar``,
            ``most_similar_to_given``).
        search: Words of interest. They are drawn larger and labelled, and each
            contributes its ``topn`` most similar words to the plot. ``None`` or
            an empty sequence plots the whole vocabulary.
        topn: Number of neighbours fetched per search word; also the
            neighbourhood size used when colouring words by closest search word.
        show_all: Plot every word in the vocabulary (implies ``train_all``).
        train_all: Fit the reducer on all vectors of the model instead of only
            on the plotted words.
        labels: Label every plotted word instead of only the search words.
        colors: Colour points by their closest search word (only when ``search``
            is non-empty).
        n_dims: 2 for a flat scatter plot, 3 for a 3-D scatter plot.
        algo: ``'umap'``, ``'tsne'``, or anything else for PCA.
        **kwargs: Extra arguments for the reducer. ``perplexity`` is dropped
            unless ``algo == 'tsne'``; ``n_neighbors``, ``min_dist`` and
            ``spread`` are dropped unless ``algo == 'umap'``.

    Returns:
        The plotly figure. It is returned, not shown.

    Raises:
        ValueError: If ``n_dims`` is neither 2 nor 3.
    """

    def closest(word, model, search, topn):
        """Find the closest word in a given list of search words, if in top-n."""
        closest_word = model.wv.most_similar_to_given(word, search)
        if word == closest_word or \
           word in [w for w, _ in model.wv.most_similar(closest_word, topn=topn)]:
            return closest_word
        else:
            return 'other'

    # Only 2-D and 3-D plots are implemented below; fail early with a clear message.
    if n_dims not in (2, 3):
        raise ValueError(f"n_dims must be 2 or 3, got {n_dims!r}")

    # `None` replaces a mutable default argument; copy so the caller's list is never shared.
    search = list(search) if search is not None else []

    # eliminate kwargs of other methods if supplied
    if algo != 'tsne':
        kwargs.pop('perplexity', None)
    if algo != 'umap':
        kwargs.pop('n_neighbors', None)
        kwargs.pop('min_dist', None)
        kwargs.pop('spread', None)

    # define the reducer
    if algo == 'umap':
        reducer = UMAP(n_components=n_dims, metric='cosine', **kwargs)
    elif algo == 'tsne':
        reducer = TSNE(n_components=n_dims, **kwargs)
    else:
        reducer = PCA(n_components=n_dims, **kwargs)

    if len(search) == 0:  # no search words: show all
        show_all = True
    if show_all:  # to show all, all must be trained
        train_all = True

    # identify words to plot
    if show_all:
        words = [w for w in model.wv.vocab]
    else:
        words = search + [sim_word for w in search
                          for sim_word, _ in model.wv.most_similar(w, topn=topn)]
        words = list(set(words))  # make the word list unique for t-SNE

    # reduce
    # Look vectors up on `model.wv`: `model[word]` is deprecated in gensim 3.x and gone in 4.
    wv = [model.wv[word] for word in words]
    if not train_all:
        print(f"Calculating {algo} for {len(words)} words ...", end="")
        reduced_wv = reducer.fit_transform(wv)
    else:
        print(f"Calculating {algo} for {len(model.wv.vocab)} words ...", end="")
        if hasattr(reducer, 'transform'):
            reducer.fit(model.wv.vectors)
            reduced_wv = reducer.transform(wv)
        else:
            # scikit-learn's TSNE has no transform(): embed the full vocabulary once and
            # pick the rows of the plotted words via their vocabulary index.
            embedded_all = reducer.fit_transform(model.wv.vectors)
            reduced_wv = embedded_all[[model.wv.vocab[w].index for w in words]]
    print(" done.")

    # create data frame for plotly express visualization
    # with x, y (, z) and meta data for styling
    if n_dims == 2:
        df = pd.DataFrame.from_records(reduced_wv, columns=['x', 'y'])
    else:
        df = pd.DataFrame.from_records(reduced_wv, columns=['x', 'y', 'z'])

    df['word'] = words
    params = {}

    if show_all:
        df['size'] = 1
        params.update({'size_max': 3, 'size': 'size'})
    else:
        df['size'] = df['word'].map(lambda w: 30 if w in search else 5)
        params.update({'size': 'size'})

    if len(search) > 0:  # colorize with closest search word
        df['label'] = df['word'].map(lambda w: w if labels or w in search else '')
        params.update({'text': 'label'})
        if colors:
            df['color'] = df['word'].apply(closest, model=model, search=search, topn=topn)
            params.update({'color': 'color'})

    # generate scatter plot
    if n_dims == 2:
        params.update({'width': 900, 'height': 500})
        fig = px.scatter(df, x="x", y="y", opacity=0.3, **params)
        fig.update_xaxes(showticklabels=False, showgrid=True, title='', zeroline=False, visible=True)
        fig.update_yaxes(showticklabels=False, showgrid=True, title='', zeroline=False, visible=True)
    else:
        params.update({'width': 900, 'height': 900})
        df['z'] = df['z']*2/3  # scale 3d box
        fig = px.scatter_3d(df, x="x", y="y", z="z", opacity=0.5, **params)
        fig.update_layout(scene=dict(xaxis=go.layout.scene.XAxis(title='', showticklabels=False),
                                     yaxis=go.layout.scene.YAxis(title='', showticklabels=False),
                                     zaxis=go.layout.scene.ZAxis(title='', showticklabels=False)))
    fig.update_traces(textposition='middle center', marker={'line': {'width': 0}})
    fig.update_layout(font=dict(family="Franklin Gothic", size=10, color="#000000"))
    return fig

**Plotting Embedding Graphs**

In [None]:
# Aspect terms to highlight in the benchmark model's embedding plot.
search = [
    'battery', 'screen', 'volume', 'memory', 'return_policy', 'customer_service', 'value',
    'durable', 'network', 'freeze', 'camera', 'design', 'user_interface', 'security',
]
plot_embeddings(
    modelFastText, search,
    topn=5, show_all=False, labels=True,
    algo='umap', n_neighbors=15, min_dist=10, spread=25,
)

In [None]:
# Same aspect terms, plotted for the baseline model.
search = [
    'battery', 'screen', 'volume', 'memory', 'return_policy', 'customer_service', 'value',
    'durable', 'network', 'freeze', 'camera', 'design', 'user_interface', 'security',
]
plot_embeddings(
    model, search,
    topn=5, show_all=False, labels=True,
    algo='umap', n_neighbors=15, min_dist=10, spread=25,
)

**Plotting Context Word Graphs**

In [38]:
import networkx as nx
import collections

In [39]:
def sim_tree(model, word, top_n, max_dist):
    """Build a breadth-first graph of similar words around ``word``.

    Starting at ``word`` (stored with ``dist=0``), each visited node contributes
    its ``top_n`` most similar words from ``model.wv.most_similar``. A word is
    added only the first time it is seen, together with an edge from the node
    that discovered it. Nodes carry a ``dist`` attribute (hops from the start
    word); edges carry ``sim`` (the similarity score) and ``dist``. Expansion
    stops once a node's children would lie further than ``max_dist`` hops away.

    Returns:
        The resulting ``networkx.Graph``.
    """
    tree = nx.Graph()
    tree.add_node(word, dist=0)
    queue = collections.deque([word])
    while queue:
        current = queue.popleft()  # visit next node
        depth = tree.nodes[current]['dist'] + 1
        if depth > max_dist:
            continue  # children would be too far away; do not expand
        for neighbour, similarity in model.wv.most_similar(current, topn=top_n):
            if neighbour in tree:
                continue  # already discovered via an earlier node
            queue.append(neighbour)
            tree.add_node(neighbour, dist=depth)
            tree.add_edge(current, neighbour, sim=similarity, dist=depth)
    return tree


In [40]:
from networkx.drawing.nx_pydot import graphviz_layout
def plot_tree(graph, node_size=1000, font_size=12):
    """Draw a similarity graph with a Graphviz 'twopi' layout.

    The first node of ``graph`` is used as the layout root. Nodes are coloured
    by their ``dist`` attribute using the 'Set1' colormap, and every edge is
    drawn with a line width equal to its ``sim`` attribute. The function only
    issues the drawing calls; it does not call ``plt.show()`` itself.
    """
    root_node = list(graph.nodes)[0]
    layout = graphviz_layout(graph, prog='twopi', root=root_node)
    depth_colors = [graph.nodes[node]['dist'] for node in graph]  # colorize by distance
    nx.draw_networkx_nodes(graph, layout, node_size=node_size, node_color=depth_colors,
                           cmap='Set1', alpha=0.4)
    nx.draw_networkx_labels(graph, layout, font_size=font_size)
    # Edges are drawn one at a time so each can get its own width.
    for source, target, similarity in graph.edges(data='sim'):
        nx.draw_networkx_edges(graph, layout, [(source, target)], width=similarity, alpha=0.2)


In [None]:
# Similarity tree around 'battery' in the baseline model: 5 neighbours per node, at most 2 hops.
graph = sim_tree(model, word='battery', top_n=5, max_dist=2)
plot_tree(graph, node_size=500, font_size=8)


In [None]:
# Similarity tree around 'value' in the benchmark model: 5 neighbours per node, at most 2 hops.
graph = sim_tree(modelFastText, word='value', top_n=5, max_dist=2)
plot_tree(graph, node_size=500, font_size=8)

**Reference** : 
Blueprints for Text Analytics Using Python
by Jens Albrecht, Sidharth Ramachandran, and Christian Winkler
https://github.com/blueprints-for-text-analytics-python/blueprints-text#readme 