In [1]:
import csv
import collections
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
from scipy.stats import entropy
import networkx as nx
from operator import itemgetter

from sklearn.metrics.pairwise import pairwise_distances

import matplotlib.pyplot as plt
from matplotlib import pyplot, patches

import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode,  iplot, plot
import plotly.graph_objs as go
import plotly.offline as offline



In [2]:
# Initialise plotly's offline notebook mode (connected=True) before any figure is shown.
init_notebook_mode(connected=True)

In [3]:
def get_relevant_words(vis, lam=0.3, topn=10):
    """Return the `topn` highest-scoring terms of every topic.

    Every row of ``vis.topic_info`` is scored with
    ``lam * logprob + (1 - lam) * loglift``; within each ``Category`` the
    `topn` rows with the largest score are kept.

    Parameters
    ----------
    vis : object with a ``topic_info`` DataFrame that has the columns
        ``Category``, ``Term``, ``logprob`` and ``loglift``.
    lam : float
        Weight given to ``logprob``; ``loglift`` gets ``1 - lam``.
    topn : int
        Number of terms kept per category.

    Returns
    -------
    pandas.DataFrame
        One row per category other than ``'Default'``, in sorted category
        order, with the columns ``'Topic'`` (the category label) and
        ``'words with Relevance'`` (a ``set`` of the selected terms, so the
        ranking order inside a topic is not preserved).
    """
    # Score on a private copy: adding the helper column to vis.topic_info
    # itself would modify the caller's object on every call.
    scored = vis.topic_info.loc[:, ['Category', 'Term', 'logprob', 'loglift']].copy()
    scored['finalscore'] = scored['logprob'] * lam + (1 - lam) * scored['loglift']

    words_by_topic = {}
    # groupby yields the categories in sorted key order
    for category, group in scored.groupby('Category'):
        if category == 'Default':
            continue
        best = group.sort_values(by='finalscore', ascending=False).head(topn)
        if best.empty:
            # nothing to report for a category without rows
            continue
        words_by_topic[category] = set(best['Term'])

    # Build the frame in one step from plain lists instead of assigning
    # dict views to the columns of an empty frame.
    return pd.DataFrame(
        {'Topic': list(words_by_topic.keys()),
         'words with Relevance': list(words_by_topic.values())},
        columns=['Topic', 'words with Relevance'])

In [4]:
import dash
import dash_core_components as dcc
import dash_html_components as html
import pandas as pd
import plotly.graph_objs as go

In [5]:
def jensen_shannon(_P, _Q):
    """Jensen-Shannon divergence (natural log) between two non-negative vectors.

    Parameters
    ----------
    _P, _Q : array-like of equal length holding non-negative weights. They do
        not have to sum to one; each is normalised to a probability
        distribution first.

    Returns
    -------
    float
        ``0.5 * KL(p || m) + 0.5 * KL(q || m)`` with ``m = 0.5 * (p + q)``.
        The result is ``nan`` if either input sums to zero.
    """
    p = np.asarray(_P, dtype=float)
    q = np.asarray(_Q, dtype=float)
    # Normalise BEFORE mixing. scipy's entropy() rescales each argument on its
    # own, so a mixture built from raw weights would be dominated by whichever
    # vector has the larger total and would not be the midpoint distribution.
    p = p / p.sum()
    q = q / q.sum()
    m = 0.5 * (p + q)
    return 0.5 * (entropy(p, m) + entropy(q, m))

# Create Data for Dash

In [6]:
# Read the table from disk and discard the unnamed first CSV column
# (presumably the row index written out when the file was saved).
df = pd.read_csv('topic2word.csv').drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11178,11179,11180,11181,11182,11183,11184,11185,11186,11187
0,1088.772461,7.442929,170.38591,3.718293,104.670845,25.401827,400.539246,73.061195,8.35527,267.356506,...,0.055556,0.116927,0.055556,0.089427,0.056167,1.644144,0.184998,0.055556,0.277843,0.105163
1,385.975311,1969.053833,254.110367,2.937806,80.663391,64.056412,4335.508789,15.46708,6.310345,102.310829,...,0.055556,0.065265,0.066038,0.193462,0.059098,0.055703,0.158525,0.422798,0.065673,0.061143
2,4181.594727,183.127625,582.997559,3.372852,106.191917,69.522018,1655.041504,101.754631,5.06276,319.945129,...,0.087712,0.065811,8.675198,0.119221,0.116562,0.094867,0.253083,0.30333,0.11665,0.156917
3,1015.273926,12.21583,273.893524,0.831712,94.193939,59.994728,222.154495,18.197172,2.814182,206.305817,...,0.087026,0.056556,0.055556,0.102242,0.055556,0.056007,0.0807,0.497465,3.751893,0.057585
4,2994.510986,186.212708,208.836502,1.336877,91.610786,24.161583,1682.09314,239.070038,1.738698,118.172089,...,0.081216,0.821388,0.055725,0.055556,0.05899,0.06414,0.074667,0.055556,2.693255,0.133604


In [7]:
# Display name of every topic id. The names are listed in topic order, so the
# entry at position i (counting from 1) belongs to 'Topic<i>'.
topic_name_mapper = {
    'Topic{}'.format(number): label
    for number, label in enumerate([
        'Cost & Quality',
        'Bars',
        'Casino Hotel',
        'Fine Dining',
        'Asian',
        'Pizza',
        'Steakhouse',
        'Italian',
        'Coffee Shop',
        'High Customer Satisfaction',
        'Night Club',
        'Wait Time',
        'Mexican',
        'Lunch',
        'Sushi',
        'Fast Food',
        'Breakfast',
        'Low Customer Satisfaction',
    ], start=1)
}

## Extract top 5 representative words for each topic

In [8]:
# Load the pickled object stored in 'vis.pkl'; later cells read its
# `topic_info` table through get_relevant_words.
# NOTE(review): unpickling can execute code embedded in the file -- only load
# pickles that come from a trusted source.
vis = pd.read_pickle('vis.pkl')

In [9]:
import pyLDAvis
# Switch on pyLDAvis' notebook integration (presumably so that `vis` renders
# inline when displayed -- confirm against the pyLDAvis docs).
pyLDAvis.enable_notebook()

In [10]:
# Bare expression: shows the loaded `vis` object as this cell's output.
vis

In [11]:
# Token percentage of every topic, hard-coded in topic order:
# the value at position i (counting from 1) belongs to 'Topic<i>'.
topic2tokenpercent = dict(zip(
    ('Topic%d' % number for number in range(1, 19)),
    (9.1, 4.7, 2.7, 7.3, 4.8, 2.1,
     1.4, 3.1, 1.5, 9.8, 6.5, 9.5,
     2.6, 10.4, 3, 2.2, 8.4, 10.7),
))

In [12]:
def get_top_n_words_list(num_topics, vis, lam=0.6, topn=5):
    """Build one display string of top words per topic, ordered Topic1 ... Topic<num_topics>.

    num_topics: number of topics to report (Topic1 through Topic<num_topics>)
    vis: pyLDAvis object, passed on to get_relevant_words
    lam: relevance value, passed on to get_relevant_words
    topn: number of words kept per topic
    Returns a list with one string per topic: that topic's words joined by ', '.
    """
    # Series of word collections, addressable by topic id
    words_by_topic = get_relevant_words(vis, lam, topn).set_index('Topic')['words with Relevance']
    display_strings = []
    for number in range(1, num_topics + 1):
        topic_words = words_by_topic.loc['Topic' + str(number)]
        display_strings.append(', '.join(topic_words))
    return display_strings
    
    
def get_topic_size_ord(num_topics, topic2tokenpercent):
    """Collect the token percentages of Topic1 ... Topic<num_topics>, in that order.

    num_topics: number of topics to look up
    topic2tokenpercent: dict linking topic names ('Topic1', ...) to token percentages
    Returns the looked-up percentages as a list.
    """
    sizes_in_order = []
    for number in range(1, num_topics + 1):
        sizes_in_order.append(topic2tokenpercent['Topic' + str(number)])
    return sizes_in_order

In [13]:
# Sample usage: element [0] of the returned list is the display string for
# Topic1 (here its top 5 words at lam=0.6).
get_top_n_words_list(num_topics=18, vis=vis, lam=0.6, topn=5)[0]

'price, food, good, pretty, quality'

# Create the Network

In [14]:
# Plain NumPy array of the table's values. `DataFrame.as_matrix()` is
# deprecated and no longer exists in pandas >= 1.0; `.values` returns the same
# array and is available in old and new pandas alike.
data = df.values

In [15]:
# Pairwise dissimilarity between every pair of rows of `data`, computed with
# the jensen_shannon callable defined above. Each row is presumably one topic
# and each column one word weight -- TODO confirm against topic2word.csv.
pairwise_dist = pairwise_distances(X=data, metric=jensen_shannon)

In [16]:
# arbitrary cut-offs used to decide whether 2 observations count as 'similar'
threshold_all = [0.1, 0.11, 0.14, 0.18, 0.19, 0.2, 0.23, 0.25]


def th_mark(x, threshold_all):
    """Slider label for cut-off `x`: 'Low' for the smallest value in
    `threshold_all`, 'High' for the largest, '' for anything else."""
    if x == np.min(threshold_all):
        return 'Low'
    return 'High' if x == np.max(threshold_all) else ''


# slider marks: str(cut-off) -> label shown at that position
threshold_mark = dict((str(cutoff), th_mark(cutoff, threshold_all)) for cutoff in threshold_all)

In [17]:
# One 0/1 matrix per cut-off: entry (i, j) is 1 where the pairwise distance is
# strictly greater than the cut-off and 0 otherwise.
# NOTE(review): this links pairs that are FARTHER apart than the cut-off, while
# the cut-offs are described as deciding whether observations are 'similar' --
# confirm that `>` rather than `<` is the intended direction.
adjacency = [(pairwise_dist > threshold).astype(int) for threshold in threshold_all]

In [18]:
# lookup table: threshold value -> the adjacency matrix computed for it
thresh_to_adj = dict(zip(threshold_all, adjacency))

In [19]:
def create_graph(adj):
    """Build a networkx graph from an adjacency matrix and drop its isolates.

    adj: square NumPy adjacency matrix (non-zero entry = edge)
    Returns the graph with every isolated node removed. The remaining nodes
    keep the ids they were created with, so the graph can hold fewer nodes
    than `adj` has rows.
    """
    # `from_numpy_matrix` was removed in networkx 3.0; `from_numpy_array` is its
    # replacement. Fall back to the old name on installs that predate it.
    build_graph = getattr(nx, 'from_numpy_array', None) or nx.from_numpy_matrix
    G = build_graph(adj)
    # materialise the isolates before removing them so the graph is not
    # modified while it is still being iterated
    G.remove_nodes_from(list(nx.isolates(G)))
    return G

In [20]:
# lookup table: threshold value -> graph built from that threshold's adjacency matrix
thresh_to_graph = dict(zip(threshold_all, map(create_graph, adjacency)))

In [21]:
# Hand-picked `k` argument for the Fruchterman-Reingold layout, one per
# threshold value, written as (threshold, k) pairs.
threshold2k = dict([
    (0.10, 0.7),
    (0.11, 10),
    (0.14, 0.9),
    (0.18, 20),
    (0.19, 10),
    (0.20, 4),
    (0.23, 10),
    (0.25, 10),
])

In [22]:
%%time
# extract node positions
fruchterman_iter = 1000
# fixed seed so that every run of the notebook produces the same layouts
LAYOUT_SEED = 0

# map threshold values to positions of nodes
thresh_to_pos = {}

# The force-directed layout starts from random positions. When no seed is
# passed, networkx takes its random numbers from NumPy's global generator, so
# seeding that generator here makes the layouts reproducible without relying
# on a version-specific keyword argument.
np.random.seed(LAYOUT_SEED)
for thresh in thresh_to_graph:
    # the layout call returns a dict of node id -> coordinates, not a graph
    thresh_to_pos[thresh] = nx.fruchterman_reingold_layout(
        thresh_to_graph[thresh],
        k=threshold2k[thresh],
        iterations=fruchterman_iter)


CPU times: user 518 ms, sys: 5.85 ms, total: 524 ms
Wall time: 523 ms


In [23]:
# Per threshold: the nodes' x- and y-coordinates as two parallel lists,
# ordered by ascending node id.
thresh_to_XnYn = {}
for thresh, pos in thresh_to_pos.items():
    ordered_nodes = sorted(pos)
    Xn = [pos[node][0] for node in ordered_nodes]
    Yn = [pos[node][1] for node in ordered_nodes]
    thresh_to_XnYn[thresh] = (Xn, Yn)
    

In [24]:
# relevance values offered by the slider
relevance_all = [0, 0.25, 0.5, 0.75, 1]


def rel_mark(x, relevance_all):
    """Slider label for relevance `x`: 'Rare' for the smallest value in
    `relevance_all`, 'Frequent' for the largest, '' for anything else."""
    if x == np.min(relevance_all):
        return 'Rare'
    return 'Frequent' if x == np.max(relevance_all) else ''


# slider marks: str(relevance) -> label shown at that position
relevance_mark = dict((str(value), rel_mark(value, relevance_all)) for value in relevance_all)

In [25]:
def update_slider_mark(slider_mark, font_size):
    """Attach display styling to the position markers of a slider.

    slider_mark: dict mapping a slider position to its label text
    font_size: font size applied to every label
    Returns a new dict mapping each position to
    ``{'label': <text>, 'style': {...}}``; `slider_mark` itself is not modified.
    """
    return {
        position: {
            'label': label,
            # Dash style dictionaries use camelCased CSS property names, so the
            # font family is spelled `fontFamily` like `fontSize` next to it.
            'style': {'fontSize': font_size, 'fontFamily': 'Arial'},
        }
        for position, label in slider_mark.items()
    }

In [26]:
# Wrap both sets of slider marks into the label + style form, with font size 15.
relevance_mark_updated = update_slider_mark(relevance_mark, 15)
threshold_mark_updated = update_slider_mark(threshold_mark, 15)

In [29]:
# Dash application: one graph plus two sliders whose values drive the figure.
# (The stray call whose result was discarded at the top of this cell is gone.)
app = dash.Dash()

app.layout = html.Div([
    # network figure, filled in by the callback registered on 'graph-with-slider'
    html.Div(
        [dcc.Graph(id='graph-with-slider')],
        style={'marginLeft': 140, 'marginRight': 'auto'},
    ),
    # slider selecting which pre-computed threshold graph is shown
    html.Div(
        [
            html.H2('Similarity Cutoff'),
            dcc.Slider(
                id='threshold-slider',
                min=min(threshold_all),
                max=max(threshold_all),
                # start on the middle entry of the list
                value=threshold_all[len(threshold_all) // 2],
                step=None,
                marks=threshold_mark_updated,
            ),
        ],
        # Dash style dictionaries use camelCased CSS property names
        style={'width': '47%', 'marginBottom': 0, 'marginTop': 0,
               'marginLeft': 'auto', 'marginRight': 'auto',
               'fontSize': 12, 'fontFamily': 'Arial'},
    ),
    # slider selecting the relevance value used to pick each topic's words
    html.Div(
        [
            html.H2('Characteristic Words'),
            dcc.Slider(
                id='relevance-slider',
                min=min(relevance_all),
                max=max(relevance_all),
                # start on the middle entry of the list
                value=relevance_all[len(relevance_all) // 2],
                step=None,
                marks=relevance_mark_updated,
            ),
        ],
        style={'width': '47%', 'marginBottom': 0, 'marginTop': 50,
               'marginLeft': 'auto', 'marginRight': 'auto',
               'fontSize': 12, 'fontFamily': 'Arial'},
    ),
])


@app.callback(
    dash.dependencies.Output('graph-with-slider', 'figure'),
    [dash.dependencies.Input('threshold-slider', 'value'),
     dash.dependencies.Input('relevance-slider', 'value')])
def update_figure(selected_threshold, selected_relevance):
    """Build the network figure for the current slider positions.

    Registered as the Dash callback that fills the 'figure' of
    'graph-with-slider' from the values of the two sliders.

    selected_threshold: value of 'threshold-slider'; selects which pre-computed
        graph and node layout are drawn
    selected_relevance: value of 'relevance-slider'; passed as `lam` when the
        top 3 words of every topic are looked up
    Returns a dict with the keys 'data' (edge trace plus two node traces) and
    'layout'.
    """
    # The slider value arrives from the browser as a number; snap it to the
    # closest pre-computed threshold instead of relying on exact float equality
    # with the dictionary keys.
    threshold = min(thresh_to_pos, key=lambda known: abs(known - selected_threshold))
    pos = thresh_to_pos[threshold]
    G = thresh_to_graph[threshold]

    # create_graph() drops isolated nodes, so a graph can hold fewer nodes than
    # there are topics. Every per-node value below is therefore looked up by
    # node id (node i <-> 'Topic<i+1>') so that coordinates, sizes and labels
    # always describe the same node.
    node_ids = sorted(pos)
    Xn = [pos[node][0] for node in node_ids]
    Yn = [pos[node][1] for node in node_ids]

    num_topics = len(topic2tokenpercent)
    all_sizes = get_topic_size_ord(num_topics, topic2tokenpercent)
    node_sizes = [all_sizes[node] * 7 for node in node_ids]

    # The word lists depend only on the slider value, so they are computed once
    # per callback rather than once per node.
    all_words = get_top_n_words_list(num_topics=num_topics, vis=vis,
                                     lam=selected_relevance, topn=3)
    node_words = [all_words[node] for node in node_ids]
    node_topics = [topic_name_mapper['Topic' + str(node + 1)] for node in node_ids]

    marker = dict(symbol='dot', size=node_sizes, color='rgb(255, 128, 0)')
    label_font = dict(family='sans serif', size=14)

    # visible nodes, labelled with each topic's top words (no hover)
    trace_nodes0 = dict(type='scatter',
                        x=Xn,
                        y=Yn,
                        mode='markers+text',
                        marker=dict(marker),
                        showlegend=False,
                        text=node_words,
                        textposition='bottom',
                        textfont=dict(label_font),
                        hoverinfo='skip',
                        visible=True)

    # dummy nodes on the same coordinates; they only carry the topic name
    # that is shown on hover
    trace_nodes1 = dict(type='scatter',
                        x=Xn,
                        y=Yn,
                        mode='markers',
                        marker=dict(marker),
                        showlegend=False,
                        text=node_topics,
                        textposition='bottom',
                        textfont=dict(label_font),
                        hoverinfo='text',
                        visible=True)

    # record the coordinates of the ends of edges; the None after every pair
    # separates one edge's line segment from the next
    Xe = []
    Ye = []
    for source, target in G.edges():
        Xe.extend([pos[source][0], pos[target][0], None])
        Ye.extend([pos[source][1], pos[target][1], None])

    # trace_edges defines the graph edges as a trace of type scatter (line)
    trace_edges = dict(type='scatter',
                       mode='lines',
                       x=Xe,
                       y=Ye,
                       line=dict(width=0.1, color='rgb(51, 51, 51)'),
                       hoverinfo='none',
                       showlegend=False)

    # hide axis line, grid, ticklabels and title
    axis = dict(showline=False,
                zeroline=False,
                showgrid=False,
                showticklabels=False,
                title='')

    layout = dict(title='Network of Topics based on User Reviews',
                  # `textposition` is not a font attribute, so it is no longer
                  # passed inside this dict
                  font=dict(family='Arial', size=17),
                  width=1000,
                  height=800,
                  autosize=False,
                  showlegend=True,
                  xaxis=axis,
                  yaxis=axis,
                  margin=dict(l=40, r=40, b=10, t=50, pad=0),
                  hovermode='closest',
                  paper_bgcolor='rgba(0,0,0,0)',
                  plot_bgcolor='rgba(0,0,0,0)')

    return {
        'data': [trace_edges, trace_nodes0, trace_nodes1],
        'layout': layout}


# Start the Dash server on port 8051 when this code runs as the main module.
# The recorded output below shows it serving until interrupted (CTRL+C).
if __name__ == '__main__':
    app.run_server(port=8051)

 * Running on http://127.0.0.1:8051/ (Press CTRL+C to quit)
127.0.0.1 - - [06/May/2018 21:12:14] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [06/May/2018 21:12:15] "[37mGET /_dash-layout HTTP/1.1[0m" 200 -
127.0.0.1 - - [06/May/2018 21:12:15] "[37mGET /_dash-dependencies HTTP/1.1[0m" 200 -
127.0.0.1 - - [06/May/2018 21:12:16] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [06/May/2018 21:12:16] "[37mGET /favicon.ico HTTP/1.1[0m" 200 -


# To do:

Find good values of `k` for each threshold value, and set `k` as well when plotting.