In [1]:
import csv
import collections
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
from scipy.stats import entropy
import networkx as nx
from operator import itemgetter

from sklearn.metrics.pairwise import pairwise_distances

import matplotlib.pyplot as plt
from matplotlib import pyplot, patches

import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode,  iplot, plot
import plotly.graph_objs as go
import plotly.offline as offline

In [2]:
# Enable plotly's offline rendering inside the notebook. Per the plotly.offline
# documentation, connected=True loads plotly.js from the CDN instead of embedding
# the library in the notebook file.
init_notebook_mode(connected=True)

In [3]:
def jensen_shannon(_P, _Q):
    """Jensen-Shannon divergence between two discrete distributions.

    Parameters
    ----------
    _P, _Q : array-like
        Non-negative weights of equal shape. They do not need to sum to one:
        each is normalised along axis 0 before the mixture is formed.

    Returns
    -------
    float
        0.5 * KL(P || M) + 0.5 * KL(Q || M) with M = 0.5 * (P + Q), in nats
        (scipy's ``entropy`` uses the natural logarithm by default). This is
        the divergence itself, not its square root.
    """
    # Coerce first so plain lists work: `list + list` would concatenate
    # instead of adding element-wise.
    _P = np.asarray(_P, dtype=float)
    _Q = np.asarray(_Q, dtype=float)
    # Normalise before mixing; otherwise an input with a larger total mass
    # would dominate the mixture M.
    _P = _P / _P.sum(axis=0)
    _Q = _Q / _Q.sum(axis=0)
    _M = 0.5 * (_P + _Q)
    return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))

In [4]:
import dash
import dash_core_components as dcc
import dash_html_components as html
import pandas as pd
import plotly.graph_objs as go

# Create Data for Dash

In [5]:
# Load the per-restaurant topic probabilities and discard the identifier column,
# which is not used anywhere below.
df = pd.read_feather('df_final_doc2topics.feather').drop(['business_id'], axis=1)
# Numeric matrix holding only the topic columns (one row per restaurant).
# `.values` replaces DataFrame.as_matrix(), which is deprecated and was removed in pandas 1.0.
data = df.drop(['name', 'is_strip', 'stars'], axis=1).values

In [6]:
# Preview the last rows of the restaurant table (metadata columns followed by topic columns)
df.tail()

Unnamed: 0,name,is_strip,stars,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,...,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18
232,Wahlburgers,True,3.5,0.002344,0.059422,0.014279,0.017418,0.007613,0.008466,0.390668,...,0.0309,0.001399,0.042051,0.106462,0.021838,0.047367,0.021152,0.199925,0.014332,0.009686
233,Metro Diner,False,4.5,0.001234,0.011513,0.031174,0.000624,0.000544,0.014578,0.109041,...,0.10922,6e-06,0.062768,0.031438,0.002176,0.007427,0.011144,0.011763,0.575775,0.016012
234,Lo-Lo's Chicken and Waffles,False,3.0,0.000399,1e-05,0.00956,1e-05,0.005469,0.001808,0.003014,...,0.084914,0.004546,0.066156,0.187578,0.007661,0.003924,0.001564,0.196273,0.340406,0.086699
235,SkinnyFATS,False,4.5,0.005424,0.051373,0.00358,0.003287,0.011391,0.026667,0.207678,...,0.050492,0.042658,0.006249,1.2e-05,0.024812,0.2704,0.003818,0.046187,0.19282,0.053139
236,Virgil's Real Barbecue - Las Vegas,True,3.5,1e-05,0.077584,0.037697,0.049545,0.004495,1e-05,0.010647,...,0.035005,0.008696,0.117267,0.059909,0.046701,0.002465,0.007658,0.060285,0.428152,0.048046


In [7]:
# (rows, columns) of the restaurant table
df.shape

(237, 21)

In [8]:
# (rows, columns) of the topic-only matrix derived from df
data.shape

(237, 18)

In [9]:
# Column index of the largest topic probability in every row of `data`,
# i.e. the dominant topic of each restaurant.
topic_closest_ind = data.argmax(axis=1)

# Human-readable topic labels.
# NOTE(review): presumably listed in the same order as the topic columns of `data` - confirm.
topic_names_ord = [
    'Cost & Quality',
    'Bars',
    'Casino Hotel',
    'Fine Dining',
    'Asian',
    'Pizza',
    'Steakhouse',
    'Italian',
    'Coffee Shop',
    'High Customer Satisfaction',
    'Night Club',
    'Wait Time',
    'Mexican',
    'Lunch',
    'Sushi',
    'Fast Food',
    'Breakfast',
    'Low Customer Satisfaction',
]

# Dominant topic label per restaurant, in the row order of df / data.
topic_closest = [topic_names_ord[topic_idx] for topic_idx in topic_closest_ind]

In [10]:
# Pairwise Jensen-Shannon divergence between every pair of rows of `data`
# (each row holds one restaurant's 18 topic probabilities).
# Note: jensen_shannon returns the divergence itself; it does not take the square
# root that would turn it into the Jensen-Shannon distance metric.
pairwise_dist = pairwise_distances(X=data, metric=jensen_shannon)

In [56]:
# Maps each similarity threshold to the `k` argument passed to the
# Fruchterman-Reingold layout for that threshold's graph.
# The values are hand-picked; tuning them is still listed as a to-do at the end
# of the notebook.
threshold2k ={
    0.55: 0.7,
    0.56: 0.9,
    0.57: 0.3,
    0.58: 5,
    0.59: 2,
    0.6: 5,
    0.61: 5,
    0.62: 5
}

In [11]:
# Arbitrary cut-offs on the pairwise divergence used to decide whether two
# restaurants are linked. One adjacency matrix and one graph are built per value,
# and the dashboard slider steps between these values.
threshold_all = [0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62]
def th_mark(x):
    """Return the slider label for threshold `x`.

    'Low' for the smallest value in `threshold_all`, 'High' for the largest,
    and an empty string for everything in between.
    """
    lowest, highest = np.min(threshold_all), np.max(threshold_all)
    if x == lowest:
        return 'Low'
    return 'High' if x == highest else ''
    
# Slider marks keyed by the string form of each threshold; only the two extremes
# get a visible label ('Low' / 'High').
threshold_mark = dict((str(cutoff), th_mark(cutoff)) for cutoff in threshold_all)

# One 0/1 matrix per threshold: 1 where the pairwise divergence exceeds the cut-off.
# NOTE(review): links are drawn where the divergence is ABOVE the threshold, i.e.
# between the least similar pairs - confirm this is intended given the "similar" wording.
adjacency = list(
    map(lambda cutoff: np.where(pairwise_dist > cutoff, 1, 0), threshold_all)
)

In [12]:
# Look-up table: threshold value -> its adjacency matrix
thresh_to_adj = dict(zip(threshold_all, adjacency))

In [13]:
def create_graph(adj):
    """Build an undirected graph from an adjacency matrix and drop isolated nodes.

    Parameters
    ----------
    adj : square array-like adjacency matrix (non-zero entry = edge).

    Returns
    -------
    The networkx graph built from `adj` with every degree-0 node removed.
    Remaining nodes keep their original row indices as labels.
    """
    # nx.from_numpy_matrix was removed in networkx 3.0; from_numpy_array is its
    # replacement. Fall back to the old name when the new one is unavailable.
    build_graph = getattr(nx, 'from_numpy_array', None) or nx.from_numpy_matrix
    G = build_graph(adj)
    # Materialise the isolates first: the graph must not change while the
    # isolates iterator is being consumed.
    isolates = list(nx.isolates(G))
    G.remove_nodes_from(isolates)
    return G

In [14]:
# Look-up table: threshold value -> graph built from that threshold's adjacency
# matrix (isolated nodes already removed by create_graph)
thresh_to_graph = dict(
    (cutoff, create_graph(matrix)) for cutoff, matrix in zip(threshold_all, adjacency)
)

In [15]:
%%time
# extract node positions
fruchterman_iter = 1000

# map threshold values to positions of nodes
thresh_to_pos = {}

for thresh in thresh_to_graph:
    graph = nx.fruchterman_reingold_layout(thresh_to_graph[thresh], k = threshold2k[thresh], iterations=fruchterman_iter)
    thresh_to_pos[thresh] = graph


CPU times: user 6.91 s, sys: 76.9 ms, total: 6.98 s
Wall time: 7 s


In [16]:
# Row labels of restaurants on / off the Strip. Computed once here instead of
# being rebuilt for every node inside the comprehensions below.
strip_nodes = set(df.index[df.is_strip == True])
notstrip_nodes = set(df.index[df.is_strip == False])

# Map each threshold to (Xn_strip, Yn_strip, Xn_notstrip, Yn_notstrip): the node
# coordinates of that threshold's layout, split by Strip membership and ordered
# by ascending node id.
thresh_to_XnYn = {}
for thresh in thresh_to_pos:
    pos = thresh_to_pos[thresh]
    ordered_nodes = sorted(pos.keys())
    on_strip = [k for k in ordered_nodes if k in strip_nodes]
    off_strip = [k for k in ordered_nodes if k in notstrip_nodes]
    thresh_to_XnYn[thresh] = (
        [pos[k][0] for k in on_strip],
        [pos[k][1] for k in on_strip],
        [pos[k][0] for k in off_strip],
        [pos[k][1] for k in off_strip],
    )

In [46]:
def update_slider_mark(slider_mark, font_size):
    """Attach a display style to every slider mark.

    Parameters
    ----------
    slider_mark : dict mapping slider position -> label text.
    font_size : value stored under 'fontSize' in each mark's style.

    Returns
    -------
    dict mapping each position to {'label': <text>, 'style': {...}} with the
    given font size and the Arial font family.
    """
    return {
        position: {
            'label': label,
            'style': {'fontSize': font_size, 'font-family': 'Arial'},
        }
        for position, label in slider_mark.items()
    }

# Slider marks with a font size of 15 and the Arial family applied to every position
threshold_mark_updated = update_slider_mark(threshold_mark, 15)

In [50]:
# Dash application: a network figure with a threshold slider underneath.
app = dash.Dash()

app.layout = html.Div([
    # Figure container; its 'figure' property is filled in by the update_figure callback
    html.Div([
    dcc.Graph(id='graph-with-slider')
    ],style={'marginLeft':140, 'marginRight':'auto'}),
    # Slider container
    html.Div([
    html.H2('Similarity Cutoff'),
    dcc.Slider(
        id='threshold-slider',
        min=min(threshold_all),
        max=max(threshold_all),
        # initial value: the element at index len // 2 of threshold_all
        value=threshold_all[int(np.floor(len(threshold_all)/2))],
        # step=None: per the Dash Slider docs, only the marked values are selectable
        step=None,
        marks=threshold_mark_updated
    )
    ], style={'width': '47%','marginBottom': 0, 'marginTop': 0, 'marginLeft':'auto', 'marginRight':'auto',
              'fontSize':12, 'font-family': 'Arial'})
])


def _node_hover_text(node):
    """Hover label (name, star rating, dominant topic) for the restaurant at row `node`."""
    return (df['name'].iloc[node] + ', ' + str(df['stars'].iloc[node])
            + '/5, Related to: ' + topic_closest[node])


@app.callback(dash.dependencies.Output('graph-with-slider', 'figure'),
              [dash.dependencies.Input('threshold-slider', 'value')])
def update_figure(selected_threshold):
    """Build the network figure for the threshold selected on the slider.

    Parameters
    ----------
    selected_threshold : the slider value; must be a key of thresh_to_XnYn,
        thresh_to_pos and thresh_to_graph.

    Returns
    -------
    dict with 'data' (edge trace, Strip node trace, non-Strip node trace) and
    'layout', in the form expected by the dcc.Graph 'figure' property.
    """
    Xn_strip, Yn_strip, Xn_notstrip, Yn_notstrip = thresh_to_XnYn[selected_threshold]
    pos = thresh_to_pos[selected_threshold]
    G = thresh_to_graph[selected_threshold]

    # Hover labels must line up one-to-one with the plotted points. The points only
    # cover nodes present in this threshold's layout (isolated nodes were removed
    # from the graph), ordered by ascending node id and split by Strip membership,
    # so the labels are built with exactly the same ordering and filter.
    strip_ids = set(df.index[df.is_strip == True])
    notstrip_ids = set(df.index[df.is_strip == False])
    ordered_nodes = sorted(pos.keys())
    text_strip = [_node_hover_text(k) for k in ordered_nodes if k in strip_ids]
    text_notstrip = [_node_hover_text(k) for k in ordered_nodes if k in notstrip_ids]

    # Node traces. 'circle' is used as the marker symbol because 'dot' is not a
    # valid plotly symbol name.
    trace_nodes1 = dict(type='scatter',
                        x=Xn_strip,
                        y=Yn_strip,
                        mode='markers',
                        marker=dict(symbol='circle',
                                    size=10, color='rgb(255,0,0)'),
                        name='On The Strip',
                        showlegend=True,
                        text=text_strip,
                        hoverinfo='text',
                        visible=True)
    trace_nodes2 = dict(type='scatter',
                        x=Xn_notstrip,
                        y=Yn_notstrip,
                        mode='markers',
                        marker=dict(symbol='circle',
                                    size=10, color='rgb(0, 0, 255)'),
                        name='Not on The Strip',
                        showlegend=True,
                        text=text_notstrip,
                        hoverinfo='text',
                        visible=True)

    # Coordinates of both ends of every edge; the trailing None breaks the line
    # so that consecutive edges are not joined together.
    Xe = []
    Ye = []
    for e in G.edges():
        Xe.extend([pos[e[0]][0], pos[e[1]][0], None])
        Ye.extend([pos[e[0]][1], pos[e[1]][1], None])

    # trace_edges defines the graph edges as a trace of type scatter (line)
    trace_edges = dict(type='scatter',
                       mode='lines',
                       x=Xe,
                       y=Ye,
                       line=dict(width=0.1, color='rgb(51, 51, 51)'),
                       hoverinfo='none', showlegend=False)

    # hide axis line, grid, ticklabels and title
    axis = dict(showline=False,
                zeroline=False,
                showgrid=False,
                showticklabels=False,
                title=''
                )
    layout = dict(title='Network of Restaurants based on User Reviews',
                  font=dict(family='Arial', size=17),
                  width=1000,
                  height=800,
                  autosize=False,
                  showlegend=True,
                  xaxis=axis,
                  yaxis=axis,
                  margin=dict(
                      l=40,
                      r=40,
                      b=10,
                      t=50,
                      pad=0,
                  ),
                  hovermode='closest',
                  paper_bgcolor='rgba(0,0,0,0)',
                  plot_bgcolor='rgba(0,0,0,0)'
                  )

    return {
        'data': [trace_edges, trace_nodes1, trace_nodes2],
        'layout': layout}


# Start the Dash server when this code runs as the main module; the recorded
# output below shows it serving on http://127.0.0.1:8050/.
if __name__ == '__main__':
    app.run_server()

 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
127.0.0.1 - - [06/May/2018 18:40:42] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [06/May/2018 18:40:42] "GET /_dash-layout HTTP/1.1" 200 -
127.0.0.1 - - [06/May/2018 18:40:42] "GET /_dash-dependencies HTTP/1.1" 200 -
127.0.0.1 - - [06/May/2018 18:40:43] "POST /_dash-update-component HTTP/1.1" 200 -


# To do:

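Find a good value of `k` (the spacing parameter passed to the Fruchterman-Reingold layout) for each threshold value, and make sure that same `k` is used whenever the layout for that threshold is computed for plotting.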