# Journal Trends Dashboard

This notebook builds the journal trends dashboard.

In [1]:
import datetime
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

## Import Data

Load the datasets used to build the Dataset tab of the dashboard.

In [2]:
# cleaned data
data = pd.read_csv('data/data_cleaned.csv')

# Number of articles per Year, as a two-column frame ['Year', 'Number of Articles']
# sorted by Year.
# The index and value columns are named explicitly instead of renaming whatever
# reset_index() produces: pandas < 2.0 yields ['index', 'Year'] while pandas >= 2.0
# yields ['Year', 'count'], so a rename keyed on 'index' mislabels the columns on
# newer pandas and the sort on 'Year' then raises KeyError.
article_dist = (
    data['Year']
    .value_counts()
    .rename_axis('Year')
    .reset_index(name='Number of Articles')
    .sort_values(by=['Year'])
)

# keyword tables broken down by Year
keywords_per_year = pd.read_csv('data/keywords_per_year.csv')
keywords_dist = pd.read_csv('data/keywords_dist_per_year.csv')

# overall keyword counts
keyword_counts = pd.read_csv('data/keyword_counts.csv')

# vocabulary file, ordered by the 'Term' column (original row labels are kept)
vocabulary = pd.read_csv('data/Vocabulary.csv').sort_values(by=['Term'])

Load the data used to build the Journal Trends tab of the dashboard.

In [3]:
# import module for building biased clusters
from Biased_Clusters import get_clusters_dist, get_clusters_timeline, get_top_keywords, cal_cluster_bias

In [4]:
import json
import scipy.sparse

# set global variables
# These are the initial settings of the dashboard: 9 matches the default value of
# the 'num_clusters_slider' and bias_amount seeds the 'bias_input' box in the layout.
# Both are overwritten by the generate_topics callback on every run.
num_clusters = 9
bias_amount = 4.61

# load training data
# NOTE(review): presumably the vectorised articles fed to the clustering in
# cal_cluster_bias -- confirm against Biased_Clusters.
x_vector = np.load('data/x_vector.npy')

# load terms matrix
terms_sparse_matrix = scipy.sparse.load_npz('data/terms_sparse_matrix.npz')

# convert the sparse matrix to a dense one and
# build a dataframe of the result
# NOTE: this materialises every cell of the matrix in memory.
terms_matrix_df = pd.DataFrame(terms_sparse_matrix.todense())

# load terms label
# The file has a .txt extension but is parsed as JSON.
with open("data/terms_label.txt", "r") as fp:
    terms_label = json.load(fp)

## Dashboard

### App Layout

In [5]:
from jupyter_dash import JupyterDash
import dash
import dash_daq as daq
import dash_table
import dash_bootstrap_components as dbc
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State
from dash import callback_context
import plotly.express as px
import plotly.graph_objects as go
from plotly.figure_factory import create_gantt
from plotly.subplots import make_subplots

In [6]:
# ---- Year slider ----
# bounds of the 'Year' column in the loaded dataset
year_min = data['Year'].min()
year_max = data['Year'].max()
# slider marks: every 5th year from 1980 up to year_max, followed by 1974
year_range = list(range(1980, year_max+1, 5)) + [1974]
year_range_str = list(map(str, year_range))

# ---- colour palette for the clusters ----
clusters_color = [
    '#da70d6', '#45b1e8', '#808000',
    '#bcf60c', '#0abab5', '#000075',
    '#e6194b', '#c4aead', '#7b68ee',
    '#3cb44b', '#911eb4', '#4363d8',
    '#f58231', '#ffe119', '#46f0f0',
    '#800000', '#9a6324', '#bcf60c',
]

# ---- style of the header Div placed above each panel ----
divHeaderStyle = {
    'border': '1px solid white',
    'backgroundColor': '#3cb371',
    'textAlign': 'center',
    'color': 'white',
}

# ---- styles of the Span pairs (bold caption + blue value) ----
spanHeaderStyle = {
    'display': 'inline-block',
    'margin-left': '15px',
    'margin-right': '15px',
    'fontWeight': 'bold',
}
spanLabelStyle = {
    'color': 'blue',
    'display': 'inline-block',
}

# ---- shared DataTable styles for the terms tables ----
terms_style_table = {
    'height': '378px',
    'width': '480px',
    'overflowY': 'auto',
}
terms_style_cell = {
    'whiteSpace': 'normal',
    'textAlign': 'left',
    'height': 'auto',
    # all three widths are needed
    'minWidth': '40px',
    'width': '300px',
    'maxWidth': '400px',
}
terms_style_header = {
    'backgroundColor': "paleturquoise",
    'border': '1px solid green',
    'textAlign': 'center',
    'fontWeight': 'bold',
}
terms_style_data = {
    'backgroundColor': "lavender",
    'border': '1px solid white',
    'height': 'auto',
}

# ---- height used for figures ----
figHeight = 420

In [7]:
def build_k_evaluation_plot():
    '''Plot the average Silhouette score against the number of topics (k).

    Reads 'results/avg_sil_per_k.csv', which must provide the columns
    'Number of Topics' and 'Average Silhouette Score', draws them as a line
    with markers and annotates the first row holding the highest score with
    "Best k = <k>".

    Returns:
        The plotly figure created by px.line, sized 700 x 500.
    '''
    topics_col = 'Number of Topics'
    score_col = 'Average Silhouette Score'

    scores = pd.read_csv('results/avg_sil_per_k.csv')

    # line plot of Average Silhouette Score vs. Number of Topics (k)
    fig = px.line(scores, x=topics_col, y=score_col, markers=True)

    # rows whose score equals the maximum; the first of them is annotated
    best_rows = scores[scores[score_col] == scores[score_col].max()]
    best_k = best_rows[topics_col].values[0]
    best_score = best_rows[score_col].values[0]

    fig.add_annotation(
        x=best_k,
        y=best_score,
        text="Best k = " + str(best_k),
        ay=-35,
        yanchor="bottom",
        showarrow=True,
        arrowhead=1,
        arrowsize=1,
        arrowwidth=2,
    )

    # axis labels and figure size (no plot title is set)
    fig.update_layout(
        xaxis_title_text='<b>Number of Topics (k)</b>',
        yaxis_title_text='<b>Average Silhouette Score</b>',
        width=700,
        height=500,
    )
    return fig

In [8]:
# instantiate the app
app = JupyterDash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

# create the layout of the app
# Structure: a page title, three top-level tabs ('Journal Trends', 'Dataset',
# 'Vocabulary') and five dcc.Store components at the very end.
# Components that only carry an id here (graphs, tables, spans, divs) have no
# content in the layout itself; presumably they are filled by callbacks defined
# in later cells -- verify against the callback section.
app.layout = html.Main([
    html.H1('Trend Detection: Financial Journals', style={ 'fontSize': '40px'}),
    dbc.Tabs([
        dbc.Tab([            # start of Journal Trend tab
            # Journal Trend: first row div
            # control bar: topic-count slider, bias input, 'Generate Topics' button,
            # loading indicator, topic filter check boxes and 'Clear Filters' button
           dbc.Row([
                   html.Table([
                       html.Tbody([
                           html.Td(style={'width':'15px'}),
                           html.Td(html.H6("Number of topics:")),
                           html.Td(style={'width':'15px'}), # empty space
                           html.Td([
                               # Slider for Number of Clusters
                               daq.Slider(id='num_clusters_slider', min=5, max=15, value=9,
                                          #handleLabel={"showCurrentValue": True,"label": "VALUE"},
                                          marks=dict(zip([str(x) for x in range(5,16,1)],
                                                         [str(x) for x in range(5,16,1)])), step=1)
                           ]),
                           html.Td(style={'width':'30px'}), # empty space
                           html.Td(html.H6('Bias Amount: ')),
                           html.Td(style={'width':'15px'}), # empty space
                           html.Td([
                               # input text box for bias amount
                               dcc.Input(id='bias_input', type='number', value=bias_amount, min=0, 
                                         style={'width':'65px'}),
                           ]),
                           html.Td(style={'width':'10px'}), # empty space
                           html.Td([
                               html.Button('Generate Topics', id='btn_submit', n_clicks=0)
                           ]), # end html.Td
                           html.Td(style={'width':'20px'}), # empty space
                           html.Td([
                               dcc.Loading(
                                    id="loading-1",
                                    type="default",
                                    children=html.Div(id="loading-output-1", 
                                                      style={'backgroundColor':"paleturquoise",
                                                             #'font-weight': 'bold',
                                                             'color': '#0000cd',
                                                             'border': '1px solid #87cefa', 'textAlign': 'center'})
                                ),
                           ]),
                           html.Td(style={'width':'30px'}), # empty space
                           html.Td(html.H6("Filter by Topics:")),
                           html.Td(style={'width':'10px'}), # empty space
                           # the options of this checklist are an output of generate_topics
                           html.Td(dcc.Checklist(id='clusters_checkbox',
                                                 labelStyle={'display': 'inline-block', 'width': '50px'},
                                                 style={'width':'430px'})),
                           #html.Td(style={'width':'5px'}), # empty space
                           html.Td([html.Button('Clear Filters', id='btn_clear_filters', n_clicks=0)]), # end html.Td
                           html.Td(style={'width':'20px'}), # empty space
                       ]), # end Tbody
                   ]), # end Table
               ], style={'height':'70px','border': '1px solid black', 'backgroundColor':'#e9ffdb'}), # end first Row
            # start second row
            dbc.Row([
                # 3 sub-tabs under Journal Trend
                dbc.Tabs([
                    # sub-tab 1: Simple UI (distribution plot, abstract table, topic summary table)
                    dbc.Tab([
                        html.Table([
                            html.Tr([
                                html.Td([html.Div([], style={'width':'20px'})]), # empty space
                                html.Td([
                                    html.Div([html.H5("Trending Topics: Article Distribution per Year")], 
                                             style=divHeaderStyle), # end Div
                                    html.Div([dcc.Graph(id='clusters_dist_line_simple_ui')]) # end Div
                                ]), # end html.Td
                                html.Td([html.Div([], style={'width':'25px'})]), # empty space
                                html.Td([
                                    html.Div([html.H5("Abstract")], style=divHeaderStyle), # end Div
                                    html.Div([
                                        dash_table.DataTable(
                                            id='abstract_table',
                                            sort_action="native",
                                            filter_action='native',
                                            fixed_rows={'headers': True},
                                            style_table={'height': '375px', 'width':'870px', 'overflowY': 'auto'},
                                            style_cell={'whiteSpace': 'normal', 'textAlign': 'left', 'height':'auto',
                                                        'verticalAlign':'top',
                                                        # all three widths are needed
                                                        'minWidth': '40px', 'width': '800px', 'maxWidth': '850px',},
                                            style_header=terms_style_header,
                                            style_data=terms_style_data,
                                            style_cell_conditional=[{'if': {'column_id': 'Topic Id'}, 'width': '7%'}],
                                            style_as_list_view=True),    # end DataTable
                                    ], style={'backgroundColor':'#fff'}), # end html.Div
                                ]), # end Td
                            ]), # end html.Tr
                            # vertical spacer row
                            html.Tr([
                                html.Td([
                                    html.Div([], style={'height':'7px'})
                                ], colSpan=4)
                            ]), # end html.Tr
                            html.Tr([
                                html.Td([
                                    html.Table([
                                        html.Tr([
                                            html.Td([html.Div([], style={'width':'70px'})]), # end html.Td
                                            html.Td([
                                                html.Div([html.H5("Summary of Trending Topics")], style=divHeaderStyle), # end Div
                                                html.Div([
                                                    dash_table.DataTable(
                                                        id='topic_clusters_table',
                                                        sort_action="native",
                                                        filter_action='native',
                                                        fixed_rows={'headers': True},
                                                        style_table={'height': '350px', 'width':'1500px'},
                                                        style_cell={'whiteSpace': 'normal', 'textAlign': 'left', 
                                                                    'height':'auto',
                                                                    'verticalAlign':'top',
                                                                    # all three widths are needed
                                                                    'minWidth': '20px', 'width': '300px', 
                                                                    'maxWidth': '1450px',},
                                                        style_header=terms_style_header,
                                                        style_data=terms_style_data,
                                                        # per-column widths, keyed by the column ids of the table
                                                        style_cell_conditional=[
                                                            {'if': {'column_id': 'Terms'}, 'width': '380px'},
                                                            {'if': {'column_id': 'Timeline'}, 'width': '55px'},
                                                            {'if': {'column_id': 'Topic Id'}, 'width': '32px'},
                                                            {'if': {'column_id':'Number of Articles'},'width': '50px'},
                                                            {'if': {'column_id': 'Article %'}, 'width': '30px'},
                                                            {'if': {'column_id': 'Trend Score'}, 'width': '25px'},
                                                            {'if': {'column_id': 'Silhouette Score'}, 'width': '30px'},
                                                            {'if': {'column_id': 'Bias Avg Std Year'},'width': '30px'},
                                                        ],
                                                        style_as_list_view=True
                                                    ),    # end DataTable
                                                ], style={'backgroundColor':'#fff'}), # end Div
                                            ]), # end html.Td
                                        ]), # end html.Tr
                                    ]), # end html.Table
                                ], colSpan=4), # end html.Td
                            ]), # end html.Tr
                        ]), # end html.Table
                    ], label='Simple UI', style={'backgroundColor':'#dcdcdc'}), # end Simple UI Tab
                    # sub-tab 2: Advanced UI (distribution plot, summary panel, evaluation
                    # metric plots, timelines, terms image and unique-terms table)
                    dbc.Tab([
                        html.Table([
                            html.Tr([
                                html.Td([
                                    html.Div([
                                        html.H5("Trending Topics: Article Distribution per Year")
                                    ], style=divHeaderStyle), # end Div
                                    html.Div([dcc.Graph(id='clusters_dist_line')]) # end Div
                                ]), # end html.Td
                                html.Td([html.Div([], style={'width':'15px'})]),
                                html.Td([
                                    html.Div([html.H5("Summary")], style=divHeaderStyle), # end Div
                                    html.Div([
                                       html.Div('Overview', style={'fontSize':'14pt', 'color':'blue'}), # end Div
                                       html.Div([
                                           html.Span('Number of Selected Topics:', style=spanHeaderStyle),
                                           html.Span(id='num_topics', style=spanLabelStyle)
                                       ]), # end Div
                                        html.Div([
                                           html.Span('Bias Amount:', style=spanHeaderStyle),
                                           html.Span(id='bias_amount_span', style=spanLabelStyle)
                                       ]), # end Div
                                       html.Div([
                                           html.Span('Number of Unique Terms:', style=spanHeaderStyle),
                                           html.Span(id='num_unique_terms', style=spanLabelStyle)
                                       ]), # end Div
                                       html.P(''),
                                       html.Div(id='top_topic_div'), # end Div
                                    ], style={'height': '420px',
                                             'width':'320px',
                                             'backgroundColor':'#fff',
                                             'overflow':'auto'}) # end Div
                                ]), # end html.Td
                                html.Td([html.Div([], style={'width':'15px'})]),
                                html.Td([
                                    html.Div([
                                       html.H5("Evaluation Metrics")
                                    ], style=divHeaderStyle), # end Div
                                    html.Div([
                                       # one tab per evaluation metric
                                       dbc.Tabs([
                                           dbc.Tab([
                                               dcc.Graph(id='trend_score')
                                           ], label='Trend Score'), # end Tab
                                           dbc.Tab([
                                               dcc.Graph(id='sil_score_fig')
                                           ], label='Silhouette Score'), # end Tab
                                           dbc.Tab([
                                               dcc.Graph(id='std_years_fig')
                                           ], label='Avg Std of Years'), # end Tab
                                           dbc.Tab([
                                               dcc.Graph(id='articles_pct')
                                           ], label='Articles %'), # end Tab
                                       ]), # end dbc.Tabs
                                   ], style={'height': '420px','backgroundColor':'#f4f0ec'}), # end Div
                                 ]), # end html.Td
                            ]), # end html.Tr
                            # vertical spacer row
                            html.Tr([
                                html.Td([html.Div([], style={'height':'10px'})], colSpan=5)
                            ]), # end html.Tr
                            html.Tr([
                                html.Td([
                                    html.Table([
                                        html.Tr([
                                            html.Td([
                                                html.Div([
                                                   html.H5("Timelines")
                                                ], style=divHeaderStyle), # end Div
                                                html.Div([dcc.Graph(id='clusters_timeline')]) # end Div
                                            ]), # end html.Td
                                            html.Td([html.Div([], style={'width':'15px'})]),
                                            html.Td([
                                                html.Div([
                                                   html.H5("Terms")
                                                ], style=divHeaderStyle), # end Div
                                                html.Div([html.Img(id='cluster_terms_fig')], 
                                                        style={'backgroundColor':'#f4f0ec'}) # end Div
                                            ]), # end html.Td
                                            html.Td([html.Div([], style={'width':'15px'})]),
                                            html.Td([
                                                html.Div([html.H5("Unique Terms")], 
                                                         style=divHeaderStyle), # end Div
                                                html.Div([
                                                   dash_table.DataTable(
                                                       id='unique_terms_table',
                                                       sort_action="native",
                                                       filter_action='native',
                                                       fixed_rows={'headers': True},
                                                       style_table=terms_style_table,
                                                       style_cell=terms_style_cell,
                                                       style_header=terms_style_header,
                                                       style_data=terms_style_data,
                                                       style_cell_conditional=
                                                       [{'if': {'column_id': 'Topic Id'}, 'width': '50px'}],
                                                       style_as_list_view=True
                                                   ) # end DataTable
                                                ], style={'backgroundColor':'#fff'}), # end html.Div
                                            ]), # end html.Td
                                        ]), # end html.Tr
                                    ]) # end html.Table
                                ], colSpan=5), # end html.Td
                            ]), # end html.Tr
                        ]), # end html.Table
                    ], label='Advanced UI', style={'backgroundColor':'#dcdcdc'}), # end dbc.Tab for Advanced UI
                    # sub-tab 3: Diagnostics (k evaluation plot and bias evaluation plot)
                    dbc.Tab([
                        html.Table([
                            html.Tr([
                                html.Td([html.Div([], style={'height':'40px'})], colSpan=4) # end html.Td
                            ]), # end html.Tr
                            html.Tr([
                                html.Td([html.Div([], style={'width':'40px'})]), # empty space
                                html.Td([
                                    html.Div([html.H5("Number of Topics (k)")], style=divHeaderStyle), # end Div
                                    # static figure: built once, when the layout is created
                                    html.Div([dcc.Graph(figure=build_k_evaluation_plot())])
                                ]), # end html.Td
                                html.Td([html.Div([], style={'width':'30px'})]), # empty space
                                html.Td([
                                    html.Div([
                                        html.H5(id='bias_eval_header')
                                    ], style=divHeaderStyle), # end Div
                                    html.Div([dcc.Graph(id='bias_evaluation_fig')])
                                ]), # end html.Td
                            ]), # end html.Tr
                            html.Tr([
                                html.Td([html.Div([], style={'height':'40px'})], colSpan=4) # end html.Td
                            ]), # end html.Tr
                        ]), # end html.Table
                    ], label='Diagnostics', style={'backgroundColor':'#dcdcdc'}), # end dbc.Tab for Diagnostics
                ], style={'backgroundColor':'#d3d3d3'}), # end dbc.Tabs
            ]), # end second row of Journal Trend
       ], label='Journal Trends', style={'backgroundColor':'#dcdcdc'}), # end Journal Trends dbc.Tab
       # start of Dataset tab
       dbc.Tab([
           # first row: headline numbers and the Year range slider
           dbc.Row([
               dbc.Col([
                   dbc.Row([
                       dbc.Col([html.P('')], width={"size": 1}),
                       dbc.Col([
                           html.Span('Number of Articles', style={'fontWeight':'bold'}),
                           html.Span(id='num_articles', style={'color':'blue'})
                       ]),
                       dbc.Col([
                           html.Span('Number of Unique Keywords', style={'fontWeight':'bold'}),
                           html.Span(id='num_unique_keywords', style={'color':'blue'})
                       ], width={'size':5}),
                       dbc.Col([
                           html.Span('Year(s)', style={'fontWeight':'bold'}),
                           html.Span(id='year_range', style={'color':'blue'})
                       ])
                   ]),
               ], width={"size": 6, "offset": 0}), # end first column
               dbc.Col([
                   html.H5("Filter by Year:"),
                   # initially spans the full [year_min, year_max] range of the dataset
                   dcc.RangeSlider(
                       id='year_slider',
                       min=year_min, max=year_max, step=1,
                       marks=dict(zip(year_range, year_range_str)),
                       value=[year_min, year_max]
                    ),
               ], width={"size": 5}), # end second dbc.Col
           ], style={'height':75, 'border':'1px solid black', 'backgroundColor':'#e9ffdb'}),  # end first dbc.Row
           # second row: monthly and yearly distribution plots
           dbc.Row([
               html.Table([
                   html.Tr([
                       html.Td([html.Div([], style={'width':'80px'})]), # end html.Td
                       html.Td([
                           html.Div([
                               html.H5("Monthly Average Number of Articles")
                           ], style=divHeaderStyle), # end Div
                           html.Div(dcc.Graph(id='monthly_dist_plot')), # end Div
                       ]), # end html.Td
                       html.Td([html.Div([], style={'width':'30px'})]), # end html.Td
                       html.Td([
                           html.Div([
                               html.H5("Yearly Frequency Distribution")
                           ], style=divHeaderStyle), # end Div
                           html.Div([dcc.Graph(id='freq_dist_plot')], style={'backgroundColor':'#fff'})   
                       ]), # end html.Td
                       html.Td([html.Div([], style={'width':'49px'})]), # end html.Td
                   ]), # end html.Tr
               ]), # end html.Table
           ], style={'border':1, 'height':461, 'margin-top':'3px'}), # end second dbc.Row
           # third row: keyword table, summary statistics and two histograms
           dbc.Row([
               dbc.Col([
                   html.Div([
                       html.H5("Keyword: Frequency & Timeline")
                   ], style=divHeaderStyle), # end Div
                   html.Div([
                   dash_table.DataTable(
                       id='keyword_table',
                       sort_action="native",
                       filter_action='native',
                       fixed_rows={'headers': True},
                       style_table={'height': '376px', 'overflow': 'auto'},
                       style_cell={'height': 'auto', 'whiteSpace': 'normal', 'textAlign': 'left',
                                    # all three widths are needed
                                    'minWidth': '50px', 'width': '80px', 'maxWidth': '200px',},
                       style_header={'backgroundColor':"paleturquoise", 'border': '1px solid green', 
                                     'textAlign': 'center', 'fontWeight': 'bold',
                                     'height': 'auto', 'whiteSpace':'normal'},
                       style_data={'backgroundColor':"lavender",
                                   'border': '1px solid white'},
                       style_header_conditional=[{'if': {'column_id': 'Timeline'}, 'width': '60%'},
                                                 {'if': {'column_id': 'Frequency'}, 'width': '15%'},
                                                 {'if': {'column_id': 'Keyword'}, 'width': '25%'},],
                       style_as_list_view=True,
                       ),    # end DataTable
                   ], style={'backgroundColor': '#fff'}) # end Div
               ], width={"size": 3}), # end dbc.Col
               dbc.Col([
                   html.Div([
                       html.H5("Summary Statistics")
                   ], style=divHeaderStyle), # end Div
                   html.Div([
                       html.Div(id='stats_abstract'),
                       html.Div(id='stats_num_keywords'),
                       html.Div(id='stats_year'),
                       html.Div(id='stats_month'),
                   ], style={'height':'420px', 'overflow':'auto', 'backgroundColor':'#fff'}), # end Div
               ], width={"size": 3}), # end Col
               dbc.Col([
                   html.Div([
                       html.H5("Abstract Length")
                   ], style=divHeaderStyle), # end Div
                   html.Div([dcc.Graph(id='abstract_histogram')])
                   
               ], width={"size": 3}), # end dbc.Col
               dbc.Col([
                   html.Div([
                       html.H5("Number of Keywords")
                   ], style=divHeaderStyle), # end Div
                   html.Div([dcc.Graph(id='keyword_histogram')])
               ], width={"size": 3}), # end Col
           ], style={'margin-top':'2px'}), # end third dbc.Row
       ], label='Dataset', style={'backgroundColor':'#dcdcdc'}), # end dbc.Tab
       # start of Vocabulary tab: a single table populated directly from the
       # `vocabulary` dataframe when the layout is built
       dbc.Tab([
           html.Table([
               html.Tr([html.Td([html.Div([], style={'height':'40px'})], colSpan=3)]), # end html.Tr
               html.Tr([
                   html.Td([html.Div([], style={'width':'250px'})]), # empty space
                   html.Td([
                       html.Div([html.H5('Vocabulary Table')], style=divHeaderStyle), # end Div
                       html.Div([
                           dash_table.DataTable(
                               id='vocab_table',
                               columns=[{"name": i, "id": i} for i in vocabulary.columns],
                               data=vocabulary.to_dict('records'),
                               #css=[{"selector": ".dash-table-container tr", 
                               #      "rule":'max-height: "1500px";'}],
                               sort_action="native",
                               filter_action='native',
                               fixed_rows={'headers': True},
                               style_table={'width':'1000px', 'overflowY': 'auto'},
                               style_cell={'whiteSpace': 'normal', 'textAlign': 'left', 'height':'auto',
                                                        'verticalAlign':'top',
                                                        # all three widths are needed
                                                        'minWidth': '100px', 'width': '500px', 'maxWidth': '800px',},
                               style_header=terms_style_header,
                               style_data=terms_style_data,
                               style_cell_conditional=[{'if': {'column_id': 'Term'}, 'width': '150px'}],
                               style_as_list_view=True),    # end DataTable
                       ], style={'backgroundColor':'#fff'}), # end html.Div
                   ]), # end html.Td
               ]), # end html.Tr
               html.Tr([html.Td([html.Div([], style={'height':'40px'})], colSpan=4)]), # end html.Tr
           ]), # end html.Table
       ], label='Vocabulary', style={'backgroundColor':'#dcdcdc'}), # end dbc.Tab
    ]), # end dbc.Tabs
    
    # dcc.Store inside the app that stores the intermediate value
    # 'clusters_summary' and 'predictions' are outputs of the generate_topics callback
    dcc.Store(id='clusters_summary'),
    dcc.Store(id='clusters_term'),
    dcc.Store(id='predictions'),
    dcc.Store(id='unique_terms'),         # a set of unique terms
    dcc.Store(id='terms_per_year')       # term distribution per year
]) # end html.Main

### Helper Functions

In [9]:
def get_topics_subset(df_in, clusters_list):
    '''Return the rows of df_in whose 'Topic Id' appears in clusters_list.'''
    is_selected = df_in['Topic Id'].isin(clusters_list)
    return df_in[is_selected]

In [10]:
def get_year_subset(low, high, df_in):
    '''Return the rows of df_in whose 'Year' is one of low, low+1, ..., high.'''
    wanted_years = range(low, high+1)
    in_range = df_in['Year'].isin(wanted_years)
    return df_in[in_range]

In [11]:
def get_unique_terms(df):
    """Return the set of distinct terms found in df['Terms'].

    Each 'Terms' cell is a string of terms separated by ', '.

    Empty tokens are discarded: joining and splitting an empty dataframe (or
    an empty 'Terms' cell) produces '' as a token, which would otherwise be
    counted as a term.
    """
    all_terms = ', '.join(list(df['Terms'].values)).split(', ')
    return {term for term in all_terms if term}

In [12]:
def get_topic_unique_terms(df_summary):
    '''Add a 'Unique Terms' column listing the terms that occur in one topic only.

    Parameters:
    - df_summary: a pandas dataframe with a 'Terms' column; each cell is a
      string of terms separated by ', '

    Returns df_summary itself, with the new 'Unique Terms' column added in
    place.  Within a topic the unique terms keep the order they have in 'Terms'.
    '''

    # flatten the terms of every topic into a single list
    all_terms = ', '.join(list(df_summary['Terms'].values)).split(', ')

    # count each term; the counts Series is used directly because the column
    # names produced by value_counts().reset_index() differ between pandas versions
    term_counts = pd.Series(all_terms).value_counts()

    # keep only the terms that appear exactly once over all topics
    singletons = set(term_counts[term_counts == 1].index)

    # get unique terms for each topic (list order instead of arbitrary set order)
    df_summary['Unique Terms'] = df_summary['Terms'].apply(
        lambda terms: ', '.join(term for term in terms.split(', ') if term in singletons))

    return df_summary

### Journal Trend Tab: Callback Functions

#### Button: Generate Topics

In [13]:
@app.callback(Output('clusters_summary', 'data'),
              Output('predictions', 'data'),
              Output('clusters_checkbox', 'options'),
              Output('btn_clear_filters', 'n_clicks'),
              Output("loading-output-1", "children"),
              Input('btn_submit', 'n_clicks'),
              State('num_clusters_slider', 'value'),
              State('bias_input', 'value'),
              prevent_initial_call=False)
def generate_topics(n_clicks, num_clusters_input, bias_amount_input):
    """Recompute the biased clusters and publish them to the dcc.Store components.

    Parameters:
    - n_clicks: click count of the submit button (only used as the trigger)
    - num_clusters_input: value of the number-of-clusters slider
    - bias_amount_input: value of the bias amount input

    Returns, in order: the topic summary as JSON, the predictions as JSON, the
    options of the topic checklist, the value 1 for the n_clicks of the
    clear-filters button, and the text 'Topics Loaded' for the loading output.
    """
    # both settings are kept in module-level globals because other callbacks read them
    global num_clusters, bias_amount
    num_clusters, bias_amount = num_clusters_input, bias_amount_input

    # get trending topics, then add the terms that are unique to each topic
    summary, predictions = cal_cluster_bias(data, x_vector, terms_matrix_df, terms_label,
                                            bias_amount, num_clusters)
    summary = get_topic_unique_terms(summary)

    # one check box per topic id, numbered from 1
    checkbox_options = [{'label': ' ' + str(topic_id), 'value': topic_id}
                        for topic_id in range(1, num_clusters + 1)]

    # dataframes are serialised to JSON so that other callbacks can load them
    return (summary.to_json(), predictions.to_json(),
            checkbox_options, 1, 'Topics Loaded')

#### Button: Clear Filters

In [14]:
@app.callback(Output('clusters_checkbox', 'value'),
              Input('btn_clear_filters', 'n_clicks'),
              prevent_initial_call=True)
def clear_filters(n_clicks):
    """Untick every check box of the Topic Filters.

    n_clicks only serves as the trigger; the returned empty list becomes the
    new value of the checklist.
    """
    no_selection = []
    return no_selection

#### Diagnostics Tab: Bias Evaluation Figure

In [15]:
@app.callback(Output('bias_evaluation_fig', 'figure'),
              Output('bias_eval_header', 'children'),
              Input('btn_submit', 'n_clicks'),
              State('num_clusters_slider', 'value'),
              prevent_initial_call=False)
def build_bias_evaluation_plot(btn_submit_n_clicks, num_clusters_input=None):
    """Plot the average trend score per bias amount for the selected k.

    Parameters:
    - btn_submit_n_clicks: click count of the submit button (only used as the trigger)
    - num_clusters_input: value of the number-of-clusters slider; when None the
      global num_clusters is used instead

    Returns the line figure and the header text 'Bias Amounts (k=<k>)'.

    The k-value is taken from the slider rather than from the global
    num_clusters: generate_topics rewrites that global on the same button
    click, and the order in which the two callbacks run is not guaranteed, so
    reading the global here could plot the results of the previous k.
    """
    k_value = num_clusters if num_clusters_input is None else num_clusters_input

    # load the model result for selected k-value
    df_result = pd.read_csv('results/results_' + str(k_value) + '.csv')
    df_result = df_result.rename(columns={'step': 'Bias Amount',
                                          'avg_silhouette_by_std_year': 'Average Trend Score'})

    # create a line plot of Bias Amount vs. Avg Trend Score
    fig = px.line(df_result, x='Bias Amount', y='Average Trend Score', markers=True)

    # annotate the cutoff point: the first row that reaches the highest score
    scores = df_result['Average Trend Score']
    cutoff = df_result[scores == scores.max()]
    cutoff_bias = cutoff['Bias Amount'].values[0]
    cutoff_score = cutoff['Average Trend Score'].values[0]
    fig.add_annotation(x=cutoff_bias, y=cutoff_score,
                       text="Cutoff (" + str(cutoff_bias) + ', ' + str(np.round(cutoff_score, 5)) + ')',
                       ay=-35, yanchor="bottom",
                       showarrow=True, arrowhead=1, arrowsize=1, arrowwidth=2)

    # set plot's properties
    fig.update_layout(xaxis_title_text='<b>Bias Amount</b>',
                      yaxis_title_text='<b>Average Trend Score</b>',
                      width=850, height=500)

    return fig, 'Bias Amounts (k=' + str(k_value) + ')'

#### Summary Div

In [16]:
def get_selected_topics(clusters_list_in, num_clusters):
    """Resolve the topic filter into a list of topic ids.

    Parameters:
    - clusters_list_in: the ticked topic ids, or None / an empty list when
      nothing is ticked
    - num_clusters: total number of topics

    Returns clusters_list_in unchanged when it holds at least one id,
    otherwise every topic id from 1 to num_clusters.
    """
    nothing_ticked = clusters_list_in is None or len(clusters_list_in) == 0
    if nothing_ticked:
        # no filter means "all topics"
        return list(range(1, num_clusters + 1))

    return clusters_list_in

In [17]:
# styles of the DataTable that shows a topic summary
topic_style_table = {'width': '270px'}
topic_style_header = {
    'backgroundColor': "paleturquoise",
    'border': '1px solid green',
    'textAlign': 'center',
    'fontWeight': 'bold',
    'height': 'auto',
    'whiteSpace': 'normal',
}
topic_style_data = {
    'backgroundColor': "lavender",
    'border': '1px solid white',
    'textAlign': 'center',
}
topic_style_cell = {
    'whiteSpace': 'normal',
    'height': 'auto',
    # all three widths are needed
    'minWidth': '40px',
    'width': '80px',
    'maxWidth': '100px',
}

# style of the header shown above a topic summary table
topic_header_style = {'height': '50px', 'fontSize': '14pt', 'color': 'blue'}


def build_summary_div(df_in, divHeader):
    """Build the html.Div that shows a summary dataframe below a header.

    Parameters:
    - df_in: a pandas dataframe; each of its columns becomes a DataTable column
    - divHeader: content of the header Div placed above the table

    Returns a single html.Div component.
    """
    # NOTE: the closing parenthesis must not be followed by a comma -- a trailing
    # comma would turn htmlDiv into a 1-tuple instead of the Div itself
    htmlDiv = html.Div([
        html.Div(divHeader, style=topic_header_style),
        html.Div([
            dash_table.DataTable(
                columns=[{"name": i, "id": i} for i in df_in.columns],
                data=df_in.to_dict('records'),
                style_table=topic_style_table,
                style_cell=topic_style_cell,
                style_header=topic_style_header,
                style_data=topic_style_data,
                style_as_list_view=True,
            ) # end DataTable
        ], style={'margin-left':'15px', 'margin-bottom':'15px'}), # end Div
    ]) # end Div

    return htmlDiv

In [18]:
@app.callback(Output("num_topics", "children"),
              Output('top_topic_div', 'children'),
              Output("bias_amount_span", "children"),
              Output('num_unique_terms', 'children'),
              Output('unique_terms_table', 'data'),
              Output('unique_terms_table', 'columns'),
              Output('unique_terms_table', 'filter_query'),
              Output('abstract_table', 'data'),
              Output('abstract_table', 'columns'),
              Output('abstract_table', 'filter_query'),
              Output('topic_clusters_table', 'data'),
              Output('topic_clusters_table', 'columns'),
              Output('topic_clusters_table', 'filter_query'),
              Input('clusters_checkbox', 'value'),
              Input('clusters_summary', 'data'),
              Input('predictions', 'data'))
def update_summary(clusters_list_in, summary_json, predictions_json):
    """Refresh the header figures and the three tables of the summary section.

    Parameters:
    - clusters_list_in: topic ids ticked in the topic filter (None or empty = all topics)
    - summary_json: topic summary dataframe serialised as JSON
    - predictions_json: predictions dataframe serialised as JSON

    Returns one value per Output, in the order of the decorator; the three
    filter_query outputs are reset to ''.
    """
    def as_table_columns(names):
        # column definitions in the format expected by a DataTable
        return [{"name": name, "id": name} for name in names]

    def format_score(value):
        # round to 4 decimals, then keep at most 7 characters of the text form
        return float(str(np.round(value, 4))[:7])

    # get a list of selected topics
    clusters_list = get_selected_topics(clusters_list_in, num_clusters)

    # topic summary restricted to the selected topics, numeric fields formatted
    summary = get_topics_subset(pd.read_json(summary_json), clusters_list)
    for score_col in ['Article %', 'Trend Score', 'Silhouette Score', 'Bias Avg Std Year']:
        summary[score_col] = summary[score_col].apply(format_score)

    # top trending topic = row with the highest trend score, shown as
    # one (field, value) pair per line
    top_fields = ['Topic Id', 'Timeline', 'Number of Articles',
                  'Article %', 'Trend Score', 'Silhouette Score',
                  'Bias Avg Std Year']
    best_row = summary[top_fields].sort_values(by='Trend Score', ascending=False).iloc[0]
    top_topic = pd.DataFrame(best_row).reset_index()
    top_topic = top_topic.rename(columns={top_topic.columns[-1]: 'Summary', 'index': ' '})
    top_topic_div = build_summary_div(top_topic, 'Top Trending Topic')

    # count all unique terms
    num_unique_terms = len(get_unique_terms(summary))

    # data for unique_terms_table
    topic_unique_terms = summary[['Topic Id', 'Unique Terms']]
    unique_terms_rows = topic_unique_terms.to_dict('records')
    unique_terms_cols = as_table_columns(topic_unique_terms.columns)

    # data for the abstracts table, ordered by topic
    abstract_fields = ['Topic Id', 'Abstract']
    predictions = get_topics_subset(pd.read_json(predictions_json), clusters_list)
    predictions = predictions.sort_values(by='Topic Id')
    abstract_rows = predictions[abstract_fields].to_dict('records')
    abstract_cols = as_table_columns(abstract_fields)

    # data for the topic clusters table
    cluster_fields = ['Topic Id', 'Terms', 'Timeline', 'Number of Articles', 'Article %',
                      'Trend Score', 'Silhouette Score', 'Bias Avg Std Year']
    cluster_rows = summary[cluster_fields].to_dict('records')
    cluster_cols = as_table_columns(cluster_fields)

    return (html.H2(str(len(clusters_list))), top_topic_div, html.H2(str(bias_amount)),
            html.H2(str(num_unique_terms)),
            unique_terms_rows, unique_terms_cols, '',
            abstract_rows, abstract_cols, '',
            cluster_rows, cluster_cols, '')

#### Clusters Distribution per Year

In [19]:
def build_trend_topics_plot(predictions_json, clusters_list_in, figWidth, figHeight):
    """Draw the number of articles per year as one line per selected topic.

    Parameters:
    - predictions_json: predictions dataframe serialised as JSON
    - clusters_list_in: topic ids ticked in the topic filter (None or empty = all topics)
    - figWidth: width of the figure
    - figHeight: height of the figure

    Returns a plotly go.Figure; each topic is coloured with its entry of the
    global clusters_color list.
    """
    # get a list of selected topics
    clusters_list = get_selected_topics(clusters_list_in, num_clusters)

    # clusters distribution of the selected topics, ordered by Year
    all_dist = get_clusters_dist(pd.read_json(predictions_json))
    selected_dist = all_dist[all_dist['Topic Id'].isin(clusters_list)]
    selected_dist = selected_dist.assign(
        **{'Topic Id': selected_dist['Topic Id'].astype('category')})
    selected_dist = selected_dist.sort_values(by='Year')

    # empty figure with a fixed size
    fig_line = go.Figure(layout=go.Layout(autosize=False, width=figWidth, height=figHeight))

    # one smoothed line per topic
    for topic_id in clusters_list:
        topic_rows = selected_dist[selected_dist['Topic Id'] == topic_id]
        topic_color = clusters_color[topic_id - 1]    # topic ids start at 1
        fig_line.add_traces(data=go.Scatter(x=topic_rows['Year'],
                                            y=topic_rows['Number of Articles'],
                                            mode='lines+markers',
                                            line_shape='spline',
                                            name=str(topic_id),
                                            connectgaps=True,
                                            line_color=topic_color,
                                            marker_color=topic_color))

    # axis titles and a horizontal legend above the plot
    fig_line.update_layout(xaxis_title="<b>Year</b>",
                           yaxis_title="<b>Number of Articles</b>",
                           legend_title="Topic Id",
                           legend=dict(orientation="h", yanchor="top", y=1.20, xanchor="left", x=0))

    return fig_line

In [20]:
@app.callback(Output('clusters_dist_line_simple_ui', 'figure'),
              Input('predictions', 'data'),
              Input('clusters_checkbox', 'value'))
def build_trend_plot_simple_ui(predictions_json, clusters_list_in):
    """Trend plot of the simple UI: 720 wide, with the global figHeight as height."""
    return build_trend_topics_plot(predictions_json, clusters_list_in,
                                   figWidth=720, figHeight=figHeight)

In [21]:
@app.callback(Output('clusters_dist_line', 'figure'),
              Input('predictions', 'data'),
              Input('clusters_checkbox', 'value'))
def build_trend_plot_advanced_ui(predictions_json, clusters_list_in):
    """Trend plot of the advanced UI: 680 wide, with the global figHeight as height."""
    return build_trend_topics_plot(predictions_json, clusters_list_in,
                                   figWidth=680, figHeight=figHeight)

#### Clusters Timeline

In [22]:
def prepare_timeline_for_Gantt(timeline_df):
    '''Expand the topic timelines into one row per time period for a Gantt chart.

    Parameters:
    - timeline_df: a pandas dataframe with columns 'Topic Id' and 'Timeline';
      a timeline is a string of periods separated by ', ', each period being
      either a single year or "<first year>-<last year>"

    Returns a dataframe with columns 'Topic Id' (as string), 'Start'
    (January 1st of the first year) and 'Finish' (December 31st of the last year).
    '''
    topic_ids, starts, finishes = [], [], []

    rows = zip(timeline_df['Topic Id'].values, timeline_df['Timeline'].values)
    for cluster_id, timeline in rows:
        for period in timeline.split(', '):
            # a single-year period has the same first and last year
            years = period.split('-')
            first_year, last_year = years[0], years[-1]

            topic_ids.append(str(cluster_id))
            starts.append(first_year + '-01-01')
            finishes.append(last_year + '-12-31')

    return pd.DataFrame({'Topic Id': topic_ids, 'Start': starts, 'Finish': finishes})

In [23]:
@app.callback(Output('clusters_timeline', 'figure'),
              Input('clusters_summary', 'data'),
              Input('clusters_checkbox', 'value'))
def build_clusters_timeline(summary_json, clusters_list_in):
    """Draw the timeline of the selected topics as a Gantt chart.

    Parameters:
    - summary_json: topic summary dataframe serialised as JSON
    - clusters_list_in: topic ids ticked in the topic filter (None or empty = all topics)

    Returns a plotly timeline figure with one row per topic.
    """
    # get a list of selected topics
    clusters_list = get_selected_topics(clusters_list_in, num_clusters)

    # timeline periods of the selected topics
    summary = get_topics_subset(pd.read_json(summary_json), clusters_list)
    timeline_data = prepare_timeline_for_Gantt(summary[['Topic Id', 'Timeline']])

    # colour of each topic, keyed by the topic id as string (topic ids start at 1)
    color_by_topic = {str(topic_id): clusters_color[topic_id - 1] for topic_id in clusters_list}

    # create Gantt chart
    fig = px.timeline(timeline_data, x_start="Start", x_end="Finish", y="Topic Id",
                      color="Topic Id", color_discrete_map=color_by_topic)

    # set title for axes
    fig.layout.xaxis.title = "<b>Year</b>"
    fig.layout.yaxis.title = "<b>Topic Id</b>"

    # figure size and a horizontal legend above the plot
    fig.update_layout(width=630, height=380,
                      legend=dict(orientation="h", yanchor="top", y=1.20, xanchor="left", x=0))

    return fig

#### Cluster Terms

In [24]:
from wordcloud import WordCloud
from io import BytesIO
import base64
import matplotlib.pyplot as plt

def plot_wordcloud(data_in):
    """Render a word cloud image from a two-column dataframe.

    Parameters:
    - data_in: a pandas dataframe whose rows are (word, frequency) pairs

    Returns the image produced by WordCloud.to_image().
    """
    frequencies = {}
    for word, freq in data_in.values:
        frequencies[word] = freq

    cloud = WordCloud(background_color='white', width=490, height=380)
    cloud.fit_words(frequencies)
    return cloud.to_image()

@app.callback(Output('cluster_terms_fig', 'src'),
              Input('clusters_summary', 'data'),
              Input('clusters_checkbox', 'value'))
def create_cluster_terms_plot(summary_json, clusters_list_in):
    """Build the word cloud of the terms of the selected topics.

    Parameters:
    - summary_json: topic summary dataframe serialised as JSON
    - clusters_list_in: topic ids ticked in the topic filter (None or empty = all topics)

    Returns the PNG image encoded as a base64 data URI.
    """
    # get a list of cluster ids
    clusters_list = get_selected_topics(clusters_list_in, num_clusters)

    # terms of the selected topics, flattened into one list
    summary = get_topics_subset(pd.read_json(summary_json), clusters_list)
    all_terms = ', '.join(list(summary['Terms'].values)).split(', ')

    # term frequency data frame: one row per term with its number of occurrences
    term_freq = (pd.DataFrame({'Term': all_terms})['Term']
                 .value_counts()
                 .reset_index()
                 .rename(columns={'Term': 'Freq', 'index': 'Term'}))

    # draw the word cloud into an in-memory PNG
    png_buffer = BytesIO()
    plot_wordcloud(term_freq).save(png_buffer, format='PNG')
    encoded_png = base64.b64encode(png_buffer.getvalue()).decode()

    return 'data:image/png;base64,{}'.format(encoded_png)

#### Metric Scores Bar Chart

In [25]:
def build_metric_barchart(summary, score_label):
    """Draw a bar chart of one metric per topic with a dashed mean line.

    Parameters:
    - summary: topic summary dataframe with a 'Topic Id' column and the metric column
    - score_label: name of the metric column; also used as the y-axis title

    Returns a plotly figure; bars are ordered by descending score and each
    topic is coloured with its entry of the global clusters_color list.
    """
    # highest score first
    ranked = summary.sort_values(by=score_label, ascending=False)

    # colours in bar order; read the ids before they are turned into strings
    ordered_ids = list(ranked['Topic Id'].values)
    bar_colors = [clusters_color[topic_id - 1] for topic_id in ordered_ids]

    # topic ids as strings
    ranked['Topic Id'] = ranked['Topic Id'].map(str)

    # create the bar plot (topics on the x-axis)
    fig = px.bar(ranked,
                 x='Topic Id', y=score_label,
                 color='Topic Id',
                 color_discrete_sequence=bar_colors)
    fig.update_traces(textposition='outside')

    # add horizontal mean line
    mean_score = round(summary[score_label].mean(), 4)
    fig.add_hline(y=mean_score, line_dash="dash", line_color='red',
                  annotation_text="Mean = " + str(mean_score),
                  annotation_position="top right",
                  annotation_font_size=17,
                  annotation_font_color="red")

    # set axis label
    fig.layout.yaxis.title = '<b>' + score_label + '</b>'
    fig.layout.xaxis.title = '<b>Topic Id</b>'

    # figure size and a horizontal legend above the plot
    fig.update_layout(width=600, height=377,
                      legend=dict(orientation="h", yanchor="top", y=1.20, xanchor="left", x=0))
    return fig

In [26]:
@app.callback(Output('trend_score', 'figure'),
              Output('sil_score_fig', 'figure'),
              Output('std_years_fig', 'figure'),
              Output('articles_pct', 'figure'),
              Input('clusters_summary', 'data'),
              Input('clusters_checkbox', 'value'))
def create_metric_plots(summary_json, clusters_list_in):
    """Build the four metric bar charts of the selected topics.

    Parameters:
    - summary_json: topic summary dataframe serialised as JSON
    - clusters_list_in: topic ids ticked in the topic filter (None or empty = all topics)

    Returns the figures for 'Trend Score', 'Silhouette Score',
    'Bias Avg Std Year' and 'Article %', in that order.
    """
    # get a list of selected topics
    clusters_list = get_selected_topics(clusters_list_in, num_clusters)

    # get topic summary
    summary = get_topics_subset(pd.read_json(summary_json), clusters_list)

    # one bar chart per metric, in the order of the Outputs
    score_labels = ('Trend Score', 'Silhouette Score', 'Bias Avg Std Year', 'Article %')
    return tuple(build_metric_barchart(summary, label) for label in score_labels)

### Data Tab: Callback Functions

#### Placeholder: Header Texts

In [27]:
@app.callback(Output('num_unique_keywords', 'children'),
              Output('year_range', 'children'),
              Output('keyword_table', 'data'),
              Output('keyword_table', 'columns'),
              Input('year_slider', 'value'))
def update_data(slider_range):
    """Refresh the header texts and the keyword table for the selected years.

    Parameters:
    - slider_range: the (low, high) pair of years of the year slider

    Returns the number of unique keywords and the year range as html.H2
    components, followed by the rows and the columns of the keyword table.
    """
    low, high = slider_range

    # unique keywords of the selected years
    keywords_subset = get_year_subset(low, high, keywords_dist)
    unique_keywords = list(set(keywords_subset['Token'].values))

    # rows of keyword_counts (with timeline) for those keywords
    kw_counts_subset = keyword_counts[keyword_counts['Keyword'].isin(unique_keywords)]
    kw_counts_cols = [{"name": col, "id": col} for col in kw_counts_subset.columns]

    # number of unique keywords, with a comma as thousands separator
    num_keywords = format(len(unique_keywords), ',')

    # a single year is shown alone, otherwise "low-high"
    if low == high:
        year_range = str(low)
    else:
        year_range = str(low) + '-' + str(high)

    return (html.H2(num_keywords), html.H2(year_range),
            kw_counts_subset.to_dict('records'), kw_counts_cols)

#### Summary Statistics

In [28]:
def build_data_stats(df_in, col_name=''):
    """Build the descriptive statistics table of one numeric column.

    Parameters:
    - df_in: a pandas Series of numbers (despite the name, not a dataframe)
    - col_name: header of the value column; when empty the name of the
      Series is used (0 if the Series has no name)

    Returns a dataframe with six rows -- Min, Max, Mean, Median, STD and Mode --
    and two columns: '' holding the row labels and the value column.
    Statistics are rounded to integers.  The mode is rendered as text such as
    "3 (12 counts)", where the count is the number of occurrences of the
    first mode.

    A statistic that cannot be computed is shown as 'N/A' instead of raising:
    the standard deviation of a single value is NaN, and an empty input has
    neither statistics nor a mode.
    """
    stat_labels = [('min', 'Min'), ('max', 'Max'), ('mean', 'Mean'),
                   ('50%', 'Median'), ('std', 'STD')]
    missing = 'N/A'

    # rounded descriptive statistics; NaN cannot be cast to an integer
    described = df_in.describe()
    values = []
    for stat_key, _ in stat_labels:
        stat = described.get(stat_key, np.nan)
        values.append(missing if pd.isna(stat) else int(np.round(stat)))

    # get the mode(s) and the number of occurrences of the first one
    modes = list(df_in.mode())
    if modes:
        mode_count = int((df_in == modes[0]).sum())
        # exactly two modes are joined with "and", any other number with a comma
        separator = ' and ' if len(modes) == 2 else ', '
        mode_text = separator.join(map(str, modes)) + ' (' + str(mode_count) + ' counts)'
    else:
        mode_text = missing

    # header of the value column
    if len(col_name) > 0:
        value_header = col_name
    elif df_in.name is not None:
        value_header = df_in.name
    else:
        value_header = 0

    row_labels = [label for _, label in stat_labels] + ['Mode']
    rows = list(zip(row_labels, values + [mode_text]))
    return pd.DataFrame(rows, columns=['', value_header])

In [29]:
# styles of the DataTables that show summary statistics
stats_style_table = {'width': '320px'}
stats_style_header = {
    'backgroundColor': "paleturquoise",
    'border': '1px solid green',
    'textAlign': 'center',
    'fontWeight': 'bold',
    'height': 'auto',
    'whiteSpace': 'normal',
}
stats_style_data = {
    'backgroundColor': "lavender",
    'border': '1px solid white',
    'textAlign': 'center',
}

# style of the header shown above a statistics table
stats_header_style = {'height': '50px', 'fontSize': '14pt', 'color': 'blue'}


def build_stats_table(df_summary, col_name):
    """Build the html.Div that shows a statistics dataframe below a header.

    Parameters:
    - df_summary: a pandas dataframe; each of its columns becomes a DataTable column
    - col_name: content of the header Div placed above the table

    Returns a single html.Div component.
    """
    header = html.Div(col_name, style=stats_header_style)

    table = dash_table.DataTable(
        columns=[{"name": name, "id": name} for name in df_summary.columns],
        data=df_summary.to_dict('records'),
        style_table=stats_style_table,
        style_header=stats_style_header,
        style_data=stats_style_data,
        style_as_list_view=True,
    )
    table_holder = html.Div([table], style={'margin-left':'25px', 'margin-bottom':'15px'})

    return html.Div([header, table_holder])

In [30]:
@app.callback(Output('num_articles', 'children'),
              Output('stats_abstract', 'children'),
              Output('stats_num_keywords', 'children'),
              Output('stats_year', 'children'),
              Output('stats_month', 'children'),
              Input('year_slider', 'value'))
def update_stats_tables(slider_range):
    """Refresh the article count and the four statistics tables for the selected years.

    Parameters:
    - slider_range: the (low, high) pair of years of the year slider

    Returns the number of articles as an html.H2, followed by the statistics
    Divs of the abstract length, the number of keywords, the year and the month.
    """
    low, high = slider_range

    # articles of the selected years
    data_subset = get_year_subset(low, high, data)

    def stats_div(column, title, value_header=''):
        # statistics of one column wrapped in its titled Div
        return build_stats_table(build_data_stats(data_subset[column], value_header), title)

    # total number of articles, with a comma as thousands separator
    num_articles = format(data_subset.shape[0], ',')

    return (html.H2(num_articles),
            stats_div('Abstract Length', 'Abstract Length'),
            stats_div('Number of Keywords', 'Number of Keywords'),
            stats_div('Year', 'Year'),
            # the month comes from 'Month_Cleaned' but is displayed as 'Month'
            stats_div('Month_Cleaned', 'Month', 'Month'))

#### Monthly Article Distribution

In [31]:
@app.callback(Output('monthly_dist_plot', 'figure'),
              Input('year_slider', 'value'))
def update_montly_dist_plot(slider_range):
    """Plot the average number of articles per month over the selected years.

    Parameters:
    - slider_range: the (low, high) pair of years of the year slider

    Returns a figure made of an orange line with the rounded averages
    printed on white markers.
    """
    low, high = slider_range
    col_label = 'Average Number of Articles'

    # articles of the selected years; rows with Month_Cleaned == -1 are left out
    # (presumably the placeholder of an unknown month -- TODO confirm)
    year_dist = get_year_subset(low, high, data)
    with_month = year_dist[year_dist['Month_Cleaned'] != -1]

    # number of articles per (month, year), then the mean of those counts per month
    per_month_year = with_month.groupby(['Month_Cleaned', 'Year'],
                                        as_index=False)['Year'].agg('count')
    df = per_month_year.groupby(['Month_Cleaned'], as_index=False).agg('mean')
    df = df.rename(columns={'Year': col_label})

    # round average number of articles to nearest integer
    df[col_label] = df[col_label].map(round)

    def short_month_name(month):
        # numeric month -> short month name
        return datetime.datetime.strptime(str(month), "%m").strftime('%b')

    df['Month'] = df['Month_Cleaned'].apply(short_month_name)
    df = df.sort_values(by='Month_Cleaned', ascending=True)

    num_months = df.shape[0]

    # line through the monthly averages
    line_fig = px.line(df, x="Month", y=col_label,
                       color_discrete_sequence=['orange'] * num_months)

    # markers carrying the value as centered text
    marker_fig = px.scatter(df, x="Month", y=col_label, text=col_label,
                            color_discrete_sequence=['white'] * num_months,
                            size=[1.4] * num_months)
    marker_fig.update_traces(textposition="middle center")

    # combine both into one figure
    fig = go.Figure(data=line_fig.data + marker_fig.data)

    # set axis labels
    fig.layout.yaxis.title = '<b>' + col_label + '</b>'
    fig.layout.xaxis.title = '<b>Month</b>'

    # Set figure's size
    fig.update_layout(width=700, height=figHeight)

    return fig

#### Figure: Frequency Distribution per Year

In [32]:
@app.callback(Output('freq_dist_plot', 'figure'),
              Input('year_slider', 'value'))
def update_freq_dist_plot(slider_range):
    """Plot the yearly frequencies of keywords, articles and abstract length.

    Parameters:
    - slider_range: the (low, high) pair of years of the year slider

    Returns a figure with four lines: total keywords, unique keywords,
    articles and median abstract length per year.  The years 2010 to 2014 are
    highlighted as "Peak Period" when the range starts before 2010 and ends
    after 2014.
    """
    low, high = slider_range

    # get subset of data
    keywords_subset = get_year_subset(low, high, keywords_per_year)
    articles_subset = get_year_subset(low, high, article_dist)
    data_subset = get_year_subset(low, high, data)

    # get median abstract length per year
    abstract_len = data_subset.groupby('Year')['Abstract Length'].median().reset_index().rename(
        columns={'Abstract Length': 'Median Abstract Length'})

    # (dataframe, y column, legend name) of each line, in drawing order
    trace_specs = [
        (keywords_subset, 'Number of Keywords', 'Total Number of Keywords'),
        (keywords_subset, 'Number of Unique Keywords', 'Total Number of Unique Keywords'),
        (articles_subset, 'Number of Articles', 'Total Number of Articles'),
        (abstract_len, 'Median Abstract Length', 'Median Abstract Length'),
    ]

    fig = go.Figure()
    for frame, y_column, legend_name in trace_specs:
        fig.add_trace(go.Scatter(x=frame['Year'], y=frame[y_column],
                                 mode='lines+markers', name=legend_name))

    # highlight the peak period only when it lies strictly inside the range
    if low < 2010 and high > 2014:
        fig.add_vrect(x0="2010", x1="2014",
                      annotation_text="Peak Period", annotation_position="top left",
                      fillcolor="orange", opacity=0.25, line_width=0)

    # set title for axes
    fig.layout.xaxis.title = "<b>Year</b>"
    fig.layout.yaxis.title = "<b>Frequency</b>"

    # figure size and a horizontal legend above the plot
    fig.update_layout(legend=dict(orientation="h", yanchor="bottom", y=1.01, xanchor="right", x=0.98),
                      width=800, height=figHeight)

    return fig

#### Figure: Abstract & Keyword Histograms

In [33]:
def build_histogram(df_in, col, y_label='Number of Articles'):
    '''
    Create and return a histogram of data distribution, with dashed vertical
    lines at the rounded mean (red) and the rounded median (green).

    When both rounded values are equal a single line labelled
    "Mean = Median = ..." is drawn.

    Parameters:
    - df_in: a pandas dataframe
    - col: name of column
    - y_label: label of the y-axis
    '''

    # draw the histogram
    fig = px.histogram(df_in, x=col)

    # compute mean and median
    mean_len = round(df_in[col].mean())
    med_len = round(df_in[col].median())

    if mean_len == med_len:
        # one line stands for both values
        mean_label = 'Mean = Median = ' + str(mean_len)
        mean_pos = 'top right'
    else:
        mean_label = 'Mean = ' + str(mean_len)

        # each annotation goes to the outer side of its line so they do not overlap
        if med_len > mean_len:
            mean_pos, median_pos = 'top left', 'top right'
        else:
            mean_pos, median_pos = 'top right', 'top left'

        # draw vertical line for median
        fig.add_vline(x=med_len,
                      line_width=3, line_dash="dash", line_color="#3cb371",
                      annotation_text="Median = " + str(med_len),
                      annotation_position=median_pos,
                      annotation_font_color="#3cb371")

    # draw vertical line for mean
    fig.add_vline(x=mean_len,
                  line_width=3, line_dash="dash", line_color="red",
                  annotation_text=mean_label,
                  annotation_position=mean_pos,
                  annotation_font_color="red")

    # set labels for the axes
    fig.layout.xaxis.title = '<b>' + col + '</b>'
    fig.layout.yaxis.title = '<b>' + y_label + '</b>'

    # Set figure's height
    fig.update_layout(height=figHeight)

    return fig

In [34]:
@app.callback(Output('abstract_histogram', 'figure'),
              Output('keyword_histogram', 'figure'),
              Input('year_slider', 'value'))
def update_histograms(slider_range):
    """Rebuild the abstract length and number of keywords histograms.

    Parameters:
    - slider_range: the (low, high) pair of years of the year slider

    Returns the abstract length histogram followed by the keyword histogram.
    """
    low, high = slider_range

    # articles of the selected years
    data_subset = get_year_subset(low, high, data)

    # one histogram per column, in the order of the Outputs
    histogram_cols = ('Abstract Length', 'Number of Keywords')
    return tuple(build_histogram(data_subset, col) for col in histogram_cols)

### Running Dash App

In [35]:
# start the Dash server only when this code runs as the main module
if __name__ == "__main__":
    app.run_server(
        mode="external",
        port=9000,
        debug=False,
    )

 * Running on http://127.0.0.1:9000/ (Press CTRL+C to quit)
127.0.0.1 - - [15/Dec/2021 01:01:48] "[37mGET /_alive_764cb49c-d735-4210-b0d4-b48569ec3198 HTTP/1.1[0m" 200 -


Dash app running on http://127.0.0.1:9000/


127.0.0.1 - - [15/Dec/2021 01:01:51] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [15/Dec/2021 01:01:51] "[37mGET /_dash-dependencies HTTP/1.1[0m" 200 -
127.0.0.1 - - [15/Dec/2021 01:01:51] "[37mGET /_dash-layout HTTP/1.1[0m" 200 -
127.0.0.1 - - [15/Dec/2021 01:01:51] "[37mGET /_dash-component-suites/dash_core_components/async-graph.js HTTP/1.1[0m" 200 -
127.0.0.1 - - [15/Dec/2021 01:01:51] "[37mGET /_dash-component-suites/dash_table/async-highlight.js HTTP/1.1[0m" 200 -
127.0.0.1 - - [15/Dec/2021 01:01:51] "[37mGET /_dash-component-suites/dash_table/async-table.js HTTP/1.1[0m" 200 -
127.0.0.1 - - [15/Dec/2021 01:01:51] "[37mGET /_dash-component-suites/dash_core_components/async-slider.js HTTP/1.1[0m" 200 -
127.0.0.1 - - [15/Dec/2021 01:01:51] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [15/Dec/2021 01:01:51] "[37mGET /_dash-component-suites/dash_core_components/async-plotlyjs.js HTTP/1.1[0m" 200 -
127.0.0.1 - - [15/Dec/2021 01:01:51] "[37mPOST 