### Install and load required packages

In [1]:
# NOTE: the install commands below sit inside a bare string literal, so running
# this cell installs nothing; the string is only echoed back as the cell output.
'''
remove docstring to install required packages

!pip install sentence_transformers
!conda install -c conda-forge hdbscan -y
!pip install bertopic
!pip install cudf-cu11 dask-cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
!pip install cuml-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
!pip install cugraph-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
!pip uninstall cupy-cuda115 -y
!pip uninstall cupy-cuda11x -y
!pip install cupy-cuda11x -f https://pip.cupy.dev/aarch64
!pip install ipywidgets --upgrade
!pip install dash
!pip install --upgrade ipykernel
!pip install jupyterlab-dash
!pip install dash_bootstrap_components
!pip install wordcloud
'''

'\nremove docstring to install required packages\n\n!pip install sentence_transformers\n!conda install -c conda-forge hdbscan -y\n!pip install bertopic\n!pip install cudf-cu11 dask-cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com\n!pip install cuml-cu11 --extra-index-url=https://pypi.ngc.nvidia.com\n!pip install cugraph-cu11 --extra-index-url=https://pypi.ngc.nvidia.com\n!pip uninstall cupy-cuda115 -y\n!pip uninstall cupy-cuda11x -y\n!pip install cupy-cuda11x -f https://pip.cupy.dev/aarch64\n!pip install ipywidgets --upgrade\n!pip install dash\n!pip install --upgrade ipykernel\n!pip install jupyterlab-dash\n!pip install dash_bootstrap_components\n!pip install wordcloud\n'

In [2]:
# imports
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from bertopic import BERTopic
from bertopic.backend._utils import select_backend
import pandas as pd
import numpy as np
import os
import dash
from dash import html
import dash_core_components as dcc
import plotly.express as px
from dash.dependencies import Input, Output
import dash_bootstrap_components as dbc
from dash import Dash, dash_table
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import base64
from io import BytesIO

### Prepare the data

In [3]:
# read preprocessed data
# (the file name is relative, so it is resolved against the working directory;
# the columns Abstract, Title, Year, Journal and Name are read from this frame below)
df = pd.read_csv('processed_df.csv')

In [4]:
# Save some important key data from df as plain lists so it can be
# integrated into the data table later.
docs, title, year, journal, author = (
    df[column].tolist()
    for column in ("Abstract", "Title", "Year", "Journal", "Name")
)

In [5]:
# define a custom vectorizer class
class CustomVectorizer(CountVectorizer):
    """CountVectorizer that deletes configurable phrases before tokenizing.

    Parameters
    ----------
    stop_grams : list of str, optional
        Phrases that are removed from every document (plain substring
        replacement) before it is tokenized. They are matched against the
        lower-cased, preprocessed text, so they should be given in lower case.
        Defaults to no phrases.
    **opts
        Forwarded unchanged to ``CountVectorizer.__init__``.

    Notes
    -----
    Phrase removal and tokenization operate on the preprocessed text. A
    vocabulary that was fitted while the analyzer still tokenized the raw,
    case-preserving text can contain mixed-case terms and has to be refitted
    before it is used with this analyzer.
    """

    # class-level default, kept so the attribute also exists on the class itself
    stop_grams = []

    def __init__(self, stop_grams=None, **opts):
        # None instead of a mutable ``[]`` default: a default list object would
        # be shared by every instance created without an explicit argument.
        self.stop_grams = [] if stop_grams is None else stop_grams
        super().__init__(**opts)

    def remove_ngrams(self, doc):
        """Return ``doc`` with every occurrence of each stop gram removed."""
        for stop_gram in self.stop_grams:
            doc = doc.replace(stop_gram, "")
        return doc

    # overwrite the build_analyzer method, allowing one to
    # create a custom analyzer for the vectorizer
    def build_analyzer(self):
        """Return a callable that turns one document into its list of n-grams."""

        # load stop words using CountVectorizer's built in method; this is
        # None when no stop words are configured, which _word_ngrams accepts
        # (wrapping it in list() would raise a TypeError in that case)
        stop_words = self.get_stop_words()

        preprocessor = self.build_preprocessor()
        tokenizer = self.build_tokenizer()
        remove_ngrams = self.remove_ngrams

        # create the analyzer that will be returned by this method
        def analyser(doc):

            # apply the preprocessing step
            doc_clean = preprocessor(doc.lower())

            # remove phrase stopwords from the *preprocessed* text, so the
            # preprocessing above is not thrown away
            doc_clean = remove_ngrams(doc_clean)

            # tokenize using default tokenizer
            tokens = tokenizer(doc_clean)

            # use CountVectorizer's _word_ngrams built in method
            # to remove stop words and extract n-grams
            return self._word_ngrams(tokens, stop_words)

        return analyser

In [6]:
# Load embeddings and model
# Precomputed document embeddings; passed to visualize_documents further below.
embeddings = np.load("embeddings.npy")
embedding_model = SentenceTransformer('all-mpnet-base-v2')
# Wrap the sentence-transformer with BERTopic's backend selector (imported from
# the private module bertopic.backend._utils) and hand it to the saved model on load.
# NOTE(review): presumably the saved model references CustomVectorizer, which
# would explain why that class is defined in this notebook — confirm.
model = select_backend(embedding_model)
model_bert = BERTopic.load("final_model", embedding_model = model)

In [7]:
# get the topics
topics = model_bert.get_topics()

# get the topic labels
# Other cells index this sequence as labels[topic_number + 1],
# i.e. labels[0] is used for topic -1.
labels = model_bert.custom_labels_

### Visualizations

In [8]:
# Topics over time
# The topics-over-time table is read from a CSV file instead of being
# computed in this notebook; the resulting figure is embedded in the dashboard.
topics_over_time = pd.read_csv('tot_both.csv')
tot = model_bert.visualize_topics_over_time(topics_over_time, custom_labels = True)

In [9]:
# Visualization of documents with titles instead of abstract in the plot when hovering over
# (the list of titles is passed as the documents, together with the precomputed embeddings)
vis_docs = model_bert.visualize_documents(title, embeddings = embeddings, width = 1250, custom_labels = True)

In [10]:
# Similarity matrix
# Heatmap figure of the topics; embedded in the dashboard layout as graph "vis_heatmap".
vis_heatmap = model_bert.visualize_heatmap(width = 1250, custom_labels = True)

In [11]:
# Topics per Journal
# NOTE(review): `classes` is not referenced again in the code visible here —
# confirm whether it is still needed.
classes = list(df["Journal"])
# The topics-per-class table is read from a CSV file instead of being computed here.
topics_per_class = pd.read_csv('tpc.csv')
# NOTE(review): top_n_topics = 66 is a hard-coded limit — confirm it matches
# the number of topics that should be shown.
tpc = model_bert.visualize_topics_per_class(topics_per_class, top_n_topics = 66, width = 1250, custom_labels = True)

In [12]:
# Save info about documents and topics in a suitable dataframe
docs_info = model_bert.get_document_info(docs)
docs_info['Title'] = title
docs_info['Year'] = year
docs_info['Journal'] = journal
docs_info['Author'] = author

# Replace the generated topic names by the custom labels; labels[i] belongs to
# topic number i - 1. A single .loc assignment is used instead of chained
# indexing (df['Name'][mask] = ...), which writes to a temporary object under
# pandas Copy-on-Write and raises SettingWithCopyWarning otherwise.
for i in range(len(topics)):
    docs_info.loc[docs_info.Topic == i - 1, 'Name'] = labels[i]

docs_info = docs_info.rename(columns={'Name': 'Topicname', 'Top_n_words': 'Top 15 words'})

# Keep only the columns shown in the dashboard table; every other column
# (e.g. Representative_document) is dropped by this selection.
docs_info = docs_info[['Title', 'Year', 'Author', 'Journal', 'Topic', 'Topicname', 'Document', 'Top 15 words','Probability']]

# Displayed as the cell output only; docs_info itself keeps its original row order.
docs_info.sort_values(by='Topic')

Unnamed: 0,Title,Year,Author,Journal,Topic,Topicname,Document,Top 15 words,Probability
14038,We Don’T Know We Don’T Know: Asserting Ignorance.,2019,Massimiliano Carrara,Synthese,-1,Stop words,The pragmatic logic of assertions shows a conn...,paper - one - argument - knowledge - show - ep...,0.711914
4652,Requirements of intention in light of belief.,2020,Carlos Núñez,Philosophical Studies,-1,Stop words,Much work in the philosophy of action in the l...,paper - one - argument - knowledge - show - ep...,0.425701
4648,"Perdurantism, Fecklessness and the Veil of Ign...",2020,Michael Tze-Sung Longenecker,Philosophical Studies,-1,Stop words,There has been a growing charge that perdurant...,paper - one - argument - knowledge - show - ep...,0.420331
4645,"We talk to people, not contexts.",2020,Daniel W. Harris,Philosophical Studies,-1,Stop words,"According to a popular family of theories, ass...",paper - one - argument - knowledge - show - ep...,0.526087
4636,“Ontological Commitment and Ontological Commit...,2020,Jared Warren,Philosophical Studies,-1,Stop words,The standard account of ontological commitment...,paper - one - argument - knowledge - show - ep...,0.837428
...,...,...,...,...,...,...,...,...,...
11979,The Analytic-Synthetic Distinction and the Cla...,2010,Willem R. De Jong,Synthese,88,Quine,This paper concentrates on some aspects of the...,Quine - analyticity - analytic - Two Dogmas - ...,1.000000
10088,"Quine, Analyticity and Philosophy of Mathematics.",2004,John P. Burgess,Philosophical Quarterly,88,Quine,Quine correctly argues that Carnaps distinctio...,Quine - analyticity - analytic - Two Dogmas - ...,1.000000
14691,Logic in Analytic Philosophy: A Quantitative A...,2020,Guido Bonino,Synthese,88,Quine,"Using quantitative methods, we investigate the...",Quine - analyticity - analytic - Two Dogmas - ...,1.000000
2110,On Quine on Carnap on Ontology.,2001,Marc Alspector-Kelly,Philosophical Studies,88,Quine,"W. V. Quine assumed that in Empiricism, Semant...",Quine - analyticity - analytic - Two Dogmas - ...,1.000000


### Dashboard

In [None]:
# build dashboard

# NOTE(review): no external stylesheet is passed here although
# dash_bootstrap_components (dbc) layout components are used in the layout —
# confirm whether a Bootstrap theme is supplied some other way.
app = dash.Dash(__name__)

# Open items for the dashboard (bare string literal, has no effect at runtime):
'''
TODO:
- adjust font for markdown elements
- add elements see below
- dropdown menus
- remaining vis. descriptions

'''

# Markdown texts shown in the dashboard: one introduction plus one description
# per section.

intro_dash = '''
This is a Plotly dashboard built by master's students of Applied Statistics at the Georg August University 
Göttingen. We developed a topic model with the help of the BERTopic package by Martin Grootendorst.
Our objective is to give philosophers the option to browse through visualizations of the Topic Model,
which approximates topics for abstracts of philosophical papers published since 2000 in 
the top 11 philosophical journals (see data set for names of papers).
Ultimately, the goal is to give researchers some direction for future publications.

The labeling of Topics was performed manually in cooperation with our project partner, 
Dr. Tobias Störzinger. All other results are based on the model developed with BERTopic. 
To review model specifications, view this project on [GitHub](https://github.com/DominikMann/BertTopic-Philosophy-Topic-Modeling), and for a deeper understanding, 
read our paper on this project (URL).

Note that this model cannot perfectly resemble the true underlying topic structure, 
and the methods used produce outliers that result in a trash-category (Topic 'Stop words'/ -1).
These topics and papers belonging to them cannot be meaningfully interpreted. 
Furthermore, the underlying publications used to build the model are not evenly distributed 
over journals and years; therefore, interpretations of representativeness should be made carefully. 
Also note that all papers which did not provide an abstract were not considered.

In this dashboard you can browse through topics created by the model via
- a Data Table 
- a Wordcloud
- a Graph displaying Topics per Journal
- a Graph displaying Topics over Time 
- a dimensionality-reduced cluster/ distance map
- a Similarity Matrix
- a Similarity search for Topics similar to your own search term
'''
# TODO: the correct link to the paper still has to be inserted above (placeholder "(URL)")!
# TODO: the probabilities are still explained incorrectly in the table description!

text_table = '''
This data table displays the assignment of the generated topics to the abstracts. 
For the whole picture some key data (Author, Title, Year, Journal) about the publications is displayed as well. You can also view
the top 15 words representing one topic and the probabilities of each topic in each abstract.
Hover over the columns which are not fully shown (abstracts) to read the full text.
You can filter the data using the dropdown below the datatable. 
Selecting multiple options in multiple categories is possible.
You can also use the filtering option below the header. Here you must use operators to view results.
All possible operators are listed [here](https://dash.plotly.com/datatable/filtering).
'''

text_wc = '''
This is a Word Cloud generated for each topic. You can view each topic with its top 15 word representations.
The more representative a word is for a topic, the bigger it will be displayed in the Word Cloud.
Use the dropdown menu to select a topic for which you want the Word Cloud to be displayed.
'''

text_tpc = '''
This plot displays the number of abstracts per journal for each topic. 
You may select topics from the list of topics on the right to limit the number of topics shown. 
Clicking once on a topic will hide it. If you double-click on one or multiple topics, only these are shown.
Hovering over the plot, you can see how the words that define a topic differ over Journals. 
Use the magnifying-glass tool to select a specific area of the graph, e.g. just one Journal.
'''

text_tot = '''
This plot displays the number of documents per topic (frequency) over the years.
You may select topics from the list of topics on the right to limit the number of topics shown. 
Clicking once on a topic will hide it. If you double-click on one or multiple topics, only these are shown.
Hovering over the plot, you can see how the words that define a topic differ over time.
'''

text_vis_docs = '''
Each of the dots in the plot represents one document. Their similarity is shown by the distance towards each other. 
As the distance increases, they are less similar. Due to the underlying algorithm that reduces their complex dimensionality into these displayed 
two dimensions, their global structure might not be displayed perfectly.
The clusters colored in distinct colors are representing the topics. 
You may click on the topics from the list of topics on the right to only show selected topics. Clicking once on a topic will hide it. 
If you double-click on one or multiple topics, only these are shown.
You can also zoom in and out of the map for a broader overview or for viewing a specific area.
'''

# "cosine similarity", not "cosine distance": the text states that a higher
# score means more similar topics.
text_vis_heatmap = '''
This matrix shows the similarity (measured as cosine similarity) of each topic to all other topics. The higher the similarity score, the more
similar the topics are.
By default a reduced number of topics is shown. Zoom into the plot using the toolbar to show a higher resolution of topics.
'''

text_simsearch = '''
This is an interactive search, which enables you to find topics that are similar to a word chosen by you. 
Select a number of similar topics that should be displayed. Type in a search term and press enter to find topics that are similar.
You are given the topic, its top 15 words and a similarity score. You can use the score to compare which topic is most similar to your search term.
The higher the score the more closely related is the search term to the topic.
'''

# Define function for generating word cloud
def plot_wordcloud(topic):
    """Build the word cloud image for one topic.

    The (word, value) pairs returned by ``model_bert.get_topic(topic)`` are
    used as word frequencies for a white-background WordCloud.

    Parameters
    ----------
    topic
        Topic identifier passed to ``model_bert.get_topic``.

    Returns
    -------
    The image object produced by ``WordCloud.to_image()``.
    """
    frequencies = dict(model_bert.get_topic(topic))
    cloud = WordCloud(background_color="white", max_words=1000)
    cloud.generate_from_frequencies(frequencies)
    return cloud.to_image()

# Define function for similarity search
def get_similar(word, top_n = 3, model_bert=model_bert):
    """Find the topics most similar to a search term.

    Parameters
    ----------
    word : str
        Search term handed to ``model_bert.find_topics``.
    top_n : int, default 3
        Number of topics requested from ``find_topics``.
    model_bert
        Topic model to query; defaults to the model loaded in this notebook.

    Returns
    -------
    pandas.DataFrame
        One row per topic returned by ``find_topics`` with the columns
        'Topic Number', 'Topic Name', 'Topic Words' (comma-separated) and
        'Similarity Score'. The index holds the topic numbers and is named
        'Topic Number'. The frame is empty when no topic is returned.
    """
    similar_topics, similarity = model_bert.find_topics(word, top_n=top_n)
    columns = ['Topic Number', 'Topic Name', 'Topic Words', 'Similarity Score']

    # Collect all rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row is quadratic.
    rows = [
        [
            top_num,
            labels[top_num + 1],  # labels are offset by one (labels[0] is topic -1)
            ", ".join(item[0] for item in model_bert.get_topic(top_num)),
            sim_score,
        ]
        for top_num, sim_score in zip(similar_topics, similarity)
    ]

    return pd.DataFrame(
        rows,
        columns=columns,
        index=pd.Index(list(similar_topics), name='Topic Number'),
    )


# ---------------------------------------------------------------------------
# Shared inline styles. Every helper returns a fresh dict holding the same
# keys and values that were previously repeated literally for each component.
# ---------------------------------------------------------------------------

def _font_style(**extra):
    """Return the dashboard's base font style, extended by ``extra`` CSS entries."""
    return {'font-family': 'sans-serif', 'font-size': 12, **extra}


def _heading_style(font_size):
    """Return the style of a centered heading with the given font size."""
    return {'textAlign': 'center', 'font-family': 'sans-serif', 'font-size': font_size}


def _inline_block_style(width, **extra):
    """Return the left-aligned inline-block style used for texts and dropdowns."""
    return {
        'display': 'inline-block',
        'width': width,
        'padding': '10px 20px',
        'text-align': 'left',
        'vertical-align': 'top',
        **_font_style(**extra),
    }


def _description(text, margin_top=None):
    """Return a 1200px wide Markdown block for ``text``.

    When ``margin_top`` is given, the block gets that top margin and a bottom
    margin of 100; with ``None`` no margins are set at all.
    """
    margins = {} if margin_top is None else {'marginBottom': 100, 'marginTop': margin_top}
    return dcc.Markdown(children=text, style=_inline_block_style('1200px', **margins))


def _graph_with_description(graph_id, figure, text):
    """Return the children of a figure section: the graph and its description."""
    return [
        dcc.Graph(id=graph_id, figure=figure, style={'display': 'inline-block'}),
        _description(text, margin_top=-25),
    ]


def _filter_dropdown(values, placeholder):
    """Return a multi-select dropdown offering the sorted ``values``."""
    return dcc.Dropdown(sorted(values), multi=True, placeholder=placeholder,
                        style=_font_style())


# ---------------------------------------------------------------------------
# Components that the data-table callback refers to by object (they carry no
# explicit id), so they are created up front under their module-level names.
# ---------------------------------------------------------------------------

row_drop = dcc.Dropdown(value=10, clearable=False, options=[10, 25, 50, 100, 200],
                        style=_font_style(width='35%'))

docs_info_table = dash_table.DataTable(
    columns=[{'name': i, 'id': i, 'deletable': True} for i in docs_info.columns],
    data=docs_info.to_dict('records'),
    filter_action='native',
    page_size=10,
    style_data={
        'width': '150px', 'minWidth': '150px', 'maxWidth': '150px',
        'overflow': 'hidden',
        'textOverflow': 'ellipsis',
    },
    style_header={'font-family': 'sans-serif'},
    style_cell={'font-family': 'sans-serif', 'font-size': 12},
    style_as_list_view=True,
)

year_drop = _filter_dropdown(docs_info.Year.unique(), "select Years")
journal_drop = _filter_dropdown(docs_info.Journal.unique(), "select Journals")
topic_drop = _filter_dropdown(docs_info.Topic.unique(), "select Topic Numbers")
name_drop = _filter_dropdown(docs_info.Topicname.unique(), "select Topics by Name")


# ---------------------------------------------------------------------------
# Page layout
# ---------------------------------------------------------------------------

app.layout = html.Div(children=[

    # Welcome
    html.H1('Philosophical Topic Modelling – Visualizations', style=_heading_style(30)),
    _description(intro_dash),

    # DataTable
    dbc.Container([
        html.H2('Data Table', style=_heading_style(25)),
        dbc.Label('Show number of rows', style=_font_style()),
        row_drop,
        docs_info_table,

        dbc.Row([
            dbc.Col([
                html.Label('Filter the data', style=_font_style()),
                year_drop,
            ], width=3),
            dbc.Col([journal_drop], width=3),
            dbc.Col([topic_drop], width=3),
            dbc.Col([name_drop], width=3),
        ]),

        _description(text_table, margin_top=0),
    ]),

    # Word cloud
    html.Div([
        html.H2('Word Cloud', style=_heading_style(25)),
        dbc.Label('Select a topic to generate its Word Cloud', style=_font_style()),
        html.Br(),
        dcc.Dropdown(id='topicwc_drop',
                     # labels are offset by one: labels[0] belongs to topic -1
                     options=[{'label': f"Topic '{labels[i+1]}'", 'value': i} for i in topics.keys()],
                     value=0,
                     style=_inline_block_style('35%')),
        html.Br(),

        html.Img(id="image_wc", style={'display': 'inline-block',
                                       'width': '500px',
                                       'padding': '10px 20px',
                                       'text-align': 'center',
                                       'vertical-align': 'top'}),

        _description(text_wc, margin_top=0),
    ]),

    # Topics per Journal
    html.Div(children=_graph_with_description("tpc", tpc, text_tpc)),

    # Topics over Time
    dbc.Container(_graph_with_description("tot", tot, text_tot)),

    # Cluster Map
    html.Div(children=_graph_with_description("vis_docs", vis_docs, text_vis_docs)),

    # Similarity Matrix
    html.Div(children=_graph_with_description("vis_heatmap", vis_heatmap, text_vis_heatmap)),

    # Similarity search
    html.Div(children=[
        html.H2('Similarity Search', style=_heading_style(25)),
        dbc.Label('Select the number of similar topics shown', style=_font_style()),
        html.Br(),
        dcc.Dropdown(id='top_n_drop',
                     options=[{'label': f'{i}', 'value': i} for i in range(1, 11)],
                     value=3,
                     style=_inline_block_style('35%')),

        html.Br(),
        dbc.Label('Enter a word you want to search for:', style=_font_style()),
        html.Div(["Input: ",
                  dcc.Input(id='word_input', type='text', debounce=True,
                            placeholder='word to search for', spellCheck=True)
                  ]),
        html.Br(),
        # filled by the similarity-search callback
        html.Div(id='word_output', style=_font_style(marginTop=-20)),
        _description(text_simsearch, margin_top=-15),
    ]),

])

@app.callback(
    Output(docs_info_table, 'data'),
    Output(docs_info_table, 'page_size'),
    Input(year_drop, 'value'),
    Input(journal_drop, 'value'),
    Input(topic_drop, 'value'),
    Input(name_drop, 'value'),
    Input(row_drop, 'value')
)
def update_dropdown_options(year_v, journal_v, topic_v, name_v, row_v):
    """Filter the data table by the dropdown selections and set its page size.

    Each of ``year_v``, ``journal_v``, ``topic_v`` and ``name_v`` is the
    current selection of one filter dropdown; an empty or missing selection
    leaves that column unfiltered. ``row_v`` is passed through as the page size.

    Returns the filtered rows as a list of records and the page size.
    """
    selections = (
        ('Year', year_v),
        ('Journal', journal_v),
        ('Topic', topic_v),
        ('Topicname', name_v),
    )

    filtered = docs_info.copy()
    for column, chosen in selections:
        # only narrow down when something is selected for this column
        if chosen:
            filtered = filtered[filtered[column].isin(chosen)]

    return filtered.to_dict('records'), row_v

@app.callback(
    Output('image_wc', 'src'),
    Input('image_wc', 'id'),
    Input('topicwc_drop', 'value'))
def make_image(b, topicwc_drop):
    """Return the word cloud of the selected topic as a base64 PNG data URI.

    Parameters
    ----------
    b
        The id of the image component; not used inside the function.
    topicwc_drop
        Topic selected in the word cloud dropdown, or None when the
        dropdown has been cleared.

    Returns
    -------
    str
        A ``data:image/png;base64,...`` URI for the image's ``src``, or
        ``dash.no_update`` when no topic is selected.
    """
    # The dropdown can be cleared, which sends None. Keep the current image
    # in that case instead of failing inside plot_wordcloud.
    if topicwc_drop is None:
        return dash.no_update

    img = BytesIO()
    plot_wordcloud(topicwc_drop).save(img, format='PNG')

    return 'data:image/png;base64,{}'.format(base64.b64encode(img.getvalue()).decode())

@app.callback(
    Output('word_output', 'children'),
    Input('word_input', 'value'),
    Input ('top_n_drop', 'value')
)
def func_sim(word_input, top_n_drop):
    """Render the similarity-search result table for the entered word.

    Parameters
    ----------
    word_input
        Text entered in the search field; may be None or empty.
    top_n_drop
        Number of topics selected in the dropdown, or None when the
        dropdown has been cleared.

    Returns
    -------
    A DataTable with the result of ``get_similar``; None when no word has
    been entered; ``dash.no_update`` when the number of topics is cleared.
    """
    if not word_input:
        return None

    # A cleared dropdown sends None, which must not be forwarded as top_n;
    # leave the currently shown result untouched instead.
    if top_n_drop is None:
        return dash.no_update

    df_sim = get_similar(word_input, top_n_drop)
    data_sim = df_sim.to_dict('records')
    columns_sim = [{"name": i, "id": i} for i in df_sim.columns]
    return dash_table.DataTable(
        data=data_sim,
        columns=columns_sim,
        style_data={
            'width': '150px', 'minWidth': '150px', 'maxWidth': '700px',
            'overflow': 'hidden',
            'textOverflow': 'ellipsis',
            'border': '0.5px solid'},
        style_header={
            'font-family': 'sans-serif',
            'border': '0.5px solid'},
        style_cell={'font-family': 'sans-serif',
                    'font-size': 12,
                    'textAlign': 'left'},
        style_as_list_view=True,
    )


# Start the Dash server on localhost:8041 when this code runs as the main module;
# debug mode is switched on and use_reloader=False is passed to run_server.
if __name__ == "__main__":
    app.run_server(host="localhost",port=8041, debug=True, use_reloader=False)

Dash is running on http://localhost:8041/

Dash is running on http://localhost:8041/

Dash is running on http://localhost:8041/

 * Serving Flask app '__main__'
 * Debug mode: on
