In [7]:
# Standard library
import re

# Data handling
import numpy as np
import pandas as pd

# LDA, tSNE
from gensim import corpora, models
from gensim.models.ldamodel import LdaModel
from sklearn.manifold import TSNE

# NLTK
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer

# Visualization
import matplotlib
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import mplcyberpunk
import seaborn as sns

# Bokeh
from bokeh.io import output_notebook
from bokeh.layouts import column
from bokeh.models import HoverTool, CustomJS, ColumnDataSource, Slider
from bokeh.palettes import all_palettes
from bokeh.plotting import figure, show

# Render matplotlib figures inline in the notebook's cell output.
%matplotlib inline
# Direct Bokeh output to the notebook so that show() renders plots in the cell output.
output_notebook()

# Load Data

In [2]:
# Load the papers table from a CSV file; the path is relative to the notebook's
# working directory. Later cells read its `paper_text`, `title` and `year` columns.
df = pd.read_csv("archive/papers.csv")

In [3]:
%%time
# Remove numerals:
df['paper_text_tokens'] = df.paper_text.map(lambda x: re.sub(r'\d+', '', x))
# Lower case:
df['paper_text_tokens'] = df.paper_text_tokens.map(lambda x: x.lower())
print(df['paper_text_tokens'][0][:500])



self-organization of associative database
and its applications
hisashi suzuki and suguru arimoto
osaka university, toyonaka, osaka , japan
abstract
an efficient method of self-organizing associative databases is proposed together with
applications to robot eyesight systems. the proposed databases can associate any input
with some output. in the first half part of discussion, an algorithm of self-organization is
proposed. from an aspect of hardware, it produces a new style of neural network. in
CPU times: user 4.63 s, sys: 92.4 ms, total: 4.72 s
Wall time: 4.72 s


In [4]:
%%time
# Tokenize
df['paper_text_tokens'] = df.paper_text_tokens.map(lambda x: RegexpTokenizer(r'\w+').tokenize(x))
print(df['paper_text_tokens'][0][:25])

['self', 'organization', 'of', 'associative', 'database', 'and', 'its', 'applications', 'hisashi', 'suzuki', 'and', 'suguru', 'arimoto', 'osaka', 'university', 'toyonaka', 'osaka', 'japan', 'abstract', 'an', 'efficient', 'method', 'of', 'self', 'organizing']
CPU times: user 7.56 s, sys: 400 ms, total: 7.96 s
Wall time: 7.96 s


In [5]:
%%time
# Stemming
snowball = SnowballStemmer('english')
df['paper_text_token'] = df.paper_text_tokens.map(lambda x: [snowball.stem(token) for token in x])
print(df['paper_text_tokens'][0][:25])

['self', 'organization', 'of', 'associative', 'database', 'and', 'its', 'applications', 'hisashi', 'suzuki', 'and', 'suguru', 'arimoto', 'osaka', 'university', 'toyonaka', 'osaka', 'japan', 'abstract', 'an', 'efficient', 'method', 'of', 'self', 'organizing']
CPU times: user 4min 23s, sys: 772 ms, total: 4min 24s
Wall time: 4min 24s


In [6]:
# import nltk
# nltk.download('stopwords')

In [8]:
%%time
# Remocing words like and, the, of
stop_en = stopwords.words('english')
df['paper_text_tokens'] = df.paper_text_tokens.map(lambda x:[t for t in x if t not in stop_en])
print(df['paper_text_tokens'][0][:25])

['self', 'organization', 'associative', 'database', 'applications', 'hisashi', 'suzuki', 'suguru', 'arimoto', 'osaka', 'university', 'toyonaka', 'osaka', 'japan', 'abstract', 'efficient', 'method', 'self', 'organizing', 'associative', 'databases', 'proposed', 'together', 'applications', 'robot']
CPU times: user 47.3 s, sys: 75.8 ms, total: 47.4 s
Wall time: 47.4 s


In [9]:
%%time 
# Removing words have less than 2 characters
df['paper_text_tokens'] = df.paper_text_tokens.map(lambda x: [t for t in x if len(t) > 1])
print(df['paper_text_tokens'][0][:25])

['self', 'organization', 'associative', 'database', 'applications', 'hisashi', 'suzuki', 'suguru', 'arimoto', 'osaka', 'university', 'toyonaka', 'osaka', 'japan', 'abstract', 'efficient', 'method', 'self', 'organizing', 'associative', 'databases', 'proposed', 'together', 'applications', 'robot']
CPU times: user 2.03 s, sys: 32.5 ms, total: 2.06 s
Wall time: 2.06 s


# LDA
Next, we use Latent Dirichlet Allocation (LDA) to extract the topic structure from the corpus of texts.

In [10]:
# Fit an LDA topic model on the tokenized papers.
# `corpora` and `LdaModel` come from the import cell at the top of the notebook.
SEED = 2021
NUM_TOPICS = 8

# Seed NumPy's global RNG as before, and additionally hand the seed to the model
# itself so the fit does not depend on what else has drawn from the global RNG.
np.random.seed(SEED)
texts = df['paper_text_tokens'].values
# Map every distinct token to an integer id, then encode each paper as a bag of words.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
ldamodel = LdaModel(corpus, id2word=dictionary,
                    num_topics=NUM_TOPICS, passes=5, minimum_probability=0,
                    random_state=SEED)

In [11]:
# Display the fitted topics as (topic id, weighted top-words string) pairs.
ldamodel.print_topics()

[(0,
  '0.014*"network" + 0.011*"neural" + 0.009*"input" + 0.007*"networks" + 0.006*"neurons" + 0.006*"learning" + 0.006*"time" + 0.006*"output" + 0.005*"figure" + 0.005*"units"'),
 (1,
  '0.010*"matrix" + 0.008*"algorithm" + 0.006*"data" + 0.006*"problem" + 0.005*"set" + 0.005*"theorem" + 0.004*"kernel" + 0.004*"let" + 0.004*"function" + 0.004*"log"'),
 (2,
  '0.013*"image" + 0.008*"images" + 0.008*"learning" + 0.007*"training" + 0.007*"model" + 0.006*"using" + 0.006*"data" + 0.005*"deep" + 0.005*"object" + 0.005*"features"'),
 (3,
  '0.013*"model" + 0.008*"models" + 0.007*"network" + 0.006*"networks" + 0.006*"data" + 0.005*"set" + 0.005*"training" + 0.005*"neural" + 0.005*"learning" + 0.005*"using"'),
 (4,
  '0.015*"learning" + 0.009*"data" + 0.008*"set" + 0.008*"algorithm" + 0.007*"training" + 0.007*"loss" + 0.005*"function" + 0.005*"classification" + 0.005*"error" + 0.005*"class"'),
 (5,
  '0.019*"model" + 0.012*"data" + 0.008*"time" + 0.006*"models" + 0.005*"figure" + 0.005*"using

In [12]:
# Refactoring the results of LDA into a numpy matrix (number_of_papers x number_of_topics).
# Each probability is placed in the column of its own topic id rather than by position,
# so the matrix stays rectangular and correctly aligned even if a document's result
# does not list every topic.
topic_rows = []
for bow in corpus:
    row = [0.0] * ldamodel.num_topics
    for topic_id, prob in ldamodel[bow]:
        row[topic_id] = prob
    topic_rows.append(row)
hm = np.array(topic_rows)
# Reducing dimensionality to 2-D using the t-SNE algorithm
tsne = TSNE(random_state=2021, perplexity=30, early_exaggeration=120)
coords = tsne.fit_transform(hm)
embedding = pd.DataFrame(coords, columns=['x', 'y'])
# Colour each paper by its most probable topic.
embedding['hue'] = hm.argmax(axis=1)

# Plotting
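We use Bokeh to draw an interactive scatter plot of the t-SNE embedding. Hover the mouse over a dot to see the title and year of the respective paper, and use the slider to highlight only the papers published up to a given year: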


In [13]:
# Interactive scatter plot of the t-SNE embedding: one dot per paper, coloured by
# its dominant topic, with a hover tooltip and a year slider.

# Year range of the data; cast to plain ints for the slider.
earliest_year = int(df.year.min())
latest_year = int(df.year.max())

# Per-point data shared by the glyph, the hover tool and the slider callback.
source = ColumnDataSource(
        data=dict(
            x = embedding.x,
            y = embedding.y,
            colors = [all_palettes['Set1'][8][i] for i in embedding.hue],
            title = df.title,
            year = df.year,
            alpha = [0.9] * embedding.shape[0],
            size = [7] * embedding.shape[0]
        )
    )
# The tooltip is attached only to the renderer named "df" (the circles below).
hover_tsne = HoverTool(names=["df"], tooltips="""
    <div style="margin: 10">
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Title:</span>
            <span style="font-size: 12px">@title</span>
            <span style="font-size: 12px; font-weight: bold;">Year:</span>
            <span style="font-size: 12px">@year</span>
        </div>
    </div>
    """)
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(plot_width=700, plot_height=700, tools=tools_tsne, title='Papers')
plot_tsne.circle('x', 'y', size='size', fill_color='colors', 
                 alpha='alpha', line_alpha=0, line_width=0.01, source=source, name="df")

# Browser-side callback: papers published up to the slider's year stay prominent,
# later ones are faded and shrunk. All variables are declared (const/let) so the
# code creates no implicit globals and also runs in strict mode; the columns that
# were read but never used have been dropped.
callback = CustomJS(args=dict(source=source), code=
    """
    const data = source.data;
    const cutoff = cb_obj.value;
    const year = data['year'];
    const alpha = data['alpha'];
    const size = data['size'];
    for (let i = 0; i < year.length; i++) {
        if (year[i] <= cutoff) {
            alpha[i] = 0.9;
            size[i] = 7;
        } else {
            alpha[i] = 0.05;
            size[i] = 4;
        }
    }
    source.change.emit();
    """)

# Start the slider at the latest year so its value matches the initial state of the
# plot (every paper highlighted) and always lies inside the slider's range.
slider = Slider(start=earliest_year, end=latest_year, value=latest_year, step=1, title="Before year")
slider.js_on_change('value', callback)

layout = column(slider, plot_tsne)
show(layout)