# Topic extraction from the GEPRiS dataset and creation of an user-centric visualisation
Author: Tim Korjakow        
Summer term 2018      
Freie Universität Berlin     
Fachgebiet Human-Centered Computing

![Process graph](nlpflowchart.svg)

In [1]:
#%load_ext Cython
#initialize profiler
#%load_ext line_profiler

import time
import json
import spacy
import regex as re
from langdetect import detect
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF as NonnegativeMatrixFactorization
from sklearn.cluster import KMeans
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.manifold import TSNE
from MulticoreTSNE import MulticoreTSNE
import numpy as np
import sklearn

from lapjv import lapjv
from scipy.spatial.distance import cdist
from scipy.interpolate import griddata
from numpy.linalg import norm
from sklearn.preprocessing import normalize

from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider
import ipywidgets as widgets
from IPython.display import display, Javascript, HTML

from bokeh.io import output_notebook, show
from bokeh.models import ColumnDataSource, OpenURL, TapTool
from bokeh.plotting import figure, ColumnDataSource
from bokeh.palettes import d3, brewer
from bokeh.layouts import row, column
output_notebook()

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True) # for offline mode use

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


## Loading and Cleaning
The first step in every NLP project which works with texts is always the preparation of the input data. In this example the Project dump from GEPRIS is loaded and the project descriptions are extracted. After that the texts get cleaned by removing all non-alphabetic chars and all stopwords from the texts. English texts are getting filtered in oder to make the analysis simpler and more comparable.

In [2]:
def loadProjects():
    with open('../../../assets/data/project_output/projects.json', 'r') as datafile:
        return json.load(datafile)

def loadGermanStopwords():
    with open('../../../assets/data/nlp/stopwords_de.json', 'r') as datafile:
        return json.load(datafile)

def loadEnglishStopwords():
    with open('../../../assets/data/nlp/stopwords_eng.json', 'r') as datafile:
        return json.load(datafile)

def cleanProjectTexts():
    cleanedProjectTexts = {}
    stopwordsDE = set(loadGermanStopwords())
    for project in loadProjects():
        if project['abstract'] and project['abstract'] != 'Keine Zusammenfassung vorhanden' and detect(project['abstract']) == 'de':
            if len(cleanedProjectTexts) >= 100:
                break
            letters_only = re.sub('[^\p{L} ]', ' ', project['abstract'])
            words = letters_only.lower().split()
            usefulWords = [x for x in words if not (x in stopwordsDE or len(x) <= 2 )]
            if usefulWords:
                cleanedProjectTexts[project['id']] = ' '.join(usefulWords)
    return cleanedProjectTexts

## TF-IDF computation
*Summary*:
This technique vectorizes a corpus, e.g. a collection of documents, by counting all appearences of words in the corpus and computing the tf-idf measure for each document, word pair.

In [3]:
nlp = spacy.load('de')
def lemmatize(text):
    return [token.lemma_ for token in nlp(text)]

def TfIdf(dict):
    start = time.time()
    tfidf_vectorizer = TfidfVectorizer(tokenizer=lemmatize)
    tfs = tfidf_vectorizer.fit_transform(list(dict.values()))
    print('TFIDF execution time: ', time.time() - start)
    return (tfidf_vectorizer, tfs)


numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88


numpy.ufunc size changed, may indicate binary incompatibility. Expected 192, got 176



In [4]:
tfidf_vectorizer, tfs = TfIdf(cleanProjectTexts())

TFIDF execution time:  2.4716567993164062


# Topic extraction

## Latent Semantic Analysis
*Summary*:
The LSA transforms an corpus from its word space given by the tf-idf matrice into its semantic space. In this semantic space the dimensions denote topics in the corpus and every document vector is a linear combination of all the implicitly extracted topics.

In [5]:
def LSA(tfs,num_topics=40):
    start = time.time()
    lsa = TruncatedSVD(n_components=num_topics, random_state=0).fit(tfs)
    print('LSA execution time: ', time.time() - start)
    
    #tfidf_feature_names = [str(token) for token in tfidf_vectorizer.get_feature_names()]
    #print_top_words(lsa, tfidf_feature_names, 10)
    return lsa.transform(tfs), lsa

## Non-negative matrix factorisation
Summary: **Coming soon**

In [6]:
def NMF(tfs,num_topics=40):
    start = time.time()
    nmf = NonnegativeMatrixFactorization(n_components=num_topics, init='random', random_state=0)
    nmf.fit(tfs)    
    print('NMF execution time: ', time.time() - start)
    
    #
    #print_top_words(nmf, tfidf_feature_names, 10)
    return nmf.transform(tfs), nmf

### Get top words for each dimension
In order to get the words which are most important for each dimension (which correspond to topics), the standard basis in the topic space is converted back into the word space. These are exactly the eigenvectors of data. Now the top n biggest entries and their corresponding words form the top words.

In [7]:
def get_top_words_dim(model, feature_names, n_top_words):
    dim_topics = {}
    for topic_idx, topic in enumerate(model.components_):
        dim_topics[topic_idx] = [feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]
    return dim_topics

# Clustering

## K-Means
Summary: Given a clustering the LDA can be used to find a projection into a lower dimensional space which maximizes inter-class variance and minimizes intra-class variance. This leads to neater cluster, but is grounded in the hypotheses that the clusters have some real semantic meaning. Otherwise it may enforce preexisting biases.

In [8]:
def clusterNumberHeuristic(tfs):
    return (tfs.shape[0]*tfs.shape[1])//tfs.count_nonzero()

def cluster(tfs_reduced, num_topics=10):
    start = time.time()
    km = KMeans(n_clusters=num_topics).fit(tfs_reduced)
    print('Clustering execution time: ', time.time() - start)
    return km

### Get top words for each cluster
The process is similar to the one for getting the top words for each dimension. But in this case the cluster centers from the clustering step are transformed back into the word space and analysed. This is based on the assumption that the cluster center represents the set of all documents in the corrsponding cluster.

In [9]:
def get_top_words_point(model, point, feature_names, n_top_words):
    point = np.array(point).reshape(1,-1)
    point.reshape(1, -1)
    word_space_point = model.inverse_transform(point)
    return [feature_names[j] for j in word_space_point.argsort()[0][:-n_top_words - 1:-1]]

# Embedding into 2D

## Linear Discriminant Analysis
*Summary*:
Given a clustering the LDA can be used to find a projection into a lower dimensional space which maximizes inter-class variance and minimizes intra-class variance. This leads to neater cluster, but is grounded in the hypotheses that the clusters have some real semantic meaning. Otherwise it may enforce preexisting biases.

In [10]:
def dimReductionLDA(tfs_reduced, clusters, targetDim=2):
    start = time.time()
    lda = LinearDiscriminantAnalysis(n_components=targetDim)
    tfs_2d = lda.fit(tfs_reduced, clusters.labels_).transform(tfs_reduced)
    print('LDA execution time: ', time.time() - start)
    print(lda.coef_.shape, lda.xbar_.shape)
    return tfs_2d, lda

## tSNE
*Summary*:


*In-depth explanation*:

In [11]:
def dimReductiontSNE(tfs_reduced, perplexity=30, learning_rate=100, targetDim=2):
    start = time.time()
    tfs_2d = (MulticoreTSNE(n_jobs=8, n_components=targetDim, perplexity=perplexity, learning_rate=learning_rate) if targetDim ==2 else TSNE(n_components=targetDim, perplexity=perplexity, learning_rate=learning_rate)).fit_transform(tfs_reduced)
    print('tSNE execution time: ', time.time() - start)
    return tfs_2d

# Analysis

### Statistics on words in the corpus

In [12]:
words = dict(zip(tfidf_vectorizer.get_feature_names(), np.sum(tfs, axis=1)))
print(words)
#print()
p = figure(x_range=[k for k,v in words.items()], plot_width=800, plot_height=800, title=None)
p.vbar(x=[k for k,v in words.items()], top=[v for k,v in words.items()], width=10)
show(p)

{'abadeh': matrix([[11.2039067]]), 'abb': matrix([[12.23526522]]), 'abbilden': matrix([[9.3049031]]), 'abbilder': matrix([[9.70365441]]), 'abbildungen': matrix([[9.52331889]]), 'abcd': matrix([[10.42773443]]), 'abfolge': matrix([[10.09141109]]), 'abfolgen': matrix([[9.73277856]]), 'abgebildet': matrix([[10.99777417]]), 'abgeflacht': matrix([[9.12650326]]), 'abgelegen': matrix([[9.01793652]]), 'abgeleitet': matrix([[10.55527798]]), 'abgereicherten': matrix([[7.58641019]]), 'abgeschlossen': matrix([[10.01247332]]), 'abgewandelt': matrix([[8.06990064]]), 'abgrenzen': matrix([[8.24290846]]), 'abholzungen': matrix([[11.44032269]]), 'abhängig': matrix([[10.14066222]]), 'abhängigkeit': matrix([[10.02213842]]), 'abhängigkeiten': matrix([[9.1662304]]), 'abiotischen': matrix([[9.5436784]]), 'abiotischer': matrix([[10.47817207]]), 'abkühlgeschichte': matrix([[11.06447042]]), 'ablagerung': matrix([[7.90900962]]), 'ablagerungen': matrix([[6.58643354]]), 'ablagerungsgeschichte': matrix([[8.18676512]

# Linearize results into a grid

In [13]:
def mapToSpaceSampling(points):
    # just take the first n² < #points Points
    points = points[: int(np.sqrt(len(points)))**2]
    grid = np.dstack(np.meshgrid(np.linspace(np.min(points[:, 0]), np.max(points[:, 0]), int(np.sqrt(len(points)))),
                       np.linspace(np.min(points[:, 1]), np.max(points[:, 1]), int(np.sqrt(len(points)))))).reshape(-1, 2)
    cost = cdist(points, grid, "sqeuclidean").astype(np.float64)
    print(cost.shape)
    cost *= 100000 / cost.max()
    row_ind_lapjv, col_ind_lapjv, _ = lapjv(cost, verbose=True, force_doubles=True)
    return grid[row_ind_lapjv]

In [14]:
def computeClusterTopography(points, values, width, height, interpolation='linear'):
    # lay grid over the points so that all points are covered
    grid_x, grid_y = np.mgrid[np.min(points[:,0]):np.max(points[:,0]):width*1j, np.min(points[:,1]):np.max(points[:,1]):height*1j]
    return griddata(np.array(points), np.array(values[:len(points)]), (grid_x, grid_y), method=interpolation, fill_value=np.min(values[:len(points)]))

# Visualization

In [15]:
def draw_scatter(data, width=600, height=600, viz='scatter'):
    display(Javascript("""
        (function(element){
            require(['scatter'], function(scatter) {
                scatter(element.get(0), %s, %d, %d, %s);
            });
        })(element);
    """ % (json.dumps(data), width, height, json.dumps(viz))))

In [16]:
def save(payload):
    name = "c" + str(payload['params']['num_clusters']) +"-t" + str(payload['params']['num_topics']) + "_" + str(payload['params']['embedding'])
    if payload['params']['embedding'] == 'tSNE':
        name += "_p" + str(payload['params']['perplexity']) + "-lr" + str(payload['params']['learning_rate'])
    with open('./dumps/' + name + '.json', 'w') as dumpfile:
        json.dump(payload, dumpfile, sort_keys=True, indent=4, ensure_ascii=False)

In [20]:
def visualize(targetDim=2,tfs=None,dimreduction='LSA', clustering='KMEANS', embedding='LDA', num_topics=20, num_clusters=3, perplexity=5, learning_rate=200, error='cluster_error', interpolation='linear', viz='scatter'):
    
    # viz dimensions
    width = 600
    height = 600
    
    
    if dimreduction == 'LSA':
        tfs_reduced, model = LSA(tfs, num_topics=num_topics)
    elif dimreduction == 'NMF':
        tfs_reduced, model = NMF(tfs, num_topics=num_topics)
    else:
        return 'No dimensionality reduction technique was selected!'
    
    if clustering == 'KMEANS':
        clusters = cluster(tfs_reduced, num_topics=num_clusters)
    else:
        return 'No clustering technique was selected!'
    
    if embedding == 'LDA':
        tfs_embedded, lda = dimReductionLDA(tfs_reduced, clusters=clusters, targetDim=targetDim)
    elif embedding == 'tSNE':
        tfs_embedded = dimReductiontSNE(tfs_reduced, perplexity=perplexity, learning_rate=learning_rate, targetDim=targetDim)
    else:
        return 'No dimensionality reduction technique was selected!'
    
    # compute linearization
    tfs_mapped = mapToSpaceSampling(tfs_embedded) if targetDim == 2 else [0]*len(tfs_embedded)
    
    # compute top words (TODO: could be flawed)
    tfidf_feature_names = [str(token) for token in tfidf_vectorizer.get_feature_names()]
    cluster_words = [get_top_words_point(model, center, tfidf_feature_names, 5) for center in clusters.cluster_centers_]
    [print(i, words) for i,words in enumerate(cluster_words)]
    
    #compute cluster topography
    similarity_to_cluster_centers = [norm(x-clusters.cluster_centers_[clusters.labels_[i]]) for i,x in enumerate(tfs_reduced)]
    #similarity_to_cluster_centers = similarity_to_cluster_centers / norm(similarity_to_cluster_centers)
    reduction_error = np.max(lda.decision_function(tfs_reduced), axis=1) if (embedding == 'LDA') else [0]* len(tfs_embedded)
    #reduction_error = reduction_error / norm(reduction_error)
    mixed = reduction_error + similarity_to_cluster_centers
    interpolated_topography = computeClusterTopography(tfs_embedded if viz == 'scatter' else tfs_mapped, similarity_to_cluster_centers if error=='cluster_error' else reduction_error if error=='reduction_error' else mixed, width, height, interpolation)
    
    colours = d3['Category10'][num_clusters]
    if targetDim == 2:
        # configure bokeh plot                   
        source = ColumnDataSource(data=dict(
            x=tfs_embedded[:, 0],
            y=tfs_embedded[:, 1],
            x_mapped=tfs_mapped[:, 0],
            y_mapped=tfs_mapped[:, 1],
            ids=list(cleanProjectTexts().keys()),
            #titles=[next((project['title'] for project in loadProjects() if project['id'] == key), ['None']) for key in cleanProjectTexts().keys()],
            colours=np.array(colours)[clusters.labels_],
            labels=clusters.labels_
        ))

        TOOLTIPS = [
            ("index", "$index"),
            ("id", "@ids"),
            #("title", "@titles"),
        ]
        # scatterplot
        scatter = figure(plot_width=800, plot_height=800, title=None, toolbar_location="below", tooltips=TOOLTIPS, tools='tap,pan,wheel_zoom,save')
        scatter.scatter('x', 'y', size=10,color='colours', legend='labels', source=source)
        url = 'http://gepris.dfg.de/gepris/projekt/@ids'
        taptool = scatter.select(type=TapTool)
        taptool.callback = OpenURL(url=url)
        
        # mapped scatterplot
        mapped_scatter = figure(plot_width=800, plot_height=800, title=None, toolbar_location="below", tooltips=TOOLTIPS, tools='tap,pan,wheel_zoom')
        mapped_scatter.scatter('x_mapped', 'y_mapped', size=50,color='colours', legend='labels', source=source)
        url = 'http://gepris.dfg.de/gepris/projekt/@ids'
        taptool = mapped_scatter.select(type=TapTool)
        taptool.callback = OpenURL(url=url)
        show(row(scatter, mapped_scatter))
    else:

        source = go.Scatter3d(
            x=tfs_embedded[:, 0],
            y=tfs_embedded[:, 1],
            z=tfs_embedded[:, 2],
            mode='markers',
            marker=dict(
                size=2,
                color=clusters.labels_,                # set color to an array/list of desired values
                colorscale='Viridis',   # choose a colorscale
                opacity=0.8
            )
        )

        data = [source]
        layout = go.Layout(
            margin=dict(
                l=0,
                r=0,
                b=0,
                t=0
            )
        )
        fig = go.Figure(data=data, layout=layout)
        iplot(fig, filename='3d-scatter-colorscale')
    
    payload = {
        'params': {
            'targetDim': targetDim,
            'dimreduction': dimreduction,
            'clustering': clustering,
            'embedding': embedding,
            'num_topics': num_topics,
            'num_clusters': num_clusters,
            'perplexity': perplexity,
            'learning_rate': learning_rate
        },
        'project_data': [{'id':pid,'reducedpoint': reducedpoint, 'embpoint':embpoint, 'mappoint':mappoint, 'cluster':cluster, 'error':error, 'title': title, 'words': words} for pid, reducedpoint, embpoint, mappoint, cluster, error, title, words in zip(
            list(cleanProjectTexts().keys()),
            tfs_reduced.tolist(),
            tfs_embedded.tolist(),
            tfs_mapped.tolist(),
            clusters.labels_.tolist(),
            reduction_error,
            [next((project['title'] for project in loadProjects() if project['id'] == key), ['None']) for key in cleanProjectTexts().keys()],
            [get_top_words_point(model, point, tfidf_feature_names, 5) for point in tfs_reduced]
            
        )],
        'cluster_data': {
            'cluster_words': cluster_words,
            'cluster_colour': colours
        },
        'cluster_topography': np.flip(interpolated_topography.T, axis=0).flatten().tolist()
    }
    save(payload)
    display(HTML(filename="scatter.css.html"))
    display(Javascript("require.config({paths: {d3: 'https://d3js.org/d3.v5.min'}});"))
    display(Javascript(filename="scatter.js"))
    draw_scatter(payload, width, height, viz)
        


In [21]:
def s(x,y):
    return IntSlider(min=x,max=y, value=(y-x)//2, continuous_update=False)

w = interactive(visualize,targetDim=s(2,3),tfs=fixed(tfs), dimreduction=['LSA', 'NMF'], clustering=['KMEANS'], embedding=['LDA', 'tSNE'], num_topics=s(4,48), num_clusters=s(4,14), perplexity=s(5,50), learning_rate=s(100,1000),error=['red_error', 'cluster_error', 'mixed'], interpolation=['linear', 'cubic', 'nearest'], viz=['scatter', 'linearized'])
output = w.children[-1]
#output.layout.height = '2000px'
display(w)



Comparing sparse matrices using == is inefficient, try using != instead.



interactive(children=(IntSlider(value=2, continuous_update=False, description='targetDim', max=3, min=2), Drop…

In [22]:
#%lprun -f visualize visualize(**w.kwargs)