# Topic extraction from the GEPRiS dataset and creation of a user-centric visualisation
Author: Tim Korjakow        
Summer term 2018      
Freie Universität Berlin     
Fachgebiet Human-Centered Computing

![Process graph](nlpflowchart.svg)

In [1]:
# general imports
import numpy as np
import sklearn

# data wrangling
import json
import spacy
import regex as re
from langdetect import detect
import psycopg2

# document embedding
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.matutils import corpus2csc
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.parsing.preprocessing import preprocess_string, STOPWORDS, strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, strip_short

# topic extraction
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF as NonnegativeMatrixFactorization

#clustering
from sklearn.cluster import KMeans

# projection into 2d
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.manifold import TSNE
from MulticoreTSNE import MulticoreTSNE

# linearization
from lapjv import lapjv
from scipy.spatial.distance import cdist
from scipy.interpolate import griddata
from numpy.linalg import norm
from sklearn.preprocessing import normalize

# interactivity
from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider
import ipywidgets as widgets
from IPython.display import display, Javascript, HTML

#2d plot
from bokeh.io import output_notebook, show
from bokeh.models import ColumnDataSource, OpenURL, TapTool
from bokeh.plotting import figure, ColumnDataSource
from bokeh.palettes import d3, brewer
from bokeh.layouts import row, column
output_notebook()

# 3d plot
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True) # for offline mode use

## Loading and Cleaning
The first step in every NLP project that works with texts is the preparation of the input data. In this example the project dump from GEPRIS is loaded and the project descriptions are extracted. After that the texts are cleaned by removing all non-alphabetic characters and all stopwords. English texts are filtered in order to make the analysis simpler and more comparable.

In [46]:
class DataLoader():
    """Iterable over preprocessed documents fetched from the 'ikon' PostgreSQL database.

    Iterating over an instance re-executes `query` and yields one gensim
    `TaggedDocument` per result row. The first column of every row is treated
    as the raw text and the second column as the document tag; `getIDs` and
    `getTitles` read the third and fourth column respectively.

    Parameters:
        query: SQL statement whose rows start with (text, tag, id, title).
        clean: stored on the instance as `self.clean`; not read anywhere in
               this class.

    Note: the iteration state lives in `self.data`, so only one iteration
    over an instance can be active at a time.
    """

    def __init__(self, query, clean=True):
        # Only reading the password needs the file handle, so it is released
        # before the database connection and the spaCy model are set up.
        with open('../../../../IKON-backend/assets/IKON-backend-config/secrets/postgres_password') as password_file:
            password = password_file.read().strip()
        self.conn = psycopg2.connect(dbname="ikon", user="ikonuser", password=password, port=5432, host='localhost')
        self.nlp = spacy.load('xx')
        self.query = query
        self.clean = clean
        # Until the first call to iter(), `data` holds the complete result set
        # as a list of rows; __iter__ replaces it with a live cursor.
        cursor = self.loadFromDB(self.query)
        try:
            self.data = cursor.fetchall()
        finally:
            cursor.close()
        self.stopwords = STOPWORDS.union(set(self.loadGermanStopwords())).union(set(self.loadEnglishStopwords()))

    def __iter__(self):
        # Every new iteration re-runs the query on a fresh cursor.
        self.data = self.loadFromDB(self.query)
        return self

    def __next__(self):
        row = self.data.fetchone()
        if row is None:
            raise StopIteration
        # The tag is the SECOND column of the row. In the queries used in this
        # notebook that column is ROW_NUMBER(), not the project id.
        text, tag, *_ = row
        custom_filters = [strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, strip_short, self._lemmatize]
        tokens = preprocess_string(text, filters=custom_filters)
        return TaggedDocument(tokens, [tag])

    def _lemmatize(self, text):
        """Replace every token by its spaCy lemma and drop lemmas that are stopwords."""
        return ' '.join(token.lemma_ for token in self.nlp(text) if token.lemma_ not in self.stopwords)

    def _column(self, index):
        """Run the query and return column `index` of every row, closing the cursor afterwards."""
        cursor = self.loadFromDB(self.query)
        try:
            return [row[index] for row in cursor]
        finally:
            cursor.close()

    def getIDs(self):
        """Return the third column (the id) of every row produced by the query."""
        return self._column(2)

    def getTitles(self):
        """Return the fourth column (the title) of every row produced by the query."""
        return self._column(3)

    def loadFromDB(self, query):
        """Execute `query` on a new cursor and return that cursor; the caller owns it."""
        cursor = self.conn.cursor()
        cursor.execute(query)
        return cursor

    def loadGermanStopwords(self):
        """Load the German stopword list from its JSON file."""
        with open('../../../assets/data/nlp/stopwords_de.json', 'r') as datafile:
            return json.load(datafile)

    def loadEnglishStopwords(self):
        """Load the English stopword list from its JSON file."""
        with open('../../../assets/data/nlp/stopwords_eng.json', 'r') as datafile:
            return json.load(datafile)

In [59]:
# Training corpus: every project whose abstract does not contain the
# placeholder text 'Keine Zusammenfassung'.
traindata = DataLoader(
    query="SELECT project_abstract, ROW_NUMBER() OVER (), id, title "
          "FROM projects "
          "WHERE project_abstract NOT LIKE '%Keine Zusammenfassung%';"
)

# Documents to visualise: the projects of institution 13232.
# NOTE(review): this filter drops every abstract containing 'Zusammenfassung',
# which is broader than the 'Keine Zusammenfassung' filter above - confirm intent.
mfndata = DataLoader(
    query="SELECT abstract, ROW_NUMBER() OVER (), id, title "
          "FROM project_view "
          "WHERE institution_id = 13232 AND abstract NOT LIKE '%Zusammenfassung%';"
)

In [61]:
import sys

# Shallow size in bytes of the row list that DataLoader.__init__ fetched
# (the list object itself, not the rows it references). DataLoader has no
# `prefetch` attribute; the fetched rows live in `data`. This only holds
# before `traindata` is iterated, because iterating replaces `data` with a cursor.
sys.getsizeof(traindata.data)

776632

## Document Embedding

### TF-IDF
*Summary*:
This technique vectorizes a corpus, i.e. a collection of documents, by counting all appearances of words in the corpus and computing the tf-idf measure for each (document, word) pair.

In [15]:
dct = %time Dictionary(doc.words for doc in traindata)  # fit dictionary
traincorpus = [dct.doc2bow(doc.words) for doc in traindata]  # convert corpus to BoW format
model_tfidf = %time TfidfModel(traincorpus)  # fit model

CPU times: user 9.14 s, sys: 16 ms, total: 9.15 s
Wall time: 9.19 s
CPU times: user 33.8 ms, sys: 5 µs, total: 33.8 ms
Wall time: 33.8 ms


In [16]:
# Bag-of-words representation of the documents in `mfndata`, expressed in the
# vocabulary that was fitted on the training corpus.
mfncorpus = list(map(lambda doc: dct.doc2bow(doc.words), mfndata))

# Weight the counts with the fitted TF-IDF model, convert to a sparse matrix
# and transpose it.
docs_vectorized_tfidf = corpus2csc(model_tfidf[mfncorpus]).T

### Doc2Vec
*Summary*:
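This technique vectorizes a corpus, i.e. a collection of documents, by training a shallow neural network that learns a dense, fixed-length vector for every document together with the word vectors. Vectors for documents that were not part of the training corpus are inferred afterwards with the trained model.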

In [40]:
print('Doc2Vec setup and vocabulary building:')
doc2vec_model = %time Doc2Vec(traindata, vector_size=100, window=2, min_count=1, workers=1, epochs=20)
print('Doc2Vec training:')
%time doc2vec_model.train(traindata, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

Doc2Vec setup and vocabulary building:
CPU times: user 3min 37s, sys: 212 ms, total: 3min 37s
Wall time: 3min 33s
Doc2Vec training:
CPU times: user 3min 31s, sys: 188 ms, total: 3min 31s
Wall time: 3min 27s


In [18]:
# Infer one Doc2Vec vector per document in `mfndata` and stack them into an array.
docs_vectorized_doc2vec = np.array(
    list(map(lambda doc: doc2vec_model.infer_vector(doc.words), mfndata))
)

In [44]:
# Sanity check: one row per inferred document vector, one column per embedding dimension.
print(docs_vectorized_doc2vec.shape)

(132, 100)


# Topic extraction

## Latent Semantic Analysis
*Summary*:
The LSA transforms a corpus from its word space, given by the tf-idf matrix, into its semantic space. In this semantic space the dimensions denote topics in the corpus and every document vector is a linear combination of all the implicitly extracted topics.

In [21]:
def LSA(tfs,num_topics=40):
    print('LSA:')
    lsa = %time TruncatedSVD(n_components=num_topics, random_state=0).fit(tfs)
    return lsa.transform(tfs), lsa

## Autoencoder
Summary: **Coming soon**

In [22]:
def Autoencoder(tfs,num_topics=40):
    """Placeholder for an autoencoder-based topic extraction; not implemented yet.

    Accepts the same arguments as `LSA` but performs no work and returns None.
    """
    return None

## Get top words for each dimension
In order to get the words which are most important for each dimension (which correspond to topics), the standard basis in the topic space is converted back into the word space. These are exactly the eigenvectors of data. Now the top n biggest entries and their corresponding words form the top words.

In [23]:
def get_top_words_dim(model, feature_names, n_top_words):
    """Map every row index of `model.components_` to its highest-weighted words.

    For each component the `n_top_words` largest entries are selected and
    translated to words via `feature_names`, strongest first.
    """
    def strongest_words(weights):
        # ascending argsort, reversed, then cut to the requested length
        ranking = weights.argsort()[::-1][:n_top_words]
        return [feature_names[idx] for idx in ranking]

    return {dim: strongest_words(weights) for dim, weights in enumerate(model.components_)}

## Clustering

### K-Means
Summary: K-Means partitions the documents in the reduced topic space into a fixed number of clusters by assigning every document to its nearest cluster centre and iteratively moving each centre to the mean of the documents assigned to it. The resulting cluster labels are used to colour the plots and serve as the classes for the LDA projection below.

In [24]:
def clusterNumberHeuristic(tfs):
    """Heuristic cluster count: total number of matrix cells divided (floor) by the number of non-zero cells.

    `tfs` must provide `shape` and a `count_nonzero()` method.
    """
    n_rows, n_cols = tfs.shape[0], tfs.shape[1]
    n_cells = n_rows * n_cols
    return n_cells // tfs.count_nonzero()

def cluster(tfs_reduced, num_topics=10):
    print('K-Means:')
    km = %time KMeans(n_clusters=num_topics).fit(tfs_reduced)
    return km

### Get top words for each cluster
The process is similar to the one for getting the top words for each dimension. But in this case the cluster centers from the clustering step are transformed back into the word space and analysed. This is based on the assumption that the cluster center represents the set of all documents in the corresponding cluster.

In [25]:
def get_top_words_point(model, point, feature_names, n_top_words):
    """Return the `n_top_words` words with the largest weight for one point of the reduced space.

    The point is turned into a single-row array, mapped back with
    `model.inverse_transform`, and the indices of the largest entries are
    translated to words via `feature_names`, strongest first.
    """
    as_row = np.array(point).reshape(1, -1)
    word_space_point = model.inverse_transform(as_row)
    ranking = word_space_point.argsort()[0][::-1][:n_top_words]
    return [feature_names[idx] for idx in ranking]

# Embedding into 2D

## Linear Discriminant Analysis
*Summary*:
Given a clustering, the LDA can be used to find a projection into a lower-dimensional space which maximizes inter-class variance and minimizes intra-class variance. This leads to neater clusters, but is grounded in the hypothesis that the clusters have some real semantic meaning. Otherwise it may enforce preexisting biases.

In [26]:
def dimReductionLDA(tfs_reduced, clusters, targetDim=2):
    lda = LinearDiscriminantAnalysis(n_components=targetDim)
    print('LDA:')
    tfs_2d = %time lda.fit(tfs_reduced, clusters.labels_).transform(tfs_reduced)
    return tfs_2d, lda

## tSNE
*Summary*:


*In-depth explanation*:

In [27]:
def dimReductiontSNE(tfs_reduced, perplexity=30, learning_rate=100, targetDim=2):
    print('t-SNE:')
    tfs_2d = %time (MulticoreTSNE(n_jobs=4, n_components=targetDim, perplexity=perplexity, learning_rate=learning_rate) if targetDim ==2 else TSNE(n_components=targetDim, perplexity=perplexity, learning_rate=learning_rate)).fit_transform(tfs_reduced)
    return tfs_2d

# Analysis

### Statistics on words in the corpus

In [28]:
#words = tfidf_vectorizer.get_feature_names()
#counts = np.sum(tfs, axis=1)
#print(tfs.shape)
#print(counts.shape)
#p = figure(x_range=counts.sort(), plot_width=800, plot_height=800, title=None)
#p.vbar(x=words, top=counts, width=10)
#show(p)

# Linearize results into a grid

In [29]:
def mapToSpaceSampling(points):
    """Snap 2-D points onto a regular square grid spanning their bounding box.

    Only the first n*n points are used, where n is the integer square root of
    the number of points. The squared Euclidean distances between points and
    grid cells are rescaled to a maximum of 100000 and handed to `lapjv`.

    Returns:
        The grid coordinates indexed by the first array returned by `lapjv`
        (presumably the grid cell assigned to each point - confirm against
        the lapjv documentation).
    """
    side = int(np.sqrt(len(points)))
    # just take the first n² < #points Points
    points = points[: side**2]

    xs = np.linspace(np.min(points[:, 0]), np.max(points[:, 0]), side)
    ys = np.linspace(np.min(points[:, 1]), np.max(points[:, 1]), side)
    grid = np.dstack(np.meshgrid(xs, ys)).reshape(-1, 2)

    cost = cdist(points, grid, "sqeuclidean").astype(np.float64)
    print(cost.shape)
    cost *= 100000 / cost.max()

    row_ind_lapjv, col_ind_lapjv, _ = lapjv(cost, verbose=True, force_doubles=True)
    return grid[row_ind_lapjv]

In [30]:
def computeClusterTopography(points, values, width, height, interpolation='linear'):
    """Interpolate `values` given at `points` onto a regular width x height grid.

    The grid spans the bounding box of the points. Only the first
    `len(points)` values are used, and grid cells that cannot be interpolated
    are filled with the smallest of those values.
    """
    x_lo, x_hi = np.min(points[:,0]), np.max(points[:,0])
    y_lo, y_hi = np.min(points[:,1]), np.max(points[:,1])
    # lay grid over the points so that all points are covered;
    # a complex step makes np.mgrid produce that many samples, endpoints included
    grid_x, grid_y = np.mgrid[x_lo:x_hi:width*1j, y_lo:y_hi:height*1j]

    used_values = values[:len(points)]
    return griddata(
        np.array(points),
        np.array(used_values),
        (grid_x, grid_y),
        method=interpolation,
        fill_value=np.min(used_values),
    )

In [31]:
def _reduce_with_nmf(tfs, num_topics=40):
    """Factorise `tfs` into `num_topics` components with a non-negative matrix factorisation.

    Mirrors `LSA`: returns (transformed data, fitted estimator) and uses a fixed random state.
    """
    print('NMF:')
    nmf = NonnegativeMatrixFactorization(n_components=num_topics, random_state=0).fit(tfs)
    return nmf.transform(tfs), nmf


def compute(tfs, targetDim, dimreduction, clustering, embedding, num_topics, num_clusters, perplexity, learning_rate, error, interpolation, viz, width, height):
    """Run the pipeline topic extraction -> clustering -> low-dimensional embedding.

    Parameters:
        tfs: vectorised documents, one row per document.
        targetDim: dimensionality of the final embedding.
        dimreduction: 'LSA' or 'NMF'.
        clustering: 'KMEANS'.
        embedding: 'LDA' or 'tSNE'.
        num_topics: number of dimensions produced by the topic extraction.
        num_clusters: number of clusters.
        perplexity, learning_rate: t-SNE settings (ignored for LDA).
        error, interpolation, viz, width, height: only used by the currently
            disabled cluster-topography code; kept for interface stability.

    Returns:
        An 11-tuple (model, tfs_reduced, clusters, tfs_embedded, tfs_mapped,
        tfidf_feature_names, cluster_words, [], [], [], []). The linearised
        coordinates are all-zero placeholders and the word lists and the four
        trailing entries are empty while the corresponding steps are disabled.

    Raises:
        ValueError: if an unsupported technique name is passed. The previous
            behaviour of returning the message made the caller's tuple
            unpacking fail with an unrelated ValueError.
    """
    if dimreduction == 'LSA':
        tfs_reduced, model = LSA(tfs, num_topics=num_topics)
    elif dimreduction == 'NMF':
        # No `NMF` function exists in this notebook (the sklearn class is
        # imported under the alias NonnegativeMatrixFactorization).
        tfs_reduced, model = _reduce_with_nmf(tfs, num_topics=num_topics)
    else:
        raise ValueError('No dimensionality reduction technique was selected!')

    if clustering == 'KMEANS':
        clusters = cluster(tfs_reduced, num_topics=num_clusters)
    else:
        raise ValueError('No clustering technique was selected!')

    if embedding == 'LDA':
        tfs_embedded, lda = dimReductionLDA(tfs_reduced, clusters=clusters, targetDim=targetDim)
    elif embedding == 'tSNE':
        tfs_embedded = dimReductiontSNE(tfs_reduced, perplexity=perplexity, learning_rate=learning_rate, targetDim=targetDim)
    else:
        raise ValueError('No embedding technique was selected!')

    # compute linearization (disabled: placeholder of one (0, 0) pair per document)
    tfs_mapped = np.zeros((len(tfs_embedded), 2), dtype=int) # mapToSpaceSampling(tfs_embedded) if targetDim == 2 else

    # compute top words (TODO: could be flawed)
    tfidf_feature_names = [] # [str(token) for token in tfidf_vectorizer.get_feature_names()]
    cluster_words = [] # [get_top_words_point(model, center, tfidf_feature_names, 5) for center in clusters.cluster_centers_]

    #compute cluster topography
    #similarity_to_cluster_centers = [norm(x-clusters.cluster_centers_[clusters.labels_[i]]) for i,x in enumerate(tfs_reduced)]
    #similarity_to_cluster_centers = similarity_to_cluster_centers / norm(similarity_to_cluster_centers)
    #reduction_error = np.max(lda.decision_function(tfs_reduced), axis=1) if (embedding == 'LDA') else [0]* len(tfs_embedded)
    #reduction_error = reduction_error / norm(reduction_error)
    #mixed = reduction_error + similarity_to_cluster_centers
    #interpolated_topography = computeClusterTopography(tfs_embedded if viz == 'scatter' else tfs_mapped, similarity_to_cluster_centers if error=='cluster_error' else reduction_error if error=='reduction_error' else mixed, width, height, interpolation)
    return model, tfs_reduced, clusters, tfs_embedded, tfs_mapped, tfidf_feature_names, cluster_words, [], [], [], []

# Visualization

In [32]:
def draw_scatter(data, width=600, height=600, viz='scatter'):
    """Render `data` in the notebook output through the 'scatter' require.js module.

    `data` and `viz` are embedded into the generated JavaScript as JSON,
    `width` and `height` as integers.
    """
    script = """
        (function(element){
            require(['scatter'], function(scatter) {
                scatter(element.get(0), %s, %d, %d, %s);
            });
        })(element);
    """ % (json.dumps(data), width, height, json.dumps(viz))
    display(Javascript(script))

In [33]:
def save(payload):
    """Dump `payload` as pretty-printed JSON into ./dumps/.

    The file name encodes the run parameters found in payload['params']:
    c<num_clusters>-t<num_topics>_<embedding>, followed by
    _p<perplexity>-lr<learning_rate> when the embedding is 'tSNE'.

    The target directory is created when missing. The file is written as
    UTF-8 because the dump uses ensure_ascii=False, so non-ASCII characters
    must not depend on the platform's default encoding.
    """
    import os  # local import: the notebook's import cell does not provide os

    params = payload['params']
    name = "c" + str(params['num_clusters']) + "-t" + str(params['num_topics']) + "_" + str(params['embedding'])
    if params['embedding'] == 'tSNE':
        name += "_p" + str(params['perplexity']) + "-lr" + str(params['learning_rate'])

    os.makedirs('./dumps', exist_ok=True)
    with open('./dumps/' + name + '.json', 'w', encoding='utf-8') as dumpfile:
        json.dump(payload, dumpfile, sort_keys=True, indent=4, ensure_ascii=False)

In [34]:
def _cluster_palette(num_clusters):
    """Return `num_clusters` hex colours taken from bokeh's d3 palettes.

    Category10 is used up to 10 clusters and Category20 up to 20; beyond
    that the 20 colours are repeated cyclically.
    """
    if num_clusters <= 10:
        # the smallest palette requested is 3 colours; slice when fewer are needed
        return list(d3['Category10'][max(num_clusters, 3)][:max(num_clusters, 0)])
    if num_clusters <= 20:
        return list(d3['Category20'][num_clusters])
    base = d3['Category20'][20]
    return [base[i % len(base)] for i in range(num_clusters)]


def _tap_scatter(source, x, y, size, tooltips, tools):
    """Build one bokeh scatter plot whose tap tool opens the tapped project's GEPRIS page."""
    plot = figure(plot_width=800, plot_height=800, title=None, toolbar_location="below", tooltips=tooltips, tools=tools)
    plot.scatter(x, y, size=size, color='colours', legend='labels', source=source)
    taptool = plot.select(type=TapTool)
    taptool.callback = OpenURL(url='http://gepris.dfg.de/gepris/projekt/@ids')
    return plot


def visualize(targetDim=2,tfs=None,dimreduction='LSA', clustering='KMEANS', embedding='LDA', num_topics=20, num_clusters=3, perplexity=5, learning_rate=200, error='cluster_error', interpolation='linear', viz='scatter'):
    """Run the analysis pipeline on `tfs` and plot the embedded documents.

    For `targetDim == 2` two linked bokeh scatter plots are shown (embedding
    and linearised grid); otherwise a plotly 3-D scatter plot is shown.
    Afterwards the JSON payload for the d3 visualisation is assembled; saving
    and drawing it are currently disabled. Returns None.

    Ids and titles are read from the global `mfndata` loader, so `tfs` has to
    be the vectorisation of exactly those documents.
    NOTE(review): ids/titles come from separate executions of the same query
    without ORDER BY - confirm that the row order matches the order of `tfs`.
    """
    # viz dimensions
    width = 600
    height = 600

    # compute all necessary stuff
    result = compute(tfs, targetDim, dimreduction, clustering, embedding, num_topics, num_clusters, perplexity, learning_rate, error, interpolation, viz, width, height)
    if isinstance(result, str):
        # an unsupported option reported as a message instead of a result tuple
        print(result)
        return
    model, tfs_reduced, clusters, tfs_embedded, tfs_mapped, tfidf_feature_names, cluster_words, similarity_to_cluster_centers, reduction_error, mixed, interpolated_topography = result

    for i, words in enumerate(cluster_words):
        print(i, words)

    # d3['Category10'] has no entry for more than 10 colours
    colours = _cluster_palette(num_clusters)
    ids = mfndata.getIDs()
    titles = mfndata.getTitles()

    if targetDim == 2:
        # configure bokeh plot
        source = ColumnDataSource(data=dict(
            x=tfs_embedded[:, 0],
            y=tfs_embedded[:, 1],
            x_mapped=tfs_mapped[:, 0],
            y_mapped=tfs_mapped[:, 1],
            ids=ids,
            titles=titles,
            colours=np.array(colours)[clusters.labels_],
            labels=clusters.labels_
        ))

        TOOLTIPS = [
            ("index", "$index"),
            ("id", "@ids"),
            ("title", "@titles"),
        ]
        # scatterplot
        scatter = _tap_scatter(source, 'x', 'y', 10, TOOLTIPS, 'tap,pan,wheel_zoom,save')
        # mapped scatterplot
        mapped_scatter = _tap_scatter(source, 'x_mapped', 'y_mapped', 50, TOOLTIPS, 'tap,pan,wheel_zoom')
        show(row(scatter, mapped_scatter))
    else:
        source = go.Scatter3d(
            x=tfs_embedded[:, 0],
            y=tfs_embedded[:, 1],
            z=tfs_embedded[:, 2],
            mode='markers',
            marker=dict(
                size=2,
                color=clusters.labels_,                # set color to an array/list of desired values
                colorscale='Viridis',   # choose a colorscale
                opacity=0.8
            )
        )

        data = [source]
        layout = go.Layout(
            margin=dict(
                l=0,
                r=0,
                b=0,
                t=0
            )
        )
        fig = go.Figure(data=data, layout=layout)
        iplot(fig, filename='3d-scatter-colorscale')

    n_docs = len(tfs_reduced)
    # compute() currently returns no per-document errors; pad with None so that
    # zip() below does not silently drop every project.
    errors = list(reduction_error) if len(reduction_error) == n_docs else [None] * n_docs
    # Top words can only be looked up once a vocabulary is available.
    if len(tfidf_feature_names) > 0:
        top_words = [get_top_words_point(model, point, tfidf_feature_names, 5) for point in tfs_reduced]
    else:
        top_words = [[] for _ in range(n_docs)]

    payload = {
        'params': {
            'targetDim': targetDim,
            'dimreduction': dimreduction,
            'clustering': clustering,
            'embedding': embedding,
            'num_topics': num_topics,
            'num_clusters': num_clusters,
            'perplexity': perplexity,
            'learning_rate': learning_rate
        },
        'project_data': [{'id':pid,'reducedpoint': reducedpoint, 'embpoint':embpoint, 'mappoint':mappoint, 'cluster':label, 'error':err, 'title': title, 'words': words} for pid, reducedpoint, embpoint, mappoint, label, err, title, words in zip(
            ids,
            tfs_reduced.tolist(),
            tfs_embedded.tolist(),
            tfs_mapped.tolist(),
            clusters.labels_.tolist(),
            errors,
            titles,
            top_words
        )],
        'cluster_data': {
            'cluster_words': cluster_words,
            'cluster_colour': colours
        },
        # the topography may be an (empty) list while the step is disabled in compute()
        'cluster_topography': np.flip(np.asarray(interpolated_topography).T, axis=0).flatten().tolist()
    }
    #save(payload)
    #display(HTML(filename="scatter.css.html"))
    #display(Javascript("require.config({paths: {d3: 'https://d3js.org/d3.v5.min'}});"))
    #display(Javascript(filename="scatter.js"))
    #draw_scatter(payload, width, height, viz)
        


In [41]:
def s(x,y):
    """Create an IntSlider over [x, y] that starts at the midpoint of the range.

    The initial value used to be (y - x) // 2, i.e. half the width of the
    range, which is below `min` for ranges such as (2, 3). The slider only
    reports its value when released (`continuous_update=False`).
    """
    return IntSlider(min=x, max=y, value=x + (y - x)//2, continuous_update=False)

# Wire every parameter of `visualize` to a widget: sliders for numeric
# settings, dropdowns for the technique choices; `tfs` is passed through unchanged.
# NOTE(review): the disabled topography code in `compute` compares `error`
# against 'reduction_error', while the option offered here is 'red_error'.
w = interactive(
    visualize,
    targetDim=s(2,3),
    tfs=fixed(docs_vectorized_tfidf),
    dimreduction=['LSA', 'NMF'],
    clustering=['KMEANS'],
    embedding=['LDA', 'tSNE'],
    num_topics=s(4,48),
    num_clusters=s(4,14),
    perplexity=s(5,50),
    learning_rate=s(100,1000),
    error=['red_error', 'cluster_error', 'mixed'],
    interpolation=['linear', 'cubic', 'nearest'],
    viz=['scatter', 'linearized'],
)
# the last child of the interactive container is its output area
output = w.children[-1]
#output.layout.height = '2000px'
display(w)



Comparing sparse matrices using == is inefficient, try using != instead.



interactive(children=(IntSlider(value=2, continuous_update=False, description='targetDim', max=3, min=2), Drop…

In [36]:
#%lprun -f visualize visualize(**w.kwargs)