# Topic extraction from the GEPRiS dataset and creation of a user-centric visualisation
Author: Tim Korjakow        
Summer term 2018      
Freie Universität Berlin     
Fachgebiet Human-Centered Computing

![Process graph](nlpflowchart.svg)

In [1]:
# general imports
import numpy as np
#import sklearn
import os

# data wrangling
import json
import spacy
spacy.prefer_gpu()
import psycopg2
from multiprocessing import Pool, cpu_count
from os import path



# document embedding
from gensim.utils import SaveLoad
from gensim.corpora.dictionary import Dictionary
from gensim.sklearn_api import TfIdfTransformer, D2VTransformer
from gensim.parsing.preprocessing import preprocess_string, STOPWORDS, strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, strip_short
from scipy.sparse import csr_matrix
import scipy

# topic extraction
from sklearn.decomposition import TruncatedSVD
from gensim.models.coherencemodel import CoherenceModel
from sklearn.preprocessing import normalize


#clustering
from numpy import triu_indices
from sklearn.cluster import KMeans, AgglomerativeClustering, FeatureAgglomeration
from sklearn.neighbors import radius_neighbors_graph

# projection into 2d
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.manifold import TSNE

# linearization
from lapjv import lapjv
from scipy.spatial.distance import cdist
from scipy.interpolate import griddata
from numpy.linalg import norm
from sklearn.preprocessing import normalize
from scipy.sparse import vstack

# quality metrics of the clustering
from sklearn.metrics import silhouette_samples

# pipeline
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from tempfile import mkdtemp

# interactivity
from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider, Dropdown, FloatSlider, fixed
import ipywidgets as widgets
from IPython.display import display, Javascript, HTML
import pickle

#2d plot
from bokeh.io import output_notebook, show, export_png
from bokeh.models import ColumnDataSource, OpenURL, TapTool, LinearAxis, Grid
from bokeh.plotting import figure
from bokeh.models.glyphs import VBar
from bokeh.palettes import d3, brewer, mpl, inferno
from bokeh.layouts import row, column
output_notebook()
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots

# 3d plot
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True) # for offline mode use

## Loading and Cleaning
The first step in every NLP project which works with texts is always the preparation of the input data. In this example the project dump from GEPRIS is loaded and the project descriptions are extracted. After that the texts are cleaned by removing all non-alphabetic characters and all stopwords. English texts are filtered out in order to make the analysis simpler and more comparable.

In [135]:
# The PG_PASSWORD environment variable holds the path of a file containing
# the database password; surrounding whitespace is stripped from its content.
with open(os.environ['PG_PASSWORD']) as password_file:
    password = password_file.read().strip()

# Module-level connection that DataLoader.loadFromDB uses for its queries.
conn = psycopg2.connect(
    host='Postgres',
    port=5432,
    dbname="ikon",
    user="ikonuser",
    password=password,
)

class DataLoader(object):
    """Run a SQL query on the module-level connection and keep all rows in memory.

    The instance is its own iterator: ``iter(loader)`` resets the read
    position and ``next(loader)`` returns ``loader[position]``, so subclasses
    that override ``__getitem__`` also change what iteration yields.

    Args:
        query: SQL statement executed through ``loadFromDB``.
        clean, stream, workers: accepted for signature compatibility with
            ``DataPreprocessor``; this class does not use them.
    """

    def __init__(self, query, clean=True, stream=False, workers=cpu_count()):
        self.query = query
        self.data = self.loadFromDB(self.query).fetchall()

    def __iter__(self):
        # Restart from the first row every time a new iteration begins.
        self.pos = 0
        return self

    def __next__(self):
        if self.pos >= len(self.data):
            raise StopIteration
        self.pos += 1
        # Go through __getitem__ so that subclass overrides are honoured.
        return self.__getitem__(self.pos - 1)

    def __getitem__(self, pos):
        """Return the row at index ``pos``.

        The previous implementation indexed with ``self.pos`` instead of the
        argument, which skipped the first row during iteration, raised
        IndexError on the last one and failed outright when the object was
        indexed before being iterated.
        """
        return self.data[pos]

    def __len__(self):
        return len(self.data)

    def __str__(self):
        return str(self.data)

    def loadFromDB(self, query):
        """Execute ``query`` on the global ``conn`` and return the open cursor."""
        cursor = conn.cursor()
        cursor.execute(query)
        return cursor
        
class DataPreprocessor(DataLoader):
    """Load rows from the database and turn their text column into lemma tuples.

    Each row returned by the query is expected to be ``(text, id, title)``;
    after preprocessing ``self.data`` holds ``(lemmas, id, title)`` tuples,
    where ``lemmas`` is a tuple of strings. Indexing and iterating the object
    yields only the lemma tuple.

    Args:
        query: SQL statement whose first column is the text to preprocess.
        clean: stored on the instance; not otherwise used in this class.
        stream: accepted but unused.
        workers: number of chunks the rows are split into and size of the
            process pool that preprocesses them.
    """

    def __init__(self, query, clean=True, stream=False, workers=cpu_count()):
        # DataLoader.__init__ is deliberately not called: it would store the
        # raw rows, whereas this class stores the preprocessed ones.
        self.query = query
        self.clean = clean
        self.nlp = spacy.load('de', disable=["ner", "tagger"])
        self.nlp.Defaults.stop_words |= self.loadEnglishStopwords()
        data = self.chunkify(self.loadFromDB(self.query).fetchall(), workers)
        with Pool(workers) as pool:
            self.data = [item for sublist in pool.map(self.preprocessText, data) for item in sublist]

    def getIDs(self):
        """Return the id column of every preprocessed row."""
        return [doc_id for (text, doc_id, title) in self.data]

    def getTitles(self):
        """Return the title column of every preprocessed row."""
        return [title for (text, doc_id, title) in self.data]

    def __getitem__(self, pos):
        text, *rest = self.data[pos]
        return text

    def __hash__(self):
        # self.data is a list, which is unhashable; hash an immutable snapshot.
        # Assumes the id and title values are hashable (plain DB scalars).
        return hash(tuple(self.data))

    def loadEnglishStopwords(self):
        """Read the English stop word list from ../data/stopwords_eng.json as a set."""
        with open('../data/stopwords_eng.json', 'r') as datafile:
            return set(json.load(datafile))

    def preprocessText(self, results):
        """Lemmatise and filter the text column of ``results``.

        Args:
            results: sequence of rows whose first element is the raw text.

        Returns:
            List of ``(lemmas, *other_columns)`` tuples. Rows whose document
            is empty, is not tagged ``'de'`` or has no token left after
            filtering are dropped.
        """
        # A chunk is empty when there are more workers than rows; zip(*[])
        # yields nothing and the unpacking below would raise ValueError.
        if not results:
            return []
        texts, *columns = zip(*results)
        data = []
        for doc, *meta in zip(self.nlp.pipe(texts, batch_size=100, n_threads=-1), *columns):
            if doc.lang_ == 'de' and len(doc) > 0:
                filter_doc = tuple(token.lemma_ for token in doc if self.filterType(token))
                if filter_doc:
                    data.append((filter_doc, *meta))
        return data

    def chunkify(self, lst, n):
        """Split ``lst`` into ``n`` interleaved slices (element i goes to slice i % n)."""
        return [lst[i::n] for i in range(n)]

    def filterType(self, token):
        """Keep alphabetic tokens that are no stop word, number or punctuation
        and whose lemma is longer than three characters."""
        return token.is_alpha and not (token.is_stop or token.like_num or token.is_punct) and len(token.lemma_) > 3


In [136]:
# Build the training corpus: one row per distinct project_abstract from the
# projects table (at most 100 rows), skipping abstracts that contain
# 'Keine Zusammenfassung'. %time is an IPython magic that prints the run time.
traindata = %time DataPreprocessor('''SELECT FIRST(project_abstract), FIRST(id), FIRST(title) \
                                FROM projects \
                                WHERE project_abstract NOT LIKE '%Keine Zusammenfassung%' \
                                GROUP BY project_abstract \
                                LIMIT 100;''')

CPU times: user 5.78 s, sys: 577 ms, total: 6.35 s
Wall time: 15.4 s


In [137]:
# Build the corpus that is visualised: summary, id and title of every row in
# the mfnprojects table whose summary does not contain 'Zusammenfassung'.
mfndata = DataPreprocessor('''SELECT summary, id, titelprojekt \
                        FROM mfnprojects \
                        WHERE summary NOT LIKE '%Zusammenfassung%';''')

## Document Embedding

### TF-IDF
*Summary*:
This technique vectorizes a corpus, i.e. a collection of documents, by counting all appearances of words in the corpus and computing the tf-idf measure for each (document, word) pair.

In [156]:
# Embed the training corpus with the 'doc2vec' method; the result of the last
# expression is what the notebook shows as cell output.
# NOTE(review): Embedding is not defined in the visible part of the notebook.
emb = Embedding(method='doc2vec')
emb.fit_transform(traindata)

<99x100 sparse matrix of type '<class 'numpy.float32'>'
	with 9900 stored elements in Compressed Sparse Row format>

# Topic extraction

## Latent Semantic Analysis
*Summary*:
The LSA transforms a corpus from its word space, given by the tf-idf matrix, into its semantic space. In this semantic space the dimensions denote topics in the corpus and every document vector is a linear combination of all the implicitly extracted topics.

In [158]:
# Serialise a TopicExtraction instance with pickle; the notebook displays the
# resulting bytes as cell output.
# NOTE(review): presumably a check that the step can be cached by the
# pipeline — TopicExtraction is not defined in the visible part of the notebook.
t = TopicExtraction(10)
pickle.dumps(t)

b'\x80\x03c__main__\nTopicExtraction\nq\x00)\x81q\x01}q\x02(X\x08\x00\x00\x00featuresq\x03K\nX\x06\x00\x00\x00methodq\x04X\x03\x00\x00\x00lsaq\x05X\x08\x00\x00\x00selectorq\x06csklearn.decomposition.truncated_svd\nTruncatedSVD\nq\x07)\x81q\x08}q\t(X\t\x00\x00\x00algorithmq\nX\n\x00\x00\x00randomizedq\x0bX\x0c\x00\x00\x00n_componentsq\x0cK\nX\x06\x00\x00\x00n_iterq\rK\x05X\x0c\x00\x00\x00random_stateq\x0eK\x00X\x03\x00\x00\x00tolq\x0fG\x00\x00\x00\x00\x00\x00\x00\x00X\x10\x00\x00\x00_sklearn_versionq\x10X\x06\x00\x00\x000.21.3q\x11ubub.'

## Clustering

### K-Means
Summary: Given a clustering the LDA can be used to find a projection into a lower dimensional space which maximizes inter-class variance and minimizes intra-class variance. This leads to neater clusters, but is grounded in the hypothesis that the clusters have some real semantic meaning. Otherwise it may enforce preexisting biases.

In [159]:
def clusterNumberHeuristic(tfs):
    """Return the number of matrix cells per non-zero entry, rounded down.

    ``tfs`` must expose ``shape`` and a ``count_nonzero()`` method, as scipy
    sparse matrices do.
    """
    n_rows, n_cols = tfs.shape[0], tfs.shape[1]
    total_cells = n_rows * n_cols
    return total_cells // tfs.count_nonzero()

def clusterkm(tfs_reduced, num_topics=10):
    """Fit a KMeans model with ``num_topics`` clusters on ``tfs_reduced`` and return it."""
    return KMeans(n_clusters=num_topics).fit(tfs_reduced)

# Embedding into 2D

## Linear Discriminant Analysis
*Summary*:
Given a clustering the LDA can be used to find a projection into a lower dimensional space which maximizes inter-class variance and minimizes intra-class variance. This leads to neater clusters, but is grounded in the hypothesis that the clusters have some real semantic meaning. Otherwise it may enforce preexisting biases.

## tSNE
*Summary*:


*In-depth explanation*:

# Linearize results into a grid

In [163]:
def mapToSpaceSampling(points):
    """Assign 2D points to the cells of a regular square grid.

    Only the first ``side**2`` points are used, where ``side`` is the integer
    part of ``sqrt(len(points))``, so the result can have fewer rows than the
    input. The grid spans the bounding box of the kept points and the
    assignment is computed by ``lapjv`` on the squared Euclidean distances.

    Returns:
        Array with one grid coordinate per kept point.
    """
    side = int(np.sqrt(len(points)))
    # just take the first n² <= #points points
    points = points[: side ** 2]

    x_axis = np.linspace(np.min(points[:, 0]), np.max(points[:, 0]), side)
    y_axis = np.linspace(np.min(points[:, 1]), np.max(points[:, 1]), side)
    grid = np.dstack(np.meshgrid(x_axis, y_axis)).reshape(-1, 2)

    cost = cdist(points, grid, "sqeuclidean").astype(np.float64)
    # Rescale so that the largest cost is 100000.
    cost *= 100000 / cost.max()

    # presumably the first return value holds, per point, the index of the
    # grid cell assigned to it — confirm against the lapjv documentation.
    grid_index_per_point, _, _ = lapjv(cost, verbose=True, force_doubles=True)
    return grid[grid_index_per_point]

In [164]:
def computeClusterTopography(points, values, width, height, interpolation='linear'):
    """Interpolate per-point values onto a ``width`` x ``height`` grid.

    The grid covers the bounding box of ``points``. Only the first
    ``len(points)`` entries of ``values`` are used, and their minimum is the
    fill value passed to ``griddata``.

    Args:
        points: 2D array with x in column 0 and y in column 1.
        values: one value per point (extra trailing values are ignored).
        width, height: number of grid samples along x and y.
        interpolation: ``method`` argument forwarded to ``griddata``.
    """
    xs = points[:, 0]
    ys = points[:, 1]
    # lay grid over the points so that all points are covered
    grid_x, grid_y = np.mgrid[np.min(xs):np.max(xs):width*1j,
                              np.min(ys):np.max(ys):height*1j]
    known_values = values[:len(points)]
    return griddata(
        np.array(points),
        np.array(known_values),
        (grid_x, grid_y),
        method=interpolation,
        fill_value=np.min(known_values),
    )

In [165]:
def compute(embedding, dimreduction, clustering, planereduction, num_topics, num_clusters, perplexity, learning_rate, error, interpolation, viz, width, height):
    """Run the embedding -> topic extraction -> clustering -> 2D projection pipeline.

    The pipeline is fitted on the global ``traindata`` and then applied to the
    global ``mfndata``. The ``error``, ``interpolation`` and ``viz`` arguments
    are currently only referenced by commented-out code.

    Returns:
        Tuple ``(tfs_reduced, labels, tfs_plane, tfs_mapped, cluster_words,
        top_words, similarity_to_cluster_centers, interpolated_topography)``.
    """
    estimators = [('Embedding', Embedding(method=embedding)),
                  ('EmbeddingData', Debug()),
                  ('TopicExtraction', TopicExtraction(num_topics, method=dimreduction)),
                  ('TopicExtractionData', Debug()),
                  ('Clustering', Clustering(num_clusters, method=clustering)), 
                  ('PlaneReduction', PlaneReduction(2, method=planereduction, perplexity=perplexity, learning_rate=learning_rate))]
    # Fitted steps are cached on disk under /tmp/.
    pipe = Pipeline(estimators, memory='/tmp/')
    
    pipe.fit(traindata)
    # NOTE(review): fit_transform fits the pipeline again on mfndata, so the
    # fit on traindata above may have no effect on the result — confirm
    # whether pipe.transform(mfndata) was intended.
    tfs_plane, labels = pipe.fit_transform(mfndata)
    # The Debug steps expose the data that passed through them as `.data`;
    # presumably this is the output of the preceding step — Debug is not
    # defined in the visible part of the notebook.
    tfs_reduced = pipe.named_steps.TopicExtractionData.data
    tfs = pipe.named_steps.EmbeddingData.data
    print(tfs.shape)
    
    # compute linearization
    # NOTE(review): mapToSpaceSampling keeps only the first floor(sqrt(n))**2
    # points, so tfs_mapped can be shorter than tfs_plane.
    tfs_mapped = mapToSpaceSampling(tfs_plane)
    
    # compute top words
    # Per cluster: average the embedding rows of its documents, then take the
    # five top words of that mean vector. Per document: five top words of its row.
    cluster_words = [pipe.named_steps.Embedding.top_words(csr_matrix(vstack([tfs.getrow(i) for i in np.flatnonzero(labels==cluster)]).mean(axis=0)), topn=5) for cluster in range(num_clusters)]
    top_words = [pipe.named_steps.Embedding.top_words(tfs.getrow(i), topn=5) for i in range(tfs.shape[0])]
    # compute coherence score
    #cm = CoherenceModel(topics=cluster_words, window_size=10, texts=[list(doc.words) for doc in traindata], dictionary=dct, processes=cpu_count())
    
    #compute cluster topography
    # Silhouette value of every document, computed in the 2D plane.
    similarity_to_cluster_centers = silhouette_samples(tfs_plane, labels=labels)

    # Placeholder: a flat array of ones of length width*height; the real
    # interpolation is commented out.
    interpolated_topography = np.array([1]*(width*height))# computeClusterTopography(tfs_plane if viz == 'scatter' else tfs_mapped, silhouette_samples(tfs_reduced, clusters.labels_), width, height, interpolation)

    return tfs_reduced, labels, tfs_plane, tfs_mapped, cluster_words, top_words, similarity_to_cluster_centers, interpolated_topography

# Visualization

In [166]:
def draw_scatter(data, width=600, height=600, viz='scatter'):
    """Emit a Javascript cell output that calls the 'scatter' module.

    ``data`` and ``viz`` are embedded as JSON; ``width`` and ``height`` are
    embedded as integers.
    """
    template = """
        (function(element){
            require(['scatter'], function(scatter) {
                scatter(element.get(0), %s, %d, %d, %s);
            });
        })(element);
    """
    data_json = json.dumps(data)
    viz_json = json.dumps(viz)
    script = template % (data_json, width, height, viz_json)
    display(Javascript(script))

In [167]:
def save(payload):
    """Write ``payload`` as pretty-printed JSON into the ./dumps/ directory.

    The file name is ``c<num_clusters>-t<num_topics>_<embedding>.json``, built
    from ``payload['params']``. When the embedding is ``'tSNE'`` the suffix
    ``_p<perplexity>-lr<learning_rate>`` is inserted before the extension.

    Args:
        payload: JSON-serialisable dict with a ``'params'`` dict holding at
            least ``num_clusters``, ``num_topics`` and ``embedding``.

    Raises:
        KeyError: if a required entry of ``payload['params']`` is missing.
    """
    params = payload['params']
    name = "c" + str(params['num_clusters']) + "-t" + str(params['num_topics']) + "_" + str(params['embedding'])
    # NOTE(review): the embedding choices offered in the UI cell are 'doc2vec'
    # and 'tfidf', so this branch never fires there; 'tsne' is a
    # planereduction value — confirm which parameter was meant.
    if params['embedding'] == 'tSNE':
        name += "_p" + str(params['perplexity']) + "-lr" + str(params['learning_rate'])
    # Create the target directory on first use instead of failing with
    # FileNotFoundError after the whole computation has already run.
    os.makedirs('./dumps/', exist_ok=True)
    with open('./dumps/' + name + '.json', 'w') as dumpfile:
        json.dump(payload, dumpfile, sort_keys=True, indent=4, ensure_ascii=False)

In [168]:
def visualize(embedding='tfidf',dimreduction='LSA', clustering='KMEANS', planereduction='LDA', num_topics=20, granularity=5, perplexity=5, learning_rate=200, error='cluster_error', interpolation='linear', viz='scatter', fake=''):
    """Compute (or load) the visualisation payload and render it in the notebook.

    When ``fake`` is empty the pipeline is run through ``compute``, the
    payload is assembled and written to disk with ``save``. Otherwise
    ``fake`` is treated as the path of a JSON dump that is loaded instead.
    In both cases the payload is handed to the 'scatter' Javascript module.

    Args:
        embedding, dimreduction, clustering, planereduction: method names
            forwarded to ``compute``.
        num_topics: number of topics forwarded to ``compute``.
        granularity: number of clusters.
        perplexity, learning_rate: forwarded to ``compute`` and stored in the
            payload parameters.
        error, interpolation: forwarded to ``compute``.
        viz: forwarded to ``compute`` and to the Javascript renderer.
        fake: path of a previously saved payload; empty string to compute.
    """
    # viz dimensions
    num_clusters = granularity
    width = 600
    height = 600

    if not fake:
        (tfs_reduced, clusters, tfs_embedded, tfs_mapped, cluster_words, top_words,
         similarity_to_cluster_centers, interpolated_topography) = compute(
            embedding, dimreduction, clustering, planereduction, num_topics, num_clusters,
            perplexity, learning_rate, error, interpolation, viz, width, height)

        for i, words in enumerate(cluster_words):
            print(i, words)

        # The Category20 palettes start at size 3; for fewer clusters take
        # the first colours of the smallest palette instead of raising KeyError.
        colours = d3['Category20'][max(num_clusters, 3)][:num_clusters]

        project_ids = mfndata.getIDs()
        project_titles = mfndata.getTitles()

        # configure bokeh plot
        # NOTE(review): both figures below are built but never shown here.
        source = ColumnDataSource(data=dict(
            x=tfs_embedded[:, 0],
            y=tfs_embedded[:, 1],
            x_mapped=tfs_mapped[:, 0],
            y_mapped=tfs_mapped[:, 1],
            ids=project_ids,
            titles=project_titles,
            colours=np.array(colours)[clusters],
            labels=clusters
        ))

        TOOLTIPS = [
            ("index", "$index"),
            ("id", "@ids"),
            ("title", "@titles"),
        ]
        # Tapping a glyph opens the project's GEPRIS page.
        url = 'http://gepris.dfg.de/gepris/projekt/@ids'

        # scatterplot
        scatter = figure(plot_width=800, plot_height=800, title=None, toolbar_location="below", tooltips=TOOLTIPS, tools='tap,pan,wheel_zoom,save')
        scatter.scatter('x', 'y', size=10,color='colours', legend='labels', source=source)
        taptool = scatter.select(type=TapTool)
        taptool.callback = OpenURL(url=url)

        # mapped scatterplot
        mapped_scatter = figure(plot_width=800, plot_height=800, title=None, toolbar_location="below", tooltips=TOOLTIPS, tools='tap,pan,wheel_zoom')
        mapped_scatter.scatter('x_mapped', 'y_mapped', size=50,color='colours', legend='labels', source=source)
        taptool = mapped_scatter.select(type=TapTool)
        taptool.callback = OpenURL(url=url)

        # One record per project; zip stops at the shortest input, and
        # tfs_mapped may be shorter than the other sequences.
        project_data = [
            {'id': pid, 'reducedpoint': reducedpoint, 'embpoint': embpoint, 'mappoint': mappoint,
             'cluster': cluster, 'error': sample_error, 'title': title, 'words': words}
            for pid, reducedpoint, embpoint, mappoint, cluster, sample_error, title, words in zip(
                project_ids,
                tfs_reduced.tolist(),
                tfs_embedded.tolist(),
                tfs_mapped.tolist(),
                clusters.tolist(),
                similarity_to_cluster_centers.tolist(),
                project_titles,
                top_words
            )
        ]

        payload = {
            'params': {
                'dimreduction': dimreduction,
                'clustering': clustering,
                'embedding': embedding,
                'num_topics': num_topics,
                'num_clusters': num_clusters,
                'perplexity': perplexity,
                'learning_rate': learning_rate
            },
            'project_data': project_data,
            'cluster_data': {
                'cluster_words': cluster_words,
                'cluster_colour': colours
            },
            'cluster_topography': np.flip(interpolated_topography.T, axis=0).flatten().tolist()
        }
        save(payload)
    else:
        with open(fake, 'r') as input_data:
            payload = json.load(input_data)
    display(HTML(filename="scatter.css.html"))
    display(Javascript("require.config({paths: {d3: 'https://d3js.org/d3.v5.min'}});"))
    display(Javascript(filename="scatter.js"))
    draw_scatter(payload, width, height, viz)
        


In [169]:
# Suppress all Python warnings from this point on.
import warnings
warnings.filterwarnings('ignore')


In [170]:
def s(x,y):
    """Return an IntSlider over [x, y] that starts at the midpoint of the range.

    The previous initial value ``(y-x)//2`` was half the range width rather
    than the midpoint, which lies below ``min`` whenever ``y < 3*x``
    (e.g. ``s(4, 10)`` asked for 3).
    """
    return IntSlider(min=x, max=y, value=x + (y - x)//2, continuous_update=False)

# Build the control panel for visualize. visualize has no num_clusters
# parameter — the cluster count is passed as granularity — so the 4..10
# slider has to be given under that name to take effect.
w = interactive(visualize, embedding=['doc2vec', 'tfidf'], dimreduction=['lsa'], clustering=['kmeans'], planereduction=['lda','tsne'], num_topics=s(4,48), granularity=s(4,10), perplexity=s(5,50), learning_rate=s(100,1000),error=['silhouette', 'cluster_error'], interpolation=['linear', 'cubic', 'nearest'], viz=['scatter', 'linearized'], fake='')
# The last child of the interactive container is kept for the optional height
# tweak below.
output = w.children[-1]
#output.layout.height = '2000px'
display(w)


interactive(children=(Dropdown(description='embedding', index=1, options=('doc2vec', 'tfidf'), value='tfidf'),…