# Topic extraction from the GEPRiS dataset and creation of an user-centric visualisation
Author: Tim Korjakow        
Summer term 2018      
Freie Universität Berlin     
Fachgebiet Human-Centered Computing

![Process graph](nlpflowchart.svg)

In [1]:
# general imports
import numpy as np
#import sklearn
import os

# data wrangling
import json
import spacy
spacy.prefer_gpu()
from spacy_langdetect import LanguageDetector
import psycopg2
from multiprocessing import Pool, cpu_count

# document embedding
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.matutils import corpus2csc
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
from gensim.parsing.preprocessing import preprocess_string, STOPWORDS, strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, strip_short

# topic extraction
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF as NonnegativeMatrixFactorization
from gensim.models.coherencemodel import CoherenceModel


#clustering
from sklearn.cluster import KMeans

# projection into 2d
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.manifold import TSNE

# linearization
from lapjv import lapjv
from scipy.spatial.distance import cdist
from scipy.interpolate import griddata
from numpy.linalg import norm
from sklearn.preprocessing import normalize

# quality metrics of the clustering
from sklearn.metrics import silhouette_samples

# interactivity
from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider
import ipywidgets as widgets
from IPython.display import display, Javascript, HTML

#2d plot
from bokeh.io import output_notebook, show
from bokeh.models import ColumnDataSource, OpenURL, TapTool
from bokeh.plotting import figure, ColumnDataSource
from bokeh.palettes import d3, brewer
from bokeh.layouts import row, column
output_notebook()

# 3d plot
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True) # for offline mode use

## Loading and Cleaning
The first step in every NLP project which works with texts is always the preparation of the input data. In this example the Project dump from GEPRIS is loaded and the project descriptions are extracted. After that the texts get cleaned by removing all non-alphabetic chars and all stopwords from the texts. English texts are getting filtered in oder to make the analysis simpler and more comparable.

In [2]:
with open(os.environ['PG_PASSWORD']) as password_file:
    password = password_file.read().strip()
    conn = psycopg2.connect(dbname="ikon", user="ikonuser", password=password, port=5432, host='Postgres')
class DataLoader(object):
    def __init__(self, query, clean=True, stream=False, workers=cpu_count()):
        self.query = query
        self.clean = clean
        self.nlp = spacy.load('de', disable=["ner", "tagger"])
        #self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))
        self.nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
        self.nlp.Defaults.stop_words |= self.loadGermanStopwords()
        self.nlp.Defaults.stop_words |= self.loadEnglishStopwords()
        data = self.chunkify(self.loadFromDB(self.query).fetchall(), workers)
        with Pool(workers) as pool:
            self.data = [item for sublist in pool.map(self.preprocessText, data) for item in sublist]
        
        self.filepath = get_tmpfile(str(hash(tuple(self.data))))
        with open(self.filepath, "w") as file:
            for text, *args in self.data:
                file.write("%s\n" % " ".join(text))
            
    def __iter__(self):
        self.pos = 0
        return self
    
    def __next__(self):
        if self.pos >= len(self.data)-1:
            raise StopIteration
        self.pos += 1
        text, *args = self.data[self.pos]
        return TaggedDocument(text, [self.pos])
    
    def  __getitem__(self, pos):
        text, *args = self.data[pos]
        return TaggedDocument(text, [pos])
    
    def __len__(self):
        return len(self.data)
    
    def getIDs(self):
        return [id for (text, id, title) in self.data]
    
    def getTitles(self):
        return [title for (text, id, title) in self.data]
        
    def loadFromDB(self, query):
        cursor = conn.cursor()
        cursor.execute(query)
        return cursor
        
    def loadGermanStopwords(self):
        with open('../data/stopwords_de.json', 'r') as datafile:
            return set(json.load(datafile))

    def loadEnglishStopwords(self):
        with open('../data/stopwords_eng.json', 'r') as datafile:
            return set(json.load(datafile))
        
    def preprocessText(self, results):
        texts, *args = zip(*results)
        data = []
        for doc, *args in zip(self.nlp.pipe(texts, batch_size=100, n_threads=-1), *args):
            if(doc._.language['language'] == 'de'):
                data.append((tuple([token.lemma_ for token in doc if self.filterType(token)]), *args))
        return data
    
    def chunkify(self, lst, n):
        return [lst[i::n] for i in range(n)]
        
    def filterType(self, token):
        return token.is_alpha and not (token.is_stop or token.like_num or token.is_punct) and len(token.lemma_) > 3

In [3]:
traindata = %time DataLoader('''SELECT FIRST(project_abstract), FIRST(id), FIRST(title) \
                                FROM projects WHERE project_abstract NOT LIKE '%Keine Zusammenfassung%' \
                                GROUP BY project_abstract \
                                ;''')
mfndata = DataLoader('''SELECT abstract, id, title \
                        FROM project_view \
                        WHERE institution_id = 13232 AND abstract NOT LIKE '%Zusammenfassung%';''')

CPU times: user 5.52 s, sys: 1.72 s, total: 7.24 s
Wall time: 18min 51s


In [25]:
len(mfndata)

82

## Document Embedding

### TF-IDF
*Summary*:
This technique vectorizes a corpus, e.g. a collection of documents, by counting all appearences of words in the corpus and computing the tf-idf measure for each document, word pair.

In [5]:
dct = %time Dictionary(doc.words for doc in traindata)  # fit dictionary
traincorpus = [dct.doc2bow(doc.words) for doc in traindata]  # convert corpus to BoW format
model_tfidf = %time TfidfModel(traincorpus)  # fit model

CPU times: user 14.2 s, sys: 59.3 ms, total: 14.2 s
Wall time: 14.2 s
CPU times: user 4.97 s, sys: 9.32 ms, total: 4.98 s
Wall time: 4.98 s


In [6]:
mfncorpus = [dct.doc2bow(doc.words) for doc in mfndata]  # convert corpus to BoW format
docs_vectorized_tfidf = corpus2csc(model_tfidf[mfncorpus]).T

### Doc2Vec
*Summary*:
This technique vectorizes a corpus, e.g. a collection of documents, by counting all appearences of words in the corpus and computing the tf-idf measure for each document, word pair.

In [7]:
class Doc2VecExtended(Doc2Vec):
    def top_words(self, vector, topn=5):
        return self.wv.similar_by_vector(vector, topn=topn)

In [8]:
print('Doc2Vec setup and vocabulary building:')
doc2vec_model = %time Doc2VecExtended(corpus_file=traindata.filepath, total_words=dct.num_pos, vector_size=100, window=20, min_count=10, workers=4, epochs=20)
print('Doc2Vec training:')
%time doc2vec_model.train(corpus_file=traindata.filepath, total_words=dct.num_pos, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

Doc2Vec setup and vocabulary building:



This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function



CPU times: user 12min 58s, sys: 2.3 s, total: 13min 1s
Wall time: 3min 31s
Doc2Vec training:
CPU times: user 12min 45s, sys: 2.11 s, total: 12min 47s
Wall time: 3min 22s


In [9]:
docs_vectorized_doc2vec = np.array([doc2vec_model.infer_vector(doc.words) for doc in mfndata])

In [10]:
print(docs_vectorized_doc2vec.shape)

(81, 100)


# Topic extraction

## Latent Semantic Analysis
*Summary*:
The LSA transforms an corpus from its word space given by the tf-idf matrice into its semantic space. In this semantic space the dimensions denote topics in the corpus and every document vector is a linear combination of all the implicitly extracted topics.

In [11]:
def LSA(tfs,num_topics=40):
    print('LSA:')
    lsa = %time TruncatedSVD(n_components=num_topics, random_state=0).fit(tfs)
    return lsa.transform(tfs), lsa

## Autoencoder
Summary: **Coming soon**

In [12]:
def Autoencoder(tfs,num_topics=40):
    pass

## Get top words for each dimension
In order to get the words which are most important for each dimension (which correspond to topics), the standard basis in the topic space is converted back into the word space. These are exactly the eigenvectors of data. Now the top n biggest entries and their corresponding words form the top words.

In [13]:
def get_top_words_dim(model, feature_names, n_top_words):
    dim_topics = {}
    for topic_idx, topic in enumerate(model.components_):
        dim_topics[topic_idx] = [feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]
    return dim_topics

## Clustering

### K-Means
Summary: Given a clustering the LDA can be used to find a projection into a lower dimensional space which maximizes inter-class variance and minimizes intra-class variance. This leads to neater cluster, but is grounded in the hypotheses that the clusters have some real semantic meaning. Otherwise it may enforce preexisting biases.

In [14]:
def clusterNumberHeuristic(tfs):
    return (tfs.shape[0]*tfs.shape[1])//tfs.count_nonzero()

def cluster(tfs_reduced, num_topics=10):
    print('K-Means:')
    km = %time KMeans(n_clusters=num_topics).fit(tfs_reduced)
    return km

### Get top words for each cluster
The process is similar to the one for getting the top words for each dimension. But in this case the cluster centers from the clustering step are transformed back into the word space and analysed. This is based on the assumption that the cluster center represents the set of all documents in the corrsponding cluster.

In [15]:
def get_top_words_point(model, point, feature_names, n_top_words):
    point = np.array(point).reshape(1,-1)
    point.reshape(1, -1)
    word_space_point = model.inverse_transform(point)
    return [feature_names[j] for j in word_space_point.argsort()[0][:-n_top_words - 1:-1]]

# Embedding into 2D

## Linear Discriminant Analysis
*Summary*:
Given a clustering the LDA can be used to find a projection into a lower dimensional space which maximizes inter-class variance and minimizes intra-class variance. This leads to neater cluster, but is grounded in the hypotheses that the clusters have some real semantic meaning. Otherwise it may enforce preexisting biases.

In [16]:
def dimReductionLDA(tfs_reduced, clusters, targetDim=2):
    lda = LinearDiscriminantAnalysis(n_components=targetDim)
    print('LDA:')
    tfs_2d = %time lda.fit(tfs_reduced, clusters.labels_).transform(tfs_reduced)
    return tfs_2d, lda

## tSNE
*Summary*:


*In-depth explanation*:

In [17]:
def dimReductiontSNE(tfs_reduced, perplexity=30, learning_rate=100, targetDim=2):
    print('t-SNE:')
    tfs_2d = %time TSNE(n_components=targetDim, perplexity=perplexity, learning_rate=learning_rate).fit_transform(tfs_reduced)
    return tfs_2d

# Linearize results into a grid

In [18]:
def mapToSpaceSampling(points):
    # just take the first n² < #points Points
    points = points[: int(np.sqrt(len(points)))**2]
    grid = np.dstack(np.meshgrid(np.linspace(np.min(points[:, 0]), np.max(points[:, 0]), int(np.sqrt(len(points)))),
                       np.linspace(np.min(points[:, 1]), np.max(points[:, 1]), int(np.sqrt(len(points)))))).reshape(-1, 2)
    cost = cdist(points, grid, "sqeuclidean").astype(np.float64)
    print(cost.shape)
    cost *= 100000 / cost.max()
    row_ind_lapjv, col_ind_lapjv, _ = lapjv(cost, verbose=True, force_doubles=True)
    return grid[row_ind_lapjv]

In [19]:
def computeClusterTopography(points, values, width, height, interpolation='linear'):
    # lay grid over the points so that all points are covered
    grid_x, grid_y = np.mgrid[np.min(points[:,0]):np.max(points[:,0]):width*1j, np.min(points[:,1]):np.max(points[:,1]):height*1j]
    return griddata(np.array(points), np.array(values[:len(points)]), (grid_x, grid_y), method=interpolation, fill_value=np.min(values[:len(points)]))

In [20]:
def compute(tfs, targetDim, dimreduction, clustering, embedding, num_topics, num_clusters, perplexity, learning_rate, error, interpolation, viz, width, height):
    
    if dimreduction == 'LSA':
        tfs_reduced, model = LSA(tfs, num_topics=num_topics)
    elif dimreduction == 'NMF':
        tfs_reduced, model = NMF(tfs, num_topics=num_topics)
    else:
        return 'No dimensionality reduction technique was selected!'
    
    if clustering == 'KMEANS':
        clusters = cluster(tfs_reduced, num_topics=num_clusters)
    else:
        return 'No clustering technique was selected!'
    
    if embedding == 'LDA':
        tfs_embedded, lda = dimReductionLDA(tfs_reduced, clusters=clusters, targetDim=targetDim)
    elif embedding == 'tSNE':
        tfs_embedded = dimReductiontSNE(tfs_reduced, perplexity=perplexity, learning_rate=learning_rate, targetDim=targetDim)
    else:
        return 'No dimensionality reduction technique was selected!'
    
    # compute linearization
    tfs_mapped = mapToSpaceSampling(tfs_embedded) if targetDim == 2 else np.array([[0,0]]*len(tfs_embedded)) 
    
    # compute top words (TODO: could be flawed)
    tfidf_feature_names = [[]] # [str(token) for token in tfidf_vectorizer.get_feature_names()]
    cluster_words = [[word for word, score in doc2vec_model.wv.similar_by_vector(np.mean(tfs[clusters.labels_==cluster], axis=0), topn=5)] for cluster in range(num_clusters)]
    top_words = [[word for word, score in doc2vec_model.top_words(project, topn=5)] for project in docs_vectorized_doc2vec]
    
    # compute coherence score
    cm = CoherenceModel(topics=cluster_words, corpus=mfncorpus, dictionary=dct, coherence='u_mass')
    print("Coherence score: ", cm.get_coherence())
    
    #compute cluster topography
    similarity_to_cluster_centers = [norm(x-clusters.cluster_centers_[clusters.labels_[i]]) for i,x in enumerate(tfs_reduced)]
    similarity_to_cluster_centers = similarity_to_cluster_centers / norm(similarity_to_cluster_centers)
    reduction_error = np.max(lda.decision_function(tfs_reduced), axis=1) if (embedding == 'LDA') else [0]* len(tfs_embedded)
    reduction_error = reduction_error / norm(reduction_error)
    interpolated_topography = computeClusterTopography(tfs_embedded if viz == 'scatter' else tfs_mapped, similarity_to_cluster_centers if error=='cluster_error' else silhouette_samples(tfs_reduced, clusters.labels_), width, height, interpolation)
    return model, tfs_reduced, clusters, tfs_embedded, tfs_mapped, tfidf_feature_names, cluster_words, top_words, similarity_to_cluster_centers, reduction_error, interpolated_topography

# Visualization

In [21]:
def draw_scatter(data, width=600, height=600, viz='scatter'):
    display(Javascript("""
        (function(element){
            require(['scatter'], function(scatter) {
                scatter(element.get(0), %s, %d, %d, %s);
            });
        })(element);
    """ % (json.dumps(data), width, height, json.dumps(viz))))

In [22]:
def save(payload):
    name = "c" + str(payload['params']['num_clusters']) +"-t" + str(payload['params']['num_topics']) + "_" + str(payload['params']['embedding'])
    if payload['params']['embedding'] == 'tSNE':
        name += "_p" + str(payload['params']['perplexity']) + "-lr" + str(payload['params']['learning_rate'])
    with open('./dumps/' + name + '.json', 'w') as dumpfile:
        json.dump(payload, dumpfile, sort_keys=True, indent=4, ensure_ascii=False)

In [23]:
def visualize(targetDim=2,tfs=None,dimreduction='LSA', clustering='KMEANS', embedding='LDA', num_topics=20, num_clusters=3, perplexity=5, learning_rate=200, error='cluster_error', interpolation='linear', viz='scatter'):
       
    # viz dimensions
    width = 600
    height = 600
    
    # compute all necessary stuff
    model, tfs_reduced, clusters, tfs_embedded, tfs_mapped, tfidf_feature_names, cluster_words, top_words, similarity_to_cluster_centers, reduction_error, interpolated_topography = compute(tfs, targetDim, dimreduction, clustering, embedding, num_topics, num_clusters, perplexity, learning_rate, error, interpolation, viz, width, height)
    
    
    [print(i, words) for i,words in enumerate(cluster_words)]
    colours = d3['Category10'][num_clusters]
    #ids, titles, texts = [list(elem) for elem in zip(*loadProjects())]
    if targetDim == 2:
        # configure bokeh plot                   
        source = ColumnDataSource(data=dict(
            x=tfs_embedded[:, 0],
            y=tfs_embedded[:, 1],
            x_mapped=tfs_mapped[:, 0],
            y_mapped=tfs_mapped[:, 1],
            ids=mfndata.getIDs(),
            titles=mfndata.getTitles(),
            colours=np.array(colours)[clusters.labels_],
            labels=clusters.labels_
        ))

        TOOLTIPS = [
            ("index", "$index"),
            ("id", "@ids"),
            ("title", "@titles"),
        ]
        # scatterplot
        scatter = figure(plot_width=800, plot_height=800, title=None, toolbar_location="below", tooltips=TOOLTIPS, tools='tap,pan,wheel_zoom,save')
        scatter.scatter('x', 'y', size=10,color='colours', legend='labels', source=source)
        url = 'http://gepris.dfg.de/gepris/projekt/@ids'
        taptool = scatter.select(type=TapTool)
        taptool.callback = OpenURL(url=url)
        
        # mapped scatterplot
        mapped_scatter = figure(plot_width=800, plot_height=800, title=None, toolbar_location="below", tooltips=TOOLTIPS, tools='tap,pan,wheel_zoom')
        mapped_scatter.scatter('x_mapped', 'y_mapped', size=50,color='colours', legend='labels', source=source)
        url = 'http://gepris.dfg.de/gepris/projekt/@ids'
        taptool = mapped_scatter.select(type=TapTool)
        taptool.callback = OpenURL(url=url)
        #show(row(scatter, mapped_scatter))
    else:

        source = go.Scatter3d(
            x=tfs_embedded[:, 0],
            y=tfs_embedded[:, 1],
            z=tfs_embedded[:, 2],
            mode='markers',
            marker=dict(
                size=2,
                color=clusters.labels_,                # set color to an array/list of desired values
                colorscale='Viridis',   # choose a colorscale
                opacity=0.8
            )
        )

        data = [source]
        layout = go.Layout(
            margin=dict(
                l=0,
                r=0,
                b=0,
                t=0
            )
        )
        fig = go.Figure(data=data, layout=layout)
        iplot(fig, filename='3d-scatter-colorscale')
    
    payload = {
        'params': {
            'targetDim': targetDim,
            'dimreduction': dimreduction,
            'clustering': clustering,
            'embedding': embedding,
            'num_topics': num_topics,
            'num_clusters': num_clusters,
            'perplexity': perplexity,
            'learning_rate': learning_rate
        },
        'project_data': [{'id':pid,'reducedpoint': reducedpoint, 'embpoint':embpoint, 'mappoint':mappoint, 'cluster':cluster, 'error':error, 'title': title, 'words': words} for pid, reducedpoint, embpoint, mappoint, cluster, error, title, words in zip(
            mfndata.getIDs(),
            tfs_reduced.tolist(),
            tfs_embedded.tolist(),
            tfs_mapped.tolist(),
            clusters.labels_.tolist(),
            reduction_error,
            mfndata.getTitles(),
            top_words
            
        )],
        'cluster_data': {
            'cluster_words': cluster_words,
            'cluster_colour': colours
        },
        'cluster_topography': np.flip(interpolated_topography.T, axis=0).flatten().tolist()
    }
    save(payload)
    display(HTML(filename="scatter.css.html"))
    display(Javascript("require.config({paths: {d3: 'https://d3js.org/d3.v5.min'}});"))
    display(Javascript(filename="scatter.js"))
    draw_scatter(payload, width, height, viz)
        


In [24]:
def s(x,y):
    return IntSlider(min=x,max=y, value=(y-x)//2, continuous_update=False)

w = interactive(visualize,targetDim=s(2,3),tfs=fixed(docs_vectorized_doc2vec), dimreduction=['LSA', 'NMF'], clustering=['KMEANS'], embedding=['LDA', 'tSNE'], num_topics=s(4,48), num_clusters=s(4,14), perplexity=s(5,50), learning_rate=s(100,1000),error=['silhouette', 'cluster_error'], interpolation=['linear', 'cubic', 'nearest'], viz=['scatter', 'linearized'])
output = w.children[-1]
#output.layout.height = '2000px'
display(w)


interactive(children=(IntSlider(value=2, continuous_update=False, description='targetDim', max=3, min=2), Drop…