# Topic extraction from the GEPRIS dataset and creation of a user-centric visualisation
Author: Tim Korjakow        
Summer term 2018      
Freie Universität Berlin     
Fachgebiet Human-Centered Computing

![Process graph](nlpflowchart.svg)

In [21]:
import time
import json
import spacy
import regex as re
from langdetect import detect
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF as NonnegativeMatrixFactorization
from sklearn.cluster import KMeans
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.manifold import TSNE
import numpy as np

from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider
import ipywidgets as widgets
from IPython.display import display

from bokeh.io import output_notebook, show
from bokeh.plotting import figure, ColumnDataSource
from bokeh.palettes import d3
output_notebook()

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True) # for offline mode use

## Loading and Cleaning
The first step in every NLP project which works with texts is always the preparation of the input data. In this example the project dump from GEPRIS is loaded and the project descriptions (abstracts) are extracted. After that the texts are cleaned by removing all non-alphabetic characters, all stopwords and all words of two or fewer characters. Texts that are not detected as German (e.g. English ones) are filtered out in order to make the analysis simpler and more comparable.

In [2]:
def loadProjects():
    """Load the GEPRIS project dump from disk.

    Returns:
        The parsed content of ``projects.json``. Other code in this file
        iterates over the result and reads the ``'id'``, ``'abstract'`` and
        ``'title'`` keys of every entry.
    """
    # JSON text is UTF-8 by specification. Without an explicit encoding,
    # open() falls back to the platform default, which can garble or reject
    # non-ASCII characters such as German umlauts.
    with open('../../../assets/data/project_output/projects.json', 'r', encoding='utf-8') as datafile:
        return json.load(datafile)

def loadGermanStopwords():
    """Load the German stopword list from disk.

    Returns:
        The parsed content of ``stopwords_de.json``; ``cleanProjectTexts``
        turns it into a set for membership tests.
    """
    # Read explicitly as UTF-8: German stopwords contain umlauts, and the
    # platform default encoding is not guaranteed to decode them correctly.
    with open('../../../assets/data/nlp/stopwords_de.json', 'r', encoding='utf-8') as datafile:
        return json.load(datafile)

def loadEnglishStopwords():
    """Load the English stopword list from disk.

    Returns:
        The parsed content of ``stopwords_eng.json``.
    """
    # Explicit UTF-8 keeps this loader independent of the platform default
    # encoding and consistent with the other JSON loaders in this file.
    with open('../../../assets/data/nlp/stopwords_eng.json', 'r', encoding='utf-8') as datafile:
        return json.load(datafile)

def cleanProjectTexts():
    """Build the cleaned German project abstracts, keyed by project id.

    Every project returned by ``loadProjects`` whose abstract is detected as
    German is cleaned: non-letter characters are replaced by spaces, the text
    is lower-cased, and German stopwords as well as words of two or fewer
    characters are dropped.

    Returns:
        dict: maps ``project['id']`` to the cleaned abstract, a single string
        of space-separated words.
    """
    stopwordsDE = set(loadGermanStopwords())
    # Raw string: '\p' is not a valid Python string escape, so the non-raw
    # literal triggers an invalid-escape warning. `re` is the third-party
    # `regex` module here, which is what makes \p{L} (any Unicode letter)
    # available. Compiling once hoists the pattern out of the loop.
    non_letters = re.compile(r'[^\p{L} ]')

    cleanedProjectTexts = {}
    for project in loadProjects():
        abstract = project['abstract']
        # Language detection has nothing to work with on an empty or
        # whitespace-only abstract (langdetect raises for such input), so
        # these projects are skipped instead of aborting the whole run.
        if not abstract or not abstract.strip():
            continue
        if detect(abstract) != 'de':
            continue
        letters_only = non_letters.sub(' ', abstract)
        usefulWords = [
            word
            for word in letters_only.lower().split()
            if len(word) > 2 and word not in stopwordsDE
        ]
        cleanedProjectTexts[project['id']] = ' '.join(usefulWords)
    return cleanedProjectTexts

## TF-IDF computation
*Summary*:
This technique vectorizes a corpus, i.e. a collection of documents, by counting all appearances of words in the corpus and computing the tf-idf measure for each (document, word) pair.

*In-depth explanation*:

In [3]:
nlp = spacy.load('de')
def lemmatize(text):
    """Tokenize *text* with the German spaCy pipeline and return its lemmas.

    Used as the ``tokenizer`` callable of ``TfidfVectorizer``.

    Args:
        text: the document string to process.

    Returns:
        list of str: the lemma of every token, in document order.
    """
    # Return plain strings rather than the spaCy Doc. Token objects from
    # different documents never compare equal, so using them as vocabulary
    # entries creates a separate feature for every single occurrence of a
    # word instead of one feature per (lemmatized) word.
    return [token.lemma_ for token in nlp(text)]

def TfIdf(dict):
    """Fit a TF-IDF model on the values of the given mapping.

    Args:
        dict: mapping whose values are the document strings. (The parameter
            name shadows the builtin; it is kept for interface compatibility.)

    Returns:
        tuple: ``(vectorizer, matrix)`` - the fitted ``TfidfVectorizer`` and
        the matrix returned by its ``fit_transform``.
    """
    started_at = time.time()
    documents = list(dict.values())
    vectorizer = TfidfVectorizer(tokenizer=lemmatize)
    weights = vectorizer.fit_transform(documents)
    print('TFIDF execution time: ', time.time() - started_at)
    return vectorizer, weights

In [4]:
# Clean the project abstracts and fit the TF-IDF model once. The fitted
# vectorizer and the resulting matrix are module-level names that the
# visualisation cells further down read.
tfidf_vectorizer, tfs = TfIdf(cleanProjectTexts())

TFIDF execution time:  5.428374528884888


# Topic extraction

## Latent Semantic Analysis
*Summary*:
The LSA transforms a corpus from its word space, given by the tf-idf matrix, into its semantic space. In this semantic space the dimensions denote topics in the corpus and every document vector is a linear combination of all the implicitly extracted topics.

*In-depth explanation*:

In [5]:
def LSA(tfs,num_topics=40):
    """Reduce the TF-IDF matrix with a truncated SVD (latent semantic analysis).

    Args:
        tfs: the TF-IDF matrix to decompose.
        num_topics: number of components to keep.

    Returns:
        tuple: ``(reduced, model)`` - ``tfs`` projected by the fitted
        ``TruncatedSVD`` model, and the model itself.
    """
    t0 = time.time()
    svd = TruncatedSVD(n_components=num_topics, random_state=0)
    svd.fit(tfs)
    # Only the fit is timed; the projection below happens after the report.
    print('LSA execution time: ', time.time() - t0)

    reduced = svd.transform(tfs)
    return reduced, svd

## Non-negative matrix factorisation
Summary: **Coming soon**

In-depth explanation:

In [6]:
def NMF(tfs,num_topics=40):
    """Reduce the TF-IDF matrix with a non-negative matrix factorisation.

    Args:
        tfs: the TF-IDF matrix to factorise.
        num_topics: number of components to keep.

    Returns:
        tuple: ``(reduced, model)`` - ``tfs`` transformed by the fitted NMF
        model, and the model itself.
    """
    t0 = time.time()
    factoriser = NonnegativeMatrixFactorization(
        n_components=num_topics,
        init='random',
        random_state=0,
    ).fit(tfs)
    # Only the fit is timed; the transform below happens after the report.
    print('NMF execution time: ', time.time() - t0)

    reduced = factoriser.transform(tfs)
    return reduced, factoriser

### Get top words for each dimension
In order to get the words which are most important for each dimension (each dimension corresponds to a topic), the standard basis of the topic space is converted back into the word space. For LSA these basis vectors are the right singular vectors of the tf-idf matrix $X$, i.e. eigenvectors of $X^\top X$. The $n$ largest entries of each vector and their corresponding words form the top words of that dimension.

In [7]:
def get_top_words_dim(model, feature_names, n_top_words):
    """Return the highest-weighted words of every component of *model*.

    Args:
        model: object exposing ``components_``, one weight vector per topic.
        feature_names: sequence mapping a feature index to its word.
        n_top_words: how many words to report per topic.

    Returns:
        dict: topic index -> list of words, ordered from the largest weight
        downwards.
    """
    return {
        topic_idx: [
            feature_names[feature_idx]
            # argsort is ascending, so reverse it and keep the first n.
            for feature_idx in weights.argsort()[::-1][:n_top_words]
        ]
        for topic_idx, weights in enumerate(model.components_)
    }

# Clustering

## K-Means
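Summary: K-Means partitions the documents, represented in the reduced topic space, into a fixed number of clusters by assigning every document to its nearest cluster centre. The resulting cluster labels are what the LDA embedding below builds on: given a clustering, the LDA can be used to find a projection into a lower dimensional space which maximizes inter-class variance and minimizes intra-class variance. This leads to neater clusters, but is grounded in the hypothesis that the clusters have some real semantic meaning. Otherwise it may reinforce preexisting biases.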

In-depth explanation:

In [8]:
def clusterNumberHeuristic(tfs):
    """Suggest a cluster count from the sparsity of the TF-IDF matrix.

    Args:
        tfs: 2-D matrix exposing ``shape`` and ``count_nonzero()``.

    Returns:
        The number of matrix cells divided (floor division) by the number of
        non-zero cells.
    """
    total_cells = tfs.shape[0] * tfs.shape[1]
    filled_cells = tfs.count_nonzero()
    return total_cells // filled_cells

def cluster(tfs_reduced, num_topics=10):
    """Cluster the reduced document vectors with K-Means.

    Args:
        tfs_reduced: the document vectors to cluster.
        num_topics: number of clusters to form.

    Returns:
        The fitted ``KMeans`` estimator.
    """
    t0 = time.time()
    kmeans = KMeans(n_clusters=num_topics)
    kmeans.fit(tfs_reduced)
    print('Clustering execution time: ', time.time() - t0)
    return kmeans

### Get top words for each cluster
The process is similar to the one for getting the top words for each dimension. But in this case the cluster centers from the clustering step are transformed back into the word space and analysed. This is based on the assumption that the cluster center represents the set of all documents in the corresponding cluster.

In [9]:
def get_top_words_cluster(model, clustering_centers, feature_names, n_top_words):
    """Return the highest-weighted words of every cluster centre.

    Args:
        model: object exposing ``inverse_transform``; used to map the cluster
            centres back into the word space.
        clustering_centers: the cluster centres in the reduced space.
        feature_names: sequence mapping a feature index to its word.
        n_top_words: how many words to report per cluster.

    Returns:
        dict: cluster index -> list of words, ordered from the largest weight
        downwards.
    """
    centers_in_word_space = model.inverse_transform(clustering_centers)
    return {
        cluster_idx: [
            feature_names[feature_idx]
            # argsort is ascending, so reverse it and keep the first n.
            for feature_idx in center.argsort()[::-1][:n_top_words]
        ]
        for cluster_idx, center in enumerate(centers_in_word_space)
    }

# Embedding into 2D

## Linear Discriminant Analysis
*Summary*:
Given a clustering, the LDA can be used to find a projection into a lower dimensional space which maximizes inter-class variance and minimizes intra-class variance. This leads to neater clusters, but is grounded in the hypothesis that the clusters have some real semantic meaning. Otherwise it may reinforce preexisting biases.

*In-depth explanation*:

In [10]:
def dimReductionLDA(tfs_reduced, clusters, targetDim=2):
    """Embed the document vectors with a linear discriminant analysis.

    Args:
        tfs_reduced: the document vectors to embed.
        clusters: fitted clustering whose ``labels_`` serve as class labels.
        targetDim: number of output dimensions.

    Returns:
        The document vectors projected into ``targetDim`` dimensions.
    """
    t0 = time.time()
    lda = LinearDiscriminantAnalysis(n_components=targetDim)
    lda.fit(tfs_reduced, clusters.labels_)
    embedded = lda.transform(tfs_reduced)
    print('LDA execution time: ', time.time() - t0)
    return embedded

## tSNE
*Summary*:


*In-depth explanation*:

In [11]:
def dimReductiontSNE(tfs_reduced, perplexity=30, learning_rate=100, targetDim=2):
    """Embed the document vectors with t-SNE.

    Args:
        tfs_reduced: the document vectors to embed.
        perplexity: forwarded to ``TSNE``.
        learning_rate: forwarded to ``TSNE``.
        targetDim: number of output dimensions.

    Returns:
        The embedded document vectors as returned by ``TSNE.fit_transform``.
    """
    t0 = time.time()
    tsne = TSNE(
        n_components=targetDim,
        perplexity=perplexity,
        learning_rate=learning_rate,
    )
    embedded = tsne.fit_transform(tfs_reduced)
    print('tSNE execution time: ', time.time() - t0)
    return embedded

# Analysis

### Statistics on words in the corpus

In [79]:
# Word frequencies over all cleaned abstracts, shown as a bar chart.
# Every value of cleanProjectTexts() is one string of space-separated words.
# Joining the texts with a space keeps the last word of one abstract from
# fusing with the first word of the next one.
words = ' '.join(cleanProjectTexts().values()).split()
# np.unique returns the distinct words (sorted) together with their counts.
words = dict(zip(*np.unique(words, return_counts=True)))
p = figure(x_range=list(words.keys()), plot_width=800, plot_height=800, title=None)
# On a categorical axis every category is one unit wide, so the bar width has
# to stay below 1 or neighbouring bars overlap.
p.vbar(x=list(words.keys()), top=list(words.values()), width=0.9)
show(p)

In [12]:
def save(payload):
    """Serialise *payload* as JSON into ``dump.json`` in the working directory.

    Args:
        payload: a JSON-serialisable object.

    An existing ``dump.json`` is overwritten.
    """
    with open('dump.json', 'w') as sink:
        json.dump(payload, fp=sink)

In [17]:
def visualize(targetDim=2,tfs=None,dimreduction='LSA', clustering='KMEANS', embedding='LDA', num_topics=20, num_clusters=3, perplexity=5, learning_rate=200 ):
    """Run the topic pipeline, plot the embedded documents and dump the result.

    The TF-IDF matrix is reduced to ``num_topics`` dimensions, clustered into
    ``num_clusters`` clusters and embedded into ``targetDim`` dimensions. The
    top words of each cluster are printed, the embedding is shown (bokeh for
    2-D, plotly otherwise) and parameters plus results are written via
    ``save``.

    Args:
        targetDim: 2 for the bokeh scatter plot; any other value takes the
            plotly 3-D branch, which reads three embedding columns.
        tfs: the TF-IDF matrix to analyse.
        dimreduction: ``'LSA'`` or ``'NMF'``.
        clustering: ``'KMEANS'``.
        embedding: ``'LDA'`` or ``'tSNE'``.
        num_topics: number of dimensions kept by the reduction step.
        num_clusters: number of clusters; also selects the size of the
            ``Category20`` palette.
        perplexity: forwarded to the t-SNE embedding.
        learning_rate: forwarded to the t-SNE embedding.

    Returns:
        A message string when an unknown technique name was passed, otherwise
        ``None``.
    """
    if dimreduction == 'LSA':
        tfs_reduced, model = LSA(tfs, num_topics=num_topics)
    elif dimreduction == 'NMF':
        tfs_reduced, model = NMF(tfs, num_topics=num_topics)
    else:
        return 'No dimensionality reduction technique was selected!'

    if clustering == 'KMEANS':
        clusters = cluster(tfs_reduced, num_topics=num_clusters)
    else:
        return 'No clustering technique was selected!'

    if embedding == 'LDA':
        tfs_embedded = dimReductionLDA(tfs_reduced, clusters=clusters, targetDim=targetDim)
    elif embedding == 'tSNE':
        tfs_embedded = dimReductiontSNE(tfs_reduced, perplexity=perplexity, learning_rate=learning_rate, targetDim=targetDim)
    else:
        # This branch is about the embedding step, so say so instead of
        # repeating the dimensionality-reduction message.
        return 'No embedding technique was selected!'

    tfidf_feature_names = [str(token) for token in tfidf_vectorizer.get_feature_names()]
    # Computed once and reused for both the printout and the saved payload.
    cluster_words = get_top_words_cluster(model, clusters.cluster_centers_, tfidf_feature_names, 5)
    for cluster_idx, top_words in cluster_words.items():
        print(cluster_idx, top_words)

    palette = d3['Category20'][num_clusters]

    if targetDim == 2:
        # Clean the corpus and read the project dump once each. Re-reading
        # the dump for every plotted point makes the lookup quadratic in the
        # number of projects.
        project_ids = list(cleanProjectTexts().keys())
        wanted_ids = set(project_ids)
        titles_by_id = {}
        for project in loadProjects():
            project_id = project['id']
            # Keep the first title seen for an id.
            if project_id in wanted_ids and project_id not in titles_by_id:
                titles_by_id[project_id] = project['title']

        # configure bokeh plot
        source = ColumnDataSource(data=dict(
            x=tfs_embedded[:, 0],
            y=tfs_embedded[:, 1],
            ids=project_ids,
            titles=[titles_by_id.get(project_id, ['None']) for project_id in project_ids],
            colours=np.array(palette)[clusters.labels_]
        ))

        TOOLTIPS = [
            ("index", "$index"),
            ("id", "@ids"),
            ("title", "@titles"),
        ]

        p = figure(plot_width=800, plot_height=800, title=None, toolbar_location="below", tooltips=TOOLTIPS)
        p.scatter('x', 'y', size=10,color='colours', source=source)
        show(p)
    else:
        source = go.Scatter3d(
            x=tfs_embedded[:, 0],
            y=tfs_embedded[:, 1],
            z=tfs_embedded[:, 2],
            mode='markers',
            marker=dict(
                size=12,
                color=clusters.labels_,                # set color to an array/list of desired values
                colorscale='Viridis',   # choose a colorscale
                opacity=0.8
            )
        )

        data = [source]
        layout = go.Layout(
            margin=dict(
                l=0,
                r=0,
                b=0,
                t=0
            )
        )
        fig = go.Figure(data=data, layout=layout)
        iplot(fig, filename='3d-scatter-colorscale')

    payload = {
        'params': {
            'targetDim': targetDim,
            'dimreduction': dimreduction,
            'clustering': clustering,
            'embedding': embedding,
            'num_topics': num_topics,
            'num_clusters': num_clusters,
            'perplexity': perplexity,
            'learning_rate': learning_rate
        },
        'data': {
            'points': tfs_embedded.tolist(),
            'clusters': clusters.labels_.tolist(),
            'cluster_words': list(cluster_words.values()),
            'cluster_colour': palette
        }
    }
    save(payload)
        


In [18]:
def s(x,y):
    """Create an integer slider over ``[x, y]`` that starts at the midpoint.

    Args:
        x: lower bound of the slider.
        y: upper bound of the slider.

    Returns:
        An ``IntSlider`` with ``continuous_update`` switched off.
    """
    # (y - x) // 2 alone is half the range *width*; it has to be offset by x
    # to land inside [x, y] (e.g. for x=10, y=12 it would yield 1).
    midpoint = x + (y - x) // 2
    return IntSlider(min=x, max=y, value=midpoint, continuous_update=False)

# Build the interactive control panel: one widget per `visualize` parameter.
# The TF-IDF matrix is passed as a fixed (non-widget) argument.
w = interactive(
    visualize,
    targetDim=s(2, 3),
    tfs=fixed(tfs),
    dimreduction=['LSA', 'NMF'],
    clustering=['KMEANS'],
    embedding=['LDA', 'tSNE'],
    num_topics=s(4, 48),
    num_clusters=s(4, 14),
    perplexity=s(5, 50),
    learning_rate=s(2, 20),
)
# The last child of the container is the output area; give it a fixed height.
output = w.children[-1]
output.layout.height = '1500px'
display(w)



Comparing sparse matrices using == is inefficient, try using != instead.



interactive(children=(IntSlider(value=2, continuous_update=False, description='targetDim', max=3, min=2), Drop…