# Topic extraction from the GEPRiS dataset and creation of an user-centric visualisation
Author: Tim Korjakow        
Summer term 2018      
Freie Universität Berlin     
Fachgebiet Human-Centered Computing

![Process graph](nlpflowchart.svg)

In [1]:
# general imports
import numpy as np
#import sklearn
import os

# data wrangling
import json
import spacy
spacy.prefer_gpu()
from spacy_langdetect import LanguageDetector
import psycopg2
from multiprocessing import Pool, cpu_count

# document embedding
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.matutils import corpus2csc
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
from gensim.parsing.preprocessing import preprocess_string, STOPWORDS, strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, strip_short
import scipy

# topic extraction
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF as NonnegativeMatrixFactorization
from gensim.models.coherencemodel import CoherenceModel
import keras
from sklearn.preprocessing import normalize


#clustering
from numpy import triu_indices
from sklearn.cluster import KMeans, AgglomerativeClustering, FeatureAgglomeration
from sklearn.neighbors import radius_neighbors_graph

# projection into 2d
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.manifold import TSNE

# linearization
from lapjv import lapjv
from scipy.spatial.distance import cdist
from scipy.interpolate import griddata
from numpy.linalg import norm
from sklearn.preprocessing import normalize

# quality metrics of the clustering
from sklearn.metrics import silhouette_samples

# interactivity
from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider, Dropdown, FloatSlider
import ipywidgets as widgets
from IPython.display import display, Javascript, HTML
import pickle

#2d plot
from bokeh.io import output_notebook, show, export_png
from bokeh.models import ColumnDataSource, OpenURL, TapTool, LinearAxis, Grid
from bokeh.plotting import figure
from bokeh.models.glyphs import VBar
from bokeh.palettes import d3, brewer, mpl, inferno
from bokeh.layouts import row, column
output_notebook()
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots

# 3d plot
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True) # for offline mode use

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Loading and Cleaning
The first step in every NLP project which works with texts is always the preparation of the input data. In this example the Project dump from GEPRIS is loaded and the project descriptions are extracted. After that the texts get cleaned by removing all non-alphabetic chars and all stopwords from the texts. English texts are getting filtered in oder to make the analysis simpler and more comparable.

In [4]:
with open(os.environ['PG_PASSWORD']) as password_file:
    password = password_file.read().strip()
    conn = psycopg2.connect(dbname="ikon", user="ikonuser", password=password, port=5432, host='Postgres')

class DataLoader(object):
    def __init__(self, query, clean=True, stream=False, workers=cpu_count()):
        self.query = query
        self.data = self.loadFromDB(self.query).fetchall()
            
    def __iter__(self):
        self.pos = 0
        return self
    
    def __next__(self):
        if self.pos >= len(self.data):
            raise StopIteration
        self.pos += 1
        return self.data[self.pos-1]
    
    def  __getitem__(self, pos):
        text, *args = self.data[pos]
        return self.data[self.pos]
    
    def __len__(self):
        return len(self.data)
    
    def __str__(self):
        return str(self.data)
    
    def loadFromDB(self, query):
        cursor = conn.cursor()
        cursor.execute(query)
        return cursor
        
class DataPreprocessor(DataLoader):
    def __init__(self, query, clean=True, stream=False, workers=cpu_count()):
        self.query = query
        self.clean = clean
        self.nlp = spacy.load('de', disable=["ner", "tagger"])
        #self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))
        self.nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
        self.nlp.Defaults.stop_words |= self.loadGermanStopwords()
        self.nlp.Defaults.stop_words |= self.loadEnglishStopwords()
        data = self.chunkify(self.loadFromDB(self.query).fetchall(), workers)
        with Pool(workers) as pool:
            self.data = [item for sublist in pool.map(self.preprocessText, data) for item in sublist]
        
        self.filepath = get_tmpfile(str(hash(tuple(self.data))))
        with open(self.filepath, "w") as file:
            for text, *args in self.data:
                file.write("%s\n" % " ".join(text))
    
    def __next__(self):
        if self.pos >= len(self.data):
            raise StopIteration
        text, *args = self.data[self.pos]
        self.pos += 1
        return TaggedDocument(text, [self.pos])
    
    def  __getitem__(self, pos):
        text, *args = self.data[pos]
        return TaggedDocument(text, [pos])
    
    def getIDs(self):
        return [id for (text, id, title) in self.data]
    
    def getTitles(self):
        return [title for (text, id, title) in self.data]
        
    def loadGermanStopwords(self):
        with open('../data/stopwords_de.json', 'r') as datafile:
            return set(json.load(datafile))

    def loadEnglishStopwords(self):
        with open('../data/stopwords_eng.json', 'r') as datafile:
            return set(json.load(datafile))
        
    def preprocessText(self, results):
        texts, *args = zip(*results)
        data = []
        for doc, *args in zip(self.nlp.pipe(texts, batch_size=100, n_threads=-1), *args):
            if(doc._.language['language'] == 'de'):
                data.append((tuple([token.lemma_ for token in doc if self.filterType(token)]), *args))
        return data
    
    def chunkify(self, lst, n):
        return [lst[i::n] for i in range(n)]
        
    def filterType(self, token):
        return token.is_alpha and not (token.is_stop or token.like_num or token.is_punct) and len(token.lemma_) > 3

In [5]:
traindata = %time DataPreprocessor('''SELECT FIRST(project_abstract), FIRST(id), FIRST(title) \
                                      FROM projects \
                                      WHERE project_abstract NOT LIKE '%Keine Zusammenfassung%' \
                                      GROUP BY project_abstract \
                                       \
                                      ;''')
mfndata = DataPreprocessor('''SELECT abstract, id, title \
                              FROM project_view \
                              WHERE institution_id = 13232 AND abstract NOT LIKE '%Zusammenfassung%';''')

CPU times: user 5.23 s, sys: 1.74 s, total: 6.97 s
Wall time: 19min 3s


## Data
Firstly we are going to have a look at the type of texts we have:

In [None]:
textlength_histogram = DataLoader('''SELECT text_length, COUNT(text_length) \
                                     FROM ( \
                                         SELECT array_length(regexp_split_to_array(trim(project_abstract), '\s'), 1) as text_length \
                                         FROM projects \
                                     ) l \
                                     GROUP BY text_length \
                                     ;''')

In [None]:
def plot_histogram(bins, name):
    fig = go.Figure(data=[go.Bar(x=[x[0] for x in bins], y=[x[1] for x in bins], marker_color='black')])
    fig.update_layout(
        title=go.layout.Title(
            xref="paper",
            x=0
        ),
        xaxis=go.layout.XAxis(
            title=go.layout.xaxis.Title(
                text="Document length in #words",
                font=dict(
                    family="Courier New, monospace",
                    size=18,
                    color="#7f7f7f"
                )
            )
        ),
        yaxis=go.layout.YAxis(
            title=go.layout.yaxis.Title(
                text="Absolute frequency in the corpus",
                font=dict(
                    family="Courier New, monospace",
                    size=18,
                    color="#7f7f7f"
                )
            )
        )
    )
    fig.update_layout(yaxis_type="log")
    fig.show()

In [None]:
plot_histogram(textlength_histogram, 'histogram')

As one can see there is a peak at word count 3 and one at approximatly 100. The first one corresponds to all projects which do not have descriptions, because they are described with "Keine Zusammenfassung vorhanden". The latter peak on the other hand is produced by projects from a fund which uses the same descriptions for all its projects which are financed through the DFG. 

In [None]:
cleaned_textlength_histogram = DataLoader('''SELECT text_length, COUNT(text_length) \
                                     FROM ( \
                                         SELECT array_length(regexp_split_to_array(trim(FIRST(project_abstract)), '\s'), 1) as text_length \
                                         FROM projects \
                                         WHERE project_abstract NOT LIKE '%Keine Zusammenfassung%' \
                                         GROUP BY project_abstract \
                                     ) l \
                                     GROUP BY text_length \
                                     ;''')

In [None]:
plot_histogram(cleaned_textlength_histogram, 'cleaned_histogram')

## Document Embedding

### TF-IDF
*Summary*:
This technique vectorizes a corpus, e.g. a collection of documents, by counting all appearences of words in the corpus and computing the tf-idf measure for each document, word pair.

In [6]:
class TfidfModelExtended(TfidfModel):
    def top_words(self, vector, dct=None ,topn=5):
        if isinstance(vector, scipy.sparse.csr_matrix):
            vector = vector.todense()
        return [dct.get(entry) for entry in np.argpartition(np.asarray(vector).ravel(), -topn)[-topn:]]

In [7]:
dct = %time Dictionary(doc.words for doc in traindata)  # fit dictionary
traincorpus = [dct.doc2bow(doc.words) for doc in traindata]  # convert corpus to BoW format
tfidf_model = %time TfidfModelExtended(traincorpus)  # fit model

CPU times: user 18.7 s, sys: 106 ms, total: 18.8 s
Wall time: 18.8 s
CPU times: user 5.16 s, sys: 17.4 ms, total: 5.18 s
Wall time: 5.18 s


In [8]:
mfncorpus = [dct.doc2bow(doc.words) for doc in mfndata]  # convert corpus to BoW format
docs_vectorized_tfidf = corpus2csc(tfidf_model[mfncorpus]).T

### Doc2Vec
*Summary*:
This technique vectorizes a corpus, e.g. a collection of documents, by counting all appearences of words in the corpus and computing the tf-idf measure for each document, word pair.

In [9]:
class Doc2VecExtended(Doc2Vec):
    def top_words(self, vector, dct=None, topn=5):
        return [word for word, prob in self.wv.similar_by_vector(vector, topn=topn)]

In [10]:
print('Doc2Vec setup and vocabulary building:')
doc2vec_model = %time Doc2VecExtended(corpus_file=traindata.filepath, total_words=dct.num_pos, vector_size=100, window=20, min_count=4, workers=cpu_count(), epochs=30)
print('Doc2Vec training:')
%time doc2vec_model.train(corpus_file=traindata.filepath, total_words=dct.num_pos, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

Doc2Vec setup and vocabulary building:
CPU times: user 22min 56s, sys: 4.44 s, total: 23min 1s
Wall time: 7min 12s
Doc2Vec training:
CPU times: user 21min 29s, sys: 3.8 s, total: 21min 33s
Wall time: 5min 51s


In [11]:
docs_vectorized_doc2vec = np.array([doc2vec_model.infer_vector(doc.words) for doc in mfndata])

# Topic extraction

## Latent Semantic Analysis
*Summary*:
The LSA transforms an corpus from its word space given by the tf-idf matrice into its semantic space. In this semantic space the dimensions denote topics in the corpus and every document vector is a linear combination of all the implicitly extracted topics.

In [12]:
def LSA(tfs,num_topics=40):
    print('LSA:')
    lsa = %time TruncatedSVD(n_components=num_topics, random_state=0).fit(tfs)
    return lsa.transform(tfs), lsa

## Autoencoder
Summary: **Coming soon**

In [13]:
from keras.layers import Input, Dense
from keras.models import Model
from keras import regularizers

def create_autoencoder(input_dim, encoding_dim=50):
    # this is our input placeholder
    input_img = Input(shape=(input_dim,))
    # "encoded" is the encoded representation of the input
    encoded = Dense(encoding_dim, activation='relu', activity_regularizer=regularizers.l1(10e-5))(input_img)
    # "decoded" is the lossy reconstruction of the input
    decoded = Dense(input_dim, activation='sigmoid')(encoded)

    # this model maps an input to its reconstruction
    autoencoder = Model(input_img, decoded)

    # this model maps an input to its encoded representation
    encoder = Model(input_img, encoded)

    # create a placeholder for an encoded (32-dimensional) input
    encoded_input = Input(shape=(encoding_dim,))
    # retrieve the last layer of the autoencoder model
    decoder_layer = autoencoder.layers[-1]
    # create the decoder model
    decoder = Model(encoded_input, decoder_layer(encoded_input))
    autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')

    return autoencoder, encoder, decoder

In [14]:
input_train_doc2vec = normalize(doc2vec_model.docvecs.vectors_docs)
input_test_doc2vec = normalize(docs_vectorized_doc2vec)
autoencoder_doc2vec, encoder_doc2vec, decoder_doc2vec = create_autoencoder(doc2vec_model.docvecs.vectors_docs.shape[1])
history = autoencoder_doc2vec.fit(input_train_doc2vec, input_train_doc2vec,
                epochs=75,
                batch_size=256,
                shuffle=True,
                validation_data=(input_test_doc2vec, input_test_doc2vec),
                verbose=0)

W0730 19:17:52.396291 139642196645696 deprecation_wrapper.py:119] From /tf/.venv/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0730 19:17:52.561265 139642196645696 deprecation_wrapper.py:119] From /tf/.venv/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0730 19:17:52.575141 139642196645696 deprecation_wrapper.py:119] From /tf/.venv/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0730 19:17:52.624560 139642196645696 deprecation_wrapper.py:119] From /tf/.venv/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0730 19:17:52.643626 139642196645696 deprecation_wrapper.py

In [15]:
input_train_doc2vec.shape

(82233, 100)

In [None]:
# Plot training & validation loss values
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(75)), y=history.history['loss'], name='train_loss'))
fig.add_trace(go.Scatter(x=list(range(75)), y=history.history['val_loss'], name='vall_loss'))
fig.update_xaxes(title_text="Number of epochs")
fig.update_yaxes(title_text="Loss")
fig.update_layout(yaxis_type="log",
                  width=1000, 
                  height=600)
fig.show()

In [None]:
fig = go.Figure()

err_lsa = []
err_aut = []
spar_lsa = []
spar_aut = []
dims = list(range(10,20,5))
for dim in dims:
    print(dim)
    autoencoder, *args = create_autoencoder(doc2vec_model.docvecs.vectors_docs.shape[1], encoding_dim=dim)
    autoencoder.fit(input_train_doc2vec, input_train_doc2vec,
                epochs=70,
                batch_size=256,
                shuffle=True,
                verbose=0)
    tfs_lsa, lsa_model = LSA(input_train_doc2vec, num_topics=dim)
    orig_tfs_lda = lsa_model.inverse_transform(tfs_lsa)
    orig_tfs_aut = autoencoder.predict(input_train_doc2vec)
    err_lsa.append(np.average(norm(orig_tfs_lda-input_train_doc2vec, axis=1)))
    err_aut.append(np.average(norm(orig_tfs_aut-input_train_doc2vec, axis=1)))
fig.add_trace(go.Scatter(x=dims, y=err_lsa, name='Error LSA'))
fig.add_trace(go.Scatter(x=dims, y=err_aut, name='Error Autoencoder'))
fig.add_trace(go.Scatter(x=dims, y=spar_lsa, name='Sparcity LSA'))
fig.add_trace(go.Scatter(x=dims, y=spar_aut, name='Sparcity Autoencoder'))
fig.update_layout(yaxis_type="log")
fig.update_xaxes(title_text="Dimensionality of latent topic space")
fig.update_yaxes(title_text="Reconstruction loss")


fig.show()

## Clustering

### K-Means
Summary: Given a clustering the LDA can be used to find a projection into a lower dimensional space which maximizes inter-class variance and minimizes intra-class variance. This leads to neater cluster, but is grounded in the hypotheses that the clusters have some real semantic meaning. Otherwise it may enforce preexisting biases.

In [16]:
def clusterNumberHeuristic(tfs):
    return (tfs.shape[0]*tfs.shape[1])//tfs.count_nonzero()

def clusterkm(tfs_reduced, num_topics=10):
    print('K-Means:')
    km = %time KMeans(n_clusters=num_topics).fit(tfs_reduced)
    return km

### Agglomerative Clustering

In [17]:
ind = triu_indices(docs_vectorized_doc2vec.shape[0], 1)
wmds = np.zeros((docs_vectorized_doc2vec.shape[0], docs_vectorized_doc2vec.shape[0]))
euds = np.zeros((docs_vectorized_doc2vec.shape[0], docs_vectorized_doc2vec.shape[0]))
def symmetrize(a):
    return a + a.T - np.diag(a.diagonal())

def wmd(x):
    return doc2vec_model.wv.wmdistance(mfndata[x[0]].words, mfndata[x[1]].words)

def eud(x):
    return norm(doc2vec_model[x[0]] - doc2vec_model[x[1]])

with Pool(cpu_count()) as p:
    wmds[ind] = p.map(wmd, zip(*ind))
    euds[ind] = p.map(eud, zip(*ind))
wmds = symmetrize(wmds)
euds = symmetrize(euds)

In [18]:
fig = go.Figure(data=[go.Box(y=(wmds/euds).flatten(),
            boxpoints='all', # can also be outliers, or suspectedoutliers, or False
            jitter=0.3, # add some jitter for a better separation between points
            pointpos=-1.8, # relative position of points wrt box
            name=""
              )])
fig.update_layout(
    title=go.layout.Title(
        xref="paper",
        x=0
    ),
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(
            text="Quotient between WMD and Euclidian distance"
        )
    )
)

fig.show()


invalid value encountered in true_divide



In [19]:
def clusterag(tfs_reduced, num_clusters=5):
    print('Agglomerative Clustering:')
    am = %time AgglomerativeClustering(n_clusters=num_clusters, affinity='precomputed', memory='/tmp', linkage='average').fit(wmds)
    return am

# Embedding into 2D

## Linear Discriminant Analysis
*Summary*:
Given a clustering the LDA can be used to find a projection into a lower dimensional space which maximizes inter-class variance and minimizes intra-class variance. This leads to neater cluster, but is grounded in the hypotheses that the clusters have some real semantic meaning. Otherwise it may enforce preexisting biases.

In [20]:
def dimReductionLDA(tfs_reduced, clusters, targetDim=2):
    lda = LinearDiscriminantAnalysis(n_components=targetDim)
    print('LDA:')
    tfs_2d = %time lda.fit(tfs_reduced, clusters.labels_).transform(tfs_reduced)
    return tfs_2d, lda

## tSNE
*Summary*:


*In-depth explanation*:

In [21]:
def dimReductiontSNE(tfs_reduced, perplexity=30, learning_rate=100, targetDim=2):
    print('t-SNE:')
    tfs_2d = %time TSNE(n_components=targetDim, perplexity=perplexity, learning_rate=learning_rate).fit_transform(tfs_reduced)
    return tfs_2d

# Linearize results into a grid

In [22]:
def mapToSpaceSampling(points):
    # just take the first n² < #points Points
    points = points[: int(np.sqrt(len(points)))**2]
    grid = np.dstack(np.meshgrid(np.linspace(np.min(points[:, 0]), np.max(points[:, 0]), int(np.sqrt(len(points)))),
                       np.linspace(np.min(points[:, 1]), np.max(points[:, 1]), int(np.sqrt(len(points)))))).reshape(-1, 2)
    cost = cdist(points, grid, "sqeuclidean").astype(np.float64)
    print(cost.shape)
    cost *= 100000 / cost.max()
    row_ind_lapjv, col_ind_lapjv, _ = lapjv(cost, verbose=True, force_doubles=True)
    return grid[row_ind_lapjv]

In [23]:
def computeClusterTopography(points, values, width, height, interpolation='linear'):
    # lay grid over the points so that all points are covered
    grid_x, grid_y = np.mgrid[np.min(points[:,0]):np.max(points[:,0]):width*1j, np.min(points[:,1]):np.max(points[:,1]):height*1j]
    return griddata(np.array(points), np.array(values[:len(points)]), (grid_x, grid_y), method=interpolation, fill_value=np.min(values[:len(points)]))

In [30]:
def compute(tfs, emb_model, targetDim, dimreduction, clustering, embedding, num_topics, num_clusters, perplexity, learning_rate, error, interpolation, viz, width, height):
    
    if dimreduction == 'LSA':
        tfs_reduced, red_model = LSA(tfs, num_topics=num_topics)
    elif dimreduction == 'Autoencoder':
        print('Autoencoder:')
        tfs_reduced, red_model = encoder_doc2vec.predict(tfs), None
    else:
        return 'No dimensionality reduction technique was selected!'
    
    if clustering == 'KMEANS':
        clusters = clusterkm(tfs_reduced, num_topics=num_clusters)
        cluster_centers_ = clusters.cluster_centers_
    elif clustering == 'Agglomerative Clustering':
        clusters = clusterag(tfs_reduced, num_clusters=num_clusters)
        cluster_centers_ = [np.mean(tfs[clusters.labels_[clusters.labels_ == x]]) for x in range(num_clusters)]
    else:
        return 'No clustering technique was selected!'
    
    sim_kernel = 1/(1+wmds)
    if embedding == 'LDA':
        tfs_embedded, lda = dimReductionLDA(tfs_reduced, clusters=clusters, targetDim=targetDim)
    elif embedding == 'tSNE':
        tfs_embedded = dimReductiontSNE(tfs_reduced, perplexity=perplexity, learning_rate=learning_rate, targetDim=targetDim)
    else:
        return 'No dimensionality reduction technique was selected!'
    
    # compute linearization
    tfs_mapped = mapToSpaceSampling(tfs_embedded) if targetDim == 2 else np.array([[0,0]]*len(tfs_embedded)) 
    
    # compute top words
    cluster_words = [emb_model.top_words(np.mean(tfs[clusters.labels_==cluster], axis=0), dct=dct, topn=5) for cluster in range(num_clusters)]
    top_words = [emb_model.top_words(project, dct=dct, topn=5) for project in tfs]
    # compute coherence score
    cm = CoherenceModel(topics=cluster_words, window_size=10, texts=[list(doc.words) for doc in traindata], dictionary=dct, processes=cpu_count())
    
    #compute cluster topography
    similarity_to_cluster_centers = [norm(x-cluster_centers_[clusters.labels_[i]]) for i,x in enumerate(tfs_reduced)]
    similarity_to_cluster_centers = similarity_to_cluster_centers / -norm(similarity_to_cluster_centers)
    #reduction_error = np.max(lda.decision_function(tfs_reduced), axis=1) if (embedding == 'LDA') else [0]* len(tfs_embedded)
    #eduction_error = reduction_error / norm(reduction_error)
    interpolated_topography = computeClusterTopography(tfs_embedded if viz == 'scatter' else tfs_mapped, similarity_to_cluster_centers if error=='cluster_error' else silhouette_samples(tfs_reduced, clusters.labels_), width, height, interpolation)
    interpolated_topography = np.array([1])*len(interpolated_topography)
    return tfs_reduced, clusters, tfs_embedded, tfs_mapped, cluster_words, top_words, similarity_to_cluster_centers, interpolated_topography, cm

# Visualization

In [25]:
def draw_scatter(data, width=600, height=600, viz='scatter'):
    display(Javascript("""
        (function(element){
            require(['scatter'], function(scatter) {
                scatter(element.get(0), %s, %d, %d, %s);
            });
        })(element);
    """ % (json.dumps(data), width, height, json.dumps(viz))))

In [26]:
def save(payload):
    name = "c" + str(payload['params']['num_clusters']) +"-t" + str(payload['params']['num_topics']) + "_" + str(payload['params']['embedding'])
    if payload['params']['embedding'] == 'tSNE':
        name += "_p" + str(payload['params']['perplexity']) + "-lr" + str(payload['params']['learning_rate'])
    with open('./dumps/' + name + '.json', 'w') as dumpfile:
        json.dump(payload, dumpfile, sort_keys=True, indent=4, ensure_ascii=False)

In [27]:
def visualize(targetDim=2,tfs=None,dimreduction='LSA', clustering='KMEANS', embedding='LDA', num_topics=20, num_clusters=5, perplexity=5, learning_rate=200, error='cluster_error', interpolation='linear', viz='scatter', fake=''):
    # viz dimensions
    width = 600
    height = 600
    payload = {}
    
    
    if not fake:
        # compute all necessary stuff
        tfs, model = pickle.loads(tfs)
        tfs_reduced, clusters, tfs_embedded, tfs_mapped, cluster_words, top_words, similarity_to_cluster_centers, interpolated_topography, cm = compute(tfs, model, targetDim, dimreduction, clustering, embedding, num_topics, num_clusters, perplexity, learning_rate, error, interpolation, viz, width, height)

        [print(i, words) for i,words in enumerate(cluster_words)]
        colours = d3['Category20'][num_clusters]
        #ids, titles, texts = [list(elem) for elem in zip(*loadProjects())]
        if targetDim == 2:
            # configure bokeh plot                   
            source = ColumnDataSource(data=dict(
                x=tfs_embedded[:, 0],
                y=tfs_embedded[:, 1],
                x_mapped=tfs_mapped[:, 0],
                y_mapped=tfs_mapped[:, 1],
                ids=mfndata.getIDs(),
                titles=mfndata.getTitles(),
                colours=np.array(colours)[clusters.labels_],
                labels=clusters.labels_
            ))

            TOOLTIPS = [
                ("index", "$index"),
                ("id", "@ids"),
                ("title", "@titles"),
            ]
            # scatterplot
            scatter = figure(plot_width=800, plot_height=800, title=None, toolbar_location="below", tooltips=TOOLTIPS, tools='tap,pan,wheel_zoom,save')
            scatter.scatter('x', 'y', size=10,color='colours', legend='labels', source=source)
            url = 'http://gepris.dfg.de/gepris/projekt/@ids'
            taptool = scatter.select(type=TapTool)
            taptool.callback = OpenURL(url=url)

            # mapped scatterplot
            mapped_scatter = figure(plot_width=800, plot_height=800, title=None, toolbar_location="below", tooltips=TOOLTIPS, tools='tap,pan,wheel_zoom')
            mapped_scatter.scatter('x_mapped', 'y_mapped', size=50,color='colours', legend='labels', source=source)
            url = 'http://gepris.dfg.de/gepris/projekt/@ids'
            taptool = mapped_scatter.select(type=TapTool)
            taptool.callback = OpenURL(url=url)
            #show(row(scatter, mapped_scatter))
        else:

            source = go.Scatter3d(
                x=tfs_embedded[:, 0],
                y=tfs_embedded[:, 1],
                z=tfs_embedded[:, 2],
                mode='markers',
                marker=dict(
                    size=2,
                    color=clusters.labels_,                # set color to an array/list of desired values
                    colorscale='Viridis',   # choose a colorscale
                    opacity=0.8
                )
            )

            data = [source]
            layout = go.Layout(
                margin=dict(
                    l=0,
                    r=0,
                    b=0,
                    t=0
                )
            )
            fig = go.Figure(data=data, layout=layout)
            iplot(fig, filename='3d-scatter-colorscale')

        payload = {
            'params': {
                'targetDim': targetDim,
                'dimreduction': dimreduction,
                'clustering': clustering,
                'embedding': embedding,
                'num_topics': num_topics,
                'num_clusters': num_clusters,
                'perplexity': perplexity,
                'learning_rate': learning_rate
            },
            'project_data': [{'id':pid,'reducedpoint': reducedpoint, 'embpoint':embpoint, 'mappoint':mappoint, 'cluster':cluster, 'error':error, 'title': title, 'words': words} for pid, reducedpoint, embpoint, mappoint, cluster, error, title, words in zip(
                mfndata.getIDs(),
                tfs_reduced.tolist(),
                tfs_embedded.tolist(),
                tfs_mapped.tolist(),
                clusters.labels_.tolist(),
                similarity_to_cluster_centers.tolist(),
                mfndata.getTitles(),
                top_words

            )],
            'cluster_data': {
                'cluster_words': cluster_words,
                'cluster_colour': colours
            },
            'cluster_topography': np.flip(interpolated_topography.T, axis=0).flatten().tolist()
        }
        save(payload)
    else:
        with open(fake, 'r') as input_data:
            payload = payload=json.load(input_data)
    display(HTML(filename="scatter.css.html"))
    display(Javascript("require.config({paths: {d3: 'https://d3js.org/d3.v5.min'}});"))
    display(Javascript(filename="scatter.js"))
    draw_scatter(payload, width, height, viz)
        


In [32]:
import warnings
warnings.filterwarnings('ignore')

In [33]:
def s(x,y):
    return IntSlider(min=x,max=y, value=(y-x)//2, continuous_update=False)

def f(x,y):
    return FloatSlider(min=x,max=y, value=(y-x)//2, step=0.0001, continuous_update=False)

w = interactive(visualize,targetDim=s(2,3),tfs=Dropdown(options=[('Doc2Vec', pickle.dumps((docs_vectorized_doc2vec, doc2vec_model))), ('TfIdf', pickle.dumps((docs_vectorized_tfidf, tfidf_model)))], value=pickle.dumps((docs_vectorized_doc2vec, doc2vec_model))), dimreduction=['LSA', 'Autoencoder'], clustering=['KMEANS', 'Agglomerative Clustering'], embedding=['LDA', 'tSNE'], num_topics=s(4,48), num_clusters=s(3,10), perplexity=s(5,50), learning_rate=s(10,100),error=['silhouette', 'cluster_error'], interpolation=['linear', 'cubic', 'nearest'], viz=['scatter', 'linearized'], fake='')
output = w.children[-1]
#output.layout.height = '2000px'
display(w)


interactive(children=(IntSlider(value=2, continuous_update=False, description='targetDim', max=3, min=2), Drop…

### Evaluate topic extraction

In [29]:
# Initialize figure with subplots
fig = make_subplots(
    rows=2, cols=2, subplot_titles=("Doc2Vec + LSA", "Doc2Vec + Autoencoder", "TfIdf + LSA", "TfIdf + Autoencoder")
)

# Update xaxis properties
#fig.update_xaxes(title_text="xaxis 1 title", row=1, col=1)
#fig.update_xaxes(title_text="xaxis 2 title", range=[10, 50], row=1, col=2)
#fig.update_xaxes(title_text="xaxis 3 title", showgrid=False, row=2, col=1)
#fig.update_xaxes(title_text="xaxis 4 title", type="log", row=2, col=2)

# Update yaxis properties
#fig.update_yaxes(title_text="yaxis 1 title", row=1, col=1)
#fig.update_yaxes(title_text="yaxis 2 title", range=[40, 80], row=1, col=2)
#fig.update_yaxes(title_text="yaxis 3 title", showgrid=False, row=2, col=1)
#fig.update_yaxes(title_text="yaxis 4 title", row=2, col=2)

# Update title and height
#fig.update_layout(title_text="Customizing Subplot Axes", height=700)

def compute_cms(num_clusters):
    cms = []
    for rerun in range(3):
        tfs, model = pickle.loads(embmodel)
        *args, cm = compute(tfs, model, 2, dimreduction, clustering, 'LDA', 50, num_clusters, 5, 100, 'cluster_error', 'linear', 'scatter', 10, 10)
        print(cm)
        cms.append(cm.get_coherence())
    return sum(cms)/len(cms)


for i,(name,embmodel) in enumerate([('Doc2Vec', pickle.dumps((docs_vectorized_doc2vec, doc2vec_model))), ('TfIdf', pickle.dumps((docs_vectorized_tfidf, tfidf_model)))]):
    for j,dimreduction in enumerate(['LSA', 'Autoencoder']):
        if name == 'TfIdf' and dimreduction =='Autoencoder':
            continue
        for clustering in ['KMEANS', 'Agglomerative Clustering']:
            data = []
            dims = list(range(3, 15))
            data = [compute_cms(dim) for dim in dims]
            fig.add_trace(go.Scatter(x=dims, y=data, name=clustering, marker_color="red" if clustering == 'KMEANS' else "blue", showlegend=True if (i==0 and j==0) else False), row=i+1, col=j+1)
fig.update_xaxes(title_text="Number of clusters", row=1, col=1)
fig.update_xaxes(title_text="Number of clusters", row=1, col=2)
fig.update_xaxes(title_text="Number of clusters", row=2, col=1)

fig.update_yaxes(title_text="Coherence score", row=1, col=1)
fig.update_yaxes(title_text="Coherence score", row=1, col=2)
fig.update_yaxes(title_text="Coherence score", row=2, col=1)
fig.show()

LSA:
CPU times: user 19 ms, sys: 8.09 ms, total: 27.1 ms
Wall time: 7.64 ms
K-Means:
CPU times: user 111 ms, sys: 82.1 ms, total: 193 ms
Wall time: 49.9 ms
LDA:
CPU times: user 12 ms, sys: 8.02 ms, total: 20 ms
Wall time: 5.21 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)
LSA:
CPU times: user 26.1 ms, sys: 11.9 ms, total: 38 ms
Wall time: 11.4 ms
K-Means:
CPU times: user 118 ms, sys: 116 ms, total: 234 ms
Wall time: 59.2 ms
LDA:
CPU times: user 3.55 ms, sys: 15.9 ms, total: 19.4 ms
Wall time: 4.89 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)
LSA:
CPU times: user 10.3 ms, sys: 16 ms, total: 26.3 ms
Wall time:







LSA:
CPU times: user 12.6 ms, sys: 39.9 ms, total: 52.5 ms
Wall time: 16.4 ms
K-Means:
CPU times: user 151 ms, sys: 136 ms, total: 287 ms
Wall time: 79.3 ms
LDA:
CPU times: user 16.1 ms, sys: 4.07 ms, total: 20.1 ms
Wall time: 5.05 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)
LSA:
CPU times: user 95.8 ms, sys: 39.9 ms, total: 136 ms
Wall time: 47.6 ms
K-Means:
CPU times: user 162 ms, sys: 128 ms, total: 290 ms
Wall time: 74.5 ms
LDA:
CPU times: user 13.3 ms, sys: 11.9 ms, total: 25.2 ms
Wall time: 7.27 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 25.1 ms, sys: 23.8 ms, total: 48.9 ms
Wall time: 15.1 ms
K-Means:
CPU times: user 155 ms, sys: 172 ms, total: 326 ms
Wall time: 83.3 ms
LDA:
CPU times: user 12.5 ms, sys: 7.03 ms, total: 19.5 ms
Wall time: 5.02 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)
LSA:
CPU times: user 19 ms, sys: 7.97 ms, total: 27 ms
Wall time: 7.81 ms
K-Means:
CPU times: user 185 ms, sys: 148 ms, total: 333 ms
Wall time: 85 ms
LDA:
CPU times: user 16 ms, sys: 4.15 ms, total: 20.1 ms
Wall time: 5.19 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)
LSA:
CPU times: user 15.2 ms, sys: 16 ms, total: 31.2 ms
Wall time: 







LSA:
CPU times: user 27.8 ms, sys: 20 ms, total: 47.7 ms
Wall time: 14.9 ms
K-Means:
CPU times: user 201 ms, sys: 192 ms, total: 393 ms
Wall time: 101 ms
LDA:
CPU times: user 18.9 ms, sys: 14.8 ms, total: 33.7 ms
Wall time: 13.5 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 19.2 ms, sys: 19.7 ms, total: 39 ms
Wall time: 12.8 ms
K-Means:
CPU times: user 211 ms, sys: 216 ms, total: 427 ms
Wall time: 111 ms
LDA:
CPU times: user 0 ns, sys: 7.41 ms, total: 7.41 ms
Wall time: 3.66 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 25.5 ms, sys: 19.8 ms, total: 45.3 ms
Wall time: 14.2 ms
K-Means:
CPU times: user 222 ms, sys: 204 ms, total: 426 ms
Wall time: 110 ms
LDA:
CPU times: user 0 ns, sys: 7.8 ms, total: 7.8 ms
Wall time: 3.61 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)
LSA:
CPU times: user 25.1 ms, sys: 7.8 ms, total: 32.9 ms
Wall time: 9.86 ms
K-Means:
CPU times: user 220 ms, sys: 216 ms, total: 435 ms
Wall time: 112 ms
LDA:
CPU times: user 7.53 ms, sys: 100 µs, total: 7.63 ms
Wall time: 3.82 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 13 ms, sys: 32 ms, total: 45 ms
Wall time: 14.9 ms
K-Means:
CPU times: user 228 ms, sys: 208 ms, total: 436 ms
Wall time: 113 ms
LDA:
CPU times: user 8.33 ms, sys: 126 µs, total: 8.46 ms
Wall time: 4.37 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)
LSA:
CPU times: user 17.5 ms, sys: 16 ms, total: 33.5 ms
Wall time: 10.1 ms
K-Means:
CPU times: user 233 ms, sys: 200 ms, total: 433 ms
Wall time: 113 ms
LDA:
CPU times: user 174 µs, sys: 7.68 ms, total: 7.85 ms
Wall time: 3.89 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)
LSA:
CPU times: user 11.4 ms, sys: 19.7 ms, total: 31.1 ms
Wall time: 9.







LSA:
CPU times: user 22.9 ms, sys: 15.8 ms, total: 38.7 ms
Wall time: 12.3 ms
K-Means:
CPU times: user 245 ms, sys: 192 ms, total: 437 ms
Wall time: 116 ms
LDA:
CPU times: user 3.82 ms, sys: 3.99 ms, total: 7.81 ms
Wall time: 3.92 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 34.4 ms, sys: 11.7 ms, total: 46 ms
Wall time: 14 ms
K-Means:
CPU times: user 215 ms, sys: 228 ms, total: 443 ms
Wall time: 121 ms
LDA:
CPU times: user 88 µs, sys: 7.65 ms, total: 7.74 ms
Wall time: 3.76 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 17.1 ms, sys: 19.9 ms, total: 37 ms
Wall time: 11.5 ms
K-Means:
CPU times: user 264 ms, sys: 180 ms, total: 443 ms
Wall time: 122 ms
LDA:
CPU times: user 6.99 ms, sys: 96 µs, total: 7.09 ms
Wall time: 4.11 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)
LSA:
CPU times: user 11.9 ms, sys: 15.8 ms, total: 27.7 ms
Wall time: 8.32 ms
Agglomerative Clustering:
CPU times: user 109 ms, sys: 160 ms, total: 269 ms
Wall time: 73.9 ms
LDA:
CPU times: user 2.27 ms, sys: 19.7 ms, total: 21.9 ms
Wall time: 5.65 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)
LSA:
CPU times: user 60.7 ms, sys: 40.1 ms, tota

LSA:
CPU times: user 23.4 ms, sys: 8.3 ms, total: 31.7 ms
Wall time: 10.1 ms
Agglomerative Clustering:
CPU times: user 4.22 ms, sys: 11.4 ms, total: 15.6 ms
Wall time: 3.92 ms
LDA:
CPU times: user 12.7 ms, sys: 7.52 ms, total: 20.2 ms
Wall time: 5.12 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)
LSA:
CPU times: user 15.5 ms, sys: 11.8 ms, total: 27.3 ms
Wall time: 7.78 ms
Agglomerative Clustering:
CPU times: user 3.92 ms, sys: 12 ms, total: 15.9 ms
Wall time: 4.17 ms
LDA:
CPU times: user 12.8 ms, sys: 8.66 ms, total: 21.5 ms
Wall time: 5.42 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)
LSA:
CPU times: user 15

LSA:
CPU times: user 22 ms, sys: 15.9 ms, total: 37.9 ms
Wall time: 11.8 ms
Agglomerative Clustering:
CPU times: user 0 ns, sys: 16.5 ms, total: 16.5 ms
Wall time: 4.3 ms
LDA:
CPU times: user 10.8 ms, sys: 11.5 ms, total: 22.3 ms
Wall time: 5.59 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)
LSA:
CPU times: user 19.7 ms, sys: 8.39 ms, total: 28.1 ms
Wall time: 8.35 ms
Agglomerative Clustering:
CPU times: user 5.71 ms, sys: 11.4 ms, total: 17.1 ms
Wall time: 4.3 ms
LDA:
CPU times: user 2.95 ms, sys: 19.5 ms, total: 22.5 ms
Wall time: 5.63 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)
LSA:
CPU times: user 10.5 m







Autoencoder:
K-Means:
CPU times: user 38 ms, sys: 114 µs, total: 38.1 ms
Wall time: 38.1 ms
LDA:
CPU times: user 12.1 ms, sys: 0 ns, total: 12.1 ms
Wall time: 5.31 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)
Autoencoder:
K-Means:
CPU times: user 36.7 ms, sys: 0 ns, total: 36.7 ms
Wall time: 36.6 ms
LDA:
CPU times: user 0 ns, sys: 8.83 ms, total: 8.83 ms
Wall time: 4.27 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)
Autoencoder:
K-Means:
CPU times: user 33.1 ms, sys: 4.06 ms, total: 37.1 ms
Wall time: 37 ms
LDA:
CPU times: user 3.36 ms, sys: 10.3 ms, total: 13.7 ms
Wall time: 5.82 ms
(81, 81)
Coherence_Measur







Autoencoder:
K-Means:
CPU times: user 36.4 ms, sys: 0 ns, total: 36.4 ms
Wall time: 36.3 ms
LDA:
CPU times: user 13.8 ms, sys: 441 µs, total: 14.3 ms
Wall time: 6.18 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)
Autoencoder:
K-Means:
CPU times: user 41.5 ms, sys: 0 ns, total: 41.5 ms
Wall time: 41.3 ms
LDA:
CPU times: user 15.1 ms, sys: 537 µs, total: 15.6 ms
Wall time: 6.69 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








Autoencoder:
K-Means:
CPU times: user 41.6 ms, sys: 0 ns, total: 41.6 ms
Wall time: 41.5 ms
LDA:
CPU times: user 17.2 ms, sys: 3.77 ms, total: 20.9 ms
Wall time: 8.15 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








Autoencoder:
K-Means:
CPU times: user 40 ms, sys: 3.99 ms, total: 44 ms
Wall time: 43.9 ms
LDA:
CPU times: user 0 ns, sys: 8.99 ms, total: 8.99 ms
Wall time: 4.33 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








Autoencoder:
K-Means:
CPU times: user 46.7 ms, sys: 0 ns, total: 46.7 ms
Wall time: 46.7 ms
LDA:
CPU times: user 3.65 ms, sys: 3.74 ms, total: 7.39 ms
Wall time: 3.65 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








Autoencoder:
K-Means:
CPU times: user 49 ms, sys: 60 µs, total: 49.1 ms
Wall time: 49 ms
LDA:
CPU times: user 7.59 ms, sys: 3.75 ms, total: 11.3 ms
Wall time: 5.02 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)
Autoencoder:
K-Means:
CPU times: user 46.3 ms, sys: 133 µs, total: 46.4 ms
Wall time: 46.1 ms
LDA:
CPU times: user 401 µs, sys: 7.63 ms, total: 8.04 ms
Wall time: 4.05 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








Autoencoder:
K-Means:
CPU times: user 48.5 ms, sys: 28 µs, total: 48.5 ms
Wall time: 48.4 ms
LDA:
CPU times: user 2.9 ms, sys: 7.77 ms, total: 10.7 ms
Wall time: 4.77 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)
Autoencoder:
K-Means:
CPU times: user 48.8 ms, sys: 53 µs, total: 48.9 ms
Wall time: 48.4 ms
LDA:
CPU times: user 0 ns, sys: 10 ms, total: 10 ms
Wall time: 4.54 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








Autoencoder:
K-Means:
CPU times: user 50.9 ms, sys: 0 ns, total: 50.9 ms
Wall time: 50.9 ms
LDA:
CPU times: user 10 ms, sys: 8.12 ms, total: 18.2 ms
Wall time: 7.22 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








Autoencoder:
K-Means:
CPU times: user 53.8 ms, sys: 108 µs, total: 53.9 ms
Wall time: 53.8 ms
LDA:
CPU times: user 7.78 ms, sys: 3.75 ms, total: 11.5 ms
Wall time: 5.13 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








Autoencoder:
K-Means:
CPU times: user 54.6 ms, sys: 160 µs, total: 54.8 ms
Wall time: 54.7 ms
LDA:
CPU times: user 2 ms, sys: 7.45 ms, total: 9.45 ms
Wall time: 4.39 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)
Autoencoder:
K-Means:
CPU times: user 61.4 ms, sys: 0 ns, total: 61.4 ms
Wall time: 61.3 ms
LDA:
CPU times: user 10.7 ms, sys: 8.07 ms, total: 18.8 ms
Wall time: 7.58 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








Autoencoder:
K-Means:
CPU times: user 57.9 ms, sys: 119 µs, total: 58 ms
Wall time: 57.8 ms
LDA:
CPU times: user 1.07 ms, sys: 11.6 ms, total: 12.6 ms
Wall time: 5.76 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








Autoencoder:
K-Means:
CPU times: user 55.4 ms, sys: 3.96 ms, total: 59.4 ms
Wall time: 59.3 ms
LDA:
CPU times: user 7.78 ms, sys: 237 µs, total: 8.02 ms
Wall time: 3.89 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








Autoencoder:
K-Means:
CPU times: user 58.6 ms, sys: 0 ns, total: 58.6 ms
Wall time: 58.6 ms
LDA:
CPU times: user 5.66 ms, sys: 7.58 ms, total: 13.2 ms
Wall time: 5.57 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








Autoencoder:
K-Means:
CPU times: user 63.4 ms, sys: 218 µs, total: 63.6 ms
Wall time: 63.6 ms
LDA:
CPU times: user 2.6 ms, sys: 6.73 ms, total: 9.33 ms
Wall time: 4.42 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








Autoencoder:
K-Means:
CPU times: user 62.9 ms, sys: 0 ns, total: 62.9 ms
Wall time: 62.9 ms
LDA:
CPU times: user 2.95 ms, sys: 11.9 ms, total: 14.9 ms
Wall time: 6.11 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








Autoencoder:
K-Means:
CPU times: user 71.8 ms, sys: 0 ns, total: 71.8 ms
Wall time: 71.8 ms
LDA:
CPU times: user 7.59 ms, sys: 3.8 ms, total: 11.4 ms
Wall time: 4.99 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








Autoencoder:
K-Means:
CPU times: user 67.6 ms, sys: 0 ns, total: 67.6 ms
Wall time: 67.5 ms
LDA:
CPU times: user 10.2 ms, sys: 3.88 ms, total: 14.1 ms
Wall time: 5.92 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








Autoencoder:
K-Means:
CPU times: user 67.7 ms, sys: 0 ns, total: 67.7 ms
Wall time: 67.6 ms
LDA:
CPU times: user 6.46 ms, sys: 7.89 ms, total: 14.4 ms
Wall time: 5.89 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








Autoencoder:
K-Means:
CPU times: user 68.8 ms, sys: 62 µs, total: 68.9 ms
Wall time: 68.9 ms
LDA:
CPU times: user 4.78 ms, sys: 19.5 ms, total: 24.3 ms
Wall time: 9.5 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








Autoencoder:
K-Means:
CPU times: user 78.1 ms, sys: 0 ns, total: 78.1 ms
Wall time: 78.1 ms
LDA:
CPU times: user 656 µs, sys: 7.73 ms, total: 8.39 ms
Wall time: 3.97 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








Autoencoder:
K-Means:
CPU times: user 72.1 ms, sys: 7 µs, total: 72.1 ms
Wall time: 72 ms
LDA:
CPU times: user 9.03 ms, sys: 8.06 ms, total: 17.1 ms
Wall time: 6.86 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








Autoencoder:
K-Means:
CPU times: user 73.7 ms, sys: 117 µs, total: 73.8 ms
Wall time: 73.7 ms
LDA:
CPU times: user 11.3 ms, sys: 8.22 ms, total: 19.5 ms
Wall time: 12 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








Autoencoder:
K-Means:
CPU times: user 73.3 ms, sys: 93 µs, total: 73.4 ms
Wall time: 73.1 ms
LDA:
CPU times: user 5.55 ms, sys: 104 µs, total: 5.65 ms
Wall time: 3.59 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








Autoencoder:
K-Means:
CPU times: user 81.6 ms, sys: 0 ns, total: 81.6 ms
Wall time: 81.7 ms
LDA:
CPU times: user 4.64 ms, sys: 3.7 ms, total: 8.35 ms
Wall time: 4.02 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








Autoencoder:
K-Means:
CPU times: user 76.2 ms, sys: 16 µs, total: 76.2 ms
Wall time: 75.9 ms
LDA:
CPU times: user 5.88 ms, sys: 15.7 ms, total: 21.5 ms
Wall time: 8.5 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








Autoencoder:
Agglomerative Clustering:
CPU times: user 4.1 ms, sys: 8 µs, total: 4.11 ms
Wall time: 4.04 ms
LDA:
CPU times: user 6.76 ms, sys: 3.74 ms, total: 10.5 ms
Wall time: 4.72 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)
Autoencoder:
Agglomerative Clustering:
CPU times: user 3.29 ms, sys: 0 ns, total: 3.29 ms
Wall time: 2.97 ms
LDA:
CPU times: user 4.27 ms, sys: 7.73 ms, total: 12 ms
Wall time: 5.13 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)
Autoencoder:
Agglomerative Clustering:
CPU times: user 3.54 ms, sys: 0 ns, total: 3.54 ms
Wall time: 3.23 ms
LDA:
CPU times: user 3.82 ms, sys: 11.7 ms, total:

Autoencoder:
Agglomerative Clustering:
CPU times: user 3.49 ms, sys: 0 ns, total: 3.49 ms
Wall time: 3.11 ms
LDA:
CPU times: user 2.16 ms, sys: 7.33 ms, total: 9.49 ms
Wall time: 4.31 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)
Autoencoder:
Agglomerative Clustering:
CPU times: user 0 ns, sys: 2.79 ms, total: 2.79 ms
Wall time: 2.77 ms
LDA:
CPU times: user 7.56 ms, sys: 273 µs, total: 7.83 ms
Wall time: 3.79 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)
Autoencoder:
Agglomerative Clustering:
CPU times: user 3.09 ms, sys: 0 ns, total: 3.09 ms
Wall time: 2.81 ms
LDA:
CPU times: user 8 ms, sys: 0 ns, total: 8 m







LSA:
CPU times: user 15.9 s, sys: 8.21 s, total: 24.1 s
Wall time: 8.71 s
K-Means:
CPU times: user 21.9 ms, sys: 160 µs, total: 22 ms
Wall time: 22 ms
LDA:
CPU times: user 2.4 ms, sys: 4.21 ms, total: 6.61 ms
Wall time: 3.11 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.1 s, sys: 8.09 s, total: 24.1 s
Wall time: 8.61 s
K-Means:
CPU times: user 23.1 ms, sys: 0 ns, total: 23.1 ms
Wall time: 22.4 ms
LDA:
CPU times: user 6.92 ms, sys: 269 µs, total: 7.19 ms
Wall time: 3.65 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 15.9 s, sys: 7.77 s, total: 23.7 s
Wall time: 8.54 s
K-Means:
CPU times: user 24.6 ms, sys: 61 µs, total: 24.7 ms
Wall time: 23.9 ms
LDA:
CPU times: user 6.88 ms, sys: 259 µs, total: 7.13 ms
Wall time: 3.75 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 15.8 s, sys: 7.7 s, total: 23.5 s
Wall time: 8.47 s
K-Means:
CPU times: user 24.8 ms, sys: 0 ns, total: 24.8 ms
Wall time: 24.5 ms
LDA:
CPU times: user 8.78 ms, sys: 236 µs, total: 9.02 ms
Wall time: 4.93 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16 s, sys: 7.5 s, total: 23.5 s
Wall time: 8.49 s
K-Means:
CPU times: user 24.1 ms, sys: 5 µs, total: 24.2 ms
Wall time: 24.1 ms
LDA:
CPU times: user 3.29 ms, sys: 3.15 ms, total: 6.44 ms
Wall time: 3.26 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16 s, sys: 7.81 s, total: 23.8 s
Wall time: 8.49 s
K-Means:
CPU times: user 26.6 ms, sys: 0 ns, total: 26.6 ms
Wall time: 26.3 ms
LDA:
CPU times: user 6.81 ms, sys: 230 µs, total: 7.04 ms
Wall time: 3.71 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 15.9 s, sys: 8.27 s, total: 24.2 s
Wall time: 8.69 s
K-Means:
CPU times: user 26.8 ms, sys: 0 ns, total: 26.8 ms
Wall time: 26.7 ms
LDA:
CPU times: user 9.38 ms, sys: 529 µs, total: 9.91 ms
Wall time: 4.49 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16 s, sys: 7.58 s, total: 23.6 s
Wall time: 8.41 s
K-Means:
CPU times: user 28.9 ms, sys: 84 µs, total: 28.9 ms
Wall time: 28.6 ms
LDA:
CPU times: user 0 ns, sys: 7.56 ms, total: 7.56 ms
Wall time: 3.78 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.1 s, sys: 7.8 s, total: 23.9 s
Wall time: 8.6 s
K-Means:
CPU times: user 26.9 ms, sys: 3.89 ms, total: 30.8 ms
Wall time: 30.6 ms
LDA:
CPU times: user 6.6 ms, sys: 262 µs, total: 6.86 ms
Wall time: 3.51 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 15.8 s, sys: 8.23 s, total: 24.1 s
Wall time: 8.69 s
K-Means:
CPU times: user 31.5 ms, sys: 0 ns, total: 31.5 ms
Wall time: 31.3 ms
LDA:
CPU times: user 6.6 ms, sys: 267 µs, total: 6.86 ms
Wall time: 3.54 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.1 s, sys: 7.46 s, total: 23.6 s
Wall time: 8.47 s
K-Means:
CPU times: user 30.3 ms, sys: 245 µs, total: 30.6 ms
Wall time: 30.4 ms
LDA:
CPU times: user 202 µs, sys: 6.9 ms, total: 7.1 ms
Wall time: 3.59 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.1 s, sys: 7.85 s, total: 24 s
Wall time: 8.63 s
K-Means:
CPU times: user 32.6 ms, sys: 67 µs, total: 32.6 ms
Wall time: 32.4 ms
LDA:
CPU times: user 3.99 ms, sys: 3.12 ms, total: 7.11 ms
Wall time: 3.45 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 15.7 s, sys: 7.87 s, total: 23.6 s
Wall time: 8.51 s
K-Means:
CPU times: user 34.4 ms, sys: 268 µs, total: 34.6 ms
Wall time: 34.6 ms
LDA:
CPU times: user 3.61 ms, sys: 2.75 ms, total: 6.35 ms
Wall time: 3.27 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.1 s, sys: 8.21 s, total: 24.3 s
Wall time: 8.74 s
K-Means:
CPU times: user 33.6 ms, sys: 0 ns, total: 33.6 ms
Wall time: 33.4 ms
LDA:
CPU times: user 6.62 ms, sys: 205 µs, total: 6.82 ms
Wall time: 3.51 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.1 s, sys: 7.82 s, total: 23.9 s
Wall time: 8.56 s
K-Means:
CPU times: user 35.6 ms, sys: 0 ns, total: 35.6 ms
Wall time: 35.5 ms
LDA:
CPU times: user 0 ns, sys: 6.95 ms, total: 6.95 ms
Wall time: 3.52 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.1 s, sys: 7.73 s, total: 23.8 s
Wall time: 8.57 s
K-Means:
CPU times: user 37.5 ms, sys: 0 ns, total: 37.5 ms
Wall time: 37.5 ms
LDA:
CPU times: user 6.79 ms, sys: 200 µs, total: 6.99 ms
Wall time: 3.52 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 15.7 s, sys: 7.92 s, total: 23.6 s
Wall time: 8.52 s
K-Means:
CPU times: user 38.5 ms, sys: 0 ns, total: 38.5 ms
Wall time: 38.4 ms
LDA:
CPU times: user 10.4 ms, sys: 8.28 ms, total: 18.7 ms
Wall time: 7.36 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.5 s, sys: 7.2 s, total: 23.7 s
Wall time: 8.49 s
K-Means:
CPU times: user 37.4 ms, sys: 146 µs, total: 37.5 ms
Wall time: 37.3 ms
LDA:
CPU times: user 6.72 ms, sys: 332 µs, total: 7.05 ms
Wall time: 3.56 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16 s, sys: 8.09 s, total: 24.1 s
Wall time: 8.64 s
K-Means:
CPU times: user 38.4 ms, sys: 0 ns, total: 38.4 ms
Wall time: 38.2 ms
LDA:
CPU times: user 7.21 ms, sys: 291 µs, total: 7.5 ms
Wall time: 3.83 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 15.9 s, sys: 8.15 s, total: 24 s
Wall time: 8.67 s
K-Means:
CPU times: user 38.3 ms, sys: 0 ns, total: 38.3 ms
Wall time: 38.2 ms
LDA:
CPU times: user 7.39 ms, sys: 418 µs, total: 7.81 ms
Wall time: 4.03 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 15.9 s, sys: 7.75 s, total: 23.6 s
Wall time: 8.4 s
K-Means:
CPU times: user 43.5 ms, sys: 0 ns, total: 43.5 ms
Wall time: 43.4 ms
LDA:
CPU times: user 7.36 ms, sys: 393 µs, total: 7.75 ms
Wall time: 3.95 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.1 s, sys: 7.94 s, total: 24 s
Wall time: 8.62 s
K-Means:
CPU times: user 41.5 ms, sys: 174 µs, total: 41.6 ms
Wall time: 41.5 ms
LDA:
CPU times: user 6.58 ms, sys: 497 µs, total: 7.08 ms
Wall time: 3.57 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16 s, sys: 8 s, total: 24 s
Wall time: 8.64 s
K-Means:
CPU times: user 41.9 ms, sys: 0 ns, total: 41.9 ms
Wall time: 41.9 ms
LDA:
CPU times: user 3.89 ms, sys: 4.39 ms, total: 8.29 ms
Wall time: 4.1 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 15.8 s, sys: 7.85 s, total: 23.6 s
Wall time: 8.52 s
K-Means:
CPU times: user 45.5 ms, sys: 189 µs, total: 45.7 ms
Wall time: 45.7 ms
LDA:
CPU times: user 345 µs, sys: 6.65 ms, total: 7 ms
Wall time: 3.46 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.3 s, sys: 7.47 s, total: 23.8 s
Wall time: 8.55 s
K-Means:
CPU times: user 44.9 ms, sys: 0 ns, total: 44.9 ms
Wall time: 44.9 ms
LDA:
CPU times: user 2.79 ms, sys: 4.12 ms, total: 6.91 ms
Wall time: 3.45 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.1 s, sys: 8.01 s, total: 24.1 s
Wall time: 8.67 s
K-Means:
CPU times: user 45.8 ms, sys: 0 ns, total: 45.8 ms
Wall time: 45.7 ms
LDA:
CPU times: user 3.28 ms, sys: 3.78 ms, total: 7.06 ms
Wall time: 3.99 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 15.8 s, sys: 8.06 s, total: 23.8 s
Wall time: 8.52 s
K-Means:
CPU times: user 47.9 ms, sys: 0 ns, total: 47.9 ms
Wall time: 47.7 ms
LDA:
CPU times: user 7.58 ms, sys: 451 µs, total: 8.03 ms
Wall time: 4.08 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 15.8 s, sys: 7.98 s, total: 23.8 s
Wall time: 8.43 s
K-Means:
CPU times: user 48.1 ms, sys: 0 ns, total: 48.1 ms
Wall time: 47.7 ms
LDA:
CPU times: user 6.95 ms, sys: 372 µs, total: 7.33 ms
Wall time: 3.6 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16 s, sys: 7.92 s, total: 23.9 s
Wall time: 8.58 s
K-Means:
CPU times: user 47.7 ms, sys: 3 µs, total: 47.7 ms
Wall time: 47.6 ms
LDA:
CPU times: user 8.96 ms, sys: 150 µs, total: 9.11 ms
Wall time: 4.3 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 14.8 s, sys: 7.71 s, total: 22.5 s
Wall time: 11.5 s
K-Means:
CPU times: user 100 ms, sys: 58 µs, total: 100 ms
Wall time: 99.7 ms
LDA:
CPU times: user 14.1 ms, sys: 4.26 ms, total: 18.4 ms
Wall time: 12 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 15.1 s, sys: 8.4 s, total: 23.5 s
Wall time: 9.14 s
K-Means:
CPU times: user 76.7 ms, sys: 97 µs, total: 76.8 ms
Wall time: 80.2 ms
LDA:
CPU times: user 7.51 ms, sys: 537 µs, total: 8.04 ms
Wall time: 5.09 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.3 s, sys: 7.74 s, total: 24.1 s
Wall time: 8.52 s
K-Means:
CPU times: user 51.5 ms, sys: 0 ns, total: 51.5 ms
Wall time: 51.4 ms
LDA:
CPU times: user 7.59 ms, sys: 480 µs, total: 8.07 ms
Wall time: 3.84 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 15.8 s, sys: 8 s, total: 23.8 s
Wall time: 8.44 s
K-Means:
CPU times: user 53.4 ms, sys: 210 µs, total: 53.6 ms
Wall time: 53.5 ms
LDA:
CPU times: user 7.68 ms, sys: 498 µs, total: 8.18 ms
Wall time: 4.08 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 15.7 s, sys: 7.88 s, total: 23.6 s
Wall time: 8.57 s
K-Means:
CPU times: user 59.9 ms, sys: 28 µs, total: 60 ms
Wall time: 59.7 ms
LDA:
CPU times: user 5.53 ms, sys: 2.72 ms, total: 8.25 ms
Wall time: 4.17 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.5 s, sys: 7.83 s, total: 24.3 s
Wall time: 8.9 s
K-Means:
CPU times: user 57 ms, sys: 0 ns, total: 57 ms
Wall time: 56.9 ms
LDA:
CPU times: user 0 ns, sys: 7.92 ms, total: 7.92 ms
Wall time: 3.89 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.1 s, sys: 8.12 s, total: 24.2 s
Wall time: 8.84 s
Agglomerative Clustering:
CPU times: user 3.5 ms, sys: 0 ns, total: 3.5 ms
Wall time: 4.8 ms
LDA:
CPU times: user 6.33 ms, sys: 391 µs, total: 6.72 ms
Wall time: 3.5 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.1 s, sys: 8.37 s, total: 24.5 s
Wall time: 8.88 s
Agglomerative Clustering:
CPU times: user 3.05 ms, sys: 0 ns, total: 3.05 ms
Wall time: 2.72 ms
LDA:
CPU times: user 7.4 ms, sys: 427 µs, total: 7.83 ms
Wall time: 3.63 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.2 s, sys: 7.88 s, total: 24.1 s
Wall time: 8.44 s
Agglomerative Clustering:
CPU times: user 3.41 ms, sys: 0 ns, total: 3.41 ms
Wall time: 3.06 ms
LDA:
CPU times: user 6.72 ms, sys: 379 µs, total: 7.1 ms
Wall time: 3.57 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.1 s, sys: 7.88 s, total: 24 s
Wall time: 8.59 s
Agglomerative Clustering:
CPU times: user 3.32 ms, sys: 0 ns, total: 3.32 ms
Wall time: 2.98 ms
LDA:
CPU times: user 6.58 ms, sys: 360 µs, total: 6.94 ms
Wall time: 3.36 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.1 s, sys: 7.65 s, total: 23.8 s
Wall time: 8.72 s
Agglomerative Clustering:
CPU times: user 3.78 ms, sys: 17 µs, total: 3.8 ms
Wall time: 3.3 ms
LDA:
CPU times: user 6.67 ms, sys: 396 µs, total: 7.07 ms
Wall time: 3.59 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16 s, sys: 7.5 s, total: 23.5 s
Wall time: 8.46 s
Agglomerative Clustering:
CPU times: user 2.91 ms, sys: 447 µs, total: 3.35 ms
Wall time: 3 ms
LDA:
CPU times: user 0 ns, sys: 7.47 ms, total: 7.47 ms
Wall time: 3.72 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.1 s, sys: 8.09 s, total: 24.2 s
Wall time: 8.68 s
Agglomerative Clustering:
CPU times: user 3.64 ms, sys: 0 ns, total: 3.64 ms
Wall time: 3.37 ms
LDA:
CPU times: user 7.11 ms, sys: 0 ns, total: 7.11 ms
Wall time: 3.66 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.3 s, sys: 8.36 s, total: 24.7 s
Wall time: 8.85 s
Agglomerative Clustering:
CPU times: user 2.83 ms, sys: 0 ns, total: 2.83 ms
Wall time: 2.84 ms
LDA:
CPU times: user 6.4 ms, sys: 345 µs, total: 6.74 ms
Wall time: 3.39 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 15.9 s, sys: 7.29 s, total: 23.2 s
Wall time: 8.33 s
Agglomerative Clustering:
CPU times: user 3.48 ms, sys: 0 ns, total: 3.48 ms
Wall time: 3.15 ms
LDA:
CPU times: user 6.73 ms, sys: 400 µs, total: 7.13 ms
Wall time: 3.48 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16 s, sys: 7.58 s, total: 23.6 s
Wall time: 8.4 s
Agglomerative Clustering:
CPU times: user 2.95 ms, sys: 0 ns, total: 2.95 ms
Wall time: 2.94 ms
LDA:
CPU times: user 368 µs, sys: 6.57 ms, total: 6.94 ms
Wall time: 3.46 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16 s, sys: 8.19 s, total: 24.2 s
Wall time: 8.74 s
Agglomerative Clustering:
CPU times: user 2.67 ms, sys: 429 µs, total: 3.1 ms
Wall time: 2.76 ms
LDA:
CPU times: user 6.52 ms, sys: 313 µs, total: 6.83 ms
Wall time: 3.49 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.2 s, sys: 7.86 s, total: 24 s
Wall time: 8.72 s
Agglomerative Clustering:
CPU times: user 2.7 ms, sys: 436 µs, total: 3.13 ms
Wall time: 3.08 ms
LDA:
CPU times: user 6.27 ms, sys: 304 µs, total: 6.58 ms
Wall time: 3.32 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 15.9 s, sys: 7.75 s, total: 23.7 s
Wall time: 8.42 s
Agglomerative Clustering:
CPU times: user 3.3 ms, sys: 0 ns, total: 3.3 ms
Wall time: 3.09 ms
LDA:
CPU times: user 6.73 ms, sys: 412 µs, total: 7.14 ms
Wall time: 3.58 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16 s, sys: 7.97 s, total: 24 s
Wall time: 9.3 s
Agglomerative Clustering:
CPU times: user 3.37 ms, sys: 0 ns, total: 3.37 ms
Wall time: 3.04 ms
LDA:
CPU times: user 6.89 ms, sys: 528 µs, total: 7.42 ms
Wall time: 4.01 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 15.9 s, sys: 7.6 s, total: 23.5 s
Wall time: 9.06 s
Agglomerative Clustering:
CPU times: user 4.51 ms, sys: 53 µs, total: 4.57 ms
Wall time: 4.53 ms
LDA:
CPU times: user 15.2 ms, sys: 7.23 ms, total: 22.4 ms
Wall time: 9.5 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 15.7 s, sys: 8.26 s, total: 24 s
Wall time: 9.24 s
Agglomerative Clustering:
CPU times: user 3.22 ms, sys: 0 ns, total: 3.22 ms
Wall time: 3.14 ms
LDA:
CPU times: user 3.28 ms, sys: 3.9 ms, total: 7.19 ms
Wall time: 3.67 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 14.9 s, sys: 8.16 s, total: 23.1 s
Wall time: 10.2 s
Agglomerative Clustering:
CPU times: user 8 µs, sys: 3.97 ms, total: 3.97 ms
Wall time: 3.89 ms
LDA:
CPU times: user 6.72 ms, sys: 15.8 ms, total: 22.5 ms
Wall time: 11.7 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 15.6 s, sys: 8.76 s, total: 24.4 s
Wall time: 10.8 s
Agglomerative Clustering:
CPU times: user 3.27 ms, sys: 0 ns, total: 3.27 ms
Wall time: 3.21 ms
LDA:
CPU times: user 8.94 ms, sys: 107 µs, total: 9.05 ms
Wall time: 4.97 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 14.3 s, sys: 6.88 s, total: 21.2 s
Wall time: 10.7 s
Agglomerative Clustering:
CPU times: user 2.95 ms, sys: 502 µs, total: 3.45 ms
Wall time: 3.14 ms
LDA:
CPU times: user 7.29 ms, sys: 370 µs, total: 7.66 ms
Wall time: 3.81 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 15.8 s, sys: 7.82 s, total: 23.6 s
Wall time: 9.51 s
Agglomerative Clustering:
CPU times: user 3.29 ms, sys: 0 ns, total: 3.29 ms
Wall time: 3.41 ms
LDA:
CPU times: user 5.65 ms, sys: 23.2 ms, total: 28.9 ms
Wall time: 16.3 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16 s, sys: 8.49 s, total: 24.5 s
Wall time: 9.04 s
Agglomerative Clustering:
CPU times: user 3.61 ms, sys: 0 ns, total: 3.61 ms
Wall time: 3.3 ms
LDA:
CPU times: user 8.68 ms, sys: 3.38 ms, total: 12.1 ms
Wall time: 5.33 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.2 s, sys: 8.26 s, total: 24.5 s
Wall time: 9.02 s
Agglomerative Clustering:
CPU times: user 3.29 ms, sys: 0 ns, total: 3.29 ms
Wall time: 3.24 ms
LDA:
CPU times: user 6.08 ms, sys: 420 µs, total: 6.5 ms
Wall time: 3.83 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.2 s, sys: 8.14 s, total: 24.4 s
Wall time: 9.41 s
Agglomerative Clustering:
CPU times: user 2.58 ms, sys: 451 µs, total: 3.03 ms
Wall time: 3 ms
LDA:
CPU times: user 6.43 ms, sys: 440 µs, total: 6.87 ms
Wall time: 3.49 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.1 s, sys: 7.5 s, total: 23.6 s
Wall time: 8.54 s
Agglomerative Clustering:
CPU times: user 3.33 ms, sys: 0 ns, total: 3.33 ms
Wall time: 3.01 ms
LDA:
CPU times: user 7.18 ms, sys: 527 µs, total: 7.71 ms
Wall time: 5.17 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.3 s, sys: 7.53 s, total: 23.9 s
Wall time: 8.81 s
Agglomerative Clustering:
CPU times: user 2.84 ms, sys: 504 µs, total: 3.35 ms
Wall time: 2.99 ms
LDA:
CPU times: user 7.57 ms, sys: 400 µs, total: 7.97 ms
Wall time: 4.03 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.1 s, sys: 7.72 s, total: 23.8 s
Wall time: 9.16 s
Agglomerative Clustering:
CPU times: user 3.36 ms, sys: 0 ns, total: 3.36 ms
Wall time: 3.33 ms
LDA:
CPU times: user 6.58 ms, sys: 4.4 ms, total: 11 ms
Wall time: 6.34 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.4 s, sys: 7.62 s, total: 24 s
Wall time: 8.75 s
Agglomerative Clustering:
CPU times: user 3.17 ms, sys: 569 µs, total: 3.74 ms
Wall time: 3.44 ms
LDA:
CPU times: user 0 ns, sys: 8.02 ms, total: 8.02 ms
Wall time: 4.08 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16 s, sys: 7.64 s, total: 23.7 s
Wall time: 8.7 s
Agglomerative Clustering:
CPU times: user 0 ns, sys: 3.15 ms, total: 3.15 ms
Wall time: 3.1 ms
LDA:
CPU times: user 3.62 ms, sys: 3.86 ms, total: 7.48 ms
Wall time: 3.73 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 15.9 s, sys: 8.61 s, total: 24.5 s
Wall time: 9.61 s
Agglomerative Clustering:
CPU times: user 3.52 ms, sys: 0 ns, total: 3.52 ms
Wall time: 3.18 ms
LDA:
CPU times: user 7.49 ms, sys: 677 µs, total: 8.17 ms
Wall time: 3.86 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.1 s, sys: 7.61 s, total: 23.7 s
Wall time: 8.76 s
Agglomerative Clustering:
CPU times: user 3.15 ms, sys: 0 ns, total: 3.15 ms
Wall time: 3.12 ms
LDA:
CPU times: user 8.57 ms, sys: 896 µs, total: 9.46 ms
Wall time: 4.76 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.2 s, sys: 8 s, total: 24.2 s
Wall time: 9.07 s
Agglomerative Clustering:
CPU times: user 2.99 ms, sys: 0 ns, total: 2.99 ms
Wall time: 2.96 ms
LDA:
CPU times: user 6.57 ms, sys: 533 µs, total: 7.1 ms
Wall time: 3.45 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 15.8 s, sys: 8.48 s, total: 24.3 s
Wall time: 9.35 s
Agglomerative Clustering:
CPU times: user 2.86 ms, sys: 529 µs, total: 3.39 ms
Wall time: 3.03 ms
LDA:
CPU times: user 7.75 ms, sys: 658 µs, total: 8.41 ms
Wall time: 4.34 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.1 s, sys: 7.84 s, total: 24 s
Wall time: 8.87 s
Agglomerative Clustering:
CPU times: user 3.02 ms, sys: 561 µs, total: 3.58 ms
Wall time: 3.24 ms
LDA:
CPU times: user 8.24 ms, sys: 0 ns, total: 8.24 ms
Wall time: 4.13 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 15 s, sys: 8.25 s, total: 23.2 s
Wall time: 9.22 s
Agglomerative Clustering:
CPU times: user 3.28 ms, sys: 0 ns, total: 3.28 ms
Wall time: 3.03 ms
LDA:
CPU times: user 1.5 ms, sys: 6.63 ms, total: 8.13 ms
Wall time: 3.94 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 15.5 s, sys: 8.45 s, total: 24 s
Wall time: 9.45 s
Agglomerative Clustering:
CPU times: user 3.31 ms, sys: 0 ns, total: 3.31 ms
Wall time: 3.4 ms
LDA:
CPU times: user 1.02 ms, sys: 6.62 ms, total: 7.64 ms
Wall time: 3.85 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)








LSA:
CPU times: user 16.1 s, sys: 7.97 s, total: 24.1 s
Wall time: 9.03 s
Agglomerative Clustering:
CPU times: user 0 ns, sys: 3.33 ms, total: 3.33 ms
Wall time: 3.18 ms
LDA:
CPU times: user 7.73 ms, sys: 769 µs, total: 8.5 ms
Wall time: 3.94 ms
(81, 81)
Coherence_Measure(seg=<function s_one_set at 0x7f00bba64400>, prob=<function p_boolean_sliding_window at 0x7f00bba64620>, conf=<function cosine_similarity at 0x7f00bb813d90>, aggr=<function arithmetic_mean at 0x7f00bb12f378>)






