In [4]:
import json
import pprint
from pathlib import Path
import sys
sys.path.insert(1, '../src/utils')
import ipdb

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from gensim.utils import simple_preprocess
from gensim import corpora, models
from gensim.models import CoherenceModel
from sklearn.manifold import TSNE
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

from data import preprocessing

  and should_run_async(code)


# Variables


In [5]:
# Number of topics requested from the LSI and LDA models trained below;
# also embedded in the file names of the saved figures.
num_topics = 50
# When True, each paper's title is prepended to its abstract in the "load" cell.
use_title = True
# Forwarded unchanged to the project's `preprocessing` helper
# (their exact effect is defined there, not in this notebook).
stemming = True
lemmatization = True
# Name of the NLP backend, forwarded to `preprocessing` as `lib`.
lib = "spacy"

# JSON file with the paper metadata; read in the "load" cell.
input_path = Path("../src/data/data_jmlr_vol13-21.json")

  and should_run_async(code)


# Note
We previously also supported ```gensim``` for preprocessing, but then realized that even its official documentation makes a point of saying this is not an intended use case. After running into odd problems with its lemmatization routines (and with ```pattern```), we dropped that option.

# Data loading and preparation

## load

In [6]:
# Parse the JSON dump and flatten its "papers" records into a DataFrame.
data = json.loads(input_path.read_text(encoding="utf-8"))
data_df = pd.json_normalize(data['papers'])

# The raw text per paper is the abstract, optionally prefixed with the title.
corpus = (
    data_df["title"] + " " + data_df["abstract"]
    if use_title
    else data_df["abstract"]
)


  and should_run_async(code)


## preprocess

In [None]:
# Run the project-local `preprocessing` helper (imported from `data`) over the raw text.
# NOTE: this rebinds `corpus` to the helper's return value, which later cells index
# with corpus["token"]. Re-running this cell without first re-running the "load"
# cell therefore feeds the helper its own output.
# min_word_len / max_word_len are presumably token-length bounds — confirm in the helper.
corpus = preprocessing(
    corpus,
    lib=lib,
    stemming=stemming,
    lemmatization=lemmatization,
    min_word_len=2,
    max_word_len=15
)

  and should_run_async(code)


Dropping 0 entries of corpus, due to nan ...


1it [00:00,  8.62it/s]

Starting tokenization ...


880it [01:05, 14.37it/s]

In [None]:
# Token lists produced by the preprocessing step, one entry per document.
token_lists = corpus["token"]

# Vocabulary over all documents, then each document as a bag-of-words
# list of (token_id, token_count) pairs.
dictionary = corpora.Dictionary(token_lists)
BoW_corpus = [dictionary.doc2bow(tokens) for tokens in token_lists]

In [None]:
# Fit a TF-IDF model on the bag-of-words corpus ...
tfidf = models.TfidfModel(BoW_corpus)
# ... and wrap the corpus so that iterating it yields TF-IDF weighted documents.
corpus_tfidf = tfidf[BoW_corpus]

# LSI

## TFIDF

In [None]:
# Train an LSI model with `num_topics` topics on the TF-IDF weighted corpus.
lsi_tfidf = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
# Apply the model to the second document (index 1); as the last expression of the
# cell, the resulting topic weights are displayed.
lsi_tfidf[corpus_tfidf[1]]

## Bag of Words

In [None]:
# Train an LSI model with `num_topics` topics on the raw bag-of-words counts.
lsi_bow = models.LsiModel(BoW_corpus, id2word=dictionary, num_topics=num_topics)
# Apply the model to the second document (index 1); as the last expression of the
# cell, the resulting topic weights are displayed.
lsi_bow[BoW_corpus[1]]

# LDA

## init

In [None]:
# LDA training on the TF-IDF weighted corpus. All hyper-parameters are gathered
# in one mapping so they can be inspected or reused after the model is built.
# per_word_topics=True is what plot_2d_space relies on when it unpacks the
# per-document output of this model.
lda_kwargs = dict(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=100,
    update_every=1,
    chunksize=10,
    passes=10,
    alpha='symmetric',
    iterations=100,
    per_word_topics=True,
)
lda_model = models.ldamodel.LdaModel(**lda_kwargs)

# Inspect

In [None]:
# Human-readable summary of the preprocessing settings; appended to figure titles
# by plot_histogram and plot_2d_space.
preprocessing_str = f"(Preprocessing with {lib}, Stemming = {stemming}, Lemmatization = {lemmatization})"

In [None]:
# Number of papers whose "abstract" entry is truthy (e.g. a non-empty string).
abstract_count = sum(1 for abstract in data_df["abstract"] if abstract)

In [None]:
# Report how many papers there are and how many of them carry an abstract.
print(f"There are {len(data_df['abstract'])} Papers. {abstract_count} of them have Abstracts.")

## Keywords

In [None]:
# Preview only the first few keyword entries; printing one line per paper
# floods the notebook output without adding information.
for keyword in data_df["keywords"].head(10):
    print(keyword)

In [None]:
# Summarise keyword coverage: how many papers have keywords, how many keywords
# there are in total, and how many of those are distinct.
print(f"The Dataset contains {len(data_df)} Papers")
count_keywords = 0
all_keywords = []
for keyword in data_df["keywords"]:
    # Skip papers whose keyword entry is empty or whose first keyword is empty.
    if keyword and keyword[0]:
        count_keywords += 1
        # Extend in place: `all_keywords + keyword` copied the whole accumulated
        # list on every iteration (quadratic in the number of keywords).
        all_keywords.extend(keyword)
print(f"{count_keywords} of them contain Keywords.")
print(f"There are {len(all_keywords)} Keywords. {len(set(all_keywords))} of them are unique.")


## Corpus

In [None]:
# token -> token id. Show only the first 20 entries rather than the whole vocabulary.
pprint.pprint(dict(list(dictionary.token2id.items())[:20]))

In [None]:
# token_id -> number of documents containing this token.
# Show only the first 20 entries rather than the whole vocabulary.
pprint.pprint(dict(list(dictionary.dfs.items())[:20]))

In [None]:
# Each document is a list of (token_id, token_count) pairs.
# Show the first 5 documents only instead of dumping the whole corpus.
pprint.pprint(BoW_corpus[:5])

In [None]:
# Print the TF-IDF representation of the first 5 documents only.
# zip(range(5), ...) stops after five items without transforming the whole corpus.
for _, doc in zip(range(5), corpus_tfidf):
    print(doc)

In [None]:
def get_top_words(corpus, label, method, id2word=None, top_n=10):
    """Aggregate per-document term scores and return the highest-scoring terms.

    Parameters
    ----------
    corpus : sized iterable of documents
        Each document is an iterable of ``(word_id, score)`` pairs.
    label : str
        Name of the score column in the returned frame.
    method : str
        ``"TF-IDF"`` averages every score over ``len(corpus)``; any other
        value sums the scores as they are.
    id2word : mapping-like, optional
        Object translating ``word_id`` to the term via ``id2word[word_id]``.
        Defaults to the notebook-level ``dictionary``.
    top_n : int, optional
        Number of rows to return (default 10).

    Returns
    -------
    pandas.DataFrame
        Columns ``term`` and ``label``, sorted by ``label`` in descending
        order and truncated to ``top_n`` rows.
    """
    if id2word is None:
        id2word = dictionary

    average = method == "TF-IDF"
    # len() is only needed (and only requested) for the averaging case.
    n_docs = len(corpus) if average else 1

    totals = {}
    for doc in corpus:
        for word_id, score in doc:
            # Index the mapping itself rather than gensim's `Dictionary.id2token`:
            # that attribute is filled lazily and can still be empty at this point.
            word = id2word[word_id]
            contribution = score / n_docs if average else score
            totals[word] = totals.get(word, 0) + contribution

    df = pd.DataFrame(list(totals.items()), columns=['term', label])
    return df.sort_values(by=label, ascending=False).head(top_n)

In [None]:
def plot_histogram(top_words, method, legend):
    """Draw a bar chart of the top terms and save it as a PNG under ``imgs/``.

    Parameters
    ----------
    top_words : pandas.DataFrame
        Frame with a ``term`` column and a score column named ``legend``.
    method : str
        Label used for the y-axis, the title and the output file name.
    legend : str
        Name of the score column in ``top_words`` to plot.

    The title and file name also use the notebook-level ``preprocessing_str``,
    ``lib``, ``stemming``, ``lemmatization`` and ``num_topics``.
    """
    words_plot = top_words.plot.bar(x='term', y=legend, rot=0, fontsize=16, figsize=(30, 10))
    words_plot.set_xlabel("Term", fontsize=20)
    words_plot.set_ylabel(method, fontsize=20)
    words_plot.set_title(f"Top words with {method} {preprocessing_str}", fontsize=30)
    # Plain numbers on the y-axis instead of scientific notation.
    words_plot.yaxis.get_major_formatter().set_scientific(False)

    out_path = Path(f'imgs/top_words_{method}_preprocessing_{lib}_stemming_{stemming}_lemmatization_{lemmatization}_num_topics_{num_topics}.png')
    # savefig does not create missing directories; make sure imgs/ exists first.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig = words_plot.get_figure()
    fig.savefig(out_path)

In [None]:
# Top terms by TF-IDF weight averaged over the documents, drawn as a bar chart.
plot_histogram(get_top_words(corpus_tfidf, "weight", "TF-IDF"), "TF-IDF", "weight")


In [None]:
# Top terms by summed bag-of-words count. The label is spelled "BoW", as everywhere
# else in the notebook; it was "Bow", which leaked into the title and file name.
plot_histogram(get_top_words(BoW_corpus, "count", "BoW"), "BoW", "count")

## Topics

In [None]:
# Topics of the LSI model trained on the TF-IDF corpus.
lsi_tfidf.print_topics()

In [None]:
# Topics of the LSI model trained on the bag-of-words corpus.
lsi_bow.print_topics()

In [None]:
# Topics of the LDA model (trained on the TF-IDF corpus).
lda_model.print_topics()

## Plot

In [None]:
def _dense_topic_matrix(doc_topics, n_topics):
    """Scatter sparse ``(topic_id, weight)`` lists into a dense ``(n_docs, n_topics)`` array."""
    dense = np.zeros((len(doc_topics), n_topics))
    for row, topics in enumerate(doc_topics):
        for topic_id, weight in topics:
            dense[row, topic_id] = weight
    return dense


def plot_2d_space(corpus, method, corpus_name, method_name, use_tsne=False):
    """Scatter-plot the documents in two dimensions, coloured by dominant topic.

    Parameters
    ----------
    corpus : iterable of documents
        Corpus accepted by ``method[corpus]``.
    method : gensim model
        Trained topic model (LSI or LDA) applied to ``corpus``.
    corpus_name, method_name : str
        Labels used in the figure title and output file name.
    use_tsne : bool, optional
        If True, the coordinates come from a 2-component t-SNE embedding of
        the document/topic weight matrix; otherwise the weights of topic 0
        and topic 1 are used as x and y.

    Raises
    ------
    ValueError
        If no document yields any topic weight.

    The figure is saved under ``imgs/``; title and file name also use the
    notebook-level ``preprocessing_str``, ``lib``, ``stemming``,
    ``lemmatization`` and ``num_topics``.
    """
    # An LdaModel with per_word_topics=True yields a tuple per document whose
    # first element is the (topic_id, weight) list; other models yield the list.
    unpack_lda = (
        isinstance(method, models.ldamodel.LdaModel)
        and getattr(method, "per_word_topics", False)
    )

    # Transform the corpus once (it used to be run through the model three
    # times) and drop documents without any topic weight.
    doc_topics = []
    for transformed in method[corpus]:
        topics = transformed[0] if unpack_lda else transformed
        if topics:
            doc_topics.append(topics)
    if not doc_topics:
        raise ValueError("no document produced any topic weights; nothing to plot")

    # Column count: the model's topic count, but never fewer than the largest
    # topic id seen + 1, nor fewer than the two columns used as plot axes.
    max_seen = max(topic_id for topics in doc_topics for topic_id, _ in topics)
    n_topics = max(getattr(method, "num_topics", None) or 0, max_seen + 1, 2)

    # Place each weight in the column of its topic id. Packing the weights
    # positionally is wrong for sparse output (e.g. LDA drops low-probability
    # topics): argmax would then return a list position, not a topic id.
    arr = _dense_topic_matrix(doc_topics, n_topics)

    # Dominant topic number in each doc
    topic_num = np.argmax(arr, axis=1)

    if use_tsne:
        tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99)
        tsne = tsne_model.fit_transform(arr)
        documents_2d_1 = tsne[:, 0]
        documents_2d_2 = tsne[:, 1]
    else:
        # Same axes for every document: weight of topic 0 vs. weight of topic 1.
        documents_2d_1 = arr[:, 0]
        documents_2d_2 = arr[:, 1]

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_title(f"{corpus_name} in 2D-Space with {method_name} {preprocessing_str}")
    ax.scatter(documents_2d_1, documents_2d_2, c=topic_num, s=80, alpha=0.8)

    out_path = Path(f'imgs/{method_name}_{corpus_name}_preprocessing_{lib}_stemming_{stemming}_lemmatization_{lemmatization}_num_topics_{num_topics}.png')
    # savefig does not create missing directories; make sure imgs/ exists first.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_path)

In [None]:
# LSI (trained on bag-of-words) applied to the bag-of-words corpus.
plot_2d_space(BoW_corpus, lsi_bow, "BoW", "LSI")

In [None]:
# LSI (trained on TF-IDF) applied to the TF-IDF corpus.
plot_2d_space(corpus_tfidf, lsi_tfidf, "TF-IDF", "LSI")

In [None]:
# LDA applied to the TF-IDF corpus, plotted without t-SNE.
plot_2d_space(corpus_tfidf, lda_model, "TF-IDF", "LDA")

In [None]:
# Same LDA model and corpus, but with the 2-D coordinates taken from a t-SNE embedding.
plot_2d_space(corpus_tfidf, lda_model, "TF-IDF", "LDA with TSNE", use_tsne=True)

In [None]:
# Build the pyLDAvis data for the trained LDA model from the TF-IDF corpus and the
# model's own id2word mapping; mds='mmds' is passed through to pyLDAvis.
vis = pyLDAvis.gensim.prepare(lda_model, corpus_tfidf, dictionary=lda_model.id2word, mds='mmds')

In [None]:
# Last expression of the cell, so the notebook displays the prepared visualisation.
vis