In [19]:
import json
import pprint

import numpy as np
import matplotlib.pyplot as plt
from gensim.utils import simple_preprocess
from gensim import corpora, models
import pandas as pd
from sklearn.manifold import TSNE
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

import sys
sys.path.insert(1, '../src/utils/')

from vectorize import preprocessing

  and should_run_async(code)


# Variables


In [7]:
num_topics = 2
use_title = False
stemming = True
lemmatization = True
lib = "gensim"

  and should_run_async(code)


# Data loading and preparation

## load

In [87]:
with open('..\\src\\data\\data_jmlr_vol17.json') as f:
    data = json.load(f)
data_df = pd.json_normalize(data['papers'])
corpus = data_df["abstract"]
if use_title:
    corpus = data_df["title"] + " " + corpus 


  and should_run_async(code)


UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 91622: character maps to <undefined>

## preprocess

In [None]:
### Preprocess the Dataset ### 
tokenized =[] 
for sentence in corpus: 
  # the simple_preprocess function returns a list of each sentence 
  tokenized.append(simple_preprocess(sentence ,min_len=2, max_len=15)) # the minimum length of a token and  maximum length of a token.
print(tokenized)


In [80]:
tokenized = preprocessing(
    corpus,
    lib=lib,
    stemming=stemming,
    lemmatization=lemmatization,
    min_word_len=2,
    max_word_len=15
)

b'stochastic/JJ'


  and should_run_async(code)


TypeError: a bytes-like object is required, not 'str'

In [None]:
dictionary = corpora.Dictionary(tokenized) 
BoW_corpus = [dictionary.doc2bow(text) for text in tokenized]

In [None]:
tfidf = models.TfidfModel(BoW_corpus)
corpus_tfidf = tfidf[BoW_corpus]

# LSI

## TFIDF

In [None]:
lsi_tfidf = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)# train model
lsi_tfidf[corpus_tfidf[1]]  # apply model to  document

## Bag of Words

In [None]:
lsi_bow = models.LsiModel(BoW_corpus, id2word=dictionary, num_topics=num_topics)
lsi_bow[BoW_corpus[1]]  # apply model to  document

# LDA

## init

In [None]:
# LDA model training 
lda_model = models.ldamodel.LdaModel(corpus=corpus_tfidf,
                                           id2word=dictionary,
                                           num_topics=num_topics,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=10,
                                           passes=10,
                                           alpha='symmetric',
                                           iterations=100,
                                           per_word_topics=True)

# Inspect

## Keywords

In [None]:
for keyword in data_df["keywords"]:
  print(keyword)

In [None]:
print(f"The Dataset contains {len(data_df)} Papers")
count_keywords = 0
all_keywords = []
for keyword in data_df["keywords"]:
  if keyword and keyword[0]:
    count_keywords += 1
    all_keywords = all_keywords + keyword
print(f"{count_keywords} of them contain Keywords.")
print(f"There are {len(all_keywords)} Keywords. {len(set(all_keywords))} of them are unique.")


## Corpus

In [None]:
pprint.pprint(dictionary.token2id)#token -> tokenId.

In [None]:
pprint.pprint(dictionary.dfs) # token_id -> how many documents contain this token.

In [None]:
pprint.pprint(BoW_corpus)# list of (token_id, token_count) 

TODO: You can furthur filter and clean your data by using functions such as filter_extremes (remove all tokens that are less frequent or more frequent than a number), filter_n_most_frequent(filter out the ‘remove_n’ most frequent tokens), merge_with (to merge multiple dictionaries)

In [None]:
for doc in corpus_tfidf:
    print(doc)

## Topics

In [None]:
lsi_tfidf.print_topics()

In [None]:
lsi_bow.print_topics()

In [None]:
lda_model.print_topics()

## Plot

In [None]:
def plot_2d_space(corpus, method, use_tsne=False):

  if isinstance(method, models.ldamodel.LdaModel):
    documents_2d_1=[x[0][0][1] for x in method[corpus] if x]
    documents_2d_2=[x[0][1][1] for x in list(method[corpus]) if x]
  else:
    documents_2d_1=[x[0][1] for x in method[corpus] if x]
    documents_2d_2=[x[1][1] for x in list(method[corpus]) if x]


  fig, ax = plt.subplots(figsize=(10,10))

  # Get topic weights
  topic_weights = []
  for i, row_list in enumerate(method[corpus]):
    if row_list:
      if isinstance(method, models.ldamodel.LdaModel):
        topic_weights.append([w for i, w in row_list[0]])
      else:
        topic_weights.append([w for i, w in row_list])

  # Array of topic weights    
  arr = pd.DataFrame(topic_weights).fillna(0).values

  # Dominant topic number in each doc
  topic_num = np.argmax(arr, axis=1)

  if use_tsne:
    tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99)
    tsne = tsne_model.fit_transform(arr)
    documents_2d_1 = tsne[:,0]
    documents_2d_2 = tsne[:,1]

  ax.scatter(documents_2d_1, documents_2d_2, c=topic_num, s=80 ,alpha=0.8)
  for i in range(len(documents_2d_1)):
      ax.annotate(i, (documents_2d_1[i], documents_2d_2[i]))

In [None]:
plot_2d_space(BoW_corpus, lsi_bow)

In [None]:
plot_2d_space(corpus_tfidf, lsi_tfidf)

In [None]:
plot_2d_space(corpus_tfidf, lda_model)

In [None]:
plot_2d_space(corpus_tfidf, lda_model, use_tsne=True)

In [None]:
vis = pyLDAvis.gensim.prepare(lda_model, corpus_tfidf, dictionary=lda_model.id2word, mds='mmds')

In [None]:
vis