In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import random
import spacy

from gensim import models, corpora
from gensim import similarities
from gensim.models.coherencemodel import CoherenceModel
from wordcloud import WordCloud

In [None]:
# Read the whole corpus file and split it into individual articles on the
# literal '@delimiter' marker.
with open('all_articles.txt', mode='r', encoding='utf8') as corpus_file:
    articles = corpus_file.read().split('@delimiter')

In [None]:
# Sanity check: number of articles obtained from the split above
# (the original author recorded 111425 articles here).
print(len(articles))

In [None]:
# Number of articles, taken from the start of `articles`, used in this run.
DATASET_SIZE = 20
# Worker-process count; passed as n_process to nlp.pipe during tokenisation.
NUM_PROCESSES = 6
# Number of topics requested from the LDA model.
NUM_TOPICS = 20
# Working subset: the first DATASET_SIZE articles.
dataset = articles[:DATASET_SIZE]

In [None]:
# Load the large English spaCy pipeline with the components listed in
# `disable` (dependency parser and named-entity recogniser) switched off.
nlp = spacy.load(
    'en_core_web_lg',
    disable=['parser', 'ner'],
)

In [None]:
def token_filter(tokenized_doc):
    """Return the lemmas of the content-bearing tokens of one document.

    A token is kept only when it is purely alphabetic, is tagged as a noun,
    verb or adjective, and is neither punctuation, whitespace nor a stop word.

    Args:
        tokenized_doc: Iterable of token objects exposing ``is_alpha``,
            ``pos_``, ``is_punct``, ``is_space``, ``is_stop`` and ``lemma_``.

    Returns:
        list: The ``lemma_`` of every kept token, in document order.
    """
    content_pos = ('NOUN', 'VERB', 'ADJ')

    def keep(tok):
        # Reject early on the cheap structural checks, then on the flags.
        if not tok.is_alpha or tok.pos_ not in content_pos:
            return False
        return not (tok.is_punct or tok.is_space or tok.is_stop)

    return [tok.lemma_ for tok in tokenized_doc if keep(tok)]


In [None]:
# Run every article through the spaCy pipeline (nlp.pipe, using NUM_PROCESSES
# worker processes) and reduce each resulting document to its filtered lemmas.
tokenized_articles = [
    token_filter(doc)
    for doc in nlp.pipe(dataset, n_process=NUM_PROCESSES)
]
# Preview only the first few documents: echoing the complete list would flood
# the cell output as soon as DATASET_SIZE is raised.
tokenized_articles[:3]

In [None]:
# Build a gensim Dictionary (word <-> integer id mapping) from the
# tokenised articles.
dictionary = corpora.Dictionary(tokenized_articles)

In [None]:
# Size of the dictionary at this point, i.e. before the filter_extremes
# call in the next code cell prunes it.
len(dictionary)

Filter out words that occur in fewer than 5 documents (`no_below=5`) or in more than 50% of the documents (`no_above=0.5`).

In [None]:
# Prune the vocabulary in place: drop tokens that appear in fewer than
# 5 documents or in more than 50% of the documents.
dictionary.filter_extremes(
    no_below=5,
    no_above=0.5,
)

In [None]:
%%time
corpus_bow = [dictionary.doc2bow(article) for article in tokenized_articles]

In [None]:
%%time
lda_model = models.ldamodel.LdaModel(corpus=corpus_bow,
                                     id2word=dictionary,
                                     num_topics=NUM_TOPICS,
                                     passes=10,
                                     alpha='auto',
                                     eta='auto',
                                     random_state=1,
                                     NUM_PROCESSES=)

In [None]:
# Write the trained model, the dictionary and the bag-of-words corpus to
# disk (paths are relative to the current working directory).
lda_model.save("lda_model.gensim")
dictionary.save("lda_dictionary.gensim")
corpora.MmCorpus.serialize("lda_corpus.mm", corpus_bow)  # Save the corpus in Matrix Market format

In [None]:
# Show the learned topics as weighted word lists.
lda_model.print_topics()

In [None]:
# Build a dense similarity index over the documents' LDA topic vectors and
# save it to disk.
# lda_model[corpus_bow] yields vectors in topic space, so the feature count
# must be the model's number of topics, not the vocabulary size. Using
# len(dictionary) would allocate a needlessly wide dense matrix, or fail
# outright if the vocabulary has fewer entries than there are topics.
lda_index = similarities.MatrixSimilarity(
    lda_model[corpus_bow],
    num_features=lda_model.num_topics,
)
lda_index.save("lda_index.sim")