#Part I - Preparations

##Installs

First run instructions:

  1. Run installs once (first code cell).
  2. Restart runtime.
  3. Run all (updating pd required version).

In [None]:
!pip install stop-words
!pip install pyLDAvis
!pip install --upgrade gensim

**Upgrade gensim to version 4.1.0.** **NOTE:** gensim 4.x does not support LdaMallet anymore.

##Imports

In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm

import gensim
from gensim import corpora, models
from gensim.models import LsiModel, LdaModel, LdaMulticore, HdpModel, EnsembleLda
from gensim.similarities import Similarity
from gensim.test.utils import get_tmpfile
from gensim.models.coherencemodel import CoherenceModel
from gensim.matutils import cossim

from stop_words import get_stop_words
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

import nltk
import pprint
import logging

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
from spacy import displacy

import warnings
from torch.serialization import SourceChangeWarning

##Notebook pre-settings

train/load mode

In [None]:
LOAD = True

Suppress deprecation warnings.

In [None]:
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Download the WordNet corpus, which NLTK's `WordNetLemmatizer` needs (the English stop words come from the `stop_words` package).

In [None]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

Set pandas option to print all cell content.

In [None]:
pd.set_option('display.max_colwidth', None)

Download data

Links id:

    data = 1V-7cI74NT9wsl-vK02-IMA6ITkBEWHbC
    my_test_input = 1AEl-jpWfY79XFbu4yItqyIUh31Pu1tXo
    titles_test_input = 1agQ4_-X9BkJmZ1K8mY3R3PzRv9-LOyWB

In [None]:
!gdown --id 1V-7cI74NT9wsl-vK02-IMA6ITkBEWHbC
!gdown --id 1AEl-jpWfY79XFbu4yItqyIUh31Pu1tXo
!gdown --id 1agQ4_-X9BkJmZ1K8mY3R3PzRv9-LOyWB

Downloading...
From: https://drive.google.com/uc?id=1V-7cI74NT9wsl-vK02-IMA6ITkBEWHbC
To: /content/US-Economic-News.csv
12.5MB [00:00, 46.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1AEl-jpWfY79XFbu4yItqyIUh31Pu1tXo
To: /content/test_sentences.csv
100% 79.0/79.0 [00:00<00:00, 44.4kB/s]
Downloading...
From: https://drive.google.com/uc?id=1agQ4_-X9BkJmZ1K8mY3R3PzRv9-LOyWB
To: /content/voted-kaggle-dataset.csv
4.29MB [00:00, 66.5MB/s]


##General topic-modeling settings

With nltk

In [None]:
pattern = r'\b[^\d\W]+\b'
tokenizer = RegexpTokenizer(pattern)
en_stop = get_stop_words('en')
lemmatizer = WordNetLemmatizer()
remove_words = ['br', 'ûªs', 'ûª', 'ûªt', 'ûó', 'ðê', 'ûïwe', 'ûïthe', 'ûïi', 
                'ûªre', 'de', 'ûªve', 'ûïit', 'ûóand', 'ûªll', 'er', 'tion', 
                'û_', 'åç', 'åè', 'ûótaiwan', 'ûïlee', 'sh', 'ûªaiwlfw', 'ofgltsw',
                ]


With SpaCy

In [None]:
# Load the English pipeline by its package name: the 'en' shortcut link only
# exists on spaCy 2.x, whereas the package name works on both 2.x and 3.x.
nlp = spacy.load('en_core_web_sm')
# Extra stop words on top of spaCy's defaults. r'\s' is the same two-character
# string as before, written raw so Python does not warn about an invalid escape.
more_stop_words = ['say', r'\s', 'mr', 'Mr', 'said', 'says', 'saying', 'today', 'be'] + en_stop
nlp.Defaults.stop_words.update(more_stop_words)  # extend spaCy's default stop-word set
# HTML line-break leftovers that clean_raw_texts() strips from the raw text.
remove = [r'</br>', '<br>']
# Also flag the lexemes already present in the vocab, so token.is_stop is True for them.
for stopword in more_stop_words:
    nlp.vocab[stopword].is_stop = True

##Util functions

Spacy functions

In [None]:
def clean_raw_texts(data_df, text_col_name, range=None):
    """Return the texts of one column with the HTML leftovers in ``remove`` stripped.

    Args:
        data_df: DataFrame holding the raw documents.
        text_col_name: name of the column that contains the text.
        range: optional int; when given, only the first ``range`` rows are used.

    Returns:
        An array of cleaned strings. ``data_df`` itself is left untouched.
    """
    all_texts = data_df[text_col_name].values

    if range is not None and isinstance(range, int):
        all_texts = all_texts[:range]

    # `.values` can hand back the DataFrame's own buffer; work on a copy so the
    # cleaning never rewrites (or fails on a read-only view of) the caller's data.
    all_texts = all_texts.copy()

    for i, text in enumerate(all_texts):
        for rw in remove:
            # str.replace is a no-op when `rw` is absent, so no membership test is needed.
            text = text.replace(rw, '')
        all_texts[i] = text

    return all_texts

In [None]:
def to_docs_with_spacy(texts):
    """Run every text through the module-level ``nlp`` pipeline.

    A tqdm progress bar wraps the iteration. Returns the parsed documents as a
    list, in the same order as ``texts``.
    """
    return [nlp(raw_text) for raw_text in tqdm(texts)]

In [None]:
def clean_docs_with_spacy(docs):
    """Turn parsed documents into lists of lemmas with detected bigrams applied.

    A token is dropped when it is a newline, a stop word, punctuation,
    number-like, or a single character. The lemmas that remain are used to fit
    a gensim ``Phrases`` model, which is then applied to each document.

    Returns:
        One token list per input document.
    """
    def _is_noise(token):
        # Same checks, in the same order, as the keep-condition they replace.
        return (
            token.text == '\n'
            or token.is_stop
            or token.is_punct
            or token.like_num
            or len(token.text) <= 1
        )

    lemmas_per_doc = [
        [token.lemma_ for token in doc if not _is_noise(token)]
        for doc in docs
    ]

    phrase_model = gensim.models.phrases.Phrases(lemmas_per_doc)
    return [phrase_model[lemmas] for lemmas in lemmas_per_doc]

In [None]:
def preprocess_with_spacy(data_df, text_col_name, range=None):
    """Full spaCy preprocessing: clean raw text, parse it, then tokenise.

    Returns:
        ``(docs, texts)``: the parsed documents and the cleaned token lists
        derived from them.
    """
    parsed_docs = to_docs_with_spacy(clean_raw_texts(data_df, text_col_name, range))
    return parsed_docs, clean_docs_with_spacy(parsed_docs)

In [None]:
def text_to_corpus(texts):
    """Build a gensim dictionary from tokenised documents and encode them with it.

    Returns:
        ``(dictionary, corpus)``: the id <-> term mapping and the list of
        bag-of-words vectors, one per document.
    """
    vocab = corpora.Dictionary(texts)          # id <-> term mapping
    bows = list(map(vocab.doc2bow, texts))     # document-term matrix as sparse bags of words
    return vocab, bows

In [None]:
def get_model(model_tag, configuration):
    """Instantiate (and thereby train) the model class registered under ``model_tag``.

    Args:
        model_tag: one of 'lsi', 'lda', 'hdp', 'lda_mult', 'ensemble'.
        configuration: dict unpacked as keyword arguments into the constructor.

    Returns:
        The constructed model, or ``None`` (after printing the valid tags) when
        the tag is unknown.
    """
    registry = dict(
        lsi=LsiModel,
        lda=LdaModel,
        hdp=HdpModel,
        lda_mult=LdaMulticore,
        ensemble=EnsembleLda,
    )

    try:
        model_cls = registry[model_tag]
    except KeyError:
        print("no model under tag:", model_tag)
        print("available tags:", list(registry))
        return None

    # Constructed outside the try block so errors raised by the model itself propagate.
    return model_cls(**configuration)

In [None]:
def save_model(model, filename):
    """Save ``model`` to ``<filename>.model`` without overwriting an existing file.

    If the target already exists, a version suffix is appended
    (``name_v2.model``, ``name_v3.model``, ...) until a free name is found.

    Args:
        model: any object exposing ``save(path)``.
        filename: target name, with or without the ``.model`` extension.

    Returns:
        The path the model was actually written to.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    ext = '.model'
    # Strip the extension only when it is a real suffix, not anywhere in the name.
    base = filename[:-len(ext)] if filename.endswith(ext) else filename

    ver = 1
    target = base + ext
    # The existence check must run on the full file name, extension included.
    while os.path.isfile(target):
        ver += 1
        target = f"{base}_v{ver}{ext}"

    model.save(target)
    return target

def load_trained_model(filename, model_tag):
    """Load a previously saved model of the kind registered under ``model_tag``.

    Args:
        filename: file name, with or without the ``.model`` extension.
        model_tag: one of 'lsi', 'lda', 'hdp', 'lda_mult', 'ensemble'.

    Returns:
        The loaded model, or ``None`` (after printing the valid tags) when the
        tag is unknown.
    """
    model_classes = {
        'lsi': LsiModel,
        'lda': LdaModel,
        'hdp': HdpModel,
        'lda_mult': LdaMulticore,
        'ensemble': EnsembleLda,
    }

    if model_tag not in model_classes:
        print("no model under tag:", model_tag)
        print("available tags:", list(model_classes))
        return None

    ext = '.model'
    # Append the extension only when it is missing, so 'foo' and 'foo.model'
    # both resolve to 'foo.model' rather than 'foo.model.model'.
    if not filename.endswith(ext):
        filename += ext

    return model_classes[model_tag].load(filename)

#Part II - Dataset & Models construction

Read data from csv files

In [None]:
df = pd.read_csv('/content/US-Economic-News.csv', encoding='utf_8')
test_df = pd.read_csv('/content/test_sentences.csv')
test_articles_df = pd.read_csv('/content/voted-kaggle-dataset.csv')

SpaCy data pre-processing

In [None]:
%%time
docs, texts = preprocess_with_spacy(df, 'text')

  0%|          | 0/8000 [00:00<?, ?it/s]

2021-09-14 13:34:47,748 : INFO : collecting all words and their counts
2021-09-14 13:34:47,751 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2021-09-14 13:34:50,074 : INFO : collected 609451 token types (unigram + bigrams) from a corpus of 947683 words and 8000 sentences
2021-09-14 13:34:50,077 : INFO : merged Phrases<609451 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>
2021-09-14 13:34:50,080 : INFO : Phrases lifecycle event {'msg': 'built Phrases<609451 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000> in 2.33s', 'datetime': '2021-09-14T13:34:50.080836', 'gensim': '4.1.0', 'python': '3.7.11 (default, Jul  3 2021, 18:01:19) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'created'}


CPU times: user 6min 32s, sys: 7.27 s, total: 6min 39s
Wall time: 6min 41s


In [None]:
%%time
test_docs, test_texts = preprocess_with_spacy(test_df, 'test')

  0%|          | 0/5 [00:00<?, ?it/s]

2021-09-14 13:34:53,680 : INFO : collecting all words and their counts
2021-09-14 13:34:53,697 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2021-09-14 13:34:53,714 : INFO : collected 11 token types (unigram + bigrams) from a corpus of 8 words and 5 sentences
2021-09-14 13:34:53,727 : INFO : merged Phrases<11 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>
2021-09-14 13:34:53,742 : INFO : Phrases lifecycle event {'msg': 'built Phrases<11 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000> in 0.06s', 'datetime': '2021-09-14T13:34:53.742423', 'gensim': '4.1.0', 'python': '3.7.11 (default, Jul  3 2021, 18:01:19) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'created'}


CPU times: user 134 ms, sys: 15.8 ms, total: 150 ms
Wall time: 268 ms


In [None]:
displacy.render(docs[0], style='ent', jupyter=True)

In [None]:
displacy.render(docs[0], style='dep', jupyter=True, options={'distance':80})

In [None]:
print('texts length:', len(texts))
print('test_texts length:', len(test_texts))

texts length: 8000
test_texts length: 5


Turn to Dictionary and Corpus

In [None]:
dictionary, corpus = text_to_corpus(texts)
# Vocabulary of the test sentences on their own (kept for inspection only).
test_dictionary, _ = text_to_corpus(test_texts)
# Unseen documents must be encoded with the *training* dictionary: the models and
# the similarity index interpret token ids according to `dictionary`, so ids
# taken from a separate test dictionary would point at unrelated words.
test_corpus = [dictionary.doc2bow(text) for text in test_texts]

Configurations

In [None]:
# Keyword-argument sets, one per model type. get_model() unpacks the chosen dict
# straight into the model class constructor, so every key must be a parameter
# that class accepts. All of them use the same `corpus` / `dictionary` pair.
lsi_conf = {
    "corpus": corpus,
    "id2word": dictionary,
    "num_topics": 7,
    "decay": 0.5,
}

lda_conf = {
    'corpus': corpus,
    'id2word': dictionary,
    'num_topics': 15,
    'passes': 20,
}

# NOTE(review): 'T' is presumably HdpModel's topic truncation level - confirm against the gensim docs.
hdp_conf = {
    'corpus': corpus,
    'id2word': dictionary,
    'T': 20,
}

# Ensemble settings: several models of class `topic_model_class` are requested
# via 'num_models'; the *_workers keys are presumably process counts - TODO confirm.
ens_conf = {
    'corpus': corpus,
    'id2word': dictionary,
    'num_topics': 15,
    'passes': 20,
    'num_models': 7,
    'topic_model_class': LdaModel,
    'ensemble_workers': 4,
    'distance_workers': 4,
    'iterations': 150, 
}

# Same values as lda_conf; presumably meant for the 'lda_mult' tag (LdaMulticore).
lda_mult_conf = {
    'corpus': corpus,
    'id2word': dictionary,
    'num_topics': 15,
    'passes': 20,
}

Build models

In [None]:
# lsi_model = get_model("lsi", lsi_conf) # Latent Semantic Indexing (not usefull for our purpose)

In [None]:
%%time
lda_best_trained = 'lda_15_topics_20_passes'
if LOAD:
    # Reuse the previously trained model instead of retraining it.
    lda_model = load_trained_model(lda_best_trained, 'lda')
else:
    # Latent Dirichlet Allocation: unsupervised, but the number of topics is fixed up front in lda_conf.
    lda_model = get_model("lda", lda_conf)
    save_model(lda_model, 'new_lda')

SyntaxError: ignored

In [None]:
%%time
hdp_model = get_model("hdp", hdp_conf) # Hierarchical Dirichlet process - unsupervised model.

###View models topics

In [None]:
# View Topics - Lda
pprint.pprint(lda_model.print_topics())

2021-09-14 14:11:17,106 : INFO : topic #0 (0.067): 0.009*"year" + 0.009*"state" + 0.008*"work" + 0.007*"pay" + 0.006*"new" + 0.006*"percent" + 0.006*"time" + 0.006*"job" + 0.006*"people" + 0.006*"program"
2021-09-14 14:11:17,112 : INFO : topic #1 (0.067): 0.030*"rate" + 0.024*"interest_rate" + 0.023*"Fed" + 0.023*"Federal_Reserve" + 0.017*"bank" + 0.012*"inflation" + 0.010*"loan" + 0.009*"bond" + 0.009*"mortgage" + 0.008*"year"
2021-09-14 14:11:17,118 : INFO : topic #2 (0.067): 0.008*"Mr._Bernanke" + 0.005*"Williams" + 0.004*"communist" + 0.003*"Wall_Streeters" + 0.003*"sister" + 0.003*"nursing_home" + 0.003*"Poland" + 0.003*"novel" + 0.003*"love" + 0.003*"monetarist"
2021-09-14 14:11:17,123 : INFO : topic #3 (0.067): 0.023*"percent" + 0.021*"year" + 0.016*"increase" + 0.015*"rise" + 0.015*"month" + 0.014*"report" + 0.013*"price" + 0.011*"economy" + 0.011*"inflation" + 0.010*"rate"
2021-09-14 14:11:17,128 : INFO : topic #4 (0.067): 0.027*"U.S." + 0.014*"country" + 0.013*"government" + 

[(0,
  '0.009*"year" + 0.009*"state" + 0.008*"work" + 0.007*"pay" + 0.006*"new" + '
  '0.006*"percent" + 0.006*"time" + 0.006*"job" + 0.006*"people" + '
  '0.006*"program"'),
 (1,
  '0.030*"rate" + 0.024*"interest_rate" + 0.023*"Fed" + '
  '0.023*"Federal_Reserve" + 0.017*"bank" + 0.012*"inflation" + 0.010*"loan" + '
  '0.009*"bond" + 0.009*"mortgage" + 0.008*"year"'),
 (2,
  '0.008*"Mr._Bernanke" + 0.005*"Williams" + 0.004*"communist" + '
  '0.003*"Wall_Streeters" + 0.003*"sister" + 0.003*"nursing_home" + '
  '0.003*"Poland" + 0.003*"novel" + 0.003*"love" + 0.003*"monetarist"'),
 (3,
  '0.023*"percent" + 0.021*"year" + 0.016*"increase" + 0.015*"rise" + '
  '0.015*"month" + 0.014*"report" + 0.013*"price" + 0.011*"economy" + '
  '0.011*"inflation" + 0.010*"rate"'),
 (4,
  '0.027*"U.S." + 0.014*"country" + 0.013*"government" + 0.012*"market" + '
  '0.012*"world" + 0.008*"debt" + 0.007*"crisis" + 0.007*"bank" + '
  '0.007*"global" + 0.006*"United_States"'),
 (5,
  '0.011*"Wilson" + 0.008*

In [None]:
# View Topics - Hdp
pprint.pprint(hdp_model.print_topics())

2021-09-14 14:11:17,267 : INFO : (0, '0.008*year + 0.005*market + 0.004*rate + 0.004*stock + 0.004*U.S. + 0.004*rise + 0.004*percent + 0.004*price + 0.004*company + 0.003*increase')
2021-09-14 14:11:17,337 : INFO : (1, '0.005*year + 0.003*rate + 0.003*market + 0.003*U.S. + 0.002*stock + 0.002*price + 0.002*high + 0.002*rise + 0.002*increase + 0.002*company')
2021-09-14 14:11:17,407 : INFO : (2, '0.002*year + 0.002*rise + 0.002*price + 0.002*market + 0.002*stock + 0.002*U.S. + 0.002*rate + 0.001*high + 0.001*company + 0.001*investor')
2021-09-14 14:11:17,480 : INFO : (3, '0.002*year + 0.001*company + 0.001*rise + 0.001*U.S. + 0.001*rate + 0.001*price + 0.001*stock + 0.001*market + 0.001*high + 0.001*investor')
2021-09-14 14:11:17,547 : INFO : (4, '0.001*U.S. + 0.001*company + 0.001*market + 0.001*year + 0.001*price + 0.001*stock + 0.001*rise + 0.001*high + 0.001*investor + 0.001*Mr.')
2021-09-14 14:11:17,622 : INFO : (5, '0.001*year + 0.001*stock + 0.001*market + 0.001*U.S. + 0.001*comp

[(0,
  '0.008*year + 0.005*market + 0.004*rate + 0.004*stock + 0.004*U.S. + '
  '0.004*rise + 0.004*percent + 0.004*price + 0.004*company + 0.003*increase'),
 (1,
  '0.005*year + 0.003*rate + 0.003*market + 0.003*U.S. + 0.002*stock + '
  '0.002*price + 0.002*high + 0.002*rise + 0.002*increase + 0.002*company'),
 (2,
  '0.002*year + 0.002*rise + 0.002*price + 0.002*market + 0.002*stock + '
  '0.002*U.S. + 0.002*rate + 0.001*high + 0.001*company + 0.001*investor'),
 (3,
  '0.002*year + 0.001*company + 0.001*rise + 0.001*U.S. + 0.001*rate + '
  '0.001*price + 0.001*stock + 0.001*market + 0.001*high + 0.001*investor'),
 (4,
  '0.001*U.S. + 0.001*company + 0.001*market + 0.001*year + 0.001*price + '
  '0.001*stock + 0.001*rise + 0.001*high + 0.001*investor + 0.001*Mr.'),
 (5,
  '0.001*year + 0.001*stock + 0.001*market + 0.001*U.S. + 0.001*company + '
  '0.001*investor + 0.001*Mr. + 0.001*rise + 0.001*economy + 0.000*bank'),
 (6,
  '0.000*market + 0.000*Fed + 0.000*U.S. + 0.000*rate + 0.000*

###Mallet model - Not supported

**Gensim ver 4.1.0 does not support Mallet model.**

**MalletModel 2 LdaModel does not work properly, so we upgraded gensim and neglected the Mallet model.**

**Therefore, all cells regarding MalletModel are commented out.**

download and unzip LdaMallet

In [None]:
# !wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
# !unzip mallet-2.0.8.zip

In [None]:
# def install_java():
#   !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
#   os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
#   !java -version       #check java version
# install_java()

In [None]:
# from gensim.models.wrappers import LdaMallet

# mallet_path = '/content/mallet-2.0.8/bin/mallet'
# lda_mallet = LdaMallet(mallet_path, corpus=corpus, num_topics=15, id2word=dictionary)

In [None]:
# lda_mallet.show_topics(-1)

In [None]:
# gensim_lda_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_mallet)

In [None]:
# (coherencemodel = CoherenceModel(model=gensim_lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
# print(coherencemodel.get_coherence())

In [None]:
# pyLDAvis.enable_notebook()
# vis_mallet_data = gensimvis.prepare(gensim_lda_model, corpus, dictionary)
# pyLDAvis.display(vis_mallet_data)

###EnsembleLda



In [None]:
%%time
ens_best_trained = 'ensemble_7_lda_150_iterations_20_passes'
if LOAD:
    # Reuse the previously trained ensemble instead of retraining it.
    ensemble = load_trained_model(ens_best_trained, 'ensemble')
else:
    ensemble = get_model('ensemble', ens_conf)
    save_model(ensemble, 'new_ensemble_lda')

In [None]:
shape = ensemble.asymmetric_distance_matrix.shape
print('shape:', shape)
without_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0], dtype=bool)].reshape(shape[0], -1)
print(without_diagonal.min(), without_diagonal.mean(), without_diagonal.max())

ensemble.recluster(eps=0.2, min_samples=2, min_cores=2)

2021-09-14 14:21:13,679 : INFO : fitting the clustering model
2021-09-14 14:21:13,710 : INFO : generating stable topics
2021-09-14 14:21:13,715 : INFO : found 13 clusters
2021-09-14 14:21:13,750 : INFO : found 10 stable topics
2021-09-14 14:21:13,754 : INFO : generating classic gensim model representation based on results from the ensemble
2021-09-14 14:21:13,763 : INFO : using symmetric alpha at 0.1
2021-09-14 14:21:13,766 : INFO : using symmetric eta at 0.1
2021-09-14 14:21:13,787 : INFO : using serial LDA version on this node


shape: (120, 120)
0.005534759203446371 0.8103933178432241 1.0


2021-09-14 14:21:13,871 : INFO : running online (multi-pass) LDA training, 10 topics, 0 passes over the supplied corpus of 8000 documents, updating model once every 2000 documents, evaluating perplexity every 8000 documents, iterating 50x with a convergence threshold of 0.001000
2021-09-14 14:21:13,878 : INFO : LdaModel lifecycle event {'msg': 'trained LdaModel(num_terms=56564, num_topics=10, decay=0.5, chunksize=2000) in 0.01s', 'datetime': '2021-09-14T14:21:13.878121', 'gensim': '4.1.0', 'python': '3.7.11 (default, Jul  3 2021, 18:01:19) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'created'}


In [None]:
print("num of topics in ensemble:", len(ensemble.get_topics()))

num of topics in ensemble: 10


#Part III - Models Visualizations

LdaModel visual

In [None]:
pyLDAvis.enable_notebook()
vis_lda = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis_lda)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


HdpModel visual

In [None]:
vis_hdp = gensimvis.prepare(hdp_model, corpus, dictionary)
pyLDAvis.display(vis_hdp)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


LdaEnsemble

In [None]:
vis_lda_ens = gensimvis.prepare(ensemble.classic_model_representation, corpus, dictionary)
pyLDAvis.display(vis_lda_ens)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


#Part IV - Compare models

In order to compare different topic models, we calculate their 'coherence' and 'perplexity' values.

##Coherence

Coherence scores how semantically related the top words of each topic are, and so reflects how interpretable the model's topics are to a human reader. The higher the better.

- There are two main methods: 'u_mass' and 'c_v'

In [None]:
cm_umass_lda = CoherenceModel(model=lda_model, corpus=corpus, coherence='u_mass')
coherence_umass_lda = cm_umass_lda.get_coherence()

cm_cv_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_cv_lda = cm_cv_lda.get_coherence()

cm_cv_hdp = CoherenceModel(model=hdp_model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_cv_hdp = cm_cv_hdp.get_coherence()

In [None]:
coherence_umass_lda

-7.367485127160617

coherence history:

**7 topics**
    
    lda: 0.5062256897837729, 0.5448603849002667

    ens: 0.5758941699203135, 0.5444156252198165

**10 topics**

    ens: 0.5781681353837331(passes=20)

**15 topics**

    lda: 0.5640849068140356

    ens: 0.5887390636591965(passes=3), 0.5886554167545726(20 passes)

In [None]:
coherence_cv_lda

0.5739542270019945

In [None]:
coherence_cv_hdp

0.5344888002233145

In [None]:
cm_cv_ens = CoherenceModel(model=ensemble, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_cv_ens = cm_cv_ens.get_coherence()

In [None]:
coherence_cv_ens

0.6174554795540548

##Perplexity

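Perplexity measures how well a model predicts the documents it is evaluated on. The lower the better. Note that `log_perplexity` returns the per-word likelihood bound (a negative number) rather than the perplexity itself, so for the values below the one closer to zero belongs to the better model.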

In [None]:
preplexity_lda = lda_model.log_perplexity(corpus)
preplexity_ens = ensemble.log_perplexity(corpus)

2021-09-14 16:09:56,692 : INFO : -9.721 per-word bound, 844.1 perplexity estimate based on a held-out corpus of 8000 documents with 840921 words
2021-09-14 16:10:03,002 : INFO : -9.559 per-word bound, 754.1 perplexity estimate based on a held-out corpus of 8000 documents with 840921 words


In [None]:
preplexity_lda

-9.721228313791771

In [None]:
preplexity_ens

-9.558631550073498

##Difference

Compare differences between models: Lda vs. EnsembleLda

In [None]:
mdiff, annotation = lda_model.diff(ensemble.generate_gensim_representation(), distance='jensen_shannon')

In [None]:
print(mdiff)

[[0.65506657 0.79951505 0.86647833 0.66734539 0.72057789]
 [0.4774283  0.65357744 0.87103098 0.74461597 0.16296525]
 [0.98850598 1.         0.86235675 0.90233343 0.91877213]
 [0.0955497  0.59341439 0.89231994 0.62115372 0.61306076]
 [0.84535052 0.85537036 0.45795395 0.76224698 0.77257149]
 [0.9760461  0.97398292 0.84907689 0.87022385 0.93107806]
 [0.97341619 0.96805631 0.84563905 0.86850353 0.93008487]
 [0.99810733 0.9965131  0.81887601 0.89474303 0.95210318]
 [0.54719335 0.05366259 0.88531908 0.61770878 0.65797544]
 [0.89077696 0.87209877 0.88958088 0.83780497 0.80328532]
 [0.68841946 0.66898503 0.8227623  0.19073801 0.75222423]
 [0.81316041 0.82086377 0.84687504 0.76974072 0.84808898]
 [0.68435711 0.68177749 0.34476458 0.75852275 0.66016869]
 [0.98677447 0.98271309 0.85815949 0.87420245 0.93954368]
 [0.55297215 0.78007663 0.79816287 0.70588087 0.63316069]]


#Part V - Inference

In [None]:
def find_similiar_docs_by_cossim(model, corpus, test_corpus):
    """For each test document, find the training document with the closest topic mix.

    Both corpora are projected into topic space with ``model``; closeness is the
    cosine similarity between the resulting topic vectors.

    Args:
        model: an ``LdaModel`` or an ``EnsembleLda``.
        corpus: bag-of-words training corpus to search in.
        test_corpus: bag-of-words query documents, encoded with the same
            dictionary as ``corpus``.

    Returns:
        ``(max_values, max_docs_idxs)``: per test document, the best similarity
        and the index in ``corpus`` of the document that achieved it. Returns
        ``(None, None)`` for an unsupported model type.
    """
    if isinstance(model, EnsembleLda):
        # Build the gensim representation once and reuse it for both corpora.
        topic_model = model.generate_gensim_representation()
    elif isinstance(model, LdaModel):
        topic_model = model
    else:
        return None, None

    # Materialise the test topic vectors once. For a whole corpus,
    # get_document_topics() returns a lazy wrapper, so iterating it inside the
    # loop below would redo the inference for every training document.
    test_docs_topics = list(topic_model.get_document_topics(test_corpus))

    max_values = [0] * len(test_docs_topics)
    max_docs_idxs = [0] * len(test_docs_topics)

    for doc_idx, doc_topics in enumerate(topic_model.get_document_topics(corpus)):
        for i, test_doc_topics in enumerate(test_docs_topics):
            cs = cossim(doc_topics, test_doc_topics)
            if cs > max_values[i]:
                max_values[i] = cs
                max_docs_idxs[i] = doc_idx

    return max_values, max_docs_idxs

In [None]:
test_corpus

[[(0, 1)], [(1, 1), (2, 1), (3, 1)], [(4, 1)], [(5, 1), (6, 1)], [(7, 1)]]

In [None]:
max_values, max_docs_idxs = find_similiar_docs_by_cossim(lda_model, corpus, test_corpus)

In [None]:
max_values

[0.9791280054441839,
 0.9971664659518216,
 0.9851451333296798,
 0.9672786436660437,
 0.9817375171838574]

In [None]:
print('most similar docs titles indexes:', max_docs_idxs)

most similar docs titles indexes: [1064, 1474, 5011, 1494, 3211]


In [None]:
def print_docs_text(df, test_df, docs_idxs, test_col_name):
    """Print each test input next to the headline and text of its matched document.

    Args:
        df: DataFrame of candidate documents; needs 'headline' and 'text' columns.
        test_df: DataFrame holding the query texts.
        docs_idxs: for the i-th query row, the row position in ``df`` of its match.
        test_col_name: column of ``test_df`` that contains the query text.
    """
    for query_pos, match_pos in enumerate(docs_idxs):
        query_text = test_df.iloc[query_pos][test_col_name]
        match_row = df.iloc[match_pos]
        title, body = match_row['headline'], match_row['text']

        print('input text:', query_text)
        print('output doc title:', title)
        print('output doc text:', body, end="\n\n")

In [None]:
print_docs_text(df, test_df, max_docs_idxs, 'test')

input text: inflation
output doc title: Fed's Williams: Fed Not Near Limit on Bond Buying
output doc text: The Federal Reserve isn't near a limit on how many Treasury or mortgage-backed securities it can purchase, Federal Reserve President John Williams said in an interview with The Wall Street Journal.Some Fed officials have been concerned that if the central bank buys too many bonds in these markets it could become such a big player that these markets become illiquid and stop functioning properly. Mr. Williams said the Fed isn't close to causing those kinds of problems. He said he wants to keep buying $85 billiong per month of long-term securities in 2013.He is in a camp of policy activists at the Fed who want the central bank to keep buying mortgage and Treasury bonds next year to push down long-term interest rates in hopes of boosting the economy.The Fed next meets Dec. 11-12. It is widely expected to continue its $40 billion-per-month mortgage-bond-buying program. It must decide w

In [None]:
def get_most_likely_topic(model, unseen_corpus):
    """Pick the highest-probability topic for every document in ``unseen_corpus``.

    Returns:
        A list of ``(document position, topic id)`` pairs. On a tie the topic
        listed first in the document's distribution wins.
    """
    return [
        (doc_no, max(topic_probs, key=lambda pair: pair[1])[0])
        for doc_no, topic_probs in enumerate(model.get_document_topics(unseen_corpus))
    ]

In [None]:
get_most_likely_topic(lda_model, test_corpus)

[(0, 1), (1, 1), (2, 0), (3, 8), (4, 1)]

In [None]:
def find_docs_similiar_by_similarity(corpus, dictionary, test_corpus, input_df, input_col, output_df, output_col, output_text_col='text'):
    """Match every query document to its most similar corpus document.

    A gensim ``Similarity`` index is built over ``corpus`` and queried with
    ``test_corpus``.

    Args:
        corpus: bag-of-words corpus to index.
        dictionary: dictionary used to encode ``corpus``; its length is passed
            to the index as the number of features.
        test_corpus: bag-of-words queries, encoded with the same dictionary.
        input_df, input_col: DataFrame/column holding the query texts (row i
            corresponds to query i).
        output_df, output_col: DataFrame/column holding the titles of the
            indexed documents (row j corresponds to corpus document j).
        output_text_col: column of ``output_df`` holding the document body.

    Returns:
        dict with parallel lists: 'sim_docs_id' (query idx, matched doc idx),
        'sim_docs_titles' (query text, matched title), 'sim_docs_texts'
        (matched document body) and 'sim_val' (similarity score).
    """
    index_tmpfile = get_tmpfile("index")
    index = Similarity(index_tmpfile, corpus, num_features=len(dictionary))
    similarities = index[test_corpus]

    sim_dict = {
        'sim_docs_id': [],
        'sim_docs_titles': [],
        'sim_docs_texts': [],
        'sim_val': [],
    }
    for query_idx, scores in enumerate(similarities):
        best_doc_idx = int(np.argmax(scores))  # first index of the maximum, like list.index(max)
        sim_dict['sim_docs_id'].append((query_idx, best_doc_idx))
        # Translate both indexes to their human-readable texts.
        sim_dict['sim_docs_titles'].append((
            input_df[input_col].iloc[query_idx],
            output_df[output_col].iloc[best_doc_idx],
        ))
        sim_dict['sim_val'].append(scores[best_doc_idx])
        # The body must come from the *matched* document, not from the row that
        # happens to share the query's position.
        sim_dict['sim_docs_texts'].append(output_df[output_text_col].iloc[best_doc_idx])

    return sim_dict

In [None]:
sim_dict = find_docs_similiar_by_similarity(corpus, dictionary, test_corpus=test_corpus, input_df=test_df, input_col="test", output_df=df, output_col="headline")

In [None]:
for i, sim_titles in enumerate(sim_dict['sim_docs_titles']):
  print(sim_titles[0])
  print(sim_titles[1])
  print(sim_dict['sim_docs_texts'][i])
  print('similarity val:', sim_dict['sim_val'][i], end="\n\n")

inflation
Yields on CDs Fell in the Latest Week
NEW YORK -- Yields on most certificates of deposit offered by major banks dropped more than a tenth of a percentage point in the latest week, reflecting the overall decline in short-term interest rates.On small-denomination, or "consumer," CDs sold directly by banks, the average yield on six-month deposits fell to 5.49% from 5.62% in the week ended yesterday, according to an 18-bank survey by Banxquote Money Markets, a Wilmington, Del., information service.On three-month "consumer" deposits, the average yield sank to 5.29% from 5.42% the week before, according to Banxquote. Two banks in the Banxquote survey, Citibank in New York and CoreStates in Pennsylvania, are paying less than 5% on threemonth small-denomination CDs.Declines were somewhat smaller on five-year consumer CDs, which eased to 7.37% from 7.45%, Banxquote said.Yields on three-month and six-month Treasury bills sold at Monday's auction plummeted more than a fifth of a percent

In [None]:
test_articles_docs, test_articles_texts = preprocess_with_spacy(test_articles_df, 'Title')
# Vocabulary of the article titles on their own (kept for inspection only).
test_articles_dictionary, _ = text_to_corpus(test_articles_texts)
# Encode the titles with the *training* dictionary so their token ids agree with
# `corpus` and with the trained models.
test_articles_corpus = [dictionary.doc2bow(text) for text in test_articles_texts]
# NOTE(review): the query below uses `test_corpus` (the short test sentences), not
# `test_articles_corpus`. The following cell prints against `test_df`, so it is
# left as is - confirm whether the article titles were meant to be queried here.
test_max_values, test_max_docs_idxs = find_similiar_docs_by_cossim(ensemble, corpus, test_corpus)

  0%|          | 0/2885 [00:00<?, ?it/s]

2021-09-14 16:10:44,324 : INFO : collecting all words and their counts
2021-09-14 16:10:44,326 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2021-09-14 16:10:44,346 : INFO : collected 9596 token types (unigram + bigrams) from a corpus of 8932 words and 2885 sentences
2021-09-14 16:10:44,348 : INFO : merged Phrases<9596 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>
2021-09-14 16:10:44,349 : INFO : Phrases lifecycle event {'msg': 'built Phrases<9596 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000> in 0.03s', 'datetime': '2021-09-14T16:10:44.349837', 'gensim': '4.1.0', 'python': '3.7.11 (default, Jul  3 2021, 18:01:19) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'created'}
2021-09-14 16:10:44,395 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-09-14 16:10:44,434 : INFO : built Dictionary(4216 unique tokens: ['card', 'credit', 'detection', 'fraud', 'Database']...) from 2885 doc

In [None]:
print_docs_text(df, test_df, test_max_docs_idxs, 'test')

input text: inflation
output doc title: Business and Finance
output doc text: Stocks hit new highs for the year, commodities prices rose and the dollar slipped as traders focused on signs of economic growth in Asia and Fed chief Bernanke's commitment to low interest rates. The Dow industrials rose 136.49 points, or 1.3%, to 10406.96. Gold climbed to $1,138.60.Bernanke warned that unemployment, tepid lending and troubles in commercial real estate will weigh on recovery. He also said the Fed is watching the dollar's trajectory.---The New York Fed caved in to demands by AIG's trading partners that they be paid in full for complex securities they had insured with the firm, a government audit found.---

input text: crisis in stock market
output doc title: Late Buying Spurt Pushes List Upward: Ticker Lags Near Close
output doc text: .NEW YORK, Aug. 17 (AP>‰ÛÓLate buying rescued a faltering stock market advance today and the list ended irregularly higher in fairly active trading.The ticker ta