In [1]:
import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import ldamodel

In [2]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

#### Data

In [3]:
texts_week51 = pd.read_csv(r'corpus_week51.csv', delimiter=',')
texts_week52 = pd.read_csv(r'corpus_week52.csv', delimiter=',')
texts_week53 = pd.read_csv(r'corpus_week53.csv', delimiter=',')
texts_week1 = pd.read_csv(r'corpus_week1.csv', delimiter=',')
texts_week2 = pd.read_csv(r'corpus_week2.csv', delimiter=',')
texts_week3 = pd.read_csv(r'corpus_week3.csv', delimiter=',')
texts_week4 = pd.read_csv(r'corpus_week4.csv', delimiter=',')

corpus51 = [list(x) for x in texts_week51.fillna('').values]
corpus52 = [list(x) for x in texts_week52.fillna('').values]
corpus53 = [list(x) for x in texts_week53.fillna('').values]
corpus1 = [list(x) for x in texts_week1.fillna('').values]
corpus2 = [list(x) for x in texts_week2.fillna('').values]
corpus3 = [list(x) for x in texts_week3.fillna('').values]
corpus4 = [list(x) for x in texts_week4.fillna('').values]

In [None]:
texts_week51
print(corpus52)

#### Build model with test corpus

In [5]:
dictionary = Dictionary(corpus52)
corpus = [dictionary.doc2bow(text) for text in corpus51]

np.random.seed(1) # setting random seed to get the same results each time.

from gensim.models import ldamodel
model = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=4, minimum_probability=1e-8)
model.show_topics()

2022-04-25 01:00:50,469 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2022-04-25 01:00:50,551 : INFO : built Dictionary(5135 unique tokens: ['', 'action', 'already', 'asap', 'country']...) from 4846 documents (total 135688 corpus positions)
2022-04-25 01:00:50,552 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(5135 unique tokens: ['', 'action', 'already', 'asap', 'country']...) from 4846 documents (total 135688 corpus positions)", 'datetime': '2022-04-25T01:00:50.552640', 'gensim': '4.1.2', 'python': '3.10.4 (tags/v3.10.4:9d38120, Mar 23 2022, 23:13:41) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'created'}
2022-04-25 01:00:50,613 : INFO : using symmetric alpha at 0.25
2022-04-25 01:00:50,613 : INFO : using symmetric eta at 0.25
2022-04-25 01:00:50,614 : INFO : using serial LDA version on this node
2022-04-25 01:00:50,619 : INFO : running online (single-pass) LDA training, 4 topics, 1 passes over the supplied corpus of

[(0,
  '0.820*"" + 0.010*"election" + 0.003*"steal" + 0.003*"vote" + 0.003*"people" + 0.002*"stand" + 0.002*"president" + 0.002*"never" + 0.002*"go" + 0.002*"let"'),
 (1,
  '0.006*"" + 0.005*"match" + 0.004*"conspire" + 0.003*"oligarchs_actively" + 0.003*"stick" + 0.003*"conversation" + 0.002*"road" + 0.002*"bail" + 0.002*"joe" + 0.002*"weekend"'),
 (2,
  '0.005*"" + 0.002*"freak" + 0.002*"virus" + 0.002*"girl" + 0.002*"daily" + 0.002*"giant" + 0.002*"insult" + 0.002*"tidal" + 0.002*"accountability" + 0.001*"fboloudcom"'),
 (3,
  '0.054*"loyalty" + 0.053*"outcome" + 0.053*"publish" + 0.051*"rioter" + 0.040*"manufacturing" + 0.030*"" + 0.021*"fraud" + 0.017*"trade" + 0.008*"voter" + 0.008*"outline"')]

In [13]:
doc_51 = corpus51[5]
doc_52 = corpus52[0]
doc_53 = corpus53[0]
doc_1 = corpus1[4]
doc_2 = corpus2[1]
doc_3 = corpus3[0]
doc_4 = corpus4[1]


# now let's make these into a bag of words format
bow_51 = model.id2word.doc2bow(doc_51)   
bow_52 = model.id2word.doc2bow(doc_52)   
bow_53 = model.id2word.doc2bow(doc_53)
bow_1 = model.id2word.doc2bow(doc_1)   
bow_2 = model.id2word.doc2bow(doc_2)   
bow_3 = model.id2word.doc2bow(doc_3)
bow_4 = model.id2word.doc2bow(doc_4)   

# we can now get the LDA topic distributions for these
lda_bow_51 = model[bow_51]
lda_bow_52 = model[bow_52]
lda_bow_53 = model[bow_53]
lda_bow_1 = model[bow_1]
lda_bow_2 = model[bow_2]
lda_bow_3 = model[bow_3]
lda_bow_4 = model[bow_4]


In [7]:
lda_bow_51

[(0, 0.97848237), (1, 0.0071551995), (2, 0.0071550966), (3, 0.0072073075)]

In [14]:
from gensim.matutils import hellinger

print("The distance between week 51 and week 52: {} \n".format(hellinger(lda_bow_51, lda_bow_52)))
print("The distance between week 52 and week 53: {} \n".format(hellinger(lda_bow_52, lda_bow_53)))
print("The distance between week 53 and week 01: {} \n".format(hellinger(lda_bow_53, lda_bow_1)))
print("The distance between week 01 and week 02: {} \n".format(hellinger(lda_bow_1, lda_bow_2)))
print("The distance between week 02 and week 03: {} \n".format(hellinger(lda_bow_2, lda_bow_3)))
print("The distance between week 03 and week 04: {} \n".format(hellinger(lda_bow_3, lda_bow_4)))

The distance between week 51 and week 52: 0.09184565839857405 

The distance between week 52 and week 53: 0.17373862228754475 

The distance between week 53 and week 01: 0.10582816926072752 

The distance between week 01 and week 02: 0.04817568536396766 

The distance between week 02 and week 03: 0.00437014867500873 

The distance between week 03 and week 04: 0.005570218415861333 

