# Topic modeling using LDA

reference : https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [1]:
import gensim
from gensim import corpora, models
import re
from pprint import pprint


f = open("final_liberal.txt", 'r', encoding = "UTF8")
# 데이터 받아옴
dataset = []
while True:
    line = f.readline()
    if not line: break
    dataset += [line.split(' ')]
f.close()


## Bag of words on the dataset

In [2]:
dct = gensim.corpora.Dictionary(dataset)
print('dictionary size : %d' % len(dct))

corpus = [dct.doc2bow(line) for line in dataset]


dictionary size : 91916


## TF-IDF

In [5]:
model = models.TfidfModel(corpus)
corpus_tfidf = model[corpus]
    
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.02318583671783724),
 (1, 0.05648719313462274),
 (2, 0.02167849663527989),
 (3, 0.016896199064181532),
 (4, 0.008987126420023174),
 (5, 0.018268130118038125),
 (6, 0.022830422213800584),
 (7, 0.015233300031683359),
 (8, 0.03051709098212779),
 (9, 0.03673652891591564),
 (10, 0.019808628152610853),
 (11, 0.10676754191956293),
 (12, 0.027321884090074005),
 (13, 0.02303668801288404),
 (14, 0.33061462869624125),
 (15, 0.14536871301084828),
 (16, 0.011175618059595143),
 (17, 0.005497550977236616),
 (18, 0.02152037095545037),
 (19, 0.05414127938027999),
 (20, 0.020830343898925626),
 (21, 0.026286874563610715),
 (22, 0.05648719313462274),
 (23, 0.021271201189881017),
 (24, 0.10597988100564937),
 (25, 0.015255385767333915),
 (26, 0.012833492384166464),
 (27, 0.02829539777969618),
 (28, 0.026575170747956622),
 (29, 0.025101848775484117),
 (30, 0.014265544219830445),
 (31, 0.033114593343003276),
 (32, 0.01780172327429373),
 (33, 0.023533215160429943),
 (34, 0.02303668801288404),
 (35, 0.021

## Running LDA using Bag of Words

In [6]:
lda_model = gensim.models.LdaMulticore(corpus, num_topics=10, id2word=dct, passes=2, workers=2)

In [7]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.007*"like" + 0.007*"peopl" + 0.007*"one" + 0.006*"said" + 0.006*"would" + 0.005*"time" + 0.004*"work" + 0.004*"get" + 0.004*"year" + 0.004*"want"
Topic: 1 Word: 0.020*"trump" + 0.006*"said" + 0.006*"clinton" + 0.005*"one" + 0.005*"peopl" + 0.005*"presid" + 0.005*"polit" + 0.005*"would" + 0.005*"campaign" + 0.005*"elect"
Topic: 2 Word: 0.010*"said" + 0.006*"trump" + 0.006*"court" + 0.006*"would" + 0.006*"state" + 0.005*"one" + 0.004*"presid" + 0.004*"year" + 0.004*"new" + 0.004*"time"
Topic: 3 Word: 0.007*"one" + 0.006*"peopl" + 0.006*"like" + 0.005*"year" + 0.005*"said" + 0.005*"also" + 0.004*"time" + 0.004*"new" + 0.004*"say" + 0.003*"would"
Topic: 4 Word: 0.009*"trump" + 0.005*"like" + 0.005*"would" + 0.005*"peopl" + 0.004*"one" + 0.004*"said" + 0.004*"clinton" + 0.004*"presid" + 0.004*"state" + 0.004*"us"
Topic: 5 Word: 0.011*"trump" + 0.006*"one" + 0.005*"like" + 0.004*"presid" + 0.004*"new" + 0.004*"said" + 0.004*"show" + 0.003*"make" + 0.003*"polit" + 0.003*"time

## Running LDA using TF-IDF

In [8]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dct, passes=2, workers=4)

In [9]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.001*"trump" + 0.001*"de" + 0.001*"le" + 0.001*"sponsor
" + 0.001*"song" + 0.001*"clinton" + 0.001*"film" + 0.001*"women" + 0.001*"sander" + 0.001*"la"
Topic: 1 Word: 0.004*"trump" + 0.002*"clinton" + 0.002*"republican" + 0.001*"democrat" + 0.001*"sander" + 0.001*"presid" + 0.001*"vote" + 0.001*"women" + 0.001*"obama" + 0.001*"campaign"
Topic: 2 Word: 0.001*"trump" + 0.001*"drug" + 0.001*"marijuana" + 0.001*"opioid" + 0.001*"clinton" + 0.001*"women" + 0.001*"court" + 0.001*"obama" + 0.001*"presid" + 0.001*"appl"
Topic: 3 Word: 0.001*"trump" + 0.001*"student" + 0.001*"school" + 0.001*"uber" + 0.000*"women" + 0.000*"compani" + 0.000*"said" + 0.000*"presid" + 0.000*"u" + 0.000*"citi"
Topic: 4 Word: 0.001*"
" + 0.001*"trump" + 0.001*"uber" + 0.001*"polic" + 0.001*"song" + 0.000*"waymo" + 0.000*"women" + 0.000*"offic" + 0.000*"black" + 0.000*"said"
Topic: 5 Word: 0.001*"trump" + 0.001*"student" + 0.001*"polic" + 0.001*"court" + 0.001*"u" + 0.001*"clinton" + 0.001*"state" + 0

## Save LDA model

In [10]:
lda_model_tfidf.save("ldamodel")

num_topics 10,15,20,25,30
