# Topic modeling using LDA

reference : https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [19]:
import gensim
from gensim import corpora, models
import re
from pprint import pprint


f = open("new_conserv.txt", 'r', encoding = "UTF8")
# 데이터 받아옴
dataset = []
while True:
    line = f.readline()
    if not line: break
    dataset += [line.split(' ')]
f.close()


## Bag of words on the dataset

In [20]:
dct = gensim.corpora.Dictionary(dataset)
print('dictionary size : %d' % len(dct))

corpus = [dct.doc2bow(line) for line in dataset]


dictionary size : 63421


## TF-IDF

In [21]:
model = models.TfidfModel(corpus)
corpus_tfidf = model[corpus]
    
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.05218122637145137),
 (1, 0.15909716552660308),
 (2, 0.08317410594517079),
 (3, 0.08832725321536473),
 (4, 0.17628174432034432),
 (5, 0.23923159705953712),
 (6, 0.15131111963341945),
 (7, 0.13599213937203022),
 (8, 0.09832815921436736),
 (9, 0.06849429969318149),
 (10, 0.08938413447461928),
 (11, 0.14256709573329698),
 (12, 0.06620976242309624),
 (13, 0.03672592923230124),
 (14, 0.09666135921451371),
 (15, 0.14989903675512392),
 (16, 0.05530982028256249),
 (17, 0.09377120877754128),
 (18, 0.09675570509992226),
 (19, 0.19351141019984452),
 (20, 0.06750909630801802),
 (21, 0.07295323186454916),
 (22, 0.08386344558939009),
 (23, 0.06421362714752624),
 (24, 0.3220711363883548),
 (25, 0.1373449291401259),
 (26, 0.0728511170846625),
 (27, 0.05952447402379828),
 (28, 0.05783192713705242),
 (29, 0.03956311296162247),
 (30, 0.10257538802657626),
 (31, 0.1443809734143166),
 (32, 0.0596607125651638),
 (33, 0.14026322394139062),
 (34, 0.16583692455378596),
 (35, 0.08905801283911713),
 (36, 0

## Running LDA using Bag of Words

In [27]:
lda_model = gensim.models.LdaMulticore(corpus, num_topics=10, id2word=dct, passes=2, workers=4)

In [28]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.195*"
" + 0.008*"trump" + 0.005*"immigr" + 0.004*"american" + 0.004*"peopl" + 0.004*"year" + 0.003*"go" + 0.003*"new" + 0.003*"one" + 0.003*"presid"
Topic: 1 Word: 0.007*"trump" + 0.007*"peopl" + 0.007*"presid" + 0.005*"would" + 0.005*"attack" + 0.005*"report" + 0.004*"one" + 0.004*"year" + 0.004*"state" + 0.004*"think"
Topic: 2 Word: 0.007*"presid" + 0.007*"trump" + 0.006*"peopl" + 0.006*"american" + 0.006*"state" + 0.005*"would" + 0.004*"obama" + 0.004*"also" + 0.004*"percent" + 0.004*"democrat"
Topic: 3 Word: 0.015*"trump" + 0.008*"clinton" + 0.004*"presid" + 0.004*"news" + 0.004*"u" + 0.004*"state" + 0.004*"time" + 0.004*"report" + 0.003*"campaign" + 0.003*"democrat"
Topic: 4 Word: 0.007*"trump" + 0.007*"presid" + 0.005*"would" + 0.005*"peopl" + 0.004*"state" + 0.004*"time" + 0.004*"also" + 0.004*"new" + 0.004*"report" + 0.003*"one"
Topic: 5 Word: 0.017*"trump" + 0.007*"presid" + 0.007*"clinton" + 0.005*"twitter" + 0.005*"go" + 0.005*"donald" + 0.005*"peopl" + 0.00

## Running LDA using TF-IDF

In [29]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dct, passes=2, workers=4)

In [30]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.001*"trump" + 0.001*"abort" + 0.001*"presid" + 0.001*"parenthood" + 0.001*"court" + 0.001*"state" + 0.001*"student" + 0.001*"report" + 0.001*"twitter" + 0.001*"peopl"
Topic: 1 Word: 0.002*"trump" + 0.001*"polic" + 0.001*"migrant" + 0.001*"report" + 0.001*"peopl" + 0.001*"clinton" + 0.001*"presid" + 0.001*"twitter" + 0.001*"new" + 0.001*"news"
Topic: 2 Word: 0.005*"trump" + 0.003*"clinton" + 0.002*"presid" + 0.002*"state" + 0.002*"peopl" + 0.002*"republican" + 0.002*"would" + 0.002*"go" + 0.002*"donald" + 0.002*"cruz"
Topic: 3 Word: 0.833*"
" + 0.000*"trump" + 0.000*"presid" + 0.000*"state" + 0.000*"news" + 0.000*"clinton" + 0.000*"report" + 0.000*"twitter" + 0.000*"obama" + 0.000*"peopl"
Topic: 4 Word: 0.001*"trump" + 0.001*"polic" + 0.001*"report" + 0.001*"clinton" + 0.001*"attack" + 0.001*"presid" + 0.001*"state" + 0.001*"twitter" + 0.001*"news" + 0.001*"follow"
Topic: 5 Word: 0.003*"trump" + 0.002*"percent" + 0.002*"state" + 0.002*"presid" + 0.001*"immigr" + 0.001*"

## Save LDA model

In [31]:
lda_model_tfidf.save("./LDAmodel/conserv_10.model")

num_topics 10,15,20,25,30
