# Topic modeling using LDA

reference : https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [2]:
import gensim
from gensim import corpora, models
import re
from pprint import pprint


f = open("new_liberal.txt", 'r', encoding = "UTF8")
# 데이터 받아옴
dataset = []

while True:
    line = f.readline()
    if not line: break
    dataset += [line[:-1].split(' ')]

f.close()


[['presidenti', 'candid', 'barack', 'obama', 'promis', 'embrac', 'america', 'alli', 'extend', 'hand', 'adversari', 'presid', 'made', 'remark', 'progress', 'engag', 'longtim', 'foe', 'restor', 'relat', 'cuba', 'negoti', 'nuclear', 'deal', 'iran', 'countri', 'tradit', 'friend', 'obama', 'certainli', 'strengthen', 'allianc', 'western', 'european', 'nation', 'turn', 'georg', 'w', 'bush', 'work', 'close', 'countri', 'fight', 'isi', 'reduc', 'climat', 'chang', 'watch', 'tie', 'fray', 'sever', 'countri', 'previous', 'consid', 'partner', 'includ', 'egypt', 'israel', 'philippin', 'saudi', 'arabia', 'turkey', 'bitter', 'week', 'isra', 'leader', 'israel', 'settlement', 'polici', 'illustr', 'trend', 'tie', 'fray', 'part', 'event', 'outsid', 'obama', 'control', 'egyptian', 'revolut', 'case', 'saudi', 'american', 'revolut', 'fray', 'emerg', 'countri', 'leader', 'person', 'ideolog', 'odd', 'obama', 'fray', 'uniqu', 'way', 'obama', 'administr', 'treat', 'adversari', 'alli', 'prevail', 'logic', 'within

## Bag of words on the dataset

In [3]:
dct = gensim.corpora.Dictionary(dataset)
print('dictionary size : %d' % len(dct))

corpus = [dct.doc2bow(line) for line in dataset]


dictionary size : 87794


## TF-IDF

In [4]:
model = models.TfidfModel(corpus)
corpus_tfidf = model[corpus]
    
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.023325280228771356),
 (1, 0.05591285959403856),
 (2, 0.021735502827395434),
 (3, 0.01721070995268424),
 (4, 0.009420312290932254),
 (5, 0.01840329981543992),
 (6, 0.023518041165199784),
 (7, 0.015550186819180904),
 (8, 0.03136090274731551),
 (9, 0.03721254596741049),
 (10, 0.019995959187273163),
 (11, 0.10659230270708836),
 (12, 0.02736664921597114),
 (13, 0.02308025646643787),
 (14, 0.3325979279229002),
 (15, 0.14513269278639887),
 (16, 0.01159522900791685),
 (17, 0.021735502827395434),
 (18, 0.055531447420868824),
 (19, 0.022345999838914103),
 (20, 0.026349442563150836),
 (21, 0.05591285959403856),
 (22, 0.022110793011114713),
 (23, 0.1060738285707054),
 (24, 0.01557674521343276),
 (25, 0.013127339314249918),
 (26, 0.028416265312751033),
 (27, 0.02661388152775348),
 (28, 0.02513833113662666),
 (29, 0.014562544113770805),
 (30, 0.032755859431238495),
 (31, 0.018048932902214182),
 (32, 0.023776239675899806),
 (33, 0.02348420315967374),
 (34, 0.021235236164658538),
 (35, 0.012420

## Running LDA using Bag of Words

In [10]:
lda_model = gensim.models.LdaMulticore(corpus, num_topics=10, id2word=dct, passes=2, workers=4)

In [11]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.009*"peopl" + 0.004*"trump" + 0.004*"state" + 0.004*"work" + 0.004*"new" + 0.003*"time" + 0.003*"report" + 0.003*"presid" + 0.003*"us" + 0.003*"news"
Topic: 1 Word: 0.007*"trump" + 0.005*"state" + 0.005*"court" + 0.005*"peopl" + 0.003*"govern" + 0.003*"time" + 0.003*"new" + 0.003*"could" + 0.003*"know" + 0.003*"case"
Topic: 2 Word: 0.013*"trump" + 0.006*"peopl" + 0.005*"new" + 0.005*"presid" + 0.004*"time" + 0.003*"state" + 0.003*"countri" + 0.003*"compani" + 0.003*"govern" + 0.003*"polit"
Topic: 3 Word: 0.005*"peopl" + 0.005*"time" + 0.004*"work" + 0.004*"could" + 0.004*"new" + 0.004*"state" + 0.003*"use" + 0.003*"trump" + 0.003*"american" + 0.002*"compani"
Topic: 4 Word: 0.009*"trump" + 0.005*"us" + 0.004*"state" + 0.004*"peopl" + 0.003*"nation" + 0.003*"could" + 0.003*"new" + 0.003*"presid" + 0.003*"use" + 0.003*"report"
Topic: 5 Word: 0.007*"peopl" + 0.004*"trump" + 0.004*"time" + 0.003*"first" + 0.003*"new" + 0.003*"way" + 0.003*"use" + 0.003*"student" + 0.003*"co

## Running LDA using TF-IDF

In [12]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dct, passes=2, workers=4)

In [13]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.004*"
" + 0.001*"trump" + 0.001*"presid" + 0.000*"russia" + 0.000*"women" + 0.000*"polic" + 0.000*"russian" + 0.000*"court" + 0.000*"state" + 0.000*"obama"
Topic: 1 Word: 0.001*"trump" + 0.001*"clinton" + 0.001*"student" + 0.001*"locht" + 0.000*"school" + 0.000*"state" + 0.000*"korea" + 0.000*"presid" + 0.000*"compani" + 0.000*"north"
Topic: 2 Word: 0.008*"
" + 0.001*"trump" + 0.001*"refuge" + 0.001*"polic" + 0.000*"clinton" + 0.000*"student" + 0.000*"immigr" + 0.000*"castro" + 0.000*"food" + 0.000*"presid"
Topic: 3 Word: 0.004*"trump" + 0.002*"clinton" + 0.002*"republican" + 0.001*"presid" + 0.001*"democrat" + 0.001*"obama" + 0.001*"sander" + 0.001*"vote" + 0.001*"court" + 0.001*"state"
Topic: 4 Word: 0.001*"trump" + 0.001*"eu" + 0.001*"tax" + 0.001*"opioid" + 0.001*"polic" + 0.001*"drug" + 0.001*"percent" + 0.001*"pleas" + 0.001*"vote" + 0.001*"peopl"
Topic: 5 Word: 0.001*"trump" + 0.001*"film" + 0.001*"women" + 0.001*"polic" + 0.001*"season" + 0.001*"black" + 0.001*

## Save LDA model

In [14]:
lda_model_tfidf.save("./LDAmodel/liberal_10.model")

num_topics 10,15,20,25,30
