# Topic modeling using LDA

Reference: https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [1]:
import os
import re
from datetime import datetime
from pprint import pprint

import gensim
from gensim import corpora, models

# --- Configuration -------------------------------------------------------
fname = "saved_conserv.txt"         # dataset file name
save_name = "./new_model/conserv_"  # path prefix for the saved model: 'conserv_' or 'liberal_'
topicN = 20                         # number of topics

# --- Load the data -------------------------------------------------------
# Every line of the file becomes one list of whitespace-separated tokens
# (an empty line yields an empty list). The `with` block guarantees the
# file is closed even if reading or decoding raises part-way through.
with open(fname, 'r', encoding="UTF8") as f:
    dataset = [line.split() for line in f]




In [2]:
# Peek at the first five tokenized lines to confirm the load worked.
pprint(dataset[0:5])

[['tuesday',
  'broadcast',
  'cnn',
  'situat',
  'room',
  'cnn',
  'senior',
  'washington',
  'correspond',
  'jeff',
  'zeleni',
  'state',
  'chelsea',
  'man',
  'transit',
  'man',
  'woman',
  'certainli',
  'play',
  'obama',
  'decis',
  'commut',
  'man',
  'sentenc',
  'without',
  'hard',
  'imagin',
  'done',
  'zeleni',
  'question',
  'white',
  'answer',
  'answer',
  'import',
  'much',
  'person',
  'stori',
  'chelsea',
  'man',
  'involv',
  'outcri',
  'left',
  'strong',
  'difficult',
  'time',
  'feder',
  'prison',
  'question',
  'central',
  'question',
  'without',
  'wonder',
  'outcom',
  'might',
  'transit',
  'man',
  'woman',
  'certainli',
  'play',
  'without',
  'hard',
  'imagin',
  'done',
  'mediait'],
 ['group',
  'american',
  'spring',
  'break',
  'revel',
  'reportedli',
  'chant',
  'build',
  'wall',
  'famili',
  'cruis',
  'cancun',
  'sf',
  'gate',
  'mail',
  'group',
  'aboard',
  'captain',
  'hook',
  'pirat',
  'ship',
  'dinner

## Bag of words on the dataset

In [3]:
# Build the vocabulary (gensim Dictionary) from the tokenized dataset.
dct = corpora.Dictionary(dataset)
print('dictionary size : %d' % len(dct))

# Convert every token list into its bag-of-words representation.
corpus = list(map(dct.doc2bow, dataset))


dictionary size : 55227


## TF-IDF

In [4]:
# Fit a TF-IDF model on the bag-of-words corpus and apply it to that corpus.
model = gensim.models.TfidfModel(corpus)
corpus_tfidf = model[corpus]

# Sanity check: display only the first transformed document.
doc = next(iter(corpus_tfidf), None)
if doc is not None:
    pprint(doc)

[(0, 0.1654889451420026),
 (1, 0.08853617937044052),
 (2, 0.09578025527090075),
 (3, 0.19223669438375798),
 (4, 0.28725830600851043),
 (5, 0.15497070742817248),
 (6, 0.17172349117621),
 (7, 0.11286829052038373),
 (8, 0.0655807425860439),
 (9, 0.09764217237242317),
 (10, 0.13796813645057787),
 (11, 0.06299359124378018),
 (12, 0.1517805203512606),
 (13, 0.21872978431279783),
 (14, 0.06416105796420278),
 (15, 0.07274446947974281),
 (16, 0.10444594712345447),
 (17, 0.05928583468107513),
 (18, 0.2983321457985086),
 (19, 0.1731972460113172),
 (20, 0.0723453767111886),
 (21, 0.05195027710630144),
 (22, 0.04954041914244451),
 (23, 0.11851106812416558),
 (24, 0.18250684661075908),
 (25, 0.052417726437127096),
 (26, 0.13761311358788283),
 (27, 0.09746420513179158),
 (28, 0.17005268795818823),
 (29, 0.09099652818105437),
 (30, 0.06446758302897908),
 (31, 0.10815414807446412),
 (32, 0.08628679372702354),
 (33, 0.022196764226817163),
 (34, 0.06183992576521185),
 (35, 0.08244153383854216),
 (36, 0.0

## Running LDA using Bag of Words

In [0]:
# Train LDA on the raw bag-of-words counts.
lda_model = models.LdaMulticore(
    corpus,
    num_topics=topicN,  # topic count comes from the configuration value
    id2word=dct,        # lets topics be printed as words instead of ids
    passes=2,
    workers=4,
)

Process ForkPoolWorker-21:
Process ForkPoolWorker-23:
Process ForkPoolWorker-22:
Process ForkPoolWorker-24:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 103, in worker
    initializer(*initargs)
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/local/lib/python3.6/dist-packages/gensim/models/ldamulticore.py", line 333, in worker_e_step
    worker_lda.do_estep(chunk)  # TODO: auto-tun

KeyboardInterrupt: ignored

In [0]:
# Print every topic of the bag-of-words model (-1 requests all topics).
for topic_id, topic_terms in lda_model.print_topics(-1):
    print('Topic: {} Word: {}'.format(topic_id, topic_terms))

Topic: 0 Word: 0.004*"time" + 0.003*"way" + 0.003*"could" + 0.003*"use" + 0.003*"new" + 0.003*"work" + 0.003*"american" + 0.003*"us" + 0.003*"state" + 0.003*"want"
Topic: 1 Word: 0.005*"state" + 0.005*"republican" + 0.005*"time" + 0.005*"vote" + 0.004*"democrat" + 0.004*"elect" + 0.004*"new" + 0.004*"could" + 0.003*"support" + 0.003*"want"
Topic: 2 Word: 0.004*"time" + 0.003*"new" + 0.003*"way" + 0.003*"first" + 0.003*"work" + 0.003*"thing" + 0.003*"call" + 0.002*"want" + 0.002*"come" + 0.002*"know"
Topic: 3 Word: 0.005*"work" + 0.004*"american" + 0.004*"time" + 0.004*"state" + 0.003*"could" + 0.003*"countri" + 0.003*"news" + 0.003*"new" + 0.003*"call" + 0.003*"first"
Topic: 4 Word: 0.006*"state" + 0.004*"new" + 0.004*"work" + 0.004*"time" + 0.004*"american" + 0.003*"us" + 0.003*"want" + 0.003*"way" + 0.003*"use" + 0.003*"could"
Topic: 5 Word: 0.005*"new" + 0.004*"us" + 0.004*"state" + 0.004*"republican" + 0.003*"plan" + 0.003*"could" + 0.003*"obama" + 0.003*"govern" + 0.003*"tax" + 0.

## Running LDA using TF-IDF

In [8]:
# Override the topic count for the TF-IDF run; the configuration cell at the
# top of the notebook sets topicN = 20, so everything below uses 25 instead.
topicN = 25

In [9]:
# Train LDA on the TF-IDF weighted corpus instead of raw counts.
lda_model_tfidf = models.LdaMulticore(
    corpus_tfidf,
    num_topics=topicN,  # uses the current value of topicN
    id2word=dct,        # lets topics be printed as words instead of ids
    passes=2,
    workers=4,
)

In [10]:
# Append this run's topics to model_result.txt. The file is opened in append
# mode, so entries from earlier runs are kept.
now = datetime.now()

# `with` guarantees the log is flushed and closed even if a write fails
# part-way; the encoding is pinned to match how the dataset file is read.
with open("model_result.txt", 'a', encoding="UTF8") as fw:
    # Header line: run date, then dataset name and topic count.
    fw.write('%s-%s-%s\t' % (now.year, now.month, now.day))
    fw.write(fname+" "+str(topicN)+'\n')

    # One line per topic, echoed to the notebook output as well.
    for idx, topic in lda_model_tfidf.print_topics(-1):
        fw.write('Topic: {} Word: {}\n'.format(idx, topic))
        print('Topic: {} Word: {}'.format(idx, topic))

    # Blank line separates this run from the next appended entry.
    fw.write('\n')

Topic: 0 Word: 0.001*"rubio" + 0.001*"gawker" + 0.001*"hogan" + 0.001*"parker" + 0.001*"cruz" + 0.001*"illeg" + 0.001*"women" + 0.001*"marco" + 0.001*"new" + 0.001*"denton"
Topic: 1 Word: 0.001*"abedin" + 0.001*"weiner" + 0.001*"mail" + 0.001*"huma" + 0.001*"2017" + 0.001*"fbi" + 0.001*"senat" + 0.001*"rousseff" + 0.001*"redact" + 0.001*"american"
Topic: 2 Word: 0.002*"gun" + 0.001*"news" + 0.001*"user" + 0.001*"media" + 0.001*"2016" + 0.001*"new" + 0.001*"check" + 0.001*"attack" + 0.001*"film" + 0.001*"post"
Topic: 3 Word: 0.001*"women" + 0.001*"ryan" + 0.001*"bill" + 0.001*"state" + 0.001*"plan" + 0.001*"american" + 0.001*"work" + 0.001*"gainor" + 0.001*"countri" + 0.001*"obamacar"
Topic: 4 Word: 0.003*"zika" + 0.002*"cartel" + 0.002*"border" + 0.001*"migrant" + 0.001*"mexican" + 0.001*"abort" + 0.001*"mosquito" + 0.001*"cair" + 0.001*"parenthood" + 0.001*"agent"
Topic: 5 Word: 0.001*"wasserman" + 0.001*"schultz" + 0.001*"parenthood" + 0.001*"yahoo" + 0.001*"debbi" + 0.001*"democrat"

## Save LDA model

In [11]:
# Persist the TF-IDF LDA model as "<save_name><topicN>.model".
model_path = save_name + str(topicN) + ".model"

# Create the target directory first so the save cannot fail on a missing
# directory after the expensive training step has already run.
os.makedirs(os.path.dirname(model_path) or ".", exist_ok=True)
lda_model_tfidf.save(model_path)


Candidate `num_topics` values: 10, 15, 20, 25, 30
