# Topic modeling using LDA

Reference: https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [0]:
import gensim
from gensim import corpora, models
import re
from pprint import pprint

# Run configuration: input corpus file, output path prefix for the saved
# model, and the number of LDA topics to fit.
fname = "lib_result.txt"
save_name = "./new_model/liberal_"
topicN = 10

# Load the dataset: each line of the input file is one document, split on
# single spaces into its tokens.
# Only the line terminator is stripped (instead of a blind `line[:-1]`), so a
# final line without a trailing newline keeps its last character. The `with`
# block closes the file even if reading raises.
with open(fname, 'r', encoding="UTF8") as f:
    dataset = [line.rstrip('\n').split(' ') for line in f]


In [80]:
# Sanity check: show the first five tokenized documents.
print(dataset[0:5])

[['godfrey'], ['thousand'], ['godfrey'], ['rodden'], ['godfrey']]


## Bag of words on the dataset

In [81]:
# Build the gensim dictionary (vocabulary) from all tokenized documents.
dct = corpora.Dictionary(dataset)
print('dictionary size : %d' % len(dct))

# Convert every tokenized document to its bag-of-words representation.
corpus = list(map(dct.doc2bow, dataset))


dictionary size : 28732


## TF-IDF

In [82]:
# Build a TF-IDF model from the bag-of-words corpus and apply it to that
# same corpus.
model = gensim.models.TfidfModel(corpus)
corpus_tfidf = model[corpus]

# Show only the first transformed document as a quick sanity check.
doc = next(iter(corpus_tfidf), None)
if doc is not None:
    pprint(doc)

[(0, 1.0)]


## Running LDA using Bag of Words

In [0]:
# Train LDA on the plain bag-of-words corpus.
lda_model = models.LdaMulticore(
    corpus,
    num_topics=topicN,
    id2word=dct,
    passes=2,
    workers=4,
)

In [84]:
# Print the topics of the bag-of-words model; -1 asks for all of them.
for idx, topic in lda_model.print_topics(num_topics=-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.004*"stoiko" + 0.004*"iea" + 0.004*"lycamobil" + 0.004*"scanlon" + 0.003*"gl" + 0.003*"trai" + 0.003*"lgbti" + 0.003*"ctu" + 0.003*"tantaro" + 0.003*"smallcomb"
Topic: 1 Word: 0.012*"leiderman" + 0.008*"kjellberg" + 0.007*"npu" + 0.005*"acic" + 0.005*"berian" + 0.005*"pineda" + 0.004*"etet" + 0.004*"bruenig" + 0.004*"biersack" + 0.003*"optern"
Topic: 2 Word: 0.007*"voxcar" + 0.006*"phage" + 0.005*"s8" + 0.004*"roosh" + 0.004*"nakamoto" + 0.003*"tardigrad" + 0.003*"bachelet" + 0.003*"hassett" + 0.003*"erd" + 0.003*"stoneheart"
Topic: 3 Word: 0.005*"techmem" + 0.004*"smurf" + 0.004*"allo" + 0.004*"kleiser" + 0.003*"chemsex" + 0.003*"kyran" + 0.003*"gossan" + 0.003*"lemair" + 0.002*"jb" + 0.002*"krauss"
Topic: 4 Word: 0.010*"csr" + 0.008*"rinku" + 0.007*"norwood" + 0.006*"bedbug" + 0.005*"melzer" + 0.004*"navient" + 0.004*"chadam" + 0.003*"bavor" + 0.003*"unfpa" + 0.003*"uberx"
Topic: 5 Word: 0.011*"fgm" + 0.008*"airpod" + 0.006*"akayesu" + 0.005*"kt4" + 0.004*"lidey" + 0

## Running LDA using TF-IDF

In [0]:
# Train a second LDA model, this time on the TF-IDF-weighted corpus.
lda_model_tfidf = models.LdaMulticore(
    corpus_tfidf,
    num_topics=topicN,
    id2word=dct,
    passes=2,
    workers=4,
)

In [86]:
# Append the TF-IDF model's topics to the results log and echo each one to
# stdout. The `with` block closes the file even if a write fails. The
# explicit encoding matches the UTF8 used to read the input, so the output
# does not depend on the platform's default encoding.
with open("model_result.txt", 'a', encoding="UTF8") as fw:
    # Header line for this run: input file name and topic count.
    fw.write(fname+" "+str(topicN)+'\n')
    for idx, topic in lda_model_tfidf.print_topics(-1):
        fw.write('Topic: {} Word: {}\n'.format(idx, topic))
        print('Topic: {} Word: {}'.format(idx, topic))
    # Blank line separates this run from whatever is appended next.
    fw.write('\n')

Topic: 0 Word: 0.001*"abnorm" + 0.001*"obama" + 0.001*"wallet" + 0.001*"u200bth" + 0.001*"moder" + 0.001*"tttthi" + 0.001*"bradd" + 0.001*"jaffi" + 0.001*"integr" + 0.001*"carrp"
Topic: 1 Word: 0.001*"jb" + 0.001*"norwood" + 0.001*"foreknowledg" + 0.001*"3g" + 0.001*"april" + 0.001*"2°c" + 0.001*"f8" + 0.001*"metacrit" + 0.000*"ouch" + 0.000*"2009"
Topic: 2 Word: 0.015*"norwood" + 0.001*"interview" + 0.001*"voxcar" + 0.001*"scandal" + 0.001*"period" + 0.001*"amptp" + 0.001*"det" + 0.001*"microbead" + 0.001*"freedhoff" + 0.001*"three"
Topic: 3 Word: 0.007*"godfrey" + 0.001*"tttthi" + 0.001*"biersack" + 0.001*"superrich" + 0.001*"gigafactori" + 0.001*"15" + 0.001*"earlier" + 0.001*"gerago" + 0.001*"timeshar" + 0.001*"ohlhausen"
Topic: 4 Word: 0.002*"voxcar" + 0.002*"norwood" + 0.001*"defens" + 0.001*"cpsc" + 0.001*"ddt" + 0.001*"u200aand" + 0.001*"tttthi" + 0.001*"sherkow" + 0.001*"eastsid" + 0.001*"kass"
Topic: 5 Word: 0.001*"offici" + 0.001*"via" + 0.001*"obamacar" + 0.001*"csr" + 0.00

## Save LDA model

In [0]:
import os

# Save the TF-IDF LDA model; the topic count is part of the file name.
model_path = save_name+str(topicN)+".model"
# Create the target directory first so a missing folder cannot make save()
# fail after the model has already been trained.
os.makedirs(os.path.dirname(model_path) or ".", exist_ok=True)
lda_model_tfidf.save(model_path)

num_topics values: 10, 15, 20, 25, 30
