## 0. Importing modules

In [1]:
import json
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import gensim
import pyLDAvis.gensim_models
import re

import warnings
warnings.filterwarnings('ignore')



## 1. Preprocessing

In [2]:
# Load korea_review.json into a DataFrame; its 'Text' column is what gets
# passed to get_topic() later in the notebook.
df = pd.read_json('korea_review.json')

In [3]:
# Build the stop-word list.
# Workflow: inspect the topic-modelling output, then append any uninformative
# words that show up as topic terms and re-run.
stop_words = list(stopwords.words('english'))  # copy so the extends below only touch our own list
stop_words.extend(
    ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten']
)  # number words
stop_words.extend(
    ['may', 'must', 'would', 'could', 'should', 'might', 'upon']
)  # modal verbs and prepositions
stop_words.extend(
    ['mr', 'mrs', 'man', 'men', 'say', 'said', 'mso']
)  # miscellaneous noise words
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
# End-to-end pipeline
def get_topic(text_df, k, passes):
    """Run the full pipeline from raw text to a fitted LDA model.

    Each document in ``text_df`` is normalised with ``regtext``, tokenised
    with ``tokeniser`` and filtered with ``del_stopword``; the result is
    turned into a dictionary and bag-of-words corpus by ``dictionary`` and
    handed to ``lda``.

    Args:
        text_df: Object exposing ``.apply`` (called with ``df['Text']`` in
            this notebook) that holds one text per entry.
        k: Number of topics, forwarded to ``lda``.
        passes: Number of training passes, forwarded to ``lda``.

    Returns:
        Tuple ``(ldamodel, corpus, dic)``.
    """
    cleaned_tokens = (
        text_df.apply(regtext)
        .apply(tokeniser)
        .apply(del_stopword)
    )
    dic, corpus = dictionary(cleaned_tokens)
    return lda(corpus, k, dic, passes), corpus, dic

#######################################

# Regex-based text normalisation
def regtext(text):
    """Lower-case ``text`` and reduce it to letters separated by single spaces.

    Whitespace runs are first collapsed to one space and the text is
    lower-cased; then every run of characters that is not an ASCII letter or
    a Hangul jamo/syllable is replaced by a single space.

    Args:
        text: The raw document string.

    Returns:
        The normalised string (may start or end with a space).
    """
    # Raw strings keep ``\s`` as a regex escape instead of an invalid
    # string escape, which newer Python versions warn about.
    lowered = re.sub(r'\s+', ' ', text).lower()
    return re.sub(r'[^A-Za-zㄱ-ㅎㅏ-ㅣ가-힣]+', ' ', lowered)

# Tokenisation
def tokeniser(txt):
    """Split ``txt`` into tokens with NLTK's ``word_tokenize`` and return them."""
    return word_tokenize(txt)

# Stop-word removal
def del_stopword(token, stopword_list=None):
    """Drop stop words and very short tokens from a token list.

    Args:
        token: Iterable of token strings for one document.
        stopword_list: Optional collection of words to remove. When omitted,
            the module-level ``stop_words`` list is used, so existing
            one-argument callers behave as before.

    Returns:
        A list of the tokens, in their original order, that are not stop
        words and are longer than two characters.
    """
    # A set gives constant-time membership tests; checking against the list
    # directly re-scans the whole stop-word list for every token.
    blocked = set(stop_words if stopword_list is None else stopword_list)
    # Tokens of two characters or fewer are removed as well: such short,
    # uninterpretable tokens otherwise surface frequently as topic terms.
    return [word for word in token if len(word) > 2 and word not in blocked]

# Vocabulary and bag-of-words corpus
def dictionary(token_words):
    """Build a gensim dictionary and bag-of-words corpus from tokenised docs.

    Args:
        token_words: Iterable of token lists, one per document.

    Returns:
        Tuple ``(dic, corpus)``: the ``gensim.corpora.Dictionary`` built from
        ``token_words`` and a list holding ``dic.doc2bow(doc)`` for each
        document.
    """
    vocab = gensim.corpora.Dictionary(token_words)
    bows = list(map(vocab.doc2bow, token_words))
    return vocab, bows

# LDA model training
def lda(corpus, k, dic, passes):
    """Fit and return a gensim ``LdaModel``.

    Args:
        corpus: Bag-of-words corpus to train on.
        k: Number of topics (passed as ``num_topics``).
        dic: Id-to-word mapping (passed as ``id2word``).
        passes: Number of training passes over the corpus.

    Returns:
        The trained ``gensim.models.ldamodel.LdaModel`` instance.
    """
    return gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        num_topics=k,
        id2word=dic,
        passes=passes,
    )

In [5]:
%%time
# Fit a 5-topic LDA model with 15 passes on the 'Text' column, keeping the
# model, the bag-of-words corpus and the dictionary for the cells that follow.
model, corpus, dic = get_topic(df['Text'], 5, 15)

Wall time: 2min 13s


In [6]:
# Print every topic returned by print_topics(); each entry is a
# (topic id, weighted top-terms string) pair.
topics = model.print_topics()
for topic in topics:
    print(topic)

(0, '0.009*"king" + 0.006*"korea" + 0.005*"korean" + 0.005*"people" + 0.004*"time" + 0.004*"japanese" + 0.004*"seoul" + 0.004*"made" + 0.004*"sent" + 0.003*"great"')
(1, '0.013*"japanese" + 0.008*"korean" + 0.008*"korea" + 0.005*"people" + 0.005*"japan" + 0.005*"seoul" + 0.005*"government" + 0.004*"koreans" + 0.004*"time" + 0.004*"made"')
(2, '0.012*"japanese" + 0.008*"korean" + 0.005*"korea" + 0.005*"government" + 0.005*"seoul" + 0.005*"king" + 0.004*"time" + 0.004*"people" + 0.004*"koreans" + 0.004*"made"')
(3, '0.012*"korean" + 0.010*"japanese" + 0.007*"korea" + 0.005*"people" + 0.004*"koreans" + 0.004*"made" + 0.004*"seoul" + 0.004*"time" + 0.004*"japan" + 0.003*"work"')
(4, '0.008*"korea" + 0.008*"korean" + 0.007*"japanese" + 0.004*"people" + 0.004*"work" + 0.004*"time" + 0.004*"japan" + 0.003*"seoul" + 0.003*"new" + 0.003*"made"')


## 2. Visualization

In [7]:
# Switch pyLDAvis to notebook mode before rendering.
pyLDAvis.enable_notebook()
# Prepare the visualisation data from the fitted model, the bag-of-words
# corpus and the dictionary, then display it.
vis = pyLDAvis.gensim_models.prepare(model, corpus, dic)
pyLDAvis.display(vis)