### 0. 비교분석을 위해 LDA model 만들기.

In [1]:
import pandas as pd
import numpy as np
import pickle
from pprint import pprint
import re

In [2]:
# Load the pickled data set from data/cleaned_data.pk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this file must come from a trusted source.
with open("data/cleaned_data.pk", "rb") as f:
    data = pickle.load(f)

# Renumber the rows from 0 and discard whatever index was stored in the pickle.
data = data.reset_index(drop=True)
print(data.head(5))
# DataFrame.info() prints its own report and returns None, so the surrounding
# print() also emits a trailing "None" line.
print(data.info())

               Date User            Message
0  2017/01/01 20:32   무지  이거보면 왜 갓창정인지 알게된다
1  2017/01/01 21:40  어피치                창정헌
2  2017/01/01 22:19  어피치   라이언은 내일부터 연구실 출근
3  2017/01/01 22:20   무지          파티 하는거 아님
4  2017/01/01 22:39  프로도                헬파티
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162861 entries, 0 to 162860
Data columns (total 3 columns):
Date       162861 non-null object
User       162861 non-null object
Message    162861 non-null object
dtypes: object(3)
memory usage: 3.7+ MB
None


### 2. 초록 없는 데이터 제거 및 분석시기 설정하기

In [3]:
# Parse the textual 'Date' column into real datetime values ...
timestamps = pd.to_datetime(data['Date'])
# ... and make them the row index so rows can be selected by date.
data = data.assign(Date=timestamps).set_index('Date')
data.head()

Unnamed: 0_level_0,User,Message
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01-01 20:32:00,무지,이거보면 왜 갓창정인지 알게된다
2017-01-01 21:40:00,어피치,창정헌
2017-01-01 22:19:00,어피치,라이언은 내일부터 연구실 출근
2017-01-01 22:20:00,무지,파티 하는거 아님
2017-01-01 22:39:00,프로도,헬파티


In [5]:
# Keep only the rows whose timestamp falls in calendar year 2019.
year2019 = data.loc['2019-01-01':'2019-12-31']
slice2 = year2019["Message"].tolist()

# Whitespace tokenization: every message becomes a list of its space-separated words.
tokenized_data = [message.split() for message in slice2]
print(tokenized_data[:10])
print(len(tokenized_data))

[['쉬발럼들', '사랑한다'], ['사랑한다'], ['내', '친구들'], ['고맙다'], ['새해도', '잘', '부탁해'], ['이런', '야한', '녀석들'], ['새해복', '많이', '받으시게들'], ['다들', '새해복', '많이받아'], ['러운녀석들'], ['아잉❤️']]
17293


### 3. LDA & Dynamic Topic Model 돌리기

In [6]:
from gensim.models import ldamodel
from gensim.models import ldaseqmodel
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary, bleicorpus
from gensim.matutils import hellinger
from gensim import corpora
from tqdm import tqdm_notebook
from time import time

import os

##### dictionary와 doc2bow 만들기 (LDA에서는 2019년도 데이터만 사용)

In [7]:
# Token dictionary: reuse the copy cached on disk when it exists, otherwise
# build it from the tokenized messages and cache it for the next run.
if os.path.exists('kakao(LDA)_dict'):
    dictionary = Dictionary.load('kakao(LDA)_dict')
else:
    dictionary = Dictionary(tokenized_data)
    # Very rare / very common tokens could be pruned here with
    # dictionary.filter_extremes(no_below=..., no_above=...). Note that gensim
    # documents no_above as a fraction of the corpus (0..1), not a count.
    dictionary.save('kakao(LDA)_dict')
    print(dictionary)

# Bag-of-words corpus (document-term matrix): same load-or-build caching,
# stored on disk in Blei's LDA-C format.
if os.path.exists('kakao(LDA)_corpus'):
    corpus = bleicorpus.BleiCorpus('kakao(LDA)_corpus')
else:
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_data]
    bleicorpus.BleiCorpus.serialize('kakao(LDA)_corpus', corpus)

Dictionary(18219 unique tokens: ['사랑한다', '쉬발럼들', '내', '친구들', '고맙다']...)


##### Run LDA model 

In [8]:
# Use the same topic count that came out best in the DTM analysis, so the
# LDA and DTM results can be compared topic for topic.
NUM_TOPICS = 8

# Training is only done when no saved model exists; otherwise the saved
# model is loaded from disk.
if os.path.exists('kakao(LDA)_model'):
    lda_model = ldamodel.LdaModel.load('kakao(LDA)_model')
else:
    lda_model = ldamodel.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=NUM_TOPICS,
        passes=100,
    )
    lda_model.save('kakao(LDA)_model')

##### Run DTM Model

In [9]:
# Same topic count as the best DTM run, so both models are compared on
# equal terms.
NUM_TOPICS = 8

# Load the saved dynamic topic model from disk; it is not trained in this cell.
DTM_MODEL_PATH = 'kakao_dtm_model_8'
dtm_model = ldaseqmodel.LdaSeqModel.load(DTM_MODEL_PATH)

##### LDA와 DTM 결과 비교해보기

In [19]:
# Top 20 (word, probability) pairs of LDA topic 0.
lda_model.show_topic(0, topn=20)

[('프로도', 0.025016394),
 ('라이언', 0.019759238),
 ('일단', 0.0156757),
 ('닥쳐', 0.011239312),
 ('그래서', 0.010288102),
 ('아하', 0.007924918),
 ('그건', 0.0067233886),
 ('튜브', 0.0064142654),
 ('우리', 0.00595636),
 ('님들', 0.0049880818),
 ('솬', 0.004521181),
 ('집', 0.0034133708),
 ('갑자기', 0.003284215),
 ('빨리', 0.0030154504),
 ('수', 0.0029218711),
 ('간다', 0.0026682992),
 ('별로', 0.0026194502),
 ('옵치', 0.0024399178),
 ('어떰', 0.0023682115),
 ('가는중', 0.002226242)]

In [21]:
# Top 20 (word, probability) pairs of DTM topic 0 at time slice 2.
dtm_model.print_topic(0, time=2, top_terms=20)

[('프로도', 0.03743823347924715),
 ('일단', 0.02614912548859356),
 ('다', 0.021407730619530767),
 ('프로도이', 0.01655230706806371),
 ('왜', 0.012999740607673945),
 ('후발', 0.012260384305229641),
 ('굿굿', 0.011315881246272213),
 ('아직', 0.008662723907897043),
 ('그래도', 0.008073826159202328),
 ('다들', 0.0077730104050908215),
 ('오늘은', 0.0074753942620117975),
 ('바로', 0.007034148094741422),
 ('가서', 0.006187572101185384),
 ('호우', 0.004591488235196132),
 ('이건', 0.004348428069662819),
 ('지노', 0.0040450108480990345),
 ('밥', 0.003856723259264248),
 ('사람', 0.0035409107611706573),
 ('혼자', 0.003459779520189684),
 ('무지', 0.002961536165195683)]

##### coherence score 계산 비교

In [31]:
# Load the corpus and dictionary that belong to the DTM model.
# They are bound ONLY to dtm_* names: the previous chained assignment
# (`dtm_corpus = corpus = ...`) also overwrote `corpus`, so the LDA coherence
# computation was handed the DTM corpus together with the LDA dictionary.
dtm_corpus = bleicorpus.BleiCorpus('kakao(DTM)_corpus')
dtm_dictionary = Dictionary.load('kakao(DTM)_dict')
# Whitespace-tokenize every message in `data`; these token lists are the
# reference texts for the DTM coherence score.
processing_data = [msg.split() for msg in data['Message']]

In [34]:
# c_v coherence of the LDA model, scored against the tokenized 2019 messages.
lda_coherence_model = CoherenceModel(
    model=lda_model,
    texts=tokenized_data,
    corpus=corpus,
    dictionary=dictionary,
    coherence='c_v',
)
lda_cs = lda_coherence_model.get_coherence()

# c_v coherence of the DTM topics at time slice 2, scored against the
# tokenized messages of the whole data set.
topics_dtm = dtm_model.dtm_coherence(time=2)
dtm_coherence_model = CoherenceModel(
    topics=topics_dtm,
    texts=processing_data,
    corpus=dtm_corpus,
    dictionary=dtm_dictionary,
    coherence='c_v',
)
dtm_cs = dtm_coherence_model.get_coherence()

In [35]:
# Show both coherence scores side by side: (LDA, DTM).
(lda_cs, dtm_cs)

(0.7743403306495791, 0.6902980469748023)