## 잠재 디리클레 할당(Latent Dirichlet Allocation, LDA)

참고자료: https://wikidocs.net/30708,  https://lettier.com/projects/lda-topic-modeling/

토픽 모델링은 문서의 집합에서 토픽을 찾아내는 프로세스를 말한다. 이는 검색 엔진, 고객 민원 시스템 등과 같이 문서의 주제를 알아내는 일이 중요한 곳에서 사용된다. 잠재 디리클레 할당(Latent Dirichlet Allocation, LDA)은 토픽 모델링의 대표적인 알고리즘으로 줄여서 LDA라고 한다.

In [1]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

#### 1) 데이터 로드

In [2]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
print('샘플의 수 :',len(documents))

샘플의 수 : 11314


In [4]:
documents[1]

"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap of faith, Jimmy.  Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n\nSorry I can't pity you, Jim.  And I'm sorry that you have these feelings of\ndenial about the faith you need to get by.  Oh well, just pretend that it will\nall end happily ever after anyway.  Maybe if you start a new newsgroup,\nalt.atheist.hard, you won't be bummin' so much?\n\n\n\n\n\n\nBye-Bye, Big Jim.  Don't forget your Flintstone's Chewables!  :) \n--\nBake Timmons, III"

In [5]:
print(dataset.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


#### 2) 텍스트 전처리

In [3]:
news_df = pd.DataFrame({'document':documents})
# 특수 문자 제거
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")
# 길이가 3이하인 단어는 제거 (길이가 짧은 단어 제거)
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
# 전체 단어에 대한 소문자 변환
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

  This is separate from the ipykernel package so we can avoid doing imports until


In [4]:
news_df['clean_doc'][1]

'yeah expect people read actually accept hard atheism need little leap faith jimmy your logic runs steam sorry pity sorry that have these feelings denial about faith need well just pretend that will happily ever after anyway maybe start newsgroup atheist hard bummin much forget your flintstone chewables bake timmons'

In [5]:
# NLTK로부터 불용어를 받아온다.
stop_words = stopwords.words('english')
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split()) # 토큰화
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])
# 불용어를 제거합니다.

In [6]:
print(tokenized_doc[1])

['yeah', 'expect', 'people', 'read', 'actually', 'accept', 'hard', 'atheism', 'need', 'little', 'leap', 'faith', 'jimmy', 'logic', 'runs', 'steam', 'sorry', 'pity', 'sorry', 'feelings', 'denial', 'faith', 'need', 'well', 'pretend', 'happily', 'ever', 'anyway', 'maybe', 'start', 'newsgroup', 'atheist', 'hard', 'bummin', 'much', 'forget', 'flintstone', 'chewables', 'bake', 'timmons']


#### 3) 정수 인코딩과 단어 집합 생성
```corpora.Dictionary()``` : (word_id, word_frequency)의 형태로 변환, word_id는 단어가 정수 인코딩된 값이고, word_frequency는 해당 뉴스에서의 해당 단어의 빈도수를 의미

In [7]:
tokenized_doc[:5]

0    [well, sure, story, seem, biased, disagree, st...
1    [yeah, expect, people, read, actually, accept,...
2    [although, realize, principle, strongest, poin...
3    [notwithstanding, legitimate, fuss, proposal, ...
4    [well, change, scoring, playoff, pool, unfortu...
Name: clean_doc, dtype: object

In [8]:
from gensim import corpora
dictionary = corpora.Dictionary(tokenized_doc)
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
print(corpus[1]) # 수행된 결과에서 두번째 뉴스 출력. 첫번째 문서의 인덱스는 0

[(52, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 2), (67, 1), (68, 1), (69, 1), (70, 1), (71, 2), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 2), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 2), (86, 1), (87, 1), (88, 1), (89, 1)]


In [9]:
print(dictionary[66])   # 66으로 정수 인코딩 되기전 단어

faith


In [10]:
len(dictionary)

64281

#### 4) 모델훈련

In [11]:
import gensim
NUM_TOPICS = 20   # 20개의 토픽, k=20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.012*"people" + 0.009*"would" + 0.007*"jesus" + 0.006*"believe"')
(1, '0.017*"armenian" + 0.017*"said" + 0.015*"armenians" + 0.012*"people"')
(2, '0.006*"science" + 0.006*"book" + 0.005*"books" + 0.004*"theory"')
(3, '0.022*"space" + 0.009*"university" + 0.009*"nasa" + 0.006*"april"')
(4, '0.035*"health" + 0.026*"medical" + 0.018*"disease" + 0.017*"pain"')
(5, '0.029*"israel" + 0.027*"jews" + 0.018*"israeli" + 0.014*"jewish"')
(6, '0.013*"power" + 0.012*"bike" + 0.009*"used" + 0.009*"engine"')
(7, '0.044*"file" + 0.029*"output" + 0.026*"entry" + 0.019*"program"')
(8, '0.018*"government" + 0.011*"president" + 0.007*"state" + 0.007*"public"')
(9, '0.018*"would" + 0.013*"like" + 0.011*"know" + 0.011*"think"')
(10, '0.010*"available" + 0.008*"window" + 0.008*"also" + 0.008*"windows"')
(11, '0.032*"printf" + 0.026*"water" + 0.011*"compass" + 0.011*"hanging"')
(12, '0.017*"drive" + 0.014*"card" + 0.013*"system" + 0.011*"disk"')
(13, '0.017*"game" + 0.015*"team" + 0.013*"year" + 0.011*"

In [12]:
print(ldamodel.print_topics())

[(0, '0.012*"people" + 0.009*"would" + 0.007*"jesus" + 0.006*"believe" + 0.006*"many" + 0.005*"think" + 0.005*"even" + 0.004*"christian" + 0.004*"also" + 0.004*"life"'), (1, '0.017*"armenian" + 0.017*"said" + 0.015*"armenians" + 0.012*"people" + 0.012*"turkish" + 0.008*"went" + 0.008*"turkey" + 0.007*"killed" + 0.007*"women" + 0.007*"children"'), (2, '0.006*"science" + 0.006*"book" + 0.005*"books" + 0.004*"theory" + 0.004*"scientific" + 0.004*"problem" + 0.003*"reference" + 0.003*"also" + 0.003*"article" + 0.003*"time"'), (3, '0.022*"space" + 0.009*"university" + 0.009*"nasa" + 0.006*"april" + 0.006*"center" + 0.006*"program" + 0.006*"research" + 0.006*"national" + 0.005*"launch" + 0.005*"data"'), (4, '0.035*"health" + 0.026*"medical" + 0.018*"disease" + 0.017*"pain" + 0.015*"patients" + 0.014*"myers" + 0.013*"drugs" + 0.012*"among" + 0.012*"doctor" + 0.012*"drug"'), (5, '0.029*"israel" + 0.027*"jews" + 0.018*"israeli" + 0.014*"jewish" + 0.012*"arab" + 0.009*"peace" + 0.008*"nazi" + 0.

#### 5) LDA 시각화

In [18]:
# pip install pyLDAvis
import pyLDAvis.gensim_models

In [19]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

  by='saliency', ascending=False).head(R).drop('saliency', 1)
