# 토픽 모델링 - LDA

## 20 News Group 데이터 사례

In [1]:
import numpy as np
import pandas as pd

In [4]:
from sklearn.datasets import fetch_20newsgroups

# Load the 'all' subset of 20 Newsgroups, asking the loader to remove
# headers, footers and quoted text from every post.
news = fetch_20newsgroups(
    subset='all',
    random_state=2021,
    remove=('headers', 'footers', 'quotes'),
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [5]:
# One row per post, with the raw text in a single 'article' column.
df = pd.DataFrame(news.data, columns=['article'])
df.shape

(18846, 1)

In [6]:
# Replace every character that is not an ASCII letter with a space.
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats
# the pattern as a literal string by default, which would leave the text as is.
df['article'] = df.article.str.replace(r'[^A-Za-z]', ' ', regex=True)

In [9]:
# Lower-case every word and drop words of three characters or fewer.
def _keep_long_words(text):
    kept = [word.lower() for word in text.split() if len(word) > 3]
    return ' '.join(kept)

df['article'] = df['article'].apply(_keep_long_words)
df.article[0][:1000]

'just case original poster looking serious answer supply even when steering hands something quite similar countersteering basically turn left quick wiggle bike right first causing counteracting lean occur left more difficult motorcycle than bicycle though because extra weight motorcycle heavy maybe yous'

- NLTK의 불용어 목록을 사용해 단어 토큰화 및 불용어 제거

In [10]:
import nltk
# Download the NLTK 'stopwords' corpus.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
# Stop-word removal: split each article on whitespace and keep the words that
# are not English stop words.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Membership is tested once per word of every article; a set makes each test
# O(1) instead of scanning the stop-word list. stop_words itself stays a list.
stop_word_set = set(stop_words)
tokenized_doc = df.article.apply(lambda x: [w for w in x.split() if w not in stop_word_set])

In [14]:
# Preview the token lists of the first five articles.
tokenized_doc[:5]

0    [case, original, poster, looking, serious, ans...
1    [thinking, sending, magazine, idea, parody, bo...
2    [dreamed, great, judgment, morning, dawned, tr...
3    [file, bignums, ripem, last, updated, april, r...
4    [peanut, butter, definitely, favorite, think, ...
Name: article, dtype: object

## 정수 인코딩과 단어 집합 만들기 - gensim

In [17]:
from gensim import corpora
# Build the vocabulary from the tokenized articles (integer id <-> token).
dictionary = corpora.Dictionary(tokenized_doc)

In [18]:
# Convert every tokenized article to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, tokenized_doc))

In [19]:
# Show the bag-of-words entry of the first article.
print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 2), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1)]


In [21]:
# Look up the tokens stored under ids 0 and 1.
dictionary[0], dictionary[1]

('answer', 'basically')

## LDA 모델 훈련

In [24]:
from gensim.models.ldamodel import LdaModel
# Number of topics requested from the LDA model.
NUM_TOPICS = 20

In [None]:
# Train LDA on the bag-of-words corpus (LdaModel is
# gensim.models.ldamodel.LdaModel), then print each topic summarised by
# four words.
ldamodel = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    passes=20,
    random_state=2021,
)
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

## 훈련 결과 시각화

In [29]:
# The latest pyLDAvis release did not match the Colab version at the time of
# writing ("21.9", presumably September 2021), so version 2.1.2 is pinned.
!pip install pyLDAvis==2.1.2 > /dev/null

In [33]:
import pyLDAvis.gensim
# Switch pyLDAvis to notebook mode, prepare the visualization data from the
# trained model, corpus and dictionary, and display it.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

In [51]:
# Save the 20-topic visualization as an HTML file.
pyLDAvis.save_html(vis, 'news_group_20.html')

## 문서별 토픽 분포

In [34]:
# Print the topic distribution of the first five documents.
for i, topic_list in enumerate(ldamodel[corpus]):
    if i==5:  # documents 0-4 have been printed; stop here
        break
    print(i,'번째 문서의 topic 비율은',topic_list)

0 번째 문서의 topic 비율은 [(1, 0.21060997), (14, 0.76570576)]
1 번째 문서의 topic 비율은 [(1, 0.0484265), (3, 0.17474385), (5, 0.17966925), (7, 0.031786557), (11, 0.060401138), (12, 0.091925785), (14, 0.34559038), (15, 0.024157342), (19, 0.037320927)]
2 번째 문서의 topic 비율은 [(6, 0.36186185), (12, 0.12357302), (14, 0.45935187), (15, 0.027110968)]
3 번째 문서의 topic 비율은 [(5, 0.02760349), (7, 0.07531616), (8, 0.025203133), (9, 0.085529715), (10, 0.16337147), (11, 0.015387975), (12, 0.024777709), (14, 0.028632857), (18, 0.11858687), (19, 0.4116757)]
4 번째 문서의 topic 비율은 [(8, 0.31456077), (10, 0.06009591), (12, 0.047664586), (14, 0.5072697), (17, 0.052967172)]


In [41]:
def make_topictable_per_doc(ldamodel, corpus):
    """Build a table holding the dominant topic of every document.

    Args:
        ldamodel: Trained model. ``ldamodel[corpus]`` must yield one result
            per document, and ``ldamodel.per_word_topics`` tells whether each
            result is a sequence whose first element is the document-level
            list of ``(topic_id, proportion)`` pairs.
        corpus: Documents in the form accepted by ``ldamodel[...]``.

    Returns:
        pandas.DataFrame with one row per document, in corpus order, and
        three columns labelled 0, 1 and 2: the id of the topic with the
        largest proportion, that proportion rounded to four decimals, and
        the document-level list of ``(topic_id, proportion)`` pairs.
        A document with no reported topic keeps its row (missing values and
        an empty list) so row positions always equal document numbers.
    """
    rows = []
    for result in ldamodel[corpus]:
        # With per_word_topics set, element 0 of the result is the
        # document-level distribution; the rest is not topic proportions
        # and therefore does not belong in the table.
        doc_topics = result[0] if ldamodel.per_word_topics else result
        if not doc_topics:
            # Keep a placeholder so later rows are not shifted up by one.
            rows.append([None, None, doc_topics])
            continue
        # max() returns the first pair with the largest proportion, the same
        # pair a stable descending sort would put first.
        top_id, top_prop = max(doc_topics, key=lambda pair: pair[1])
        rows.append([int(top_id), round(top_prop, 4), doc_topics])
    # Explicit labels keep the three-column layout even for an empty corpus.
    return pd.DataFrame(rows, columns=range(3))

In [42]:
# Build the per-document table. reset_index() adds the row index as an extra
# leading column, which serves as the document number.
column_names = ['문서 번호', '가장 비중이 높은 토픽', '가장 높은 토픽의 비중', '각 토픽의 비중']
topictable = make_topictable_per_doc(ldamodel, corpus).reset_index()
topictable.columns = column_names
topictable.head(10)

Unnamed: 0,문서 번호,가장 비중이 높은 토픽,가장 높은 토픽의 비중,각 토픽의 비중
0,0,14,0.7657,"[(1, 0.21057051), (14, 0.76574534)]"
1,1,14,0.3456,"[(1, 0.048424587), (3, 0.17474413), (5, 0.1796..."
2,2,14,0.4594,"[(6, 0.36186153), (12, 0.12357297), (14, 0.459..."
3,3,19,0.4117,"[(5, 0.027603805), (7, 0.07531492), (8, 0.0252..."
4,4,14,0.5073,"[(8, 0.3145604), (10, 0.06012607), (12, 0.0476..."
5,5,12,0.3768,"[(0, 0.014733174), (2, 0.011818085), (5, 0.064..."
6,6,7,0.4282,"[(1, 0.21829434), (7, 0.4282409), (14, 0.20107..."
7,7,12,0.4394,"[(6, 0.39674428), (8, 0.023296045), (12, 0.439..."
8,8,5,0.4024,"[(0, 0.03006662), (3, 0.049999997), (5, 0.4024..."
9,9,14,0.2928,"[(5, 0.07132247), (7, 0.22602163), (11, 0.1598..."


## NUM_TOPICS = 24

In [43]:
# Train a second model with 24 topics on the same corpus and dictionary;
# this rebinds `ldamodel`.
ldamodel = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=24,
    passes=20,
    random_state=2021,
)

In [44]:
# Print the topics of the 24-topic model, four words per topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(4, '0.024*"book" + 0.018*"books" + 0.006*"pages" + 0.006*"edition"')
(2, '0.014*"people" + 0.009*"israel" + 0.007*"would" + 0.007*"government"')
(17, '0.013*"year" + 0.011*"runs" + 0.009*"clutch" + 0.008*"average"')
(14, '0.018*"would" + 0.013*"like" + 0.009*"time" + 0.009*"think"')
(0, '0.012*"appears" + 0.009*"cover" + 0.009*"wolverine" + 0.008*"espn"')
(1, '0.015*"cancer" + 0.010*"aids" + 0.009*"vitamin" + 0.009*"doctor"')
(23, '0.017*"health" + 0.017*"medical" + 0.010*"drug" + 0.009*"research"')
(10, '0.019*"mail" + 0.015*"information" + 0.015*"please" + 0.012*"send"')
(16, '0.039*"file" + 0.024*"jpeg" + 0.018*"files" + 0.017*"format"')
(9, '0.021*"windows" + 0.013*"window" + 0.011*"file" + 0.010*"server"')
(5, '0.021*"president" + 0.013*"think" + 0.012*"going" + 0.011*"stephanopoulos"')
(20, '0.007*"state" + 0.006*"states" + 0.005*"national" + 0.005*"american"')
(13, '0.021*"entry" + 0.021*"output" + 0.014*"file" + 0.011*"jumper"')
(6, '0.029*"jesus" + 0.019*"church" + 0.017*"chr

In [45]:
# Prepare and display the visualization for the 24-topic model.
pyLDAvis.enable_notebook()
vis2 = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis2)

In [49]:
# Save the 24-topic visualization as an HTML file.
pyLDAvis.save_html(vis2, 'news_group_24.html')