토픽 모델링이란 문서 속에 숨어 있는 주제를 찾아내는 것.

많은 양의 문서가 있을 때 사람이 이 문서를 다 읽고 핵심 주제를 찾는 것은 매우 많은 시간이 소모되므로 머신러닝 기반의 토픽 모델링을 적용해 숨어 있는 중요 주제를 효과적으로 찾아낼 수 있다.

In [5]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

cats = ['rec.motorcycles','rec.sport.baseball','comp.graphics','comp.windows.x','talk.politics.mideast','soc.religion.christian','sci.electronics','sci.med']

news_df = fetch_20newsgroups(subset='all',remove=('header','footer','quotes'), categories=cats, random_state=0)

In [6]:
# 피처 벡터화
count_vect = CountVectorizer(max_df=0.95, max_features=1000, min_df=2, stop_words='english', ngram_range=(1,2))
feat_vect = count_vect.fit_transform(news_df.data)
print('CountVectorizer Shape:', feat_vect.shape)

CountVectorizer Shape: (7862, 1000)


In [7]:
# 7862개의 문서가 1000개의 피처로 구성되었다.

In [8]:
# LDA 토픽 모델링
lda = LatentDirichletAllocation(n_components=8, random_state=0) # 토픽은 뉴스그룹에서 추출한 주제와 동일하게 8개
lda.fit(feat_vect)

LatentDirichletAllocation(n_components=8, random_state=0)

In [11]:
print(lda.components_.shape) # components_는 개별 토픽별로 각 word 피처가 얼마나 많이 그 토픽에 할당되었는지에 대한 수치를 가지고 있다.
lda.components_

(8, 1000)


array([[3.13483523e+02, 1.80694863e+02, 4.03990113e+01, ...,
        8.44355322e+01, 1.25001932e-01, 1.25001646e-01],
       [9.44043581e+01, 1.44080013e+01, 1.30242052e+02, ...,
        7.82561496e+01, 1.25016648e-01, 1.25014688e-01],
       [4.44766267e+00, 5.88923250e-01, 1.25170282e-01, ...,
        4.02190900e+01, 1.25003957e-01, 1.25003865e-01],
       ...,
       [9.40699411e+00, 4.55386620e+01, 1.25194439e-01, ...,
        3.24877033e+01, 1.25029032e-01, 1.25000579e-01],
       [2.10365037e+01, 7.02607423e+00, 1.35839302e+01, ...,
        2.20584629e+01, 1.25014981e-01, 1.25001025e-01],
       [1.25057672e-01, 1.25115975e-01, 1.25009770e-01, ...,
        4.22456002e+01, 1.25001380e-01, 1.25001457e-01]])

In [20]:
# CountVectorizer 객체 내의 전체 word 명칭을 get_feature_names()를 통해 추출
feature_names = count_vect.get_feature_names()

# 토픽별로 연관도가 높은 순으로 word 나열
def display_topics(model, feature_names, no_top_words):
    for topic_index, topic in enumerate(model.components_):
        print('Topic #', topic_index)
        
        topic_word_indexes = topic.argsort()[::-1] # 내림차순 정렬한 후 그 인덱스
        top_indexes = topic_word_indexes[:no_top_words]
        
        feature_concat = ' , '.join([feature_names[i] +'*'+ str(round(topic[i],1)) for i in top_indexes])
        print(feature_concat)

display_topics(lda, feature_names, 15)

Topic # 0
edu*568.9 , medical*526.9 , 10*475.4 , health*433.0 , information*422.0 , 1993*409.5 , research*402.2 , pitt*350.1 , disease*343.1 , cancer*341.1 , pitt edu*330.1 , 00*313.5 , new*306.1 , patients*303.9 , 12*298.7
Topic # 1
said*809.1 , people*577.7 , know*572.9 , don*571.2 , didn*531.2 , just*476.0 , went*440.7 , like*404.6 , say*362.0 , time*357.4 , did*352.0 , going*348.2 , came*340.3 , told*336.7 , ac*330.4
Topic # 2
don*937.1 , use*921.2 , just*859.0 , like*826.3 , good*689.6 , time*680.1 , make*644.4 , know*626.4 , way*588.6 , think*577.3 , does*556.0 , used*517.2 , organization*511.9 , people*495.7 , ve*488.9
Topic # 3
file*1522.9 , edu*1437.7 , image*1342.1 , graphics*1253.3 , program*1099.0 , use*980.7 , available*949.4 , window*941.2 , mit*902.9 , software*875.1 , windows*866.8 , ftp*852.1 , version*833.9 , jpeg*829.1 , server*809.1
Topic # 4
armenian*961.1 , israel*928.7 , turkish*917.8 , jews*701.9 , armenians*700.9 , people*655.0 , israeli*568.2 , jewish*547.8 , 

In [21]:
# 어떤 주제인지 해석이 애매한 경우도 있다.

In [22]:
# 개별 문서별 토픽 분포 확인
doc_topics = lda.transform(feat_vect)
print(doc_topics.shape)
print(doc_topics[:3])

(7862, 8)
[[0.00781693 0.00781854 0.19349207 0.00781857 0.00782114 0.00781901
  0.75958811 0.00782563]
 [0.24882762 0.07501554 0.31407259 0.00181456 0.00181406 0.35482677
  0.00181459 0.00181425]
 [0.00379355 0.00379119 0.55224446 0.10064123 0.00379194 0.00379369
  0.328152   0.00379195]]


In [None]:
# # 7862개의 문서가 각 8개의 토픽으로 구성되었다.

In [24]:
# 개별 문서별 토픽 분포도를 출력
def get_filename_list(newsdata):
    filename_list=[]

    for file in newsdata.filenames:
            #print(file)
            filename_temp = file.split('\\')[-2:]
            filename = '.'.join(filename_temp)
            filename_list.append(filename)
    
    return filename_list

filename_list = get_filename_list(news_df)
print("filename 개수:",len(filename_list), "filename list 10개만:",filename_list[:10])

filename 개수: 7862 filename list 10개만: ['soc.religion.christian.20630', 'sci.med.59422', 'comp.graphics.38765', 'comp.graphics.38810', 'sci.med.59449', 'comp.graphics.38461', 'comp.windows.x.66959', 'rec.motorcycles.104487', 'sci.electronics.53875', 'sci.electronics.53617']


In [25]:
# DataFrame으로 생성하여 문서별 토픽 분포도 확인
import pandas as pd 

topic_names = ['Topic #'+ str(i) for i in range(0, 8)]
doc_topic_df = pd.DataFrame(data=doc_topics, columns=topic_names, index=filename_list)
doc_topic_df.head(20)

Unnamed: 0,Topic #0,Topic #1,Topic #2,Topic #3,Topic #4,Topic #5,Topic #6,Topic #7
soc.religion.christian.20630,0.007817,0.007819,0.193492,0.007819,0.007821,0.007819,0.759588,0.007826
sci.med.59422,0.248828,0.075016,0.314073,0.001815,0.001814,0.354827,0.001815,0.001814
comp.graphics.38765,0.003794,0.003791,0.552244,0.100641,0.003792,0.003794,0.328152,0.003792
comp.graphics.38810,0.003577,0.003583,0.322347,0.29448,0.003583,0.003574,0.36528,0.003575
sci.med.59449,0.076528,0.103664,0.661114,0.005212,0.005212,0.005218,0.137832,0.005219
comp.graphics.38461,0.003912,0.06782,0.288198,0.10039,0.003909,0.003911,0.527945,0.003914
comp.windows.x.66959,0.352067,0.012502,0.012503,0.012538,0.012507,0.012507,0.572872,0.012504
rec.motorcycles.104487,0.042139,0.003129,0.003133,0.059844,0.003131,0.677559,0.207938,0.003128
sci.electronics.53875,0.004473,0.004472,0.004468,0.00447,0.004468,0.968712,0.004469,0.004467
sci.electronics.53617,0.009637,0.009619,0.009658,0.009622,0.009619,0.009649,0.932567,0.009629
