In [2]:
""" BoW 행렬 만들기 """
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Load the review dataset from the CSV file in the working directory.
df = pd.read_csv('movie_data.csv', encoding='utf-8')

# Bag-of-words vectorizer settings:
#   * max_features=5000 -> keep only the 5000 most frequent terms.
#   * max_df=.1         -> maximum document frequency; terms that occur in
#                          more than 10% of the documents are excluded as
#                          too common.
#   * stop_words        -> use the 'english' stop-word setting.
count = CountVectorizer(
    max_features=5000,
    max_df=.1,
    stop_words='english',
)

# Learn the vocabulary and build the document-term count matrix in one step.
X = count.fit_transform(df['review'].values)

In [3]:
""" LDA 추정기를 BoW 행렬에 학습 """
from sklearn.decomposition import LatentDirichletAllocation

# LDA estimator with 10 topics. random_state is pinned so repeated runs give
# the same topics; learning_method='batch' selects the batch update variant
# rather than the online one.
lda = LatentDirichletAllocation(
    learning_method='batch',
    random_state=123,
    n_components=10,
)

# Fit on the bag-of-words matrix and obtain the per-document topic weights.
# Per the original author's note the result is (50000, 10): 50,000 documents
# by 10 topics -- the row count depends on movie_data.csv; TODO confirm.
X_topics = lda.fit_transform(X)

In [4]:
""" 토픽에서 가장 중요한 단어 추려보기 """

# Number of highest-weighted words to show for each topic.
n_top_words = 5

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; get_feature_names_out() is its replacement. Prefer the new
# method and fall back to the old one so the cell also runs on pre-1.0 installs.
if hasattr(count, 'get_feature_names_out'):
    feature_names = count.get_feature_names_out()
else:
    feature_names = count.get_feature_names()

# Each row of lda.components_ holds one topic's weight for every vocabulary term.
for topic_idx, topic in enumerate(lda.components_):
    # argsort() is ascending, so the reversed slice picks the n_top_words
    # largest weights, ordered from most to least important.
    top_word_ids = topic.argsort()[:-n_top_words - 1:-1]
    print("토픽 %d : " % (topic_idx + 1))
    print(" ".join([feature_names[i] for i in top_word_ids]))
    

토픽 1 : 
worst minutes awful script stupid
토픽 2 : 
family mother father girl children
토픽 3 : 
american war dvd music tv
토픽 4 : 
human audience cinema art feel
토픽 5 : 
police guy car dead murder
토픽 6 : 
horror house sex gore blood
토픽 7 : 
role performance comedy actor performances
토픽 8 : 
series episode episodes war tv
토픽 9 : 
book version original effects read
토픽 10 : 
action fight guy guys cool


In [7]:
""" 공포로 추정되는 6번 토픽 리뷰 확인해보기 """

# Column 5 of X_topics is the 6th topic (0-based index), the one whose top
# words suggest horror. argsort() is ascending, so reverse it to rank the
# documents from the highest weight on that topic to the lowest.
horror = X_topics[:, 5].argsort()[::-1]

# Show the first 300 characters of the three reviews that score highest.
for iter_idx, movie_idx in enumerate(horror[:3]):
    print('\n공포 영화 #%d: ' % (iter_idx + 1))
    # movie_idx is a row *position* produced by argsort(), so use .iloc
    # (positional) rather than label-based [] lookup; the two only agree while
    # df keeps its default RangeIndex.
    print(df['review'].iloc[movie_idx][:300], '...')


공포 영화 #1: 
House of Dracula works from the same basic premise as House of Frankenstein from the year before; namely that Universal's three most famous monsters; Dracula, Frankenstein's Monster and The Wolf Man are appearing in the movie together. Naturally, the film is rather messy therefore, but the fact that ...

공포 영화 #2: 
This film marked the end of the "serious" Universal Monsters era (Abbott and Costello meet up with the monsters later in "Abbott and Costello Meet Frankentstein"). It was a somewhat desparate, yet fun attempt to revive the classic monsters of the Wolf Man, Frankenstein's monster, and Dracula one "la ...

공포 영화 #3: 
This film marked the end of the "serious" Universal Monsters era (Abbott and Costello meet up with the monsters later in "Abbott and Costello Meet Frankentstein"). It was a somewhat desparate, yet fun attempt to revive the classic monsters of the Wolf Man, Frankenstein's monster, and Dracula one "la ...
