In [13]:
import matplotlib.pyplot as plt
import mglearn
import numpy as np
from sklearn.datasets import load_files
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

In [None]:
# puple machine - 07.LDA

- 영화 리뷰 데이터셋
- 비지도 학습 모델에서 분석 결과가 왜곡되는 것을 방지하기 위해 자주 나타나는 단어 등을 제거

In [2]:
# Load the IMDb training reviews. load_files returns a Bunch whose .data holds
# the raw documents and whose .target holds the matching labels.
# NOTE(review): this is an absolute, machine-specific path -- consider a
# configurable data directory so the notebook runs elsewhere.
reviews_train = load_files(
    "C:/Users/Adonishan/basic/introduction_to_ml_with_python-master/data/aclImdb/train/"
)
text_train = reviews_train.data
y_train = reviews_train.target

# Quick sanity check of what was loaded: container type, size, one sample.
for report_line in (
    "type of text_train: {}".format(type(text_train)),
    "length of text_train: {}".format(len(text_train)),
    "text_train[6]:\n{}".format(text_train[6]),
):
    print(report_line)

type of text_train: <class 'list'>
length of text_train: 75000
text_train[6]:
b'Gloomy Sunday - Ein Lied von Liebe und Tod directed by Rolf Sch\xc3\xbcbel in 1999 is a romantic, absorbing, beautiful, and heartbreaking movie. It started like Jules and Jim; it ended as one of Agatha Christie\'s books, and in between it said something about love, friendship, devotion, jealousy, war, Holocaust, dignity, and betrayal, and it did better than The Black Book which is much more popular. It is not perfect, and it made me, a cynic, wonder in the end on the complexity of the relationships and sensational revelations, and who is who to whom but the movie simply overwhelmed me. Perfect or not, it is unforgettable. All four actors as the parts of the tragic not even a triangle but a rectangle were terrific. I do believe that three men could fell deeply for one girl as beautiful and dignified as Ilona in a star-making performance by young Hungarian actress Erica Marozs\xc3\xa1n and who would not? The 

- 전체 문서의 15%를 넘게 등장하는 단어는 삭제 (max_df=.15)
- 남은 단어 중 가장 많이 등장하는 10,000개로 Bag of Words 모델 생성 (max_features=10000)

In [5]:
# Bag-of-words model: ignore terms that occur in more than 15% of the
# documents (max_df) and keep at most 10,000 features (max_features).
vect = CountVectorizer(max_df=.15, max_features=10000)
# Learn the vocabulary and build the document-term matrix in one pass.
X = vect.fit_transform(text_train)

## LDA function build

- We build the model and transform the data in one step. (모델 생성과 변환을 한 번에)
- Computing transform takes some time, and we can save time by doing both at once(변환 시간이 좀 걸리므로 시간을 절약하기 위해 동시처리) 
- n_components = 10 (구버전 scikit-learn에서는 n_topics); 10개의 토픽으로 토픽 모델을 학습한다. 토픽의 수를 바꾸면 모든 토픽이 바뀐다.
- batch 방법을 사용
- max_iter :기본값 10; 모델 성능을 위해 max_iter 값 증가.

In [6]:
# LDA with 10 topics.
# `n_components` replaces the old `n_topics` keyword, which was deprecated in
# scikit-learn 0.19 and later removed; passing `n_topics` fails on current
# releases. LatentDirichletAllocation is already imported in the first cell.
# learning_method="batch" is used, and max_iter is raised to 25 to improve
# the model.
lda = LatentDirichletAllocation(n_components=10, learning_method="batch",
                                max_iter=25, random_state=0)

# Build the model and transform the data in one step; the transform takes
# some time, so doing both at once saves work.
document_topics = lda.fit_transform(X)



In [7]:
# components_ has one row per topic and one column per vocabulary term.
components_shape = lda.components_.shape
print("lda.components_.shape: {}".format(components_shape))

lda.components_.shape: (10, 10000)


In [10]:
# For each topic (a row of components_), argsort orders the feature indices
# by ascending weight; reversing every row with [:, ::-1] makes the order
# descending, so column 0 is the highest-weighted word of each topic.
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]

# Get the feature names from the vectorizer. get_feature_names() was
# deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(), so prefer the new method and only fall back to the
# old one on releases that do not have it yet.
if hasattr(vect, "get_feature_names_out"):
    feature_names = np.array(vect.get_feature_names_out())
else:
    feature_names = np.array(vect.get_feature_names())

In [None]:
# Print the 10 topics with mglearn.
# topics_per_chunk is the number of topics printed side by side per chunk.
# The string below only quotes the source of mglearn's print_topics for
# reference; it is a plain string literal and defines nothing.
'''
mglearn/tool.py
def print_topics(topics, feature_names, sorting, topics_per_chunk=6,
                 n_words=20):
    for i in range(0, len(topics), topics_per_chunk):
        # for each chunk:
        these_topics = topics[i: i + topics_per_chunk]
        # maybe we have less than topics_per_chunk left
        len_this_chunk = len(these_topics)
        # print topic headers
        print(("topic {:<8}" * len_this_chunk).format(*these_topics))
        print(("-------- {0:<5}" * len_this_chunk).format(""))
        # print top n_words frequent words
        for i in range(n_words):
            try:
                print(("{:<14}" * len_this_chunk).format(
                    *feature_names[sorting[these_topics, i]]))
            except:
                pass
        print("\n")
'''

In [16]:
# Show the ten highest-weighted words of each of the 10 topics,
# five topics side by side per chunk.
topic_ids = range(10)
mglearn.tools.print_topics(
    topics=topic_ids,
    feature_names=feature_names,
    sorting=sorting,
    topics_per_chunk=5,
    n_words=10,
)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
action        guy           war           show          didn          
game          re            world         series        thought       
effects       around        american      tv            actors        
special       nothing       us            episode       nothing       
fight         sex           our           years         am            
fi            looks         documentary   old           10            
sci           pretty        history       now           worst         
10            look          real          kids          going         
star          worst         black         dvd           got           
alien         stupid        america       shows         actually      


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      
dire

- 결과를 보면 
- 0: SF/액션물 (action, effects, sci, fi, alien)
- 1: 일반적인 단어 (worst, stupid 등 부정적인 평가가 섞여 있음)
- 2: 역사와 전쟁 영화/다큐멘터리처럼 보이고
- 3: TV 시리즈물
- 4: 일반적인 단어
- 5: 영화용어 (약함)
- 6: 키드물
- 7 : 영화용어 (약함)
- 8 : 리뷰
- 9 : 영화용어 (약함)

# 100개의 토픽 LDA
- 재밌는 것만 뽑기.
- 예를 들어 음악 관련 토픽 45의 가중치로 문서를 정렬한다.
    -  이 토픽의 비중이 가장 큰 문서 다섯 개를 출력한다.
    -  각 문서의 첫 두 문장을 출력한다.

In [None]:
# LDA with 100 topics, otherwise configured like the 10-topic model.
# `n_components` replaces the old `n_topics` keyword, which was deprecated in
# scikit-learn 0.19 and later removed.
lda100 = LatentDirichletAllocation(n_components=100, learning_method="batch",
                                   max_iter=25, random_state=0)
# Fit the model and obtain the per-document topic weights in one step.
document_topics100 = lda100.fit_transform(X)

In [None]:
# Hand-picked subset of the 100 topics to look at more closely.
topics = np.array(
    (7, 16, 24, 25, 28, 36, 37, 41, 45, 51, 53, 54, 63, 89, 97)
)

In [None]:
# Rank the vocabulary by weight within each of the 100 topics (descending).
# NOTE: this rebinds `sorting` and `feature_names` from the 10-topic section;
# the bar-chart cell further down relies on this 100-topic `sorting`.
sorting = np.argsort(lda100.components_, axis=1)[:, ::-1]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); prefer the new method and fall back to
# the old one only on releases that do not have it.
if hasattr(vect, "get_feature_names_out"):
    feature_names = np.array(vect.get_feature_names_out())
else:
    feature_names = np.array(vect.get_feature_names())

# Print the 20 highest-weighted words of each selected topic, five per chunk.
mglearn.tools.print_topics(topics=topics, feature_names=feature_names,
                           sorting=sorting, topics_per_chunk=5, n_words=20)

In [None]:
# Index of the "music" topic in the 100-topic model and how many of its
# top documents to show.
MUSIC_TOPIC = 45
N_TOP_DOCUMENTS = 5

# Sort the documents by the weight of the music topic, highest weight first.
music = np.argsort(document_topics100[:, MUSIC_TOPIC])[::-1]
# Print the five documents where the topic is most important. The loop used
# to slice [:10], which printed ten documents although both this comment and
# the accompanying notes say five.
for i in music[:N_TOP_DOCUMENTS]:
    # Show the first two sentences. The reviews are bytes, so the split and
    # join use byte strings.
    print(b".".join(text_train[i].split(b".")[:2]) + b".\n")

In [None]:
# Two-column horizontal bar chart of the total weight of each of the 100
# topics, summed over all documents (topics 0-49 left, 50-99 right).
# `plt` is matplotlib.pyplot, imported in the notebook's first cell.
fig, ax = plt.subplots(1, 2, figsize=(10, 10))
# Label every topic with its index and its two highest-weighted words.
topic_names = ["{:>2} ".format(i) + " ".join(words)
               for i, words in enumerate(feature_names[sorting[:, :2]])]
# The per-topic totals do not depend on the column, so compute them once
# instead of once per loop iteration.
topic_weights = np.sum(document_topics100, axis=0)
# two column bar chart:
for col in [0, 1]:
    start = col * 50
    end = (col + 1) * 50
    ax[col].barh(np.arange(50), topic_weights[start:end])
    ax[col].set_yticks(np.arange(50))
    ax[col].set_yticklabels(topic_names[start:end], ha="left", va="top")
    # Put the lowest topic index at the top of each column.
    ax[col].invert_yaxis()
    ax[col].set_xlim(0, 2000)
    ax[col].set_xlabel("topic weight summed over all documents")
    ax[col].set_title("topics {}-{}".format(start, end - 1))
    # The tick labels are left-aligned, so pad them away from the axis to
    # leave room for the label text.
    yax = ax[col].get_yaxis()
    yax.set_tick_params(pad=130)
plt.tight_layout()

### 인사이트
- 빈도가 많은 단어는 불용어에 가깝고
- 어떤 토픽은 부정적인 단어들로, 어떤 토픽은 칭찬하는 단어들로 구성되어 있다는 것을 알아낼 수 있다.
- 특정 영화에 대한 의견이거나 평가 점수를 합리화 하거나 강조하기 위한 댓글이라는 사실이 재미있다.
- LDA라는 토픽 모델은 라벨이 없거나, 있어도 상관없이 텍스트 말뭉치를 해석하는 데 도움을 준다.
- LDA는 확률적 알고리즘이기때문에 random_state 를 바꾸면 결과가 바뀌니 주의하자.
- 비지도 학습이기 때문에 결과는 보수적으로 평가해야 하며, 해당 토픽에 속한 문서를 직접 보지 않고 판단하는 것은 좋지 않다.