In [24]:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import lda
import numpy as np
import mglearn
import matplotlib.pyplot as plt
import re

In [40]:
docs = open("data/thesis/news2.txt").read()
docs2 = open("data/thesis/news.txt").read()

In [41]:
documents = [docs,docs2]

In [43]:
documents[0]



In [44]:
stoplist = set('for a of the and to in'.split())

In [47]:
texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents]

In [48]:
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

In [49]:
texts = [[token for token in text if frequency[token] > 1] for text in texts]

In [50]:
from pprint import pprint

In [51]:
pprint(texts)

[['west',
  'florida',
  'donald',
  'trump',
  'view',
  'on',
  'wednesday',
  'his',
  'planned',
  'talks',
  'with',
  'north',
  'kim',
  'jong',
  'what',
  'his',
  'could',
  'he',
  'maintained',
  'be',
  'stand',
  'up',
  'leave',
  'highly',
  'anticipated',
  'summit',
  'should',
  'meeting',
  'fall',
  'short',
  'his',
  '"if',
  'we',
  "it's",
  'going',
  'be',
  'we',
  'have',
  'it,"',
  'president',
  'alongside',
  'his',
  'japanese',
  'at',
  "trump's",
  'mar-a-lago',
  '"if',
  'meeting',
  'when',
  'there',
  "isn't",
  'i',
  'will',
  'leave',
  'content',
  'from',
  'go',
  'content',
  'by',
  'international',
  'trade',
  '"i',
  'like',
  'trump',
  'said.',
  'trump',
  'also',
  'maintained',
  'that',
  'his',
  'position',
  'with',
  'than',
  'been',
  'position',
  'like',
  'this',
  'with',
  'that',
  'whether',
  "it's",
  'or',
  'he',
  'said.',
  'trump',
  'says',
  '5',
  'locations',
  'are',
  'being',
  'considered',
  'kim',


In [13]:
def preprocessing(content):
    content = re.sub('\\xa0', '', content)
    content = re.sub('\\n', '', content)
    content = re.sub('\\\\xa0', '', content)
    content = re.sub('\\\\n', '', content)
    content = re.sub('[\{\}\[\]\/?.,;:|\)*~`!^\-_+<>@#$%&\\\=\(\'\")]', '', content)
    content = ' '.join(content.split())
                     
    return content

docs_ko=preprocessing(docs)

## 전처리과정

In [14]:
docs_ko = [docs_ko]

## list 변환

In [15]:
print(docs_ko)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [16]:
vect = CountVectorizer(max_features=100,stop_words="english").fit(docs_ko)
X = vect.fit_transform(docs_ko)

In [19]:
model = lda.LDA(n_topics=20, n_iter=1500, random_state=1)

In [21]:
model.fit(X)

INFO:lda:n_documents: 1
INFO:lda:vocab_size: 100
INFO:lda:n_words: 156537
INFO:lda:n_topics: 20
INFO:lda:n_iter: 1500
INFO:lda:<0> log likelihood: -1139676
INFO:lda:<10> log likelihood: -1100049
INFO:lda:<20> log likelihood: -1065824
INFO:lda:<30> log likelihood: -1036405
INFO:lda:<40> log likelihood: -1011233
INFO:lda:<50> log likelihood: -990975
INFO:lda:<60> log likelihood: -969200
INFO:lda:<70> log likelihood: -951836
INFO:lda:<80> log likelihood: -936328
INFO:lda:<90> log likelihood: -921856
INFO:lda:<100> log likelihood: -907221
INFO:lda:<110> log likelihood: -892683
INFO:lda:<120> log likelihood: -880148
INFO:lda:<130> log likelihood: -872359
INFO:lda:<140> log likelihood: -862508
INFO:lda:<150> log likelihood: -854593
INFO:lda:<160> log likelihood: -847932
INFO:lda:<170> log likelihood: -838209
INFO:lda:<180> log likelihood: -831416
INFO:lda:<190> log likelihood: -825171
INFO:lda:<200> log likelihood: -824519
INFO:lda:<210> log likelihood: -820831
INFO:lda:<220> log likelihood:

<lda.lda.LDA at 0x22f5a03d9e8>

In [22]:
topic_test = model.transform(X)

In [23]:
for topics in topic_test:
    print("{}".format(topics))

[ 0.26480761  0.08775402  0.01251913  0.02002519  0.0173036   0.01088763
  0.02029978  0.01019877  0.00423731  0.01070678  0.02386365  0.01564012
  0.00788722  0.01633686  0.00814883  0.02308263  0.02669746  0.05688173
  0.09201588  0.27070581]


## stop_words 는 불용어를 제거시켜준다

## data를 BOW형 으로 변환

In [9]:
lda = LatentDirichletAllocation(n_components=10, learning_method="online",
                               max_iter=25, random_state=0)
documents_topics = lda.fit_transform(X)

In [10]:
documents_topics

array([[  6.38852768e-07,   6.38847717e-07,   2.54777877e-01,
          6.38901069e-07,   6.38937732e-07,   6.38943252e-07,
          1.73611265e-01,   1.54235494e-01,   1.93782608e-01,
          2.23589561e-01]])

## lda 변환

In [77]:
print("lda.components_.shape: {}".format(lda.components_.shape))

lda.components_.shape: (10, 100)


In [78]:
sorting = np.argsort(lda.components_,axis=1)[:, ::-1]
feature_names = np.array(vect.get_feature_names())

## 행의 특성을 내림차순으로 정렬 
## Countervectorizer 객체에서 특성 이름 추출

In [79]:
mglearn.tools.print_topics(topics=range(10), feature_names=feature_names, sorting=sorting, topics_per_chunk=5, n_words=5)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
영화            영화            영화            영화            영화            
정말            진짜            너무            정말            정말            
너무            정말            정말            너무            너무            
진짜            너무            진짜            진짜            진짜            
그냥            그냥            이런            그냥            영화를           


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      
영화            영화            영화            영화            영화            
정말            정말            정말            너무            너무            
너무            너무            진짜            정말            진짜            
진짜            진짜            너무            진짜            정말            
그냥            영화를           그냥            이런            이런            


