# 잠재 디리클레 할당(LDA) 실습2

#### 1) 뉴스 기사 제목 데이터에 대한 이해

[dataset] https://www.kaggle.com/therohk/million-headlines

In [5]:
import pandas as pd

In [6]:
data = pd.read_csv('C:/Users/it/Downloads/dataset/abcnews-date-text.csv', error_bad_lines=False)

In [7]:
print(len(data))
print(data.head(5))

1103663
   publish_date                                      headline_text
0      20030219  aba decides against community broadcasting lic...
1      20030219     act fire witnesses must be aware of defamation
2      20030219     a g calls for infrastructure protection summit
3      20030219           air nz staff in aust strike for pay rise
4      20030219      air nz strike to affect australian travellers


In [8]:
text = pd.DataFrame(data[['headline_text']])
text.head(5)

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


#### 2) 텍스트 전처리

NLTK의 word_tokenize를 통해 단어 토큰화를 수행합니다.

In [9]:
import nltk

In [10]:
text['headline_text'] = text.apply(lambda row: nltk.word_tokenize(row['headline_text']), axis=1)

In [11]:
print(text.head(10))

                                       headline_text
0  [aba, decides, against, community, broadcastin...
1  [act, fire, witnesses, must, be, aware, of, de...
2  [a, g, calls, for, infrastructure, protection,...
3  [air, nz, staff, in, aust, strike, for, pay, r...
4  [air, nz, strike, to, affect, australian, trav...
5            [ambitious, olsson, wins, triple, jump]
6  [antic, delighted, with, record, breaking, barca]
7  [aussie, qualifier, stosur, wastes, four, memp...
8  [aust, addresses, un, security, council, over,...
9  [australia, is, locked, into, war, timetable, ...


불용어를 제거합니다.

In [14]:
from nltk.corpus import stopwords

In [15]:
stop = stopwords.words('english')
text['headline_text'] = text['headline_text'].apply(lambda x: [word for word in x if word not in (stop)])

In [16]:
print(text.head(10))

                                       headline_text
0   [aba, decides, community, broadcasting, licence]
1    [act, fire, witnesses, must, aware, defamation]
2     [g, calls, infrastructure, protection, summit]
3          [air, nz, staff, aust, strike, pay, rise]
4  [air, nz, strike, affect, australian, travellers]
5            [ambitious, olsson, wins, triple, jump]
6        [antic, delighted, record, breaking, barca]
7  [aussie, qualifier, stosur, wastes, four, memp...
8     [aust, addresses, un, security, council, iraq]
9           [australia, locked, war, timetable, opp]


표제어 추출을 수행

In [21]:
from nltk.stem import WordNetLemmatizer
text['headline_text'] = text['headline_text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])
print(text.head(10))

                                       headline_text
0       [aba, decide, community, broadcast, licence]
1      [act, fire, witness, must, aware, defamation]
2      [g, call, infrastructure, protection, summit]
3          [air, nz, staff, aust, strike, pay, rise]
4  [air, nz, strike, affect, australian, travellers]
5             [ambitious, olsson, win, triple, jump]
6             [antic, delight, record, break, barca]
7  [aussie, qualifier, stosur, waste, four, memph...
8       [aust, address, un, security, council, iraq]
9             [australia, lock, war, timetable, opp]


길이가 3이하인 단어에 대해서 제거

In [24]:
tokenized_doc = text['headline_text'].apply(lambda x:[word for word in x if len(word) > 3])
print(tokenized_doc[:5])

0       [decide, community, broadcast, licence]
1      [fire, witness, must, aware, defamation]
2    [call, infrastructure, protection, summit]
3                   [staff, aust, strike, rise]
4      [strike, affect, australian, travellers]
Name: headline_text, dtype: object


#### 3) TF-IDF 행렬 만들기

In [44]:
# 역토큰화 (토큰화 작업을 되돌림)
detokenized_doc = []
for i in range(len(text)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)

text['headline_text'] = detokenized_doc # 다시 text['headline_text']에 재저장

In [45]:
text['headline_text'][:5]

0       decide community broadcast licence
1       fire witness must aware defamation
2    call infrastructure protection summit
3                   staff aust strike rise
4      strike affect australian travellers
Name: headline_text, dtype: object

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features= 1000) # 상위 1,000개의 단어를 보존 
X = vectorizer.fit_transform(text['headline_text'])
X.shape # TF-IDF 행렬의 크기 확인

(1103663, 1000)

#### 4) 토픽 모델링

In [48]:
from sklearn.decomposition import LatentDirichletAllocation
lda_model = LatentDirichletAllocation(n_components=10, learning_method='online', random_state=777, max_iter=1)

In [49]:
lda_top = lda_model.fit_transform(X)

In [50]:
print(lda_model.components_)
print(lda_model.components_.shape) 

[[1.00000703e-01 1.00000829e-01 1.00003578e-01 ... 1.00004871e-01
  1.00003129e-01 1.00002930e-01]
 [1.00001421e-01 8.66862951e+02 1.00008903e-01 ... 1.00004224e-01
  1.00005598e-01 7.01841034e+02]
 [1.00000648e-01 1.00000545e-01 1.00002661e-01 ... 1.00005158e-01
  1.00008596e-01 1.00001987e-01]
 ...
 [1.00001636e-01 1.00000889e-01 2.68570402e+03 ... 1.00003039e-01
  1.00010511e-01 1.00004475e-01]
 [1.00001352e-01 1.00000852e-01 1.00003353e-01 ... 1.00003378e-01
  1.00005211e-01 1.00003635e-01]
 [1.00002244e-01 1.00000967e-01 1.00003675e-01 ... 1.00002444e-01
  1.00003580e-01 1.00004738e-01]]
(10, 1000)


In [54]:
terms = vectorizer.get_feature_names()

def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d: " % (idx+1), [(feature_names[i], topic[i].round(2)) for i in topic.argsort()[:n -1:-1]])
get_topics(lda_model.components_,terms)

Topic 1:  [('government', 8658.95), ('queensland', 8134.58), ('perth', 6332.45), ('year', 5981.93), ('change', 5833.07), ('home', 5751.08), ('china', 4419.14), ('drug', 4189.73), ('final', 3767.14), ('return', 3531.88), ('bank', 3466.13), ('lead', 3463.08), ('students', 2825.23), ('save', 2779.71), ('future', 2733.51), ('million', 2678.8), ('rule', 2585.54), ('western', 2445.04), ('sport', 2432.76), ('company', 2294.38), ('chinese', 2259.32), ('team', 2229.03), ('rugby', 2226.74), ('link', 2141.06), ('know', 2126.45), ('india', 2006.13), ('post', 1938.92), ('dollar', 1937.26), ('best', 1928.72), ('bomb', 1828.05), ('sell', 1810.82), ('point', 1796.86), ('begin', 1734.75), ('safety', 1690.61), ('hand', 1634.98), ('outback', 1633.3), ('union', 1608.74), ('music', 1501.75), ('send', 1463.59), ('struggle', 1450.48), ('aussie', 1444.25), ('victory', 1432.52), ('waste', 1430.33), ('loss', 1386.21), ('shop', 1376.93), ('double', 1373.1), ('southern', 1336.82), ('bali', 1322.56), ('worker', 12