# 딥 러닝을 이용한 자연어 처리 입문

아래 링크의 E-book을 보고 실습한 내용입니다.

WikiDocs 주소: https://wikidocs.net/31766

# 6장 토픽 모델링

## 2절 잠재 디리클레 할당 (Latent Dirichlet Allocation, LDA)


## 뉴스 기사 제목 데이터를 활용한 LDA 실습 - scikit-learn

### 뉴스 제목 데이터 다운로드


In [1]:
import pandas as pd
import urllib.request
# Download the ABC news headline CSV into the working directory, then load it.
urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/franciscadias/data/master/abcnews-date-text.csv",
    filename="abcnews-date-text.csv")
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` is what `error_bad_lines=False` alone mapped to:
# malformed rows are skipped and a warning is emitted for each of them.
data = pd.read_csv('abcnews-date-text.csv', on_bad_lines='warn')




  exec(code_obj, self.user_global_ns, self.user_ns)


In [2]:
# Report how many rows were loaded and show the first few of them.
row_count = len(data)
print("데이터의 개수:", row_count)
preview = data.head()
print(preview)


데이터의 개수: 1082168
   publish_date                                      headline_text
0      20030219  aba decides against community broadcasting lic...
1      20030219     act fire witnesses must be aware of defamation
2      20030219     a g calls for infrastructure protection summit
3      20030219           air nz staff in aust strike for pay rise
4      20030219      air nz strike to affect australian travellers


### 텍스트 전처리
- 토큰화
- 표제어 추출
- 길이가 짧은 단어 제거
- 불용어 제거 (아래 TF-IDF 단계에서 `TfidfVectorizer(stop_words='english')`로 처리)

In [18]:
import nltk
from nltk.stem import WordNetLemmatizer
from IPython.display import display

# Keep only the headline column; copy so the original `data` frame is not modified.
text = data[['headline_text']].copy()

# Tokenization: turn every headline string into a list of word tokens.
# Series.apply passes each string directly to the tokenizer, avoiding the
# per-row object construction of DataFrame.apply(axis=1).
text.loc[:, 'headline_text'] = text['headline_text'].apply(nltk.word_tokenize)
print("토큰화")
display(text.head(5))

# Lemmatization: reduce every token to its verb lemma.
# The lemmatizer is created once and reused instead of once per headline.
lemmatizer = WordNetLemmatizer()
text.loc[:, 'headline_text'] = text.loc[:, 'headline_text'].apply(
    lambda tokens: [lemmatizer.lemmatize(token, pos='v') for token in tokens])
print("표제어 추출")
display(text.head(5))

# Short-word removal: keep only tokens longer than three characters.
tokenized_doc = text.loc[:, 'headline_text'].apply(
    lambda tokens: [token for token in tokens if len(token) > 3])
print("길이가 짧은 단어 제거")
display(tokenized_doc[:5])


토큰화


Unnamed: 0,headline_text
0,"[aba, decides, against, community, broadcastin..."
1,"[act, fire, witnesses, must, be, aware, of, de..."
2,"[a, g, calls, for, infrastructure, protection,..."
3,"[air, nz, staff, in, aust, strike, for, pay, r..."
4,"[air, nz, strike, to, affect, australian, trav..."


표제어 추출


Unnamed: 0,headline_text
0,"[aba, decide, against, community, broadcast, l..."
1,"[act, fire, witness, must, be, aware, of, defa..."
2,"[a, g, call, for, infrastructure, protection, ..."
3,"[air, nz, staff, in, aust, strike, for, pay, r..."
4,"[air, nz, strike, to, affect, australian, trav..."


길이가 짧은 단어 제거


0    [decide, against, community, broadcast, licence]
1            [fire, witness, must, aware, defamation]
2          [call, infrastructure, protection, summit]
3                         [staff, aust, strike, rise]
4            [strike, affect, australian, travellers]
Name: headline_text, dtype: object

### TF-IDF 행렬 만들기

In [19]:
# Join every token list back into one space-separated string, because
# TfidfVectorizer expects raw text documents rather than token lists.
# Iterating over the Series walks its values in order, so this does not rely
# on the index labels being exactly 0..len-1 the way `tokenized_doc[i]` did.
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]

text['headline_text'] = detokenized_doc
text['headline_text'].head()


0    decide against community broadcast licence
1            fire witness must aware defamation
2         call infrastructure protection summit
3                        staff aust strike rise
4           strike affect australian travellers
Name: headline_text, dtype: object

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF document-term matrix: English stop words are dropped and
# the vocabulary is capped at 1000 features.
vectorizer = TfidfVectorizer(
    max_features=1000,
    stop_words='english',
)
headlines = text['headline_text']
X = vectorizer.fit_transform(headlines)
X.shape


(1082168, 1000)

### 토픽 모델링

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# LDA configuration: 10 topics, online learning, a fixed seed, and a single
# pass over the data (max_iter=1).
lda_params = dict(
    n_components=10,
    learning_method='online',
    random_state=2,
    max_iter=1,
)
lda_model = LatentDirichletAllocation(**lda_params)
# fit_transform fits the model on X and returns the transformed data for X.
lda_top = lda_model.fit_transform(X)


In [27]:
# Inspect the fitted topic-term matrix (`components_`): its shape, then its values.
topic_term_matrix = lda_model.components_
print("Components shape:", topic_term_matrix.shape)
print("토픽 내의 단어들의 분포\n", topic_term_matrix)


Components shape: (10, 1000)
토픽 내의 단어들의 분포
 [[1.00000871e-01 1.00003339e-01 1.00014905e-01 ... 1.00004036e-01
  1.00002263e-01 1.00003288e-01]
 [1.00002227e-01 1.00002129e-01 1.00005836e-01 ... 1.00008125e-01
  1.00003007e-01 1.00003514e-01]
 [3.51600415e+02 1.13513398e+03 1.00010754e-01 ... 1.00004829e-01
  1.00002995e-01 1.00003080e-01]
 ...
 [1.00001571e-01 1.00001298e-01 3.50170828e+03 ... 1.77619510e+03
  1.50652739e+02 1.00004157e-01]
 [1.00000922e-01 1.00000856e-01 1.00003222e-01 ... 1.00004189e-01
  1.00005413e-01 1.00002368e-01]
 [1.00001079e-01 1.00001123e-01 1.00003873e-01 ... 1.00003931e-01
  1.00001535e-01 1.00003452e-01]]


In [26]:
# Vocabulary terms, indexed by TF-IDF column.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on old versions.
# `.tolist()` keeps `terms` a plain list of str, as the old method returned.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out().tolist()
else:
    terms = vectorizer.get_feature_names()


def get_topics(components, feature_names, n=5):
    """Print the ``n`` highest-weighted terms of every topic.

    Args:
        components: Iterable of per-topic weight rows (for example
            ``lda_model.components_``). Each row must support ``argsort()``
            and integer indexing that yields a value with ``round()``.
        feature_names: Sequence mapping a column index to its term.
        n: Number of top terms to print per topic.

    Returns:
        None. One line per topic is written to stdout in the form
        ``Topic <k>: [(term, weight), ...]``, with topics numbered from 1 and
        weights rounded to two decimals.
    """
    for topic_number, topic in enumerate(components, start=1):
        # argsort is ascending, so take the last n indices in reverse order.
        top_indices = topic.argsort()[:-n - 1:-1]
        # Convert to built-in float so the printed tuples show plain numbers
        # rather than NumPy scalar reprs (e.g. ``np.float64(1.0)`` on NumPy 2).
        top_terms = [(feature_names[i], float(topic[i].round(2)))
                     for i in top_indices]
        print("Topic %d:" % topic_number, top_terms)


# Show the top terms (default: five per topic) of the fitted LDA model.
get_topics(components=lda_model.components_, feature_names=terms)


Topic 1: [('police', 12092.44), ('government', 8725.19), ('sydney', 8393.29), ('south', 6677.03), ('help', 5225.56)]
Topic 2: [('trump', 11966.41), ('attack', 6959.64), ('change', 5874.27), ('year', 5586.42), ('china', 4533.32)]
Topic 3: [('australian', 11088.95), ('charge', 8428.8), ('queensland', 7720.12), ('world', 6707.7), ('murder', 6268.13)]
Topic 4: [('melbourne', 7528.43), ('canberra', 6112.23), ('plan', 6033.16), ('live', 5488.62), ('brisbane', 4857.21)]
Topic 5: [('2016', 5488.19), ('crash', 5281.14), ('state', 4923.41), ('people', 4121.07), ('national', 4038.68)]
Topic 6: [('australia', 13691.08), ('coast', 5429.41), ('woman', 3909.11), ('leave', 3849.71), ('gold', 3793.71)]
Topic 7: [('election', 7561.63), ('adelaide', 6758.36), ('death', 5935.06), ('home', 5674.38), ('make', 5658.99)]
Topic 8: [('warn', 5115.01), ('tasmanian', 4859.02), ('jail', 4632.85), ('turnbull', 4269.85), ('women', 4232.53)]
Topic 9: [('court', 7542.74), ('perth', 6456.53), ('house', 6113.49), ('open