# 2) 잠재 디리클레 할당(Latent Dirichlet Allocation, LDA)

## 5. 실습을 통한 이해


### 1) 정수 인코딩과 단어 집합 만들기


In [1]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
len(documents)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


11314

In [2]:
news_df = pd.DataFrame({'document':documents})

# 특수 문자 제거
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")

# 길이가 3이하인 단어는 제거 (길이가 짧은 단어 제거)
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# 전체 단어에 대한 소문자 변환
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

In [4]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english') # NLTK로부터 불용어를 받아옵니다.

tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split()) # 토큰화
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])
# 불용어를 제거합니다.

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
tokenized_doc[:5]

0    [well, sure, story, seem, biased, disagree, st...
1    [yeah, expect, people, read, actually, accept,...
2    [although, realize, principle, strongest, poin...
3    [notwithstanding, legitimate, fuss, proposal, ...
4    [well, change, scoring, playoff, pool, unfortu...
Name: clean_doc, dtype: object

In [9]:
from gensim import corpora
dictionary = corpora.Dictionary(tokenized_doc)
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
print(corpus[1]) # 수행된 결과에서 두번째 뉴스 출력. 첫번째 문서의 인덱스는 0

[(52, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 2), (67, 1), (68, 1), (69, 1), (70, 1), (71, 2), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 2), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 2), (86, 1), (87, 1), (88, 1), (89, 1)]


In [19]:
corpus[:5]

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 2),
  (22, 2),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 4),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 2),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1)],
 [(52, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 2),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 2),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 2),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 2),
  (86, 1),
  (87, 1),
  (88, 1),
  (89, 1)],
 [(4, 1

In [10]:
print(dictionary[66])

faith


In [16]:
tokenized_doc[1].count('faith')

2

In [17]:
len(dictionary)

64281

### 2) LDA 모델 훈련시키기


In [20]:
import gensim
NUM_TOPICS = 20 # 20개의 토픽, k=20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15) 
# passes는 알고리즘의 동작 횟수를 말하는데, 알고리즘이 결정하는 토픽의 값이 적절히 수렴할 수 있도록 충분히 적당한 횟수를 정해주면 됨
topics = ldamodel.print_topics(num_words=4) # num_words=4로 총 4개의 단어만 출력
for topic in topics:
    print(topic)

(0, '0.010*"guns" + 0.007*"right" + 0.006*"control" + 0.006*"weapons"')
(1, '0.021*"period" + 0.012*"power" + 0.011*"detroit" + 0.010*"play"')
(2, '0.012*"thanks" + 0.011*"software" + 0.011*"please" + 0.010*"anyone"')
(3, '0.016*"health" + 0.013*"medical" + 0.009*"disease" + 0.008*"food"')
(4, '0.010*"science" + 0.009*"university" + 0.007*"earth" + 0.007*"space"')
(5, '0.015*"space" + 0.006*"data" + 0.006*"also" + 0.006*"program"')
(6, '0.007*"number" + 0.006*"used" + 0.006*"water" + 0.004*"time"')
(7, '0.016*"drive" + 0.009*"disk" + 0.009*"scsi" + 0.009*"system"')
(8, '0.012*"bike" + 0.009*"ground" + 0.007*"wire" + 0.006*"cover"')
(9, '0.011*"gordon" + 0.011*"soon" + 0.011*"pitt" + 0.010*"banks"')
(10, '0.010*"armenian" + 0.010*"said" + 0.009*"armenians" + 0.007*"people"')
(11, '0.015*"game" + 0.015*"team" + 0.012*"year" + 0.011*"israel"')
(12, '0.023*"jesus" + 0.013*"bible" + 0.011*"christian" + 0.011*"church"')
(13, '0.029*"file" + 0.014*"output" + 0.013*"entry" + 0.011*"program"')


In [21]:
print(ldamodel.print_topics())

[(0, '0.010*"guns" + 0.007*"right" + 0.006*"control" + 0.006*"weapons" + 0.006*"crime" + 0.006*"firearms" + 0.006*"year" + 0.005*"cars" + 0.004*"second" + 0.004*"carry"'), (1, '0.021*"period" + 0.012*"power" + 0.011*"detroit" + 0.010*"play" + 0.008*"scorer" + 0.007*"kings" + 0.007*"calgary" + 0.006*"flames" + 0.006*"chicago" + 0.006*"winnipeg"'), (2, '0.012*"thanks" + 0.011*"software" + 0.011*"please" + 0.010*"anyone" + 0.009*"would" + 0.009*"know" + 0.009*"mail" + 0.009*"like" + 0.009*"also" + 0.009*"windows"'), (3, '0.016*"health" + 0.013*"medical" + 0.009*"disease" + 0.008*"food" + 0.007*"patients" + 0.007*"tobacco" + 0.006*"april" + 0.006*"among" + 0.005*"medicine" + 0.005*"cancer"'), (4, '0.010*"science" + 0.009*"university" + 0.007*"earth" + 0.007*"space" + 0.006*"research" + 0.006*"scientific" + 0.006*"center" + 0.005*"orbit" + 0.005*"nasa" + 0.005*"lunar"'), (5, '0.015*"space" + 0.006*"data" + 0.006*"also" + 0.006*"program" + 0.005*"year" + 0.005*"list" + 0.004*"nasa" + 0.004*"

### 3) LDA 시각화 하기


In [22]:
pip install pyLDAvis

Collecting pyLDAvis
[?25l  Downloading https://files.pythonhosted.org/packages/a5/3a/af82e070a8a96e13217c8f362f9a73e82d61ac8fff3a2561946a97f96266/pyLDAvis-2.1.2.tar.gz (1.6MB)
[K     |▏                               | 10kB 16.9MB/s eta 0:00:01[K     |▍                               | 20kB 22.2MB/s eta 0:00:01[K     |▋                               | 30kB 26.0MB/s eta 0:00:01[K     |▉                               | 40kB 20.1MB/s eta 0:00:01[K     |█                               | 51kB 15.7MB/s eta 0:00:01[K     |█▏                              | 61kB 17.6MB/s eta 0:00:01[K     |█▍                              | 71kB 15.4MB/s eta 0:00:01[K     |█▋                              | 81kB 13.5MB/s eta 0:00:01[K     |█▉                              | 92kB 14.1MB/s eta 0:00:01[K     |██                              | 102kB 13.0MB/s eta 0:00:01[K     |██▎                             | 112kB 13.0MB/s eta 0:00:01[K     |██▍                             | 122kB 13.0MB/s eta

In [23]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

### 4) 문서 별 토픽 분포 보기


In [24]:
for i, topic_list in enumerate(ldamodel[corpus]):
    if i==5:
        break
    print(i,'번째 문서의 topic 비율은',topic_list)

0 번째 문서의 topic 비율은 [(6, 0.04684219), (7, 0.154812), (8, 0.04203505), (10, 0.17033873), (11, 0.07978495), (16, 0.49489674)]
1 번째 문서의 topic 비율은 [(6, 0.027282994), (7, 0.09802766), (16, 0.85395765)]
2 번째 문서의 topic 비율은 [(0, 0.036065433), (3, 0.12519968), (11, 0.17161195), (16, 0.65421975)]
3 번째 문서의 topic 비율은 [(0, 0.031662755), (6, 0.088716246), (7, 0.085348636), (16, 0.3436014), (18, 0.4394769)]
4 번째 문서의 topic 비율은 [(0, 0.2204341), (11, 0.41878298), (13, 0.08384308), (16, 0.24731024)]


In [25]:
def make_topictable_per_doc(ldamodel, corpus):
    topic_table = pd.DataFrame()

    # 몇 번째 문서인지를 의미하는 문서 번호와 해당 문서의 토픽 비중을 한 줄씩 꺼내온다.
    for i, topic_list in enumerate(ldamodel[corpus]):
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list            
        doc = sorted(doc, key=lambda x: (x[1]), reverse=True)
        # 각 문서에 대해서 비중이 높은 토픽순으로 토픽을 정렬한다.
        # EX) 정렬 전 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (10번 토픽, 5%), (12번 토픽, 21.5%), 
        # Ex) 정렬 후 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (12번 토픽, 21.5%), (10번 토픽, 5%)
        # 48 > 25 > 21 > 5 순으로 정렬이 된 것.

        # 모든 문서에 대해서 각각 아래를 수행
        for j, (topic_num, prop_topic) in enumerate(doc): #  몇 번 토픽인지와 비중을 나눠서 저장한다.
            if j == 0:  # 정렬을 한 상태이므로 가장 앞에 있는 것이 가장 비중이 높은 토픽
                topic_table = topic_table.append(pd.Series([int(topic_num), round(prop_topic,4), topic_list]), ignore_index=True)
                # 가장 비중이 높은 토픽과, 가장 비중이 높은 토픽의 비중과, 전체 토픽의 비중을 저장한다.
            else:
                break
    return(topic_table)

In [26]:
topictable = make_topictable_per_doc(ldamodel, corpus)
topictable = topictable.reset_index() # 문서 번호을 의미하는 열(column)로 사용하기 위해서 인덱스 열을 하나 더 만든다.
topictable.columns = ['문서 번호', '가장 비중이 높은 토픽', '가장 높은 토픽의 비중', '각 토픽의 비중']
topictable[:10]

Unnamed: 0,문서 번호,가장 비중이 높은 토픽,가장 높은 토픽의 비중,각 토픽의 비중
0,0,16.0,0.4949,"[(6, 0.046823714), (7, 0.15482612), (8, 0.0420..."
1,1,16.0,0.8539,"[(6, 0.027281), (7, 0.098112516), (16, 0.85387..."
2,2,16.0,0.6543,"[(0, 0.036066104), (3, 0.12519778), (11, 0.171..."
3,3,18.0,0.4395,"[(0, 0.031650618), (6, 0.08872149), (7, 0.0853..."
4,4,11.0,0.4187,"[(0, 0.22042754), (11, 0.41867986), (13, 0.083..."
5,5,16.0,0.609,"[(0, 0.17309813), (6, 0.05198254), (12, 0.1326..."
6,6,1.0,0.6675,"[(1, 0.6675404), (2, 0.13250752), (7, 0.129258..."
7,7,16.0,0.6592,"[(3, 0.15885141), (11, 0.15214685), (16, 0.659..."
8,8,16.0,0.3671,"[(3, 0.1166521), (5, 0.24096334), (9, 0.107237..."
9,9,7.0,0.4548,"[(5, 0.19585109), (7, 0.45476317), (16, 0.3377..."


# 3) 잠재 디리클레 할당(LDA) 실습2


## 1. 실습을 통한 이해

### 1) 뉴스 기사 제목 데이터에 대한 이해


In [27]:
import pandas as pd
import urllib.request
urllib.request.urlretrieve("https://raw.githubusercontent.com/franciscadias/data/master/abcnews-date-text.csv", filename="abcnews-date-text.csv")
data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False)

In [28]:
print(len(data))

1082168


In [29]:
data[:5]

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [30]:
text = data[['headline_text']]
text.head(5)

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


### 2) 텍스트 전처리


In [32]:
# 단어 토큰화
import nltk
nltk.download('punkt')
text['headline_text'] = text.apply(lambda row: nltk.word_tokenize(row['headline_text']), axis=1)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [34]:
text.head(5)

Unnamed: 0,headline_text
0,"[aba, decides, against, community, broadcastin..."
1,"[act, fire, witnesses, must, be, aware, of, de..."
2,"[a, g, calls, for, infrastructure, protection,..."
3,"[air, nz, staff, in, aust, strike, for, pay, r..."
4,"[air, nz, strike, to, affect, australian, trav..."


In [35]:
# 불용어 제거
from nltk.corpus import stopwords
stop = stopwords.words('english')
text['headline_text'] = text['headline_text'].apply(lambda x: [word for word in x if word not in (stop)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [37]:
text.head(5)

Unnamed: 0,headline_text
0,"[aba, decides, community, broadcasting, licence]"
1,"[act, fire, witnesses, must, aware, defamation]"
2,"[g, calls, infrastructure, protection, summit]"
3,"[air, nz, staff, aust, strike, pay, rise]"
4,"[air, nz, strike, affect, australian, travellers]"


In [39]:
# 표제어 추출
import nltk
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer
text['headline_text'] = text['headline_text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])
print(text.head(5))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
                                       headline_text
0       [aba, decide, community, broadcast, licence]
1      [act, fire, witness, must, aware, defamation]
2      [g, call, infrastructure, protection, summit]
3          [air, nz, staff, aust, strike, pay, rise]
4  [air, nz, strike, affect, australian, travellers]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [40]:
# 길이가 3이하인 단어 제거
tokenized_doc = text['headline_text'].apply(lambda x: [word for word in x if len(word) > 3])
print(tokenized_doc[:5])

0       [decide, community, broadcast, licence]
1      [fire, witness, must, aware, defamation]
2    [call, infrastructure, protection, summit]
3                   [staff, aust, strike, rise]
4      [strike, affect, australian, travellers]
Name: headline_text, dtype: object


### 3) TF-IDF 행렬 만들기


In [41]:
# 역토큰화 (토큰화 작업을 되돌림)
detokenized_doc = []
for i in range(len(text)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)

text['headline_text'] = detokenized_doc # 다시 text['headline_text']에 재저장

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [42]:
text['headline_text'][:5]

0       decide community broadcast licence
1       fire witness must aware defamation
2    call infrastructure protection summit
3                   staff aust strike rise
4      strike affect australian travellers
Name: headline_text, dtype: object

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', 
max_features= 1000) # 상위 1,000개의 단어를 보존 
X = vectorizer.fit_transform(text['headline_text'])
X.shape # TF-IDF 행렬의 크기 확인

(1082168, 1000)

### 4) 토픽 모델링 (LDA 수행)

In [44]:
from sklearn.decomposition import LatentDirichletAllocation
lda_model=LatentDirichletAllocation(n_components=10,learning_method='online',random_state=777,max_iter=1)

In [45]:
lda_top=lda_model.fit_transform(X)

In [46]:
print(lda_model.components_)
print(lda_model.components_.shape) 

[[1.00001533e-01 1.00001269e-01 1.00004179e-01 ... 1.00006124e-01
  1.00003111e-01 1.00003064e-01]
 [1.00001199e-01 1.13513398e+03 3.50170830e+03 ... 1.00009349e-01
  1.00001896e-01 1.00002937e-01]
 [1.00001811e-01 1.00001151e-01 1.00003566e-01 ... 1.00002693e-01
  1.00002061e-01 7.53381835e+02]
 ...
 [1.00001065e-01 1.00001689e-01 1.00003278e-01 ... 1.00006721e-01
  1.00004902e-01 1.00004759e-01]
 [1.00002401e-01 1.00000732e-01 1.00002989e-01 ... 1.00003517e-01
  1.00001428e-01 1.00005266e-01]
 [1.00003427e-01 1.00002313e-01 1.00007340e-01 ... 1.00003732e-01
  1.00001207e-01 1.00005153e-01]]
(10, 1000)


In [47]:
terms = vectorizer.get_feature_names() # 단어 집합. 1,000개의 단어가 저장됨.

def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(2)) for i in topic.argsort()[:-n - 1:-1]])
get_topics(lda_model.components_,terms)

Topic 1: [('government', 8725.19), ('sydney', 8393.29), ('queensland', 7720.12), ('change', 5874.27), ('home', 5674.38)]
Topic 2: [('australia', 13691.08), ('australian', 11088.95), ('melbourne', 7528.43), ('world', 6707.7), ('south', 6677.03)]
Topic 3: [('death', 5935.06), ('interview', 5924.98), ('kill', 5851.6), ('jail', 4632.85), ('life', 4275.27)]
Topic 4: [('house', 6113.49), ('2016', 5488.19), ('state', 4923.41), ('brisbane', 4857.21), ('tasmania', 4610.97)]
Topic 5: [('court', 7542.74), ('attack', 6959.64), ('open', 5663.0), ('face', 5193.63), ('warn', 5115.01)]
Topic 6: [('market', 5545.86), ('rural', 5502.89), ('plan', 4828.71), ('indigenous', 4223.4), ('power', 3968.26)]
Topic 7: [('charge', 8428.8), ('election', 7561.63), ('adelaide', 6758.36), ('make', 5658.99), ('test', 5062.69)]
Topic 8: [('police', 12092.44), ('crash', 5281.14), ('drug', 4290.87), ('beat', 3257.58), ('rise', 2934.92)]
Topic 9: [('fund', 4693.03), ('labor', 4047.69), ('national', 4038.68), ('council', 40