# Latent Dirichlet Allocation, LDA
- https://wikidocs.net/30708
- 토픽의 개수(k) 결정
- 모든 단어를 k개 중 하나의 토픽에 랜덤으로 할당
- 이제 모든 문서의 모든 단어에 대해서 아래의 사항을 반복 진행
    - 어떤 문서의 각 단어 w는 자신은 잘못된 토픽에 할당되어져 있지만, 다른 단어들은 전부 올바른 토픽에 할당되어져 있는 상태라고 가정합니다. 이에 따라 단어 w는 아래의 두 가지 기준에 따라서 토픽이 재할당됩니다.
        - p(topic t | document d) : 문서 d의 단어들 중 토픽 t에 해당하는 단어들의 비율
        - p(word w | topic t) : 단어 w를 갖고 있는 모든 문서들 중 토픽 t가 할당된 비율

잠재 디리클레 할당과 잠재 의미 분석의 차이
- LSA : DTM을 차원 축소 하여 축소 차원에서 근접 단어들을 토픽으로 묶는다.
- LDA : 단어가 특정 토픽에 존재할 확률과 문서에 특정 토픽이 존재할 확률을 결합확률로 추정하여 토픽을 추출한다.

## Import

In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

import gensim
from gensim import corpora
from sklearn.datasets import fetch_20newsgroups

# !pip install pyLDAvis
import pyLDAvis.gensim

## Exercise - use gensim

### Load dataset

In [2]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=("headers", "footers", "quotes"))
documents = dataset.data
len(documents)

11314

### Text preprocessing

In [3]:
df = pd.DataFrame({"document":documents})
# 알파벳만 남기고 제거
df["clean_document"] = df["document"].str.replace("[^a-zA-Z]", " ")
# 길이가 3이하인 단어 제거 > 왱? 짧은 단어는 유용한 정보를 담고있지 않다고 가정한다는군
df["clean_document"] = df["clean_document"].apply(lambda x: " ".join([w for w in x.split() if len(w) > 3]))
# 소문자로 변환
df["clean_document"] = df["clean_document"].apply(lambda x: x.lower())
df.head()

Unnamed: 0,document,clean_document
0,Well i'm not sure about the story nad it did s...,well sure about story seem biased what disagre...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re...",yeah expect people read actually accept hard a...
2,Although I realize that principle is not one o...,although realize that principle your strongest...
3,Notwithstanding all the legitimate fuss about ...,notwithstanding legitimate fuss about this pro...
4,"Well, I will have to change the scoring on my ...",well will have change scoring playoff pool unf...


In [4]:
english_stopwords = stopwords.words('english')
len(english_stopwords)

179

In [5]:
df["tokenize_document"] = df["clean_document"].apply(lambda x: [w for w in x.split() if x not in english_stopwords])
df["tokenize_document"][0]

['well',
 'sure',
 'about',
 'story',
 'seem',
 'biased',
 'what',
 'disagree',
 'with',
 'your',
 'statement',
 'that',
 'media',
 'ruin',
 'israels',
 'reputation',
 'that',
 'rediculous',
 'media',
 'most',
 'israeli',
 'media',
 'world',
 'having',
 'lived',
 'europe',
 'realize',
 'that',
 'incidences',
 'such',
 'described',
 'letter',
 'have',
 'occured',
 'media',
 'whole',
 'seem',
 'ignore',
 'them',
 'subsidizing',
 'israels',
 'existance',
 'europeans',
 'least',
 'same',
 'degree',
 'think',
 'that',
 'might',
 'reason',
 'they',
 'report',
 'more',
 'clearly',
 'atrocities',
 'what',
 'shame',
 'that',
 'austria',
 'daily',
 'reports',
 'inhuman',
 'acts',
 'commited',
 'israeli',
 'soldiers',
 'blessing',
 'received',
 'from',
 'government',
 'makes',
 'some',
 'holocaust',
 'guilt',
 'away',
 'after',
 'look',
 'jews',
 'treating',
 'other',
 'races',
 'when',
 'they',
 'power',
 'unfortunate']

### 정수 인코딩, 단어 집합 만들기

In [8]:
dictionary = corpora.Dictionary(df["tokenize_document"])
corpus = [dictionary.doc2bow(text) for text in df["tokenize_document"]]

In [13]:
corpus[10][:10]

[(0, 7),
 (2, 1),
 (5, 1),
 (14, 1),
 (17, 57),
 (20, 12),
 (21, 2),
 (29, 2),
 (30, 2),
 (32, 5)]

In [15]:
len(dictionary), dictionary[0]

(64365, 'about')

### Train LDA

In [16]:
NUM_TOPICS = 20
lda_model = gensim.models.ldamodel.LdaModel(
    corpus, 
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15 #알고리즘 동작횟수
)

In [17]:
# 각 토픽별 n개의 단어 기여도
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.037*"window" + 0.021*"display" + 0.014*"widget" + 0.014*"mouse" + 0.013*"color"')
(1, '0.014*"bike" + 0.011*"engine" + 0.011*"cars" + 0.009*"than" + 0.007*"road"')
(2, '0.017*"master" + 0.010*"plane" + 0.010*"slave" + 0.008*"allah" + 0.007*"part"')
(3, '0.017*"game" + 0.015*"team" + 0.013*"year" + 0.013*"will" + 0.012*"games"')
(4, '0.049*"that" + 0.020*"this" + 0.015*"have" + 0.011*"with" + 0.010*"what"')
(5, '0.025*"guns" + 0.016*"control" + 0.015*"firearms" + 0.015*"crime" + 0.012*"weapons"')
(6, '0.018*"were" + 0.013*"their" + 0.012*"from" + 0.010*"that" + 0.009*"they"')
(7, '0.019*"that" + 0.016*"this" + 0.016*"will" + 0.012*"with" + 0.010*"government"')
(8, '0.026*"with" + 0.024*"have" + 0.020*"this" + 0.017*"that" + 0.009*"would"')
(9, '0.009*"overtime" + 0.009*"arafat" + 0.007*"bird" + 0.006*"padded" + 0.005*"brilliant"')
(10, '0.019*"scores" + 0.015*"maine" + 0.012*"mydisplay" + 0.011*"stats" + 0.008*"joystick"')
(11, '0.006*"cross" + 0.006*"mary" + 0.005*"scsi" + 0.005

### Visualization

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

### 문서 별 토픽 분포
토픽 번호와 해당 토픽이 해당 문서에서 차지하는 분포도

In [21]:
for i, topic_list in enumerate(lda_model[corpus]):
    if i == 5:
        break
    print(f"{i}번째 문서의 topic 비율은 {topic_list}")

0번째 문서의 topic 비율은 [(0, 0.04549789), (1, 0.027804269), (4, 0.49468353), (6, 0.42247725)]
1번째 문서의 topic 비율은 [(3, 0.11492364), (4, 0.50609195), (8, 0.11009235), (15, 0.23139717), (16, 0.021851415)]
2번째 문서의 topic 비율은 [(4, 0.0679482), (6, 0.22659662), (7, 0.17762864), (15, 0.4937417), (17, 0.024774402)]
3번째 문서의 topic 비율은 [(2, 0.033558078), (4, 0.20757632), (7, 0.316315), (8, 0.06129396), (14, 0.07817704), (15, 0.29518023)]
4번째 문서의 topic 비율은 [(3, 0.69520897), (4, 0.2766572)]


In [28]:
def make_topictable_per_doc(ldamodel, corpus):
    topic_table = pd.DataFrame()
    for i, topic_list in enumerate(ldamodel[corpus]):
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list            
        doc = sorted(doc, key=lambda x: (x[1]), reverse=True)
        for j, (topic_num, prop_topic) in enumerate(doc):
            if j == 0:
                topic_table = topic_table.append(pd.Series([int(topic_num), round(prop_topic,4), topic_list]), ignore_index=True)
            else:
                break
    return(topic_table)

In [29]:
topictable = make_topictable_per_doc(lda_model, corpus)
topictable = topictable.reset_index()
topictable.columns = ['문서 번호', '가장 비중이 높은 토픽', '가장 높은 토픽의 비중', '각 토픽의 비중']
topictable[:10]

Unnamed: 0,문서 번호,가장 비중이 높은 토픽,가장 높은 토픽의 비중,각 토픽의 비중
0,0,4.0,0.4947,"[(0, 0.04549803), (1, 0.027804693), (4, 0.4946..."
1,1,4.0,0.506,"[(3, 0.11491867), (4, 0.50600535), (8, 0.11010..."
2,2,15.0,0.4918,"[(4, 0.07055347), (6, 0.22636026), (7, 0.17720..."
3,3,7.0,0.3163,"[(2, 0.033558078), (4, 0.20754296), (7, 0.3163..."
4,4,3.0,0.6952,"[(3, 0.6952174), (4, 0.27664882)]"
5,5,4.0,0.3843,"[(2, 0.32366607), (3, 0.043017816), (4, 0.3842..."
6,6,8.0,0.6058,"[(3, 0.051869333), (4, 0.17554958), (7, 0.0121..."
7,7,6.0,0.3761,"[(4, 0.28630856), (6, 0.37614602), (15, 0.3261..."
8,8,15.0,0.4655,"[(4, 0.27312687), (7, 0.09122808), (11, 0.0282..."
9,9,15.0,0.396,"[(1, 0.14238901), (4, 0.08519253), (6, 0.07451..."


## Exercise2 - use sklearn

### Import

In [30]:
import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
import pandas as pd
import urllib.request

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

### Load dataset
약 15년 동안 발행되었던 뉴스 기사 제목을 모아놓은 영어 데이터

In [52]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/franciscadias/data/master/abcnews-date-text.csv", filename="abcnews-date-text.csv")
df = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False)
print(df.shape)
df.head()

(1082168, 2)


Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [53]:
text = df[["headline_text"]]
len(text)

1082168

### Text preprocessing
- tokenize
- remove stopwords
- apply lemmatizer

In [54]:
en_stopwords = stopwords.words("english")
len(en_stopwords)

179

In [55]:
text["headline_text"] = text.apply(lambda row: nltk.word_tokenize(row["headline_text"]), axis=1)
text["headline_text"] = text["headline_text"].apply(lambda x: [w for w in x if w not in en_stopwords])
# 표제어 추출로 3인칭 단수 표현을 1인칭으로 바꾸고, 과거 현재형 동사를 현재형으로 바꿉니다.
text["headline_text"] = text["headline_text"].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos="v") for word in x])
text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,headline_text
0,"[aba, decide, community, broadcast, licence]"
1,"[act, fire, witness, must, aware, defamation]"
2,"[g, call, infrastructure, protection, summit]"
3,"[air, nz, staff, aust, strike, pay, rise]"
4,"[air, nz, strike, affect, australian, travellers]"


In [56]:
tokenized_doc = text["headline_text"].apply(lambda x: [word for word in x if len(word) > 3])
tokenized_doc.head()

0       [decide, community, broadcast, licence]
1      [fire, witness, must, aware, defamation]
2    [call, infrastructure, protection, summit]
3                   [staff, aust, strike, rise]
4      [strike, affect, australian, travellers]
Name: headline_text, dtype: object

### Create TF-IDF

In [57]:
detokenized_doc = []
for i in range(len(text)):
    t = " ".join(tokenized_doc[i])
    detokenized_doc.append(t)
    
text["headline_text"] = detokenized_doc
text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,headline_text
0,decide community broadcast licence
1,fire witness must aware defamation
2,call infrastructure protection summit
3,staff aust strike rise
4,strike affect australian travellers


In [59]:
vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
X = vectorizer.fit_transform(text["headline_text"])
X.shape

(1082168, 1000)

### Topic modeling

In [61]:
lda_model = LatentDirichletAllocation(
    n_components=10,
    learning_method="online",
    random_state=0,
    max_iter=1
)
lda_top=lda_model.fit_transform(X)

In [62]:
print(lda_model.components_)
print(lda_model.components_.shape) 

[[3.51600412e+02 1.00001494e-01 1.00005049e-01 ... 1.00004264e-01
  1.00003474e-01 1.00004800e-01]
 [1.00001272e-01 1.00001222e-01 1.00003570e-01 ... 1.00008733e-01
  1.00001628e-01 1.00003957e-01]
 [1.00001823e-01 1.00000548e-01 1.00001147e-01 ... 1.00003255e-01
  1.00003647e-01 1.00003533e-01]
 ...
 [1.00002890e-01 1.13513398e+03 1.00017071e-01 ... 1.00005619e-01
  1.00002781e-01 1.00002958e-01]
 [1.00001584e-01 1.00001246e-01 1.00004508e-01 ... 1.00004064e-01
  1.00002811e-01 7.53381835e+02]
 [1.00002401e-01 1.00002691e-01 1.00015040e-01 ... 1.77619511e+03
  1.50652739e+02 1.00004547e-01]]
(10, 1000)


In [74]:
# terms = vectorizer.get_feature_nemas()
terms = list(vectorizer.vocabulary_.keys()) # 이게 맞는건가?

def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(2)) for i in topic.argsort()[:-n - 1:-1]])
get_topics(lda_model.components_,terms)

Topic 1: [('east', 7561.63), ('record', 6758.36), ('reporter', 5658.99), ('teacher', 5611.33), ('western', 5429.41)]
Topic 2: [('protect', 13691.08), ('press', 7528.43), ('super', 5053.45), ('cyclone', 4610.97), ('year', 4546.98)]
Topic 3: [('team', 12092.44), ('king', 8725.19), ('likely', 6113.49), ('pilot', 5851.6), ('howard', 5674.38)]
Topic 4: [('check', 5935.06), ('push', 5874.27), ('tsunami', 5586.42), ('surprise', 4533.32), ('affect', 4070.65)]
Topic 5: [('whale', 6677.03), ('witness', 5488.19), ('newcastle', 5225.56), ('legal', 4693.03), ('declare', 3652.78)]
Topic 6: [('heritage', 11088.95), ('gaza', 8428.8), ('video', 8393.29), ('expand', 6268.13), ('horse', 5663.0)]
Topic 7: [('help', 6959.64), ('refuse', 5924.98), ('time', 5834.63), ('hurt', 5545.86), ('answer', 5465.06)]
Topic 8: [('carbon', 6707.7), ('south', 6456.53), ('issue', 6112.23), ('tourism', 5488.62), ('japanese', 5251.55)]
Topic 9: [('commissioner', 11966.41), ('marine', 5502.89), ('korea', 5142.38), ('august', 