<a href="https://colab.research.google.com/github/2020-nlp-c/nlp-statisticsmodel/blob/master/jisang/12_Latent_Dirichlet_Allocation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Topic Modeling - Latent Dirichlet Allocation 실습**

## **1. 잠재 디리클레 할당(LDA)**

In [9]:
docs_ls = ["Cute kitty",
          "Eat rice or cake",
          "Kitty and hamster",
          "Eat bread",
          "Rice, bread and cake",
          "Cute hamster eats bread and cake"]

### **1-1. 데이터 전처리**

In [10]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [33]:
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

wl = WordNetLemmatizer()

# 문장 전처리
pos_docs = []
for line in docs_ls:
    doc = line.split(" ")
    tmp_docs = []
    for word in doc:
        # 소문자화, Lemmatize
        tmp_docs.append(wl.lemmatize(word.lower(), pos = 'v' or 'n'))
    # 영어 품사 부착(PoS Tagging)
    pos_docs.append(pos_tag(tmp_docs))

pos_docs

[[('cute', 'NN'), ('kitty', 'NN')],
 [('eat', 'NN'), ('rice', 'NN'), ('or', 'CC'), ('cake', 'VB')],
 [('kitty', 'NNS'), ('and', 'CC'), ('hamster', 'NN')],
 [('eat', 'NN'), ('bread', 'NN')],
 [('rice,', 'NN'), ('bread', 'NN'), ('and', 'CC'), ('cake', 'NN')],
 [('cute', 'NN'),
  ('hamster', 'NN'),
  ('eat', 'NN'),
  ('bread', 'NN'),
  ('and', 'CC'),
  ('cake', 'NN')]]

In [35]:
# 불용어 처리(stopWord)
stopPos = ['CC']
stopWord = [',']

docs_token = []
tokens = []

for pos_doc in pos_docs:
    doc_token_tmp = []
    for pos_token in pos_doc:
        # 불용 품사 지정
        if pos_token[1] not in stopPos:
            # 불용어 지정
            if pos_token[0] not in stopWord:
                doc_token_tmp.append(pos_token[0])
                tokens.append(pos_token[0])
    # 문서 사용 단어
    docs_token.append(doc_token_tmp)
# 전체 문서 단어
tokens = list(set(tokens))

docs_token, tokens

([['cute', 'kitty'],
  ['eat', 'rice', 'cake'],
  ['kitty', 'hamster'],
  ['eat', 'bread'],
  ['rice,', 'bread', 'cake'],
  ['cute', 'hamster', 'eat', 'bread', 'cake']],
 ['cute', 'rice,', 'hamster', 'bread', 'cake', 'rice', 'eat', 'kitty'])

### **1-2. LDA**

In [96]:
from random import randint

# 토픽 랜덤 설정
topic = 2 # 임의의 랜덤값
topic_set = []

# 토픽 랜덤 부여
for i in range(len(docs_token)):
    topic_count = [randint(1, topic) for a in range(len(docs_token[i]))]
    topic_set.append(topic_count)

topic_set

[[2, 2], [2, 1, 2], [1, 2], [2, 2], [1, 1, 2], [1, 2, 2, 1, 1]]

In [98]:
import numpy as np

# 문서 내 토픽 분포
alpha = 0.1 # 알파 값 부여

topic_doc = []
for i in range(len(topic_set)):
    tmp = []
    for j in range(1, topic+1):
        if j in topic_set[i]:
            tmp.append(topic_set[i].count(j) + seta)
        else:
            tmp.append(0)
    topic_doc.append(tmp)

topic_doc

[[0, 2.1], [1.1, 2.1], [1.1, 1.1], [0, 2.1], [2.1, 1.1], [3.1, 2.1]]

In [168]:
# 토픽 내 단어 분포
beta = 0.001 # 베타값 부여

topic_word = [[0 for a in range((len(tokens)))] for b in range(topic)]

for i in range(len(docs_token)):
    for j in range(len(docs_token[i])):
        for k in range(1, topic+1):
            if topic_set[i][j] == k:
                    topic_word[k-1][tokens.index(docs_token[i][j])] += 1

for i in range(len(topic_word)):
    for j in range(len(topic_word[i])):
        topic_word[i][j] += beta

topic_word

[[1.001, 1.001, 0.001, 2.001, 1.001, 1.001, 0.001, 1.001],
 [1.001, 0.001, 2.001, 1.001, 2.001, 0.001, 3.001, 1.001]]

In [169]:
# 합계
prob_td = []
prob_tw = []

# 문서내 토픽 확률
for i in range(len(topic_doc)):
    td_total = np.sum(topic_doc[i])
    prob_tmp = []
    for j in range(len(topic_doc[i])):
        prob_tmp.append(topic_doc[i][j]/td_total)
    prob_td.append(prob_tmp)

# 토픽 내 단어 합계
for i in range(len(topic_word)):
    tw_total = np.sum(topic_doc[i])
    prob_tmp = []
    for j in range(len(topic_word[i])):
        prob_tmp.append(topic_word[i][j]/tw_total)
    prob_tw.append(prob_tmp)

prob_td, prob_tw

([[0.0, 1.0],
  [0.34375, 0.65625],
  [0.5, 0.5],
  [0.0, 1.0],
  [0.65625, 0.34375],
  [0.5961538461538461, 0.40384615384615385]],
 [[0.47666666666666657,
   0.47666666666666657,
   0.0004761904761904762,
   0.9528571428571427,
   0.47666666666666657,
   0.47666666666666657,
   0.0004761904761904762,
   0.47666666666666657],
  [0.31281249999999994,
   0.0003125,
   0.6253124999999999,
   0.31281249999999994,
   0.6253124999999999,
   0.0003125,
   0.9378124999999999,
   0.31281249999999994]])

In [187]:
# 토픽 부여 행렬
for i in range(len(docs_token)):
    topic_prob = [[0 for a in range((topic))] for a in range(len(docs_token[i]))]
    topic_result.append(topic_prob)
topic_result

# LDA 계산
for i in range(len(topic_result)):
    for j in range(len(topic_result[i])):
        for k in range(topic):
                topic_result[i][j][k] = prob_td[i][k] * prob_tw[k][tokens.index(docs_token[i][j])]

topic_result

[[[0.0, 0.31281249999999994], [0.0, 0.31281249999999994]],
 [[0.00016369047619047618, 0.6154394531249999],
  [0.16385416666666663, 0.000205078125],
  [0.16385416666666663, 0.410361328125]],
 [[0.23833333333333329, 0.15640624999999997],
  [0.0002380952380952381, 0.31265624999999997]],
 [[0.0, 0.9378124999999999], [0.0, 0.31281249999999994]],
 [[0.31281249999999994, 0.000107421875],
  [0.6253124999999999, 0.10752929687499999],
  [0.31281249999999994, 0.21495117187499999]],
 [[0.2841666666666666, 0.12632812499999999],
  [0.0002838827838827839, 0.25253004807692303],
  [0.0002838827838827839, 0.37873197115384616],
  [0.5680494505494504, 0.12632812499999999],
  [0.2841666666666666, 0.25253004807692303]]]

In [198]:
# 최종 토픽 할당
LDA_result = []
for i in range(len(docs_token)):
    LDA_result.append([[0 for a in range((topic))] for a in range(len(docs_token[i]))])

for i in range(len(topic_result)):
    for j in range(len(topic_result[i])):
        LDA_result[i][j] = topic_result[i][j].index(np.max(topic_result[i][j])) + 1

LDA_result

[[2, 2], [2, 1, 2], [1, 2], [2, 2], [1, 1, 1], [1, 2, 2, 1, 1]]