<a href="https://colab.research.google.com/github/2020-nlp-c/nlp-statisticsmodel/blob/master/jisang/12_Latent_Dirichlet_Allocation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Topic Modeling - Latent Dirichlet Allocation 실습**

## **1. 잠재 디리클레 할당(LDA)**

In [20]:
docs_ls = ["Cute kitty",
          "Eat rice or cake",
          "Kitty and hamster",
          "Eat bread",
          "Rice, bread and cake",
          "Cute hamster eats bread and cake"]

### **1-1. 데이터 전처리**

In [21]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [22]:
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

wl = WordNetLemmatizer()

# 문장 전처리
pos_docs = []
for line in docs_ls:
    doc = line.split(" ")
    tmp_docs = []
    for word in doc:
        # 소문자화, Lemmatize
        tmp_docs.append(wl.lemmatize(word.lower(), pos = 'v' or 'n'))
    # 영어 품사 부착(PoS Tagging)
    pos_docs.append(pos_tag(tmp_docs))

pos_docs

[[('cute', 'NN'), ('kitty', 'NN')],
 [('eat', 'NN'), ('rice', 'NN'), ('or', 'CC'), ('cake', 'VB')],
 [('kitty', 'NNS'), ('and', 'CC'), ('hamster', 'NN')],
 [('eat', 'NN'), ('bread', 'NN')],
 [('rice,', 'NN'), ('bread', 'NN'), ('and', 'CC'), ('cake', 'NN')],
 [('cute', 'NN'),
  ('hamster', 'NN'),
  ('eat', 'NN'),
  ('bread', 'NN'),
  ('and', 'CC'),
  ('cake', 'NN')]]

In [23]:
# 불용어 처리(stopWord)
stopPos = ['CC']
stopWord = [',']

docs_token = []
tokens = []

for pos_doc in pos_docs:
    doc_token_tmp = []
    for pos_token in pos_doc:
        # 불용 품사 지정
        if pos_token[1] not in stopPos:
            # 불용어 지정
            if pos_token[0] not in stopWord:
                doc_token_tmp.append(pos_token[0])
                tokens.append(pos_token[0])
    # 문서 사용 단어
    docs_token.append(doc_token_tmp)
# 전체 문서 단어
tokens = list(set(tokens))

docs_token, tokens

([['cute', 'kitty'],
  ['eat', 'rice', 'cake'],
  ['kitty', 'hamster'],
  ['eat', 'bread'],
  ['rice,', 'bread', 'cake'],
  ['cute', 'hamster', 'eat', 'bread', 'cake']],
 ['bread', 'cake', 'hamster', 'kitty', 'rice,', 'cute', 'rice', 'eat'])

### **1-2. LDA**

In [24]:
from random import randint

# 토픽 랜덤 설정
topic = 2 # 임의의 랜덤값
topic_set = []

# 토픽 랜덤 부여
for i in range(len(docs_token)):
    topic_count = [randint(1, topic) for a in range(len(docs_token[i]))]
    topic_set.append(topic_count)

topic_set

[[2, 1], [2, 1, 1], [1, 1], [1, 1], [2, 2, 1], [1, 2, 2, 2, 2]]

In [112]:
import numpy as np

# 문서 내 토픽 분포
alpha = 0.1 # 알파 값 부여

topic_doc = []
for i in range(len(topic_set)):
    tmp = []
    for j in range(1, topic+1):
        if j in topic_set[i]:
            tmp.append(topic_set[i].count(j) + alpha)
        else:
            tmp.append(0)
    topic_doc.append(tmp)

topic_doc

NameError: ignored

In [26]:
# 토픽 내 단어 분포
beta = 0.001 # 베타값 부여

topic_word = [[0 for a in range((len(tokens)))] for b in range(topic)]

for i in range(len(docs_token)):
    for j in range(len(docs_token[i])):
        for k in range(1, topic+1):
            if topic_set[i][j] == k:
                    topic_word[k-1][tokens.index(docs_token[i][j])] += 1

for i in range(len(topic_word)):
    for j in range(len(topic_word[i])):
        topic_word[i][j] += beta

topic_word

[[1.001, 2.001, 1.001, 2.001, 0.001, 1.001, 1.001, 1.001],
 [2.001, 1.001, 1.001, 0.001, 1.001, 1.001, 0.001, 2.001]]

In [27]:
# 합계
prob_td = []
prob_tw = []

# 문서내 토픽 확률
for i in range(len(topic_doc)):
    td_total = np.sum(topic_doc[i])
    prob_tmp = []
    for j in range(len(topic_doc[i])):
        prob_tmp.append(topic_doc[i][j]/td_total)
    prob_td.append(prob_tmp)

# 토픽 내 단어 합계
for i in range(len(topic_word)):
    tw_total = np.sum(topic_doc[i])
    prob_tmp = []
    for j in range(len(topic_word[i])):
        prob_tmp.append(topic_word[i][j]/tw_total)
    prob_tw.append(prob_tmp)

prob_td, prob_tw

([[0.5, 0.5],
  [0.65625, 0.34375],
  [1.0, 0.0],
  [1.0, 0.0],
  [0.34375, 0.65625],
  [0.2115384615384616, 0.7884615384615385]],
 [[0.4549999999999999,
   0.9095454545454544,
   0.4549999999999999,
   0.9095454545454544,
   0.00045454545454545455,
   0.4549999999999999,
   0.4549999999999999,
   0.4549999999999999],
  [0.6253124999999999,
   0.31281249999999994,
   0.31281249999999994,
   0.0003125,
   0.31281249999999994,
   0.31281249999999994,
   0.0003125,
   0.6253124999999999]])

In [28]:
# 토픽 부여 행렬
topic_result = []
for i in range(len(docs_token)):
    topic_prob = [[0 for a in range((topic))] for a in range(len(docs_token[i]))]
    topic_result.append(topic_prob)
topic_result

# LDA 계산
for i in range(len(topic_result)):
    for j in range(len(topic_result[i])):
        for k in range(topic):
                topic_result[i][j][k] = prob_td[i][k] * prob_tw[k][tokens.index(docs_token[i][j])]

topic_result

[[[0.22749999999999995, 0.15640624999999997],
  [0.4547727272727272, 0.00015625]],
 [[0.29859374999999994, 0.21495117187499999],
  [0.29859374999999994, 0.000107421875],
  [0.5968892045454545, 0.10752929687499999]],
 [[0.9095454545454544, 0.0], [0.4549999999999999, 0.0]],
 [[0.4549999999999999, 0.0], [0.4549999999999999, 0.0]],
 [[0.00015625, 0.20528320312499995],
  [0.15640624999999997, 0.410361328125],
  [0.31265624999999997, 0.20528320312499995]],
 [[0.09625, 0.24664062499999997],
  [0.09625, 0.24664062499999997],
  [0.09625, 0.4930348557692308],
  [0.09625, 0.4930348557692308],
  [0.19240384615384618, 0.24664062499999997]]]

In [29]:
# 최종 토픽 할당
LDA_result = []
for i in range(len(docs_token)):
    LDA_result.append([[0 for a in range((topic))] for a in range(len(docs_token[i]))])

for i in range(len(topic_result)):
    for j in range(len(topic_result[i])):
        LDA_result[i][j] = topic_result[i][j].index(np.max(topic_result[i][j])) + 1

LDA_result

[[1, 1], [1, 1, 1], [1, 1], [1, 1], [2, 2, 1], [2, 2, 2, 2, 2]]

## **2. 잠재 디리클레 할당(LDA) 클래스화**

In [5]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

### **2-1. LDA 클래스**

In [204]:
from random import randint
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

import numpy as np
import pandas as pd

class LDA():
    def __init__(self, topic):
        self.topic = topic
        self.pos_docs = []
        self.docs_token = []
        self.tokens = []
        self.topic_set = []
        self.topic_doc = []
        self.topic_word = []
        self.prob_td = []
        self.prob_tw = []
        self.topic_result = []
        self.LDA_result = []
        self.result_word = []
        self.result_topic = []

        # 불용어, 불용품사
        self.stopPos = ['CC', 'IN', 'TO']
        self.stopWord = [',', '.', 'be']

    # 문장 전처리
    def preprocessing_data(self, docs_ls):
        wl = WordNetLemmatizer()
        for line in docs_ls:
            doc = line.split(" ")
            tmp_docs = []
            for word in doc:
                # 소문자화, Lemmatize
                tmp_docs.append(wl.lemmatize(word.lower(), pos = 'v' or 'n'))
            # 영어 품사 부착(PoS Tagging)
            self.pos_docs.append(pos_tag(tmp_docs))

        return self.pos_docs

    # 토큰화
    def tokenize(self):
        for pos_doc in self.pos_docs:
            doc_token_tmp = []
            for pos_token in pos_doc:
                # 불용 품사 지정
                if pos_token[1] not in self.stopPos:
                    # 불용어 지정
                    if pos_token[0] not in self.stopWord:
                        doc_token_tmp.append(pos_token[0])
                        self.tokens.append(pos_token[0])
                # 문서 사용 단어
            self.docs_token.append(doc_token_tmp)
            # 전체 문서 단어
        self.tokens = list(set(self.tokens))

        return self.docs_token, self.tokens

    # 토픽 설정
    def give_topic(self):
        tmp_topic_set = []
        for i in range(len(self.docs_token)):
            topic_count = [randint(1, self.topic) for a in range(len(self.docs_token[i]))]
            tmp_topic_set.append(topic_count)
        self.topic_set = tmp_topic_set

        return self.topic_set

    # 문서 내 토픽 분포
    def chk_topicdoc(self, alpha):
        tmp_topic_doc = []
        for i in range(len(self.topic_set)):
            tmp = []
            for j in range(1, self.topic+1):
                if j in self.topic_set[i]:
                    tmp.append(self.topic_set[i].count(j) + alpha)
                else:
                    tmp.append('0')
            tmp_topic_doc.append(tmp)
        self.topic_doc = tmp_topic_doc

        return self.topic_doc

    # 토픽 내 단어 분포
    def chk_topicword(self, beta):
        self.topic_word = [[0 for a in range((len(self.tokens)))] for b in range(self.topic)]
        for i in range(len(self.docs_token)):
            for j in range(len(self.docs_token[i])):
                for k in range(1, self.topic+1):
                    if self.topic_set[i][j] == k:
                            self.topic_word[k-1][self.tokens.index(self.docs_token[i][j])] += 1
        for i in range(len(self.topic_word)):
            for j in range(len(self.topic_word[i])):
                self.topic_word[i][j] += beta
        
        return self.topic_word

    # 문서 내 토픽, 토픽 내 단어 확률
    def cal_probabilty(self):
        tmp_prob_td = []
        for i in range(len(self.topic_doc)):
            td_total = np.sum(self.topic_doc[i])
            prob_tmp = []
            for j in range(len(self.topic_doc[i])):
                prob_tmp.append(self.topic_doc[i][j]/td_total)
            tmp_prob_td.append(prob_tmp)
        self.prob_td = tmp_prob_td
        # 토픽 내 단어 합계
        tmp_prob_tw = []
        for i in range(len(self.topic_word)):
            tw_total = np.sum(self.topic_doc[i])
            prob_tmp = []
            for j in range(len(self.topic_word[i])):
                prob_tmp.append(self.topic_word[i][j]/tw_total)
            tmp_prob_tw.append(prob_tmp)
        self.prob_tw = tmp_prob_tw

        return self.prob_td, self.prob_tw

    # 토픽 부여 행렬
    def cal_LDA(self):
        # LDA 행렬 생성
        tmp_topic_result = []
        for i in range(len(self.docs_token)):
            topic_prob = [[0 for a in range((self.topic))] for a in range(len(self.docs_token[i]))]
            tmp_topic_result.append(topic_prob)
        self.topic_result = tmp_topic_result
        # LDA 계산
        for i in range(len(self.topic_result)):
            for j in range(len(self.topic_result[i])):
                for k in range(self.topic):
                    self.topic_result[i][j][k] = self.prob_td[i][k] * self.prob_tw[k][self.tokens.index(self.docs_token[i][j])]

        return self.topic_result

    # 결과 토픽 할당
    def result_LDA(self):
        tmp_LDA_result = []
        for i in range(len(self.docs_token)):
            tmp_LDA_result.append([[0 for a in range((self.topic))] for a in range(len(self.docs_token[i]))])
        self.LDA_result = tmp_LDA_result
        for i in range(len(self.topic_result)):
            for j in range(len(self.topic_result[i])):
                self.LDA_result[i][j] = self.topic_result[i][j].index(np.max(self.topic_result[i][j])) + 1
        
        return self.LDA_result
        
    # 결과 출력
    def print_result(self):
        for i in range(len(self.topic_result)):
            for j in range(len(self.topic_result[i])):
                self.result_word.append(self.docs_token[i][j])
                self.result_topic.append(self.LDA_result[i][j])

    # 자동 실행
    def run(self, docs_ls, alpha, beta):
        self.preprocessing_data(docs_ls)
        self.tokenize()
        self.give_topic()
        count_time = 0
        while True:
            self.chk_topicdoc(alpha)
            self.chk_topicword(beta)
            self.cal_probabilty()
            self.cal_LDA()
            self.result_LDA()
            count_time += 1
            if self.LDA_result == self.topic_set:
                self.print_result()
                print("반복 횟수 : {}회".format(count_time))
                break
            else:
                self.topic_set = self.LDA_result.copy()
        
        return pd.DataFrame([self.result_topic], columns=self.result_word)

### **2-1. LDA 클래스 결과 확인**

In [205]:
lda = LDA(4)

In [206]:
docs_ls = ["Major highways running across the capital Seoul were partly closed Thursday as heavy rains pushed up the water level of the city's Han River.",
            "Several sections of the Dongbu Urban Expressway, Seoul Inner Loop, Olympic-daero and Gangbyeon Northern Highway have been closed to traffic due to the inner city river's swelled water level, according to the police and the city of Seoul.",
            "As heavy rains continued to batter the country's metropolitan and central regions, authorities opened the floodgates of Soyang River Dam and Paldang Dam a day earlier, releasing the waters to empty into the sea through the Han River.",
            "Eleven people remained missing as of Thursday morning due to the cloudburst, while more than 1,600 people were displaced from their homes in areas hit hardest, such as North and South Chungcheong provinces, Gyeonggi Province and Gangwon Province.",
            "Three train lines, including Taebaek and Chungbuk, remained totally or partially out of service, while 39 roads across the affected areas were off limits due to mudslide damage from the torrential rains, according to authorities."
            "As of Thursday morning, more than 5,000 houses and facilities were reported flooded or damaged in the latest bout of heavy rains. Nearly 8,065 hectares of farm land have been inundated or ravaged."
            "The weather agency urged extra caution and safety measures, as the central and southern regions are expected to receive rainfall of up to 50 millimeters per hour with strong winds on Thursday and Friday."
          ]

In [207]:
lda.run(docs_ls, 0.1, 0.001)

반복 횟수 : 4회


Unnamed: 0,major,highways,run,the,capital,seoul,partly,close,thursday,heavy,rain,push,up,the.1,water,level,the.2,city's,han,river.,several,section,the.3,dongbu,urban,"expressway,",seoul.1,inner,"loop,",olympic-daero,gangbyeon,northern,highway,have,close.1,traffic,due,the.4,inner.1,city,...,"5,000",house,facilities,report,flood,damage,the.5,latest,bout,heavy.1,rains.,nearly,"8,065",hectares,farm,land,have.1,inundate,ravaged.the,weather,agency,urge,extra,caution,safety,"measures,",the.6,central,southern,regions,expect,receive,rainfall,50,millimeters,hour,strong,wind,thursday.1,friday.
0,4,3,1,3,3,1,2,1,3,3,3,3,4,3,4,3,3,1,3,4,1,2,3,2,3,2,1,3,3,3,4,4,4,3,1,1,1,3,3,3,...,2,2,2,2,2,3,3,2,2,3,1,2,3,1,3,1,3,3,3,4,4,3,1,4,4,4,3,3,2,3,2,3,4,3,2,4,1,3,3,3
