<a href="https://colab.research.google.com/github/2020-nlp-c/nlp-statisticsmodel/blob/master/jisang/12_Latent_Dirichlet_Allocation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Topic Modeling - Latent Dirichlet Allocation 실습**

## **1. 잠재 디리클레 할당(LDA)**

In [392]:
docs_ls = ["Cute kitty",
          "Eat rice or cake",
          "Kitty and hamster",
          "Eat bread",
          "Rice, bread and cake",
          "Cute hamster eats bread and cake"]

### **1-1. 데이터 전처리**

In [306]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [393]:
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

wl = WordNetLemmatizer()

# 문장 전처리
pos_docs = []
for line in docs_ls:
    doc = line.split(" ")
    tmp_docs = []
    for word in doc:
        # 소문자화, Lemmatize
        tmp_docs.append(wl.lemmatize(word.lower(), pos = 'v' or 'n'))
    # 영어 품사 부착(PoS Tagging)
    pos_docs.append(pos_tag(tmp_docs))

pos_docs

[[('cute', 'NN'), ('kitty', 'NN')],
 [('eat', 'NN'), ('rice', 'NN'), ('or', 'CC'), ('cake', 'VB')],
 [('kitty', 'NNS'), ('and', 'CC'), ('hamster', 'NN')],
 [('eat', 'NN'), ('bread', 'NN')],
 [('rice,', 'NN'), ('bread', 'NN'), ('and', 'CC'), ('cake', 'NN')],
 [('cute', 'NN'),
  ('hamster', 'NN'),
  ('eat', 'NN'),
  ('bread', 'NN'),
  ('and', 'CC'),
  ('cake', 'NN')]]

In [394]:
# 불용어 처리(stopWord)
stopPos = ['CC']
stopWord = [',']

docs_token = []
tokens = []

for pos_doc in pos_docs:
    doc_token_tmp = []
    for pos_token in pos_doc:
        # 불용 품사 지정
        if pos_token[1] not in stopPos:
            # 불용어 지정
            if pos_token[0] not in stopWord:
                doc_token_tmp.append(pos_token[0])
                tokens.append(pos_token[0])
    # 문서 사용 단어
    docs_token.append(doc_token_tmp)
# 전체 문서 단어
tokens = list(set(tokens))

docs_token, tokens

([['cute', 'kitty'],
  ['eat', 'rice', 'cake'],
  ['kitty', 'hamster'],
  ['eat', 'bread'],
  ['rice,', 'bread', 'cake'],
  ['cute', 'hamster', 'eat', 'bread', 'cake']],
 ['eat', 'rice', 'kitty', 'cute', 'hamster', 'cake', 'bread', 'rice,'])

### **1-2. LDA**

In [395]:
from random import randint

# 토픽 랜덤 설정
topic = 2 # 임의의 랜덤값
topic_set = []

# 토픽 랜덤 부여
for i in range(len(docs_token)):
    topic_count = [randint(1, topic) for a in range(len(docs_token[i]))]
    topic_set.append(topic_count)

topic_set

[[1, 1], [2, 2, 1], [1, 1], [2, 1], [1, 1, 1], [2, 1, 2, 2, 1]]

In [396]:
import numpy as np

# 문서 내 토픽 분포
alpha = 0.1 # 알파 값 부여

topic_doc = []
for i in range(len(topic_set)):
    tmp = []
    for j in range(1, topic+1):
        if j in topic_set[i]:
            tmp.append(topic_set[i].count(j) + alpha)
        else:
            tmp.append(0)
    topic_doc.append(tmp)

topic_doc

[[2.1, 0], [1.1, 2.1], [2.1, 0], [1.1, 1.1], [3.1, 0], [2.1, 3.1]]

In [397]:
# 토픽 내 단어 분포
beta = 0.001 # 베타값 부여

topic_word = [[0 for a in range((len(tokens)))] for b in range(topic)]

for i in range(len(docs_token)):
    for j in range(len(docs_token[i])):
        for k in range(1, topic+1):
            if topic_set[i][j] == k:
                    topic_word[k-1][tokens.index(docs_token[i][j])] += 1

for i in range(len(topic_word)):
    for j in range(len(topic_word[i])):
        topic_word[i][j] += beta

topic_word

[[0.001, 0.001, 2.001, 1.001, 2.001, 3.001, 2.001, 1.001],
 [3.001, 1.001, 0.001, 1.001, 0.001, 0.001, 1.001, 0.001]]

In [398]:
# 합계
prob_td = []
prob_tw = []

# 문서내 토픽 확률
for i in range(len(topic_doc)):
    td_total = np.sum(topic_doc[i])
    prob_tmp = []
    for j in range(len(topic_doc[i])):
        prob_tmp.append(topic_doc[i][j]/td_total)
    prob_td.append(prob_tmp)

# 토픽 내 단어 합계
for i in range(len(topic_word)):
    tw_total = np.sum(topic_doc[i])
    prob_tmp = []
    for j in range(len(topic_word[i])):
        prob_tmp.append(topic_word[i][j]/tw_total)
    prob_tw.append(prob_tmp)

prob_td, prob_tw

([[1.0, 0.0],
  [0.34375, 0.65625],
  [1.0, 0.0],
  [0.5, 0.5],
  [1.0, 0.0],
  [0.40384615384615385, 0.5961538461538461]],
 [[0.0004761904761904762,
   0.0004761904761904762,
   0.9528571428571427,
   0.47666666666666657,
   0.9528571428571427,
   1.429047619047619,
   0.9528571428571427,
   0.47666666666666657],
  [0.9378124999999999,
   0.31281249999999994,
   0.0003125,
   0.31281249999999994,
   0.0003125,
   0.0003125,
   0.31281249999999994,
   0.0003125]])

In [399]:
# 토픽 부여 행렬
topic_result = []
for i in range(len(docs_token)):
    topic_prob = [[0 for a in range((topic))] for a in range(len(docs_token[i]))]
    topic_result.append(topic_prob)
topic_result

# LDA 계산
for i in range(len(topic_result)):
    for j in range(len(topic_result[i])):
        for k in range(topic):
                topic_result[i][j][k] = prob_td[i][k] * prob_tw[k][tokens.index(docs_token[i][j])]

topic_result

[[[0.47666666666666657, 0.0], [0.9528571428571427, 0.0]],
 [[0.00016369047619047618, 0.6154394531249999],
  [0.00016369047619047618, 0.20528320312499995],
  [0.491235119047619, 0.000205078125]],
 [[0.9528571428571427, 0.0], [0.9528571428571427, 0.0]],
 [[0.0002380952380952381, 0.46890624999999997],
  [0.47642857142857137, 0.15640624999999997]],
 [[0.47666666666666657, 0.0],
  [0.9528571428571427, 0.0],
  [1.429047619047619, 0.0]],
 [[0.19249999999999998, 0.18648437499999995],
  [0.38480769230769224, 0.00018629807692307693],
  [0.0001923076923076923, 0.5590805288461538],
  [0.38480769230769224, 0.18648437499999995],
  [0.5771153846153846, 0.00018629807692307693]]]

In [400]:
# 최종 토픽 할당
LDA_result = []
LDA_prob = []

for i in range(len(docs_token)):
    LDA_result.append([[0 for a in range((topic))] for a in range(len(docs_token[i]))])

for i in range(len(topic_result)):
    for j in range(len(topic_result[i])):
        LDA_result[i][j] = topic_result[i][j].index(np.max(topic_result[i][j])) + 1

LDA_result, LDA_prob

([[1, 1], [2, 2, 1], [1, 1], [2, 1], [1, 1, 1], [1, 1, 2, 1, 1]], [])

In [412]:
result_word = []
result_topic = []
result_prob = []

for i in range(len(topic_result)):
    for j in range(len(topic_result[i])):
        result_word.append(docs_token[i][j])
        result_topic.append(LDA_result[i][j])

prob_tw, tokens
topic_idx = [str(i+1) for i in range(topic)]
df = pd.DataFrame(prob_tw, columns=tokens, index=topic_idx)
df

Unnamed: 0,eat,rice,kitty,cute,hamster,cake,bread,"rice,"
1,0.000476,0.000476,0.952857,0.476667,0.952857,1.429048,0.952857,0.476667
2,0.937812,0.312812,0.000313,0.312812,0.000313,0.000313,0.312812,0.000313


In [438]:
import matplotlib.pyplot as plt

# 데이터 길이에 대한 히스토그램 확인
plt.figure(figsize=(12, 5))

plt(df, label=df.columns)
plt.show()

TypeError: ignored

<Figure size 864x360 with 0 Axes>

In [430]:
predict_word = "cute kitty eat"

for word in predict_word.split(" "):
    tmp = list(df[word])
    print("{}의 주제는 토픽 {}입니다.".format(word, topic_idx[tmp.index(max(tmp))]))

cute의 주제는 토픽 1입니다.
kitty의 주제는 토픽 1입니다.
eat의 주제는 토픽 2입니다.


## **2. 잠재 디리클레 할당(LDA) 클래스화**

In [1]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

### **2-1. LDA 클래스**

In [77]:
from random import randint
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

import numpy as np
import pandas as pd

class LDA():
    def __init__(self, topic):
        self.topic = topic
        self.docs_token = []
        self.tokens = []
        self.topic_set = []
        self.topic_doc = []
        self.topic_word = []
        self.prob_td = []
        self.prob_tw = []
        self.topic_result = []
        self.LDA_result = []
        self.result_word = []
        self.result_topic = []

        # 불용어, 불용품사
        self.stopPos = ['IN', 'CC', 'UH', 'TO', 'MD', 'DT', 'VBZ','VBP']
        self.stopWord = ['.', ',','be','able']

    # 문장 전처리 및 토큰화
    def tokenize(self, docs_ls):
        pos_docs = []
        wl = WordNetLemmatizer()
        for line in docs_ls:
            doc = line.split(" ")
            tmp_docs = []
            for word in doc:
                # 소문자화, Lemmatize
                tmp_docs.append(wl.lemmatize(word.lower(), pos = 'v' or 'n'))
            # 영어 품사 부착(PoS Tagging)
            pos_docs.append(pos_tag(tmp_docs))
        # 토큰화
        for pos_doc in pos_docs:
            doc_token_tmp = []
            for pos_token in pos_doc:
                # 불용 품사 지정
                if pos_token[1] not in self.stopPos:
                    # 불용어 지정
                    if pos_token[0] not in self.stopWord:
                        doc_token_tmp.append(pos_token[0])
                        self.tokens.append(pos_token[0])
                # 문서 사용 단어
            self.docs_token.append(doc_token_tmp)
            # 전체 문서 단어
        self.tokens = list(set(self.tokens))

        return self.docs_token, self.tokens

    # 토픽 설정
    def give_topic(self):
        tmp_topic_set = []
        for i in range(len(self.docs_token)):
            topic_count = [randint(1, self.topic) for a in range(len(self.docs_token[i]))]
            tmp_topic_set.append(topic_count)
        self.topic_set = tmp_topic_set

        return self.topic_set

    # 문서 내 토픽 분포
    def chk_topicdoc(self, alpha):
        tmp_topic_doc = []
        for i in range(len(self.topic_set)):
            tmp = []
            for j in range(1, self.topic+1):
                if j in self.topic_set[i]:
                    tmp.append(self.topic_set[i].count(j) + alpha)
                else:
                    tmp.append('0')
            tmp_topic_doc.append(tmp)
        self.topic_doc = tmp_topic_doc

        return self.topic_doc

    # 토픽 내 단어 분포
    def chk_topicword(self, beta):
        self.topic_word = [[0 for a in range((len(self.tokens)))] for b in range(self.topic)]
        for i in range(len(self.docs_token)):
            for j in range(len(self.docs_token[i])):
                for k in range(1, self.topic+1):
                    if self.topic_set[i][j] == k:
                            self.topic_word[k-1][self.tokens.index(self.docs_token[i][j])] += 1
        for i in range(len(self.topic_word)):
            for j in range(len(self.topic_word[i])):
                self.topic_word[i][j] += beta
        
        return self.topic_word

    # 문서 내 토픽, 토픽 내 단어 확률
    def cal_probabilty(self):
        tmp_prob_td = []
        for i in range(len(self.topic_doc)):
            td_total = np.sum(self.topic_doc[i])
            prob_tmp = []
            for j in range(len(self.topic_doc[i])):
                prob_tmp.append(self.topic_doc[i][j]/td_total)
            tmp_prob_td.append(prob_tmp)
        self.prob_td = tmp_prob_td
        # 토픽 내 단어 합계
        tmp_prob_tw = []
        for i in range(len(self.topic_word)):
            tw_total = np.sum(self.topic_doc[i])
            prob_tmp = []
            for j in range(len(self.topic_word[i])):
                prob_tmp.append(self.topic_word[i][j]/tw_total)
            tmp_prob_tw.append(prob_tmp)
        self.prob_tw = tmp_prob_tw

        return self.prob_td, self.prob_tw

    # 토픽 부여 행렬
    def cal_LDA(self):
        # LDA 행렬 생성
        tmp_topic_result = []
        for i in range(len(self.docs_token)):
            topic_prob = [[0 for a in range((self.topic))] for a in range(len(self.docs_token[i]))]
            tmp_topic_result.append(topic_prob)
        self.topic_result = tmp_topic_result
        # LDA 계산
        for i in range(len(self.topic_result)):
            for j in range(len(self.topic_result[i])):
                for k in range(self.topic):
                    self.topic_result[i][j][k] = self.prob_td[i][k] * self.prob_tw[k][self.tokens.index(self.docs_token[i][j])]

        return self.topic_result

    # 결과 토픽 할당
    def result_LDA(self):
        tmp_LDA_result = []
        for i in range(len(self.docs_token)):
            tmp_LDA_result.append([[0 for a in range((self.topic))] for a in range(len(self.docs_token[i]))])
        self.LDA_result = tmp_LDA_result
        for i in range(len(self.topic_result)):
            for j in range(len(self.topic_result[i])):
                self.LDA_result[i][j] = self.topic_result[i][j].index(np.max(self.topic_result[i][j])) + 1
        
        return self.LDA_result
        
    # 결과 출력
    def print_result(self):
        for i in range(len(self.topic_result)):
            for j in range(len(self.topic_result[i])):
                self.result_word.append(self.docs_token[i][j])
                self.result_topic.append(self.LDA_result[i][j])
        
        return self.result_word, self.result_topic

    # 자동 실행
    def run(self, docs_ls, alpha, beta):
        self.tokenize(docs_ls)
        self.give_topic()
        count_time = 0
        # Iteration 과정 추가
        while True:
            self.chk_topicdoc(alpha)
            self.chk_topicword(beta)
            self.cal_probabilty()
            self.cal_LDA()
            self.result_LDA()
            count_time += 1
            if self.LDA_result == self.topic_set:
                self.print_result()
                print("반복 횟수 : {}회".format(count_time))
                break
            else:
                self.topic_set = self.LDA_result.copy()
    
        return pd.DataFrame([self.result_topic], columns=self.result_word)

    # 예측
    def predict_topic(self, pre_doc):
        # 토픽 내 단어 확률
        topic_idx = [str(i+1) for i in range(self.topic)]
        topic_count = []
        df = pd.DataFrame(self.prob_tw, columns=self.tokens, index=topic_idx)
        for word in pre_doc.split(" "):
            tmp = list(df[word])
            topic_count.append(topic_idx[tmp.index(max(tmp))])
            print("{}의 주제는 토픽 {}입니다.".format(word, topic_idx[tmp.index(max(tmp))]))
        # print("{}의 주제는 토픽 {}입니다".format(pre_doc, topic_count))

### **2-1. LDA 클래스 결과 확인**

In [78]:
lda = LDA(3)

In [79]:
docs_ls = ["Major highways running across the capital Seoul were partly closed Thursday as heavy rains pushed up the water level of the city's Han River.",
            "Several sections of the Dongbu Urban Expressway, Seoul Inner Loop, Olympic-daero and Gangbyeon Northern Highway have been closed to traffic due to the inner city river's swelled water level, according to the police and the city of Seoul.",
            "As heavy rains continued to batter the country's metropolitan and central regions, authorities opened the floodgates of Soyang River Dam and Paldang Dam a day earlier, releasing the waters to empty into the sea through the Han River.",
            "Eleven people remained missing as of Thursday morning due to the cloudburst, while more than 1,600 people were displaced from their homes in areas hit hardest, such as North and South Chungcheong provinces, Gyeonggi Province and Gangwon Province.",
            "Three train lines, including Taebaek and Chungbuk, remained totally or partially out of service, while 39 roads across the affected areas were off limits due to mudslide damage from the torrential rains, according to authorities.",
            "As of Thursday morning, more than 5,000 houses and facilities were reported flooded or damaged in the latest bout of heavy rains. Nearly 8,065 hectares of farm land have been inundated or ravaged.",
          ]

In [80]:
lda.run(docs_ls, 0.1, 0.001)

반복 횟수 : 2회


Unnamed: 0,major,highways,run,capital,seoul,partly,close,thursday,heavy,rain,push,up,water,level,city's,han,river.,several,section,dongbu,urban,"expressway,",seoul.1,inner,"loop,",olympic-daero,gangbyeon,northern,highway,close.1,traffic,due,inner.1,city,river's,water.1,"level,",accord,police,city.1,...,train,"lines,",taebaek,"chungbuk,",totally,partially,"service,",39,roads,affect,areas,off,limit,due.1,mudslide,damage,torrential,"rains,",accord.1,authorities.,thursday.1,"morning,",more,"5,000",house,facilities,report,flood,damage.1,latest,bout,heavy.1,rains.,nearly,"8,065",hectares,farm,land,inundate,ravaged.
0,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,2,3,2,1,2,3,2,1,1,1,2,1,2,3,1,1,1,1,1,1,1,2,1,3,1,...,2,1,3,3,3,2,2,1,2,3,3,3,2,1,2,1,3,3,3,1,1,1,1,1,1,1,2,2,1,1,1,1,1,3,2,3,3,3,1,1


In [81]:
predict = "highway seoul thursday"

In [82]:
lda.predict_topic(predict)

highway의 주제는 토픽 3입니다.
seoul의 주제는 토픽 1입니다.
thursday의 주제는 토픽 1입니다.
