<a href="https://colab.research.google.com/github/2020-nlp-c/nlp-statisticsmodel/blob/master/HyeonminNam/200805_LDA_%EC%8B%A4%EC%8A%B5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 문서 토큰화

In [None]:
# 입력값
# Example corpus: one short English sentence per document.
doc_ls = ['Cute kitty', 'Eat rice or cake', 'Kitty and hamster', 'Eat bread', 'Rice, bread and cake', 'Cute hamster eats bread and cake']
# Constant added to every document-topic count (see doc_top).
alpha = 0.1
# Constant added to every token-topic count (see top_tok).
beta = 0.001

In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer 

# 토크나이저 함수
def tokenizer(doc):
    """Split a document into filtered, lemmatized and stemmed tokens.

    The text is split on runs of word characters, POS-tagged, and only
    tokens whose tag is in the allow-list below are kept. Each kept token
    is lemmatized and then stemmed.

    Args:
        doc: the document text (a single string).

    Returns:
        A list of processed token strings, in document order.
    """
    # Raw string: "\w" inside a normal literal is an invalid escape sequence.
    retokenize = RegexpTokenizer(r"[\w]+")
    lemmatizer = WordNetLemmatizer()
    ps = PorterStemmer()

    # POS tags to keep (a set, so the membership test is O(1))
    tag_lst = {
        'FW',  # foreign word
        'JJ', 'JJR', 'JJS',  # adjectives
        'NNP', 'NN', 'NNS', 'NNPS',  # nouns
        'RB', 'RBR', 'RBS', 'RP',  # adverbs (RP is grouped here as in the original list)
        'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',  # verbs
    }

    # Tokenize, tag, then keep only the words whose tag is allowed
    kept = [word for word, tag in pos_tag(retokenize.tokenize(doc)) if tag in tag_lst]

    # Lemmatize first, then stem the lemma
    return [ps.stem(lemmatizer.lemmatize(token)) for token in kept]



In [None]:
# 각 문서별로 토크나이즈 함수 적용해서 토큰 리스트 만들기
# Map each document's position in doc_ls to its token list.
doc_token = {position: tokenizer(text) for position, text in enumerate(doc_ls)}
doc_token

{0: ['cute', 'kitti'],
 1: ['eat', 'rice', 'cake'],
 2: ['kitti', 'hamster'],
 3: ['eat', 'bread'],
 4: ['rice', 'bread', 'cake'],
 5: ['cute', 'hamster', 'eat', 'bread', 'cake']}

## 임의의 토픽 배정

In [None]:
# 토픽 숫자 지정해주고 그에 맞게 랜덤하게 토픽 지정
import random

topic_num = 2
# For a reproducible walkthrough, replace the random draw with a fixed
# per-document assignment such as
# [[2,2],[1,1,1],[2,2],[1,1],[1,1,1],[2,2,1,1,1]].
for idx in doc_token:
    words = doc_token[idx]
    # One topic id in 1..topic_num per token
    assigned = [random.randint(1, topic_num) for _ in words]
    doc_token[idx] = list(zip(words, assigned))

In [None]:
doc_token

{0: [('cute', 2), ('kitti', 2)],
 1: [('eat', 1), ('rice', 2), ('cake', 2)],
 2: [('kitti', 1), ('hamster', 2)],
 3: [('eat', 1), ('bread', 1)],
 4: [('rice', 1), ('bread', 2), ('cake', 1)],
 5: [('cute', 1), ('hamster', 1), ('eat', 1), ('bread', 2), ('cake', 1)]}

## 데이터프레임 구축

In [None]:
import pandas as pd

# 토큰 전체에 대해서 데이터프레임 구축(문서, 토픽 정보 포함) 
# One row per token occurrence: (document id, token, assigned topic).
# The rows are collected first and the frame is built once; growing a frame
# with df.loc[n] = ... reallocates it on every insert.
rows = [
    [idx, token[0], token[1]]
    for idx, doc in doc_token.items()
    for token in doc
]
# dtype=object keeps the same all-object frame that row-by-row insertion into
# an empty frame produced, so later cells see the same dtypes.
df = pd.DataFrame(rows, columns=['doc', 'token', 'topic'], dtype=object)

In [None]:
df

Unnamed: 0,doc,token,topic
0,0,cute,2
1,0,kitti,2
2,1,eat,1
3,1,rice,2
4,1,cake,2
5,2,kitti,1
6,2,hamster,2
7,3,eat,1
8,3,bread,1
9,4,rice,1


In [None]:
# 문서별 토픽 분포 정보 데이터프레임
# Empty table: one row per topic id (1..topic_num), one column per document id.
df_topic = pd.DataFrame(index=range(1, topic_num + 1), columns=list(doc_token))

In [None]:
df_topic

Unnamed: 0,0,1,2,3,4,5
1,,,,,,
2,,,,,,


In [None]:
# df 바탕으로 토픽 분포 정보 갱신하는 함수
def doc_top(df, df_topic, alpha):
    """Fill ``df_topic`` in place with smoothed per-document topic counts.

    Args:
        df: token table with ``doc`` and ``topic`` columns, one row per token.
        df_topic: table whose index holds topic ids and whose columns hold
            document ids; every cell is overwritten.
        alpha: constant added to every count.

    Returns:
        None. ``df_topic`` is modified in place.
    """
    for topic in df_topic.index:
        in_topic = df['topic'] == topic
        for doc_id in df_topic.columns:
            count = int((in_topic & (df['doc'] == doc_id)).sum())
            # Write through the frame itself: a row yielded by iterrows() may
            # be a copy, in which case assigning to it changes nothing.
            df_topic.at[topic, doc_id] = count + alpha
            
doc_top(df, df_topic, alpha)

In [None]:
df_topic

Unnamed: 0,0,1,2,3,4,5
1,0.1,1.1,1.1,2.1,2.1,4.1
2,2.1,2.1,1.1,0.1,1.1,1.1


In [None]:
# 토큰별 토픽 분포 정보 데이터프레임
# Vocabulary in first-seen order. dict.fromkeys de-duplicates while keeping
# order, so the columns are the same on every run (a set's string order is not).
token_lst = list(dict.fromkeys(df['token']))
# Empty table: one row per topic id (1..topic_num), one column per token.
df_token = pd.DataFrame(columns = token_lst, index = range(1, topic_num+1))

In [None]:
df_token

Unnamed: 0,cake,kitti,eat,cute,hamster,bread,rice
1,,,,,,,
2,,,,,,,


In [None]:
# df 바탕으로 토큰별 토픽 분포 정보 갱신하는 함수
def top_tok(df, df_token, beta):
    """Fill ``df_token`` in place with smoothed per-topic token counts.

    Args:
        df: token table with ``token`` and ``topic`` columns, one row per token.
        df_token: table whose index holds topic ids and whose columns hold
            tokens; every cell is overwritten.
        beta: constant added to every count.

    Returns:
        None. ``df_token`` is modified in place.
    """
    for topic in df_token.index:
        in_topic = df['topic'] == topic
        for tok in df_token.columns:
            count = int((in_topic & (df['token'] == tok)).sum())
            # Write through the frame itself: a row yielded by iterrows() may
            # be a copy, in which case assigning to it changes nothing.
            df_token.at[topic, tok] = count + beta
            
top_tok(df, df_token, beta)

In [None]:
df_token

Unnamed: 0,cake,kitti,eat,cute,hamster,bread,rice
1,2.001,1.001,3.001,1.001,1.001,1.001,1.001
2,1.001,1.001,0.001,1.001,1.001,2.001,1.001


## 각 토큰의 토픽 갱신

In [None]:
import numpy as np


# Reassign topics token by token until a full sweep changes nothing.
# NOTE: each token takes the highest-scoring topic (argmax); nothing is sampled.
max_sweeps = 100  # safety cap so a run that never settles cannot loop forever
for _ in range(max_sweeps):
    column_topic = list(df['topic'])
    for idx in df.index:
        # Document and token of the current row
        document = df.at[idx, 'doc']
        token = df.at[idx, 'token']

        # Rebuild both count tables with the current row left out
        df_o = df.drop([idx])
        df_o_topic = pd.DataFrame(columns = list(doc_token.keys()), index = range(1, topic_num+1))
        df_o_token = pd.DataFrame(columns = token_lst, index = range(1, topic_num+1))

        doc_top(df_o, df_o_topic, alpha)
        top_tok(df_o, df_o_token, beta)

        # Score each topic as (topic share within the document) x (token share
        # within the topic) and keep the best one; ties keep the lowest topic id.
        best_topic = df.at[idx, 'topic']
        p = 0
        for topic in range(1, topic_num+1):
            a = df_o_topic.at[topic, document] / df_o_topic[document].sum()
            b = df_o_token.at[topic, token] / df_o_token.loc[topic].sum()
            if p < a*b:
                p = a*b
                best_topic = topic

        # A single label-based write: the chained form df.iloc[idx]['topic'] = ...
        # may assign to a temporary copy and leave df unchanged.
        df.at[idx, 'topic'] = best_topic

    # Stop once a whole sweep leaves every assignment unchanged
    if column_topic == list(df['topic']):
        break

# The summary tables are not read inside the loop, so refresh them once here
doc_top(df, df_topic, alpha)
top_tok(df, df_token, beta)

In [None]:
df

Unnamed: 0,doc,token,topic
0,0,cute,1
1,0,kitti,1
2,1,eat,1
3,1,rice,1
4,1,cake,1
5,2,kitti,1
6,2,hamster,1
7,3,eat,1
8,3,bread,2
9,4,rice,1


In [None]:
df_token

Unnamed: 0,cake,kitti,eat,cute,hamster,bread,rice
1,3.001,2.001,3.001,2.001,2.001,0.001,2.001
2,0.001,0.001,0.001,0.001,0.001,3.001,0.001


In [None]:
df_topic

Unnamed: 0,0,1,2,3,4,5
1,2.1,3.1,2.1,1.1,2.1,4.1
2,0.1,0.1,0.1,1.1,1.1,1.1


## 토픽별로 결과 출력

In [None]:
#상위 5개
num = 5
for topic_id, weights in df_token.iterrows():
    # Highest weight first, as an ordered token -> weight mapping
    ranked = weights.sort_values(ascending=False).to_dict()
    total = sum(ranked.values())
    # Share of the row total for the first `num` tokens
    top_terms = [
        (term, round(weight / total, 4))
        for term, weight in list(ranked.items())[:num]
    ]
    print('Topic {}: {}'.format(topic_id, top_terms))

Topic 1: [('eat', 0.2143), ('cake', 0.2143), ('rice', 0.1429), ('hamster', 0.1429), ('cute', 0.1429)]
Topic 2: [('bread', 0.998), ('rice', 0.0003), ('hamster', 0.0003), ('cute', 0.0003), ('eat', 0.0003)]


## LDA 클래스화(Train까지 완성)

In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import random
import pandas as pd
import numpy as np


class LDA:
    """Small LDA-style topic model for a handful of short English documents.

    Topic ids run from 1 to ``topic_num``. Training starts from random topic
    assignments and then repeatedly reassigns every token to the topic with
    the highest smoothed (document-topic share) x (topic-token share) score,
    until a full sweep changes nothing. The update takes the argmax; nothing
    is sampled.
    """

    # POS tags kept by tokenizer()
    _KEEP_TAGS = frozenset([
        'FW',  # foreign word
        'JJ', 'JJR', 'JJS',  # adjectives
        'NNP', 'NN', 'NNS', 'NNPS',  # nouns
        'RB', 'RBR', 'RBS', 'RP',  # adverbs (RP is grouped here as in the original list)
        'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',  # verbs
    ])

    def __init__(self, doc_ls, alpha, beta):
        """Store the corpus and the two smoothing constants.

        Args:
            doc_ls: iterable of document strings.
            alpha: constant added to every document-topic count.
            beta: constant added to every token-topic count.
        """
        self.doc_ls = doc_ls
        self.alpha = alpha
        self.beta = beta

    def tokenizer(self, doc):
        """Split ``doc`` into filtered, lemmatized and stemmed tokens.

        Only tokens whose POS tag is in ``_KEEP_TAGS`` are kept.

        Args:
            doc: the document text (a single string).

        Returns:
            A list of processed token strings, in document order.
        """
        # Raw string: "\w" inside a normal literal is an invalid escape sequence.
        retokenize = RegexpTokenizer(r"[\w]+")
        lemmatizer = WordNetLemmatizer()
        ps = PorterStemmer()

        kept = [
            word
            for word, tag in pos_tag(retokenize.tokenize(doc))
            if tag in self._KEEP_TAGS
        ]
        return [ps.stem(lemmatizer.lemmatize(token)) for token in kept]

    def top_tok(self, df, df_token, beta):
        """Fill ``df_token`` (topics x tokens) in place with counts plus ``beta``."""
        for topic in df_token.index:
            in_topic = df['topic'] == topic
            for tok in df_token.columns:
                count = int((in_topic & (df['token'] == tok)).sum())
                # Write through the frame: a row yielded by iterrows() may be
                # a copy, in which case assigning to it changes nothing.
                df_token.at[topic, tok] = count + beta

    def doc_top(self, df, df_topic, alpha):
        """Fill ``df_topic`` (topics x documents) in place with counts plus ``alpha``."""
        for topic in df_topic.index:
            in_topic = df['topic'] == topic
            for doc_id in df_topic.columns:
                count = int((in_topic & (df['doc'] == doc_id)).sum())
                df_topic.at[topic, doc_id] = count + alpha

    def train(self, doc_ls=None, topic_num=2, num=5, max_iter=100):
        """Fit topic assignments and print the top tokens of each topic.

        Args:
            doc_ls: documents to train on; defaults to the list given to the
                constructor.
            topic_num: number of topics (ids 1..topic_num).
            num: how many top tokens to print per topic.
            max_iter: upper bound on the number of full sweeps, so a run that
                never settles still terminates.

        Returns:
            None. One ``Topic <id>: [(token, share), ...]`` line is printed per
            topic, and the final tables are kept on ``self.df``,
            ``self.df_topic`` and ``self.df_token``.
        """
        if doc_ls is None:
            doc_ls = self.doc_ls
        # Use the instance's smoothing constants, not module-level globals
        alpha, beta = self.alpha, self.beta
        topics = range(1, topic_num + 1)

        # One row per token occurrence, with a random initial topic
        doc_ids = []
        rows = []
        for doc_id, text in enumerate(doc_ls):
            doc_ids.append(doc_id)
            for token in self.tokenizer(text):
                rows.append((doc_id, token, random.randint(1, topic_num)))
        df = pd.DataFrame(rows, columns=['doc', 'token', 'topic'])

        # Vocabulary in first-seen order, so column order is the same every run
        token_lst = list(dict.fromkeys(df['token']))

        for _ in range(max_iter):
            before = df['topic'].tolist()
            for idx in df.index:
                document = df.at[idx, 'doc']
                token = df.at[idx, 'token']

                # Count tables rebuilt with the current row left out
                df_o = df.drop(index=idx)
                df_o_topic = pd.DataFrame(columns=doc_ids, index=topics)
                df_o_token = pd.DataFrame(columns=token_lst, index=topics)
                self.doc_top(df_o, df_o_topic, alpha)
                self.top_tok(df_o, df_o_token, beta)

                # Keep the best-scoring topic; ties keep the lowest topic id
                doc_total = df_o_topic[document].sum()
                best_topic = df.at[idx, 'topic']
                best_p = 0
                for topic in topics:
                    topic_total = df_o_token.loc[topic].sum()
                    if not doc_total or not topic_total:
                        # Zero smoothing and no other tokens: nothing to score
                        continue
                    a = df_o_topic.at[topic, document] / doc_total
                    b = df_o_token.at[topic, token] / topic_total
                    if best_p < a * b:
                        best_p = a * b
                        best_topic = topic

                # A single label-based write; chained df.iloc[idx]['topic'] = ...
                # may assign to a temporary copy.
                df.at[idx, 'topic'] = best_topic

            # Converged: a whole sweep changed nothing
            if before == df['topic'].tolist():
                break

        # Final summary tables, computed once after the sweeps
        df_topic = pd.DataFrame(columns=doc_ids, index=topics)
        df_token = pd.DataFrame(columns=token_lst, index=topics)
        self.doc_top(df, df_topic, alpha)
        self.top_tok(df, df_token, beta)
        self.df, self.df_topic, self.df_token = df, df_topic, df_token

        # Print the top `num` tokens per topic; the loop names are distinct so
        # the header shows the topic id rather than a token.
        for topic, row in df_token.iterrows():
            ranked = row.sort_values(ascending=False)
            total = sum(ranked.to_dict().values())
            top = [
                (tok, round(float(weight) / total, 4) if total else 0.0)
                for tok, weight in ranked.head(num).items()
            ]
            print('Topic {}: {}'.format(topic, top))

## 클래스 테스트

In [None]:
# 입력값
# Example corpus used to exercise the LDA class: one short sentence per document.
doc_ls = ['Cute kitty', 'Eat rice or cake', 'Kitty and hamster', 'Eat bread', 'Rice, bread and cake', 'Cute hamster eats bread and cake']
# Smoothing constants handed to the LDA constructor below.
alpha = 0.1
beta = 0.001

In [None]:
lda = LDA(doc_ls, alpha, beta)

In [None]:
lda.train(doc_ls, 2)

Topic cute: [('cake', 0.2726), ('eat', 0.2726), ('bread', 0.2726), ('kitti', 0.1818), ('cute', 0.0001)]
Topic kitti: [('cute', 0.3331), ('rice', 0.3331), ('hamster', 0.3331), ('cake', 0.0002), ('kitti', 0.0002)]
