<a href="https://colab.research.google.com/github/2020-nlp-c/nlp-statisticsmodel/blob/master/HTLim/NLP_topic_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

토픽모델링

In [1]:
# TDM 생성


In [2]:
# SVD 특이값 분해

In [3]:
import pandas as pd
import numpy as np

from numpy.linalg import norm
from sklearn.decomposition import TruncatedSVD

In [4]:
class tomo():

    def __init__(self):
        self.doc = None
        self.token = set([])
        self.tdm_ = pd.DataFrame([], columns=['token'])
        self.doc_vector = None
        self.word_vector = None
        self.eigen_vector = None

    # data 추가
    def add_data(self, doc):
        data_df = []
        for idx, data in enumerate(doc):
            data_dict = {}   
            data_dict['title'] = 'doc{}'.format(idx+1)
            data_dict['doc'] = data
            data_df.append(data_dict)
        self.doc = pd.DataFrame(data_df)

        # tdm 데이터 setting
        for title in self.doc.title:
            self.tdm_[title] = 0

        # 토큰분리
        self.tokenizer()

    # 토크나이저
    def tokenizer(self):
        for doc in self.doc.doc:
            token = doc.split()
            self.token = set(self.token).union(set(token))

    # tdm 계산
    def tdm(self):
        self.tdm_ = self.tdm_.iloc[0:0]
        for token in self.token:
            data_dict = {}
            data_dict['token'] = token
            for index, data in self.doc.iterrows():

                count = data.doc.split().count(token)
                data_dict[data.title] = count

            self.tdm_ = self.tdm_.append(data_dict, True)

        self.tdm_ = self.tdm_.T.rename(columns=self.tdm_.T.iloc[0]).drop(self.tdm_.T.index[0])
    
    # 특이값분해
    def svd(self, dim=5):
        # TruncatedSVD를 사용해서 차원축소를 해보자. 여기서 k는 12로 두었다. n_iter는 랜덤 SVD 계산기의 반복횟수이다.
        svd = TruncatedSVD(n_components=dim, n_iter=5)

        # 모델을 데이터에 맞추는 작업
        svd.fit(np.array(self.tdm_))

        # 다음의 코드를 통해 각각 행렬 U, S, V^T 를 변수에 담아낼 수 있다.
        U = svd.fit_transform(np.array(self.tdm_))
        Sigma = svd.explained_variance_ratio_
        VT = svd.components_
        print(U.shape, Sigma.shape, VT.shape)
        print(Sigma)

        self.doc_vector = U
        self.word_vector = VT.T
        self.eigen_vector = Sigma

    def word_simiarity(self):
        word_sim_df = self.tdm_.iloc[0:0]
        tokens = self.tdm_.columns

        for idx, vector1 in enumerate(self.word_vector):
            sim_list = []
            for vector2 in self.word_vector:
                sim = self.cal_cos_simiarity(vector1, vector2)
                sim_list.append(sim)
            word_sim_df = word_sim_df.append(pd.Series(sim_list, index=tokens), ignore_index=True)
        word_sim_df.index = tokens
        return word_sim_df

    def doc_simiarity(self):
        doc_sim_df = self.tdm_.T.iloc[0:0]
        tokens = self.tdm_.T.columns

        for idx, vector1 in enumerate(self.doc_vector):
            sim_list = []
            for vector2 in self.doc_vector:
                sim = self.cal_cos_simiarity(vector1, vector2)
                sim_list.append(sim)
            doc_sim_df = doc_sim_df.append(pd.Series(sim_list, index=tokens), ignore_index=True)
        doc_sim_df.index = tokens
        return doc_sim_df

    def word_doc_simiarity(self):
        sim_df = self.tdm_.iloc[0:0]
        column_name = self.tdm_.columns
        index_ = self.tdm_.T.columns

        for idx, vector1 in enumerate(self.doc_vector):
            sim_list = []
            for vector2 in self.word_vector:
                sim = self.cal_cos_simiarity(vector1, vector2)
                sim_list.append(sim)
            sim_df = sim_df.append(pd.Series(sim_list, index=column_name), ignore_index=True)
        sim_df.index = index_
        return sim_df

    def cal_cos_simiarity(self, vec1, vec2):
        return np.dot(vec1, vec2)/(norm(vec1)*norm(vec2))


In [5]:
# docs
doc_ls = ['바나나 사과 포도 포도 짜장면',
          '사과 포도',
          '포도 바나나',
          '짜장면 짬뽕 탕수육',
          '볶음밥 탕수육',
          '짜장면 짬뽕',
          '라면 스시',
          '스시 짜장면',
          '가츠동 스시 소바',
          '된장찌개 김치찌개 김치',
          '김치 된장 짜장면',
          '비빔밥 김치']

In [6]:
# 클래스 생성
tm = tomo()

# 데이터 추가
tm.add_data(doc_ls)
tm.doc

Unnamed: 0,title,doc
0,doc1,바나나 사과 포도 포도 짜장면
1,doc2,사과 포도
2,doc3,포도 바나나
3,doc4,짜장면 짬뽕 탕수육
4,doc5,볶음밥 탕수육
5,doc6,짜장면 짬뽕
6,doc7,라면 스시
7,doc8,스시 짜장면
8,doc9,가츠동 스시 소바
9,doc10,된장찌개 김치찌개 김치


In [7]:
tm.token

{'가츠동',
 '김치',
 '김치찌개',
 '된장',
 '된장찌개',
 '라면',
 '바나나',
 '볶음밥',
 '비빔밥',
 '사과',
 '소바',
 '스시',
 '짜장면',
 '짬뽕',
 '탕수육',
 '포도'}

In [8]:
tm.tdm()
tm.tdm_

Unnamed: 0,탕수육,짜장면,소바,짬뽕,볶음밥,김치,된장,김치찌개,포도,스시,된장찌개,라면,가츠동,비빔밥,사과,바나나
doc1,0,1,0,0,0,0,0,0,2,0,0,0,0,0,1,1
doc2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
doc3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
doc4,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
doc5,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
doc6,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
doc7,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
doc8,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
doc9,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0
doc10,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0


In [9]:
tm.svd()

(12, 5) (5,) (5, 16)
[0.21575689 0.17214826 0.16524827 0.13876802 0.0787858 ]


In [10]:
# 단어 유사도
tm.word_simiarity()

Unnamed: 0,탕수육,짜장면,소바,짬뽕,볶음밥,김치,된장,김치찌개,포도,스시,된장찌개,라면,가츠동,비빔밥,사과,바나나
탕수육,1.0,0.046527,0.118956,0.557777,0.940827,-0.066737,-0.455463,0.262514,-0.011381,-0.058536,0.262514,-0.060549,0.118956,-0.031851,-0.011381,-0.011381
짜장면,0.046527,1.0,-0.11395,0.753162,-0.25116,0.144381,0.640491,-0.21245,0.092433,0.153475,-0.21245,0.003247,-0.11395,-0.048837,0.092433,0.092433
소바,0.118956,-0.11395,1.0,-0.233856,0.236814,-0.014366,-0.270641,0.171395,0.024052,0.939605,0.171395,0.972697,1.0,0.039963,0.024052,0.024052
짬뽕,0.557777,0.753162,-0.233856,1.0,0.243521,-0.148902,0.184598,-0.275696,-0.186527,-0.092322,-0.275696,-0.217562,-0.233856,-0.292581,-0.186527,-0.186527
볶음밥,0.940827,-0.25116,0.236814,0.243521,1.0,-0.014242,-0.605402,0.421773,0.064104,-0.02814,0.421773,0.020456,0.236814,0.084954,0.064104,0.064104
김치,-0.066737,0.144381,-0.014366,-0.148902,-0.014242,1.0,0.66735,0.861625,-0.03577,-0.031085,0.861625,-0.040719,-0.014366,0.978599,-0.03577,-0.03577
된장,-0.455463,0.640491,-0.270641,0.184598,-0.605402,0.66735,1.0,0.198979,-0.14989,-0.027888,0.198979,-0.128181,-0.270641,0.514091,-0.14989,-0.14989
김치찌개,0.262514,-0.21245,0.171395,-0.275696,0.421773,0.861625,0.198979,1.0,0.053827,-0.013781,1.0,0.036885,0.171395,0.932924,0.053827,0.053827
포도,-0.011381,0.092433,0.024052,-0.186527,0.064104,-0.03577,-0.14989,0.053827,1.0,-0.034608,0.053827,-0.014803,0.024052,-0.002103,1.0,1.0
스시,-0.058536,0.153475,0.939605,-0.092322,-0.02814,-0.031085,-0.027888,-0.013781,-0.034608,1.0,-0.013781,0.987925,0.939605,-0.042321,-0.034608,-0.034608


In [11]:
# 문서 유사도
tm.doc_simiarity()

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10,doc11,doc12
doc1,1.0,0.947032,0.947032,0.231552,-0.006239,0.297324,0.010609,0.28679,0.001886,-0.002807,0.246941,0.006556
doc2,0.947032,1.0,1.0,-0.0167,0.019107,-0.014738,-0.029985,0.040349,-0.005955,0.009394,0.000486,-0.028138
doc3,0.947032,1.0,1.0,-0.0167,0.019107,-0.014738,-0.029985,0.040349,-0.005955,0.009394,0.000486,-0.028138
doc4,0.231552,-0.0167,-0.0167,1.0,0.563329,0.891784,-0.005877,0.531498,-0.026251,-0.039273,0.457001,-0.03372
doc5,-0.006239,0.019107,0.019107,0.563329,1.0,0.129123,-0.042754,-0.079941,0.059497,0.147648,-0.176668,-0.032374
doc6,0.297324,-0.014738,-0.014738,0.891784,0.129123,1.0,0.028267,0.692476,-0.051539,-0.112538,0.656785,-0.007939
doc7,0.010609,-0.029985,-0.029985,-0.005877,-0.042754,0.028267,1.0,0.719671,0.989973,-0.018245,0.036925,-0.034437
doc8,0.28679,0.040349,0.040349,0.531498,-0.079941,0.692476,0.719671,1.0,0.646253,-0.038832,0.556995,0.046224
doc9,0.001886,-0.005955,-0.005955,-0.026251,0.059497,-0.051539,0.989973,0.646253,1.0,0.02849,-0.031353,-0.018441
doc10,-0.002807,0.009394,0.009394,-0.039273,0.147648,-0.112538,-0.018245,-0.038832,0.02849,1.0,0.575168,0.974381


In [12]:
tm.word_doc_simiarity()

Unnamed: 0,탕수육,짜장면,소바,짬뽕,볶음밥,김치,된장,김치찌개,포도,스시,된장찌개,라면,가츠동,비빔밥,사과,바나나
doc1,0.004567,0.407301,-0.01469,0.071822,-0.022206,0.013757,0.069088,-0.019148,0.947032,0.017756,-0.019148,-0.01253,-0.01469,-0.017679,0.947032,0.947032
doc2,-0.011381,0.092433,0.024052,-0.186527,0.064104,-0.03577,-0.14989,0.053827,1.0,-0.034608,0.053827,-0.014803,0.024052,-0.002103,1.0,1.0
doc3,-0.011381,0.092433,0.024052,-0.186527,0.064104,-0.03577,-0.14989,0.053827,1.0,-0.034608,0.053827,-0.014803,0.024052,-0.002103,1.0,1.0
doc4,0.666241,0.765443,-0.075349,0.956566,0.388997,-0.004452,0.169945,-0.071276,-0.0167,0.021704,-0.071276,-0.09474,-0.075349,-0.131232,-0.0167,-0.0167
doc5,0.990486,-0.073682,0.168542,0.43826,0.978512,-0.046389,-0.522961,0.330996,0.019107,-0.047042,0.330996,-0.028533,0.168542,0.015119,0.019107,0.019107
doc6,0.256726,0.963795,-0.169828,0.901302,-0.066684,0.034721,0.496595,-0.25165,-0.014738,0.063644,-0.25165,-0.086048,-0.169828,-0.150753,-0.014738,-0.014738
doc7,-0.059141,0.118168,0.949513,-0.122239,-0.016671,-0.033439,-0.051745,-0.00179,-0.029985,0.999324,-0.00179,0.992953,0.949513,-0.037212,-0.029985,-0.029985
doc8,-0.006012,0.774469,0.524477,0.450195,-0.187853,0.077732,0.415256,-0.152484,0.040349,0.743979,-0.152484,0.634662,0.524477,-0.060118,0.040349,0.040349
doc9,0.028873,0.022786,0.984165,-0.164169,0.103256,-0.023246,-0.149103,0.078142,-0.005955,0.985393,0.078142,0.995605,0.984165,-0.002033,-0.005955,-0.005955
doc10,0.101593,-0.035419,0.081455,-0.220098,0.211377,0.964731,0.448788,0.964841,0.009394,-0.023245,0.964841,-0.001956,0.081455,0.990627,0.009394,0.009394
