In [8]:
import json
import numpy as np
import os
import re
import unicodedata

from bokeh.models import HoverTool
from bokeh.palettes import Category20
from bokeh.io import show, output_notebook
from bokeh.plotting import figure, ColumnDataSource
from collections import Counter
from datetime import datetime
from konlpy.tag import Okt
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.manifold import TSNE
from tqdm import tqdm

okt = Okt()
current_path = os.getcwd()
dataset_path = os.path.join(current_path, 'data')

In [2]:
file_list = os.listdir(dataset_path)
file_list = [os.path.join(dataset_path, file) for _, file in enumerate(file_list) if '.txt' in file]

In [3]:
date_regex = r'^[\d]{4}년[\s0-9]{1,3}월[\s0-9]{1,3}일[\s][월화수목금토일]요일'
chat_date_regex = r'^[\d]{4}.[\s0-9]{1,3}.[\s0-9]{1,3}.[\s][오전후]{2}[\s][0-9]{1,2}:[0-9]{1,2}'
id_regex = r'^(.*)(?=:)'

In [4]:
chat_list = []
for _, file in enumerate(tqdm(file_list)):
    with open(file, 'r') as f:
        current_data = f.read()
    
    current_data = unicodedata.normalize('NFKC', current_data)
    splited_list = current_data.split('\n')
    # remove except 'chat data' then, remove the date
    splited_list = [re.sub(chat_date_regex, '', s) for _, s in enumerate(splited_list) if re.match(chat_date_regex, s)]
    # remove id
    splited_list = [re.sub(r'^(.*)(?=:)', '', s) for _, s in enumerate(splited_list)]
    # remove ':'
    splited_list = [re.sub(r'^[: ]+', '', s) for _, s in enumerate(splited_list)]
    chat_list.extend(splited_list)

100%|██████████| 23/23 [00:01<00:00, 13.86it/s]


In [5]:
#tag_set = set(['Noun', 'Verb', 'Adjective'])
tag_set = set(['Noun'])

stop_words = set(['채팅', '혹시', '선생님', '그냥', ])

In [6]:
token_list = []
for _, chat in enumerate(tqdm(chat_list)):
    words = okt.pos(chat, norm=True, stem=True)
    words = [w for (w, tag) in words if tag in tag_set and len(w) > 1 and w not in stop_words]
    if len(words) == 0:
        continue
    
    token_list.append({
        'chat': chat,
        'tokens': words,
    })

100%|██████████| 180929/180929 [07:57<00:00, 378.70it/s] 


In [7]:
word_counter = Counter()
for _, tokens in enumerate(token_list):
    word_counter.update(tokens['tokens'])

In [8]:
num_words = 20000
voca = word_counter.most_common(num_words)
word2id = {w:i for i, (w, _) in enumerate(voca)}

In [9]:
tdm = np.zeros((len(token_list), len(voca)), dtype=np.float32)
for i, chat in enumerate(tqdm(token_list)):
    for word in chat['tokens']:
        if word in word2id:
            tdm[i, word2id[word]] += 1

100%|██████████| 150271/150271 [00:02<00:00, 70075.18it/s]


In [10]:
tdm = TfidfTransformer().fit_transform(tdm)

In [15]:
K = 20
nmf = NMF(n_components=K, max_iter=5000)
W = nmf.fit_transform(tdm)
H = nmf.components_

In [16]:
for k in range(K):
    print(f"{k+1}th topic")
    keyword_indices = H[k, :].argsort()[::-1][:50]
    for index in keyword_indices:
        print(voca[index], end=' ')
    print()

1th topic
('오픈', 3166) ('바이오', 2625) ('직종', 2432) ('제약', 2884) ('업계', 2875) ('위해', 2663) ('카카오', 296) ('시작', 1076) ('카톡', 232) ('룰루', 40) ('사가', 150) ('국내', 614) ('다른', 1630) ('사도', 92) ('시간', 1761) ('프로필', 70) ('톡방', 171) ('포지션', 288) ('이슈', 444) ('하나', 1539) ('링크', 144) ('챗방', 20) ('분위기', 530) ('기획', 52) ('부탁', 333) ('라벨', 307) ('연락', 1194) ('사면', 51) ('취준님', 368) ('개발', 289) ('여기', 1122) ('공유', 502) ('고민', 920) ('경험', 882) ('사실', 964) ('관련', 1224) ('다시', 1210) ('평균', 150) ('주심', 54) ('근무', 481) ('문제', 1096) ('이유', 523) ('아시', 1791) ('스터디', 508) ('대화', 177) ('계시', 238) ('중견', 57) ('대해', 488) ('우리', 491) ('관심', 202) 
2th topic
('교환', 7147) ('보유', 5163) ('신촌', 1592) ('세브란스', 1554) ('요청', 2201) ('이후', 1335) ('희망', 466) ('신분', 1263) ('강남', 586) ('안암', 279) ('부산', 542) ('하루', 798) ('날짜', 757) ('원함', 255) ('국립', 427) ('구합', 685) ('중순', 188) ('사이', 333) ('자리', 1285) ('연락', 1194) ('빈센트', 143) ('세브란스병원', 221) ('성심', 237) ('중앙대', 147) ('충북대', 150) ('대구', 176) ('완료', 471) ('해운대', 173) ('제외', 43

In [17]:
filtered_chat_list = []
for _, token in enumerate(token_list):
    filtered_chat_list.append(token['chat'])

In [19]:
# select random index
#selectNum = W.shape[0]
selectNum = 20000
randIndex = np.random.choice(W.shape[0], selectNum, replace=False)
randIndex.sort()

tsne = TSNE(n_components=2, init='pca', verbose=1)
W2d = tsne.fit_transform(W[randIndex, :])
topicIndex = [v.argmax() for v in W[randIndex, :]]

tools_to_show = 'hover,box_zoom,pan,save,reset,wheel_zoom'
p = figure(width=800, height=600, tools=tools_to_show)

source = ColumnDataSource(data={
    'x': W2d[:, 0],
    'y': W2d[:, 1],
    'id': [i for i in randIndex],
    'document': [filtered_chat_list[randInd][:100] for randInd in randIndex],  
    'topic': [str(i) for i in topicIndex],  # 토픽 번호
    'color': [Category20[K][i] for i in topicIndex]
})
p.circle(
    'x', 'y',
    source=source,
    legend_label='topic',
    color='color',
    fill_alpha=0.7,
    line_alpha=0.7)

p.legend.location = "top_left"
hover = p.select(dict(type=HoverTool))
hover.tooltips = [("Topic", "@topic"), ('id', '@id'), ("Article", "@document")]
hover.mode = 'mouse'

output_notebook()
show(p)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 20000 samples in 0.001s...
[t-SNE] Computed neighbors for 20000 samples in 0.610s...
[t-SNE] Computed conditional probabilities for sample 1000 / 20000
[t-SNE] Computed conditional probabilities for sample 2000 / 20000
[t-SNE] Computed conditional probabilities for sample 3000 / 20000
[t-SNE] Computed conditional probabilities for sample 4000 / 20000
[t-SNE] Computed conditional probabilities for sample 5000 / 20000
[t-SNE] Computed conditional probabilities for sample 6000 / 20000
[t-SNE] Computed conditional probabilities for sample 7000 / 20000
[t-SNE] Computed conditional probabilities for sample 8000 / 20000
[t-SNE] Computed conditional probabilities for sample 9000 / 20000
[t-SNE] Computed conditional probabilities for sample 10000 / 20000
[t-SNE] Computed conditional probabilities for sample 11000 / 20000
[t-SNE] Computed conditional probabilities for sample 12000 / 20000
[t-SNE] Computed conditional probabilities for sam