In [82]:
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from konlpy.tag import Mecab

In [None]:
project_root = os.path.dirname(os.getcwd())
data_dir = os.path.join(project_root, 'data', 'processed')

In [83]:
tokenizer = Mecab(os.path.join(project_root, '.venv/lib/mecab/dic/mecab-ko-dic'))
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.5)

In [84]:
documents = []
originals = []
for filename in os.listdir(data_dir):
    with open(os.path.join(data_dir, filename)) as f:
        assert f.readline() == '@title\n'

        title = f.readline().strip()

        assert f.readline() == '@content\n'
        content = f.read()

        nouns = tokenizer.nouns(content)
        documents.append(' '.join(nouns))
        originals.append({'title': title, 'nouns': nouns})
        

In [85]:
vectorizer.fit(documents)
dtm = vectorizer.transform(documents)

In [86]:
idx2vocab = [vocab for vocab, idx in sorted(vectorizer.vocabulary_.items(), key=lambda x:x[1])]
words = []
for idx, w in enumerate(dtm[1].toarray().squeeze()):
    if w > 0.0:
        words.append((idx2vocab[idx], w))

words = sorted(words, key=lambda word: word[1])
words.reverse()

print(words)
print(originals[1])

[('가락지', 0.8324437557952904), ('봄바람', 0.11351505760844868), ('나비', 0.1115895944535558), ('흰나비', 0.10479242970913093), ('사방', 0.0935588792056974), ('봄날', 0.0788917873035908), ('정신', 0.07778160868061054), ('흰나비 눈물', 0.07567670507229912), ('햇볕 봄바람', 0.07567670507229912), ('하나 가락지', 0.07567670507229912), ('정신 무엇', 0.07567670507229912), ('무엇 가락지', 0.07567670507229912), ('날개 가락지', 0.07567670507229912), ('가락지 나비', 0.07567670507229912), ('가락지 가락지', 0.07567670507229912), ('바람', 0.07008685061286668), ('처음', 0.06471696275671526), ('방울', 0.05770987307004989), ('햇볕', 0.0557947972267779), ('눈물', 0.05194153930021694), ('날개', 0.048831758263065884), ('겨울', 0.04776891914728928), ('얼굴', 0.041449752191576344), ('흰나비 날개', 0.03783835253614956), ('햇빛 가락지', 0.03783835253614956), ('하루 햇볕', 0.03783835253614956), ('태양 정말', 0.03783835253614956), ('큰일 하나', 0.03783835253614956), ('큰일 가락지', 0.03783835253614956), ('처음 흰나비', 0.03783835253614956), ('처음 정신', 0.03783835253614956), ('처음 나비', 0.03783835253614956), ('처음 가락지