In [31]:
import os
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from konlpy.tag import Komoran

In [32]:
# Directory layout, resolved relative to the parent of the current working
# directory (the notebook is expected to run from a sub-folder of the project).
project_root = os.path.dirname(os.getcwd())
data_dir = os.path.join(project_root, 'data')

# Documents are read from <project_root>/data/processed ...
input_dir = os.path.join(data_dir, 'processed')
# ... and the pickled results are written to <project_root>/data.
output_dir = data_dir

In [33]:
class Tokenizer:
    """Callable that turns a sentence into a list of ``'word/tag'`` tokens.

    Only morphemes whose POS tag starts with ``'NN'`` are kept (presumably the
    noun tags of the tagger's tag set -- confirm against the tagger in use).
    """

    def __init__(self, tagger):
        """Store the tagger; it must expose ``pos(text)`` yielding ``(word, tag)`` pairs."""
        self.tagger = tagger

    def __call__(self, sent):
        """Tag ``sent`` and return the ``'word/tag'`` strings of the ``NN*`` morphemes."""
        tokens = []
        for word, tag in self.tagger.pos(sent):
            # Drop every morpheme that is not tagged NN*.
            if not tag.startswith('NN'):
                continue
            tokens.append('{}/{}'.format(word, tag))
        return tokens

# One Komoran instance, shared by the tokenizer below and by later cells.
tagger = Komoran()

# TF-IDF over 1- and 2-grams of the tokens produced by Tokenizer; terms that
# occur in more than 60% of the documents are discarded (max_df=0.6).
noun_tokenizer = Tokenizer(tagger)
vectorizer = TfidfVectorizer(
    tokenizer=noun_tokenizer,
    ngram_range=(1, 2),
    max_df=0.6,
)

In [34]:
def _read_document(path):
    """Parse one processed document file into ``(title, contents)``.

    Expected layout: a line ``@title``, the title line, a line ``@content``,
    then the content lines until end of file.

    Returns:
        tuple: the stripped title string and the list of stripped content lines.

    Raises:
        ValueError: if either marker line is missing or misplaced.
    """
    # NOTE(review): the file is opened with the platform default encoding;
    # pass ``encoding=`` explicitly once the corpus encoding is confirmed.
    with open(path) as f:
        # The marker lines are read outside of any ``assert`` so the reads
        # still happen (and are still validated) when Python runs with -O.
        title_marker = f.readline()
        if title_marker != '@title\n':
            raise ValueError(
                "{}: expected '@title' on line 1, got {!r}".format(path, title_marker))
        title = f.readline().strip()

        content_marker = f.readline()
        if content_marker != '@content\n':
            raise ValueError(
                "{}: expected '@content' on line 3, got {!r}".format(path, content_marker))
        contents = [line.strip() for line in f]

    return title, contents


documents = []
# os.listdir returns entries in arbitrary order; sort so the document order
# (and therefore the row order of the DTM built later) is reproducible.
for filename in sorted(os.listdir(input_dir)):
    path = os.path.join(input_dir, filename)
    # Skip sub-directories (e.g. .ipynb_checkpoints) instead of crashing in open().
    if not os.path.isfile(path):
        continue

    title, contents = _read_document(path)
    # POS-tag every content line with the shared tagger.
    tags = [tagger.pos(sent) for sent in contents]

    documents.append({'title': title, 'contents': contents, 'tags': tags})

In [35]:
# Minimum TF-IDF weight a term needs in a document to be kept as a keyword.
MIN_KEYWORD_WEIGHT = 0.01

# build DTM: one TF-IDF row per document, fitted on each document's content
# lines joined with single spaces
dt_matrices = vectorizer.fit_transform([' '.join(doc['contents']) for doc in documents])

# build vocabulary: terms ordered by their column index in the DTM
idx2vocab = [vocab for vocab, idx in sorted(vectorizer.vocabulary_.items(), key=lambda x: x[1])]

# add additional information to documents
for doc, dtm in zip(documents, dt_matrices):
    # add DTM (this document's row of the matrix)
    doc['dtm'] = dtm

    # add sorted keywords: (term, weight) pairs, heaviest first.
    # Only the stored (non-zero) entries of the sparse row are visited, and the
    # list is sorted once -- the previous version densified the row and
    # re-sorted the whole list on every vocabulary index.
    row = dtm.tocoo()
    keywords = [
        (idx2vocab[idx], w)
        for idx, w in zip(row.col, row.data)
        if w > MIN_KEYWORD_WEIGHT
    ]
    keywords.sort(key=lambda word: word[1], reverse=True)
    doc['keywords'] = keywords

In [38]:
# Persist the enriched documents and the index-ordered vocabulary as pickle
# files in the output directory.
for pkl_name, payload in (('documents.pkl', documents), ('vocab.pkl', idx2vocab)):
    with open(os.path.join(output_dir, pkl_name), 'wb') as out_file:
        pickle.dump(payload, out_file)