# TextRank - ver01

## 00. imports

In [1]:
# black formatting
%load_ext lab_black

In [2]:
import platform
from collections import Counter
from typing import Tuple

import numpy as np

# tokenizer import
from konlpy.tag import Okt, Komoran, Hannanum, Kkma

if platform.system() == "Windows":
    try:
        from eunjeon import Mecab
    except:
        print("please install eunjeon module")
else:  # Ubuntu일 경우
    from konlpy.tag import Mecab

from types_ import *

## 00. Common - `utils.py`

### 1) data load

In [3]:
# Load the raw text and split it on "\n"; every resulting piece is later
# treated as one sentence. A trailing newline in the file therefore yields
# a final empty string.
# The encoding is pinned because open() otherwise falls back to the
# platform's locale encoding (e.g. cp949 on Korean Windows), which makes
# the same file decode differently on Windows and Ubuntu.
# NOTE(review): assumes sents.txt is stored as UTF-8 — confirm.
with open("./sents.txt", "r", encoding="utf-8") as f:
    sents = f.read().split("\n")

### 2) get tokenizer & get tokens

In [4]:
def get_tokenizer(tokenizer_name):
    """Build the morphological analyzer selected by ``tokenizer_name``.

    Recognized names are "komoran", "okt", "mecab", "hannanum" and "kkma".
    Any other value falls back to Mecab.

    Returns:
        A freshly constructed tokenizer instance.
    """
    if tokenizer_name == "komoran":
        return Komoran()
    if tokenizer_name == "okt":
        return Okt()
    if tokenizer_name == "hannanum":
        return Hannanum()
    if tokenizer_name == "kkma":
        return Kkma()
    # "mecab" itself and every unrecognized name end up here.
    return Mecab()

In [5]:
# Quick check of the Mecab tagger on an unspaced sentence; the last
# expression is displayed by the notebook as (surface, POS tag) pairs.
tokenizer = get_tokenizer("mecab")
tokenizer.pos("아버지가방에들어가신다_")

[('아버지', 'NNG'),
 ('가', 'JKS'),
 ('방', 'NNG'),
 ('에', 'JKB'),
 ('들어가', 'VV'),
 ('신다', 'EP+EC'),
 ('_', 'SY')]

In [6]:
# # 각 tokenizer 별 명사, 형용사, 동사, 어근
# komoran_pos = ['/NN', '/XR', '/VA', '/VV']
# okt_pos = ['/Noun', '/Verb', '/Adjective']
# mecab_pos = ['']

In [7]:
def get_tokens(
    sents: List[str], noun=False, tokenizer="mecab"
) -> List[List[str]]:
    """Tokenize every sentence in ``sents``.

    Args:
        sents: Sentence strings; each one is handed to the tokenizer as-is.
        noun: When true, return only what the tokenizer's ``nouns`` method
            extracts instead of POS-tagged tokens.
        tokenizer: Either a tokenizer name understood by ``get_tokenizer``
            or an already-built tokenizer object exposing ``pos`` and
            ``nouns``. Passing an instance avoids constructing a new
            analyzer on every call.

    Returns:
        One token list per input sentence. With ``noun=False`` each token
        is formatted as ``"word/POS"``; with ``noun=True`` tokens are the
        bare strings returned by ``nouns``.
    """
    # Keep the parameter untouched and resolve names to an instance once.
    if isinstance(tokenizer, str):
        tagger = get_tokenizer(tokenizer)
    else:
        tagger = tokenizer

    if noun:
        return [tagger.nouns(sent) for sent in sents]

    return [
        [f"{word}/{pos}" for word, pos in tagger.pos(sent)] for sent in sents
    ]

In [8]:
# get_tokens(sents)

### 3) get vocab

In [15]:
# Tokenize the loaded sentences with get_tokens' defaults (noun=False,
# tokenizer="mecab"), i.e. one list of "word/POS" strings per sentence.
corpus = get_tokens(sents)

In [19]:
# Vocabulary filters used by the cells below: a token must occur at least
# min_count times and its surface form must be at least min_len characters.
min_count = 2
min_len = 2

In [22]:
# Tally every token in the corpus, then drop tokens that are too rare or
# whose surface form (the text before the first "/") is too short.
counter = Counter(tok for sent_tokens in corpus for tok in sent_tokens)
counter = {
    token: freq
    for token, freq in counter.items()
    if not (freq < min_count or len(token.partition("/")[0]) < min_len)
}

In [28]:
# Order the surviving tokens by descending frequency (ties keep their
# existing order) and build the reverse token -> index lookup.
idx_vocab = [
    token
    for token, _ in sorted(
        counter.items(), key=lambda pair: pair[1], reverse=True
    )
]
vocab_idx = dict(zip(idx_vocab, range(len(idx_vocab))))

In [31]:
# vocab_idx

In [32]:
# idx_vocab

In [33]:
def get_vocab(
    corpus: List[List[str]], min_count=2, min_len=2
) -> Tuple[List[str], Dict[str, int]]:
    """Build a frequency-ranked vocabulary from a tokenized corpus.

    Args:
        corpus: One list of tokens per sentence. Tokens may carry a POS
            suffix in the form ``"word/POS"``.
        min_count: Minimum number of occurrences for a token to be kept.
        min_len: Minimum length of the token's surface form, i.e. the
            text before the first ``"/"``.

    Returns:
        ``(idx_vocab, vocab_idx)``: the kept tokens sorted by descending
        frequency (equal counts stay in first-seen order), and a dict
        mapping each kept token to its position in that list.
    """
    counter = Counter(token for tokens in corpus for token in tokens)

    # most_common() sorts by descending count and is stable for ties, so
    # filtering afterwards gives the same order as filtering first.
    # NOTE: a surface form that itself contains "/" is measured only up
    # to its first "/".
    idx_vocab = [
        token
        for token, count in counter.most_common()
        if count >= min_count and len(token.split("/")[0]) >= min_len
    ]
    vocab_idx = {token: idx for idx, token in enumerate(idx_vocab)}
    return idx_vocab, vocab_idx