# TextRank - ver02 using sklearn

## 00. imports

In [1]:
# black formatting
%load_ext lab_black

In [2]:
import platform
from collections import Counter

import numpy as np

# tokenizer import
from konlpy.tag import Okt, Komoran, Hannanum, Kkma

# Mecab comes from a different package depending on the operating system.
if platform.system() == "Windows":
    try:
        from eunjeon import Mecab
    except ImportError:
        # Best effort: only a missing package is tolerated here. `Mecab` stays
        # undefined in that case, so selecting it later raises NameError.
        print("please install eunjeon module")
else:  # non-Windows platforms (e.g. Ubuntu)
    from konlpy.tag import Mecab

from types_ import *

## 01. Common 

### 1) data load

In [3]:
# Read the whole corpus file and split it on newlines, one list entry per line.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used — confirm it matches how sents.txt was saved.
with open("./sents.txt", "r") as f:
    sents = f.read().split("\n")

### 2) get tokenizer & get tokens

In [4]:
def get_tokenizer(tokenizer_name):
    """Build a new tagger instance for the given name.

    Recognized names are "komoran", "okt", "mecab", "hannanum" and "kkma".
    Any other value falls back to Mecab.
    """
    if tokenizer_name == "komoran":
        return Komoran()
    if tokenizer_name == "okt":
        return Okt()
    if tokenizer_name == "hannanum":
        return Hannanum()
    if tokenizer_name == "kkma":
        return Kkma()
    # "mecab" and every unrecognized name both resolve to Mecab.
    return Mecab()

In [5]:
# tokenizer = get_tokenizer("mecab")
# tokenizer.pos("아버지가방에들어가신다_")

In [6]:
# # Noun, adjective, verb, and root POS tags for each tokenizer
# komoran_pos = ['/NN', '/XR', '/VA', '/VV']
# okt_pos = ['/Noun', '/Verb', '/Adjective']
# mecab_pos = ['']

In [25]:
def get_tokens(sent: str, noun=False, tokenizer="mecab") -> List[str]:
    """Tokenize one sentence with the selected tagger.

    Args:
        sent: A single sentence (one document) to tokenize.
        noun: If true, return only the tagger's ``nouns`` output; otherwise
            return its ``morphs`` output.
        tokenizer: Name passed to ``get_tokenizer`` to choose the tagger.

    Returns:
        The list of tokens produced by the tagger.
    """
    # Keep the name argument and the tagger instance in separate variables.
    tagger = get_tokenizer(tokenizer)

    if noun:
        return tagger.nouns(sent)

    return tagger.morphs(sent)

In [26]:
# get_tokens(sents[0])

## 02. Sentence

### 1) Vectorize Sents using CountVectorizer

In [27]:
from functools import partial

from sklearn.feature_extraction.text import CountVectorizer

In [77]:
# Tokens to exclude from the vocabulary.
stopwords = ["연합뉴스", "가방"]

# Count vectorizer that tokenizes through get_tokens (morphemes, "mecab").
# min_df=2 is CountVectorizer's minimum document-frequency cut-off.
vectorizer = CountVectorizer(
    stop_words=stopwords,
    tokenizer=partial(get_tokens, noun=False, tokenizer="mecab"),
    min_df=2,
)

# Fit the vocabulary on `sents` and build the sentence-by-term count matrix.
x = vectorizer.fit_transform(sents)

# vectorizer.get_feature_names()

x.toarray().shape

# term -> column index, and the inverse column index -> term lookup.
vocab_idx = vectorizer.vocabulary_
idx_vocab = {idx: vocab for vocab, idx in vocab_idx.items()}

In [84]:
def vectorize_sents(
    sents: List[str], stopwords=None, min_count=2, tokenizer="mecab", noun=False
):
    """Turn sentences into a term-count matrix plus vocabulary lookups.

    Args:
        sents: Sentences to vectorize, one string per sentence.
        stopwords: Passed to ``CountVectorizer`` as ``stop_words``.
        min_count: Passed to ``CountVectorizer`` as ``min_df``.
        tokenizer: Tagger name forwarded to ``get_tokens``.
        noun: Forwarded to ``get_tokens``; if true, only nouns are kept.

    Returns:
        A tuple ``(vec, vocab_idx, idx_vocab)``: the fitted count matrix, the
        term -> column index mapping, and its inverse.
    """

    vectorizer = CountVectorizer(
        stop_words=stopwords,
        # Forward the caller's choices; these were previously hard-coded to
        # noun=False / "mecab", which silently ignored both parameters.
        tokenizer=partial(get_tokens, noun=noun, tokenizer=tokenizer),
        min_df=min_count,
    )

    vec = vectorizer.fit_transform(sents)
    vocab_idx = vectorizer.vocabulary_
    idx_vocab = {idx: vocab for vocab, idx in vocab_idx.items()}
    return vec, vocab_idx, idx_vocab

In [85]:
# Tokens to exclude from the vocabulary when vectorizing.
stopwords = ["연합뉴스", "가방"]

In [90]:
# Rebuild the count matrix and both vocabulary lookups through the helper,
# leaving min_count, tokenizer and noun at their defaults.
x, vocab_idx, idx_vocab = vectorize_sents(sents, stopwords)

### 2) similarity matrix

In [291]:
# binary csr_matrix
# (1 where a term occurs in a sentence, 0 elsewhere)
numerators = (x > 0) * 1

# Inverse sentence length
min_length = 1
# Row sums of x, i.e. the total token count of each sentence.
denominators = np.asarray(x.sum(axis=1))
# Sentences with at most min_length tokens get a large stand-in length, so
# their log-length denominator is large and their similarities stay small.
denominators[np.where(denominators <= min_length)] = 10000
denominators = np.log(denominators)

# Expand the log lengths into two square grids (assuming `denominators` is an
# (n, 1) column): denom_log1[i, j] is the log length of sentence i and
# denom_log2[i, j] is the log length of sentence j.
denom_log1 = np.matmul(denominators, np.ones(denominators.shape).T)
denom_log2 = np.matmul(np.ones(denominators.shape), denominators.T)

# Shared-term count for every sentence pair.
# NOTE(review): SciPy documents np.dot as not sparse-aware; if `numerators`
# is sparse, `numerators @ numerators.T` is the supported spelling — confirm.
sim_mat = np.dot(numerators, numerators.T)

sim_mat = sim_mat / (denom_log1 + denom_log2)

# Zero out pairs whose similarity is at or below the threshold.
min_sim = 0.3
sim_mat[np.where(sim_mat <= min_sim)] = 0

#### TextRank Similarity

In [293]:
def similarity_matrix(x, min_sim=0.3, min_length=1):
    r"""Compute the pairwise TextRank sentence similarity.

    $$
    sim(s_1, s_2) =
    \frac{\vert \{ w_k \vert w_k \in S_1 \& w_k \in S_2 \} \vert}
    {log \vert S_1 \vert + log \vert S_2 \vert}
    $$

    Args:
        x: Sentence-by-term count matrix (one row per sentence), e.g. the
            sparse matrix returned by ``CountVectorizer.fit_transform``.
        min_sim: Similarities less than or equal to this value are set to 0.
        min_length: Sentences whose total token count is less than or equal
            to this value are treated as having length 10000, which makes
            their denominators large and their similarities small.

    Returns:
        A dense square matrix of pairwise similarities with one row and one
        column per sentence.
    """

    # Binary occurrence matrix: 1 where a term appears in a sentence.
    numerators = (x > 0) * 1

    # Per-sentence token counts as a float column vector. The copy made here
    # keeps the in-place replacement below away from the caller's data.
    lengths = np.array(x.sum(axis=1), dtype=float).reshape(-1, 1)
    lengths[lengths <= min_length] = 10000
    log_lengths = np.log(lengths)

    # Broadcasting the column against its transpose yields
    # log|S_i| + log|S_j| for every sentence pair (i, j).
    denominators = log_lengths + log_lengths.T

    # `@` is the matrix product for both sparse and dense inputs; np.dot is
    # not sparse-aware.
    sim_mat = (numerators @ numerators.T) / denominators
    sim_mat[np.where(sim_mat <= min_sim)] = 0

    return sim_mat

In [306]:
# TextRank similarity of every sentence pair, using the default thresholds.
mat = similarity_matrix(x)

In [305]:
# mat = csr_matrix(mat)

In [296]:
# mat

#### Cosine Similarity

In [298]:
from sklearn.metrics import pairwise_distances

In [309]:
def cosine_similarity_matrix(x, min_sim=0.3):
    """Return pairwise cosine similarities between the rows of ``x``.

    The similarity is one minus the cosine distance; entries less than or
    equal to ``min_sim`` are replaced by 0.
    """
    distances = pairwise_distances(x, metric="cosine")
    similarities = 1 - distances

    # Drop pairs at or below the threshold.
    too_weak = similarities <= min_sim
    similarities[too_weak] = 0

    return similarities

In [310]:
# Replace `mat` with the cosine-based similarity matrix (default threshold).
mat = cosine_similarity_matrix(x)