# TextRank - ver02 using sklearn

## 00. imports

In [1]:
# black formatting
%load_ext lab_black

In [2]:
import platform
from collections import Counter

import numpy as np

# tokenizer import
from konlpy.tag import Okt, Komoran, Hannanum, Kkma

# On Windows the Mecab class is taken from the eunjeon package; on every other
# platform it is taken from konlpy.
if platform.system() == "Windows":
    try:
        from eunjeon import Mecab
    except ImportError:
        # Only a missing package is reported here; Mecab stays undefined in
        # that case, so code that instantiates Mecab later will fail.
        print("please install eunjeon module")
else:  # non-Windows platforms (e.g. Ubuntu)
    from konlpy.tag import Mecab

from types_ import *

## 01. Common 

### 1) data load

In [3]:
# Read the corpus file and split it on newlines into a list of sentences.
with open("./sents.txt", "r") as f:
    sents = f.read().split("\n")

### 2) get tokenizer & get tokens

In [4]:
def get_tokenizer(tokenizer_name):
    """Instantiate the morphological analyzer selected by ``tokenizer_name``.

    Recognized names are "komoran", "okt", "mecab", "hannanum" and "kkma".
    Any other value falls back to a Mecab instance.
    """
    # Each branch looks up its class only when selected, so an analyzer that
    # is not installed does not affect requests for the other ones.
    if tokenizer_name == "komoran":
        return Komoran()
    if tokenizer_name == "okt":
        return Okt()
    if tokenizer_name == "hannanum":
        return Hannanum()
    if tokenizer_name == "kkma":
        return Kkma()
    # Covers both the explicit "mecab" request and unrecognized names.
    return Mecab()

In [5]:
# tokenizer = get_tokenizer("mecab")
# tokenizer.pos("아버지가방에들어가신다_")

In [6]:
# # POS tags for nouns, adjectives, verbs, and roots, per tokenizer
# komoran_pos = ['/NN', '/XR', '/VA', '/VV']
# okt_pos = ['/Noun', '/Verb', '/Adjective']
# mecab_pos = ['']

In [7]:
# Tokenizer instances keyed by the requested name, so that repeated calls
# (e.g. one call per sentence from a vectorizer) reuse a single instance
# instead of constructing a new analyzer every time.
_TOKENIZER_CACHE = {}


def get_tokens(sent: str, noun=False, tokenizer="mecab") -> List[str]:
    """Tokenize one sentence with the selected analyzer.

    Arguments
    ---------
    sent : str
        The sentence (a single string) to tokenize.
    noun : bool
        If true, return the result of the analyzer's ``nouns`` method;
        otherwise return the result of its ``morphs`` method.
    tokenizer : str
        Analyzer name, resolved through ``get_tokenizer``.

    Returns
    -------
    List[str]
        Tokens produced by the analyzer for ``sent``.
    """
    analyzer = _TOKENIZER_CACHE.get(tokenizer)
    if analyzer is None:
        analyzer = _TOKENIZER_CACHE[tokenizer] = get_tokenizer(tokenizer)

    if noun:
        return analyzer.nouns(sent)

    return analyzer.morphs(sent)

In [8]:
# get_tokens(sents[0])

## 02. Sentence

### 1) Vectorize Sents using CountVectorizer

In [9]:
from functools import partial

from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# Terms passed to the vectorizer as stop words.
stopwords = ["연합뉴스", "가방"]

# Count vectorizer that tokenizes with get_tokens (mecab, all morphemes);
# min_df=2 keeps only terms found in at least two documents.
vectorizer = CountVectorizer(
    stop_words=stopwords,
    tokenizer=partial(get_tokens, noun=False, tokenizer="mecab"),
    min_df=2,
)

# Sentence-by-term count matrix.
x = vectorizer.fit_transform(sents)

# vectorizer.get_feature_names()

x.toarray().shape

# Term -> column index mapping and its inverse.
vocab_idx = vectorizer.vocabulary_
idx_vocab = {idx: vocab for vocab, idx in vocab_idx.items()}

In [11]:
def vectorize_sents(
    sents: List[str], stopwords=None, min_count=2, tokenizer="mecab", noun=False
):
    """Build a term-count matrix for ``sents`` with ``CountVectorizer``.

    Arguments
    ---------
    sents : List[str]
        Sentences to vectorize, one string per sentence.
    stopwords : list or None
        Passed to ``CountVectorizer`` as ``stop_words``.
    min_count : int
        Passed to ``CountVectorizer`` as ``min_df``.
    tokenizer : str
        Analyzer name forwarded to ``get_tokens``.
    noun : bool
        Forwarded to ``get_tokens``; selects noun-only tokens when true.

    Returns
    -------
    vec
        The matrix returned by ``CountVectorizer.fit_transform``.
    vocab_idx : dict
        Term -> column index (the fitted ``vocabulary_``).
    idx_vocab : dict
        Column index -> term (inverse of ``vocab_idx``).
    """
    # Forward the caller's choices; they used to be replaced by the fixed
    # values noun=False / tokenizer="mecab".
    tokenize = partial(get_tokens, noun=noun, tokenizer=tokenizer)

    vectorizer = CountVectorizer(
        stop_words=stopwords,
        tokenizer=tokenize,
        min_df=min_count,
    )

    vec = vectorizer.fit_transform(sents)
    vocab_idx = vectorizer.vocabulary_
    idx_vocab = {idx: vocab for vocab, idx in vocab_idx.items()}
    return vec, vocab_idx, idx_vocab

In [12]:
# Stop words handed to vectorize_sents in the next cell.
stopwords = ["연합뉴스", "가방"]

In [13]:
# Count matrix plus the term<->index mappings, using the function's defaults
# for everything except the stop words.
x, vocab_idx, idx_vocab = vectorize_sents(sents, stopwords)

### 2) similarity matrix

In [14]:
# binary csr_matrix
numerators = (x > 0) * 1

# Inverse sentence length
min_length = 1
denominators = np.asarray(x.sum(axis=1))
denominators[np.where(denominators <= min_length)] = 10000
denominators = np.log(denominators)

denom_log1 = np.matmul(denominators, np.ones(denominators.shape).T)
denom_log2 = np.matmul(np.ones(denominators.shape), denominators.T)

sim_mat = np.dot(numerators, numerators.T)

sim_mat = sim_mat / (denom_log1 + denom_log2)

min_sim = 0.3
sim_mat[np.where(sim_mat <= min_sim)] = 0

#### TextRank Similarity

In [15]:
def similarity_matrix(x, min_sim=0.3, min_length=1):
    r"""Compute the pairwise TextRank sentence similarity.

    $$
    sim(s_1, s_2) =
    \frac{\vert \{ w_k \vert w_k \in S_1 \& w_k \in S_2 \} \vert}
    {\log \vert S_1 \vert + \log \vert S_2 \vert}
    $$

    Arguments
    ---------
    x : 2-D count matrix (dense array or scipy sparse matrix)
        One row per sentence, one column per term.
    min_sim : float
        Similarities less than or equal to this value are set to 0.
    min_length : int
        Rows whose count total is at most this value are given a large
        placeholder length (10000), which makes their similarities small.

    Returns
    -------
    sim_mat : np.ndarray
        Dense square matrix with one row and one column per sentence.
    """
    # Binary occurrence matrix; its product with its transpose counts the
    # terms shared by every pair of sentences.
    occurrence = (x > 0) * 1
    overlap = occurrence @ occurrence.T
    if hasattr(overlap, "toarray"):  # sparse result -> dense
        overlap = overlap.toarray()
    overlap = np.asarray(overlap, dtype=float)

    # Sentence lengths as a column so that the pairwise sum below broadcasts
    # to a square matrix for dense and sparse inputs alike.
    lengths = np.asarray(x.sum(axis=1), dtype=float).reshape(-1, 1)
    lengths[lengths <= min_length] = 10000
    log_lengths = np.log(lengths)
    denominators = log_lengths + log_lengths.T

    # A zero denominator (two length-1 sentences when min_length < 1) yields
    # similarity 0 instead of a division by zero.
    sim_mat = np.zeros_like(overlap)
    np.divide(overlap, denominators, out=sim_mat, where=denominators > 0)
    sim_mat[sim_mat <= min_sim] = 0

    return sim_mat

In [16]:
# TextRank similarity of the count matrix with the default thresholds.
mat = similarity_matrix(x)

In [305]:
# mat = csr_matrix(mat)

In [296]:
# mat

#### Cosine Similarity

In [20]:
from sklearn.metrics import pairwise_distances

In [21]:
def cosine_similarity_matrix(x, min_sim=0.3):
    """Return pairwise cosine similarities of the rows of ``x``.

    The similarity is computed as one minus the cosine distance; entries
    less than or equal to ``min_sim`` are replaced by 0.
    """
    cosine_distance = pairwise_distances(x, metric="cosine")
    sim_mat = 1 - cosine_distance

    too_small = sim_mat <= min_sim
    sim_mat[too_small] = 0

    return sim_mat

In [22]:
# Cosine similarity of the count matrix; replaces the previous value of mat.
mat = cosine_similarity_matrix(x)

### 3) Sentence Graph

In [18]:
# Read the corpus file again and split it on newlines into sentences.
with open("./sents.txt", "r") as f:
    sents = f.read().split("\n")

In [19]:
# Settings for the vectorization step of the sentence graph.
stopwords = ["연합뉴스", "가방"]
min_count = 2
tokenizer = "mecab"

# Count matrix and term<->index mappings for the sentences.
vecs, vocab_idx, idx_vocab = vectorize_sents(
    sents, stopwords, min_count=min_count, tokenizer=tokenizer
)

In [23]:
# Choose the similarity measure; anything other than "cosine" selects the
# TextRank similarity.
similarity = "cosine"
min_sim = 0.3

# NOTE: vecs is overwritten with the similarity matrix, so running this cell
# a second time would apply the similarity to the previous result.
if similarity == "cosine":
    vecs = cosine_similarity_matrix(vecs, min_sim=min_sim)
else:
    vecs = similarity_matrix(vecs, min_sim=min_sim)

In [49]:
def sent_graph(
    sents: List[str],
    min_count=2,
    min_sim=0.3,
    tokenizer="mecab",
    noun=False,
    similarity=None,
    stopwords: List[str] = ["뉴스", "그리고"],
):
    """Build the sentence similarity matrix used as the graph for ranking.

    Arguments
    ---------
    sents : List[str]
        Sentences, one string per sentence.
    min_count : int
        Forwarded to ``vectorize_sents`` as ``min_count``.
    min_sim : float
        Similarity threshold forwarded to the similarity function.
    tokenizer : str
        Analyzer name forwarded to ``vectorize_sents``.
    noun : bool
        Forwarded to ``vectorize_sents``.
    similarity : str or None
        "cosine" selects ``cosine_similarity_matrix``; any other value,
        including the default ``None``, selects ``similarity_matrix``.
    stopwords : List[str] or None
        Stop words forwarded to ``vectorize_sents``.

    Returns
    -------
    mat
        Pairwise sentence similarity matrix.
    vocab_idx : dict
        Term -> column index.
    idx_vocab : dict
        Column index -> term.
    """
    # The default list object is shared between calls, so hand a copy to the
    # vectorizer instead of the list itself.
    if stopwords is not None:
        stopwords = list(stopwords)

    mat, vocab_idx, idx_vocab = vectorize_sents(
        sents, stopwords, min_count=min_count, tokenizer=tokenizer, noun=noun
    )

    if similarity == "cosine":
        mat = cosine_similarity_matrix(mat, min_sim=min_sim)
    else:
        mat = similarity_matrix(mat, min_sim=min_sim)

    return mat, vocab_idx, idx_vocab

In [41]:
# Stop words for this run.
stopwords = ["연합뉴스", "가방"]

# Sentence graph built with cosine similarity and the remaining defaults.
mat, vocab_idx, idx_vocab = sent_graph(sents, stopwords=stopwords, similarity="cosine")

In [34]:
# mat

## 03. PageRank

In [42]:
import numpy as np
from sklearn.preprocessing import normalize

In [48]:
# from sknetwork.ranking import PageRank

# pr = PageRank()

# scores = pr.fit_transform(mat)

In [46]:
scores

array([0.06486137, 0.03777102, 0.06637652, 0.06136609, 0.04007189,
       0.055716  , 0.04917619, 0.05037918, 0.06195448, 0.05159843,
       0.04580437, 0.05314427, 0.04182777, 0.03399724, 0.04825147,
       0.04439981, 0.05794319, 0.03137895, 0.06365065, 0.04033112])

0.9999999999999998

In [109]:
# Parameters for the step-by-step PageRank cells below.
df = 0.85  # damping factor
max_iter = 50  # number of update steps for the iterative method
method = "iterative"

# The damping factor must lie strictly between 0 and 1.
assert 0 < df < 1

In [110]:
# Similarity matrix L1-normalized along axis 0.
A = normalize(mat, axis=0, norm="l1")
# Initial rank vector: all ones, one entry per row of A.
R = np.ones(A.shape[0])
# Uniform vector with entries 1/n, used in the damping term.
N = np.ones(R.shape[0]) / R.shape[0]

In [111]:
# iteration
for _ in range(max_iter):
    R = df * np.matmul(A, R) + (1 - df) * N

In [129]:
# R

In [125]:
# Switch to the closed-form (algebraic) computation.
method = "algebraic"

# Normalized matrix, identity of the same size, and the uniform 1/n vector.
A = normalize(mat, axis=0, norm="l1")
I = np.eye(A.shape[0])
N = np.ones(A.shape[0]) / A.shape[0]

In [126]:
# Closed form: R = (I - df * A)^-1 (1 - df) N
R = np.linalg.inv((I - df * A))
R = np.matmul(R, (1 - df) * N)

In [130]:
# R

In [138]:
def pagerank(x: np.ndarray, df=0.85, max_iter=50, method="iterative"):
    """
    PageRank function
    ==================

    Arguments
    ---------
    x : np.ndarray
        Square weight (similarity) matrix of the graph.
    df : float
        Damping Factor, 0 < df < 1
    max_iter : int
        Number of update steps for the iterative method
    method : str
        "iterative" (default) repeats the update
        R <- df * A R + (1 - df) * N for ``max_iter`` steps;
        "algebraic" solves (I - df * A) R = (1 - df) * N directly.

    Returns
    -------
    R : np.ndarray
        PageRank vector (score), one entry per row of ``x``

    Raises
    ------
    ValueError
        If ``df`` is not strictly between 0 and 1, ``method`` is unknown,
        or ``x`` is not a square 2-D matrix.
    """
    if not 0 < df < 1:
        raise ValueError(f"df must satisfy 0 < df < 1, got {df!r}")
    if method not in ("iterative", "algebraic"):
        raise ValueError(f"unknown method: {method!r}")

    # Work on the matrix that was passed in (not on a notebook global).
    if hasattr(x, "toarray"):  # scipy sparse input -> dense
        x = x.toarray()
    weights = np.asarray(x, dtype=float)
    if weights.ndim != 2 or weights.shape[0] != weights.shape[1]:
        raise ValueError("x must be a square 2-D matrix")

    n = weights.shape[0]
    if n == 0:
        return np.zeros(0)

    # Column-wise L1 normalization; all-zero columns are left unchanged.
    col_norms = np.abs(weights).sum(axis=0)
    col_norms[col_norms == 0] = 1.0
    A = weights / col_norms
    N = np.ones(n) / n

    if method == "iterative":
        R = np.ones(n)
        # iteration
        for _ in range(max_iter):
            R = df * np.matmul(A, R) + (1 - df) * N
    else:
        # Solve the linear system instead of forming an explicit inverse.
        R = np.linalg.solve(np.eye(n) - df * A, (1 - df) * N)

    return R

In [141]:
# Rank scores for the sentence graph using the iterative method.
R = pagerank(mat, method="iterative")

In [151]:
# Indices of the topk highest scores; argsort is ascending, so the last
# topk entries are taken (lowest of them first).
topk = 3
idxs = R.argsort()[-topk:]

In [157]:
# (index, score, sentence) triples for the selected sentences, listed in
# ascending index order.
keysents = [(idx, R[idx], sents[idx]) for idx in sorted(idxs)]

In [158]:
keysents

[(0,
  0.06523941197137159,
  '오패산터널 총격전 용의자 검거 서울 연합뉴스 경찰 관계자들이 19일 오후 서울 강북구 오패산 터널 인근에서 사제 총기를 발사해 경찰을 살해한 용의자 성모씨를 검거하고 있다 성씨는 검거 당시 서바이벌 게임에서 쓰는 방탄조끼에 헬멧까지 착용한 상태였다'),
 (2,
  0.06676676734182561,
  '경찰에 따르면 성씨는 19일 오후 강북경찰서 인근 부동산 업소 밖에서 부동산업자 이모 67 씨가 나오기를 기다렸다 이씨와는 평소에도 말다툼을 자주 한 것으로 알려졌다'),
 (18,
  0.06401999523713223,
  '경찰은 인근을 수색해 성씨가 만든 사제총 16정과 칼 7개를 압수했다 실제 폭발할지는 알 수 없는 요구르트병에 무언가를 채워두고 심지를 꽂은 사제 폭탄도 발견됐다')]

In [156]:
idxs

array([18,  0,  2])