# TextRank - ver02 using sklearn

## 00. imports

In [1]:
# black formatting
%load_ext lab_black

In [2]:
import platform
from collections import Counter

import numpy as np

# tokenizer import
from konlpy.tag import Okt, Komoran, Hannanum, Kkma

# Pick the Mecab binding according to the operating system.
if platform.system() == "Windows":
    # On Windows, Mecab is taken from the eunjeon package.
    try:
        from eunjeon import Mecab
    except ImportError:
        # Only a failed import is reported here; any other error should surface.
        print("please install eunjeon module")
else:  # non-Windows platforms (e.g. Ubuntu)
    from konlpy.tag import Mecab

from types_ import *

## 01. Common 

### 1) data load

In [3]:
# Read the whole corpus file and split it on newlines, one entry per line.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm it matches how sents.txt was saved.
with open("./sents.txt", "r") as f:
    sents = f.read().split("\n")

### 2) get tokenizer & get tokens

In [187]:
def get_tokenizer(tokenizer_name):
    """
    Build the morphological analyzer selected by name.

    Arguments
    ---------
    tokenizer_name : str
        One of "komoran", "okt", "mecab", "hannanum" or "kkma".
        Any other value falls back to Mecab.

    Returns
    -------
    tokenizer
        A newly constructed tokenizer instance.
    """
    # Constructors are wrapped in lambdas so a class is only looked up and
    # instantiated when its name is actually requested.
    factories = (
        ("komoran", lambda: Komoran()),
        ("okt", lambda: Okt()),
        ("mecab", lambda: Mecab()),
        ("hannanum", lambda: Hannanum()),
        ("kkma", lambda: Kkma()),
    )
    for name, build in factories:
        if tokenizer_name == name:
            return build()

    # Unrecognised names default to Mecab.
    return Mecab()

In [188]:
# tokenizer = get_tokenizer("mecab")
# tokenizer.pos("아버지가방에들어가신다_")

In [189]:
# # POS tags for nouns, adjectives, verbs and roots, per tokenizer
# komoran_pos = ['/NN', '/XR', '/VA', '/VV']
# okt_pos = ['/Noun', '/Verb', '/Adjective']
# mecab_pos = ['']

In [190]:
def get_tokens(sent: str, noun=False, tokenizer="mecab") -> List[str]:
    """
    Tokenize a single sentence.

    Arguments
    ---------
    sent : str
        The sentence to tokenize.
    noun : bool
        If True, return only nouns longer than one character;
        otherwise return every morpheme.
    tokenizer : str
        Tokenizer name understood by ``get_tokenizer``.

    Returns
    -------
    tokens : List[str]
        The extracted tokens.
    """
    # Use a separate local name so the `tokenizer` argument is not rebound.
    analyzer = get_tokenizer(tokenizer)

    if noun:
        # Single-character nouns are discarded.
        return [word for word in analyzer.nouns(sent) if len(word) > 1]

    return analyzer.morphs(sent)

In [183]:
# get_tokens(sents[0], noun=True)

## 02. Sentence

### 1) Vectorize Sents using CountVectorizer

In [9]:
from functools import partial

from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# Words handed to CountVectorizer as stop words.
stopwords = ["연합뉴스", "가방"]

# Count vectorizer that tokenizes with get_tokens (all morphemes, Mecab)
# and uses a minimum document frequency of 2.
vectorizer = CountVectorizer(
    stop_words=stopwords,
    tokenizer=partial(get_tokens, noun=False, tokenizer="mecab"),
    min_df=2,
)

# Fit the vocabulary on the sentences and build the count matrix.
x = vectorizer.fit_transform(sents)

# vectorizer.get_feature_names()

# Display the shape of the dense count matrix.
x.toarray().shape

# Term -> column index mapping, and its inverse (column index -> term).
vocab_idx = vectorizer.vocabulary_
idx_vocab = {idx: vocab for vocab, idx in vocab_idx.items()}

In [191]:
def vectorize_sents(
    sents: List[str], stopwords=None, min_count=2, tokenizer="mecab", noun=False
):
    """
    Turn sentences into a term-count matrix with CountVectorizer.

    Arguments
    ---------
    sents : List[str]
        Sentences to vectorize.
    stopwords : list or None
        Passed to CountVectorizer as ``stop_words``.
    min_count : int
        Passed to CountVectorizer as ``min_df``.
    tokenizer : str
        Tokenizer name forwarded to ``get_tokens``.
    noun : bool
        Forwarded to ``get_tokens``; True keeps nouns only.

    Returns
    -------
    vec
        Count matrix returned by ``fit_transform``.
    vocab_idx : dict
        Term -> column index.
    idx_vocab : dict
        Column index -> term.
    """
    vectorizer = CountVectorizer(
        stop_words=stopwords,
        # Forward the caller's tokenizer choice instead of a fixed "mecab".
        tokenizer=partial(get_tokens, noun=noun, tokenizer=tokenizer),
        min_df=min_count,
    )

    vec = vectorizer.fit_transform(sents)
    vocab_idx = vectorizer.vocabulary_
    idx_vocab = {idx: vocab for vocab, idx in vocab_idx.items()}
    return vec, vocab_idx, idx_vocab

In [12]:
# Stop words used for the vectorize_sents call in the next cell.
stopwords = ["연합뉴스", "가방"]

In [13]:
# Rebuild the count matrix and vocabulary mappings through the helper
# (defaults: min_count=2, tokenizer="mecab", noun=False).
x, vocab_idx, idx_vocab = vectorize_sents(sents, stopwords)

### 2) similarity matrix

In [14]:
# binary csr_matrix
numerators = (x > 0) * 1

# Inverse sentence length
min_length = 1
denominators = np.asarray(x.sum(axis=1))
denominators[np.where(denominators <= min_length)] = 10000
denominators = np.log(denominators)

denom_log1 = np.matmul(denominators, np.ones(denominators.shape).T)
denom_log2 = np.matmul(np.ones(denominators.shape), denominators.T)

sim_mat = np.dot(numerators, numerators.T)

sim_mat = sim_mat / (denom_log1 + denom_log2)

min_sim = 0.3
sim_mat[np.where(sim_mat <= min_sim)] = 0

#### TextRank Similarity

In [15]:
def similarity_matrix(x, min_sim=0.3, min_length=1):
    r"""
    TextRank sentence similarity.

    $$
    sim(s_1, s_2) =
    \frac{\vert \{ w_k \vert w_k \in S_1 \& w_k \in S_2 \} \vert}
    {log \vert S_1 \vert + log \vert S_2 \vert}
    $$

    Arguments
    ---------
    x : matrix
        Sentence-by-term count matrix (rows are sentences).
    min_sim : float
        Similarities less than or equal to this value are set to 0.
    min_length : int
        Sentences whose total token count is less than or equal to this
        value are treated as having 10000 tokens.

    Returns
    -------
    sim_mat : matrix
        Pairwise similarity matrix with one row and column per sentence.
    """

    # Binary occurrence matrix: 1 where a term appears in a sentence.
    numerators = (x > 0) * 1

    # Token count per sentence, forced into a column so the broadcast below
    # works whether x.sum returns a 1-D or an (n, 1) result.
    lengths = np.asarray(x.sum(axis=1)).reshape(-1, 1)
    # The min_length argument is honoured here (it is no longer reset to 1).
    # Inflating short sentences makes their similarities tiny and avoids a
    # log(1) + log(1) = 0 denominator.
    lengths[lengths <= min_length] = 10000
    log_lengths = np.log(lengths)
    # Entry (i, j) is log|S_i| + log|S_j|.
    denominators = log_lengths + log_lengths.T

    # Number of distinct terms shared by each pair of sentences.
    sim_mat = numerators @ numerators.T
    sim_mat = sim_mat / denominators
    sim_mat[np.where(sim_mat <= min_sim)] = 0

    return sim_mat

In [16]:
# TextRank sentence similarity of the count matrix, default thresholds.
mat = similarity_matrix(x)

In [305]:
# mat = csr_matrix(mat)

In [296]:
# mat

#### Cosine Similarity

In [20]:
from sklearn.metrics import pairwise_distances

In [21]:
def cosine_similarity_matrix(x, min_sim=0.3):
    """
    Pairwise cosine similarity between the rows of ``x``.

    Arguments
    ---------
    x : matrix
        One row per item to compare.
    min_sim : float
        Similarities less than or equal to this value are set to 0.

    Returns
    -------
    similarities : np.ndarray
        Thresholded row-by-row similarity matrix.
    """
    distances = pairwise_distances(x, metric="cosine")
    similarities = 1 - distances

    # Drop weak links.
    too_weak = similarities <= min_sim
    similarities[too_weak] = 0

    return similarities

In [22]:
# Cosine sentence similarity of the count matrix; overwrites `mat`.
mat = cosine_similarity_matrix(x)

### 3) Sentence Graph

In [18]:
# Reload the corpus, one entry per line.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm it matches how sents.txt was saved.
with open("./sents.txt", "r") as f:
    sents = f.read().split("\n")

In [19]:
# Settings for the sentence-graph walkthrough.
stopwords = ["연합뉴스", "가방"]
min_count = 2
tokenizer = "mecab"

# Count matrix plus the term <-> index mappings for the sentences.
vecs, vocab_idx, idx_vocab = vectorize_sents(
    sents, stopwords, min_count=min_count, tokenizer=tokenizer
)

In [23]:
# Choose the similarity measure; any value other than "cosine" selects the
# TextRank similarity.
similarity = "cosine"
min_sim = 0.3

# `vecs` is overwritten: it goes from a count matrix to a similarity matrix.
if similarity == "cosine":
    vecs = cosine_similarity_matrix(vecs, min_sim=min_sim)
else:
    vecs = similarity_matrix(vecs, min_sim=min_sim)

In [49]:
def sent_graph(
    sents: List[str],
    min_count=2,
    min_sim=0.3,
    tokenizer="mecab",
    noun=False,
    similarity=None,
    stopwords: List[str] = ["뉴스", "그리고"],
):
    """
    Build the sentence similarity graph.

    Arguments
    ---------
    sents : List[str]
        Sentences to compare.
    min_count : int
        Forwarded to ``vectorize_sents``.
    min_sim : float
        Similarities less than or equal to this value are set to 0.
    tokenizer : str
        Tokenizer name forwarded to ``vectorize_sents``.
    noun : bool
        Forwarded to ``vectorize_sents``; True keeps nouns only.
    similarity : str or None
        "cosine" selects cosine similarity; any other value selects the
        TextRank similarity.
    stopwords : List[str]
        Forwarded to ``vectorize_sents``. The default list is only read
        here, never modified.

    Returns
    -------
    mat
        Sentence-by-sentence similarity matrix.
    vocab_idx : dict
        Term -> column index.
    idx_vocab : dict
        Column index -> term.
    """

    # `noun` is now forwarded; it used to be accepted but silently ignored.
    mat, vocab_idx, idx_vocab = vectorize_sents(
        sents, stopwords, min_count=min_count, tokenizer=tokenizer, noun=noun
    )

    if similarity == "cosine":
        mat = cosine_similarity_matrix(mat, min_sim=min_sim)
    else:
        mat = similarity_matrix(mat, min_sim=min_sim)

    return mat, vocab_idx, idx_vocab

In [41]:
# Build the sentence graph with cosine similarity and custom stop words.
stopwords = ["연합뉴스", "가방"]

mat, vocab_idx, idx_vocab = sent_graph(sents, stopwords=stopwords, similarity="cosine")

In [34]:
# mat

## 03. Word

### 1) Similarity matrix

In [164]:
def word_similarity_matrix(x, min_sim=0.3):
    """
    Pairwise cosine similarity between the columns of ``x``.

    Arguments
    ---------
    x : matrix
        Matrix whose columns are compared (it is transposed before the
        distances are computed).
    min_sim : float
        Similarities less than or equal to this value are set to 0.

    Returns
    -------
    similarities : np.ndarray
        Thresholded column-by-column similarity matrix.
    """
    distances = pairwise_distances(x.T, metric="cosine")
    similarities = 1 - distances

    # Drop weak links.
    too_weak = similarities <= min_sim
    similarities[too_weak] = 0

    return similarities

In [166]:
# Term-by-term cosine similarity computed from the columns of the count matrix.
mat = word_similarity_matrix(x)

### 2) Word Graph

In [223]:
def word_graph(
    sents: List[str],
    min_count=2,
    min_sim=0.3,
    tokenizer="mecab",
    noun=True,
    stopwords: List[str] = ["연합뉴스", "그리고", "기자"],
):
    """
    Build the word similarity graph.

    Arguments
    ---------
    sents : List[str]
        Sentences the words are taken from.
    min_count : int
        Forwarded to ``vectorize_sents``.
    min_sim : float
        Similarities less than or equal to this value are set to 0.
    tokenizer : str
        Tokenizer name forwarded to ``vectorize_sents``.
    noun : bool
        Forwarded to ``vectorize_sents``; True keeps nouns only.
    stopwords : List[str]
        Forwarded to ``vectorize_sents``.

    Returns
    -------
    graph
        Word-by-word similarity matrix.
    vocab_idx : dict
        Term -> column index.
    idx_vocab : dict
        Column index -> term.
    """
    # Sentence-by-term counts and the vocabulary lookups.
    counts, vocab_idx, idx_vocab = vectorize_sents(
        sents,
        stopwords,
        min_count=min_count,
        tokenizer=tokenizer,
        noun=noun,
    )

    # Compare the term columns with each other.
    graph = word_similarity_matrix(counts, min_sim=min_sim)

    return graph, vocab_idx, idx_vocab

In [224]:
# Word graph with the default settings (nouns only, built-in stop words).
mat, vocab_idx, idx_vocab = word_graph(sents)

In [225]:
# mat

In [226]:
# mat

## 04. PageRank

In [227]:
import numpy as np
from sklearn.preprocessing import normalize

In [228]:
# from sknetwork.ranking import PageRank
# pr = PageRank()
# scores = pr.fit_transform(mat)

In [229]:
# PageRank settings: damping factor, iteration count and solver name.
df = 0.85
max_iter = 50
method = "iterative"

# The damping factor must lie strictly between 0 and 1.
assert 0 < df < 1

In [230]:
# Column-wise L1 normalisation of the similarity graph.
A = normalize(mat, axis=0, norm="l1")
# Initial score vector: all ones, one entry per row of A.
R = np.ones(A.shape[0])
# Uniform vector with every entry equal to 1 / n.
N = np.ones(R.shape[0]) / R.shape[0]

In [231]:
# iteration
for _ in range(max_iter):
    R = df * np.matmul(A, R) + (1 - df) * N

In [232]:
# R

In [233]:
# Switch to the closed-form solver.
method = "algebraic"

# Column-normalised graph, identity matrix and uniform vector of matching size.
A = normalize(mat, axis=0, norm="l1")
I = np.eye(A.shape[0])
N = np.ones(A.shape[0]) / A.shape[0]

In [234]:
# Closed form: R = (I - df * A)^-1 @ ((1 - df) * N).
R = np.linalg.inv((I - df * A))
R = np.matmul(R, (1 - df) * N)

In [235]:
# R

In [236]:
def pagerank(x: np.ndarray, df=0.85, max_iter=50, method="iterative"):
    """
    PageRank function
    ==================

    Arguments
    ---------
    x : np.ndarray
        Square graph (similarity) matrix. Objects exposing ``toarray()``
        are converted to a dense array first.
    df : float
        Damping Factor, 0 < df < 1
    max_iter : int
        Maximum number of iteration for Power method
    method : str
        "iterative" (default) or "algebraic"

    Returns
    -------
    R : np.ndarray
        PageRank vector (score)

    Raises
    ------
    AssertionError
        If ``df`` is not strictly between 0 and 1.
    ValueError
        If ``method`` is neither "iterative" nor "algebraic".
    """

    assert 0 < df < 1

    # Fail with a clear message instead of an unbound-variable error later.
    if method not in ("iterative", "algebraic"):
        raise ValueError(
            f"unknown method {method!r}; expected 'iterative' or 'algebraic'"
        )

    # Work on the argument itself, never on a notebook-level global.
    if hasattr(x, "toarray"):
        x = x.toarray()
    weights = np.asarray(x, dtype=float)

    # Column-wise L1 normalisation; all-zero columns are left unchanged
    # by dividing them by 1 instead of 0.
    col_norms = np.abs(weights).sum(axis=0)
    col_norms[col_norms == 0] = 1.0
    A = weights / col_norms

    n = A.shape[0]
    N = np.ones(n) / n

    if method == "iterative":
        R = np.ones(n)
        # iteration
        for _ in range(max_iter):
            R = df * np.matmul(A, R) + (1 - df) * N
    else:
        # Solve (I - df * A) R = (1 - df) * N directly; the identity matrix
        # is built locally so no global is needed.
        R = np.linalg.solve(np.eye(n) - df * A, (1 - df) * N)

    return R

In [237]:
# Score the current graph with the iterative solver.
R = pagerank(mat, method="iterative")

## 05. Sentence & Word Extraction

In [238]:
# Indices of the topk highest scores; argsort is ascending, so the best
# score is the last element of idxs.
topk = 10
idxs = R.argsort()[-topk:]

# keysents = [(idx, R[idx], sents[idx]) for idx in sorted(idxs)]
# keysents

In [241]:
# (index, score, word) triples, highest score first.
keywords = [(idx, R[idx], idx_vocab[idx]) for idx in reversed(idxs)]

In [242]:
# Display the extracted keywords.
keywords

[(19, 0.038754667200304176, '서울'),
 (27, 0.03861352663379311, '용의자'),
 (9, 0.03661624569236274, '발사'),
 (16, 0.03546070933569919, '사제'),
 (2, 0.03517313567434101, '강북구'),
 (5, 0.03478439367464611, '경찰관'),
 (32, 0.03478439367464611, '조사'),
 (33, 0.03410792629851021, '총기'),
 (38, 0.0335252351049594, '폭행'),
 (39, 0.033476384490609516, '현장')]