<a href="https://colab.research.google.com/github/2020-nlp-c/nlp-statisticsmodel/blob/master/jjc/TextSummarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Summarization (TextRank)

In [328]:
import numpy as np

class ts():
    """Score sentences with a TextRank-style iteration over a word-overlap graph.

    Sentences are the nodes. The link between two sentences is weighted by
    the Jaccard similarity of their whitespace-separated tokens
    (``len(intersection) / len(union)``). ``add_sents`` builds the graph and
    ``find_sol`` iterates the damped score update.

    Attributes:
        sents: The sentences passed to ``add_sents``.
        tokens: One token list per sentence (``str.split`` output).
        node_score: Mapping ``sentence index -> score``.
        link_matrix: Square array of current link weights between sentences.
        edge_matrix: Square array of link weights divided by the initial
            score of the row's sentence; computed once in ``add_sents``.
        max_iter: Number of update rounds performed by ``find_sol``.
        df_: Damping factor used in the score update.
    """

    # Class-level defaults, used whenever the constructor gets no override.
    max_iter = 100
    df_ = 0.85

    def __init__(self, **options):
        """Create an empty ranker.

        Args:
            **options: Optional overrides. ``max_iter`` and ``df_`` are
                recognised; any other key is ignored.
        """
        self.sents = []
        self.tokens = []
        self.node_score = {}
        # No sentences yet, so both matrices start out empty (0 x 0).
        self.link_matrix = np.zeros(shape=(0, 0))
        self.edge_matrix = np.zeros(shape=(0, 0))

        if 'max_iter' in options:
            self.max_iter = options['max_iter']
        if 'df_' in options:
            self.df_ = options['df_']

    def add_sents(self, sents):
        """Build the sentence graph for ``sents``.

        Sets ``sents``, ``tokens``, ``node_score``, ``link_matrix`` and
        ``edge_matrix``, replacing whatever a previous call stored.

        Args:
            sents: Iterable of sentence strings; each is tokenised with
                ``str.split()``.
        """
        self.sents = sents

        # Tokenise every sentence on whitespace.
        tokens = [sent.split() for sent in sents]
        self.tokens = tokens

        token_sets = [set(words) for words in tokens]
        count = len(tokens)

        # One link per unordered sentence pair:
        # weight = |intersection| / |union|.
        links = []
        for i in range(count - 1):
            for j in range(i + 1, count):
                union_size = len(token_sets[i] | token_sets[j])
                shared = len(token_sets[i] & token_sets[j])
                # Two empty sentences have an empty union; treat as no overlap.
                weight = shared / union_size if union_size else 0.0
                links.append((i, j, weight))

        # Initial node score = sum of the weights of all links touching it.
        # Every sentence gets an entry, so a lone sentence is still scored.
        node_score = {index: 0.0 for index in range(count)}
        for i, j, weight in links:
            node_score[i] += weight
            node_score[j] += weight
        self.node_score = node_score

        # Initial (symmetric) link weights.
        link_matrix = np.zeros(shape=(count, count))
        for i, j, weight in links:
            link_matrix[i, j] += weight
            link_matrix[j, i] += weight
        self.link_matrix = link_matrix

        # Edge weights, computed once and reused by every score update.
        edge_matrix = np.zeros(shape=(count, count))
        for i, j, weight in links:
            # A sentence sharing no word with any other has score 0; its
            # edges stay 0 instead of raising ZeroDivisionError.
            if node_score[i]:
                edge_matrix[i, j] = weight / node_score[i]
            if node_score[j]:
                edge_matrix[j, i] = weight / node_score[j]
        self.edge_matrix = edge_matrix

    def find_sol(self):
        """Run ``max_iter`` score updates and return the scores.

        Each round sets every score to
        ``(1 - df_) + df_ * (column sum of link_matrix)`` and then rescales
        each row of ``link_matrix`` to ``score * edge_matrix row``.

        Returns:
            The ``node_score`` dict (``sentence index -> score``); it is the
            same object stored on the instance.
        """
        count = len(self.node_score)
        for _ in range(self.max_iter):
            # 0. Refresh all node scores from the current link weights.
            scores = (1 - self.df_) + self.df_ * self.link_matrix.sum(axis=0)
            for index in range(count):
                self.node_score[index] = scores[index]
            # 1. Refresh the link matrix in place, row i scaled by score i.
            self.link_matrix[...] = scores[:, np.newaxis] * self.edge_matrix

        return self.node_score

In [None]:
# Small demo corpus: four sentences made of whitespace-separated words.
sents = [
    '딸기 바나나 사과 파인애플 수박',
    '바나나 사과 딸기 포도',
    '복숭아 수박',
    '파인애플 사과 딸기 바나나',
]

# Build the sentence graph and display the resulting scores.
ts1 = ts()
ts1.add_sents(sents)
ts1.find_sol()

---

In [10]:
import numpy as np
import math
import networkx as nx

def connect(nodes, tokens, window_size=2):
    """Collect co-occurrence edges between tokens that share a sliding window.

    A window of ``window_size`` consecutive tokens is slid over ``tokens``
    one position at a time. For every pair of positions ``i < j`` inside a
    window, the pair ``(window[i], window[j])`` becomes an edge when both
    tokens are members of ``nodes``. Each edge is also printed as it is
    found.

    Args:
        nodes: Container of tokens that are allowed to appear in an edge.
        tokens: Sequence of tokens to scan (must support slicing).
        window_size: Number of consecutive tokens per window. Defaults to 2,
            i.e. only directly adjacent tokens are linked. Values below 2
            produce no edges.

    Returns:
        List of ``(token, token)`` tuples in discovery order. Overlapping
        windows are not de-duplicated, so a pair can appear more than once.
    """
    edges = []
    # When tokens is shorter than the window the range is empty: no edges.
    for window_start in range(len(tokens) - window_size + 1):
        window = tokens[window_start:window_start + window_size]
        for i in range(window_size):
            for j in range(i + 1, window_size):
                if window[i] in nodes and window[j] in nodes:
                    pair = (window[i], window[j])
                    edges.append(pair)
                    print(pair)

    return edges

# Token stream to scan, and the vocabulary that is allowed to become nodes.
tokens = ['딸기', '바나나', '사과', '딸기', '파인애플']
nodes = ['바나나', '사과', '파인애플', '딸기']

# Start from an empty undirected graph directly, rather than building a
# diamond graph and clearing it straight away.
graph = nx.Graph()
# Duplicates are dropped before insertion; add_nodes_from takes any iterable.
graph.add_nodes_from(set(nodes))
graph.add_edges_from(connect(nodes, tokens))


('딸기', '바나나')
('바나나', '사과')
('사과', '딸기')
('딸기', '파인애플')


In [26]:
# Display the edge view of `graph`.
graph.edges

EdgeView([('바나나', '딸기'), ('바나나', '사과'), ('파인애플', '딸기'), ('사과', '딸기')])

In [27]:
# Display the node view of `graph`.
graph.nodes

NodeView(('바나나', '파인애플', '사과', '딸기'))

In [270]:
# Sample corpus: one whitespace-separated sentence per entry.
sents = [
    '딸기 바나나 사과 파인애플 수박',
    '바나나 사과 딸기 포도',
    '복숭아 수박',
    '파인애플 사과 딸기 바나나',
]

In [239]:
# Split every sentence into its whitespace-separated tokens.
tokens = [sentence.split() for sentence in sents]

# For each unordered sentence pair (first < second), record both indices
# together with the union of the two token sets.
unions = [
    [first, second, set(tokens[first]) | set(tokens[second])]
    for first in range(len(tokens) - 1)
    for second in range(first + 1, len(tokens))
]

In [240]:
# Display the [i, j, union-of-tokens] records.
unions

[[0, 1, {'딸기', '바나나', '사과', '수박', '파인애플', '포도'}],
 [0, 2, {'딸기', '바나나', '복숭아', '사과', '수박', '파인애플'}],
 [0, 3, {'딸기', '바나나', '사과', '수박', '파인애플'}],
 [1, 2, {'딸기', '바나나', '복숭아', '사과', '수박', '포도'}],
 [1, 3, {'딸기', '바나나', '사과', '파인애플', '포도'}],
 [2, 3, {'딸기', '바나나', '복숭아', '사과', '수박', '파인애플'}]]

In [241]:
# Set operators: & is intersection, | is union.
# Weight of a pair = number of shared tokens / number of tokens in the union.
links = [
    [first, second, len(set(tokens[first]) & set(tokens[second])) / len(combined)]
    for first, second, combined in unions
]

In [242]:
# Initial node weights: each node accumulates the weight of every link that
# touches it. dict.get supplies the starting value for a node seen for the
# first time, so no exception handling is needed and unrelated errors are
# no longer swallowed.
node_score = {}
for first, second, weight in links:
    node_score[first] = node_score.get(first, 0) + weight
    node_score[second] = node_score.get(second, 0) + weight

In [243]:
# Display the initial per-node scores.
node_score

{0: 1.4666666666666668, 1: 1.1, 2: 0.16666666666666666, 3: 1.4}

In [244]:
# Zero-filled square matrix with one row and one column per scored node.
link_matrix = np.zeros((len(node_score), len(node_score)))

In [245]:
# Display the [i, j, weight] link records.
links

[[0, 1, 0.5],
 [0, 2, 0.16666666666666666],
 [0, 3, 0.8],
 [1, 2, 0.0],
 [1, 3, 0.6],
 [2, 3, 0.0]]

In [246]:
# Initial matrix values: every link adds its weight in both directions.
for src, dst, weight in links:
    link_matrix[src, dst] += weight
    link_matrix[dst, src] += weight

In [248]:
# Display the link matrix after seeding it from `links`.
link_matrix

array([[0.        , 0.5       , 0.16666667, 0.8       ],
       [0.5       , 0.        , 0.        , 0.6       ],
       [0.16666667, 0.        , 0.        , 0.        ],
       [0.8       , 0.6       , 0.        , 0.        ]])

In [249]:
# Edge weights, computed only once (used for the matrix product and the
# node score update): each link weight divided by the score of the row's node.
edge_matrix = np.zeros((len(node_score), len(node_score)))
for src, dst, weight in links:
    edge_matrix[src, dst] = weight / node_score[src]
    edge_matrix[dst, src] = weight / node_score[dst]

In [250]:
# Display the edge weight matrix.
edge_matrix

array([[0.        , 0.34090909, 0.11363636, 0.54545455],
       [0.45454545, 0.        , 0.        , 0.54545455],
       [1.        , 0.        , 0.        , 0.        ],
       [0.57142857, 0.42857143, 0.        , 0.        ]])

In [256]:
# Start of the iteration: settings (max_iter is assigned here but not
# consumed by this cell) and the damping factor.
max_iter, df_ = 1, 0.85

# Step 0: refresh each node score from the column sum of the link matrix.
for node in range(len(node_score)):
    incoming = link_matrix[:, node].sum()
    node_score[node] = (1 - df_) + df_ * incoming

In [257]:
# Display the node scores after the update step.
node_score

{0: 1.4679783549783552,
 1: 1.042858766233766,
 2: 0.28490530303030304,
 3: 1.3005909090909094}

In [258]:
# Step 1: refresh the link matrix, scaling each edge-matrix row by the
# score of that row's node.
for row in range(len(node_score)):
    link_matrix[row, :] = edge_matrix[row, :] * node_score[row]

In [259]:
# Display the link matrix after the row rescaling.
link_matrix

array([[0.        , 0.50044717, 0.16681572, 0.80071547],
       [0.47402671, 0.        , 0.        , 0.56883205],
       [0.2849053 , 0.        , 0.        , 0.        ],
       [0.74319481, 0.5573961 , 0.        , 0.        ]])