<a href="https://colab.research.google.com/github/2020-nlp-c/nlp-statisticsmodel/blob/master/jisang/09_Document_Summarization_%EC%8B%A4%EC%8A%B5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Document Summarization**

## **1. KoNLPy 및 MeCab 설치**

In [None]:
# Install the system packages requested for KoNLPy: a C++ compiler and a JDK.
# NOTE(review): openjdk-7-jdk may not exist on current Ubuntu/Colab images — confirm.
!sudo apt-get install g++ openjdk-7-jdk # Install Java 1.7+
# Install the Python development headers and KoNLPy, once for each interpreter line.
!sudo apt-get install python-dev; pip install konlpy     # Python 2.x
!sudo apt-get install python3-dev; pip3 install konlpy   # Python 3.x
# curl is needed to fetch the script on the next line.
!sudo apt-get install curl
# Download mecab.sh from the konlpy repository and run it with bash
# (presumably this installs the MeCab backend used by konlpy.tag.Mecab — verify).
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

## **2. TextRank: 문서 요약 구현**

In [None]:
# Sample corpus: every entry is one document made of space-separated words.
docs = [
    "딸기 바나나 사과 파인애플 수박",
    "바나나 사과 딸기 포도",
    "복숭아 수박",
    "파인애플 사과 딸기 바나나",
]

In [None]:
from konlpy.tag import Mecab

import numpy as np
import math
import networkx as nx

mecab = Mecab()

# Tokenize: keep one list of mecab.pos() results per document, in corpus order.
tokens = [list(mecab.pos(document)) for document in docs]

tokens

In [None]:
# Initial setup: symmetric matrix of pairwise Jaccard similarities
# (|intersection| / |union| of the two documents' token sets); the diagonal stays 0.
doc_count = len(docs)
first_matrix = [[0] * doc_count for _ in range(doc_count)]
token_sets = [set(doc_tokens) for doc_tokens in tokens]

for row in range(doc_count - 1):
    for col in range(row + 1, doc_count):
        shared = token_sets[row] & token_sets[col]
        combined = token_sets[row] | token_sets[col]
        similarity = len(shared) / len(combined)
        first_matrix[row][col] = similarity
        first_matrix[col][row] = similarity

first_matrix

In [None]:
# Initial score: the total of each row of first_matrix.
first_score = []
for similarities in first_matrix:
    row_total = 0
    for similarity in similarities:
        row_total += similarity
    first_score.append(row_total)

first_score

In [None]:
# Weight matrix: every row of first_matrix divided by that row's initial score.
weight_matrix = [
    [similarity / first_score[row_index] for similarity in row]
    for row_index, row in enumerate(first_matrix)
]

weight_matrix

In [None]:
# Score update function
def cal_score(matrix, damping=0.85):
    """Return one damped column-sum score per column of *matrix*.

    For column ``i`` the score is
    ``(1 - damping) + damping * sum(matrix[j][i] for every row j)``.

    The result is sized from *matrix* itself, so the function no longer
    depends on a module-level ``docs`` list being defined and in sync.

    Args:
        matrix: Square list-of-lists of numbers; ``matrix[j][i]`` is the
            contribution flowing from row ``j`` into column ``i``.
        damping: Damping factor; defaults to 0.85.

    Returns:
        A list of floats with ``len(matrix)`` entries.
    """
    size = len(matrix)
    base = 1 - damping
    return [
        base + sum(damping * matrix[j][i] for j in range(size))
        for i in range(size)
    ]

In [None]:
# Matrix update function
def make_new(matrix):
    """Build the next iteration matrix from *matrix*.

    Entry ``[i][j]`` of the result is ``weight_matrix[i][j]`` multiplied by
    the score that ``cal_score(matrix)`` assigns to index ``i``.

    Reads the module-level ``weight_matrix`` and calls the module-level
    ``cal_score``.

    Args:
        matrix: The current iteration matrix (square list-of-lists).

    Returns:
        A new list-of-lists with the same shape as ``weight_matrix``.
    """
    # The scores depend only on `matrix`, so compute them once instead of
    # once per output cell.
    scores = cal_score(matrix)
    return [
        [weight * scores[i] for weight in row]
        for i, row in enumerate(weight_matrix)
    ]

In [None]:
# Check the iteration: apply make_new seven times in a row, keeping every
# intermediate matrix so they can be compared side by side.
history = [first_matrix]
for step in range(7):
    history.append(make_new(history[-1]))
matrix1, matrix2, matrix3, matrix4, matrix5, matrix6, matrix7 = history[1:]

tuple(history)

## **3. Class로 구현**

In [None]:
from konlpy.tag import Mecab

import numpy as np
import math
import networkx as nx

# Module-level Mecab tagger instance; summarization.make_token reads this global.
mecab = Mecab()

class summarization():
    """Score documents with a TextRank-style iteration over Jaccard similarities.

    Typical use is ``summarization(docs).run(num)``. The individual steps can
    also be called one at a time, in this order: ``make_token``,
    ``make_first``, ``First_Score``, ``make_weight``, then ``make_new``
    repeatedly.

    Every method works only on instance state (``self.docs``,
    ``self.weight_matrix``, ...); nothing is read from module-level
    ``docs``/``weight_matrix`` or delegated to module-level functions.
    The only global used is the ``mecab`` tagger in ``make_token``.

    Args:
        docs: Sequence of document strings.
        damping: Damping factor used by ``cal_score``; defaults to 0.85.
    """

    def __init__(self, docs, damping=0.85):
        self.tokens = []          # per-document token lists
        self.first_matrix = []    # pairwise Jaccard similarities
        self.first_score = []     # row sums of first_matrix
        self.weight_matrix = []   # first_matrix with each row divided by its sum
        self.docs = docs
        self.damping = damping
        self.score_sum = []       # scores from the latest cal_score() call

    # Tokenization
    def make_token(self):
        """Tokenize every document with the module-level ``mecab`` tagger.

        Returns:
            ``self.tokens``: one list of ``mecab.pos()`` results per document.
        """
        # Rebuild instead of appending so a second run() does not accumulate
        # duplicate entries.
        self.tokens = [list(mecab.pos(line)) for line in self.docs]
        return self.tokens

    # Initial similarity matrix
    def make_first(self):
        """Fill ``self.first_matrix`` with pairwise Jaccard similarities.

        Entry ``[i][j]`` is ``|tokens_i & tokens_j| / |tokens_i | tokens_j|``;
        the diagonal is left at 0.

        Returns:
            ``self.first_matrix``.
        """
        size = len(self.docs)
        token_sets = [set(doc_tokens) for doc_tokens in self.tokens[:size]]
        self.first_matrix = [[0] * size for _ in range(size)]
        for i in range(size - 1):
            for j in range(i + 1, size):
                union = token_sets[i] | token_sets[j]
                # Two documents without any token would give 0/0; treat them
                # as sharing nothing.
                if union:
                    similarity = len(token_sets[i] & token_sets[j]) / len(union)
                else:
                    similarity = 0.0
                self.first_matrix[i][j] = similarity
                self.first_matrix[j][i] = similarity
        return self.first_matrix

    # Initial score
    def First_Score(self):
        """Store and return the sum of each row of ``self.first_matrix``."""
        self.first_score = [sum(row) for row in self.first_matrix]
        return self.first_score

    # Weight matrix
    def make_weight(self):
        """Divide each row of ``self.first_matrix`` by its ``first_score``.

        A row whose score is 0 (a document that overlaps with no other) is
        given all-zero weights instead of raising ``ZeroDivisionError``.

        Returns:
            ``self.weight_matrix``.
        """
        self.weight_matrix = []
        for i, row in enumerate(self.first_matrix):
            total = self.first_score[i]
            self.weight_matrix.append(
                [value / total if total else 0.0 for value in row]
            )
        return self.weight_matrix

    # Score computation
    def cal_score(self, matrix):
        """Return one damped column-sum score per column of *matrix*.

        For column ``i`` the score is
        ``(1 - damping) + damping * sum(matrix[j][i] for every row j)``.
        The result is also stored in ``self.score_sum``.
        """
        size = len(matrix)
        base = 1 - self.damping
        self.score_sum = [
            base + sum(self.damping * matrix[j][i] for j in range(size))
            for i in range(size)
        ]
        return self.score_sum

    # Apply the scores to a new matrix
    def make_new(self, matrix):
        """Return the next iteration matrix.

        Entry ``[i][j]`` is ``self.weight_matrix[i][j]`` multiplied by the
        score of index ``i`` computed from *matrix*.
        """
        # The scores depend only on `matrix`: compute them once, not per cell.
        scores = self.cal_score(matrix)
        return [
            [weight * scores[i] for weight in row]
            for i, row in enumerate(self.weight_matrix)
        ]

    # Run the whole pipeline
    def run(self, num):
        """Run every preparation step, then print *num* successive matrices.

        Args:
            num: Number of ``make_new`` iterations to perform and print.
        """
        self.make_token()
        matrix = self.make_first()  # initial matrix
        self.First_Score()          # initial scores
        self.make_weight()
        for _ in range(num):
            matrix = self.make_new(matrix)
            print(np.array(matrix))
            print("=" * 50)

In [None]:
# Sample corpus: every entry is one document made of space-separated words.
docs = [
    "딸기 바나나 사과 파인애플 수박",
    "바나나 사과 딸기 포도",
    "복숭아 수박",
    "파인애플 사과 딸기 바나나",
]

In [None]:
# Build the summarizer for the sample corpus and print 20 iterations.
ds = summarization(docs=docs)
ds.run(num=20)