In [1]:
import json
import pandas as pd
import re

from collections import Counter
# from collections import defaultdict
from scipy.sparse import csr_matrix
import numpy as np
from sklearn.preprocessing import normalize
import math

import time

In [2]:
input_file_name = './data/extractive_test_v2.jsonl'

In [3]:
def preprocess_sentence(article):
    article = re.sub('”', ' ', article)
#     article = re.sub('  ', ' ', article)
#     article = re.sub('[-=+,#/\?:^$.@*※~&%ㆍ!』\\‘|\[\]\<\>`\'…》]', '', article)
    bracket = re.findall(r'\([^)]*\)', article )
    for i in bracket:
        word = i.strip('()')
        if word.isupper():
            end_index = article.find(i)
            word_len = article[end_index:0:-1].find(' ')
            start_index = end_index - word_len +1
            origin = article[start_index : end_index]
            article = article[:end_index+len(i)] + article[end_index+len(i):].replace(word, origin)
        else:
            if '이하' in word:
                word = word[3:]
                n_space = word.count(' ')
                end_index = article.find(word)-4
                range_candidate = article[end_index-30:end_index].split(' ')[::-1]
                origin = ' '.join(range_candidate[:n_space+1][::-1])
                article = article[:end_index+len(i)] + article[end_index+len(i):].replace(word, origin)
        # 괄호는 다 제거
        article = article.replace(i,'')
    article = ''.join(re.findall('[ 가-힣a-zA-Z0-9]',  article) )
    
    return article.strip(' ')

In [4]:
# 토크나이저
from konlpy.tag import Komoran

komoran = Komoran()
def komoran_tokenize(sent):
    words = komoran.pos(sent, join=True)
    words = [w for w in words if ('/NN' in w or '/XR' in w or '/VA' in w or '/VV' in w)]
    return words

In [5]:
def scan_vocabulary(sents, tokenize, min_count=2):
    temp = Counter(w for sent in sents for w in tokenize(sent))
    temp = {w:c for w,c in temp.items() if c >= min_count}
    counter = {}
    for i in temp.keys():
        w = i.split('/')[0]        
        if w == '원':
            continue
        if len(w) == 1 and temp[i] <5 :
            continue
        counter[i] = temp[i] 
    idx_to_vocab = [w for w, _ in sorted(counter.items(), key=lambda x:-x[1])]
    vocab_to_idx = {vocab:idx for idx, vocab in enumerate(idx_to_vocab)}
    return idx_to_vocab, vocab_to_idx

In [6]:
def pagerank(x, df=0.85, max_iter=30):
    assert 0 < df < 1

    # initialize
    A = normalize(x, axis=0, norm='l1')
    R = np.ones(A.shape[0]).reshape(-1,1)
    bias = (1 - df) * np.ones(A.shape[0]).reshape(-1,1)
    
    # iteration
    for _ in range(max_iter):
        R = df * (A * R) + bias
    return R

In [7]:
def textrank_sent_sim(s1, s2):
    n1 = len(s1)
    n2 = len(s2)
    if (n1 <= 1) or (n2 <= 1):
        return 0
    common = len(set(s1).intersection(set(s2)))
    base = math.log(n1) + math.log(n2)
    return common / base

In [8]:
def cosine_sent_sim(s1, s2):
    if (not s1) or (not s2):
        return 0

    s1 = Counter(s1)
    s2 = Counter(s2)
    norm1 = math.sqrt(sum(v ** 2 for v in s1.values()))
    norm2 = math.sqrt(sum(v ** 2 for v in s2.values()))
    prod = 0
    for k, v in s1.items():
        prod += v * s2.get(k, 0)
    return prod / (norm1 * norm2)

In [18]:
def sent_graph(sents, tokenize, similarity, min_count=2, min_sim=0.1):
    _, vocab_to_idx = scan_vocabulary(sents, tokenize, min_count)

    tokens = [[w for w in tokenize(sent) if w in vocab_to_idx] for sent in sents]
    print(tokens)
    rows, cols, data = [], [], []
    n_sents = len(tokens)
    for i, tokens_i in enumerate(tokens):
        for j, tokens_j in enumerate(tokens):
            if i >= j:
                continue
            sim = similarity(tokens_i, tokens_j)
            if sim < min_sim:
                continue
            rows.append(i)
            cols.append(j)
            data.append(sim)
    return csr_matrix((data, (rows, cols)), shape=(n_sents, n_sents))

In [10]:
def textrank_keysentence(sents, tokenize, min_count, min_sim, similarity, df=0.85, max_iter=30, topk= 3 ):
    g = sent_graph(sents, tokenize,  similarity ,min_count, min_sim )
    R = pagerank(g, df, max_iter).reshape(-1)
    idxs = R.argsort()[-topk:]
    key_index = [ idx for idx in reversed(idxs)]
#     keysents = [(idx, R[idx], sents[idx]) for idx in reversed(idxs)]
#     summary_3 = '\n'.join( [sents[idx] for idx in reversed(idxs) ]  )
    return key_index

In [17]:
submission2 =  pd.DataFrame( columns = ['id' , 'summary'])
# submission2 =  pd.DataFrame( columns = ['id' , 'summary'])
# submission3 =  pd.DataFrame( columns = ['id' , 'summary'])
# submission4 =  pd.DataFrame( columns = ['id' , 'summary'])

In [19]:
# test
with open(input_file_name, 'r', encoding = 'utf-8', newline = '') as input_file:
    i = 0
    for line in input_file:
        line = json.loads(line)
        id_num, sents = list(line.values())[1:]
        preprocessed = [ preprocess_sentence(sent) for sent in sents ]
        key_index = textrank_keysentence(preprocessed , komoran_tokenize , 2 , 0.1 , cosine_sent_sim )
        key_sentence1 ='\n'.join([sents[t] for t in key_index])      
        i += 1
        if i == 10:
            break

[['석문/NNP', '간척지/NNG', '임차/NNG', '법인/NNP', '협의회/NNG', '한국농어촌공사/NNP', '당진/NNP', '지사/NNP', '공공/NNG', '비축/NNG', '벼/NNG', '시위/NNG', '벌이/VV', '있/VV'], ['석문/NNP', '간척지/NNG', '임차/NNG', '법인/NNP', '협의회/NNG', '농림/NNP', '축산/NNP', '식품/NNG', '간척지/NNG', '임대료/NNG', '책정/NNG', '한국농어촌공사/NNP', '당진/NNP', '지사/NNP', '공공/NNG', '비축/NNG', '벼/NNG', '시위/NNG', '벌이/VV'], ['영농/NNG', '조합/NNP', '법인/NNP', '간척지/NNG', '협의회/NNG', '벼/NNG', '시위/NNG', '책정/NNG', '임대료/NNG', '인하/NNG', '있/VV'], ['지나/VV', '농림/NNP', '축산/NNP', '식품/NNG', '임대료/NNG', '올해/NNG', '상황/NNG'], ['임차/NNG', '올해/NNG', '임대료/NNG', '인하/NNG', '지나/VV', '동안/NNG', '보상/NNG', '상황/NNG'], ['간척지/NNG', '협의회/NNG', '기간/NNG', '연장/NNG', '연장/NNG', '기간/NNG', '동안/NNG', '인하/NNG', '임대료/NNG', '지나/VV', '보상/NNG'], ['벼/NNG', '시위/NNG', '임대료/NNG', '벼/NNG'], ['영농/NNG', '조합/NNP', '법인/NNP', '한국농어촌공사/NNP', '있/VV', '상황/NNG'], ['만들/VV', '있/VV', '만들/VV', '있/VV']]
[['벌/NNG', '떼/NNG', '해장국/NNP', '손님/NNG', '보답/NNG', '시간/NNP', '영업/NNG', '재개/NNG'], ['특별/XR', '감사/NNG', '이벤트/NNG'], ['동안/NNG', '야간/NNG'

In [16]:
# try 5
with open(input_file_name, 'r', encoding = 'utf-8', newline = '') as input_file:
    i = 0
    for line in input_file:
        line = json.loads(line)
        id_num, sents = list(line.values())[1:]
        preprocessed = [ preprocess_sentence(sent) for sent in sents ]
        key_index = textrank_keysentence(preprocessed , komoran_tokenize , 2 , 0.1 , textrank_sent_sim   )
        key_sentence1 ='\n'.join([sents[t] for t in key_index])
        row1 = [id_num , key_sentence1 ]
        submission2.loc[i] = row1
        i +=1

KeyboardInterrupt: 

In [15]:
submission2.to_csv('C:/Users/Playdata/Desktop/dacon_extract_summary/extractive_submission2.csv', sep=',', index=False)