In [23]:
%load_ext autoreload
%autoreload 2
from tqdm import tqdm
import os
import numpy as np

import json
import random
from collections import Counter

from konlpy.tag import Komoran
komoran = Komoran()

os.chdir('../')
import yake
from yake.korea_token import edit_josa, edit_sentences
os.chdir('test_dacon/')

from sklearn.feature_extraction.text import TfidfVectorizer 
from collections import defaultdict

from krwordrank.word import KRWordRank
import math

from sentence_transformers import SentenceTransformer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from pandas import DataFrame
from test_testmain import RougeScorer

import warnings
warnings.filterwarnings(action='ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# stopword.txt 읽어오기 위한 path 및 파일이름
language = 'ko'
path = '../yake/StopwordsList/'
txt = 'stopwords_{}.txt'.format(language)

try:
    f = open(path + txt, encoding='UTF-8')
    ExistStopwords = set(f.read().split())
    f.close()
except FileNotFoundError:
    ExistStopwords = [None]

ExistStopwords = list(set(ExistStopwords))

In [12]:
with open("valid_original.json", "r", encoding="utf8") as f:
    contents = f.read() # string 타입
    json_data = json.loads(contents)
    f.close()

In [13]:
def noun_parser(txt:str):
    noun = komoran.nouns(txt)
    count = Counter(noun)

    # 명사 빈도 카운트
    noun_list = count.most_common(50)

    return_list = [i[0] for i in noun_list if len(i[0]) > 1 and i[0] not in ExistStopwords]
    return_list = return_list[:10]

    return return_list

In [14]:
def tfidf_parser(txt:list):
    vectorizer = TfidfVectorizer()
    vectorizer.fit(txt)
    matrix = vectorizer.transform(txt)

    # vocabulary_word_id = defaultdict(int)
    vocabulary_word_id = {token:idx for idx,token in enumerate(vectorizer.get_feature_names())}
        
    # 특징 추출 결과: {"token": value}
    result = {token:matrix[0, vocabulary_word_id[token]] for token in vectorizer.get_feature_names()}
        
    # 내림차순 (중요도 high) 기준 정렬
    result = sorted(result.items(), key = lambda item: item[1], reverse = True)

    return_list = [i[0] for i in result if i[1] != 0 and i[0] not in ExistStopwords]
    return_list = return_list[:10]

    return return_list

In [15]:
def wordrank_parser(txt:list):
    wordrank_extractor = KRWordRank(min_count=5, max_length=10)
    keywords, rank, graph = wordrank_extractor.extract(txt, num_keywords=100)

    def make_vocab_score(keywords, scaling=None):
        if scaling is None:
            scaling = lambda x:math.sqrt(x)
        return {word:scaling(rank) for word, rank in keywords.items()}

    keywords = make_vocab_score(keywords)

    return_list = [i for i in keywords if i not in ExistStopwords]
    return_list = return_list[:10]
    
    return return_list

In [16]:
model = SentenceTransformer('sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens')

def keybert_parser(txt:list, model):
    
    tokenized_doc = komoran.pos(txt)
    tokenized_nouns = ' '.join([word[0] for word in tokenized_doc if word[1] in ['NNG','NNP']])

    n_gram_range = (1, 3)
    count = CountVectorizer(ngram_range=n_gram_range).fit([tokenized_nouns])
    candidates = count.get_feature_names_out()

    doc_embedding = model.encode([txt])
    candidate_embeddings = model.encode(candidates)

    top_n = 10
    distances = cosine_similarity(doc_embedding, candidate_embeddings)
    keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]

    return keywords

In [186]:
def kyake(txt:list):
    # print("txt: ", txt)
    kw_extractor = yake.KeywordExtractor(n=[1,2,3],top=30, stoplen=2, windowsSize=1, WG=False, ReDup=False)
    return_tuple = kw_extractor.extract_keywords(''.join(txt))
    # print("return_tuple: ", return_tuple)
    return_list = [i[0] for i in return_tuple]
    return_list = return_list[:10]
    # print("return_list: ", return_list)

    return return_list

In [189]:
news_list = np.random.choice(range(0,30120),300,replace=False)  # max: 30122 || 30122개의 뉴스 데이터 # random.randint(0, 30000)
df_trainset = DataFrame(columns=["id", "summary"])
df_testset = DataFrame(columns=["id", "summary"])

scorer = RougeScorer()
_dataframe = DataFrame()

R1 = []
R2 = []
Rl = []
Rw = []

for i in tqdm(news_list):
    
    id = None
    text_list = []
    topic_number = None

    text_list = [
        json_data["documents"][i]["text"][j][k]["sentence"] + '.'
        for j in range(len(json_data["documents"][i]["text"]))
        for k in range(len(json_data["documents"][i]["text"][j]))
    ]

    topic_number = json_data["documents"][i]["extractive"]
    id = [int(json_data["documents"][0]['id'])]

    text = [' '.join(text_list)]
    STRTexT = ' '.join(text_list)
    LISTText = text_list

    new_text = edit_sentences(STRTexT)
    total_value = [' '.join(edit_josa(new_text[x])) for x in range(len(new_text))]
    TUNNEDText = total_value

    ### 빈도수 기반 명사 추출: noun_parser(STRTexT)      	 ||	txt:str
    ### TF.IDF: tfidf_parser(TUNNEDText)    	        ||	txt:list
    ### KR-WordRank: wordrank_parser(TUNNEDText)	    ||	txt:list
    ### Korean KeyBERT" keybert_parser(STRTexT, model) 	||	txt:list
    ### Kyake 키워드 추출: kyake(text)                     || txt:list

    # keywords = noun_parser(STRTexT)
    # keywords = tfidf_parser(TUNNEDText)
    # keywords = wordrank_parser(LISTText)  # TUNNEDText
    # keywords = keybert_parser(STRTexT, model)
    keywords = kyake(text)

    score = 0
    for l in range(len(text_list)):     # text_list[l] : text_list의 l번째 문장
        for m in range(len(keywords)):  # keywords : text파일의 문서를 각 함수들을 돌린 후 도출되는 키워드들.
            if keywords[m] in text_list[l]:
                score = score + 1
        score_data = dict(zip(["id", "index", "score"], [i, l, score]))
        score = 0
        _dataframe = _dataframe.append(score_data, ignore_index=True)

    multi_index = _dataframe.sort_values(by=['score', 'index'], ascending=[False, True]).groupby("id").head(3)
    multi_index = multi_index.sort_values(by=['id','score'], ascending=[True, False])

    summary_number = [int (i) for i in list(multi_index[multi_index['id']==i]['index'])]

    df_testset['id'] = id
    df_testset['summary'] = [[text_list[int(summary_number[0])],text_list[int(summary_number[1])], text_list[int(summary_number[2])]]]
    df_trainset['id'] = id
    df_trainset['summary'] = [[text_list[int(topic_number[0])],text_list[int(topic_number[1])], text_list[int(topic_number[2])]]]

    score_set = scorer.compute_rouge(df_trainset, df_testset)
    rouge_1 = score_set[0]
    rouge_2 = score_set[1]
    rouge_l = score_set[2]
    rouge_w = score_set[3]

    R1.append(rouge_1)
    R2.append(rouge_2)
    Rl.append(rouge_l)
    Rw.append(rouge_w)

    # print("키워드들: ", keywords)
    # print(i,"번째 문단..")
    # print("선택 문장 : ", (summary_number))
    # print([text_list[int(summary_number[0])],text_list[int(summary_number[1])], text_list[int(summary_number[2])]])
    # print("정답지 : ", topic_number)
    # print([text_list[int(topic_number[0])],text_list[int(topic_number[1])], text_list[int(topic_number[2])]])
    # print()

print("rouge_1의 평균점수: ", sum(R1) / len(R1))
print("rouge_2의 평균점수: ", sum(R2) / len(R2))
print("rouge_l의 평균점수: ", sum(Rl) / len(Rl))
print("rouge_w의 평균점수: ", sum(Rw) / len(Rw))

100%|██████████| 1000/1000 [05:42<00:00,  2.92it/s]

rouge_1의 평균점수:  0.4265763741125359
rouge_2의 평균점수:  0.2914302183256145
rouge_l의 평균점수:  0.3807695787165743
rouge_w의 평균점수:  0.23431972013031416





# 30000개 테스트

In [190]:
news_list = np.random.choice(range(0,30120),30000,replace=False)  # max: 30122 || 30122개의 뉴스 데이터 # random.randint(0, 30000)

* noun

In [191]:
df_trainset = DataFrame(columns=["id", "summary"])
df_testset = DataFrame(columns=["id", "summary"])

scorer = RougeScorer()
_dataframe = DataFrame()

R1 = []
R2 = []
Rl = []
Rw = []

for i in tqdm(news_list):
    
    id = None
    topic_number = None
    
    text_list = [
        json_data["documents"][i]["text"][j][k]["sentence"] + '.' 
        for j in range(len(json_data["documents"][i]["text"])) 
        for k in range(len(json_data["documents"][i]["text"][j]))
    ]

    topic_number = json_data["documents"][i]["extractive"]
    id = [int(json_data["documents"][0]['id'])]

    text = [' '.join(text_list)]
    STRTexT = ' '.join(text_list)
    LISTText = text_list

    new_text = edit_sentences(STRTexT)
    total_value = [' '.join(edit_josa(new_text[x])) for x in range(len(new_text))]
    TUNNEDText = total_value

    ### 빈도수 기반 명사 추출: noun_parser(STRTexT)      	 ||	txt:str

    """
    return : list
    """

    keywords = noun_parser(STRTexT)

    score = 0
    for l in range(len(text_list)):         # text_list[l] : text_list의 l번째 문장
        for m in range(len(keywords)):      # keywords : text파일의 문서를 각 함수들을 돌린 후 도출되는 키워드들.
            if keywords[m] in text_list[l]:
                score = score + 1
        score_data = dict(zip(["id", "index", "score"], [i, l, score]))
        score = 0
        _dataframe = _dataframe.append(score_data, ignore_index=True)

    multi_index = _dataframe.sort_values(by=['score', 'index'], ascending=[False, True]).groupby("id").head(3)
    multi_index = multi_index.sort_values(by=['id','score'], ascending=[True, False])

    summary_number = [int (i) for i in list(multi_index[multi_index['id']==i]['index'])]

    try:    
        df_testset['id'] = id
        df_testset['summary'] = [[text_list[int(summary_number[0])],text_list[int(summary_number[1])], text_list[int(summary_number[2])]]]
        df_trainset['id'] = id
        df_trainset['summary'] = [[text_list[int(topic_number[0])],text_list[int(topic_number[1])], text_list[int(topic_number[2])]]]
    except TypeError:
        print("TypeError 발생: ", json_data["documents"][i]["id"])
    
    score_set = scorer.compute_rouge(df_trainset, df_testset)
    rouge_1 = score_set[0]
    rouge_2 = score_set[1]
    rouge_l = score_set[2]
    rouge_w = score_set[3]

    R1.append(rouge_1)
    R2.append(rouge_2)
    Rl.append(rouge_l)
    Rw.append(rouge_w)

print("rouge_1의 평균점수: ", sum(R1) / len(R1))
print("rouge_2의 평균점수: ", sum(R2) / len(R2))
print("rouge_l의 평균점수: ", sum(Rl) / len(Rl))
print("rouge_w의 평균점수: ", sum(Rw) / len(Rw))

 46%|████▌     | 13811/30000 [15:05:11<58:25,  4.62it/s]     

TypeError 발생:  353136918


100%|██████████| 30000/30000 [16:29:14<00:00,  1.98s/it]  

rouge_1의 평균점수:  0.40289508598456664
rouge_2의 평균점수:  0.25501628855825076
rouge_l의 평균점수:  0.3500841162403251
rouge_w의 평균점수:  0.2132214644427696





* tfidf

In [192]:
df_trainset = DataFrame(columns=["id", "summary"])
df_testset = DataFrame(columns=["id", "summary"])

scorer = RougeScorer()
_dataframe = DataFrame()

R1 = []
R2 = []
Rl = []
Rw = []

for i in tqdm(news_list):
    
    id = None
    topic_number = None
    
    text_list = [
        json_data["documents"][i]["text"][j][k]["sentence"] + '.' 
        for j in range(len(json_data["documents"][i]["text"])) 
        for k in range(len(json_data["documents"][i]["text"][j]))
    ]

    topic_number = json_data["documents"][i]["extractive"]
    id = [int(json_data["documents"][0]['id'])]

    text = [' '.join(text_list)]
    STRTexT = ' '.join(text_list)
    LISTText = text_list

    new_text = edit_sentences(STRTexT)
    total_value = [' '.join(edit_josa(new_text[x])) for x in range(len(new_text))]
    TUNNEDText = total_value

    ### TF.IDF: tfidf_parser(TUNNEDText)    	        ||	txt:list

    """
    return : list
    """

    keywords = tfidf_parser(LISTText)

    score = 0
    for l in range(len(LISTText)):         # text_list[l] : text_list의 l번째 문장
        for m in range(len(keywords)):     # keywords : text파일의 문서를 각 함수들을 돌린 후 도출되는 키워드들.
            if keywords[m] in LISTText[l]:
                score = score + 1
        score_data = dict(zip(["id", "index", "score"], [i, l, score]))
        score = 0
        _dataframe = _dataframe.append(score_data, ignore_index=True)

    multi_index = _dataframe.sort_values(by=['score', 'index'], ascending=[False, True]).groupby("id").head(3)
    multi_index = multi_index.sort_values(by=['id','score'], ascending=[True, False])

    summary_number = [int (i) for i in list(multi_index[multi_index['id']==i]['index'])]

    try:    
        df_testset['id'] = id
        df_testset['summary'] = [[text_list[int(summary_number[0])],text_list[int(summary_number[1])], text_list[int(summary_number[2])]]]
        df_trainset['id'] = id
        df_trainset['summary'] = [[text_list[int(topic_number[0])],text_list[int(topic_number[1])], text_list[int(topic_number[2])]]]
    except TypeError:
        print("TypeError 발생: ", json_data["documents"][i]["id"])
    
    score_set = scorer.compute_rouge(df_trainset, df_testset)
    rouge_1 = score_set[0]
    rouge_2 = score_set[1]
    rouge_l = score_set[2]
    rouge_w = score_set[3]

    R1.append(rouge_1)
    R2.append(rouge_2)
    Rl.append(rouge_l)
    Rw.append(rouge_w)
    
print("rouge_1의 평균점수: ", sum(R1) / len(R1))
print("rouge_2의 평균점수: ", sum(R2) / len(R2))
print("rouge_l의 평균점수: ", sum(Rl) / len(Rl))
print("rouge_w의 평균점수: ", sum(Rw) / len(Rw))

 46%|████▌     | 13811/30000 [46:44<1:22:08,  3.28it/s]

TypeError 발생:  353136918


100%|██████████| 30000/30000 [2:04:56<00:00,  4.00it/s]  

rouge_1의 평균점수:  0.49430827391830257
rouge_2의 평균점수:  0.3755918322511678
rouge_l의 평균점수:  0.4550028104580825
rouge_w의 평균점수:  0.28702157616674867





* wordrank

In [193]:
df_trainset = DataFrame(columns=["id", "summary"])
df_testset = DataFrame(columns=["id", "summary"])

scorer = RougeScorer()
_dataframe = DataFrame()

R1 = []
R2 = []
Rl = []
Rw = []

for i in tqdm(news_list):
    
    id = None
    topic_number = None
    
    text_list = [
        json_data["documents"][i]["text"][j][k]["sentence"] + '.' 
        for j in range(len(json_data["documents"][i]["text"])) 
        for k in range(len(json_data["documents"][i]["text"][j]))
    ]

    topic_number = json_data["documents"][i]["extractive"]
    id = [int(json_data["documents"][0]['id'])]

    text = [' '.join(text_list)]
    STRTexT = ' '.join(text_list)
    LISTText = text_list

    new_text = edit_sentences(STRTexT)
    total_value = [' '.join(edit_josa(new_text[x])) for x in range(len(new_text))]
    TUNNEDText = total_value

    ### KR-WordRank: wordrank_parser(TUNNEDText)	    ||	txt:list

    """
    return : list
    """

    keywords = wordrank_parser(LISTText)  # TUNNEDText

    score = 0
    for l in range(len(text_list)):         # text_list[l] : text_list의 l번째 문장
        for m in range(len(keywords)):      # keywords : text파일의 문서를 각 함수들을 돌린 후 도출되는 키워드들.
            if keywords[m] in text_list[l]:
                score = score + 1
        score_data = dict(zip(["id", "index", "score"], [i, l, score]))
        score = 0
        _dataframe = _dataframe.append(score_data, ignore_index=True)

    multi_index = _dataframe.sort_values(by=['score', 'index'], ascending=[False, True]).groupby("id").head(3)
    multi_index = multi_index.sort_values(by=['id','score'], ascending=[True, False])

    summary_number = [int (i) for i in list(multi_index[multi_index['id']==i]['index'])]

    try:    
        df_testset['id'] = id
        df_testset['summary'] = [[text_list[int(summary_number[0])],text_list[int(summary_number[1])], text_list[int(summary_number[2])]]]
        df_trainset['id'] = id
        df_trainset['summary'] = [[text_list[int(topic_number[0])],text_list[int(topic_number[1])], text_list[int(topic_number[2])]]]
    except TypeError:
        print("TypeError 발생: ", json_data["documents"][i]["id"])
    
    score_set = scorer.compute_rouge(df_trainset, df_testset)
    rouge_1 = score_set[0]
    rouge_2 = score_set[1]
    rouge_l = score_set[2]
    rouge_w = score_set[3]

    R1.append(rouge_1)
    R2.append(rouge_2)
    Rl.append(rouge_l)
    Rw.append(rouge_w)

print("rouge_1의 평균점수: ", sum(R1) / len(R1))
print("rouge_2의 평균점수: ", sum(R2) / len(R2))
print("rouge_l의 평균점수: ", sum(Rl) / len(Rl))
print("rouge_w의 평균점수: ", sum(Rw) / len(Rw))

 46%|████▌     | 13811/30000 [1:44:53<50:05,  5.39it/s]     

TypeError 발생:  353136918


100%|██████████| 30000/30000 [3:00:12<00:00,  2.77it/s]  

rouge_1의 평균점수:  0.4164455377369773
rouge_2의 평균점수:  0.2728152794166633
rouge_l의 평균점수:  0.3662498021335229
rouge_w의 평균점수:  0.22441687718580353





* Kyake

In [195]:
SpecialToken = r"""#(*+/\:;<=[^_₩{|~‘“""" # 띄우기
NotSpecialToken = r""",'"]}>)”’"""   # 붙이기
                            # 여기에 없으면 유지
exclude = set(SpecialToken)
unexclude = set(NotSpecialToken)

# 특수문자 제거 함수
def str2token(TokenList):
    NewTokenList = []

    for i in TokenList:
        if i in exclude:
            TokenList = TokenList.replace(i, ' ')

        elif i in unexclude:
            TokenList = TokenList.replace(i, '')

    # NewTokenList = TokenList.split()
    return TokenList


In [196]:
df_trainset = DataFrame(columns=["id", "summary"])
df_testset = DataFrame(columns=["id", "summary"])

scorer = RougeScorer()
_dataframe = DataFrame()

R1 = []
R2 = []
Rl = []
Rw = []

for i in tqdm(news_list):
    
    id = None
    topic_number = None
    
    text_list = [
        str2token(json_data["documents"][i]["text"][j][k]["sentence"] + '.') 
        for j in range(len(json_data["documents"][i]["text"])) 
        for k in range(len(json_data["documents"][i]["text"][j]))
    ]

    topic_number = json_data["documents"][i]["extractive"]
    id = [int(json_data["documents"][0]['id'])]

    text = [' '.join(text_list)]
    STRTexT = ' '.join(text_list)
    LISTText = text_list

    new_text = edit_sentences(STRTexT)
    total_value = [' '.join(edit_josa(new_text[x])) for x in range(len(new_text))]
    TUNNEDText = total_value

    ### Kyake 키워드 추출: kyake(text)                     || txt:list

    """
    return : list
    """

    keywords = kyake(text)

    score = 0
    for l in range(len(text_list)):         # text_list[l] : text_list의 l번째 문장
        for m in range(len(keywords)):      # keywords : text파일의 문서를 각 함수들을 돌린 후 도출되는 키워드들.
            if keywords[m] in text_list[l]:
                score = score + 1
        score_data = dict(zip(["id", "index", "score"], [i, l, score]))
        score = 0
        _dataframe = _dataframe.append(score_data, ignore_index=True)

    multi_index = _dataframe.sort_values(by=['score', 'index'], ascending=[False, True]).groupby("id").head(3)
    multi_index = multi_index.sort_values(by=['id','score'], ascending=[True, False])

    summary_number = [int (i) for i in list(multi_index[multi_index['id']==i]['index'])]

    try:    
        df_testset['id'] = id
        df_testset['summary'] = [[text_list[int(summary_number[0])],text_list[int(summary_number[1])], text_list[int(summary_number[2])]]]
        df_trainset['id'] = id
        df_trainset['summary'] = [[text_list[int(topic_number[0])],text_list[int(topic_number[1])], text_list[int(topic_number[2])]]]
    except TypeError:
        print("TypeError 발생: ", json_data["documents"][i]["id"])
    
    score_set = scorer.compute_rouge(df_trainset, df_testset)
    rouge_1 = score_set[0]
    rouge_2 = score_set[1]
    rouge_l = score_set[2]
    rouge_w = score_set[3]

    R1.append(rouge_1)
    R2.append(rouge_2)
    Rl.append(rouge_l)
    Rw.append(rouge_w)

print("rouge_1의 평균점수: ", sum(R1) / len(R1))
print("rouge_2의 평균점수: ", sum(R2) / len(R2))
print("rouge_l의 평균점수: ", sum(Rl) / len(Rl))
print("rouge_w의 평균점수: ", sum(Rw) / len(Rw))

 46%|████▌     | 13811/30000 [1:24:31<1:48:09,  2.49it/s]

TypeError 발생:  353136918
TypeError 발생:  <class 'TypeError'>


100%|██████████| 30000/30000 [17:51:20<00:00,  2.14s/it]       

rouge_1의 평균점수:  0.4866184531288278
rouge_2의 평균점수:  0.36551729143361406
rouge_l의 평균점수:  0.4469727954869353
rouge_w의 평균점수:  0.28046706763847146





* KeyBERT 모델은 기본 점수가 좋지 않을 뿐더러 시간이 오래 걸리는 관계로 생략함