# 사전 준비

In [None]:
!pip install transformers
!pip install datasets

**네이버 뉴스 요약 데이터 불러오기**

In [2]:
from datasets import load_dataset

# Download (or reuse the cached copy of) the Naver news summarization corpus.
# The returned object holds the train / test / validation splits.
# NOTE: this variable shares its name with the `datasets` package; only
# `load_dataset` is imported here, so nothing is shadowed in this notebook.
datasets = load_dataset("daekeun-ml/naver-news-summarization-ko")

Downloading readme:   0%|          | 0.00/787 [00:00<?, ?B/s]



Downloading and preparing dataset csv/daekeun-ml--naver-news-summarization-ko to /root/.cache/huggingface/datasets/daekeun-ml___csv/daekeun-ml--naver-news-summarization-ko-884ccea06154613b/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/66.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.17M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.45M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/daekeun-ml___csv/daekeun-ml--naver-news-summarization-ko-884ccea06154613b/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Display each split with its column names and row count.
datasets

DatasetDict({
    train: Dataset({
        features: ['date', 'category', 'press', 'title', 'document', 'link', 'summary'],
        num_rows: 22194
    })
    test: Dataset({
        features: ['date', 'category', 'press', 'title', 'document', 'link', 'summary'],
        num_rows: 2740
    })
    validation: Dataset({
        features: ['date', 'category', 'press', 'title', 'document', 'link', 'summary'],
        num_rows: 2466
    })
})

**KoBART 모델과 Tokenizer 불러오기**

In [5]:
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast

# Load the tokenizer and the BART conditional-generation model from the same
# 'gogamza/kobart-base-v2' checkpoint name.
tokenizer = PreTrainedTokenizerFast.from_pretrained('gogamza/kobart-base-v2')
model = BartForConditionalGeneration.from_pretrained('gogamza/kobart-base-v2')

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/682k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/4.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/496M [00:00<?, ?B/s]

# 데이터 구축

**전처리 함수 정의**


In [None]:
!pip install soynlp
!pip install emoji==1.7.0

In [7]:
import re
import emoji
from soynlp.normalizer import repeat_normalize

# Collect every emoji code point so that `pattern` leaves emoji untouched.
# In emoji 1.x `UNICODE_EMOJI` maps language code -> {emoji: name}, so joining its
# top-level keys would only yield language codes ('en', 'es', ...) and no emoji.
# The comprehension handles both that nested layout and the older flat
# {emoji: name} layout; sorting keeps the resulting pattern deterministic.
emojis = ''.join(sorted({
    symbol
    for key, value in emoji.UNICODE_EMOJI.items()
    for symbol in (value.keys() if isinstance(value, dict) else (key,))
}))
# Runs of characters outside the allowed set: listed punctuation, ASCII,
# Hangul jamo/syllables and the emoji collected above.
pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-ㅣ가-힣{emojis}]+')
# http(s) URLs, including an optional path / query part.
url_pattern = re.compile(
    r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
# A punctuation/symbol character (group 1) immediately repeated one or more times.
special_symbol = re.compile(
    r'([.,?!/@$%~％·∼()\x21-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E])\1{1,}')

In [8]:
def clean(x):
    """Normalize raw text before it is tokenized.

    Steps, in order:
      1. Replace every run of characters matched by ``pattern`` with one space.
      2. Remove URLs matched by ``url_pattern``.
      3. Collapse a repeated special symbol (``special_symbol``) to a single one.
      4. Strip leading and trailing whitespace.
      5. Call ``repeat_normalize`` with ``num_repeats=2`` to shorten repeated characters.

    Args:
        x: Text to clean.

    Returns:
        The cleaned text.
    """
    x = pattern.sub(' ', x)
    x = url_pattern.sub('', x)
    # The replacement must be a raw string: '\1' without the r-prefix is the
    # control character U+0001, not a reference to the captured symbol.
    x = special_symbol.sub(r'\1', x)
    x = x.strip()
    x = repeat_normalize(x, num_repeats=2)

    return x

**데이터셋 구축**

In [9]:
import numpy as np
from torch.utils.data import Dataset

class SummarySet(Dataset):
    """Map-style dataset that yields fixed-length id arrays for summarization training.

    Every record in ``docs`` must provide a ``'document'`` and a ``'summary'`` text
    field. Both texts go through ``clean`` and ``tokenizer.encode`` and are then
    padded or truncated to exactly ``max_len`` entries.

    Args:
        docs: Sized, indexable collection of records (``len(docs)``, ``docs[idx]``).
        tokenizer: Object exposing ``encode`` as well as ``bos_token_id``,
            ``eos_token_id`` and ``pad_token_id``.
        max_len: Length of every returned array.
        ignore_id: Filler written into label positions past the end of the summary
            (default -100, the value the original author notes for the ``labels``
            input of ``BartForConditionalGeneration``).
    """

    def __init__(self, docs, tokenizer, max_len, ignore_id = -100):
        super().__init__()
        self.docs = docs
        self.tokenizer = tokenizer
        self.max_len = max_len

        self.bos_id = tokenizer.bos_token_id
        self.eos_id = tokenizer.eos_token_id
        self.pad_id = tokenizer.pad_token_id
        self.ignore_id = ignore_id

    def _fit_to_max_len(self, inputs, fill_id):
        """Right-pad ``inputs`` with ``fill_id`` or cut it so it has ``max_len`` entries."""
        missing = self.max_len - len(inputs)
        if missing > 0:
            return np.concatenate([inputs, np.array([fill_id] * missing)])
        return inputs[:self.max_len]

    def add_padding(self, inputs):
        """Bring an input-id sequence to ``max_len`` using the pad token id."""
        return self._fit_to_max_len(inputs, self.pad_id)

    def add_ignored(self, inputs):
        """Bring a label sequence to ``max_len`` using ``ignore_id`` as filler."""
        return self._fit_to_max_len(inputs, self.ignore_id)

    def __len__(self):
        # len() works for any sized collection, not only objects exposing `.num_rows`.
        return len(self.docs)

    def __getitem__(self, idx):
        """Encode record ``idx``.

        Returns:
            dict with ``'input_ids'``, ``'decoder_input_ids'`` and ``'labels'``,
            each an integer array of length ``max_len``. No attention mask is
            returned; it can be derived from ``input_ids != pad_id`` if needed.
        """
        instance = self.docs[idx]

        input_ids = self.tokenizer.encode(clean(instance['document']))
        input_ids = self.add_padding(input_ids)

        # Keep one slot free for EOS so truncating a long summary never drops it.
        summary_ids = self.tokenizer.encode(clean(instance['summary']))
        labels = list(summary_ids[:max(self.max_len - 1, 0)])
        labels.append(self.eos_id)

        # The decoder input is the label sequence shifted right, starting with EOS.
        dec_input_ids = [self.eos_id] + labels[:-1]
        dec_input_ids = self.add_padding(dec_input_ids)

        labels = self.add_ignored(labels)

        return {'input_ids': np.array(input_ids, dtype=np.int_),
                'decoder_input_ids': np.array(dec_input_ids, dtype=np.int_),
                'labels': np.array(labels, dtype=np.int_)}

In [19]:
# Mean article length in characters (len() of the raw 'document' text), used as a
# rough guide when picking max_len; the recorded run printed ~998.6 and 1024 was chosen.
# NOTE(review): max_len is applied to token ids, not characters - confirm against
# the tokenized length distribution before relying on this figure.
train_documents = datasets['train']['document']   # read the column once instead of decoding every row
total = sum(len(document) for document in train_documents)

avg_doc_len = total / datasets['train'].num_rows
print(avg_doc_len)

998.6017392087952


In [10]:
# Wrap each split in a SummarySet that shares the tokenizer and a 1024-entry length.
train_set, val_set, test_set = (
    SummarySet(datasets[split_name], tokenizer, max_len=1024)
    for split_name in ("train", "validation", "test")
)

**데이터로더 구축**

In [20]:
from torch.utils.data import DataLoader

# Every loader serves batches in dataset order (shuffle=False).
# NOTE(review): the training loader is not shuffled either - confirm this is intended.
# train has 22194 rows: the last batch holds 50 samples at batch_size=64 (18 at 32).
train_dataloader = DataLoader(train_set, batch_size=64, shuffle=False)
# validation has 2466 rows: the last batch holds 34 samples at batch_size=64 (2 at 32).
val_dataloader = DataLoader(val_set, batch_size=64, shuffle=False)
# test has 2740 rows: the last batch holds 20 samples at batch_size=32 (52 at 64).
test_dataloader = DataLoader(test_set, batch_size=32, shuffle=False)

In [21]:
# Pull the first training batch to inspect the collated tensors.
next(iter(train_dataloader))

{'input_ids': tensor([[26001, 15530, 14516,  ...,     3,     3,     3],
         [14111, 11763,  1700,  ...,     3,     3,     3],
         [14036, 13125, 17350,  ...,     3,     3,     3],
         ...,
         [29949, 17588, 14880,  ...,     3,     3,     3],
         [15139, 14040, 14681,  ...,     3,     3,     3],
         [14856, 14698, 28234,  ...,     3,     3,     3]]),
 'decoder_input_ids': tensor([[    1, 14516, 18115,  ...,     3,     3,     3],
         [    1, 14705, 18633,  ...,     3,     3,     3],
         [    1, 14025, 14025,  ...,     3,     3,     3],
         ...,
         [    1, 14195, 15064,  ...,     3,     3,     3],
         [    1, 14802, 14953,  ...,     3,     3,     3],
         [    1, 14770, 14714,  ...,     3,     3,     3]]),
 'labels': tensor([[14516, 18115, 15736,  ...,  -100,  -100,  -100],
         [14705, 18633, 22751,  ...,  -100,  -100,  -100],
         [14025, 14025, 13699,  ...,  -100,  -100,  -100],
         ...,
         [14195, 15064, 1

In [22]:
# Pull the first validation batch to inspect the collated tensors.
next(iter(val_dataloader))

{'input_ids': tensor([[14245, 18597, 14465,  ...,     3,     3,     3],
         [19813, 26114, 14116,  ...,     3,     3,     3],
         [21985, 23170, 19235,  ...,     3,     3,     3],
         ...,
         [19765,  9120, 14558,  ...,     3,     3,     3],
         [20851, 23613, 14913,  ...,     3,     3,     3],
         [14470, 19850, 15106,  ...,     3,     3,     3]]),
 'decoder_input_ids': tensor([[    1, 14195, 16601,  ...,     3,     3,     3],
         [    1, 14025, 14898,  ...,     3,     3,     3],
         [    1, 21985, 23170,  ...,     3,     3,     3],
         ...,
         [    1, 16476, 12147,  ...,     3,     3,     3],
         [    1, 16415, 16602,  ...,     3,     3,     3],
         [    1, 14136, 19850,  ...,     3,     3,     3]]),
 'labels': tensor([[14195, 16601, 14689,  ...,  -100,  -100,  -100],
         [14025, 14898, 16732,  ...,  -100,  -100,  -100],
         [21985, 23170, 19235,  ...,  -100,  -100,  -100],
         ...,
         [16476, 12147,  

In [23]:
# Pull the first test batch to inspect the collated tensors.
next(iter(test_dataloader))

{'input_ids': tensor([[14360, 11790, 19958,  ...,     3,     3,     3],
         [14188, 13679, 18904,  ...,     3,     3,     3],
         [21473, 13173,  9932,  ...,     3,     3,     3],
         ...,
         [16356, 18451, 22473,  ...,     3,     3,     3],
         [16692, 16049, 18643,  ...,     3,     3,     3],
         [16061, 16580, 12074,  ...,     3,     3,     3]]),
 'decoder_input_ids': tensor([[    1, 14360, 21622,  ...,     3,     3,     3],
         [    1, 14188, 13679,  ...,     3,     3,     3],
         [    1, 14029, 15582,  ...,     3,     3,     3],
         ...,
         [    1, 14141, 14986,  ...,     3,     3,     3],
         [    1, 14572, 16356,  ...,     3,     3,     3],
         [    1, 16476, 14670,  ...,     3,     3,     3]]),
 'labels': tensor([[14360, 21622, 14360,  ...,  -100,  -100,  -100],
         [14188, 13679, 18904,  ...,  -100,  -100,  -100],
         [14029, 15582, 12258,  ...,  -100,  -100,  -100],
         ...,
         [14141, 14986, 1

# 모델 학습

**모델 파라미터 설정**

**KoBART 학습 진행**

**모델 테스트**