## 1.Import 및 라이브러리 다운로드

In [1]:
import gc
import torch
import sys

gc.collect()
torch.cuda.empty_cache()

sys.path.append('~/aiffel/Aiffelthon_koBART')

In [2]:
!pip install rouge_score
!pip install datasets==1.0.2
!pip install transformers==4.24.0
!pip install transformer-utils
!pip install packaging



In [3]:
# 필요한 라이브러리 불러오기
import datasets
import transformers
import pandas as pd
from datasets import Dataset

#Tokenizer
from transformers import RobertaTokenizerFast

#Encoder-Decoder Model
from transformers import EncoderDecoderModel

#Training
from seq2seq_trainer import Seq2SeqTrainer
from transformers import TrainingArguments
from seq2seq_training_args import Seq2SeqTrainingArguments
from dataclasses import dataclass, field
from typing import Optional

import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset, load_from_disk
import numpy as np
import nltk

from tqdm import tqdm

In [4]:
# 1. Data EDA
### 이 과정에서는 Data의 특성을 파악하고 얼만큼의 Data를 사용할 것인지 파악할 것이다
#- 예를 들어 토큰화 이후 Data별 Len을 확인하였을 때 적절한 len을 찾는 것이 목표이다.


# 데이터 불러오기
train_20sent = pd.read_csv('data/train_20per_Sum3.csv')
val_20sent = pd.read_csv('data/train_20per_Sum3.csv')


# 데이터 별 길이
# 현재 train, val의 길이가 동일한 것으로 나오는데 데이터 추출에서 문제가 발생한 것으로 예측됨.

print('train 20%sen : ',len(train_20sent))
print('val 20%sen : ',len(val_20sent))


train 20%sen :  73431
val 20%sen :  73431


In [5]:
type(train_20sent['input'])

pandas.core.series.Series

In [6]:
def preprocess_sentence(sentence):
    sentence = sentence.str.lower() # 텍스트 소문자화
    sentence = re.sub(r'\([^)]*\)', '', sentence) # 괄호로 닫힌 문자열 (...) 제거
    sentence = re.sub('"','', sentence) # 쌍따옴표 제거
    sentence = re.sub("'",'', sentence) # 따옴표 제거
    sentence = re.sub('\n','', sentence) # \n " 제거
    sentence = re.sub('.{2,3}\W{0,1}기자','', sentence) # 기자 이름 제거
    sentence = re.sub(r'[?.!,][/?.!,]', '', sentence) # 여러개 문장 부호를 하나의 문장부호로 바꿉니다
    sentence = re.sub("[^ㄱ-ㅎㅏ-ㅣ가-힣a-z0-9]", " ", sentence) # 영어 외 문자(숫자, 특수문자 등) 공백으로 변환
    sentence = re.sub(r'[" "]+', " ", sentence) # 여러개 공백을 하나의 공백으로 바꿉니다.
    sentence = sentence.strip() # 문장 양쪽 공백 제거

    return sentence


In [7]:
clean_text = []
clean_headlines = []

for i in tqdm(train_20sent['input']):
    clean_text.append(i)
for i in tqdm(train_20sent['sentence_per_20']):
    clean_headlines.append(i)

100%|██████████| 73431/73431 [00:00<00:00, 1874775.92it/s]
100%|██████████| 73431/73431 [00:00<00:00, 2100699.37it/s]


In [8]:
train_20sent['input'] = clean_text
train_20sent['sentence_per_20'] = clean_headlines

In [9]:
# reset_index 사용
train_20sent.reset_index(inplace=True, drop=True)
val_20sent.reset_index(inplace=True, drop=True)

In [10]:
# DF > data Set으로 전환
train_data = Dataset.from_pandas(train_20sent) 
val_len = len(val_20sent) // 2
val_data = Dataset.from_pandas(val_20sent[:val_len])
test_data=Dataset.from_pandas(val_20sent[val_len:])

In [11]:
print(train_data)
print(val_data)
print(test_data)

Dataset(features: {'input': Value(dtype='string', id=None), 'sentence_per_20': Value(dtype='string', id=None)}, num_rows: 73431)
Dataset(features: {'input': Value(dtype='string', id=None), 'sentence_per_20': Value(dtype='string', id=None)}, num_rows: 36715)
Dataset(features: {'input': Value(dtype='string', id=None), 'sentence_per_20': Value(dtype='string', id=None)}, num_rows: 36716)


In [12]:
max_input = 512
max_target = 128
batch_size = 3
model_checkpoints = "gogamza/kobart-base-v1"

In [13]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoints)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [14]:
encoder_max_length = 256  # demo
decoder_max_length = 64 # 글자가 끊김 -> 이걸 더 늘려줄 필요가 있음

In [16]:
def preprocess_data(data_to_process):
  #get all the dialogues
  inputs = [dialogue for dialogue in data_to_process['input']]
  #tokenize the dialogues
  model_inputs = tokenizer(inputs,  max_length=max_input, padding='max_length', truncation=True)
  #tokenize the summaries
  with tokenizer.as_target_tokenizer():
    targets = tokenizer(data_to_process['sentence_per_20'], max_length=max_target, padding='max_length', truncation=True)
    
  #set labels
  model_inputs['labels'] = targets['input_ids']
  #return the tokenized data
  #input_ids, attention_mask and labels
  return model_inputs

In [17]:
train_tokenize_data = train_data.map(preprocess_data, batched = True, remove_columns=['input', 'sentence_per_20'])
val_tokenize_data = val_data.map(preprocess_data, batched = True, remove_columns=['input', 'sentence_per_20'])



  0%|          | 0/74 [00:00<?, ?ba/s]

  0%|          | 0/37 [00:00<?, ?ba/s]

In [21]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoints)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [22]:
rouge = datasets.load_metric("rouge")

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),}

In [23]:
training_args = Seq2SeqTrainingArguments(
    output_dir="results6",
    num_train_epochs=5,  # demo
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=16,  # demo
    per_device_eval_batch_size=16,
    learning_rate=3e-05,
    warmup_steps=500,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    predict_with_generate=True,
    logging_dir="logs",
    logging_steps=2000,
    save_total_limit=3,
)

In [24]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [25]:
trainer = Seq2SeqTrainer(
    model, 
    training_args,
    train_dataset=train_tokenize_data,
    eval_dataset=val_tokenize_data,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [26]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 73431
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 22950
  Number of trainable parameters = 123859968
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
2000,1.9739
4000,1.4751
6000,1.4506
8000,1.4411
10000,1.4332
12000,1.4249
14000,1.4218
16000,1.4136
18000,1.413
20000,1.4077


Saving model checkpoint to results6/checkpoint-500
Configuration saved in results6/checkpoint-500/config.json
Model weights saved in results6/checkpoint-500/pytorch_model.bin
tokenizer config file saved in results6/checkpoint-500/tokenizer_config.json
Special tokens file saved in results6/checkpoint-500/special_tokens_map.json
Saving model checkpoint to results6/checkpoint-1000
Configuration saved in results6/checkpoint-1000/config.json
Model weights saved in results6/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in results6/checkpoint-1000/tokenizer_config.json
Special tokens file saved in results6/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to results6/checkpoint-1500
Configuration saved in results6/checkpoint-1500/config.json
Model weights saved in results6/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in results6/checkpoint-1500/tokenizer_config.json
Special tokens file saved in results6/checkpoint-1500/special_tokens_map.json
Sav

TrainOutput(global_step=22950, training_loss=1.475346052049292, metrics={'train_runtime': 28541.6505, 'train_samples_per_second': 12.864, 'train_steps_per_second': 0.804, 'total_flos': 1.119338946625536e+17, 'train_loss': 1.475346052049292, 'epoch': 5.0})

In [27]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 36715
  Batch size = 16


{'eval_loss': 1.4190137386322021,
 'eval_rouge2_precision': 0.1506,
 'eval_rouge2_recall': 0.0689,
 'eval_rouge2_fmeasure': 0.0866,
 'eval_runtime': 2095.0949,
 'eval_samples_per_second': 17.524,
 'eval_steps_per_second': 1.095,
 'epoch': 5.0}

In [30]:
def generate_summary(test_samples, model):
    inputs = tokenizer(
        test_samples["input"],
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
        return_tensors="pt",
    )
    print('통과2')
    input_ids = inputs.input_ids.to(model.device)
    
    attention_mask = inputs.attention_mask.to(model.device)
    outputs = model.generate(input_ids, attention_mask=attention_mask)
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return outputs, output_str


model_before_tuning = AutoModelForSeq2SeqLM.from_pretrained("gogamza/kobart-base-v1")# 여기에 기본 kobart가져오기?

test_samples = val_data.select(range(16))

summaries_before_tuning = generate_summary(test_samples, model_before_tuning)[1]
summaries_after_tuning = generate_summary(test_samples, model)[1] # 여기에 체크포인트 가져오기 
# 연구해봐야한다.

loading configuration file config.json from cache at /aiffel/.cache/huggingface/hub/models--gogamza--kobart-base-v1/snapshots/d7e64abd841bc1fa5d2939d14161124c51f29e8b/config.json
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
Model config BartConfig {
  "_name_or_path": "gogamza/kobart-base-v1",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.0,
  "author": "Heewon Jeon(madjakarta@gmail.com)",
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.1,
  "d_model": 768,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 1,
  "do_blenderbot_90_layernorm": false,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 3072,
  "

통과2


  scores = () if (return_dict_in_generate and output_scores) else None


통과2


In [34]:
from tabulate import tabulate

In [35]:
print(
    tabulate(
        zip(
            range(len(summaries_after_tuning)),
            summaries_after_tuning,
            summaries_before_tuning,
        ),
        headers=["Id", "Summary after", "Summary before"],
    )
)
print("\nTarget summaries:\n")
print(
    tabulate(list(enumerate(test_samples["sentence_per_20"])), headers=["Id", "Target summary"])
)
print("\nSource documents:\n")
print(tabulate(list(enumerate(test_samples["input"])), headers=["Id", "sentence_per_20"]))

  Id  Summary after                                                                                    Summary before
----  -----------------------------------------------------------------------------------------------  ------------------------------------------------------------------------
   0  독일계 음식 배달서비스업체 DH(딜리버리 히어로)가 평가한 우아                                     40억 달러 ‘ ‘ ‘’ 주인공 김봉진 우아한형제들 대표태
   1  한·중 수교 이후 지금까지의 한·중관계는 경제적 협력 관계를 중심으로 발전해 왔다. 이로             예상  베이징  베이징에서 열린 한·중 정상회담을 계기로 내년 봄 시진핑
   2  한·중이 사드에 관한 ‘3불(不)’이 ‘약속’인지 ‘                                                     속’ ‘약속’인지 ‘입장표명’인지 표현을 놓고 갈등하는
   3  배달의민족이 독일 자본에 매각된 것을 놓고 말들이 많다. 민족 정서를                               배달의민족이 독일 자본에 매각된 것을 놓고 말들이 사람들이  민족 정서를
   4  지난 28일부터 나흘간 진행된 7기 5차 노동당 전원회의에서 북한은 핵 무력 개발의                    북한 전원회의에서 북한은 핵        북한 전원회의에서 북한은 핵
   5  부산 해운대의 상징이었던 5성급 해운대그랜드호텔(그랜드호텔)이 지난해                             은 부산대의 상징이었던 5성급 해운대그랜드호텔(그랜드호텔
   6  워런은 지난해 하반기 최고 관심 후보였다. 지난해 1