### Environment

In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset

  from .autonotebook import tqdm as notebook_tqdm


### Load dataset

In [2]:
data_files = {"../dataset_jit/train_jit.csv"}
dataset =  load_dataset("csv", data_files=data_files, delimiter="\t")

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['ko', 'je'],
        num_rows: 160000
    })
})

### Tokenizer

In [4]:
import torch
from tqdm import tqdm
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration

tokenizer = PreTrainedTokenizerFast.from_pretrained('hyunwoongko/kobart')
model = BartForConditionalGeneration.from_pretrained('hyunwoongko/kobart')

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [5]:
examples= {'je': dataset['train']["je"], 
           'ko': dataset['train']["ko"]}

In [6]:
def preprocess_function(example):
    inputs = example['ko']
    targets = example['je']
    model_inputs = tokenizer(inputs, max_length=128, padding='max_length', truncation=True)

    # KoBART의 입력과 출력을 설정합니다.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, padding='max_length', truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, remove_columns=dataset["train"].column_names,batched=True)


In [7]:
print( '원 데이터    :', dataset['train'][10]['je'] )
print( '처리 후 데이터:',  tokenizer.convert_ids_to_tokens(tokenized_datasets['train'][10]['input_ids']))
print( '토큰화       :', tokenized_datasets['train'][10]['input_ids'] )

print('\n')
print( '원 데이터    :', dataset['train'][10]['ko'] )
print( '처리 후 데이터:', tokenizer.convert_ids_to_tokens(tokenized_datasets['train'][10]['input_ids']) )
print( '토큰화       :', tokenized_datasets['train'][10]['labels'] )

원 데이터    : 예 .
처리 후 데이터: ['<s>', '▁예', '▁.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<

In [8]:
import evaluate
import numpy as np
metric = evaluate.load("sacrebleu")

def compute_metrics_bleu(eval_preds):
    preds, labels = eval_preds
    
    if isinstance(preds, tuple):
        preds = preds[0]
    
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    
    return result

In [9]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 160000
    })
})

In [10]:
formatted_train_df = pd.DataFrame([{
    "input_ids": np.array(fd["input_ids"], dtype=np.uint16),
    "attention_mask": np.array(fd["attention_mask"], dtype=np.uint8),
    "labels": np.array(fd["labels"], dtype=np.uint16)
} for fd in tokenized_datasets["train"]])

#formatted_valid_df = pd.DataFrame([{
#    "input_ids": np.array(fd["input_ids"], dtype=np.uint16),
#    "attention_mask": np.array(fd["attention_mask"], dtype=np.uint8),
#    "labels": np.array(fd["labels"], dtype=np.uint16)
#} for fd in tokenized_datasets["valid"]])

In [11]:
train_dataset = Dataset.from_pandas(formatted_train_df)
#valid_dataset = Dataset.from_pandas(formatted_valid_df)

In [12]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 160000
})

In [13]:
train_test_split = train_dataset.train_test_split(test_size=0.1)
dataset_dict = DatasetDict({
    'train': train_test_split['train'],
    'valid': train_test_split['test']
})

In [14]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 144000
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 16000
    })
})

In [15]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="result/bleu",                         
    evaluation_strategy="epoch",                        
    learning_rate=2e-5,                               
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,                  
    weight_decay=0.01,                                
    save_total_limit=3,                               
    num_train_epochs=5,                                 
    predict_with_generate=True,                        
)

trainer = Seq2SeqTrainer(
    model=model,                        
    args=training_args,                 
    train_dataset=dataset_dict['train'],         
    eval_dataset=dataset_dict['valid'],
    compute_metrics=compute_metrics_bleu
)

trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(
Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.8826, 'grad_norm': 0.5195364952087402, 'learning_rate': 1.9111111111111113e-05, 'epoch': 0.22}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.1574, 'grad_norm': 0.4221462607383728, 'learning_rate': 1.8222222222222224e-05, 'epoch': 0.44}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.1324, 'grad_norm': 0.39549702405929565, 'learning_rate': 1.7333333333333336e-05, 'epoch': 0.67}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.1218, 'grad_norm': 0.3865397572517395, 'learning_rate': 1.6444444444444444e-05, 'epoch': 0.89}


                                                        
 20%|██        | 2250/11250 [1:39:12<6:23:31,  2.56s/it]

{'eval_loss': 0.10656744986772537, 'eval_bleu': 29.00716205040281, 'eval_runtime': 155.6405, 'eval_samples_per_second': 102.801, 'eval_steps_per_second': 1.606, 'epoch': 1.0}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.1144, 'grad_norm': 0.26906004548072815, 'learning_rate': 1.555555555555556e-05, 'epoch': 1.11}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.1047, 'grad_norm': 0.2947693467140198, 'learning_rate': 1.4666666666666666e-05, 'epoch': 1.33}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.1006, 'grad_norm': 0.4075740873813629, 'learning_rate': 1.377777777777778e-05, 'epoch': 1.56}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.0982, 'grad_norm': 0.30008772015571594, 'learning_rate': 1.288888888888889e-05, 'epoch': 1.78}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.0947, 'grad_norm': 0.2866904139518738, 'learning_rate': 1.2e-05, 'epoch': 2.0}


                                                        
 40%|████      | 4500/11250 [3:01:29<3:56:12,  2.10s/it]

{'eval_loss': 0.0912688598036766, 'eval_bleu': 30.41034487320904, 'eval_runtime': 136.9546, 'eval_samples_per_second': 116.827, 'eval_steps_per_second': 1.825, 'epoch': 2.0}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.0906, 'grad_norm': 0.3407308757305145, 'learning_rate': 1.1111111111111113e-05, 'epoch': 2.22}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.0876, 'grad_norm': 0.2861734628677368, 'learning_rate': 1.0222222222222223e-05, 'epoch': 2.44}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.0852, 'grad_norm': 0.31948086619377136, 'learning_rate': 9.333333333333334e-06, 'epoch': 2.67}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.0846, 'grad_norm': 0.26888760924339294, 'learning_rate': 8.444444444444446e-06, 'epoch': 2.89}


                                                        
 60%|██████    | 6750/11250 [4:22:45<2:37:54,  2.11s/it]

{'eval_loss': 0.0851917564868927, 'eval_bleu': 31.10401823955993, 'eval_runtime': 136.9792, 'eval_samples_per_second': 116.806, 'eval_steps_per_second': 1.825, 'epoch': 3.0}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.0833, 'grad_norm': 0.30734190344810486, 'learning_rate': 7.555555555555556e-06, 'epoch': 3.11}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.0785, 'grad_norm': 0.27820688486099243, 'learning_rate': 6.666666666666667e-06, 'epoch': 3.33}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.0805, 'grad_norm': 0.3068881332874298, 'learning_rate': 5.777777777777778e-06, 'epoch': 3.56}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.0784, 'grad_norm': 0.26097571849823, 'learning_rate': 4.888888888888889e-06, 'epoch': 3.78}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.0795, 'grad_norm': 0.2198985517024994, 'learning_rate': 4.000000000000001e-06, 'epoch': 4.0}


                                                        
 80%|████████  | 9000/11250 [5:44:18<1:18:46,  2.10s/it]

{'eval_loss': 0.08268947899341583, 'eval_bleu': 31.446335732938252, 'eval_runtime': 137.1012, 'eval_samples_per_second': 116.702, 'eval_steps_per_second': 1.823, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.076, 'grad_norm': 0.28390538692474365, 'learning_rate': 3.1111111111111116e-06, 'epoch': 4.22}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.0755, 'grad_norm': 0.30689266324043274, 'learning_rate': 2.222222222222222e-06, 'epoch': 4.44}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.0757, 'grad_norm': 0.354523241519928, 'learning_rate': 1.3333333333333334e-06, 'epoch': 4.67}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.0753, 'grad_norm': 0.3156682252883911, 'learning_rate': 4.444444444444445e-07, 'epoch': 4.89}


                                                       
100%|██████████| 11250/11250 [7:05:50<00:00,  2.27s/it]

{'eval_loss': 0.0818123146891594, 'eval_bleu': 31.530046175977446, 'eval_runtime': 136.7061, 'eval_samples_per_second': 117.039, 'eval_steps_per_second': 1.829, 'epoch': 5.0}
{'train_runtime': 25550.1475, 'train_samples_per_second': 28.18, 'train_steps_per_second': 0.44, 'train_loss': 0.12863388264973957, 'epoch': 5.0}





TrainOutput(global_step=11250, training_loss=0.12863388264973957, metrics={'train_runtime': 25550.1475, 'train_samples_per_second': 28.18, 'train_steps_per_second': 0.44, 'total_flos': 5.48762812416e+16, 'train_loss': 0.12863388264973957, 'epoch': 5.0})

In [30]:
trainer.evaluate()


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100

{'eval_loss': 0.0818123146891594,
 'eval_bleu': 31.530046175977446,
 'eval_runtime': 198.6908,
 'eval_samples_per_second': 80.527,
 'eval_steps_per_second': 1.258,
 'epoch': 5.0}

In [32]:
trainer.save_pretrained("./result/bleu")

AttributeError: 'Seq2SeqTrainer' object has no attribute 'save_pretrained'

In [33]:
# 모델 저장
model.save_pretrained('./bart_bleu')
tokenizer.save_pretrained('./bart_bleu')


AttributeError: 'RegressionMetric' object has no attribute 'save_pretrained'

Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='www.comet.com', port=443): Read timed out. (read timeout=10)")': /clientlib/status-report/update
Retrying (Retry(total=1, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001ECB95933A0>: Failed to resolve 'www.comet.com' ([Errno 11001] getaddrinfo failed)")': /clientlib/status-report/update
