### Environment

In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset

  from .autonotebook import tqdm as notebook_tqdm


### Load dataset

In [2]:
data_files = {"../dataset_jit/train_jit.csv"}
dataset =  load_dataset("csv", data_files=data_files, delimiter="\t")

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['ko', 'je'],
        num_rows: 160000
    })
})

### Tokenizer

In [4]:
import torch
from tqdm import tqdm
import numpy as np
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration

tokenizer = PreTrainedTokenizerFast.from_pretrained('hyunwoongko/kobart')
model = BartForConditionalGeneration.from_pretrained('hyunwoongko/kobart')

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [5]:
examples= {'je': dataset['train']["je"], 
           'ko': dataset['train']["ko"]}

In [6]:
def preprocess_function(example):
    inputs = example['ko']
    targets = example['je']
    model_inputs = tokenizer(inputs, max_length=128, padding='max_length', truncation=True)

    # KoBART의 입력과 출력을 설정합니다.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, padding='max_length', truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, remove_columns=dataset["train"].column_names,batched=True)


In [7]:
print( '원 데이터    :', dataset['train'][10]['je'] )
print( '처리 후 데이터:',  tokenizer.convert_ids_to_tokens(tokenized_datasets['train'][10]['input_ids']))
print( '토큰화       :', tokenized_datasets['train'][10]['input_ids'] )

print('\n')
print( '원 데이터    :', dataset['train'][10]['ko'] )
print( '처리 후 데이터:', tokenizer.convert_ids_to_tokens(tokenized_datasets['train'][10]['input_ids']) )
print( '토큰화       :', tokenized_datasets['train'][10]['labels'] )

원 데이터    : 예 .
처리 후 데이터: ['<s>', '▁예', '▁.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<

In [8]:
from comet import download_model, load_from_checkpoint
from comet_ml import Experiment
from comet_ml.integration.pytorch import log_model

model_path = download_model("Unbabel/wmt20-comet-da")
comet_model = load_from_checkpoint(model_path)

experiment = Experiment(
  api_key="ekgBymOFSthlTKqzOhUQxubBQ",
  project_name="general",
  workspace="baechaemuk"
)

Fetching 5 files: 100%|██████████| 5/5 [00:00<?, ?it/s]
Lightning automatically upgraded your loaded checkpoint from v1.3.5 to v2.3.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\baech_6wlvk\.cache\huggingface\hub\models--Unbabel--wmt20-comet-da\snapshots\4c372befe4d603e6d0363f434248ecad66945607\checkpoints\model.ckpt`
Encoder model frozen.
c:\Users\baech_6wlvk\anaconda3\envs\torch23\lib\site-packages\pytorch_lightning\core\saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in 'c:\\Users\\baech_6wlvk\\Chaemuk\\NaturalLanguageProcess\\fin_project\\kobart' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/baechaemuk/general/e2de47e78fe344999e2435b17f3bf575



In [9]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 160000
    })
})

In [10]:
formatted_train_df = pd.DataFrame([{
    "input_ids": np.array(fd["input_ids"], dtype=np.uint16),
    "attention_mask": np.array(fd["attention_mask"], dtype=np.uint8),
    "labels": np.array(fd["labels"], dtype=np.uint16)
} for fd in tokenized_datasets["train"]])

#formatted_valid_df = pd.DataFrame([{
#    "input_ids": np.array(fd["input_ids"], dtype=np.uint16),
#    "attention_mask": np.array(fd["attention_mask"], dtype=np.uint8),
#    "labels": np.array(fd["labels"], dtype=np.uint16)
#} for fd in tokenized_datasets["valid"]])

In [11]:
train_dataset = Dataset.from_pandas(formatted_train_df)
#valid_dataset = Dataset.from_pandas(formatted_valid_df)

In [12]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 160000
})

In [13]:
train_test_split = train_dataset.train_test_split(test_size=0.1)
dataset_dict = DatasetDict({
    'train': train_test_split['train'],
    'valid': train_test_split['test']
})
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 144000
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 16000
    })
})

In [14]:
train_dataset = dataset_dict['train'].select(range(72000))
valid_dataset = dataset_dict['valid'].select(range(8000))

In [15]:
def compute_metrics_comet(eval_pred):
    predictions, labels = eval_pred
    predictions = torch.tensor(predictions)
    predictions = predictions.argmax(dim=-1)
    
    pred_str = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels[labels == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # 데이터셋에서 소스 텍스트와 참조 텍스트를 가져옴
    source_texts = tokenizer.batch_decode(valid_dataset["input_ids"], skip_special_tokens=True)

    # COMET 평가
    comet_data = [
        {"src": src, "mt": hyp, "ref": ref}
        for src, hyp, ref in zip(source_texts, pred_str, label_str)
    ]
    comet_output = comet_model.predict(comet_data, batch_size=8, gpus=1)
    comet_score = comet_output["scores"]

    experiment.log_metric("comet_score", comet_score)
    return {"comet_score": sum(comet_score) / len(comet_score)}

In [16]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
training_args = Seq2SeqTrainingArguments(
    output_dir="result/comet",                         
    evaluation_strategy="epoch",                        
    learning_rate=2e-5,                               
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64
    ,                  
    weight_decay=0.01,                                
    save_total_limit=3,                               
    num_train_epochs=3,                                 
    predict_with_generate=True,
    report_to="none"                    
)

trainerComet = Seq2SeqTrainer(
    model=model,                        
    args=training_args,                 
    train_dataset=train_dataset,         
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics_comet
)

trainerComet.train()

comet_ml is installed but `COMET_API_KEY` is not set.
  attn_output = torch.nn.functional.scaled_dot_product_attention(
Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.8585, 'grad_norm': 0.3912442922592163, 'learning_rate': 1.7037037037037038e-05, 'epoch': 0.44}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.156, 'grad_norm': 0.3392932415008545, 'learning_rate': 1.4074074074074075e-05, 'epoch': 0.89}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 1000/1000 [01:09<00:00, 14.31it/s]
                                                       
 33%|███▎      | 1125/3375 [1:53:57<3:41:31,  5.91s/it]

{'eval_loss': 0.12795157730579376, 'eval_comet_score': -1.0423607417524328, 'eval_runtime': 182.6082, 'eval_samples_per_second': 43.81, 'eval_steps_per_second': 0.685, 'epoch': 1.0}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.1305, 'grad_norm': 0.3784959316253662, 'learning_rate': 1.1111111111111113e-05, 'epoch': 1.33}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.1208, 'grad_norm': 0.4480949640274048, 'learning_rate': 8.148148148148148e-06, 'epoch': 1.78}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 1000/1000 [01:09<00:00, 14.30it/s]
                                                       
 67%|██████▋   | 2250/3375 [3:47:27<1:49:28,  5.84s/it]

{'eval_loss': 0.11049967259168625, 'eval_comet_score': -1.0423607417524328, 'eval_runtime': 182.8774, 'eval_samples_per_second': 43.745, 'eval_steps_per_second': 0.684, 'epoch': 2.0}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.1134, 'grad_norm': 0.2988501489162445, 'learning_rate': 5.185185185185185e-06, 'epoch': 2.22}


Non-default generation parameters: {'forced_eos_token_id': 1}


{'loss': 0.1086, 'grad_norm': 0.37419670820236206, 'learning_rate': 2.222222222222222e-06, 'epoch': 2.67}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 1000/1000 [01:10<00:00, 14.26it/s]
                                                     
100%|██████████| 3375/3375 [5:39:10<00:00,  6.03s/it]

{'eval_loss': 0.10700460523366928, 'eval_comet_score': -1.0423607417524328, 'eval_runtime': 183.3687, 'eval_samples_per_second': 43.628, 'eval_steps_per_second': 0.682, 'epoch': 3.0}
{'train_runtime': 20350.8149, 'train_samples_per_second': 10.614, 'train_steps_per_second': 0.166, 'train_loss': 0.2324130972403067, 'epoch': 3.0}





TrainOutput(global_step=3375, training_loss=0.2324130972403067, metrics={'train_runtime': 20350.8149, 'train_samples_per_second': 10.614, 'train_steps_per_second': 0.166, 'total_flos': 1.646288437248e+16, 'train_loss': 0.2324130972403067, 'epoch': 3.0})

In [17]:
trainerComet.evaluate()

100%|██████████| 125/125 [00:49<00:00,  2.48it/s]GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 1000/1000 [01:12<00:00, 13.86it/s]
100%|██████████| 125/125 [02:10<00:00,  1.04s/it]


{'eval_loss': 0.10700460523366928,
 'eval_comet_score': -1.0423607417524328,
 'eval_runtime': 130.6473,
 'eval_samples_per_second': 61.234,
 'eval_steps_per_second': 0.957,
 'epoch': 3.0}

In [None]:
trainerComet.eval()

AttributeError: 'Seq2SeqTrainer' object has no attribute 'eval'

Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='www.comet.com', port=443): Read timed out. (read timeout=10)")': /clientlib/batch/logger/experiment/metric
Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='www.comet.com', port=443): Read timed out. (read timeout=10)")': /clientlib/batch/logger/experiment/metric
Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='www.comet.com', port=443): Read timed out. (read timeout=10)")': /clientlib/status-report/update
Retrying (Retry(total=1, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000002331D1095A0>: Failed to resolve 'www.comet.com' ([Errno