In [1]:
from datasets import load_from_disk

# Load the DatasetDict previously saved under ./data/paperswithcode/.
tag_dataset = load_from_disk("./data/paperswithcode/")
# Bare expression so the notebook displays the available splits and their features.
tag_dataset

DatasetDict({
    train: Dataset({
        features: ['abstract', 'area', 'arxiv_id', 'task_id', 'title'],
        num_rows: 47274
    })
    dev: Dataset({
        features: ['abstract', 'area', 'arxiv_id', 'task_id', 'title'],
        num_rows: 2626
    })
    test: Dataset({
        features: ['abstract', 'arxiv_id', 'title'],
        num_rows: 2627
    })
})

In [2]:
# Auto classes used to load the config, tokenizer and seq2seq model.
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)

In [3]:
model_name = "facebook/bart-base"
# BART is a seq2seq (encoder-decoder) model, so it is loaded through AutoModelForSeq2SeqLM.
config = AutoConfig.from_pretrained(
    model_name,
    cache_dir=None,
)
# use_fast=True requests the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=None,
    use_fast=True,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    config=config,
    cache_dir=None,
)

# Move the model to the GPU.
# NOTE(review): this raises on a machine without CUDA — confirm a GPU is always available here.
model = model.cuda()

Downloading: 100%|██████████| 1.69k/1.69k [00:00<00:00, 1.72MB/s]
Downloading: 100%|██████████| 899k/899k [00:01<00:00, 877kB/s]
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 468kB/s]
Downloading: 100%|██████████| 1.36M/1.36M [00:01<00:00, 741kB/s]
Downloading: 100%|██████████| 558M/558M [00:47<00:00, 11.7MB/s]


In [57]:
# Configs #TODO move to `arguments.py`
# These module-level values are read by `preprocess` and by the training arguments.
max_source_length = 512  # max_length passed to the tokenizer for the (title, abstract) pair
# NOTE(review): with padding=True the inputs are padded during preprocessing; since a
# DataCollatorForSeq2Seq is also used, padding may be better left to the collator — confirm.
padding = True
truncation = True
batch_size = 16  # used for both the per-device train and eval batch sizes
num_train_epochs = 2

In [66]:
def preprocess(examples):
    """Tokenize a batch of examples into model inputs plus target label ids.

    Args:
        examples: batch mapping with the columns 'title', 'abstract', 'area'
            and 'task_id', each holding one entry per example.

    Returns:
        The tokenizer output for the (title, abstract) pairs, with an extra
        "labels" entry holding the token ids of the target strings.
    """
    # Target text is "<area>, <task id with hyphens replaced by spaces>".
    target_texts = []
    for area, task_id in zip(examples['area'], examples['task_id']):
        task_words = task_id.replace('-', ' ')
        target_texts.append(f"{area}, {task_words}")

    # Title and abstract are encoded together as a text pair; length handling
    # comes from the module-level config values.
    encoded = tokenizer(
        list(examples['title']),
        list(examples['abstract']),
        max_length=max_source_length,
        padding=padding,
        truncation=truncation,
    )

    # Targets are tokenized with the tokenizer's default settings.
    encoded["labels"] = tokenizer(target_texts)['input_ids']
    return encoded

In [67]:
# Tokenize the training split in batches, dropping all of its original columns
# so only the outputs of `preprocess` remain.
remove_columns = tag_dataset['train'].column_names
train_dataset = tag_dataset['train'].map(
    preprocess,
    batched=True,
    remove_columns=remove_columns,
    desc="Preprocess Training Dataset",
)

Preprocess Training Dataset: 100%|██████████| 48/48 [00:11<00:00,  4.04ba/s]


In [68]:
# Tokenize the dev split in batches, dropping all of its original columns
# so only the outputs of `preprocess` remain.
remove_columns = tag_dataset['dev'].column_names
eval_dataset = tag_dataset['dev'].map(
    preprocess,
    batched=True,
    remove_columns=remove_columns,
    desc="Preprocess Evaluation(dev) Dataset",
)

Preprocess Evaluation(dev) Dataset: 100%|██████████| 3/3 [00:00<00:00,  3.71ba/s]


In [46]:
import torch

# Build a single-example batch from the first preprocessed training row.
# Index the row first (train_dataset[0]) rather than the column
# (train_dataset['input_ids']), so only one example is read instead of the
# whole column being materialised just to take its first element.
sample_input_ids = torch.tensor(train_dataset[0]['input_ids']).unsqueeze(0).cuda()
sample_attn_mask = torch.tensor(train_dataset[0]['attention_mask'], dtype=torch.long).unsqueeze(0).cuda()

In [48]:
# Smoke-test forward pass on the single sample. Nothing is back-propagated
# here, so gradient tracking is disabled to avoid keeping the autograd graph
# in GPU memory before training starts.
with torch.no_grad():
    sample_output = model(
        input_ids=sample_input_ids,
        attention_mask=sample_attn_mask,
    )

In [52]:
# Display the logits shape of the sample forward pass
# (presumably batch x sequence length x vocabulary size — confirm).
sample_output.logits.shape

torch.Size([1, 506, 50265])

In [69]:
from transformers import (
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)

# Collator handed to the trainer to assemble batches from the tokenized examples.
# The model is passed in as well; pad_to_multiple_of=None leaves that option unset.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    pad_to_multiple_of=None,
)

In [70]:
# Training configuration for the Seq2SeqTrainer.
# NOTE(review): no evaluation strategy is set here — confirm evaluation is
# triggered elsewhere (e.g. an explicit trainer.evaluate() call).
args = Seq2SeqTrainingArguments(
    output_dir='outputs',
    do_train=True,
    do_eval=True,
    predict_with_generate=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    save_strategy='epoch',
    save_total_limit=2,  # maximum number of model checkpoints to keep
    report_to="wandb",
)

In [71]:
from datasets import load_metric

# BLEU metric object; used by `compute_metrics`.
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed version.
metric = load_metric("bleu")

In [72]:
def postprocess_text(preds, labels):
    """Strip leading and trailing whitespace from decoded predictions and labels.

    No sentence splitting (e.g. with nltk) is performed; each string is only
    trimmed.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple of two new lists containing the trimmed
        strings, in their original order.
    """

    def _strip_all(texts):
        # Build a fresh list so the caller's sequence is left untouched.
        trimmed = []
        for text in texts:
            trimmed.append(text.strip())
        return trimmed

    cleaned_preds = _strip_all(preds)
    cleaned_labels = _strip_all(labels)
    return cleaned_preds, cleaned_labels

def compute_metrics(eval_preds):
    """Decode predicted and reference token ids and score them with BLEU.

    Args:
        eval_preds: ``(predictions, label_ids)`` pair. ``predictions`` may
            itself be a tuple, in which case its first element is used.

    Returns:
        The result of ``metric.compute`` for the decoded batch.
    """
    import numpy as np  # local import: numpy is not imported elsewhere in this notebook

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the ignore index used to pad label (and gathered prediction)
    # batches; it is not a vocabulary id, so swap it for the pad token before
    # decoding. skip_special_tokens then removes it from the text.
    pad_token_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_token_id)
    labels = np.where(labels != -100, labels, pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Simple whitespace clean-up of the decoded strings.
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # BLEU is computed over tokens: each prediction is a list of tokens and
    # each example carries a list of references (a single one here).
    predictions = [pred.split() for pred in decoded_preds]
    references = [[label.split()] for label in decoded_labels]

    result = metric.compute(predictions=predictions, references=references)
    return result

In [73]:
# Wire the model, training arguments, preprocessed train/dev splits, tokenizer,
# collator and metric function into a single Seq2SeqTrainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [74]:
# Run training; resume_from_checkpoint=None means no checkpoint path is supplied.
train_result = trainer.train(resume_from_checkpoint=None)

