In [1]:
import pandas as pd
from transformers import GPT2TokenizerFast, GPT2Config, GPT2ForSequenceClassification, Trainer, TrainingArguments
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader
from datasets import load_metric
import wandb
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import EarlyStoppingCallback
import numpy as np

In [2]:
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    #recall = recall_score(y_true=labels, y_pred=pred)
    #precision = precision_score(y_true=labels, y_pred=pred)
    #f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy} #, "precision": precision, "recall": recall, "f1": f1}


In [3]:
# 토크나이저 

gpt_name = 'skt/ko-gpt-trinity-1.2B-v0.5'
tokenizer = GPT2TokenizerFast.from_pretrained(gpt_name)

In [4]:
def read_boolq(data_path):
    data = pd.read_csv(data_path, delimiter='\t')
    label_col = 'Answer(FALSE = 0, TRUE = 1)'
    texts = []
    labels = []
    sos = '<s>'
    eos = '<\s>'
    t = '<unused0>'
    q = '<unused1>'

    for i in range(len(data)):
        text = t + sos + data['Text'][i] + eos + q + sos + data['Question'][i] + eos
        texts.append(text)
        labels.append(data[label_col][i])
    return texts, labels

In [5]:
class BooqDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
  
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [6]:
TRAIN_PATH = '/opt/ml/corpus_korean/data/BoolQ/SKT_BoolQ_Train.tsv'
VALID_PATH = '/opt/ml/corpus_korean/data/BoolQ/SKT_BoolQ_Dev.tsv'

train_texts, train_labels = read_boolq(TRAIN_PATH)
valid_texts, valid_labels = read_boolq(VALID_PATH)

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
valid_encodings = tokenizer(valid_texts, truncation=True, padding=True)

train_dataset = BooqDataset(train_encodings, train_labels)
valid_dataset = BooqDataset(valid_encodings, valid_labels)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
config = GPT2Config.from_pretrained(gpt_name)
config.num_labels = 2
model = GPT2ForSequenceClassification(config).from_pretrained(gpt_name)

In [None]:
for param in model.parameters():
    param.requires_grad = False

for param in model.score.parameters():
    param.requires_grad = True

In [9]:
gpt_last_idx = len(model.transformer.h) - 1
not_freeze_num = 7

for idx in range(gpt_last_idx,  gpt_last_idx - not_freeze_num -1, -1):
    for param in model.transformer.h[idx].parameters():
        param.requires_grad = True

In [10]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=100,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    learning_rate= 5e-4,
    evaluation_strategy = "steps",
    eval_steps = 100, # Evaluation and Save happens every 10 steps
    weight_decay= 0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,
    save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted.
    report_to="wandb",  # enable logging to W&B
    run_name="gpt2 block 23 - 16 finetune",
    seed = 0,
    load_best_model_at_end=True,
)

In [None]:
trainer = Trainer(model = model, 
                  args = training_args, 
                  train_dataset=train_dataset,
                  eval_dataset=valid_dataset,
                  compute_metrics=compute_metrics,
                  callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
                 )
trainer.train()
wandb.finish()

***** Running training *****
  Num examples = 3665
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1150
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mddobokki[0m (use `wandb login --relogin` to force relogin)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: wandb version 0.12.2 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
10,No log,2.346931,0.534286,0.0,0.0,0.0


***** Running Evaluation *****
  Num examples = 700
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/checkpoint-10
Configuration saved in ./results/checkpoint-10/config.json
Model weights saved in ./results/checkpoint-10/pytorch_model.bin
