In [1]:
#!pip install tensorflow-gpu
import pandas as pd
from transformers import GPT2TokenizerFast, GPT2Config, GPT2ForSequenceClassification, Trainer, TrainingArguments
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader
from datasets import load_metric
import wandb
from tqdm import tqdm
import os

In [2]:
gpt_name = 'skt/ko-gpt-trinity-1.2B-v0.5'

tokenizer = GPT2TokenizerFast.from_pretrained(gpt_name)

config = GPT2Config.from_pretrained(gpt_name)
config.num_labels = 2


In [3]:
def accuracy(output, target):
    with torch.no_grad():
        pred = torch.argmax(output, dim=1)
        assert pred.shape[0] == len(target)
        correct = 0
        correct += torch.sum(pred == target).item()
    return correct / len(target)

In [4]:
def read_boolq(data_path):
    data = pd.read_csv(data_path, delimiter='\t')
    label_col = 'Answer(FALSE = 0, TRUE = 1)'
    texts = []
    labels = []
    sos = '<s>'
    eos = '<\s>'
    t = '<unused0>'
    q = '<unused1>'

    for i in range(len(data)):
        text = t + sos + data['Text'][i] + eos + q + sos + data['Question'][i] + eos
        texts.append(text)
        labels.append(data[label_col][i])
    return texts, labels

In [5]:
TRAIN_PATH = '/opt/ml/corpus_korean/data/BoolQ/SKT_BoolQ_Train.tsv'
VALID_PATH = '/opt/ml/corpus_korean/data/BoolQ/SKT_BoolQ_Dev.tsv'

In [6]:
train_texts, train_labels = read_boolq(TRAIN_PATH)
valid_texts, valid_labels = read_boolq(VALID_PATH)

In [7]:
class BooqDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
  
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [8]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
valid_encodings = tokenizer(valid_texts, truncation=True, padding=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [9]:
train_dataset = BooqDataset(train_encodings, train_labels)
valid_dataset = BooqDataset(valid_encodings, valid_labels)

In [13]:
'''
text token + sos + 문장 + eos + question token 
<t>
'''

'\ntext token + sos + 문장 + eos + question token \n<t>\n'

In [10]:
model = GPT2ForSequenceClassification(config).from_pretrained(gpt_name)

Some weights of the model checkpoint at skt/ko-gpt-trinity-1.2B-v0.5 were not used when initializing GPT2ForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at skt/ko-gpt-trinity-1.2B-v0.5 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
for param in model.parameters():
    param.requires_grad = False
    
for param in model.score.parameters():
    param.requires_grad = True

In [12]:
config = {'epochs' : 20 , 'learning_rate' : 5e-6, 'batch_size' : 32, 'weight_decay' : 0}
wandb.init(project='nlp_test',config=config)

[34m[1mwandb[0m: Currently logged in as: [33mddobokki[0m (use `wandb login --relogin` to force relogin)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: wandb version 0.12.2 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [13]:
cfg = wandb.config

In [14]:
train_loader = DataLoader(train_dataset,cfg.batch_size, shuffle = True)
valid_loader = DataLoader(valid_dataset,cfg.batch_size, shuffle = True)

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
LEARNING_RATE = cfg.learning_rate
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE,amsgrad=True,weight_decay=cfg.weight_decay)
#scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)
output_dir='./results'

In [16]:
model.to(device)
model.train()


CondaEnvException: Unable to determine environment

Please re-run this command with one of the following options:

* Provide an environment name via --name or -n
* Re-run this command inside an activated conda environment.



GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 1920)
    (wpe): Embedding(1024, 1920)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((1920,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1920,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((1920,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplac

In [17]:
grad_num = 2
layer_idx = [[j - i for i in range(grad_num)] for j in range(len(model.transformer.h)-1, -1, -1 * grad_num)]
vacc_li = [0]


for e in range(cfg.epochs):
    epoch_loss = 0
    epoch_acc = 0
    for batch in tqdm(train_loader,'train: '):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = criterion(outputs.logits, labels)
        acc = accuracy(outputs.logits, labels)
        wandb.log({'loss': loss.item(), 'accuracy': acc})
        
        loss.backward()
        optimizer.step()
    # valid
        del input_ids
        del attention_mask
        del labels
        del loss
        del outputs
        torch.cuda.empty_cache()
    
    
    
    model.eval()
    valid_acc = 0
    valid_loss = 0
    for batch in tqdm(valid_loader, 'valid: '):
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = criterion(outputs.logits, labels)
            
            acc = accuracy(outputs.logits, labels)
            valid_acc += acc
            
            valid_loss += loss.item()
            
            del input_ids
            del attention_mask
            del labels
            del loss
            del outputs
            torch.cuda.empty_cache()
    
    vacc = valid_acc/len(valid_loader)
    vacc_f = f'{vacc:.3f}'
    wandb.log({'valid_loss': valid_loss/len(valid_loader)})
    wandb.log({'val_accuracy' :vacc })
    model.train()
    
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    if vacc_li[-1] < vacc:
        torch.save(model.state_dict(), os.path.join(output_dir, f'boolq_{e+0:03}_{vacc_f}.pt'))
        vacc_li.append(vacc)
        
    for param in model.parameters():
        param.requires_grad = False

    for param in model.score.parameters():
        param.requires_grad = True

    for idx in layer_idx[e % len(layer_idx)]:
        for param in model.transformer.h[idx].parameters():
            param.requires_grad = True

wandb.finish()

train:   0%|          | 0/115 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


train: 100%|██████████| 115/115 [02:49<00:00,  1.28s/it]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


train: 100%|██████████| 115/115 [02:49<00:00,  1.47s/it]
valid:   0%|          | 0/22 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


valid: 100%|██████████| 22/22 [00:33<00:00,  1.54s/it]
train:   0%|          | 0/115 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


train: 100%|██████████| 115/115 [03:32<00:00,  1.85s/it]
valid:   0%|          | 0/22 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


valid: 100%|██████████| 22/22 [00:33<00:00,  1.54s/it]
train:   0%|          | 0/115 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


train: 100%|██████████| 115/115 [03:58<00:00,  2.07s/it]
valid:   0%|          | 0/22 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


valid: 100%|██████████| 22/22 [00:34<00:00,  1.55s/it]
train:   0%|          | 0/115 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)





RuntimeError: CUDA out of memory. Tried to allocate 214.00 MiB (GPU 0; 31.75 GiB total capacity; 29.07 GiB already allocated; 75.50 MiB free; 30.46 GiB reserved in total by PyTorch)

In [None]:
1, 2, 3, 4, 5 - output

In [14]:
grad_num = 2
layer_idx = [[j - i for i in range(grad_num)] for j in range(24-1, -1, -1 * grad_num)]

In [15]:
layer_idx

[[23, 22],
 [21, 20],
 [19, 18],
 [17, 16],
 [15, 14],
 [13, 12],
 [11, 10],
 [9, 8],
 [7, 6],
 [5, 4],
 [3, 2],
 [1, 0]]

In [22]:
for param in model.transformer.parameters():
    print(param.data.size())

torch.Size([51200, 1920])
torch.Size([1024, 1920])
torch.Size([1920])
torch.Size([1920])
torch.Size([1920, 5760])
torch.Size([5760])
torch.Size([1920, 1920])
torch.Size([1920])
torch.Size([1920])
torch.Size([1920])
torch.Size([1920, 7680])
torch.Size([7680])
torch.Size([7680, 1920])
torch.Size([1920])
torch.Size([1920])
torch.Size([1920])
torch.Size([1920, 5760])
torch.Size([5760])
torch.Size([1920, 1920])
torch.Size([1920])
torch.Size([1920])
torch.Size([1920])
torch.Size([1920, 7680])
torch.Size([7680])
torch.Size([7680, 1920])
torch.Size([1920])
torch.Size([1920])
torch.Size([1920])
torch.Size([1920, 5760])
torch.Size([5760])
torch.Size([1920, 1920])
torch.Size([1920])
torch.Size([1920])
torch.Size([1920])
torch.Size([1920, 7680])
torch.Size([7680])
torch.Size([7680, 1920])
torch.Size([1920])
torch.Size([1920])
torch.Size([1920])
torch.Size([1920, 5760])
torch.Size([5760])
torch.Size([1920, 1920])
torch.Size([1920])
torch.Size([1920])
torch.Size([1920])
torch.Size([1920, 7680])
torc

In [18]:
input_ids

tensor([[    9,     0, 30396,  ...,     3,     3,     3],
        [    9,     0, 30252,  ...,     3,     3,     3],
        [    9,     0, 30355,  ...,     3,     3,     3],
        ...,
        [    9,     0, 29999,  ...,     3,     3,     3],
        [    9,     0, 31793,  ...,     3,     3,     3],
        [    9,     0, 45504,  ...,     3,     3,     3]], device='cuda:0')

In [19]:
attention_mask

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')

In [20]:
labels

tensor([1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
        1, 0, 1, 1, 0, 1, 1, 0], device='cuda:0')

In [30]:
model.transformer.h[0].ln_1.weight

Parameter containing:
tensor([0.3735, 0.3872, 0.4238,  ..., 0.4060, 0.4456, 0.4629], device='cuda:0')

In [29]:
model.transformer.h[23]

Parameter containing:
tensor([1.0253, 0.9990, 1.1074,  ..., 1.0078, 1.0869, 1.0312], device='cuda:0',
       requires_grad=True)

In [18]:
torch.cuda.empty_cache()

In [22]:
grad_num = 8
layer_idx = [[j - i for i in range(grad_num)] for j in range(len(model.transformer.h)-1, -1, -1 * grad_num)]
vacc_li = [0]

for param in model.parameters():
    param.requires_grad = False

for param in model.score.parameters():
    param.requires_grad = True

for idx in layer_idx[0 % len(layer_idx)]:
    for param in model.transformer.h[idx].parameters():
        param.requires_grad = True

In [20]:
import gc
gc.collect()

91

In [24]:
print(layer_idx[0 % len(layer_idx)])

[23, 22, 21, 20, 19, 18, 17, 16]


In [10]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    learning_rate= 5e-5,
    evaluation_strategy = "epoch",
    weight_decay= 0.1,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    #logging_steps=100,
    report_to="wandb",  # enable logging to W&B
    run_name="nlp_test"
)
trainer = Trainer(model = model, args = training_args, train_dataset=train_dataset,eval_dataset=valid_dataset)
trainer.train()

In [12]:
# default_config = {'epochs' : 10 , 'learning rate' : 5e-5, 'batch_size' : 32, 'weight_decay' : 0.1}
# wandb.init(project='nlp_test',config=default_config)

In [141]:
def collate_fn(batched_samples):
    PAD = tokenizer.vocab['<pad>']
    batch_size = len(batched_samples)
    #print(batched_samples)

    ### 아래에 코드 빈칸을 완성해주세요
    batched_samples = sorted(batched_samples, key=lambda x:x[0], reverse=True) # 0번째 요소의 길이를 기준으로 내림차순 정렬
    
    src_sentences = []
    msk_sentences = []
    tgt_sentences = []
    for src_sentence, msk_sentence ,tgt_sentence in batched_samples:
        src_sentences.append(torch.tensor(src_sentence))
        msk_sentences.append(torch.tensor(msk_sentence))
        tgt_sentences.append(tgt_sentence)

    print(tgt_sentences)
    src_sentences = torch.nn.utils.rnn.pad_sequence(src_sentences, batch_first=True, padding_value=PAD) # batch x longest seuqence 순으로 정렬 (링크 참고)
    msk_sentences = torch.nn.utils.rnn.pad_sequence(msk_sentences, batch_first=True, padding_value=1)
    tgt_sentences = torch.tensor(tgt_sentences) # batch x longest seuqence 순으로 정렬 (링크 참고)
    # 링크: https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pad_sequence.html

    return src_sentences,msk_sentences ,tgt_sentences

In [142]:
train_data = BooqDataset(TRAIN_PATH)

In [143]:
#train_data[0][0]

In [144]:
train_loader = DataLoader(dataset=train_data,batch_size=3,collate_fn=collate_fn)

In [145]:
next(iter(train_loader))

[1, 0, 1]


(tensor([[    9,     0, 32990, 35300, 36434, 25404, 32521, 30784, 31947, 34537,
          34457, 37947, 30970, 25768, 30653, 34537, 30184, 31533, 30029, 30227,
          43628, 40292, 30090, 38762, 37378, 30485, 29979, 32361, 32990, 30004,
          32679, 36946, 30539, 30063, 36707, 25820, 32357, 30862, 30081, 35493,
          32134, 30001, 32034, 30180, 30196, 32136, 24644, 20700, 28704,   384,
          33322,   443, 31915, 31561, 30404, 42995, 30270, 47714, 36434, 25404,
          32508, 31425, 40408, 30302, 37976, 31233, 41081, 32513, 30100, 38135,
          41362, 46661, 32204, 25512, 43693, 43350, 38946, 47827, 36233, 40868,
          30001, 46768, 31013,   404,   436,   459,   406,    10,     0, 36434,
          25404, 32508, 32204, 25512, 43693, 43350, 38946, 47827, 36233, 40868,
          30001, 30136, 31992, 47727,   404,   436,   459,   406],
         [    9,     0, 32079, 25516, 30153, 31048, 42734, 30396, 24309, 34747,
            384, 35899, 37006, 29993, 32419, 29994, 3

In [16]:
#train_encodings = tokenizer(train_data['Text'][0], truncation=True, padding=True)
#train_encodings

In [45]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,              # total number of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    learning_rate= 5e-5,
    #evaluation_strategy = 'steps',
    weight_decay= 0.1,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
