In [10]:
import time
import os
import copy
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import GPT2TokenizerFast, GPT2Config, GPT2ForSequenceClassification
from transformers import AutoTokenizer, AdamW
from transformers import Trainer, TrainingArguments

import matplotlib.pyplot as plt
%matplotlib inline

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [11]:
# del model
# del pytorch_model
# del trainer
torch.cuda.empty_cache()

In [12]:
causal_relationship_path = 'data/causal_relationship_inference'
homograph_path = 'data/homograph'
interrogation_globale_path = 'data/interrogation_globale'
sentence_grammaticality_judgment_path = 'data/sentence_grammaticality_judgment'

In [13]:
import os
import pandas as pd

origin_copa_train_csv = pd.read_csv(os.path.join(causal_relationship_path, 'SKT_COPA_Train.tsv'), delimiter='\t')
origin_copa_dev_csv = pd.read_csv(os.path.join(causal_relationship_path, 'SKT_COPA_Dev.tsv'), delimiter='\t')
origin_copa_test_csv = pd.read_csv(os.path.join(causal_relationship_path, 'SKT_COPA_Test.tsv'), delimiter='\t')

# origin_nikl_train_csv = pd.read_csv(os.path.join(homograph_path, 'NIKL_SKT_WiC_Train.txv'), delimiter='\t')
# origin_nikl_dev_csv = pd.read_csv(os.path.join(homograph_path, 'NIKL_SKT_WiC_Dev.txv'), delimiter='\t')
# origin_nikl_test_csv = pd.read_csv(os.path.join(homograph_path, 'NIKL_SKT_WiC_Test.txv'), delimiter='\t')

# origin_boolq_train_csv = pd.read_csv(os.path.join(interrogation_globale_path, 'SKT_BoolQ_Train.tsv'), delimiter='\t')
# origin_boolq_dev_csv = pd.read_csv(os.path.join(interrogation_globale_path, 'SKT_BoolQ_Dev.tsv'), delimiter='\t')
# origin_boolq_test_csv = pd.read_csv(os.path.join(interrogation_globale_path, 'SKT_BoolQ_Test.tsv'), delimiter='\t')

# origin_cola_train_csv = pd.read_csv(os.path.join(interrogation_globale_path, 'NIKL_CoLA_train.tsv'), delimiter='\t')
# origin_cola_dev_csv = pd.read_csv(os.path.join(interrogation_globale_path, 'NIKL_CoLA_dev.tsv'), delimiter='\t')
# origin_cola_test_csv = pd.read_csv(os.path.join(interrogation_globale_path, 'NIKL_CoLA_test.tsv'), delimiter='\t')



In [14]:
from IPython.display import HTML, display

# Custom IPython progress bar for training
class ProgressMonitor(object):
    
    tmpl = """
        <table style="width: 100%;">
            <tbody>
                <tr>
                    <td style="width: 30%;">
                     <b>Epoch: {epoch}/{num_epochs} Loss: {loss:0.4f}</b> &nbsp&nbsp&nbsp {value} / {length}
                    </td>
                    <td style="width: 70%;">
                        <progress value='{value}' max='{length}', style='width: 100%'>{value}</progress>
                    </td>
                </tr>
            </tbody>
        </table>        
        """

    def __init__(self, length):
        self.length = length
        self.count = 0
        self.display = display(self.html(0, 0, 0, 0), display_id=True)
        
    def html(self, count, loss, epoch, num_epochs):
        return HTML(self.tmpl.format(length=self.length, value=count, loss=loss, epoch=epoch, num_epochs=num_epochs))
        
    def update(self, epoch, num_epochs, count, loss):
        self.count += count
        self.display.update(self.html(self.count, loss, epoch, num_epochs))

In [15]:
num_epochs = 3
num_labels = 2
batch_size = 16
learning_rate = 5e-5

In [16]:
causal_relationship_path = 'data/causal_relationship_inference'
origin_copa_train_csv = pd.read_csv(os.path.join(causal_relationship_path, 'SKT_COPA_Train.tsv'), delimiter='\t')
origin_copa_dev_csv = pd.read_csv(os.path.join(causal_relationship_path, 'SKT_COPA_Dev.tsv'), delimiter='\t')
origin_copa_test_csv = pd.read_csv(os.path.join(causal_relationship_path, 'SKT_COPA_Test.tsv'), delimiter='\t')

print(origin_copa_train_csv['question'].unique())
print(origin_copa_train_csv['Answer'].unique())
print(origin_copa_dev_csv['question'].unique())
print(origin_copa_dev_csv['Answer'].unique())
print(origin_copa_test_csv['question'].unique())
print(origin_copa_test_csv['Answer'].unique())

def make_copa_csv(csv, phase='train'):
    ids = csv['ID']
    sentences = csv['sentence']
    questions = csv['question']
    candidate1 = csv['1']
    candidate2 = csv['2']

    rows = []
    
    if phase == 'train' or phase == 'dev':
        answers = csv['Answer']
        columns = ['ID', 'sentence', 'Answer']
        
        for i in range(len(questions)):
            if questions[i].strip() == '결과':
                if answers[i] == 1:
                    rows.append([ids[i], sentences[i] + ' ' + candidate1[i], 1])
                    rows.append([ids[i], sentences[i] + ' ' + candidate2[i], 0])
                elif answers[i] == 2:
                    rows.append([ids[i], sentences[i] + ' ' + candidate1[i], 0])
                    rows.append([ids[i], sentences[i] + ' ' + candidate2[i], 1])
                
            elif questions[i].strip() == '원인':
                if answers[i] == 1:
                    rows.append([ids[i], candidate1[i] + ' ' + sentences[i], 1])
                    rows.append([ids[i], candidate2[i] + ' ' + sentences[i], 0])
                elif answers[i] == 2:
                    rows.append([ids[i], candidate1[i] + ' ' + sentences[i], 0])
                    rows.append([ids[i], candidate2[i] + ' ' + sentences[i], 1])
        
    elif phase == 'test':
        columns = ['ID', 'sentence']
        for i in range(len(questions)):
            if questions[i].strip() == '결과':
                rows.append([ids[i], sentences[i] + ' ' + candidate1[i]])
                rows.append([ids[i], sentences[i] + ' ' + candidate2[i]])
            elif questions[i].strip() == '원인':
                rows.append([ids[i], candidate1[i] + ' ' + sentences[i]])
                rows.append([ids[i], candidate2[i] + ' ' + sentences[i]])

    dataset = pd.DataFrame(rows)
    dataset.columns = columns
    
    return dataset

copa_train_csv = make_copa_csv(origin_copa_train_csv)
copa_dev_csv = make_copa_csv(origin_copa_dev_csv, phase='dev')
copa_test_csv = make_copa_csv(origin_copa_test_csv, phase='test')

['결과' '원인' '원인 ' '결과 ']
[1 2]
['결과' '원인']
[2 1]
['결과' '원인']
[nan]


In [5]:
tokenizer = GPT2TokenizerFast.from_pretrained("skt/ko-gpt-trinity-1.2B-v0.5")

copa_train_encodings = tokenizer(list(copa_train_csv['sentence'].values), truncation=True, padding=True)
copa_dev_encodings = tokenizer(list(copa_dev_csv['sentence'].values), truncation=True, padding=True)
copa_test_encodings = tokenizer(list(copa_test_csv['sentence'].values), truncation=True, padding=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [6]:
class CopaDataset(Dataset):
    def __init__(self, dataset, encodings, phase='train'):
        self.copa_csv = dataset
        self.encodings = encodings
        self.phase = phase

        self.ids = dataset['ID']
        if self.phase == 'train' or self.phase == 'dev':
            self.labels = dataset['Answer']
    
    def __getitem__(self, index):
        if self.phase == 'train' or self.phase == 'dev':
            item = {key: torch.tensor(val[index]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[index])
            return item
        else:
            return (self.ids[index], self.encodings[index])
    
    def __len__(self):
        return len(self.copa_csv['Answer'])

copa_train_dataset = CopaDataset(copa_train_csv, copa_train_encodings)
copa_dev_dataset = CopaDataset(copa_dev_csv, copa_dev_encodings, phase='dev')
copa_test_dataset = CopaDataset(copa_test_csv, copa_test_encodings, phase='test')

train_loader = DataLoader(copa_train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(copa_dev_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(copa_test_dataset, batch_size=batch_size, shuffle=False)

In [7]:
config = GPT2Config.from_pretrained("skt/ko-gpt-trinity-1.2B-v0.5")
config.num_labels = num_labels
model = GPT2ForSequenceClassification(config).from_pretrained("skt/ko-gpt-trinity-1.2B-v0.5")    
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optim = AdamW(model.parameters(), lr=learning_rate)

Some weights of the model checkpoint at skt/ko-gpt-trinity-1.2B-v0.5 were not used when initializing GPT2ForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at skt/ko-gpt-trinity-1.2B-v0.5 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NameError: name 'AdamW' is not defined

In [10]:
def train(epoch, num_epochs, model, optimizer, scheduler=None):
    model.train()

    batch_loss_list = []
    progress = ProgressMonitor(length=len(copa_train_dataset))

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        loss = criterion(outputs.logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss_list.append(loss.item())
        progress.update(epoch, num_epochs, input_ids.shape[0], sum(batch_loss_list)/len(batch_loss_list))

    if scheduler:
        scheduler.step()

In [11]:
def validate(model):
    model.eval()

    correct = 0
    with torch.no_grad():
        for batch in dev_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            outputs = torch.argmax(outputs.logits, 1)
            correct += (outputs == labels).sum().item()

    acc = 100 * float(correct) / len(copa_dev_dataset) 
    print('Test Acc: {}/{} ({:.2f}%)'.format(correct, len(copa_dev_dataset), acc))
    
    return acc

In [12]:
since = time.time()

best_model_weights = copy.deepcopy(model.state_dict())
best_acc = 0.0

for epoch in range(num_epochs):
    train(epoch+1, num_epochs, model, optim)
    acc = validate(model)

    if acc > best_acc:
        best_acc = acc
        best_model_weights = copy.deepcopy(model.state_dict())

model.load_state_dict( best_model_weights )

time_elapsed = time.time() - since
print('Training completed in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
print('Best test accuracy: {:4f}'.format(best_acc))

0,1
Epoch: 1/3 Loss: 0.7758 6160 / 6160,6160


Test Acc: 500/1000 (50.00%)


0,1
Epoch: 2/3 Loss: 0.7237 6160 / 6160,6160


Test Acc: 500/1000 (50.00%)


0,1
Epoch: 3/3 Loss: 0.7188 6160 / 6160,6160


Test Acc: 517/1000 (51.70%)
Training completed in 8m 10s
Best test accuracy: 51.700000


In [17]:
origin_copa_train_csv

Unnamed: 0,ID,sentence,question,1,2,Answer
0,1,이퀄라이저로 저음 음역대 소리 크기를 키웠다.,결과,베이스 소리가 잘 들리게 되었다.,베이스 소리가 들리지 않게 되었다.,1
1,2,음료에 초콜렛 시럽을 넣었다.,결과,음료수가 더 달아졌다.,음료수가 차가워졌다.,1
2,3,남자는 휴대폰을 호수에 빠뜨렸다.,결과,휴대폰이 업그레이드 되었다.,휴대폰이 고장났다.,2
3,4,옆 집 사람이 이사를 나갔다.,원인,옆 집 사람은 계약이 완료되었다.,옆 집 사람은 계약을 연장했다.,1
4,5,문을 밀었다.,결과,문이 잠겼다.,문이 열렸다.,2
...,...,...,...,...,...,...
3075,3076,계약직으로 일하던 남성은 퇴사했다.,원인,계약을 연장했다.,계약이 종료되었다.,2
3076,3077,목이 마르다.,원인,물을 마시지 못했다.,텀블러를 샀다.,1
3077,3078,노래를 오랫동안 불렀다.,결과,목이 아프다.,노래방이 폐업했다.,1
3078,3079,사람들이 일제히 함성을 지른다.,원인,우리나라 축구팀이 골을 넣었다.,우리나라 축구팀이 경기에서 패배했다.,2


In [18]:
copa_train_csv

Unnamed: 0,ID,sentence,Answer
0,1,이퀄라이저로 저음 음역대 소리 크기를 키웠다. 베이스 소리가 잘 들리게 되었다.,1
1,1,이퀄라이저로 저음 음역대 소리 크기를 키웠다. 베이스 소리가 들리지 않게 되었다.,0
2,2,음료에 초콜렛 시럽을 넣었다. 음료수가 더 달아졌다.,1
3,2,음료에 초콜렛 시럽을 넣었다. 음료수가 차가워졌다.,0
4,3,남자는 휴대폰을 호수에 빠뜨렸다. 휴대폰이 업그레이드 되었다.,0
...,...,...,...
6155,3078,노래를 오랫동안 불렀다. 노래방이 폐업했다.,0
6156,3079,우리나라 축구팀이 골을 넣었다. 사람들이 일제히 함성을 지른다.,0
6157,3079,우리나라 축구팀이 경기에서 패배했다. 사람들이 일제히 함성을 지른다.,1
6158,3080,가수가 3시간 동안 춤을 추었다. 가수의 목이 쉬었다.,0


In [None]:
# <cls> sentence <sep> question <sep> 1 <eos>   - 1
# <cls> sentence <sep> question <sep> 2 <eos>   - 0

# <sos> sentence <sep> question <sep> 1 <eos> - logits argmax( 1, 2 )