<a href="https://colab.research.google.com/github/DayoungKwon/mrc/blob/main/mrc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 설치 
- khaiii (오래 걸림)
- 그 외 (huggingface, konlpy)

In [1]:
# !git clone https://github.com/kakao/khaiii.git
# !pip install cmake
# !mkdir build
# !cd build && cmake /content/khaiii
# !cd /content/build/ && make all
# !cd /content/build/ && make resource
# !cd /content/build && make install
# !cd /content/build && make package_python
# !pip install /content/build/package_python

In [2]:
! pip install datasets
! pip install transformers
# ! apt-get install -y openjdk-8-jdk python3-dev
# ! pip install konlpy "tweepy<4.0.0"
# ! /bin/bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Data Preparation

1. data load 
2. data preprocessing 
- 형태소 분석 
- tokenizing 


## Data Load

데이터를 로드하고, parsing 해서 필요한 데이터만 뽑아주는 모듈 
KoMRC

In [3]:
from typing import List, Tuple, Dict, Any
import json
import numpy as np
import random

class KoMRC:
    """Column-oriented view over a KoMRC JSON dump.

    The raw JSON is expected to look like
    ``{'data': [{'paragraphs': [{'context': ..., 'qas': [{'guid', 'question',
    'answers'}, ...]}, ...]}, ...]}``.  Every question is addressed by a
    ``(document_id, paragraph_id, question_id)`` triple; the triples held in
    ``indices`` define which questions belong to this dataset and in which
    order.

    Indexing:
        * ``ds[i]`` (int / numpy integer) -> dict for a single question.
        * ``ds['question']`` (str)        -> the whole column as a list.
        * ``ds[a:b:c]`` (slice)           -> dict of column sub-lists, with
          regular Python slice semantics (open bounds, negative values, step).
    """

    # Column names, in the order they appear in the dicts returned by __getitem__.
    _FIELDS = ('guid', 'context', 'answers', 'question')

    def __init__(self, data, indices: List[Tuple[int, int, int]], mode=None):
        """Store the raw JSON and the question triples.

        Args:
            data: Parsed JSON object (see class docstring for the layout).
            indices: ``(d_id, p_id, q_id)`` triples selecting the questions.
            mode: Accepted for backward compatibility; not used by this class.
        """
        self._data = data
        self._indices = indices
        # Filled lazily on first __getitem__ call.
        self._preprocessed_data = None

    @classmethod
    def load(cls, file_path):
        """Read a JSON file and build a dataset covering every question in it."""
        with open(file_path, 'r', encoding='utf-8') as fd:
            data = json.load(fd)

        indices = [
            (d_id, p_id, q_id)
            for d_id, document in enumerate(data['data'])
            for p_id, paragraph in enumerate(document['paragraphs'])
            for q_id, _ in enumerate(paragraph['qas'])
        ]
        return cls(data, indices)

    @classmethod
    def split(cls, dataset, eval_ratio: float = .1, seed=42, mode='mecab'):
        """Shuffle the questions and split them into (train, eval) datasets.

        Args:
            dataset: Source ``KoMRC`` instance; it is not modified.
            eval_ratio: Fraction of questions that go to the eval split.
            seed: Seed for the module-level ``random`` generator (re-seeded
                here, exactly as before, so the split is reproducible).
            mode: Forwarded to the constructor (currently unused there).

        Returns:
            Tuple ``(train_dataset, eval_dataset)`` sharing the same raw data.
        """
        indices = list(dataset._indices)
        random.seed(seed)
        random.shuffle(indices)
        cut = int(len(indices) * eval_ratio)

        return (cls(dataset._data, indices[cut:], mode=mode),
                cls(dataset._data, indices[:cut], mode=mode))

    def __getitem__(self, slices):
        """Return one row, one column, or a slice of rows (see class docstring).

        Raises:
            ValueError: If ``slices`` is not an int, str or slice.
        """
        # `is None` rather than truthiness: the cache is a dict of columns.
        if self._preprocessed_data is None:
            self._preprocessed_data = self.__parse_rawdata(self._indices)
        columns = self._preprocessed_data

        if isinstance(slices, (int, np.integer)):
            return {key: columns[key][slices] for key in self._FIELDS}
        if isinstance(slices, str):
            return columns[slices]
        if isinstance(slices, slice):
            # slice.indices() resolves None / negative / out-of-range bounds,
            # so ds[:10], ds[5:] and ds[0:10**9] all behave like list slicing.
            picked = range(*slices.indices(len(self)))
            return {key: [columns[key][i] for i in picked] for key in self._FIELDS}
        raise ValueError(f'unhanled slices : {slices}, type={type(slices)}')

    def __parse_rawdata(self, indices) -> dict:
        """Materialise the selected questions into per-column lists."""
        columns = {'question': [], 'answers': [],
                   'context': [], 'guid': []}

        for d_id, p_id, q_id in indices:
            paragraph = self._data['data'][d_id]['paragraphs'][p_id]
            qa = paragraph['qas'][q_id]

            columns['guid'].append(qa['guid'])
            columns['question'].append(qa['question'])
            columns['answers'].append(qa['answers'])
            # The context is repeated once per question of the paragraph.
            columns['context'].append(paragraph['context'])

        # Progress/debug output kept from the original implementation.
        print(len(columns['question']))
        return columns

    def __len__(self) -> int:
        """Number of questions selected by this dataset."""
        return len(self._indices)

In [4]:
# Data location (change these paths to wherever you saved the files).
train_file = '/content/train.json'
test_file = '/content/test.json'
# Load every question of the training file, then shuffle-split it with the
# defaults of KoMRC.split (eval_ratio=0.1, seed=42).
dataset = KoMRC.load(train_file)
train_dataset, dev_dataset = KoMRC.split(dataset)
print("Number of Train Samples:", len(train_dataset))
print("Number of Dev Samples:", len(dev_dataset))

# Smoke-test the three indexing modes of KoMRC: int (single row),
# str (whole column) and slice (dict of column sub-lists).
print(dev_dataset[0])
print(dev_dataset[0]['answers'])
print(dev_dataset[0]['context'].split('.')[0])
print(dev_dataset['answers'])
print(len(dev_dataset['answers']))
print(dev_dataset['question'])
print(len(dev_dataset['question']))
print(dev_dataset[0:10])

Number of Train Samples: 10834
Number of Dev Samples: 1203
1203
{'guid': '844e22ab28924c1697d5ac28801b34c1', 'context': '지난해 주요 연극상을 나눠 가졌던 세 편의 작품이 올봄에 나란히 앙코르 무대를 갖는다. 대한민국연극대상 연기·무대예술상, 동아연극상 작품·희곡·연기상 등을 수상한 ‘알리바이 연대기’(17~20일 대학로 아르코예술극장 대극장, 25일~5월11일 서계동 국립극단 백성희장민호극장), 연극대상에서 대상과 희곡상을 받은 ‘여기가 집이다’(18일~5월22일 대학로 연우소극장), 연극대상 작품·연출상과 김상열연극상 수상작인 ‘황금용’(5월9~18일 서강대 메리홀 대극장)이다. 초연 당시 짧은 상연 기간과 낮은 인지도 등으로 공연을 놓친 연극팬에겐 평단으로부터 작품성을 인정받은 수작을 관람할 수 있는 기회다. ‘알리바이 연대기’는 희곡을 쓰고 연출한 김재엽의 가족사에 근거한 다큐멘터리 드라마다. 1930년에 태어난 한 개인의 사적인 연대기를 바탕으로 그 사이를 파고드는 역사적 순간들을 정밀하게 조명한다. 연출가는 “공적인 권력이 사적인 권리를 지켜주기보다 억압하기 일쑤였던 한국 현대사 속에서 개인은 언제나 무죄를 입증하며 하루하루 자신을 지켜내야 하는 ‘알리바이의 연대기’ 속에서 살아왔다”고 말한다.한국연극평론가협회는 이 작품을 ‘2013년 올해의 연극 베스트3’로 선정하며 “촘촘하고 세세하게 삶에 천착해 개인과 역사에 대한 이분법적 관점을 극복한다. 정치극에 대한 새로운 가능성을 보여줬다”고 평했다. 이 작품으로 연기상을 휩쓴 남명렬을 비롯해 지춘성 정원조 등 초연 배우들이 그대로 출연한다.‘여기가 집이다’는 허름하고 볼품 없는 ‘20년 전통’의 고시원에 모여 사는 사람들의 절망과 희망을 그린 작품. ‘차력사와 아코디언’ ‘택배 왔어요’를 만든 극단 이와삼의 장우재 대표가 직접 대본을 쓰고 연출했다. 나름의 규칙을 가지고 평화로웠던 고시원에 새로운 주인으로 등

In [5]:
# Tokenizer load 
from transformers import AutoTokenizer, AutoModel, AutoConfig

def load_tokenizer(model_id, test_sentence):
    """Load the Hugging Face tokenizer that belongs to ``model_id``.

    Args:
        model_id: Hub id or local path understood by
            ``AutoTokenizer.from_pretrained``.
        test_sentence: Accepted for backward compatibility; currently unused.

    Returns:
        The tokenizer instance.
    """
    # Use the argument, not the notebook-global `hf_model_id`: the old body
    # silently ignored `model_id` and failed with NameError when the global
    # had not been defined yet.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    return tokenizer


In [6]:
# Sample sentence handed to load_tokenizer (which does not use it) and to the
# tokenizer smoke test in the next cell.
sentence = '위메프의 배달∙픽업 서비스 위메프오가 23~25일 3일간 ‘BBQ 50% 페이백’ 이벤트를 진행한다고 23일 밝혔다'

# Alternative checkpoints that were tried; exactly one hf_model_id should be
# active because later cells read this global.
# hf_model_id = 'kykim/electra-kor-base'
# tokenizer = load_tokenizer(hf_model_id, sentence)

# hf_model_id = 'monologg/kobert'
# tokenizer = load_tokenizer(hf_model_id, sentence)


# hf_model_id = 'kykim/bert-kor-base'
# tokenizer = load_tokenizer(hf_model_id, sentence)


hf_model_id = 'Jinhwan/krelectra-base-mecab'
tokenizer = load_tokenizer(hf_model_id, sentence)

# hf_model_id = 'monologg/kobigbird-bert-base'
# #https://github.com/monologg/KoBigBird
# NOTE(review): `config` is built with max_position_embeddings=1024 but is not
# passed to AutoModel.from_pretrained below, so it has no effect in this cell.
config = AutoConfig.from_pretrained(hf_model_id, max_position_embeddings=1024)
# NOTE(review): this bare encoder is overwritten when `model` is re-assigned to
# BertForQuestionAnswering in a later cell — confirm it is still needed here.
model = AutoModel.from_pretrained(hf_model_id)

Some weights of the model checkpoint at Jinhwan/krelectra-base-mecab were not used when initializing ElectraModel: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Encode one (question, context) pair and also request the character offsets
# of every token; the result is kept in `aa` for manual inspection.
question = '무슨 이벤트를 하나?'
aa =tokenizer(question, sentence, return_offsets_mapping=True)

In [8]:
from tqdm import tqdm 
class TKIndexerWrappedDataset:
    """Tokenized, windowed view of a KoMRC dataset with token-level labels.

    Every (question, context) pair is encoded with the given Hugging Face
    tokenizer.  Contexts longer than ``max_length`` are split into several
    overflow windows, so ``len(self)`` can exceed the number of questions.
    For each window the character-level answer (only ``answers[0]`` is used)
    is converted into ``start`` / ``end`` token indices:

    * answer fully inside the window -> indices of its first / last token;
    * answer not (fully) inside       -> both indices point at ``[CLS]``;
    * sample has no answers           -> ``None`` for both (unlabeled data).

    Indexing mirrors ``KoMRC``: int -> one feature dict, str -> whole column,
    slice -> dict of column sub-lists.
    """

    # Keys of the dicts returned by __getitem__.
    _COLUMNS = ('input_ids', 'token_type_ids', 'attention_mask',
                'start', 'end', 'guid')

    def __init__(self, dataset: KoMRC, tokenizer) -> None:
        """Tokenize ``dataset`` eagerly with ``tokenizer``."""
        self._indexer = tokenizer
        self._dataset = self._sample2ids_with_tk(dataset)

    def __len__(self) -> int:
        """Number of tokenized windows (not the number of questions)."""
        return len(self._dataset['input_ids'])

    def _sample2ids_with_tk(self, sample):
        """Encode all samples and attach ``start`` / ``end`` / ``guid`` columns.

        Args:
            sample: Object supporting ``sample['question']``,
                ``sample['context']``, ``sample['answers']`` and
                ``sample['guid']`` column access.

        Returns:
            The tokenizer output with the offset / overflow maps removed and
            the three extra list columns added, one entry per window.
        """
        tokenizer = self._indexer

        tokenized = tokenizer(sample['question'],
                              sample['context'],
                              return_overflowing_tokens=True,
                              return_offsets_mapping=True,
                              # stride = 128,
                              return_tensors='pt',
                              max_length=512,
                              truncation='only_second',
                              padding='max_length')
        offset_map = tokenized.pop('offset_mapping')
        overflow_map = tokenized.pop('overflow_to_sample_mapping')
        tokenized['start'] = []
        tokenized['end'] = []
        tokenized['guid'] = []
        cls_id = tokenizer.cls_token_id

        for i, offsets in enumerate(tqdm(offset_map)):
            input_ids = tokenized['input_ids'][i]
            cls_index = input_ids.tolist().index(cls_id)
            seq_ids = tokenized.sequence_ids(i)

            # Map the window back to the question it was cut from.
            example_idx = overflow_map[i]
            answers = sample['answers'][example_idx]
            tokenized['guid'].append(sample['guid'][example_idx])
            if i < 10:
                print(f'Char answers: {answers}')

            if not answers:
                # Keep the label columns the same length as the features.
                # Skipping the append (old behaviour) shifted every later
                # label onto the wrong window when only some samples lacked
                # answers.
                tokenized['start'].append(None)
                tokenized['end'].append(None)
                continue

            ans_start = answers[0]['answer_start']
            ans_end = ans_start + len(answers[0]['text'])

            # Locate the context segment (sequence id 1) inside the window.
            ctx_first = 0
            while seq_ids[ctx_first] != 1:
                ctx_first += 1
            ctx_last = ctx_first
            while ctx_last + 1 < len(seq_ids) and seq_ids[ctx_last + 1] == 1:
                ctx_last += 1

            answer_in_window = (offsets[ctx_first][0] <= ans_start
                                and offsets[ctx_last][1] >= ans_end)
            if not answer_in_window:
                tokenized['start'].append(cls_index)
                tokenized['end'].append(cls_index)
                continue

            # Walk right past every token that starts at or before the answer,
            # then step back one.  `<=` (not `<`) on the bound matters: with
            # `<` an answer beginning in the last context token was labelled
            # one token too early.
            tok_start = ctx_first
            while tok_start <= ctx_last and offsets[tok_start][0] <= ans_start:
                tok_start += 1
            tok_start -= 1

            # Walk left past every token that ends at or after the answer end,
            # then step forward one.
            tok_end = ctx_last
            while tok_end >= tok_start and offsets[tok_end][1] >= ans_end:
                tok_end -= 1
            tok_end += 1

            if i < 10:
                print(f'Token answer : {tokenizer.decode(input_ids[tok_start:tok_end + 1].tolist())}')
            tokenized['start'].append(tok_start)
            tokenized['end'].append(tok_end)
        return tokenized

    def __getitem__(self, slices) -> Any:
        """Return one window, one column, or a slice of windows.

        Raises:
            ValueError: If ``slices`` is not an int, str or slice.
        """
        if isinstance(slices, (int, np.integer)):
            return {key: self._dataset[key][slices] for key in self._COLUMNS}
        if isinstance(slices, str):
            return self._dataset[slices]
        if isinstance(slices, slice):
            # slice.indices() handles open, negative and out-of-range bounds.
            picked = range(*slices.indices(len(self)))
            return {key: [self._dataset[key][i] for i in picked]
                    for key in self._COLUMNS}
        raise ValueError(f'unhanled slices : {slices}, type={type(slices)}')

In [9]:
# Tokenize both splits eagerly; the constructor prints a tqdm bar plus the
# first few character-level / token-level answers for a visual sanity check.
indexed_train_dataset = TKIndexerWrappedDataset(train_dataset, tokenizer)
indexed_dev_dataset = TKIndexerWrappedDataset(dev_dataset, tokenizer)
# The indexed dataset can be longer than the original one because long
# contexts are split into several overflow windows.
print(f'\nlen origin train_dataset : {len(train_dataset)}')
print(f'len indexed_train_dataset : {len(indexed_train_dataset)}')
sample = indexed_dev_dataset[0]
print(sample['input_ids'].shape)

10834


  0%|          | 0/16594 [00:00<?, ?it/s]

Char answers: [{'text': '세탁기', 'answer_start': 772}]


  0%|          | 30/16594 [00:05<36:13,  7.62it/s]  

Token answer : 세탁기
Char answers: [{'text': '세탁기', 'answer_start': 772}]
Char answers: [{'text': '해양', 'answer_start': 329}]
Token answer : 해양
Char answers: [{'text': '9월14일', 'answer_start': 18}]
Token answer : 9월14일
Char answers: [{'text': '아리에 바르디', 'answer_start': 959}]
Char answers: [{'text': '아리에 바르디', 'answer_start': 959}]
Token answer : 아리에 바르디
Char answers: [{'text': '6일', 'answer_start': 70}]
Token answer : 6일
Char answers: [{'text': '6일', 'answer_start': 70}]
Char answers: [{'text': '자본시장법 개정안', 'answer_start': 126}]
Token answer : 자본시장법 개정안
Char answers: [{'text': '1월 5일', 'answer_start': 1078}]


100%|██████████| 16594/16594 [00:48<00:00, 343.35it/s]
  5%|▍         | 85/1812 [00:00<00:04, 429.54it/s]

Char answers: [{'text': '서강대 메리홀 대극장', 'answer_start': 246}]
Token answer : 서강대 메리홀 대극장
Char answers: [{'text': '서강대 메리홀 대극장', 'answer_start': 246}]
Char answers: [{'text': '저축은행중앙회', 'answer_start': 345}, {'text': '중앙회', 'answer_start': 774}]
Token answer : 저축은행중앙회
Char answers: [{'text': '저축은행중앙회', 'answer_start': 345}, {'text': '중앙회', 'answer_start': 774}]
Char answers: [{'text': '구문론', 'answer_start': 521}]
Token answer : 구문론
Char answers: [{'text': '1956년', 'answer_start': 634}, {'text': '1956', 'answer_start': 634}]
Token answer : 1956년
Char answers: [{'text': '1956년', 'answer_start': 634}, {'text': '1956', 'answer_start': 634}]
Char answers: [{'text': '1부', 'answer_start': 187}]
Token answer : 1부
Char answers: [{'text': '이베이', 'answer_start': 889}]
Token answer : 이베이
Char answers: [{'text': '50개', 'answer_start': 65}]
Token answer : 50개


100%|██████████| 1812/1812 [00:04<00:00, 449.75it/s]


len origin train_dataset : 10834
len indexed_train_dataset : 16594
torch.Size([512])





### Data 준비
Collator : 데이터를 batch에 맞게 자르고 묶어주고, 준비하는 모듈
DataLoader : 데이터를 iterative (돌아가면서 훈련할 수 있게) Load하는 모듈 


In [10]:
import torch
from torch.nn.utils.rnn import pad_sequence

class Collator:
    """Turn a list of per-sample feature dicts into one batch dict.

    ``input_ids`` / ``attention_mask`` / ``token_type_ids`` are right-padded to
    the longest sequence of the batch and stacked into ``(batch, seq)`` long
    tensors.  ``start`` / ``end`` become 1-D long tensors, or ``None`` when the
    first sample carries no label (unlabeled data).  Every other key (e.g.
    ``guid``) is passed through as a plain list.
    """

    def __init__(self, indexer) -> None:
        """Args:
            indexer: Tokenizer-like object exposing ``pad_token_id`` (and
                optionally ``pad_token_type_id``).
        """
        self._indexer = indexer

    def __call__(self, samples: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        """Collate ``samples`` (a non-empty list of dicts with identical keys)."""
        # list of dicts -> dict of lists
        samples = {
            key: [sample[key] for sample in samples]
            for key in samples[0]
        }

        for key in 'start', 'end':
            if samples[key][0] is None:
                samples[key] = None
            else:
                samples[key] = torch.tensor(samples[key], dtype=torch.long)

        # Each field needs its own padding value: padded positions must be
        # *masked out* (attention 0), not filled with the vocabulary pad id,
        # which is only correct by accident when pad_token_id == 0.
        padding_values = {
            'input_ids': self._indexer.pad_token_id,
            'attention_mask': 0,
            'token_type_ids': getattr(self._indexer, 'pad_token_type_id', 0),
        }
        for key, padding_value in padding_values.items():
            samples[key] = pad_sequence(
                # as_tensor accepts lists and tensors alike and avoids the
                # copy-construct warning torch.tensor() emits for tensors.
                [torch.as_tensor(sample, dtype=torch.long) for sample in samples[key]],
                batch_first=True, padding_value=padding_value
            )

        return samples

In [11]:
from torch.utils.data import DataLoader

# Effective batch size after gradient accumulation.
batch_size = 64
#num_workers = core num
accumulation = 4 # Use gradient accumulation to save memory.
collator = Collator(tokenizer)
# Each loader yields micro-batches of batch_size // accumulation samples; the
# training loop steps the optimizer once every `accumulation` micro-batches.
train_loader = DataLoader(indexed_train_dataset, batch_size=batch_size//accumulation, shuffle=True, collate_fn=collator, num_workers=2)
dev_loader = DataLoader(indexed_dev_dataset, batch_size=batch_size//accumulation, shuffle=False, collate_fn=collator, num_workers=2)

In [12]:
# Pull a single collated batch from the dev loader to check tensor shapes and
# the set of keys produced by Collator.
batch = next(iter(dev_loader))
print(batch['input_ids'].shape)
print(list(batch.keys()))



torch.Size([16, 512])
['input_ids', 'token_type_ids', 'attention_mask', 'start', 'end', 'guid']


## 본격적으로 훈련하기
1. 모델 hyperparameter 정하기
2. Training


In [13]:
!pip install transformers
import torch.nn as nn

from transformers.models.bert.modeling_bert import (
    BertModel,
    BertPreTrainedModel
)
from transformers import AutoModel
## Simple Version for Bert QA: https://huggingface.co/transformers/_modules/transformers/models/bert/modeling_bert.html#BertForQuestionAnswering.forward
class BertForQuestionAnswering(nn.Module):
    """Span-extraction head on top of a pretrained Hugging Face encoder.

    The encoder is loaded with ``AutoModel.from_pretrained(hf_model_id)``; two
    independent linear layers project every token's hidden state to a single
    start logit and a single end logit.
    """

    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config, hf_model_id):
        """Build the encoder and the two heads.

        Args:
            config: Not read by this constructor.
            hf_model_id: Checkpoint id / path for ``AutoModel.from_pretrained``.
        """
        super().__init__()
        encoder = AutoModel.from_pretrained(hf_model_id)
        self.bert = encoder
        width = encoder.config.hidden_size
        self.start_linear = nn.Linear(width, 1)
        self.end_linear = nn.Linear(width, 1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return ``(start_logits, end_logits)``.

        Each logit tensor is the corresponding head applied to the encoder's
        ``last_hidden_state`` with the trailing size-1 dimension squeezed away.
        """
        encoded = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        hidden = encoded.last_hidden_state

        per_token_start = self.start_linear(hidden)
        per_token_end = self.end_linear(hidden)
        return per_token_start.squeeze(-1), per_token_end.squeeze(-1)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [14]:
import torch
from transformers import BertConfig


# Report which pretrained checkpoint is about to be wrapped.
print(f'Encoder model to use : {hf_model_id}')
# The first argument (config) is an empty dict; BertForQuestionAnswering.__init__
# does not read it and loads everything from hf_model_id.
model = BertForQuestionAnswering({}, hf_model_id)
# NOTE(review): lr=2e-2 is orders of magnitude above the rates usually used to
# fine-tune pretrained transformers (roughly 1e-5 to 5e-5) — confirm intended.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-2)


Encoder model to use : Jinhwan/krelectra-base-mecab


Some weights of the model checkpoint at Jinhwan/krelectra-base-mecab were not used when initializing ElectraModel: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Move the model to the selected device.  `model.cuda()` ignored the fallback
# computed above and raised on CPU-only machines.
model.to(device)


BertForQuestionAnswering(
  (bert): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), ep

In [16]:
import os
# Make CUDA kernel launches synchronous so errors surface at the failing call.
# NOTE(review): this variable is presumably read when CUDA is initialised; the
# model was already moved to the GPU in an earlier cell, so setting it here may
# have no effect until the runtime is restarted — confirm.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


In [17]:
import os
from statistics import mean

import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_
# Fine-tuning loop: 30 epochs of training with gradient accumulation, followed
# each epoch by an evaluation pass over the dev set and a checkpoint dump.
print(f'device : {device}')
os.makedirs('dump', exist_ok=True)
train_losses = []   # one entry per epoch: mean accumulated loss per optimizer step
dev_losses = []     # one entry per epoch: mean loss per dev micro-batch

step = 0            # global micro-batch counter (not reset between epochs)


for epoch in range(1, 31):
    print("Epoch", epoch)
    # Training
    # Explicitly enable dropout etc.: encoders returned by from_pretrained()
    # are in eval mode, and the evaluation pass below switches to eval again.
    model.train()
    running_loss = 0.
    losses = []
    progress_bar = tqdm(train_loader, desc='Train')
    for batch in progress_bar:
        guid = batch.pop('guid')
        progress_bar.set_postfix({'guid' : guid})
        start = batch.pop('start').to(device)
        end = batch.pop('end').to(device)
        # Remaining keys are the tensor inputs of the model's forward().
        batch = {key: value.to(device) for key, value in batch.items()}

        start_logits, end_logits = model(**batch)

        try:
            loss = F.cross_entropy(start_logits, start) + F.cross_entropy(end_logits, end)
            loss.backward()
        except Exception:
            # Dump the offending batch.  `loss` is deliberately not printed:
            # it may be unbound here, which used to mask the real error with
            # a NameError.
            print('[ERROR] Error occured while getting loss')
            print(f'start_logits : {start_logits}, end_logits : {end_logits}')
            print(f'guid : {guid}, start : {start}, end : {end}')
            print(f'batch[input_ids].shape : {batch["input_ids"].shape}')
            raise
        running_loss += loss.item()
        del batch, start, end, start_logits, end_logits, loss

        step += 1
        # Only step the optimizer every `accumulation` micro-batches.
        if step % accumulation:
            continue

        clip_grad_norm_(model.parameters(), max_norm=1.)
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
        losses.append(running_loss)
        running_loss = 0.
        progress_bar.set_description(f"Train - Loss: {losses[-1]:.3f}")

    train_losses.append(mean(losses))
    print(f"train score: {train_losses[-1]:.3f}")

    # Evaluation
    model.eval()  # disable dropout so the dev loss is deterministic
    losses = []
    for batch in tqdm(dev_loader, desc="Evaluation"):
        del batch['guid']
        start = batch.pop('start').to(device)
        end = batch.pop('end').to(device)
        batch = {key: value.to(device) for key, value in batch.items()}

        with torch.no_grad():
            start_logits, end_logits = model(**batch)
            loss = F.cross_entropy(start_logits, start) + F.cross_entropy(end_logits, end)

        losses.append(loss.item())
        del batch, start, end, start_logits, end_logits, loss
    dev_losses.append(mean(losses))
    print(f"Evaluation score: {dev_losses[-1]:.3f}")

    # Whole-module pickle; the inference cell reloads it with torch.load().
    torch.save(model, f'dump/model.{epoch}')

device : cuda
Epoch 1


Train - Loss: 10.077: 100%|██████████| 1038/1038 [26:14<00:00,  1.52s/it, guid=['04b335022980420895f180cb7ce140a3', '4078da0871f549abb50615c225a7cad6']]


train score: 17.362


Evaluation: 100%|██████████| 114/114 [01:04<00:00,  1.77it/s]


Evaluation score: 2.723
Epoch 2


Train - Loss: 6.314:  13%|█▎        | 130/1038 [03:18<23:09,  1.53s/it, guid=['1127a28bbf3044f1b1437f9af3a35ea9', '58dcee5c42874503bde7aea725e635d7', '59173a0e98454c4ab10f00224d26d680', '5ad78f83e4cd4fabae6e4ab6d4d879bd', '1c97cdcdc16343cdb293e8d942bba17c', '8797dc8cec0c4dc0a157cf1a97975fe8', '119352ac545e474c895a15fb25ce6baf', '85f0958ae89f4741a7c72feba77a7e5e', '4336dea9a1ba4981922370577a507a1a', '54a45f3d9fc449d49e39f1d44cd42e77', 'fb348979363c454aa09b27f295e63ec6', 'c98d12723e2244c1b3d4ea243edcbefe', 'eb47a81c3ae74ac0ae7fcada41079dd1', '335d2c42edf3418687eb016c0ebea52b', '58cad3a1197248e7a8fae760244e46c5', '5f22f5aff3d94b539e7ece6c4daf4fe4']]


KeyboardInterrupt: ignored

In [None]:
import matplotlib.pyplot as plt

# Plot the per-epoch losses.  The x axes are derived from the recorded history
# instead of a hard-coded 30 epochs, so the plot also works when training was
# interrupted early (or when the two lists differ in length).
t = list(range(1, len(train_losses) + 1))
dev_t = list(range(1, len(dev_losses) + 1))
plt.plot(t, train_losses, label="Train Loss")
plt.plot(dev_t, dev_losses, label="Dev Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()

## Inference

In [18]:
# Reload the epoch-1 checkpoint.  The training loop saved the whole module with
# torch.save(model, ...), so torch.load() unpickles a BertForQuestionAnswering
# object; only load checkpoint files you trust.
# NOTE(review): newer PyTorch releases may require weights_only=False to load a
# full pickled module — confirm against the installed version.
model = torch.load('dump/model.1')
# Switch to inference mode (disables dropout).
model.eval()

BertForQuestionAnswering(
  (bert): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), ep

In [22]:
# Load the test questions and tokenize them into (possibly several) windows per
# question; the printed count is the number of windows, not of questions.
# NOTE(review): the path duplicates the `test_file` variable defined earlier.
test_dataset = KoMRC.load('/content/test.json')
test_dataset = TKIndexerWrappedDataset(test_dataset, tokenizer)
print("Number of Test Samples", len(test_dataset))

4008


100%|██████████| 6142/6142 [00:00<00:00, 36502.13it/s]

Char answers: None
Char answers: None
Char answers: None
Char answers: None
Char answers: None
Char answers: None
Char answers: None
Char answers: None
Char answers: None
Char answers: None
Number of Test Samples 6142





In [None]:
import csv

# Predict an answer span for every test question and write a submission CSV
# with exactly one row per question id.
os.makedirs('out', exist_ok=True)
# newline='' is what the csv module expects for its file objects; utf-8 keeps
# the Korean answers intact regardless of the platform default encoding.
with torch.no_grad(), open('out/baseline.csv', 'w', newline='', encoding='utf-8') as fd:
    writer = csv.writer(fd)
    writer.writerow(['Id', 'Predicted'])

    # Run on whatever device the loaded model lives on (no hard-coded "cuda").
    model_device = next(model.parameters()).device
    # guid -> (score, answer).  A long context yields several windows with the
    # same guid; only the best-scoring window's answer is kept so that the
    # CSV has no duplicate ids.  Dict order preserves first appearance.
    best = {}
    for sample in tqdm(test_dataset, "Testing"):
        guid = sample["guid"]
        input_ids, token_type_ids, attention_mask = [
            torch.as_tensor(sample[key], dtype=torch.long, device=model_device)
            for key in ("input_ids", "token_type_ids", "attention_mask")
        ]

        start_logits, end_logits = model(
            input_ids=input_ids[None, :],
            token_type_ids=token_type_ids[None, :],
            attention_mask=attention_mask[None, :],
        )
        start_logits = start_logits.squeeze(0)
        end_logits = end_logits.squeeze(0)

        # Absolute positions of the context tokens: segment 1 minus its
        # closing [SEP].
        context_mask = token_type_ids.bool() & (input_ids != tokenizer.sep_token_id)
        positions = context_mask.nonzero(as_tuple=True)[0]
        if positions.numel() == 0:
            best.setdefault(guid, (float('-inf'), ''))
            continue

        start_prob = start_logits[positions].softmax(-1)
        end_prob = end_logits[positions].softmax(-1)
        # Outer product = joint span probability; triu keeps start <= end.
        probability = torch.triu(start_prob[:, None] @ end_prob[None, :])
        index = torch.argmax(probability).item()
        rel_start, rel_end = divmod(index, len(end_prob))

        # Translate context-relative indices back to positions in input_ids.
        # The old code sliced input_ids with the relative indices directly and
        # therefore decoded tokens from the wrong place in the window.
        tok_start = positions[rel_start].item()
        tok_end = positions[rel_end].item()
        answer = tokenizer.decode(input_ids[tok_start:tok_end + 1].tolist())

        # Raw logits (unlike per-window softmax) are comparable across windows.
        score = (start_logits[tok_start] + end_logits[tok_end]).item()
        if guid not in best or score > best[guid][0]:
            best[guid] = (score, answer)

    writer.writerows([guid, answer] for guid, (_, answer) in best.items())

  del sys.path[0]
Testing:  12%|█▏        | 727/6142 [00:27<03:34, 25.20it/s]