# DEPENDENCY

In [73]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [75]:
%autoreload 2

In [76]:
import json
import torch
import pandas as pd

from torch.utils.data import Dataset
from transformers import BertTokenizer
from IPython.display import clear_output
from tqdm import tqdm_notebook as tqdm
from pathlib import Path

In [77]:
# Directory of the FGC release and the development-split JSON file inside it.
data_dir = './FGC_release_1.7.13/'
data_file = f'{data_dir}FGC_release_all_dev.json'

In [85]:
# Name of the pretrained checkpoint whose vocabulary the tokenizer loads.
PRETRAINED_MODEL_NAME = "bert-base-chinese"
# Shared tokenizer instance; it is used below for ad-hoc tokenization and is
# passed to FGC_Dataset.
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

I0706 07:41:06.259315 140573014021952 tokenization_utils_base.py:1254] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt from cache at /root/.cache/torch/transformers/8a0c070123c1f794c42a29c6904beb7c1b8715741e235bee04aca2c7636fc83f.9b42061518a39ca00b8b52059fd2bede8daa613f8a8671500e518a8c29de8c00


In [78]:
# Load the development split. The encoding is given explicitly because the
# file holds Chinese text and open() otherwise falls back to the platform's
# default encoding, which is not UTF-8 everywhere.
with open(data_file, encoding="utf-8") as fo:
  develop_set = json.load(fo)

# Quick sanity check: type of one question record and number of documents.
print(type(develop_set[0]["QUESTIONS"][0]))
len(develop_set)

<class 'dict'>


247

In [79]:
# Show the `QTEXT_CN` field of the first question of the first document.
develop_set[0]["QUESTIONS"][0]["QTEXT_CN"]

'苏东坡在中国历史上，是哪一个朝代的人？'

In [80]:
# List the top-level keys of one document record.
print(develop_set[0].keys())

dict_keys(['DID', 'QUESTIONS', 'DTEXT', 'DTEXT_CN', 'SENTS'])


In [91]:
# List the keys of one question record (FGC_Dataset reads `QTEXT_CN` and `SHINT_`).
print(develop_set[0]["QUESTIONS"][0].keys())

dict_keys(['QID', 'QTYPE', 'ATYPE_', 'AMODE_', 'QTEXT', 'QTEXT_CN', 'SENTS', 'SHINT_', 'ANSWER', 'ASPAN', 'AMODE', 'ATYPE', 'AHINT', 'SHINT'])


In [81]:
# Show the text of the first sentence of the first document.
develop_set[0]["SENTS"][0]["text"]

'苏轼（1037年1月8日－1101年8月24日），'

In [88]:
# Check how the tokenizer splits that sentence into subwords.
tokenizer.tokenize(develop_set[0]["SENTS"][0]["text"])

['苏',
 '轼',
 '（',
 '103',
 '##7',
 '年',
 '1',
 '月',
 '8',
 '日',
 '－',
 '110',
 '##1',
 '年',
 '8',
 '月',
 '24',
 '日',
 '）',
 '，']

In [20]:
class BertSER:
    """Placeholder class; no behaviour is implemented yet."""

    def __init__(self, model_name):
        """Accept ``model_name`` without storing or using it."""

In [117]:
class FGC_Dataset(Dataset):
    """
        (question, sentence) pairs built from an FGC release JSON file.

        usage FGC_Dataset(file_path, mode, tokenizer)
        for tokenizer:
            any object with tokenize(text) and convert_tokens_to_ids(tokens), e.g.
            PRETRAINED_MODEL_NAME = "bert-base-chinese"
            tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
        for file_path:
            something like ./FGC_release_1.7.13/FGC_release_all_dev.json
        for mode:
            ["train", "develop", "test"]

        Only the first question of each document (instance["QUESTIONS"][0])
        is used; it is paired with every sentence in instance["SENTS"]. A
        pair is labelled True when the sentence index appears in that
        question's "SHINT_" list. The "SHINT_" key is read in every mode.

        Each item is a tuple (tokens_tensor, segments_tensor, label_tensor):
            tokens_tensor   ids of [CLS] question [SEP] sentence [SEP]
            segments_tensor 0 for [CLS] question [SEP], 1 for sentence [SEP];
                            same length as tokens_tensor
            label_tensor    torch.tensor(bool), or None when mode == "test"
    """
    # read, preprocessing
    def __init__(self, data_file_ref, mode, tokenizer):
        # load raw json
        assert mode in ["train", "develop", "test"]
        self.mode = mode
        # Explicit encoding: the file holds Chinese text and the platform
        # default encoding is not UTF-8 everywhere.
        with open(data_file_ref, encoding="utf-8") as fo:
            self.raw_data = json.load(fo)
        self.tokenizer = tokenizer
        self.tokenlized_pair = None

        # generate raw pairs of (question, sentence, is_supporting_evidence)
        self.raw_pair = list()
        for instance in self.raw_data:
            question = instance["QUESTIONS"][0]
            q = question["QTEXT_CN"]
            for idx, sent in enumerate(instance["SENTS"]):
                # check if is supporting evidence
                lab = idx in question["SHINT_"]
                self.raw_pair.append((q, sent["text"], lab))

        # generate tensors
        self.dat = [
            self._encode_pair(q, sent, label)
            for q, sent, label in self.raw_pair
        ]

    def _encode_pair(self, q, sent, label):
        """Encode one raw pair as (tokens_tensor, segments_tensor, label_tensor)."""
        # Compare by value; `is not` on a string literal tests identity and
        # only worked by accident of interning.
        if self.mode != "test":
            label_tensor = torch.tensor(label)
        else:
            label_tensor = None

        # first segment: [CLS] + question subwords + [SEP]
        first_segment = ["[CLS]", *self.tokenizer.tokenize(q), "[SEP]"]
        # second segment: sentence subwords + [SEP]
        second_segment = [*self.tokenizer.tokenize(sent), "[SEP]"]

        # subwords to ids, ids to torch tensor
        ids = self.tokenizer.convert_tokens_to_ids(first_segment + second_segment)
        tokens_tensor = torch.tensor(ids)

        # One segment id per token. The second run must cover only the second
        # segment (not the running total), so both tensors have equal length.
        segments_tensor = torch.tensor(
            [0] * len(first_segment) + [1] * len(second_segment),
            dtype=torch.long,
        )
        return tokens_tensor, segments_tensor, label_tensor

    # get one data of index idx
    def __getitem__(self, idx):
        return self.dat[idx]

    def __len__(self):
        return len(self.dat)

In [118]:
# Build the dataset from the development split with the shared tokenizer.
fgc_ds = FGC_Dataset(data_file, "develop", tokenizer)

In [119]:
# Decode the token ids of the first item back to subwords as a sanity check.
tokenizer.convert_ids_to_tokens(fgc_ds[0][0])

['[CLS]',
 '苏',
 '东',
 '坡',
 '在',
 '中',
 '国',
 '历',
 '史',
 '上',
 '，',
 '是',
 '哪',
 '一',
 '个',
 '朝',
 '代',
 '的',
 '人',
 '？',
 '[SEP]',
 '苏',
 '轼',
 '（',
 '103',
 '##7',
 '年',
 '1',
 '月',
 '8',
 '日',
 '－',
 '110',
 '##1',
 '年',
 '8',
 '月',
 '24',
 '日',
 '）',
 '，',
 '[SEP]']