## [open in Kaggle](https://www.kaggle.com/meishidou/baseline-nb)

# Prerequisites

In [None]:
import json
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
import transformers

# local packages located in the current (.) directory
from bichoice import utils
from baseline.model import BertForClassification
from baseline.data_processor import DataProcessor

In [None]:
# This notebook is pinned to one exact transformers release.
assert transformers.__version__ == '4.1.1'

# Namespace holding the paths read by the rest of the notebook.
D = utils.GlobalSettings(dict(
    DATADIR='./data/',
    # checkpoint-7 is the checkpoint inspected below
    MODELDIR='./outputs/baseline_output/checkpoint-7/',
))

# Restore the training arguments stored next to the checkpoint and show them.
argD = utils.GlobalSettings(
    torch.load(os.path.join(D.MODELDIR, 'training_args.bin')))
print('this model is trained with following hyper parameters:')
print(argD)

# Show Some Examples

In [None]:
# Build the project's data processor on the configured data directory.
processor = DataProcessor(D.DATADIR)
# Load the BERT tokenizer named in the saved training arguments.
tokenizer = transformers.BertTokenizer.from_pretrained(argD.model_name)
# The first entry of `processor.dataset` is used here as the training split.
train = processor.dataset[0]
# Pick one example at random from that split.
train_e = random.choice(train)

In [None]:
def show_baseline_example(e):
    '''Print every field of a single `baseline.InputExample` object.

    The three text fields come first, each as a header line followed by an
    indented value line; the label and the guid follow on one line each.
    '''
    for field in ('text_a', 'text_b', 'text_c'):
        print(field + ':')
        print('    ', getattr(e, field))
    print('label:', e.label)
    print('guid:', e.guid)

In [None]:
# Expand the sampled `train_e` into baseline examples and print each one
# under a 1-based header.
train_b_e = processor._create_examples([train_e], set_type='train')
for i, e in enumerate(train_b_e):
    print(f'-----EXAMPLE{i + 1}-----')
    show_baseline_example(e)

# Tokenizing Examples

In [None]:
def show_baseline_features(f):
    '''Print the two token sequences of one `baseline.InputFeatures` object.

    Positions whose `input_mask` is not 1 are dropped, the remaining ids are
    split by segment id (0, then 1), converted back to tokens with the
    notebook-level `tokenizer`, and printed joined without a separator.
    '''
    print('-----FIRST TOKEN SEQUENCE-----')
    keep = np.asarray(f.input_mask) == 1
    ids = np.asarray(f.input_ids)[keep]
    segments = np.asarray(f.segment_ids)[keep]
    # Convert both segments before printing either of them.
    first_tokens, second_tokens = (
        tokenizer.convert_ids_to_tokens(ids[segments == seg])
        for seg in (0, 1))
    print(''.join(first_tokens))
    print('-----SECOND TOKEN SEQUENCE-----')
    print(''.join(second_tokens))

In [None]:
# Convert the baseline examples to features and keep the first entry of the
# result, which is iterated below.
train_f_e = processor.convert_examples_to_features(
    train_b_e,
    argD.max_length,
    tokenizer,
)[0]
print('label:', train_b_e[0].label)
for i, f in enumerate(train_f_e):
    print(f'-----EXAMPLE{i + 1}-----')
    show_baseline_features(f)

# Infer with Baseline Model

In [None]:
# Initialize the model and load its weights from the selected checkpoint.
device = 'cuda:0' # not compatible with cpu
model = BertForClassification(argD.model_name)
# Deserialize the weights onto the CPU first: without `map_location`,
# `torch.load` restores every tensor to the device it was saved from, which
# fails when that device does not exist on this machine. The model is moved
# to `device` right after.
model.load_state_dict(
    torch.load(os.path.join(D.MODELDIR, 'model.bin'), map_location='cpu'))
model.to(device)
# Switch to evaluation mode before running inference.
model.eval()

In [None]:
# Build the tensors for the sampled examples and move each one to `device`.
b = tuple(
    tensor.to(device)
    for tensor in processor.get_dataset(train_b_e, tokenizer, argD.max_length)[:]
)
# Forward pass without gradient tracking; labels are passed as well, and the
# logits are read from index 1 of the output.
with torch.no_grad():
    output = model(
        input_ids=b[0],
        attention_mask=b[1],
        token_type_ids=b[2],
        labels=b[3],
    )
logits = output[1].detach().cpu().numpy()
# Index of the highest-scoring option for the first row.
pred = logits.argmax(axis=1)[0]
label = b[3][0]
options = [example.text_b for example in train_b_e]
print('infered answer:', options[pred])
print('correct answer:', options[label])