In [112]:
import json
import itertools
from pathlib import Path
import numpy as np
import pandas as pd
from re import search
from typing import List, Tuple, Any, Union
import nltk
from transformers import AutoTokenizer
import torch
from datasets import Dataset

def load_json(file: str):
    """Parse the JSON document stored at `file` and return the resulting object."""
    # Read as bytes so the json module detects the text encoding itself.
    with open(file, "rb") as handle:
        return json.load(handle)

# Read the annotated CSV and discard the two bookkeeping columns.
p_data = pd.read_csv("./dataset/train.csv").drop(columns=["Unnamed: 6", "total no.: 7987"])
# Pre-computed split file: maps each split name to the ids that belong to it.
split_ids = load_json('./dataset/splitIds__splitBy-id_stratifyBy-s_train-0.6_valid-0.2_test-0.2_seed-42.json')
# Select the rows of each split by id membership.
train_data = p_data[p_data.id.isin(split_ids["train"])]
valid_data = p_data[p_data.id.isin(split_ids["valid"])]
test_data = p_data[p_data.id.isin(split_ids["test"])]
# Row counts per split (train, valid, test).
print(train_data.shape[0],valid_data.shape[0],test_data.shape[0])

22919 7728 7699


In [113]:
# Notebook-wide settings shared by the cells below.
args = {
    # Fixed length every formatted input sequence is padded/cut to.
    "max_len" : 512,
    # Per-device batch size passed to TrainingArguments for training and evaluation.
    "batch_size" : 8,
    # Checkpoint name used to load the tokenizer.
    "model_name" : "bert-base-uncased"
}

### Functions

In [125]:
def contains(small, big):
    """Locate the first contiguous occurrence of `small` inside `big`.

    Returns:
        A tuple ``(start, end)`` with ``end`` exclusive (``end - start == len(small)``)
        for the first match, or ``False`` when `small` does not occur in `big`.
        An empty `small` matches at ``(0, 0)``.
    """
    width = len(small)
    for start in range(len(big) - width + 1):
        # A window matches when no aligned pair of elements differs.
        mismatch = any(big[start + offset] != small[offset] for offset in range(width))
        if not mismatch:
            return start, start + width
    return False

def keep_continuous(data: pd.DataFrame):
    """Keep the rows whose q' and r' each appear verbatim inside q and r.

    The first and last character of every field are dropped before the
    substring check. Rows are returned in their original order.
    """
    kept_positions = []
    fields = zip(data['q'], data['q\''], data['r'], data['r\''])
    for position, (q_text, qp_text, r_text, rp_text) in enumerate(fields):
        qp_inside_q = qp_text[1:-1] in q_text[1:-1]
        rp_inside_r = rp_text[1:-1] in r_text[1:-1]
        if qp_inside_q and rp_inside_r:
            kept_positions.append(position)

    return data.iloc[kept_positions]

# Format one tokenized example for the q' span model.
# token_type_ids: the leading "[CLS] q [SEP]" segment (its [SEP] included) is 0, everything after it is 1.
# attention_mask: 1 where there is a real token, 0 on padding.
# TODO: pad with tokenizer.pad_token_id instead of the literal 0
def format_data_qp(q: List[int], r: List[int], s: List[int], qp: List[int], rp: List[int]) -> Tuple[List[int], List[int], List[int], int, int]:
    """Build fixed-length inputs and span labels for locating q' inside q.

    The sequence is laid out as ``[CLS] q [SEP] r [SEP] s [SEP]`` and then
    padded with 0 (or cut) to ``args['max_len']`` entries.

    Args:
        q, r, s: token-id lists without special tokens.
        qp: token ids of the target span; searched for in the assembled sequence.
        rp: not used here; kept so both formatters share one signature.

    Returns:
        ``(input_ids, token_type_ids, attention_mask, start_pos, end_pos)``.
        ``start_pos`` / ``end_pos`` are the positions of the first and the
        last token of the first occurrence of `qp` (both inclusive), or
        ``(0, 0)`` when `qp` is empty or does not occur.
    """
    max_len = args['max_len']
    first_segment_len = len(q) + 2  # [CLS] + q + [SEP]

    q_r_s = [clsid] + q + [sepid] + r + [sepid] + s + [sepid]
    seq_len = len(q_r_s)

    token_type_id = [0 if pos < first_segment_len else 1 for pos in range(max_len)]
    attention_mask = [1 if pos < seq_len else 0 for pos in range(max_len)]
    input_id = [q_r_s[pos] if pos < seq_len else 0 for pos in range(max_len)]

    # Search once; `contains` returns (start, exclusive_end) or False.
    span = contains(qp, q_r_s)
    if span and span[1] > span[0]:
        # The end label must point AT the last span token: the decoding step
        # reads offset_mapping[end_index][1], i.e. treats the end as inclusive.
        start_pos, end_pos = span[0], span[1] - 1
    else:
        start_pos, end_pos = 0, 0

    return input_id, token_type_id, attention_mask, start_pos, end_pos

def format_data_rp(q: List[int], r: List[int], s: List[int], qp: List[int], rp: List[int]) -> Tuple[List[int], List[int], List[int], int, int]:
    """Build fixed-length inputs and span labels for locating r' inside r.

    The sequence is laid out as ``[CLS] r [SEP] q [SEP] s [SEP]`` and then
    padded with 0 (or cut) to ``args['max_len']`` entries. token_type_ids are
    0 over the leading ``[CLS] r [SEP]`` segment and 1 afterwards;
    attention_mask is 1 on real tokens and 0 on padding.

    Args:
        q, r, s: token-id lists without special tokens.
        qp: not used here; kept so both formatters share one signature.
        rp: token ids of the target span; searched for in the assembled sequence.

    Returns:
        ``(input_ids, token_type_ids, attention_mask, start_pos, end_pos)``.
        ``start_pos`` / ``end_pos`` are the positions of the first and the
        last token of the first occurrence of `rp` (both inclusive), or
        ``(0, 0)`` when `rp` is empty or does not occur.
    """
    max_len = args['max_len']
    first_segment_len = len(r) + 2  # [CLS] + r + [SEP]

    r_q_s = [clsid] + r + [sepid] + q + [sepid] + s + [sepid]
    seq_len = len(r_q_s)

    token_type_id = [0 if pos < first_segment_len else 1 for pos in range(max_len)]
    attention_mask = [1 if pos < seq_len else 0 for pos in range(max_len)]
    input_id = [r_q_s[pos] if pos < seq_len else 0 for pos in range(max_len)]

    # Search once; `contains` returns (start, exclusive_end) or False.
    span = contains(rp, r_q_s)
    if span and span[1] > span[0]:
        # The end label must point AT the last span token: the decoding step
        # reads offset_mapping[end_index][1], i.e. treats the end as inclusive.
        start_pos, end_pos = span[0], span[1] - 1
    else:
        start_pos, end_pos = 0, 0

    return input_id, token_type_id, attention_mask, start_pos, end_pos

### Tokenizer

In [126]:
# Load the tokenizer for the checkpoint named in args["model_name"].
tokenizer = AutoTokenizer.from_pretrained(args["model_name"])
# Special-token ids used when the input sequences are assembled by hand.
clsid = tokenizer.cls_token_id
sepid = tokenizer.sep_token_id
def model_tokenize(text: str) -> List[int]:
    """Tokenize `text` (surrounding double quotes removed) into token ids.

    The first and last ids produced by the tokenizer are dropped so the
    result carries no cls/sep token.
    """
    encoded = tokenizer(text.strip('"'))
    return encoded["input_ids"][1:-1]

loading configuration file config.json from cache at /home/cwhuang/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at /home/cwhuang/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/vocab.txt

In [127]:
def preprocess(data: pd.DataFrame, choice:str):
    """Turn a raw split into a `Dataset` of fixed-length QA training features.

    Steps: drop rows whose q'/r' are not verbatim substrings (keep_continuous),
    tokenize every field, drop examples that would not fit in
    ``args['max_len']`` tokens, then format each example and label its span.

    Args:
        data: frame with the columns ``id``, ``q``, ``r``, ``s``, ``q'``, ``r'``.
        choice: ``'qp'`` to label the q' span, ``'rp'`` to label the r' span.

    Returns:
        A `Dataset` with the columns ``input_ids``, ``token_type_ids``,
        ``attention_mask``, ``start_positions`` and ``end_positions``.

    Raises:
        ValueError: if `choice` is neither ``'qp'`` nor ``'rp'``.
    """
    if choice == 'qp':
        formatter = format_data_qp
    elif choice == 'rp':
        formatter = format_data_rp
    else:
        # Fail loudly: a sentinel return value would only crash later, inside the Trainer.
        raise ValueError(f"choice must be 'qp' or 'rp', got {choice!r}")

    data = keep_continuous(data)
    print('ids left:', data['id'].nunique())
    print('instances left', data.shape[0])
    Q, R, S, QP, RP = [list(map(model_tokenize, data[field])) for field in ["q", "r", "s", "q'", "r'"]]

    # Keep only examples whose full sequence fits:
    # [CLS] q [SEP] r [SEP] s [SEP] -> len(q) + len(r) + len(s) + 4 special tokens.
    token_budget = args['max_len'] - 4
    keep = [i for i in range(len(Q)) if len(Q[i]) + len(R[i]) + len(S[i]) <= token_budget]
    count = len(Q) - len(keep)
    print(f"Q+R+S longer than {args['max_len']} tokens:", count, " Remains:",len(keep))

    # Format every surviving example and locate its start/end positions.
    rows = [formatter(Q[i], R[i], S[i], QP[i], RP[i]) for i in keep]
    if rows:
        input_list, token_list, attention_list, s_pos, e_pos = (list(column) for column in zip(*rows))
    else:
        input_list, token_list, attention_list, s_pos, e_pos = [], [], [], [], []

    # The key must be `attention_mask` (singular), the name the tokenizer output
    # and the model's forward() use; a column the model does not accept is
    # removed by the Trainer, which would leave padding unmasked.
    features = {
        'input_ids': input_list,
        'token_type_ids': token_list,
        'attention_mask': attention_list,
        'start_positions': s_pos,
        'end_positions': e_pos
    }

    return Dataset.from_dict(features)

In [124]:
# Build the training and validation datasets labelled with the q' span.
train_data_qp_done=preprocess(train_data, 'qp')
valid_data_qp_done=preprocess(valid_data, 'qp')

# Build the same two splits labelled with the r' span.
train_data_rp_done=preprocess(train_data, 'rp')
valid_data_rp_done=preprocess(valid_data, 'rp')

Token indices sequence length is longer than the specified maximum sequence length for this model (667 > 512). Running this sequence through the model will result in indexing errors


ids left: 4736
instances left 17834
Q+R+S longer than 512 tokens: 494  Remains: 17340
Q No span
[2002, 2210, 2611, 2356, 1996, 2879, 1024, 6838, 1010, 2079, 2017, 2156, 1996, 3392, 2648, 1029, 6838, 1024, 2748, 1012, 2210, 2611, 1024, 6838, 2079, 2017, 2156, 1996, 5568, 2648, 1029, 6838, 1024, 2748, 4757, 4757, 2015, 1006, 2893, 5458, 1997, 1996, 3980, 2011, 2023, 2051, 1007, 1012, 2210, 2611, 1024, 2106, 2017, 2156, 1996, 3712, 1029, 6838, 1024, 2748, 4757, 4757, 2015, 2210, 2611, 1024, 6838, 1010, 2079, 2017, 2156, 1996, 3836, 1029, 6838, 1024, 2748, 2210, 2611, 1024, 2079, 2017, 2156, 2014, 4167, 1029, 6838, 1024, 2053, 2210, 2611, 1024, 2059, 2429, 2000, 2054, 2057, 2020, 4036, 2651, 1999, 2082, 1010, 2016, 2442, 2025, 2031, 2028, 999]
[101, 2182, 2003, 2242, 2005, 2035, 2017, 12873, 23503, 2015, 2028, 2154, 1037, 1020, 2095, 2214, 2611, 2001, 3564, 1999, 1037, 9823, 1012, 1996, 3836, 2001, 2183, 2000, 4863, 6622, 2000, 1996, 2336, 1012, 1996, 3836, 2356, 1037, 2210, 2879, 1024, 68

[CLS] not if you have permission to do one from the buyer . if i sold a firearm , and did n ' t know the person so well , i would do one . i have contacts in businesses who can do them for me for free . its just a smart thing to do . should it be the law ? no way . but i would do it . better safe than sorry in this instance . [SEP] R No span
[4392, 2438, 5404, 2000, 2131, 1037, 12610, 1998, 5610, 2438, 8962, 2000, 2131, 1037, 12610, 2296, 2154, 1998, 1996, 5404, 4392, 2121, 2097, 2022, 4788, 1997]
[101, 11581, 6026, 1029, 1045, 3984, 2061, 1010, 2144, 1045, 1005, 1049, 2667, 2000, 4863, 2339, 2009, 1005, 1055, 10021, 2005, 2009, 2000, 2022, 6206, 1012, 2021, 1010, 3984, 2054, 1010, 2017, 2024, 11581, 6026, 2339, 5404, 2003, 3423, 1012, 1998, 2017, 1005, 2128, 2061, 3308, 2055, 5948, 5404, 2005, 2753, 2086, 1012, 4392, 2438, 5404, 2000, 2131, 1037, 12610, 1998, 5610, 2438, 8962, 2000, 2131, 1037, 12610, 2296, 2154, 1998, 1996, 5404, 4392, 2121, 2097, 2022, 4788, 2125, 1012, 2021, 2017, 

### Model / Collator / Trainer

In [9]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import default_data_collator

# Fine-tune a question-answering head on the q' datasets.
model_name = "bert-large-uncased"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Checkpoints are written under "<model_name>-finetuned-aicup-qp"; evaluation runs once per epoch.
model_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-aicup-qp",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=args['batch_size'],
    per_device_eval_batch_size=args['batch_size'],
    num_train_epochs=5,
    weight_decay=0.01,
)

# Features are already padded to a fixed length, so the default collator is used as-is.
data_collator = default_data_collator

trainer = Trainer(
    model=model,
    args=model_args,
    train_dataset=train_data_qp_done,
    eval_dataset=valid_data_qp_done,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

loading configuration file config.json from cache at /home/cwhuang/.cache/huggingface/hub/models--bert-large-uncased/snapshots/80792f8e8216b29f3c846b653a0ff0a37c210431/config.json
Model config BertConfig {
  "_name_or_path": "bert-large-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /home/cwhuang/.cache/huggingface/hub/models--bert-large-uncased/snapshots/80792f8e8216b29f3c846b653a0ff0

OutOfMemoryError: Caught OutOfMemoryError in replica 2 on device 2.
Original Traceback (most recent call last):
  File "/nfs/nas-7.1/cwhuang/miniconda3/envs/IRhw/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 64, in _worker
    output = module(*input, **kwargs)
  File "/nfs/nas-7.1/cwhuang/miniconda3/envs/IRhw/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
    return forward_call(*input, **kwargs)
  File "/nfs/nas-7.1/cwhuang/miniconda3/envs/IRhw/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py", line 1843, in forward
    outputs = self.bert(
  File "/nfs/nas-7.1/cwhuang/miniconda3/envs/IRhw/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
    return forward_call(*input, **kwargs)
  File "/nfs/nas-7.1/cwhuang/miniconda3/envs/IRhw/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py", line 1014, in forward
    encoder_outputs = self.encoder(
  File "/nfs/nas-7.1/cwhuang/miniconda3/envs/IRhw/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
    return forward_call(*input, **kwargs)
  File "/nfs/nas-7.1/cwhuang/miniconda3/envs/IRhw/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py", line 603, in forward
    layer_outputs = layer_module(
  File "/nfs/nas-7.1/cwhuang/miniconda3/envs/IRhw/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
    return forward_call(*input, **kwargs)
  File "/nfs/nas-7.1/cwhuang/miniconda3/envs/IRhw/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py", line 489, in forward
    self_attention_outputs = self.attention(
  File "/nfs/nas-7.1/cwhuang/miniconda3/envs/IRhw/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
    return forward_call(*input, **kwargs)
  File "/nfs/nas-7.1/cwhuang/miniconda3/envs/IRhw/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py", line 419, in forward
    self_outputs = self.self(
  File "/nfs/nas-7.1/cwhuang/miniconda3/envs/IRhw/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
    return forward_call(*input, **kwargs)
  File "/nfs/nas-7.1/cwhuang/miniconda3/envs/IRhw/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py", line 341, in forward
    attention_scores = attention_scores / math.sqrt(self.attention_head_size)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB (GPU 2; 47.46 GiB total capacity; 28.68 GiB already allocated; 103.44 MiB free; 28.97 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


In [None]:
# Save the model held by the current trainer to the "aicup-trained-qp-bert-large" directory.
trainer.save_model("aicup-trained-qp-bert-large")

In [44]:
# Fine-tune a fresh question-answering head on the r' datasets.
model_name = "bert-large-uncased"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Checkpoints are written under "<model_name>-finetuned-aicup-rp"; evaluation runs once per epoch.
model_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-aicup-rp",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=args['batch_size'],
    per_device_eval_batch_size=args['batch_size'],
    num_train_epochs=5,
    weight_decay=0.01,
)

# Features are already padded to a fixed length, so the default collator is used as-is.
data_collator = default_data_collator

trainer = Trainer(
    model=model,
    args=model_args,
    train_dataset=train_data_rp_done,
    eval_dataset=valid_data_rp_done,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

loading configuration file config.json from cache at /home/cwhuang/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /home/cwhuang/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891c

NameError: name 'default_data_collator' is not defined

In [None]:
# Save the model held by the current trainer to the "aicup-trained-rp-bert-large" directory.
trainer.save_model("aicup-trained-rp-bert-large")

### Predict Q' and R'

In [5]:
def leave_unique_id(data: pd.DataFrame):
    """Return the first row for every distinct value of the ``id`` column.

    Rows keep their original order and index labels. The number of kept rows
    is printed twice (row count and distinct-id count, which are equal).

    Note: missing ids (NaN) are treated as one repeated value; ids are
    presumably never missing in this dataset - TODO confirm.
    """
    # Vectorized first-occurrence mask instead of a per-row `id in list` scan,
    # which costs quadratic time on long frames.
    first_occurrence = ~data['id'].duplicated(keep='first')
    unique_rows = data[first_occurrence.to_numpy()]
    kept = unique_rows.shape[0]
    print(kept, kept)
    return unique_rows

def format_data_post_qp(q: str, r: str, s: str, qp: str, rp: str, ids: str):
    """Encode one example for q' prediction, keeping offsets only for q tokens.

    The text ``q[SEP]r[SEP]s`` is tokenized, padded/truncated to
    ``args['max_len']`` and returned with hand-built ``token_type_ids``, the
    example id, and an ``offset_mapping`` whose entries outside the q segment
    are ``None`` so that answers can only be decoded from q.

    Args:
        q, r, s: raw text fields.
        qp, rp: not used; accepted so the caller can pass all fields uniformly.
        ids: identifier stored under ``example_id``.

    Returns:
        The tokenizer output with ``token_type_ids``, ``example_id`` and
        ``offset_mapping`` replaced as described.
    """
    # One length for both the tokenizer padding and the hand-built
    # token_type_ids, so the two columns cannot disagree if max_len changes.
    max_len = args['max_len']

    q_r_s = q + '[SEP]' + r + '[SEP]' + s
    tokenized_q_r_s = tokenizer(q_r_s, return_offsets_mapping=True, padding="max_length", max_length=max_len, truncation=True)

    # Token count of q encoded on its own (includes whatever special tokens the tokenizer adds).
    first_segment_len = len(tokenizer(q)["input_ids"])
    tokenized_q_r_s["token_type_ids"] = [0 if pos < first_segment_len else 1 for pos in range(max_len)]
    tokenized_q_r_s["example_id"] = ids

    # Keep offsets for positions before the end of the q segment; presumably
    # position 0 is [CLS] and the last q-segment position is [SEP] - both are masked.
    offsets = tokenized_q_r_s["offset_mapping"]
    masked_offsets = [offsets[pos] if pos < first_segment_len - 1 else None for pos in range(len(offsets))]
    masked_offsets[0] = None
    tokenized_q_r_s["offset_mapping"] = masked_offsets
    return tokenized_q_r_s

def format_data_post_rp(q: str, r: str, s: str, qp: str, rp: str, ids: str):
    """Encode one example for r' prediction, keeping offsets only for r tokens.

    The text ``r[SEP]q[SEP]s`` is tokenized, padded/truncated to
    ``args['max_len']`` and returned with hand-built ``token_type_ids``, the
    example id, and an ``offset_mapping`` whose entries outside the r segment
    are ``None`` so that answers can only be decoded from r.

    Args:
        q, r, s: raw text fields.
        qp, rp: not used; accepted so the caller can pass all fields uniformly.
        ids: identifier stored under ``example_id``.

    Returns:
        The tokenizer output with ``token_type_ids``, ``example_id`` and
        ``offset_mapping`` replaced as described.
    """
    # One length for both the tokenizer padding and the hand-built
    # token_type_ids, so the two columns cannot disagree if max_len changes.
    max_len = args['max_len']

    r_q_s = r + '[SEP]' + q + '[SEP]' + s
    tokenized_r_q_s = tokenizer(r_q_s, return_offsets_mapping=True, padding="max_length", max_length=max_len, truncation=True)

    # Token count of r encoded on its own (includes whatever special tokens the tokenizer adds).
    first_segment_len = len(tokenizer(r)["input_ids"])
    tokenized_r_q_s["token_type_ids"] = [0 if pos < first_segment_len else 1 for pos in range(max_len)]
    tokenized_r_q_s["example_id"] = ids

    # Keep offsets for positions before the end of the r segment; presumably
    # position 0 is [CLS] and the last r-segment position is [SEP] - both are masked.
    offsets = tokenized_r_q_s["offset_mapping"]
    masked_offsets = [offsets[pos] if pos < first_segment_len - 1 else None for pos in range(len(offsets))]
    masked_offsets[0] = None
    tokenized_r_q_s["offset_mapping"] = masked_offsets
    return tokenized_r_q_s
        
def postprocess(data: pd.DataFrame, choice: str):
    """Build the inference `Dataset` for the q' or the r' model.

    Args:
        data: frame with the columns ``id``, ``q``, ``r``, ``s``, ``q'``, ``r'``.
        choice: ``'qp'`` to encode for q' prediction, ``'rp'`` for r' prediction.

    Returns:
        A `Dataset` with the columns ``input_ids``, ``token_type_ids``,
        ``attention_mask``, ``offset_mapping`` and ``example_id``.

    Raises:
        ValueError: if `choice` is neither ``'qp'`` nor ``'rp'``.
    """
    if choice == 'qp':
        formatter = format_data_post_qp
    elif choice == 'rp':
        formatter = format_data_post_rp
    else:
        # Without this check an unknown choice would fall through and fail
        # later with an unrelated-looking lookup error.
        raise ValueError(f"choice must be 'qp' or 'rp', got {choice!r}")

    # Note: keep_continuous() is intentionally not applied at prediction time.
    print('ids left:', data['id'].nunique())
    print('instances left', data.shape[0])
    ids = list(data.id)
    # Remove the surrounding double quotes from every text field.
    Q, R, S, QP, RP = [[text.strip('"') for text in data[field]] for field in ["q", "r", "s", "q'", "r'"]]

    encoded = [formatter(*fields) for fields in zip(Q, R, S, QP, RP, ids)]

    # Re-shape the per-example encodings into one list per column.
    columns = {
        'input_ids': [example["input_ids"] for example in encoded],
        'token_type_ids': [example["token_type_ids"] for example in encoded],
        'attention_mask': [example["attention_mask"] for example in encoded],
        'offset_mapping': [example["offset_mapping"] for example in encoded],
        'example_id': [example["example_id"] for example in encoded]
    }

    return Dataset.from_dict(columns)


In [6]:
# Reduce the test split to one row per id, then encode it once for q' and once for r' prediction.
test_data_unique = leave_unique_id(test_data)
test_qp_post = postprocess(test_data_unique, 'qp')
test_rp_post = postprocess(test_data_unique, 'rp')

Token indices sequence length is longer than the specified maximum sequence length for this model (778 > 512). Running this sequence through the model will result in indexing errors


1598 1598
ids left: 1598
instances left 1598
ids left: 1598
instances left 1598


In [81]:
# load model
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
# Load the fine-tuned q' checkpoint saved at step 5000.
model = AutoModelForQuestionAnswering.from_pretrained("./bert-large-uncased-finetuned-aicup-qp/checkpoint-5000")

# Prediction-only configuration: no training, evaluation batch size 16 per device.
test_args = TrainingArguments(
    output_dir = "bert-large-uncased-finetuned-aicup-qp/checkpoint-5000",
    do_train = False,
    do_predict = True,
    per_device_eval_batch_size = 16,   
)

# init trainer
trainer = Trainer(model = model, args = test_args)
# Run the model over the q' inference dataset; `.predictions` is later unpacked into start and end logits.
raw_predictions_qp = trainer.predict(test_qp_post)

loading configuration file ./bert-large-uncased-finetuned-aicup-qp/checkpoint-5000/config.json
Model config BertConfig {
  "_name_or_path": "./bert-large-uncased-finetuned-aicup-qp/checkpoint-5000",
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file ./bert-large-uncased-finetuned-aicup-qp/checkpoint-5000/pytorch_model.bin
All model checkpoint weights were used when initializing BertForQuestionA

In [82]:
# Load the fine-tuned r' checkpoint saved at step 5000.
model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-finetuned-aicup-rp/checkpoint-5000")

# Prediction-only configuration: no training, evaluation batch size 16 per device.
test_args = TrainingArguments(
    output_dir = "bert-large-uncased-finetuned-aicup-rp/checkpoint-5000",
    do_train = False,
    do_predict = True,
    per_device_eval_batch_size = 16,   
)

# init trainer
trainer = Trainer(model = model, args = test_args)
# Run the model over the r' inference dataset; `.predictions` is later unpacked into start and end logits.
raw_predictions_rp = trainer.predict(test_rp_post)

loading configuration file bert-large-uncased-finetuned-aicup-rp/checkpoint-5000/config.json
Model config BertConfig {
  "_name_or_path": "bert-large-uncased-finetuned-aicup-rp/checkpoint-5000",
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file bert-large-uncased-finetuned-aicup-rp/checkpoint-5000/pytorch_model.bin
All model checkpoint weights were used when initializing BertForQuestionAnsweri

In [83]:
# Keep each dataset's current format type but expose every feature column again.
# NOTE(review): presumably needed so offset_mapping / example_id are readable after predict() - confirm.
test_qp_post.set_format(type=test_qp_post.format["type"], columns=list(test_qp_post.features.keys()))
test_rp_post.set_format(type=test_rp_post.format["type"], columns=list(test_rp_post.features.keys()))

In [84]:
from tqdm.auto import tqdm
import collections

def postprocess_qa_predictions(examples, features, raw_predictions, choice, n_best_size = 20, max_answer_length = 510):
    """Decode the best answer span of every example back into text.

    Args:
        examples: DataFrame with an ``id`` column and the text columns ``q`` / ``r``;
            row ``i`` corresponds to ``features[i]``.
        features: indexable collection whose items expose ``offset_mapping``
            (``None`` for positions that may not be part of an answer).
        raw_predictions: pair ``(all_start_logits, all_end_logits)``, indexed by example.
        choice: ``'qp'`` to cut the answer out of ``q``, ``'rp'`` to cut it out of ``r``.
        n_best_size: how many of the highest start and end logits are combined.
        max_answer_length: maximum span length in tokens.

    Returns:
        ``OrderedDict`` mapping each example id to the predicted text
        (empty string when no valid span exists).

    Raises:
        ValueError: if `choice` is neither ``'qp'`` nor ``'rp'``.
    """
    if choice == 'qp':
        context_field = "q"
    elif choice == 'rp':
        context_field = "r"
    else:
        raise ValueError(f"choice must be 'qp' or 'rp', got {choice!r}")

    all_start_logits, all_end_logits = raw_predictions
    predictions = collections.OrderedDict()

    for example_index in range(examples.shape[0]):
        example = examples.iloc[example_index]
        # Remove the surrounding quotes the same way the text is prepared
        # before tokenization, so character offsets index into the same string.
        context = example[context_field].strip('"')

        start_logits = all_start_logits[example_index]
        end_logits = all_end_logits[example_index]
        # Maps token positions to (start_char, end_char) in the context.
        offset_mapping = features[example_index]["offset_mapping"]

        # Candidate positions: the `n_best_size` highest start and end logits, best first.
        start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
        end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

        valid_answers = []
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip positions that are out of bounds or outside the answerable context.
                if (
                    start_index >= len(offset_mapping)
                    or end_index >= len(offset_mapping)
                    or offset_mapping[start_index] is None
                    or offset_mapping[end_index] is None
                ):
                    continue
                # Skip spans with negative length or longer than max_answer_length.
                if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                    continue

                start_char = offset_mapping[start_index][0]
                end_char = offset_mapping[end_index][1]
                valid_answers.append(
                    {
                        "score": start_logits[start_index] + end_logits[end_index],
                        "text": context[start_char: end_char]
                    }
                )

        if valid_answers:
            # First highest-scoring candidate wins.
            best_answer = max(valid_answers, key=lambda answer: answer["score"])
        else:
            # No valid span at all: fall back to an empty prediction.
            best_answer = {"text": "", "score": 0.0}

        predictions[example["id"]] = best_answer["text"]

    return predictions


In [85]:
# Decode the raw start/end logits of each model into predicted text, keyed by example id.
final_predictions_qp = postprocess_qa_predictions(test_data_unique, test_qp_post, raw_predictions_qp.predictions, 'qp')
final_predictions_rp = postprocess_qa_predictions(test_data_unique, test_rp_post, raw_predictions_rp.predictions, 'rp')

In [86]:
# Print the whole id -> predicted r' mapping for a visual check.
print(final_predictions_rp)

OrderedDict([(9, 'based on the idea that people are dispensible , particularly if they obstruct your well-being . a'), (12, ''), (13, 'You have to be off for more than 4 days and after that I would get the princely sum of Â£79.15 per week .'), (37, 'Then you freely admit that you lied when you said this , and I quote : People like Arch are setting it as opposed to science and in that position it will be doomed to fail . AND Arch is doing nothing but distorting the very thing he holds so dear ,'), (38, 'wow , I never thought a qoute would be so stupid as to compare it to a house . Fine'), (39, 'In a way , taking rights away is an American value since we have prescribed methods for doing it . For'), (41, 'The coelacanths found are different from those in the fossil record . Further'), (60, 'Your making a bunch of arbitrary statments .'), (78, 'The situation has nothing to do with chance . People'), (79, 'or just ship it to a frend where it is legal to sell it your self ,'), (82, "So Bloo

In [87]:
# Assemble one row per example id: id, predicted q', predicted r'.
ids = list(final_predictions_qp.keys())
qp = list(final_predictions_qp.values())
# Look each r' up by id so the two predictions of a row always belong to the
# same example, instead of relying on both mappings sharing one iteration order.
rp = [final_predictions_rp[example_id] for example_id in ids]

# Named `prediction_columns` rather than `dict`, so the builtin `dict` is not
# shadowed for every cell that runs afterwards in this kernel.
prediction_columns = {'id': ids, "q'": qp, "r'": rp}
df = pd.DataFrame(prediction_columns)
# Write missing values as empty strings.
df = df.replace(np.nan, '', regex=True)
df.to_csv('./outputs/predict_test_bert-large-ckp5000.csv',header=False)

In [73]:
# Gold reference frame: the test split without its q / r / s columns, missing values as ''.
test_dropped = test_data.drop(columns=['q', 'r', 's']).fillna('')

In [74]:
# Write the gold reference frame to CSV without a header row (the index is written as the first column).
test_dropped.to_csv('test_gold_622.csv',header=False)