<a href="https://colab.research.google.com/github/DAlkemade/bert-for-fever/blob/master/L101_tokenize_for_document_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Tokenize the data in a .tsv file of features to use as input for a BERT model

In [0]:
!pip install torch
!pip install transformers
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

In [0]:
import pandas as pd
from google.colab import drive
import torch
from transformers import *
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset, WeightedRandomSampler)
from tqdm import tqdm, trange
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import os
from sklearn.metrics import confusion_matrix
from tqdm import tqdm

In [0]:
# --- Notebook configuration ---

# Directory on the mounted Google Drive where the features are written.
WORK_DIR = '/content/drive/My Drive/Overig'

# When True, only the first 100 rows of the loaded data are processed.
TEST = False

# Passed as max_length when encoding and used as the padding target length.
# A smaller value would leave room for a larger batch size.
MAX_SENTENCE_LENGTH = 512

# Token type id assigned to padding positions, following the example at
# https://github.com/huggingface/transformers/blob/0cb163865a4c761c226b151283309eedb2b1ca4d/transformers/data/processors/glue.py#L30
PADDING_TOKEN_TYPE_ID = 0

In [0]:
# Mount Google Drive so files under /content/drive become accessible.
drive.mount('/content/drive')
# Build the input path from WORK_DIR instead of repeating the directory literal,
# so the location only has to be changed in one place.
data_fname = os.path.join(WORK_DIR, 'document_selection_test_n=50.tsv')

In [0]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Display which device is in use. get_device_name(0) raises on machines without
# CUDA, so it is only queried when a GPU is actually present.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu (no CUDA device available)"

In [0]:
# Load the input table from data_fname.
# NOTE(review): the file has a .tsv extension but is read with read_csv's default
# comma separator — confirm the file really is comma-delimited; if it is
# tab-delimited, sep='\t' is needed here.
data = pd.read_csv(data_fname)
# Preview only the first rows instead of rendering the whole frame.
data.head()

In [0]:
# In TEST mode, continue with only the first 100 rows (presumably for quick trial runs).
if TEST:
    data = data.head(100)

In [0]:
# Report the number of rows and the number of distinct claim ids in the loaded data.
# NOTE(review): the messages say "training" regardless of which file data_fname points to.
print(f"Number of training instances: {data.shape[0]}")
print(f'Number of claims in training set: {len(set(data.claim_id))}')

In [0]:
# Load the pretrained uncased BERT tokenizer (input is lower-cased).
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)
# Vocabulary id of the tokenizer's padding token, used to right-pad input_ids.
(pad_token,) = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])

In [0]:
def prep_instance(claim, context, label, doc_id):
    """Encode one claim/context pair as fixed-length BERT input features.

    The context is prefixed with its document id in the form ``[ <doc_id> ]``,
    encoded together with the claim, and right-padded to MAX_SENTENCE_LENGTH.

    Args:
        claim: Claim text, passed to the tokenizer as the first sequence.
        context: Document text, passed (after prefixing) as the second sequence.
        label: Stored unchanged on the returned features.
        doc_id: Identifier rendered into the context prefix.

    Returns:
        InputFeatures holding the padded input_ids, attention_mask and
        token_type_ids together with the label.
    """
    prefixed_context = f'[ {doc_id} ] {context}'
    # max_length is expected to cut off pairs that are too long (i.e. long
    # documents get truncated) — TODO confirm for the installed transformers version.
    encoded = tokenizer.encode_plus(claim, prefixed_context, add_special_tokens=True, max_length=MAX_SENTENCE_LENGTH)
    token_ids = encoded["input_ids"]
    segment_ids = encoded["token_type_ids"]

    # Pad on the right up to MAX_SENTENCE_LENGTH; the padding scheme follows the example at
    # https://github.com/huggingface/transformers/blob/0cb163865a4c761c226b151283309eedb2b1ca4d/transformers/data/processors/glue.py#L30
    n_tokens = len(token_ids)
    n_padding = MAX_SENTENCE_LENGTH - n_tokens

    return InputFeatures(
        input_ids=token_ids + [pad_token] * n_padding,
        # 1 marks a real token, 0 marks a padding position.
        attention_mask=[1] * n_tokens + [0] * n_padding,
        token_type_ids=segment_ids + [PADDING_TOKEN_TYPE_ID] * n_padding,
        label=label,
    )


In [0]:
def create_features(data, dev=False):
    """Turn every row of ``data`` into BERT input features.

    Args:
        data: Table exposing ``claim``, ``context``, ``label`` and ``doc_id``
            columns as attributes (e.g. a pandas DataFrame).
        dev: Unused; kept so existing callers that pass it keep working.

    Returns:
        A list with one ``prep_instance`` result per row, in row order.
    """
    # The claim_id column is not read here: it was never used to build the features.
    rows = zip(data.claim, data.context, data.label, data.doc_id)
    # zip has no length, so pass the total explicitly to keep the progress bar informative.
    return [
        prep_instance(claim, context, label, doc_id)
        for claim, context, label, doc_id in tqdm(rows, total=len(data.claim))
    ]

In [0]:
# Encode every row of the loaded data; create_features shows a tqdm progress bar while it runs.
features = create_features(data)

In [0]:
print("Save features")
# Derive the output name from the input file's base name without its extension.
# basename/splitext is used instead of splitting on "." and "/", which picks the
# wrong segment when a directory or the file name contains additional dots.
source_name = os.path.splitext(os.path.basename(data_fname))[0]
# Timestamp prefix (yymmddHHMMSS) so repeated runs do not overwrite each other.
timestamp = datetime.now().strftime("%y%m%d%H%M%S")
torch.save(features, os.path.join(WORK_DIR, f'{timestamp}features_document_selection_from_{source_name}'))