<a href="https://colab.research.google.com/github/DAlkemade/bert-for-fever/blob/master/L101_inference_BERT_document_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

TODO:
- Loop over dev.sentences.p5.s5.jsonl om alle zinnen uit de predicted_documents  te halen (eventueel in los notebook (tsv maken))
- Maak een features-object met het tokenize-notebook
- Maak in dit notebook een lijst met ids van de .tsv en laad het bijbehorende features-object. Zorg dat het id weer in de TensorDataset komt
- Doe predictions voor alle instances in de features en sla de evidence logit op in een dict met de vorm {'id': [{'sentence': 'x', 'chance': chance of being evidence}]}
- Loop over dev.sentences.p5.s5.jsonl en kies voor elke claim de top 5 sentences (of minder als er minder als evidence zijn geclassificeerd; stel dat er 3 argmax=evidence hebben, neem er dan 3 (betere precision)). Voor not verifiable entries gewoon een lege list o.i.d. erin stoppen

In [0]:
# Install the libraries this notebook imports (IPython shell escapes).
!pip install torch
!pip install transformers
# Try to select TensorFlow 2.x; any failure of the magic is deliberately
# ignored so the cell still completes.
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

In [0]:
import pandas as pd
from google.colab import drive
import torch
from transformers import *
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset, WeightedRandomSampler)
from tqdm import tqdm, trange
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import os
from sklearn.metrics import confusion_matrix
import json
import pprint
from scipy.special import softmax
import pickle

In [0]:
# Run configuration for this notebook.
BATCH_SIZE = 32  # rows per DataLoader batch
# Google Drive folder that holds the input files (requires the drive mount).
WORK_DIR = '/content/drive/My Drive/Overig'
# Every input path is derived from WORK_DIR. Joining WORK_DIR with an
# absolute path would silently discard WORK_DIR, so only file names are
# joined here; the resulting paths are unchanged.
cached_features_file_dev = os.path.join(WORK_DIR, '200104213845features_document_selection_from_document_selection_test_n=50')
data_fname = os.path.join(WORK_DIR, 'document_selection_test_n=50.tsv')
docs_input_file = os.path.join(WORK_DIR, 'test_baseline_pages.sentences.p5.s5.jsonl')
# N, pp, TEST_IDS and OUT_TAG are not referenced by the cells visible here;
# they are kept unchanged in case other cells rely on them.
N = 5
pp = pprint.PrettyPrinter(indent=4)
TEST_IDS = [137334, 145446]
OUT_TAG = 'using 50 docs from baseline'
# HALF_DATA: process only one slice of the cached features in this run.
# FIRST_HALF: which slice to take when HALF_DATA is set.
HALF_DATA = True
# LOAD_PREVIOUS_EVIDENCE: start from previously pickled results instead of
# empty per-claim lists.
LOAD_PREVIOUS_EVIDENCE = False
FIRST_HALF = True
# Folder name of the saved model to load.
model_name = 'doc model hnm'

In [0]:
# Mount Google Drive at /content/drive; every data path in this notebook
# lives below that mount point.
drive.mount('/content/drive')

In [0]:
# Load the claims file (JSON Lines: one JSON object per line) into a list.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# default, and blank lines are skipped instead of crashing json.loads.
with open(docs_input_file, "r", encoding="utf-8") as in_file:
    instances = [json.loads(line) for line in in_file if line.strip()]
print(len(instances))
for instance in instances:
    # Drop any page predictions already present in the input (presumably
    # because this notebook produces new document predictions itself).
    instance.pop('predicted_pages', None)

In [0]:
# Load the candidate table; its claim_id and doc_id columns are used below.
# NOTE(review): the file name ends in .tsv but it is parsed with read_csv's
# default comma separator — confirm the file really is comma-delimited.
data = pd.read_csv(data_fname)
# Notebook preview of the first ten rows.
data.head(10)

In [0]:
# Pull the two id columns out as plain Python lists.
claim_ids = data['claim_id'].tolist()
doc_ids = data['doc_id'].tolist()
# Second reference to the complete claim-id list: `claim_ids` may later be
# rebound to a slice, while this name keeps pointing at the full list.
claims_not_reduced = claim_ids
# The DataFrame itself is no longer needed.
del data

In [0]:
# Load the cached feature objects. NOTE: torch.load unpickles the file, so
# only load caches you created yourself.
print("Load cached dev features")
features_dev = torch.load(cached_features_file_dev)
print("Loaded features")
print(f'Len features: {len(features_dev)}')
number_instances = 500000

if HALF_DATA:
    # One slice object selects the same window from the features and from
    # both id lists, keeping the three sequences aligned row by row.
    keep = slice(None, number_instances) if FIRST_HALF else slice(number_instances, None)
    features_dev = features_dev[keep]
    claim_ids = claim_ids[keep]
    doc_ids = doc_ids[keep]



In [0]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Show the accelerator name as the cell output. Querying device 0 raises when
# no CUDA device exists, so it is guarded to keep the CPU fallback usable.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"

In [0]:
# Sanity check after the optional slicing: the features and both id lists are
# used row-by-row together, so the three lengths printed here should match.
features_length = len(features_dev)
print(f'Len features after: {features_length}')
print(f'Len doc ids: {len(doc_ids)}')
print(f'Len claim ids: {len(claim_ids)}')

In [0]:
def create_dataloader(features, dev=False):
    """Wrap tokenized features in a ``DataLoader``.

    Each dataset row is ``(input_ids, attention_mask, token_type_ids, label,
    claim_id, idx)``, where ``idx`` is the row's position in ``features`` so
    that callers can look up per-row data kept outside the tensors.

    Args:
        features: Sequence of objects exposing ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``label``.
        dev: If true, return a plain loader without a sampler. If false,
            draw a class-balanced random sample with replacement.

    Returns:
        A ``DataLoader`` yielding batches of ``BATCH_SIZE`` rows.

    Raises:
        ValueError: If ``dev`` is false and no feature has label 1, because
            the balanced sample size is derived from the positive count.

    Note:
        Claim ids are read from the module-level ``claim_ids`` list, which
        must have one entry per element of ``features``.
    """
    # The next lines are taken from the example at https://github.com/huggingface/transformers/blob/0cb163865a4c761c226b151283309eedb2b1ca4d/transformers/data/processors/glue.py#L30
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    all_claim_ids = torch.tensor(claim_ids, dtype=torch.long)
    idx = torch.arange(len(features), dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels, all_claim_ids, idx)

    if dev:
        return DataLoader(dataset, batch_size=BATCH_SIZE)

    labels_np = all_labels.numpy()
    # minlength=2 guarantees an entry for the positive class even when it
    # never occurs, so the check below can report that case clearly.
    class_counts = np.bincount(labels_np, minlength=2)
    if class_counts[1] == 0:
        raise ValueError(
            "Cannot build a balanced sampler: no feature has label 1."
        )
    class_sample_freq = 1/class_counts
    print(class_counts)
    print(class_sample_freq)
    # Per-row sampling weight = inverse frequency of the row's class.
    weights = class_sample_freq[labels_np].tolist()
    num_samples = int(class_counts[1]) * 2
    print(f'Num samples: {num_samples}')
    # we want to use all positive instances and use equally as many negative instances. This should now generally happen by chance
    sampler = WeightedRandomSampler(weights, num_samples=num_samples, replacement=True)
    return DataLoader(dataset, sampler=sampler, batch_size=BATCH_SIZE)

In [0]:
# Release cached GPU memory before building the loader.
torch.cuda.empty_cache()
print("Create dev dataloader")
# dev=True selects the plain DataLoader branch of create_dataloader (no sampler).
dataloader_dev = create_dataloader(features_dev, dev=True)
# The loader's dataset was built from tensors created from the features, so
# the feature list itself can be dropped.
del features_dev

In [0]:
# Load the saved two-label sequence classifier from Drive.
model = BertForSequenceClassification.from_pretrained(f"/content/drive/My Drive/Cambridge/L101/{model_name}", num_labels=2)
# Place the model on the same device the batches are sent to. `.cuda()` would
# raise on a CPU-only runtime even though `device` falls back to "cpu".
# Assigning the result also keeps the notebook from printing the model repr.
model = model.to(device)

In [0]:
def unpack_batch(batch):
    """Split a dataloader batch into its fields and resolve document ids.

    Args:
        batch: Indexable with at least six entries, in the order
            ``(input_ids, attention_mask, token_type_ids, labels, claim_ids,
            row_positions)``.

    Returns:
        A 6-tuple: the first five entries of ``batch`` unchanged, followed by
        a list with the module-level ``doc_ids`` entry for each row position
        in ``batch[5]``.
    """
    leading_fields = tuple(batch[position] for position in range(5))
    # batch[5] carries positions into doc_ids rather than the ids themselves.
    resolved_doc_ids = [doc_ids[position] for position in batch[5]]
    return (*leading_fields, resolved_doc_ids)

In [0]:
# Result containers, both keyed by claim id:
#   evidence            -> entries for documents classified as evidence
#   evidence_all_scores -> entries for every scored document
if not LOAD_PREVIOUS_EVIDENCE:
    # Fresh run: one empty list per distinct claim id. Keys come from the
    # full (unsliced) claim list.
    evidence = {claim: [] for claim in claims_not_reduced}
    evidence_all_scores = {claim: [] for claim in claims_not_reduced}
else:
    # Continue from results pickled by an earlier run. NOTE: pickle.load
    # executes code embedded in the file, so only load files you wrote.
    with open('/content/drive/My Drive/Overig/docs_evidence_full_hnm.pkl', 'rb') as f:
        evidence = pickle.load(f)
    with open('/content/drive/My Drive/Overig/docs_evidence_all_full_hnm.pkl', 'rb') as f:
        evidence_all_scores = pickle.load(f)
    print("Loaded evidence")
print(f"Number of claims: {len(evidence)}")
print("Done")

In [0]:
# Score every (claim, document) row and record the evidence probability.
model.eval()

print("Start evaluation")

# Evaluation accumulators. Nothing in this loop updates them (accuracy and
# confusion-matrix tracking are disabled); they stay defined so that any
# other cell that reads them still finds them.
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
conf_matrix = np.zeros((2,2))

print(f"Number of batches: {len(dataloader_dev)}")
for step, batch in enumerate(dataloader_dev):
    if step % 1000 == 0:
        print(f'\nAt step {step}')
    # Unpack before moving anything to the device: the claim ids and the row
    # positions used for the doc-id lookup are only needed on the host.
    input_ids, attention_mask, type_ids, y_true, claim_ids_batch, doc_ids_batch = unpack_batch(batch)
    # Telling the model not to compute or store gradients, saving memory and speeding up validation. taken from https://mccormickml.com/2019/07/22/BERT-fine-tuning/
    with torch.no_grad():
        # Forward pass without labels: no loss is computed, so the logits
        # are the first element of the model output.
        outputs = model(
            input_ids.to(device),
            token_type_ids=type_ids.to(device),
            attention_mask=attention_mask.to(device),
        )
        logits = outputs[0]

    # Move logits to CPU. from https://mccormickml.com/2019/07/22/BERT-fine-tuning/, as this should free up RAM
    logits = logits.detach().cpu().numpy()
    pred_flat = np.argmax(logits, axis=1).flatten()
    # Row-wise softmax over the two classes; column 1 is the evidence class.
    evidence_probs = softmax(logits, axis=1)[:, 1]

    rows = zip(claim_ids_batch.tolist(), evidence_probs, pred_flat, doc_ids_batch)
    for claim_id, evidence_prob, predicted_label, doc_id in rows:
        # save the score of every document to dict
        evidence_all_scores[claim_id].append([evidence_prob, doc_id])

        # save just the documents labeled as evidence to dict
        if predicted_label == 1:
            evidence[claim_id].append([evidence_prob, doc_id])

In [0]:
# Average, over all claims, of how many entries were classified as evidence
# (i.e. the mean length of the per-claim lists in `evidence`).
mean_number_evidence_predictions_per_claim = np.mean([len(value) for value in evidence.values()])
# Bare expression so the notebook displays the value.
mean_number_evidence_predictions_per_claim

In [0]:
# Persist both result dictionaries to Drive. These are the same files the
# LOAD_PREVIOUS_EVIDENCE branch reads back.
for pickle_path, payload in (
    ('/content/drive/My Drive/Overig/docs_evidence_full_hnm.pkl', evidence),
    ('/content/drive/My Drive/Overig/docs_evidence_all_full_hnm.pkl', evidence_all_scores),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)