## A model to further fine tune the 'bert-large-uncased-whole-word-masking-finetuned-squad' pre-trained model
This model allows you to further fine-tune the pre-trained and fine tuned 'bert-large-uncased-whole-word-masking-finetuned-squad' used in the 2P DEMO BERT legal predict.ipynb notebook.
The notebook requires that you have a .csv file available with the following column headers:
- 'context' - This is the text which you are trying to extract the answer from
- 'question' - This is the question being asked
- 'answer' - This is the answer, which must be in the 'context' character for character
- 'answer_start' - This is the start character of the 'answer' within the 'context'

The model expects a .csv as input, and carries out the following:
- prepares the data to enable fine tuning of the 'bert-large-uncased-whole-word-masking-finetuned-squad'
- tokenises the data
- trains the model using an AdamW optimizer using the pytorch library
- saves the model
- carries out validation, using a separate carved out validation dataset
- enables prediction using the new fine-tuned model on your own data

## Train model

In [None]:
# Load required libraries
import pandas as pd
import transformers
import torch
from tqdm import tqdm
from transformers import AutoModelForQuestionAnswering, AdamW, AutoTokenizer
from torch.utils.data import DataLoader

In [None]:
# Hugging Face checkpoint used for both the model weights and the tokenizer
checkpoint = 'bert-large-uncased-whole-word-masking-finetuned-squad'

# Load pre-trained question-answering model
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

# Load the tokenizer that belongs to the same checkpoint - other tokenizers are not accepted
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Location of the fine-tuning data - replace with your own dataset
csv_path = '/home/malmason/datasets/squad_csv/SQuAD_csv_sm.csv'

# Load a small dataset for fine tuning
datasets = pd.read_csv(csv_path)

In [None]:
# Remove rows where the answer text does not occur in the context.
# Both values are compared as strings: pandas may parse a purely numeric
# answer (e.g. a year) as a number, and `number in "text"` raises TypeError.
# Rows with a missing answer or context are dropped as well, because a
# missing answer cannot be located in any context.
rows_to_drop = [
    label
    for label, answer, context in zip(datasets.index, datasets['answer'], datasets['context'])
    if pd.isna(answer) or pd.isna(context) or str(answer) not in str(context)
]
datasets.drop(index=rows_to_drop, inplace=True)

In [None]:
# Renumber the remaining rows from 0 and discard the old index (drop=True),
# so that later positional lookups such as data_contexts[0] keep working
datasets.reset_index(drop=True, inplace=True)

In [None]:
# Take a look at the last rows of the dataset (shown as the cell output)
datasets.tail()

In [None]:
# Build one dictionary per row holding the answer text and its start character,
# coercing the text to str and the start position to int
data_answers = [
    {'text': str(answer), 'answer_start': int(answer_start)}
    for answer, answer_start in zip(datasets.answer, datasets.answer_start)
]

In [None]:
# Pull out the context passages (which contain the answers) and the questions
data_contexts = datasets['context']
data_questions = datasets['question']

In [None]:
# Split train and val datasets as needed based on your fine tuning data.
# The first 80000 rows are used for training and everything after for validation;
# with 80000 rows or fewer the validation slices come out empty.
train_rows = slice(None, 80000)
val_rows = slice(80000, None)

train_answers, val_answers = data_answers[train_rows], data_answers[val_rows]
train_contexts, val_contexts = data_contexts[train_rows], data_contexts[val_rows]
train_questions, val_questions = data_questions[train_rows], data_questions[val_rows]

In [None]:
# Take a look to see output is as expected: prints the first context,
# its question and the answer dictionary built for it
print(data_contexts[0], data_questions[0], data_answers[0])

In [None]:
# Add answer end character to data
def add_end_idx(answers, contexts):
    """Add the exclusive end character index of each answer, in place.

    For every (answer, context) pair the answer text is looked up in the
    context and ``answer['answer_end']`` is set to the index one past the last
    answer character. ``answer['answer_start']`` is rewritten as an int
    pointing at the position where the text was actually found.

    Lookup order:
      1. the labelled ``answer_start``;
      2. one, then two characters earlier (labels are sometimes off by a
         character or two) - the nearest match wins;
      3. the first occurrence of the answer text anywhere in the context.

    Args:
        answers: list of dicts with the keys 'text' and 'answer_start'.
        contexts: iterable of context strings, aligned with ``answers``.

    Returns:
        None. The answer dictionaries are modified in place. An answer whose
        text does not occur in its context is left without 'answer_end'.
    """
    for answer, context in zip(answers, contexts):
        # Text we expect to find in the context
        gold_text = str(answer['text'])
        span_len = len(gold_text)

        # Labelled start position, converted to int in case it arrived as a string
        start_idx = int(answer['answer_start'])

        for offset in (0, 1, 2):
            candidate = start_idx - offset
            # A negative start would wrap around to the end of the string when
            # slicing, so such candidates are never considered
            if candidate >= 0 and context[candidate:candidate + span_len] == gold_text:
                break
        else:
            # Nothing near the labelled position: search the whole context
            candidate = context.find(gold_text)
            if candidate < 0:
                # Answer text is not in the context at all; leave the entry
                # without 'answer_end' so it can be spotted afterwards
                continue

        answer['answer_start'] = candidate
        answer['answer_end'] = candidate + span_len


In [None]:
# apply function add_end_idx to both splits; the answer dictionaries are
# updated in place, so nothing is assigned here
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [None]:
# Verify answer_end is there, as can sometimes be missing if answer not where expected.
# Both splits are checked because both are passed to add_token_positions,
# which reads 'answer_end'. Prints the split name, the answer and its position.
for split_name, split_answers in (('train', train_answers), ('val', val_answers)):
    for position, answer in enumerate(split_answers):
        if 'answer_end' not in answer:
            print(split_name, answer, position)

In [None]:
# Materialise every split as a plain Python list (contexts and questions are
# pandas objects up to this point) before handing them to the tokenizer
train_contexts, val_contexts = list(train_contexts), list(val_contexts)
train_questions, val_questions = list(train_questions), list(val_questions)
train_answers, val_answers = list(train_answers), list(val_answers)

In [None]:
# Tokenizer settings shared by both splits: truncate over-long pairs and pad
# every example to a common length
tokenize_kwargs = dict(truncation=True, padding=True)

# call tokenizer for training and val data; each context is paired with its
# question, context first
# NOTE(review): question_answer() further down tokenises (question, text),
# i.e. the opposite order - confirm which order is intended
train_encodings = tokenizer(train_contexts, train_questions, **tokenize_kwargs)
val_encodings = tokenizer(val_contexts, val_questions, **tokenize_kwargs)

In [None]:

def add_token_positions(encodings, answers):
    """Attach answer start/end token indices to ``encodings``, in place.

    Character offsets from ``answers`` are mapped to token indices with
    ``encodings.char_to_token`` and stored under the keys 'start_positions'
    and 'end_positions'.

    Args:
        encodings: tokenizer output offering ``char_to_token(batch_index,
            char_index)`` and ``update``; example ``i`` must correspond to
            ``answers[i]``.
        answers: list of dicts with 'answer_start' and, normally, 'answer_end'
            (exclusive end character index).

    Returns:
        None. ``encodings`` is updated in place.

    Notes:
        * 'answer_end' is exclusive, so the end token is looked up from the
          last character of the answer (``answer_end - 1``). Looking up
          ``answer_end`` itself selects the following token whenever the
          answer is directly followed by a non-space character.
        * A position that cannot be mapped (e.g. the answer was truncated
          away) is set to ``tokenizer.model_max_length``, the sentinel this
          notebook already used for unmappable start positions - presumably
          treated as out of range by the model's loss; verify.
        * An answer without 'answer_end' gets the sentinel for both positions
          instead of raising KeyError.
        * Relies on the module-level ``tokenizer``.
    """
    # Sentinel for positions that have no token
    out_of_range = tokenizer.model_max_length

    # Token indices of answer start/end, one entry per example
    start_positions = []
    end_positions = []

    for i, answer in enumerate(answers):
        answer_start = answer['answer_start']
        answer_end = answer.get('answer_end')

        if answer_end is None:
            # The answer could not be aligned with its context earlier on
            start_positions.append(out_of_range)
            end_positions.append(out_of_range)
            continue

        start_token = encodings.char_to_token(i, answer_start)
        if start_token is None:
            start_token = out_of_range

        # Walk backwards from the last answer character until a character
        # that maps to a token is found; never step before the answer start,
        # which keeps the search bounded
        end_token = None
        last_char = max(answer_end - 1, answer_start)
        for char_idx in range(last_char, answer_start - 1, -1):
            end_token = encodings.char_to_token(i, char_idx)
            if end_token is not None:
                break
        if end_token is None:
            end_token = out_of_range

        start_positions.append(start_token)
        end_positions.append(end_token)

    # update tokenised data with start and end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

In [None]:
# call add_token_positions function for both splits; each encodings object
# gains 'start_positions' and 'end_positions' in place
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [None]:
# Quick check keys are there. Will be different based on BERT model used.
# 'start_positions' and 'end_positions' should be listed after the previous cell.
train_encodings.keys()

In [None]:
# Check encoding - Format is: Context starts with start token [CLS], and finishes with [SEP], where the question follows
# Prints the tokens of validation example number 40, so the validation split
# needs at least 41 examples for this cell to run
print(val_encodings[40].tokens)

In [None]:
class SquadDataset(torch.utils.data.Dataset):
    """Torch dataset view over a tokenizer encodings object.

    Each item is a dict with one tensor per key of the encodings, built from
    that key's entry at the requested index.
    """

    def __init__(self, encodings):
        # Keep a reference only; tensors are created lazily per item
        self.encodings = encodings

    def __len__(self):
        # One example per row of input_ids
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        item = {}
        for name, column in self.encodings.items():
            item[name] = torch.tensor(column[idx])
        return item

In [None]:
# Bring data together for train and val encodings: wrap each encodings object
# in a SquadDataset so a DataLoader can batch it
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [None]:
# Set device to GPU if it exists, else CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Move the model to that device
model.to(device)

# Optimizer: PyTorch's AdamW replaces the deprecated transformers.AdamW.
# eps and weight_decay are pinned to the transformers.AdamW defaults (1e-6 and
# 0.0) so the update rule matches what this notebook used before; PyTorch's
# own defaults are 1e-8 and 0.01.
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# initialize data loader with batch size that will fit GPU
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

for epoch in range(2):
    # set train mode at the start of every epoch
    model.train()

    loop = tqdm(train_loader, leave=True)
    # the description does not change within an epoch, so set it once
    loop.set_description(f'Epoch {epoch}')
    for batch in loop:
        # reset gradients accumulated by the previous step
        optim.zero_grad()
        # get inputs and send to the device
        # NOTE(review): only input_ids and attention_mask are passed to the
        # model; if the tokenizer also produced 'token_type_ids' they are not
        # used here, while question_answer() passes everything the tokenizer
        # returns - confirm this difference is intended
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        # forward pass; supplying the positions makes the model return a loss
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        # the loss is the first element of the output
        loss = outputs[0]
        loss.backward()

        optim.step()

        # show the current loss on the progress bar
        loop.set_postfix(loss=loss.item())

In [None]:
# Save model and tokenizer into the same directory (relative to the current
# working directory) so both can be reloaded from model_path later
model_path = 'models/bert-large-uncased-whole-word-masking-finetuned-squad-custom'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

## Evaluate model

In [None]:
# Load the fine-tuned model and its tokenizer back from model_path
# (model_path is defined in the save cell above)
model = AutoModelForQuestionAnswering.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
# Move the reloaded model to the device chosen in the training cell
model.to(device)

In [None]:
# Set model to evaluate
model.eval()

# Use dataloader with batch size to load val data
val_loader = DataLoader(val_dataset, batch_size=16)

# Running totals over individual predictions. Counting predictions rather than
# averaging per-batch accuracies keeps a smaller final batch from being
# weighted as heavily as a full one.
correct = 0
total = 0

loop = tqdm(val_loader)

# inference only - no gradients needed
with torch.no_grad():
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        # predict
        outputs = model(input_ids, attention_mask=attention_mask)
        # get start and end predictions from outputs
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        # start and end positions each count as one prediction
        correct += (start_pred == start_true).sum().item()
        correct += (end_pred == end_true).sum().item()
        total += len(start_pred) + len(end_pred)

# overall share of exactly matching start/end token positions;
# NaN when the validation set is empty
acc = correct / total if total else float('nan')
print("Accuracy: ", acc)

## Predict model

In [None]:
# Load text extractor and natural language tool kit
from pdfminer.high_level import extract_text
import nltk

In [None]:
# Get an agreement: extract the text of the PDF as one string.
# The file name is relative to the current working directory.
filename = 'DEMO_VitalibisInc_20180316_8-K_EX-10.2_11100168_EX-10.2_Hosting Agreement.pdf'
doc = extract_text(filename)

In [None]:
# Normalise whitespace in the extracted text.
# Line breaks and form feeds are turned into single spaces rather than being
# deleted: deleting them glues the last word of one line onto the first word
# of the next. str.split() with no argument splits on any run of whitespace,
# so repeated spaces are collapsed as well, however long the run.
book = " ".join(doc.split())

In [None]:
# Break book into sentences; each sentence is later scored separately
# NOTE(review): sent_tokenize presumably needs NLTK's sentence tokenizer data
# to be downloaded beforehand - confirm for your environment
sent_corpus = nltk.sent_tokenize(book)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
# (same rule as the training cell, so this cell also runs on CPU-only machines)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

In [None]:
def question_answer(question, sent_corpus):
    """Print the best answer to ``question`` found in ``sent_corpus``.

    Every sentence is scored on its own: the question and the sentence are
    tokenised together, run through the model, and the sentence's score is the
    sum of its highest start logit and highest end logit. The answer span of
    the highest-scoring sentence is printed together with that sentence.

    Args:
        question: the question text.
        sent_corpus: iterable of sentences to search.

    Returns:
        None. The result is printed. With an empty ``sent_corpus`` the answer
        and the sentence are printed as empty strings.

    Notes:
        * Relies on the module-level ``tokenizer``, ``model`` and ``device``.
        * The scores are raw logits, not probabilities.
        * NOTE(review): the fine-tuning cell tokenises (context, question)
          and passes only input_ids and attention_mask to the model, whereas
          this function tokenises (question, text) and passes every tensor
          the tokenizer returns - confirm which convention is intended.
    """
    # Start below every possible score so the first sentence is always taken;
    # a fixed threshold would leave the answer undefined when no sentence
    # reaches it
    best_score = float('-inf')
    answer = ''
    text_answer = ''

    # Inference only: no gradients are needed
    with torch.no_grad():
        for sent in sent_corpus:
            # Convert text to string
            text = str(sent)

            # Tokenise the question and text
            inputs = tokenizer(question, text, add_special_tokens=True, max_length=512, truncation=True, return_tensors="pt").to(device)
            input_ids = inputs["input_ids"].tolist()[0]

            # Run the tokenised pair through the question-answering model
            outputs = model(**inputs)
            start_logits = outputs.start_logits
            end_logits = outputs.end_logits

            # Most likely start token and (exclusive) end token
            answer_start = torch.argmax(start_logits).item()
            answer_end = torch.argmax(end_logits).item() + 1

            # Sentence score: best start logit plus best end logit
            score = (torch.max(start_logits) + torch.max(end_logits)).item()

            # Keep this sentence if it beats the best one so far
            if score > best_score:
                best_score = score

                # Convert answer tokens to string
                answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
                # Remember the sentence the answer was taken from
                text_answer = text

    print('BERT Answer:\n------------\n', answer, '\n\nSentence:\n---------\n', text_answer)

In [None]:
# Example query: ask who the parties to the agreement are
question_answer('Which two parties is the agreement between?', sent_corpus)

In [None]:
# Example query: ask for the date of the agreement
question_answer('When is the agreement dated?', sent_corpus)