In [1]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch, os
from tqdm import tqdm
import nltk
from pdfminer.high_level import extract_text

# Workaround for the OpenMP "duplicate runtime" abort (OMP Error #15) that can
# occur when more than one package ships its own copy of the OpenMP library.
# The OpenMP runtime itself describes this setting as an unsafe, unsupported
# workaround, so prefer fixing the environment if results look suspicious.
os.environ.update({'KMP_DUPLICATE_LIB_OK': 'True'})

In [2]:
# Checkpoint location handed to both from_pretrained calls below, so the
# tokenizer and the question-answering model are loaded from the same place.
# NOTE(review): the trailing '/' suggests a local directory resolved against
# the working directory rather than a hub model id — confirm.
model_name = 'DEMO_bert-large-uncased-whole-word-masking-legal_finetuned-squad/'

# Both objects are module-level globals that question_answer() reads directly.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

In [3]:
# PDF to analyse; the relative path is resolved against the working directory.
filename = '2M DEMO_2020-12-15 H665 OOFFS_657.pdf'
# Pull the document's text out of the PDF as a single string; later cells
# clean it up and split it into sentences.
doc = extract_text(filename)

In [4]:
# Flatten the extracted PDF text into one line for sentence splitting.
# str.split() with no arguments splits on any run of whitespace (newlines,
# form feeds '\x0c', repeated spaces), so re-joining with single spaces:
#   * keeps a separator where a line or page break sat between two words
#     (deleting the break outright could fuse them), and
#   * collapses every run of spaces to exactly one (a single
#     replace("  ", " ") pass only halves each run).
book = " ".join(doc.split())

In [5]:
# Split the flattened text into sentences; question_answer() scores each
# sentence on its own.
# NOTE(review): nltk.sent_tokenize needs NLTK's Punkt tokenizer data to be
# installed, and no nltk.download(...) call is visible in this notebook — confirm.
sent_corpus = nltk.sent_tokenize(book)

In [6]:
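# Optional GPU setup (disabled): uncomment both lines to move the model to CUDA.
# Tensors passed to the model must then live on the same device as the model.
#device = torch.device("cuda")
#model.to(device)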

In [7]:
def question_answer(question, sent_corpus):
    """Print the highest-scoring extractive answer to ``question``.

    Every sentence in ``sent_corpus`` is paired with the question and run
    through the module-level ``tokenizer`` and ``model`` on its own. A
    sentence is scored by the sum of its largest start logit and its largest
    end logit; the token span of the best-scoring sentence is decoded and
    printed together with that sentence.

    Args:
        question: Question text, passed to the tokenizer as the first segment.
        sent_corpus: Iterable of candidate sentences; each item is converted
            with ``str`` and passed to the tokenizer as the second segment.

    Returns:
        None. The answer and its source sentence are written to stdout. When
        ``sent_corpus`` is empty, or no sentence produces a span whose end is
        at or after its start, a "No answer found." message is printed instead.
    """
    # Cap the encoded length at what both the tokenizer and the model accept,
    # so an over-long sentence is truncated instead of crashing the forward
    # pass. Only the sentence (second segment) is ever truncated.
    max_length = min(
        tokenizer.model_max_length,
        getattr(model.config, "max_position_embeddings", tokenizer.model_max_length),
    )

    # Scores are sums of raw logits (not probabilities) and may be arbitrarily
    # negative, so start from -inf rather than a fixed floor.
    best_score = float("-inf")
    best_span_ids = None
    text_answer = None

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for sent in sent_corpus:
            text = str(sent)

            # Keep the inputs on whatever device the model currently lives on.
            inputs = tokenizer(
                question,
                text,
                add_special_tokens=True,
                truncation="only_second",
                max_length=max_length,
                return_tensors="pt",
            ).to(model.device)
            outputs = model(**inputs)

            # One (question, sentence) pair per call, so take batch row 0.
            start_logits = outputs.start_logits[0]
            end_logits = outputs.end_logits[0]

            answer_start = int(torch.argmax(start_logits))
            answer_end = int(torch.argmax(end_logits)) + 1  # exclusive end

            # Start and end are picked independently; an end before the start
            # would decode to an empty answer, so it cannot be a candidate.
            if answer_end <= answer_start:
                continue

            score = float(start_logits[answer_start] + end_logits[answer_end - 1])
            if score > best_score:
                best_score = score
                best_span_ids = inputs["input_ids"][0, answer_start:answer_end].tolist()
                text_answer = text

    if best_span_ids is None:
        print('BERT Answer:\n------------\n', 'No answer found.')
        return

    # Decode once, for the winning span only.
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(best_span_ids))
    print('BERT Answer:\n------------\n', answer, '\n\nSentence:\n---------\n', text_answer)

In [8]:
# Date query: asks when the agreement was made.
question_answer('When is the agreement made?', sent_corpus)

BERT Answer:
------------
 4 december 2020 

Sentence:
---------
 DATED 4 DECEMBER 2020 INVESTOR LIMITED and INVESTMENT LIMITED OPTION AGREEMENT FOR FUTURE SHARES       THIS AGREEMENT is made and entered into on 4 December 2020 (1) INVESTOR LIMITED a company incorporated in Jersey with registration number 123456 and whose registered office is at King Street, Jersey, Channel Islands, JE2 2EJ (the “Investor”); (2) INVESTMENT LIMITED a company incorporated in Jersey with registration number 654321 and whose registered office is at Queen Street, Jersey, JE2 2EJ, Channel Islands, Great Britain with an email address of investment@greatinvestments.com (the “Company”).


In [9]:
# Party query.
# NOTE(review): in the output recorded below the winning sentence is clause 9.1,
# not the sentence that introduces the parties — question_answer() ranks
# sentences purely by logit score.
question_answer('Which two parties is the agreement between?', sent_corpus)

BERT Answer:
------------
 the company and the investor 

Sentence:
---------
 9 Representations and Warranties 9.1 Both the Company and the Investor have the right, power and authority and have taken all actions necessary to execute and to exercise their rights and perform their obligations under this Agreement.


In [None]:
# Registered-office query for Investor Limited.
question_answer('Where is the registered office of investor limited?', sent_corpus)

In [None]:
# Registered-office query for Investment Limited.
question_answer('Where is the registered office of investment limited?', sent_corpus)

In [10]:
# Definition lookup. The ':' before '?' mirrors the "Discount Rate:" heading
# seen in the recorded sentence below — presumably intentional; confirm.
question_answer("What is the Discount Rate:?", sent_corpus)

BERT Answer:
------------
 85 % 

Sentence:
---------
 1.1.9 Discount Price: the price per share of the Shares sold in the Fund Raising multiplied by the Discount Rate; 1.1.10 Discount Rate: 85%; 1.1.11 Fund Raising: the Company raising a total of £200,000,000 or more prior to the second anniversary of this Agreement from an issue of Shares to any person(s) (and excluding, for the avoidance of doubt, the Investment to be converted into Shares); 1.1.12 Investment: the sum of £500,000 to be invested into the Company as a convertible debt; and 1.1.13 Shares: the ordinary shares of £1.00 par value in the capital of the Company.


In [None]:
# Investment-amount query.
question_answer("How much has the Investor invested?", sent_corpus)

In [None]:
# Cost-allocation query.
question_answer("Who pays the costs?", sent_corpus)

In [None]:
# Asks about both parties' offices at once; note that question_answer() prints
# a single span taken from a single sentence.
question_answer("Where are the registered offices of the contracting parties?", sent_corpus)

In [None]:
# Notice-period query.
question_answer("How much notice does the investor have to give?", sent_corpus)

In [None]:
# Yes/no-style question; question_answer() always prints an extracted token
# span, so expect a supporting phrase rather than a literal yes or no.
question_answer("Can parties transfer rights?", sent_corpus)

In [None]:
# Duration query.
question_answer("How long does the agreement continue?", sent_corpus)