## Download the Atticus Project (CUAD) data

In [183]:
# Clone the CUAD repository, rename the checkout to cuad-training, and unpack
# the data.zip found inside it into cuad-data/.
!git clone https://github.com/TheAtticusProject/cuad.git
!mv cuad cuad-training
!unzip cuad-training/data.zip -d cuad-data/
# Disabled: download roberta-base.zip from Zenodo into cuad-models/ and unpack it.
# !mkdir cuad-models
# !curl https://zenodo.org/record/4599830/files/roberta-base.zip?download=1 --output cuad-models/roberta-base.zip
# !unzip cuad-models/roberta-base.zip -d cuad-models/

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
curl: (35) TCP connection reset by peer
unzip:  cannot find or open cuad-models/roberta-base.zip, cuad-models/roberta-base.zip.zip or cuad-models/roberta-base.zip.ZIP.


## Run batch predictions with RoBERTa-based models trained on CUAD

In [1]:
   
import torch
import time
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from transformers import (
    AutoConfig,
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    squad_convert_examples_to_features
)

from transformers.data.processors.squad import SquadResult, SquadV2Processor, SquadExample
from transformers.data.metrics.squad_metrics import compute_predictions_logits

def run_prediction(question_texts, context_text, model_path='marshmellow77/roberta-base-cuad'):
    """Answer a batch of questions against one context with an extractive QA model.

    Every question is paired with the same ``context_text``, converted to
    SQuAD-style features, scored by the model in batches of 10, and decoded
    with ``compute_predictions_logits``.

    Args:
        question_texts: Iterable of question strings. Each question's position
            in the iterable (as a string) becomes its ``qas_id``.
        context_text: The document text in which answers are searched.
        model_path: Name or path handed to ``from_pretrained`` for the config,
            the tokenizer and the question-answering model.

    Returns:
        Whatever ``compute_predictions_logits`` returns for these examples
        (presumably a mapping from ``qas_id`` to answer text; callers in this
        notebook call ``.values()`` on it).
    """
    ### Setting hyperparameters
    max_seq_length = 512
    doc_stride = 256
    n_best_size = 1
    max_query_length = 64
    max_answer_length = 512
    do_lower_case = False
    null_score_diff_threshold = 0.0

    def to_list(tensor):
        return tensor.detach().cpu().tolist()

    config = AutoConfig.from_pretrained(model_path)
    # NOTE(review): the tokenizer is built with do_lower_case=True while the
    # post-processing below receives do_lower_case=False. Left as-is so that
    # existing predictions do not change; confirm which setting is intended.
    tokenizer = AutoTokenizer.from_pretrained(
        model_path, do_lower_case=True, use_fast=False)
    model = AutoModelForQuestionAnswering.from_pretrained(model_path, config=config)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Inference only: switch to eval mode once instead of once per batch.
    model.eval()

    # One example per question, all sharing the same context.
    examples = [
        SquadExample(
            qas_id=str(i),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            answers=None,
        )
        for i, question_text in enumerate(question_texts)
    ]

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)

    all_results = []

    with torch.no_grad():
        for batch in eval_dataloader:
            batch = tuple(t.to(device) for t in batch)

            outputs = model(
                input_ids=batch[0],
                attention_mask=batch[1],
                token_type_ids=batch[2],
            )
            batch_outputs = outputs.to_tuple()

            # batch[3] holds, for each row, the index of its feature in `features`.
            for i, example_index in enumerate(batch[3]):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                # Expects exactly two output tensors (start and end logits).
                start_logits, end_logits = (
                    to_list(output[i]) for output in batch_outputs)
                all_results.append(
                    SquadResult(unique_id, start_logits, end_logits))

    final_predictions = compute_predictions_logits(
        all_examples=examples,
        all_features=features,
        all_results=all_results,
        n_best_size=n_best_size,
        max_answer_length=max_answer_length,
        do_lower_case=do_lower_case,
        output_prediction_file=None,
        output_nbest_file=None,
        output_null_log_odds_file=None,
        verbose_logging=False,
        version_2_with_negative=True,
        null_score_diff_threshold=null_score_diff_threshold,
        tokenizer=tokenizer
    )

    return final_predictions

## Baseline model: BERT (trained on non-legal data)

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering


class DocumentReader:
    """Extractive question answering over a text with a Hugging Face QA model.

    Usage: call ``tokenize(question, text)`` and then ``get_answer()``.
    Inputs longer than the model's ``max_position_embeddings`` are split into
    chunks that each repeat the question tokens.
    """

    def __init__(self, pretrained_model_name_or_path='bert-large-uncased'):
        """Load the tokenizer and question-answering model for the given name or path."""
        self.READER_PATH = pretrained_model_name_or_path
        self.tokenizer = AutoTokenizer.from_pretrained(self.READER_PATH)
        self.model = AutoModelForQuestionAnswering.from_pretrained(self.READER_PATH)
        self.max_len = self.model.config.max_position_embeddings
        self.chunked = False

    def tokenize(self, question, text):
        """Encode ``question`` and ``text`` and store the model inputs on ``self.inputs``.

        When the encoded sequence is longer than ``self.max_len`` the inputs
        are replaced by the chunked form from ``chunkify`` and ``self.chunked``
        is set to True.
        """
        # Reset first: otherwise a short input tokenized after a long one would
        # still be treated as chunked by get_answer().
        self.chunked = False
        self.inputs = self.tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
        self.input_ids = self.inputs["input_ids"].tolist()[0]

        if len(self.input_ids) > self.max_len:
            self.inputs = self.chunkify()
            self.chunked = True

    def chunkify(self):
        """
        Break up a long article into chunks that fit within the max token
        requirement for that Transformer model.

        Calls to BERT / RoBERTa / ALBERT require the following format:
        [CLS] question tokens [SEP] context tokens [SEP].

        Returns:
            An OrderedDict mapping chunk index to a dict with the same keys as
            ``self.inputs``; each value is a tensor with a leading batch
            dimension of 1.

        Raises:
            ValueError: if the question alone leaves no room for context tokens.
        """
        # Imported here because the cell defining this class does not import it.
        from collections import OrderedDict

        # Separator appended to every chunk except the last; fall back to 102,
        # the value that was previously hard-coded here.
        sep_token_id = getattr(self.tokenizer, "sep_token_id", None)
        if sep_token_id is None:
            sep_token_id = 102

        # create question mask based on token_type_ids
        # value is 0 for question tokens, 1 for context tokens
        qmask = self.inputs['token_type_ids'].lt(1)
        qt = torch.masked_select(self.inputs['input_ids'], qmask)
        chunk_size = self.max_len - qt.size()[0] - 1 # the "-1" accounts for
        # having to add an ending [SEP] token to the end
        if chunk_size < 1:
            raise ValueError(
                "question is too long: no room left for context tokens "
                "within max_len={}".format(self.max_len))

        # create a dict of dicts; each sub-dict mimics the structure of pre-chunked model input
        chunked_input = OrderedDict()
        for k, v in self.inputs.items():
            q = torch.masked_select(v, qmask)
            c = torch.masked_select(v, ~qmask)
            chunks = torch.split(c, chunk_size)

            for i, chunk in enumerate(chunks):
                if i not in chunked_input:
                    chunked_input[i] = {}

                thing = torch.cat((q, chunk))
                # The last chunk keeps the separator that ended the original
                # sequence; every other chunk needs one appended (value 1 for
                # the non-id tensors).
                if i != len(chunks)-1:
                    filler = sep_token_id if k == 'input_ids' else 1
                    thing = torch.cat((thing, thing.new_tensor([filler])))

                chunked_input[i][k] = torch.unsqueeze(thing, dim=0)
        return chunked_input

    def get_answer(self):
        """Return the answer text for the inputs prepared by ``tokenize``.

        For chunked inputs, each chunk's answer other than '[CLS]' is appended
        followed by " / "; otherwise the single predicted span is returned.
        """
        if self.chunked:
            answer = ''
            for chunk in self.inputs.values():
                ans = self._answer_span(chunk)
                if ans != '[CLS]':
                    answer += ans + " / "
            return answer
        return self._answer_span(self.inputs)

    def _answer_span(self, inputs):
        """Run the model on one set of inputs and decode the argmax start/end span."""
        with torch.no_grad():
            output = self.model(**inputs)
        # Read the logits by key, as the chunked path always did; unpacking a
        # dict-like model output would yield its key names instead.
        answer_start = torch.argmax(output['start_logits'])  # most likely start position
        answer_end = torch.argmax(output['end_logits']) + 1  # most likely end position (exclusive)
        return self.convert_ids_to_string(inputs['input_ids'][0][answer_start:answer_end])

    def convert_ids_to_string(self, input_ids):
        """Convert token ids back to a string using the tokenizer."""
        return self.tokenizer.convert_tokens_to_string(self.tokenizer.convert_ids_to_tokens(input_ids))


## Check how the CUAD questions are phrased, so custom questions can be written in a similar fashion

In [3]:
import json
# Load the CUAD v1 JSON and keep the question list and contract text of the
# first paragraph of the first entry.
with open('./cuad-data/CUADv1.json') as json_file:
    data = json.load(json_file)

first_paragraph = data['data'][0]['paragraphs'][0]
questions = [qa['question'] for qa in first_paragraph['qas']]
contract = first_paragraph['context']

## Read in the questions and the document that provides the answers to those questions

In [33]:
import glob
import itertools
import re
from collections import OrderedDict

import pandas as pd

# Spreadsheet holding the term definitions and the question asked for each.
filename_with_questions = 'MSA/TermDefinitionswithQA.xlsx'
termdefinitionqa = pd.read_excel(filename_with_questions)

# Text files for the PDF are looked up under "<pdf path without .pdf>/output/".
filename = 'MSA/GenentechInc_Restated_MSSA_01Jan17.pdf'
opfoldernname = filename.split(".pdf")[0]+"/output/"

# Order the page files by the integer between "page_" and "-pdf" in their
# path (ties broken by the path itself).
files = sorted(
    glob.glob(opfoldernname+ "./page*inreadingorder*.txt"),
    key=lambda path: (int(path.split("page_")[1].split("-pdf")[0]), path),
)

# Concatenate every page, in order, into one document string.
lines = []
for fname in files:
    with open(fname) as f:
        lines.extend(f.readlines())
msa_text = "".join(lines)

In [35]:
# Show the first two rows of the question spreadsheet.
termdefinitionqa.head(2)

Unnamed: 0,CATEGORY,SUB-CATGEORY,DEFINITION,Question
0,INVESTMENTS,Costs to fulfill a contract (606),"Contract includes provision requiring PPD, at ...",What are the costs
1,INVESTMENTS,Free Services (606),Contract includes provision setting out that P...,What are the free or discounted services that ...


In [36]:
# One prompt per spreadsheet row: the question, a "?", then " Detail: " and the definition.
questions = termdefinitionqa.apply(
    lambda row: row['Question'] + "?" + " Detail: " + row['DEFINITION'],
    axis=1,
)


## Run predictions with two models fine-tuned on CUAD (RoBERTa and DeBERTa)

In [None]:
# One prompt per spreadsheet row: the question, a "?", then " Detail: " and the definition.
questions = termdefinitionqa.apply(
    lambda row: row['Question'] + "?" + " Detail: " + row['DEFINITION'],
    axis=1,
)

# Run the same prompts against the document with each of the two checkpoints.
predictions_I = run_prediction(
    questions, msa_text, model_path="Rakib/roberta-base-on-cuad")
predictions_II = run_prediction(
    questions, msa_text, model_path="akdeniz27/deberta-v2-xlarge-cuad")

# Placeholder for a third set of predictions; the DocumentReader run below is disabled.
predictions_III = []
# reader = DocumentReader("deepset/bert-base-cased-squad2") 
# for question in termdefinitionqa.Question:
#     reader.tokenize(question, msa_text)
#     predictions_III.append(reader.get_answer())

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


convert squad examples to features: 100%|██████████| 45/45 [14:47<00:00, 19.72s/it]  
add example index and unique id: 100%|██████████| 45/45 [00:00<00:00, 5925.27it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


convert squad examples to features: 100%|██████████| 45/45 [13:18<00:00, 17.73s/it]  
add example index and unique id: 100%|██████████| 45/45 [00:00<00:00, 6242.56it/s]


In [None]:
# Side-by-side answers per question: P1 comes from predictions_II, P2 from
# predictions_I, and P3 is an empty placeholder column.
output = pd.DataFrame({
    "Q": questions,
    "P1": list(predictions_II.values()),
    "P2": list(predictions_I.values()),
    "P3": "",
})


def _first_usable_answer(row):
    """Return the first of P1, P2 that is longer than one character and not 'empty', else P3."""
    # The P2 fallback previously re-tested P1, so P2 could never be selected.
    for column in ("P1", "P2"):
        candidate = row[column]
        if len(candidate) > 1 and candidate != 'empty':
            return candidate
    return row["P3"]


output["Answer"] = output.apply(_first_usable_answer, axis=1)
output.to_csv(opfoldernname+"answer.csv")

In [None]:
## predict based on last hidden layer embedding -  mpnet and deepset-roberta based 

In [None]:
import numpy as np
import time
import hashlib
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering, pipeline
from tqdm import tqdm
import os
# Use the first CUDA device when torch reports one, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# import textract
from scipy.special import softmax
import pandas as pd
from datetime import datetime
# Encoder used to embed queries and document spans.
retriever_checkpoint = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(retriever_checkpoint)
model = AutoModel.from_pretrained(retriever_checkpoint).to(device).eval()

# Question-answering model used to extract an answer from a retrieved span.
reader_checkpoint = "deepset/roberta-large-squad2"
tokenizer_ans = AutoTokenizer.from_pretrained(reader_checkpoint)
model_ans = AutoModelForQuestionAnswering.from_pretrained(reader_checkpoint).to(device).eval()

# Pass an explicit device index to the pipeline only when running on cuda:0.
pipeline_kwargs = {"tokenizer": tokenizer_ans}
if device == 'cuda:0':
    pipeline_kwargs["device"] = 0
pipe = pipeline("question-answering", model_ans, **pipeline_kwargs)
    
def cls_pooling(model_output):
    """Return index 0 along the second axis of ``model_output.last_hidden_state``.

    Judging by the name, that position is taken to be the [CLS] token.
    """
    hidden_states = model_output.last_hidden_state
    return hidden_states[:, 0]

def encode_query(query):
    """Embed ``query`` with the module-level ``tokenizer``/``model`` and return the pooled vector on the CPU.

    The query is tokenized with truncation, run through the model without
    gradient tracking, pooled with ``cls_pooling``, and moved to the CPU.
    """
    tokens = tokenizer(query, truncation=True, return_tensors='pt').to(device)

    with torch.no_grad():
        output = model(**tokens, return_dict=True)

    return cls_pooling(output).cpu()


def _split_into_spans(words, maxlen, stride):
    """Join a list of words into overlapping text spans of roughly ``maxlen`` words."""
    if len(words) < maxlen:
        return [" ".join(words)]

    spans = []
    num_iters = len(words) // maxlen + 1
    for i in range(num_iters):
        if i == 0:
            # First window: maxlen words plus `stride` words of look-ahead.
            window = words[:maxlen + stride]
        else:
            # Later windows: the last `stride` words of the previous chunk
            # followed by the current chunk.
            window = words[(i - 1) * maxlen:i * maxlen][-stride:] + words[i * maxlen:(i + 1) * maxlen]
        spans.append(" ".join(window))
    return spans


def encode_docs(docs,maxlen = 64, stride = 32):
    """Split a document into overlapping spans and embed each span.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        docs: A ``(name, text)`` pair; ``text`` is split on single spaces.
        maxlen: Number of words per chunk.
        stride: Number of words of overlap added to each span.

    Returns:
        ``(embeddings, spans, file_names)``: the float32 array of pooled span
        embeddings, the list of span strings, and a list repeating ``name``
        once per span.
    """
    name, text = docs

    # Texts shorter than maxlen words become one span. This branch used to
    # reference `temp_text` before assignment and raised UnboundLocalError.
    spans = _split_into_spans(text.split(" "), maxlen, stride)
    file_names = [name] * len(spans)

    encoded_input = [
        tokenizer(span, return_tensors='pt', truncation = True).to(device)
        for span in spans
    ]

    embeddings = []
    with torch.no_grad():
        for encoded in tqdm(encoded_input):
            model_output = model(**encoded, return_dict=True)
            embeddings.append(cls_pooling(model_output))

    # Stack the per-span embeddings, swap the first two axes, and convert to float32.
    embeddings = np.float32(torch.stack(embeddings).transpose(0, 1).cpu())

    return embeddings, spans, file_names
   
def predict(query,text):
    """Retrieve the passages of ``text`` that best match ``query`` and extract answers from them.

    The text is cleaned, split and embedded with ``encode_docs``, scored
    against the query embedding, and the top 5 passages are kept. For passages
    whose softmax probability is high enough, the question-answering ``pipe``
    extracts an answer. The cleaned text is also written to ``msa.txt`` and
    the elapsed time is printed.

    Args:
        query: The question string.
        text: The document text to search.

    Returns:
        A DataFrame with columns Passage, Answer, Probabilities and Source in
        which every result row is followed by a row of "---" separators.
    """
    name_to_save = 'msa'
    separator = "---"
    top_k = 5

    # Replace carriage returns, newlines and " . " with single spaces.
    for old in ("\r", "\n", " . "):
        text = text.replace(old, " ")

    doc_emb, doc_text, file_names = encode_docs((name_to_save,text),maxlen = 64, stride = 32)
    doc_emb = doc_emb.reshape(-1, 768)

    with open("{}.txt".format(name_to_save),"w",encoding="utf-8") as f:
        f.write(text)

    start = time.time()
    query_emb = encode_query(query)

    # Dot-product score of the query against every passage, best first.
    scores = np.matmul(query_emb, doc_emb.transpose(1,0))[0].tolist()
    ranked = sorted(zip(doc_text, scores, file_names), key=lambda item: item[1], reverse=True)
    probs = softmax(sorted(scores,reverse = True)[:top_k])

    table = {"Passage":[],"Answer":[],"Probabilities":[],"Source":[]}

    def add_row(passage, answer, probabilities, source):
        # Append one result row followed by a separator row.
        for column, value in zip(table, (passage, answer, probabilities, source)):
            table[column].extend((value, separator))

    for i, (passage, _, names) in enumerate(ranked[:top_k]):
        passage = passage.replace("\n","").replace(" . "," ")

        # Only run the reader on passages with probability above 0.1, or
        # above 0.05 when the passage is among the first three.
        worth_answering = probs[i] > 0.1 or (i < 3 and probs[i] > 0.05)
        if not worth_answering:
            add_row(passage,
                    "no_answer_calculated",
                    "P(p|q): {}".format(round(probs[i],5)),
                    names)
            continue

        ans = pipe({'question':query,'context':passage})
        answer_text = str(ans["answer"])
        probabilities = "P(a|p): {}, P(a|p,q): {}, P(p|q): {}".format(
            round(ans["score"],5),
            round(ans["score"]*probs[i],5),
            round(probs[i],5))
        # Upper-case the answer inside the passage so it stands out.
        add_row(passage.replace(answer_text, answer_text.upper()),
                answer_text.upper(),
                probabilities,
                names)

    df = pd.DataFrame(table)
    print("time: "+ str(time.time()-start))

    return df

## Comparison to the Deepset prediction
### Deepset performed better

In [None]:
import glob
import itertools
import re
from collections import OrderedDict

import pandas as pd

termdefinitionqa = pd.read_excel(filename_with_questions)
questions = termdefinitionqa.apply( lambda x : x['Question']+"?" + " Detail: "+ x['DEFINITION'],axis=1)

# Run the retrieval-based predictor once per distinct question text.
predictions_III = {}
for q in termdefinitionqa.Question:
    if q not in predictions_III:
        predictions_III[q] = predict(q+"?",msa_text)

# For every spreadsheet row, in order, keep the answered passage with the
# highest P(a|p). Iterating over the rows rather than the dict keys keeps
# answer2.csv aligned row-for-row with the questions even if one repeats.
passages = []
for q in termdefinitionqa.Question:
    # .copy() so that adding a column does not write into a filtered view
    # (SettingWithCopyWarning).
    op = predictions_III[q].query("Answer!='---' and Answer!='no_answer_calculated' ").copy()
    op['Prob'] = op.Probabilities.apply(lambda x : float(x.split(",")[0].split("P(a|p):")[1]) )
    best = op.sort_values(["Prob"],ascending=False).iloc[0]
    passages.append((best.Passage, best.Prob))
pd.DataFrame(passages).to_csv(opfoldernname+"answer2.csv")

In [44]:
# Combine the CUAD-model answers (P1, P2) with the retrieval-based passage and
# its probability (P3, PROB) read back from the two CSV files.
cuad_answers = pd.read_csv(
    opfoldernname+"/answer.csv", index_col=0, keep_default_na=False
).drop(["P3","Answer"],axis=1)
retrieval_answers = pd.read_csv(
    opfoldernname+"/answer2.csv", index_col=0, keep_default_na=False
).rename({'0':'P3','1':'PROB'},axis=1)
final = pd.concat([cuad_answers, retrieval_answers],axis=1)


def _pick_final_answer(row):
    """Prefer P1, then P2, then P3 (only when PROB > 0.00001), else "no-answer"."""
    # The P2 fallback previously re-tested P1, so P2 could never be selected.
    for column in ("P1", "P2"):
        candidate = row[column]
        if len(candidate) > 1 and candidate != 'empty':
            return candidate
    return row["P3"] if row['PROB']>0.00001 else "no-answer"


final["Answer"] = final.apply(_pick_final_answer, axis=1)
final[["Q","Answer","PROB"]].to_csv(opfoldernname+"finalop.csv")

'MSA/GenentechInc_Restated_MSSA_01Jan17/output/'