## Paper 2 Data Workflow for Data Extraction - CUADv1 - Inference

#### Sources of information, code and discussions


1. The foundation workflow is from Hugging Face's Token Classification example hosted on Colab [here][1]
2. The models are base models, each trained using a downstream token clasification task, example [here][2]

[1]: https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/token_classification.ipynb
[2]: https://huggingface.co/roberta-base

### Initialize Environment

In [1]:
import os, re, math, random, json, string, csv

import pandas as pd
import numpy as np
from tqdm import tqdm
from IPython.display import display, HTML

import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification, PreTrainedModel, RobertaTokenizerFast

from datasets import load_dataset, ClassLabel, Sequence 

import fitz # pip install PyMuPDF - PDF reader/parser

import spacy
from spacy.lang.en import English

from collections import defaultdict

# Resolve any conflicting libraries
os.environ['KMP_DUPLICATE_LIB_OK']='True'

## Configuration

In [2]:
# Hugging Face model references for Transformer library
models = dict(
    ROBERTA = "roberta-base", # Use for efficiency
    DEBERTA_V2_XL = "microsoft/deberta-v2-xlarge") # Use for accuracy

# RANDOM SEED FOR REPRODUCIBILITY
RANDOM_SEED = 42

# BATCH SIZE
# IDEALLY USE SAME BATCH SIZE FOR INFERENCE AS WAS USED FOR TRAINING
BATCH_SIZES = 2

# WHICH PRE-TRAINED TRANSFORMER TO FINE-TUNE?
MODEL_CHECKPOINT = models['ROBERTA']

### Step1: File and dataset handling

In [3]:
FEATURE_CLASS_LABELS = "feature_class_labels.json"
TEMP_MODEL_OUTPUT_DIR = 'temp_model_output_dir'
SAVED_MODEL = f"p2d-NER-Fine-Tune-Transformer-Final-{MODEL_CHECKPOINT}" # Change for notebook version
TEST_FILE_PATH = "./Test_Docs/"
TEST_DATA_FILE = 'test_data_file.json'
CSV_DATA_FILE = 'legal_agreement_data_file.csv'

In [4]:
# Walk through PDF files and create a dataframe with the names of the files, sorted alpha/num
pdf_files = []
for (dirpath, dirnames, filenames) in os.walk(TEST_FILE_PATH):
    pdf_files.extend(filenames)
# Remove any hidden files lurking in the directory
for i, f in enumerate(pdf_files):
    if f.startswith("."):
        pdf_files.pop(i)
print(f"Uploaded {len(pdf_files)} legal agreements from {TEST_FILE_PATH} folder: ", pdf_files)

Uploaded 5 legal agreements from ./Test_Docs/ folder:  ['4P 060427_WELLSFARGO_MBS_TRUST_YEA.PDF', '2P 05_04_2020-EX-10.3.PDF', '1P 04_24_1998-WFS.PDF', '5P 2020-12-15 H665 OOFFS_657.pdf', '3P 06_11_2020-EX-10.1-JVA.PDF']


### Step2: Pre-processing the data

In [5]:
# Text cleaning function for standard PDF parsing workflow
def pre_process_doc_common(text):
    text = text.replace("\n", " ")  # Simple replacement for "\n"   
    text = text.replace("\xa0", " ")  # Simple replacement for "\xa0"
    text = text.replace("\x0c", " ")  # Simple replacement for "\x0c"
    
    regex = "\ \.\ "
    subst = "."
    text = re.sub(regex, subst, text, 0)  # Get rid of multiple dots
        
    regex = "_"
    subst = " "
    text = re.sub(regex, subst, text, 0)  # Get rid of underscores
       
    regex = "--+"
    subst = " "
    text = re.sub(regex, subst, text, 0)   # Get rid of multiple dashes
        
    regex = "\*+"
    subst = "*"
    text = re.sub(regex, subst, text, 0)  # Get rid of multiple stars
        
    regex = "\ +"
    subst = " "
    text = re.sub(regex, subst, text, 0)  # Get rid of multiple whitespace
    
    text = text.strip()  #Strip leading and trailing whitespace
    return text

In [6]:
# Function to take in the file list, read each file, clean the text and return all agreements in a list
def text_data(test_dir, pdf_files, print_text=False, clean_text=True, max_len=3000):
    text_list = []
    for filename in tqdm(pdf_files):
        agreement = fitz.open(test_dir+filename)
        full_text = ""
        for page in agreement:
            full_text += page.getText('text')#+"\n"
        if print_text:
            print("Text before cleaning: \n", full_text)

        # Run text through cleansing function
        if clean_text:
            full_text = pre_process_doc_common(full_text)
        short_text = full_text[:max_len]
        len_text = len(short_text)

        if print_text:
            print("Text after cleaning: \n", short_text)

        text_list.append([filename, full_text, short_text, len_text])
        
    return text_list

In [7]:
# Run reading and cleaning functions on the list of PDF files in the testing folder
# Use a max_length which is expected to capture the rich text information at the beginning of the document
test_dir = TEST_FILE_PATH
data = text_data(test_dir, pdf_files, print_text=False, clean_text=True, max_len=1000)

# Create dataframe with text
columns = ['File_Name','Full_Text', 'Short_Text', 'Length_Of_Short_Text']
text_df = pd.DataFrame(data=data, columns=columns)

100%|██████████| 5/5 [00:00<00:00, 97.98it/s]


In [8]:
# Have a look at the unstructured data captured so far
text_df

Unnamed: 0,File_Name,Full_Text,Short_Text,Length_Of_Short_Text
0,4P 060427_WELLSFARGO_MBS_TRUST_YEA.PDF,EXHIBIT 10.3 Yield Maintenance Agreement [LOGO...,EXHIBIT 10.3 Yield Maintenance Agreement [LOGO...,1000
1,2P 05_04_2020-EX-10.3.PDF,Ex 10.3 SERVICING AGREEMENT between CURO RECEI...,Ex 10.3 SERVICING AGREEMENT between CURO RECEI...,1000
2,1P 04_24_1998-WFS.PDF,1 EXHIBIT 10.14 OUTSOURCING AGREEMENT This Out...,1 EXHIBIT 10.14 OUTSOURCING AGREEMENT This Out...,1000
3,5P 2020-12-15 H665 OOFFS_657.pdf,DATED 4 DECEMBER 2020 INVESTOR LIMITED and INV...,DATED 4 DECEMBER 2020 INVESTOR LIMITED and INV...,1000
4,3P 06_11_2020-EX-10.1-JVA.PDF,Exhibit 10.1 JOINT VENTURE AGREEMENT THIS JOIN...,Exhibit 10.1 JOINT VENTURE AGREEMENT THIS JOIN...,1000


In [9]:
# What does an agreement look like?
text_df['Full_Text'][0]

'EXHIBIT 10.3 Yield Maintenance Agreement [LOGO UBS] Date: 27 April 2006 To: Wells Fargo Bank, N.A., not individually, but solely as Master Servicer on behalf of Wells Fargo Mortgage Backed Securities 2006-6 Trust ("Counterparty") Attention: Swaps Administration From: UBS AG, London Branch ("UBS AG") Subject: Interest Rate Cap Transaction UBS AG Ref: 37346733 Dear Sirs The purpose of this communication is to confirm the terms and conditions of the Transaction entered into between us on the Trade Date specified below. This Confirmation constitutes a "Confirmation" as referred to in the Master Agreement or Agreement specified below. The definitions contained in the 2000 ISDA Definitions as published by the International Swaps and Derivatives Association, Inc., are incorporated into this Confirmation. In the event of any inconsistency between any of the definitions listed above and this Confirmation, this Confirmation will govern. If you and we are parties to a master agreement that gover

### Step3: Tokenization and feature labels

In [10]:
# We tokenize each agreement prior to bringing into the transformer model
# Create tokens using spaCy
nlp = English()
text_df['tokens'] = text_df['Short_Text'].apply(lambda x: nlp(x))

# Split tokens into a list ready for CSV
text_df['split_tokens'] = text_df['tokens'].apply(lambda x: [tok.text for tok in x])

# Create dummy NER tags for alignment purposes (a bit lazy, but convinient)
text_df['dummy_ner_tags'] = text_df['tokens'].apply(lambda x: [0 for tok in x])

# Serialise the data to JSON for archive
export_columns = ['split_tokens', 'dummy_ner_tags']
export_df = text_df[export_columns]
export_df.to_json(TEST_DATA_FILE, orient="table", index=False)
text_df = text_df.drop(['dummy_ner_tags'], axis=1)

# Re-import the serialized JSON data and create a dataset in the format needed for the transformer
data_files = TEST_DATA_FILE
datasets = load_dataset('json', data_files=data_files, field='data')
print(datasets)

Using custom data configuration default-4142b84e942b24b2


Downloading and preparing dataset json/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/phil/.cache/huggingface/datasets/json/default-4142b84e942b24b2/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset json downloaded and prepared to /home/phil/.cache/huggingface/datasets/json/default-4142b84e942b24b2/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02. Subsequent calls will reuse this data.
DatasetDict({
    train: Dataset({
        features: ['split_tokens', 'dummy_ner_tags'],
        num_rows: 5
    })
})


In [11]:
# Open the label list created in pre-processing corresponding to the ner_tag indices
with open(FEATURE_CLASS_LABELS, 'r') as f:
    label_list = json.load(f)

for n in range(len(label_list)):
    print(n, label_list[n])

0 B-AGMT_DATE
1 B-DOC_NAME
2 B-PARTY
3 I-AGMT_DATE
4 I-DOC_NAME
5 I-PARTY
6 O


In [12]:
# Instantiate the tokenizer
#For RoBERTa-base, need to use RobertaTokenizerFast with add_prefix_space=True to use it with pretokenized inputs.

if MODEL_CHECKPOINT == models['ROBERTA']:
    tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base", add_prefix_space=True)
else:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
        

In [13]:
# Functions deal with split tokens and special tokens used in each Transformer model
def word_id_func(input_ids, print_labs=False):
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    
    word_ids = []
    i=0
    spec_toks = ['[CLS]', '[SEP]', '[PAD]']
    for t in tokens:
        if t in spec_toks:
            word_ids.append(-100)
            print(t, i) if print_labs else None
        elif t.startswith('▁'):
            i += 1
            word_ids.append(i)
            print(t, i) if print_labs else None
        else:
            word_ids.append(i)
            print(t, i) if print_labs else None
        print("Total:", i) if print_labs else None
    return word_ids

def tokenize_and_align_labels(examples, label_all_tokens=False):
    tokenized_inputs = tokenizer(examples["split_tokens"],
                                 truncation=True,
                                 is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["dummy_ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

def tokenize_and_align_labels_deberta(examples, label_all_tokens=False):
    tokenized_inputs = tokenizer(examples["split_tokens"],
                                 truncation=True,
                                 is_split_into_words=True)
    labels = []
    word_ids_list = []
    for input_ids in tokenized_inputs["input_ids"]:
        wids = word_id_func(input_ids, print_labs=False)
        word_ids_list.append(wids)
    
    for i, label in enumerate(examples["dummy_ner_tags"]):
        word_ids = word_ids_list[i]
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx == -100:
                label_ids.append(-100)
            #We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx-1])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx-1] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [14]:
# To apply this function on all the words and labels in our dataset,
# we just use the map method of our dataset object we created earlier.

# 🤗 Datasets warns you when it uses cached files, you can pass load_from_cache_file=False in the
# call to map to not use the cached files and force the preprocessing to be applied again.
if MODEL_CHECKPOINT == models['DEBERTA_V2_XL']:
    tokenize_and_align_labels = tokenize_and_align_labels_deberta

tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True, load_from_cache_file=True)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




### Step 4: Predictions and Inference

Now to run the trained and serialised model on the evaluation set again, NOT the data used for training.

Always take care to ensure that there isn't any data leakage here, eg the same agreements, different agreements from the same set of agreement or potentially different agreements from the same parties. 

The objective is to ensure that the model is able to generalize well to new agreements never seen before.

To match the number of predictions to the original numnber of tokens, need to use: "label_all_tokens=False"

In [15]:
# Load the model and instantiate
loaded_model = AutoModelForTokenClassification.from_pretrained(SAVED_MODEL)

args = TrainingArguments(output_dir = TEMP_MODEL_OUTPUT_DIR,
                         per_device_train_batch_size=BATCH_SIZES,
                         per_device_eval_batch_size=BATCH_SIZES,
                         seed=RANDOM_SEED
                        )

data_collator = DataCollatorForTokenClassification(tokenizer)

# Note instantiation currently takes a bit of time: https://github.com/huggingface/transformers/issues/9205
# Instantiate the predictor
pred_trainer = Trainer(
    loaded_model,
    args,
    data_collator=data_collator,
    tokenizer=tokenizer)

In [16]:
# Extract the predictions
predictions, labels, _ = pred_trainer.predict(tokenized_datasets["train"])
predictions = np.argmax(predictions, axis=2)
text_df['predictions'] = list(predictions)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
text_df['true_predictions'] = true_predictions

# Consolidate all the information into the DataFrame
def data_extract(tuple_list):
    de_list = []
    for tup in tuple_list:
        if tup[1] != 'O':
            de_list.append(tup)
    return de_list

text_df['check_pred'] = list(list(zip(a,b)) for a,b in zip(text_df['split_tokens'], text_df['true_predictions']))
text_df['data_tuples'] = text_df['check_pred'].apply(data_extract)

# Have a look at the label predictions
text_df.head()[['File_Name', 'true_predictions']]

Unnamed: 0,File_Name,true_predictions
0,4P 060427_WELLSFARGO_MBS_TRUST_YEA.PDF,"[O, O, B-DOC_NAME, I-DOC_NAME, I-DOC_NAME, O, ..."
1,2P 05_04_2020-EX-10.3.PDF,"[O, O, B-DOC_NAME, I-DOC_NAME, O, B-PARTY, I-P..."
2,1P 04_24_1998-WFS.PDF,"[O, O, O, B-DOC_NAME, I-DOC_NAME, O, B-DOC_NAM..."
3,5P 2020-12-15 H665 OOFFS_657.pdf,"[B-AGMT_DATE, B-AGMT_DATE, I-AGMT_DATE, I-AGMT..."
4,3P 06_11_2020-EX-10.1-JVA.PDF,"[O, O, B-DOC_NAME, I-DOC_NAME, I-DOC_NAME, O, ..."


### Step5: Data Extraction

In [17]:
# Functions to extract each important data point based on the model's labeling of each token

def extract_agreement_date(tuple_list):
    for d in tuple_list:
        if d[1] == "B-AGMT_DATE":
            temp_date=d[0]
        elif d[1] == "I-AGMT_DATE":
            temp_date = temp_date + " " + d[0]
        else:
            continue
    return temp_date

text_df['agmt_date'] = text_df['data_tuples'].apply(extract_agreement_date)

def extract_agreement_name(tuple_list):
    for n in tuple_list:
        if n[1] == "B-DOC_NAME":
            temp_name=n[0]
        elif n[1] == "I-DOC_NAME":
            temp_name = temp_name + " " + n[0]
        else:
            continue
    return temp_name

text_df['agmt_name'] = text_df['data_tuples'].apply(extract_agreement_name)

def extract_agreement_parties(tuple_list):
    data_dict = defaultdict(list)
    for i, p in enumerate(tuple_list):
        if p[1] == "B-PARTY":
            temp_party=p[0]
            if i == (len(tuple_list)-1):
                data_dict["Parties"].append(temp_party)
            elif tuple_list[i+1][1] != "I-PARTY":
                data_dict["Parties"].append(temp_party)
        elif p[1] == "I-PARTY":
            temp_party = temp_party + " " + p[0]
            if i == (len(tuple_list)-1):
                data_dict["Parties"].append(temp_party)
            elif tuple_list[i+1][1] != "I-PARTY":
                data_dict["Parties"].append(temp_party)

    return list(dict.fromkeys(data_dict['Parties']))

text_df['agmt_parties'] = text_df['data_tuples'].apply(extract_agreement_parties)

# Create a dataframe with just the information we want to keep and 
export_df = text_df[['File_Name', 'agmt_name', 'agmt_date', 'agmt_parties', 'Full_Text']].copy()

# Let's have a look
export_df.head()

Unnamed: 0,File_Name,agmt_name,agmt_date,agmt_parties,Full_Text
0,4P 060427_WELLSFARGO_MBS_TRUST_YEA.PDF,Yield Maintenance Agreement,27 April 2006,"[Wells Fargo Bank , N.A., Wells Fargo Mortgage...",EXHIBIT 10.3 Yield Maintenance Agreement [LOGO...
1,2P 05_04_2020-EX-10.3.PDF,SERVICING AGREEMENT,"April 8 , 2020","[CURO RECEIVABLES FINANCE II , LLC, CURO MANAG...",Ex 10.3 SERVICING AGREEMENT between CURO RECEI...
2,1P 04_24_1998-WFS.PDF,Outsourcing Agreement,"January 1 , 1998","[Sykes HealthPlan Services , Inc., HealthPlan ...",1 EXHIBIT 10.14 OUTSOURCING AGREEMENT This Out...
3,5P 2020-12-15 H665 OOFFS_657.pdf,OPTION AGREEMENT FOR FUTURE SHARES,4 December 2020,"[INVESTOR LIMITED, INVESTMENT LIMITED]",DATED 4 DECEMBER 2020 INVESTOR LIMITED and INV...
4,3P 06_11_2020-EX-10.1-JVA.PDF,JOINT VENTURE AGREEMENT,"20th day of Friday , March 2020","[BorrowMoney.com , inc, JVLS , LLC, Vaccines 2Go]",Exhibit 10.1 JOINT VENTURE AGREEMENT THIS JOIN...


In [18]:
# Example data
sample=0
print("File Name: \t\t",export_df.iloc[sample][0])
print("Agreement Name: \t",export_df.iloc[sample][1])
print("Agreement Date: \t",export_df.iloc[sample][2])
print("Agreement Parties:")
for p in export_df.iloc[sample][3]:
    print("\t\t\t", p)

File Name: 		 4P 060427_WELLSFARGO_MBS_TRUST_YEA.PDF
Agreement Name: 	 Yield Maintenance Agreement
Agreement Date: 	 27 April 2006
Agreement Parties:
			 Wells Fargo Bank , N.A.
			 Wells Fargo Mortgage Backed Securities 2006 - 6 Trust
			 UBS AG


In [19]:
# Export to CSV file, upload to a database table or some other structured data format, we are done.
export_df.to_csv(CSV_DATA_FILE)