In [181]:
import ast
import itertools
import os
import re

import nltk
import numpy as np
import pandas as pd
from tqdm import tqdm

In [182]:
# Folder that is walked for the contract text files and used as the base for their paths.
PATH_FOLDER_TXT = "./data/CUAD_v1/full_contract_txt/"
# CSV with the clause annotations; read into df_labels.
PATH_LABELS = './data/CUAD_v1/master_clauses.csv'

# Destination CSV for the generated 'candidates' / 'labels' columns.
PATH_OUTPUT = './data/glaw/glaw_data.csv'

In [183]:
# Bare file names (no directory part) of every file found under PATH_FOLDER_TXT,
# nested folders included, in ascending order.
text_file_names = sorted(
    file_name
    for _dirpath, _dirnames, file_names in os.walk(PATH_FOLDER_TXT)
    for file_name in file_names
)

In [184]:
df_labels = pd.read_csv(PATH_LABELS)

# Keep only the governing-law columns, ordered by file name with a fresh 0..n-1 index.
df_main = (
    df_labels[['Filename', 'Governing Law', 'Governing Law-Answer']]
    .sort_values('Filename', axis=0, ignore_index=True)
)

# Bring in the list of the .txt filenames.
# NOTE(review): rows and text files are paired purely by sorted position; nothing here
# checks that 'Filename' and 'text_file_name' refer to the same contract - confirm.
df_main.insert(loc=1, column='text_file_name', value=text_file_names)
df_main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 510 entries, 0 to 509
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Filename              510 non-null    object
 1   text_file_name        510 non-null    object
 2   Governing Law         510 non-null    object
 3   Governing Law-Answer  434 non-null    object
dtypes: object(4)
memory usage: 16.1+ KB


In [185]:
# 'Governing Law' is stored in the CSV as the text of a Python list of strings.
# ast.literal_eval parses literals only, so a malformed or malicious cell cannot
# execute code the way eval() could.
df_main["Governing Law"] = df_main["Governing Law"].apply(ast.literal_eval)

# Blank out rows that have no governing-law clause so the dropna below removes them.
gw_empty_mask = df_main['Governing Law'].apply(lambda it: len(it) == 0)
df_main.loc[gw_empty_mask, :] = np.nan

# Treat placeholder answers as missing. Assigning the result back (instead of calling
# replace(..., inplace=True) on the selected column) also works under pandas
# Copy-on-Write, and np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_main['Governing Law-Answer'] = df_main['Governing Law-Answer'].replace(['[]', '[* * *]'], np.nan)

# Drop every row with a missing value in any column; original index labels are kept.
df_main = df_main.dropna(axis=0, how='any')
df_main.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 429 entries, 0 to 508
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Filename              429 non-null    object
 1   text_file_name        429 non-null    object
 2   Governing Law         429 non-null    object
 3   Governing Law-Answer  429 non-null    object
dtypes: object(4)
memory usage: 16.8+ KB


In [186]:
def pre_process_doc_common(text):
    """Normalize a piece of contract text into a single-line, space-collapsed string.

    The steps run in a fixed order (later steps see the result of earlier ones):

    1. ``#``, newline, non-breaking space (``\\xa0``) and form feed (``\\x0c``)
       are each replaced by a plain space.
    2. ``<omitted>`` is padded with spaces so it can stand as its own token.
    3. A dot surrounded by single spaces (``" . "``) is tightened to ``"."``.
    4. Every underscore becomes a space.
    5. Runs of two or more dashes become a single space.
    6. Runs of asterisks collapse to one ``*``.
    7. Runs of spaces collapse to one space.
    8. Leading and trailing whitespace is stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Characters that are simply turned into a blank.
    for char in ("#", "\n", "\xa0", "\x0c"):
        text = text.replace(char, " ")

    # Insert spaces around the marker to get it as a token in the future.
    text = text.replace('<omitted>', ' <omitted> ')

    # (pattern, replacement) pairs, applied in order. `count` is left at its default
    # (replace all); passing it positionally is deprecated since Python 3.13.
    substitutions = (
        ("\\ \\.\\ ", "."),  # " . " -> "."
        ("_", " "),          # underscores (e.g. fill-in blanks) -> space
        ("--+", " "),        # two or more dashes -> space
        ("\\*+", "*"),       # runs of stars -> single star
        (" +", " "),         # runs of spaces -> single space
    )
    for regex, subst in substitutions:
        text = re.sub(regex, subst, text)

    # Strip leading and trailing whitespace.
    return text.strip()


def get_text_data_cleared(file_path_list, print_text=False, clean_text=True):
    """Read every file in ``file_path_list`` and return the texts as a list.

    Args:
        file_path_list: Paths of the UTF-8 text files to read, in order.
        print_text: When true, print each text right after reading it and again
            after the (optional) cleaning step.
        clean_text: When true, run each text through ``pre_process_doc_common``.

    Returns:
        A list with one string per input path, in the same order.
    """
    text_list = []
    for file_path in tqdm(file_path_list):
        with open(file_path, "r", encoding="utf8") as agreement:
            raw_text = agreement.read()

        if print_text:
            print("Text before cleaning: \n", raw_text)

        processed_text = pre_process_doc_common(raw_text) if clean_text else raw_text

        if print_text:
            print("Text after cleaning: \n", processed_text)

        text_list.append(processed_text)

    return text_list

In [187]:
# Scratch check: prints a string containing a non-breaking space (\xa0) to see how it renders.
print('b\xa0a')

b a


In [188]:
# Full path of each remaining contract, in df_main row order.
file_list = [os.path.join(PATH_FOLDER_TXT, file_name) for file_name in df_main['text_file_name']]
# Read and clean every contract, then store the text as the third column.
text_cleared_list = get_text_data_cleared(file_list, print_text=False, clean_text=True)
df_main.insert(loc=2, column='text', value=text_cleared_list)
df_main.head()

100%|██████████| 429/429 [00:01<00:00, 318.48it/s]


Unnamed: 0,Filename,text_file_name,text,Governing Law,Governing Law-Answer
0,2ThemartComInc_19990826_10-12G_EX-10.10_670028...,2ThemartComInc_19990826_10-12G_EX-10.10_670028...,CO-BRANDING AND ADVERTISING AGREEMENT THIS CO-...,[This Agreement will be governed and construed...,California
1,ABILITYINC_06_15_2020-EX-4.25-SERVICES AGREEME...,ABILITYINC_06_15_2020-EX-4.25-SERVICES AGREEME...,EXHIBIT 4.25 INFORMATION IN THIS EXHIBIT IDENT...,"[This Agreement and any claim, controversy or ...",Israel
2,ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-...,ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-...,EXHIBIT 10.13 JOINT VENTURE AGREEMENT Collecti...,[The Joint Venturers declare that in entering ...,Pennsylvania
3,ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTOR AGR...,ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTOR AGR...,Exhibit 10.31 PURSUANT TO 17 C.F.R. § 240.24B-...,"[This Agreement shall be governed by, and cons...",Germany
4,ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENT A...,ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENT A...,REDACTED COPY CONFIDENTIAL TREATMENT REQUESTED...,[This Agreement shall be governed and construe...,Kansas


In [189]:
def find_labels(labels_to_find, txt, key):
    """Locate every case-insensitive occurrence of each label inside ``txt``.

    Each label is first cleaned with ``pre_process_doc_common`` (the same cleaning
    the document text went through); labels that are empty after cleaning are
    skipped. The cleaned label is matched literally (regex-escaped).

    Args:
        labels_to_find: Iterable of label strings to search for.
        txt: The text to search in.
        key: Tag stored as the third element of every result tuple.

    Returns:
        A list of ``(start, end, key)`` tuples, grouped by label in input order.
        ``start``/``end`` are offsets into ``txt`` itself, so ``txt[start:end]``
        is the matched passage.
    """
    result = []
    for lbl in labels_to_find:
        lbl_cleared = pre_process_doc_common(lbl)
        if not lbl_cleared:
            continue
        # Match on the original text with IGNORECASE instead of on txt.lower():
        # lower() can change the string length (e.g. "\u0130" becomes two characters),
        # which would shift the offsets away from the text they are later applied to,
        # and it re-lowercased the whole document once per label.
        pattern = re.compile(re.escape(lbl_cleared), flags=re.IGNORECASE)
        result.extend((m.start(), m.end(), key) for m in pattern.finditer(txt))

    return result

In [190]:
def sort_df(df):
    """Find the governing-law label spans for every row of ``df``.

    Each label is located in the row's text via ``find_labels`` (which pre-processes
    the label the same way as the text). Rows for which nothing was found are put
    into a separate dict so they can be inspected.

    Args:
        df: Frame with a 'text' column and a 'Governing Law' column holding the labels.

    Returns:
        ``(found, to_inspect)``: two dicts keyed by the row's index label. ``found``
        maps to a non-empty tuple of ``(start, end, 'GOVERNING_LAW')`` spans;
        ``to_inspect`` maps to an empty tuple.
    """
    found = {}
    to_inspect = {}
    for row_label, record in tqdm(df.iterrows()):
        document = record['text']

        # GOVERNING_LAW
        clauses = record['Governing Law']
        spans = ()
        if clauses and len(clauses):
            spans = tuple(find_labels(clauses, document, 'GOVERNING_LAW'))

        # Rows without a single hit go to the inspect dict.
        target = found if len(spans) > 0 else to_inspect
        target[row_label] = spans

    return found, to_inspect

In [191]:
# djson: rows whose label text was located; djson_inspect: rows with no hit at all.
djson, djson_inspect = sort_df(df_main)
# Display the rows that need manual inspection.
djson_inspect

429it [00:00, 1239.96it/s]


{}

In [192]:
# The values are assigned by position: sort_df fills djson in df_main row order, so this
# lines up only while every row of df_main has an entry in djson.
df_main['Governing Law Occurrences'] = djson.values()
df_main.head()

Unnamed: 0,Filename,text_file_name,text,Governing Law,Governing Law-Answer,Governing Law Occurrences
0,2ThemartComInc_19990826_10-12G_EX-10.10_670028...,2ThemartComInc_19990826_10-12G_EX-10.10_670028...,CO-BRANDING AND ADVERTISING AGREEMENT THIS CO-...,[This Agreement will be governed and construed...,California,"((21792, 21946, GOVERNING_LAW),)"
1,ABILITYINC_06_15_2020-EX-4.25-SERVICES AGREEME...,ABILITYINC_06_15_2020-EX-4.25-SERVICES AGREEME...,EXHIBIT 4.25 INFORMATION IN THIS EXHIBIT IDENT...,"[This Agreement and any claim, controversy or ...",Israel,"((21938, 22677, GOVERNING_LAW),)"
2,ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-...,ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-...,EXHIBIT 10.13 JOINT VENTURE AGREEMENT Collecti...,[The Joint Venturers declare that in entering ...,Pennsylvania,"((9526, 9952, GOVERNING_LAW),)"
3,ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTOR AGR...,ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTOR AGR...,Exhibit 10.31 PURSUANT TO 17 C.F.R. § 240.24B-...,"[This Agreement shall be governed by, and cons...",Germany,"((61688, 61949, GOVERNING_LAW),)"
4,ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENT A...,ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENT A...,REDACTED COPY CONFIDENTIAL TREATMENT REQUESTED...,[This Agreement shall be governed and construe...,Kansas,"((18573, 18665, GOVERNING_LAW),)"


In [193]:
# Answers may name several jurisdictions separated by ';' or ',' (optionally followed by a space).
pattern_split_jurisdiction = r'; ?|, ?'
# Clean every answer with the same normalisation used for the contract text.
jurisdictions_preprocessed = list(map(pre_process_doc_common, df_main['Governing Law-Answer']))
# One list of jurisdiction names per row ...
jurisdiction_splitted = [re.split(pattern_split_jurisdiction, location) for location in jurisdictions_preprocessed]
# ... and the same names flattened into a single list.
jurisdictions = list(itertools.chain.from_iterable(jurisdiction_splitted))
df_main['Governing Law-Answer Splitted'] = jurisdiction_splitted
df_main.head()

Unnamed: 0,Filename,text_file_name,text,Governing Law,Governing Law-Answer,Governing Law Occurrences,Governing Law-Answer Splitted
0,2ThemartComInc_19990826_10-12G_EX-10.10_670028...,2ThemartComInc_19990826_10-12G_EX-10.10_670028...,CO-BRANDING AND ADVERTISING AGREEMENT THIS CO-...,[This Agreement will be governed and construed...,California,"((21792, 21946, GOVERNING_LAW),)",[California]
1,ABILITYINC_06_15_2020-EX-4.25-SERVICES AGREEME...,ABILITYINC_06_15_2020-EX-4.25-SERVICES AGREEME...,EXHIBIT 4.25 INFORMATION IN THIS EXHIBIT IDENT...,"[This Agreement and any claim, controversy or ...",Israel,"((21938, 22677, GOVERNING_LAW),)",[Israel]
2,ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-...,ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-...,EXHIBIT 10.13 JOINT VENTURE AGREEMENT Collecti...,[The Joint Venturers declare that in entering ...,Pennsylvania,"((9526, 9952, GOVERNING_LAW),)",[Pennsylvania]
3,ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTOR AGR...,ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTOR AGR...,Exhibit 10.31 PURSUANT TO 17 C.F.R. § 240.24B-...,"[This Agreement shall be governed by, and cons...",Germany,"((61688, 61949, GOVERNING_LAW),)",[Germany]
4,ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENT A...,ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENT A...,REDACTED COPY CONFIDENTIAL TREATMENT REQUESTED...,[This Agreement shall be governed and construe...,Kansas,"((18573, 18665, GOVERNING_LAW),)",[Kansas]


In [194]:
# `states` is imported from the project-local data.data module (its contents are not shown here).
from data.data import states

# Set of distinct names to search for: every jurisdiction from the answers plus `states`.
# NOTE(review): `jurisdictions + states` only works if `states` is a list - confirm.
glaw_search_anchors = set(jurisdictions + states)
# set.remove raises KeyError if 'Washington' is not in the set.
glaw_search_anchors.remove('Washington')  # there are more sophisticated pattern
glaw_search_anchors.add('PRC')  # alias for People's Republic of China
glaw_search_anchors.add('English')  # this form was missed
glaw_search_anchors.add('Italian')  # this form was missed
glaw_search_anchors.add('Swiss')    # this form was missed
print(len(glaw_search_anchors))
glaw_search_anchors

92


{'Alabama',
 'Alaska',
 'Arizona',
 'Arkansas',
 'Australia',
 'Belgium',
 'British Columbia',
 'California',
 'Canada',
 'China',
 'Colombia',
 'Colorado',
 'Connecticut',
 'Delaware',
 'District of Columbia',
 'England',
 'England and Wales',
 'English',
 'Federal Republic of Germany',
 'Florida',
 'Georgia',
 'Germany',
 'Hawaii',
 'Hong Kong',
 'Idaho',
 'Illinois',
 'India',
 'Indiana',
 'Iowa',
 'Israel',
 'Italian',
 'Italy',
 'Japan',
 'Kansas',
 'Kentucky',
 'Louisiana',
 'Maine',
 'Maryland',
 'Massachusetts',
 'Massachussets',
 'Michigan',
 'Minnesota',
 'Mississippi',
 'Missouri',
 'Montana',
 'Nebraska',
 'Netherlands',
 'Nevada',
 'New Hampshire',
 'New Jersey',
 'New Mexico',
 'New York',
 'North Carolina',
 'North Dakota',
 'Nova Scotia',
 'Ohio',
 'Oklahoma',
 'Ontario',
 'Oregon',
 'PRC',
 'Papua New Guinea',
 'Pennsylvania',
 "People's Republic of China",
 'Province of Ontario',
 'Republic of Kazakhstan',
 'Republic of South Africa',
 'Rhode Island',
 'Singapore',
 '

In [195]:
def has_intersection(first_span, second_span):
    """Tell whether two half-open spans overlap.

    Only elements ``[0]`` (start) and ``[1]`` (end, exclusive) of each span are
    read, so longer tuples such as ``(start, end, key)`` are accepted as well.

    Returns:
        True when one span starts inside the other, otherwise False.
    """
    first_start = first_span[0]
    second_start = second_span[0]

    # The second span begins somewhere inside the first one.
    if first_start <= second_start < first_span[1]:
        return True

    # Otherwise they overlap only if the first span begins inside the second one.
    return bool(second_start <= first_start < second_span[1])

In [200]:
# A better approach would be to add the true_occurrences themselves to candidate_text_list,
# plus every candidate occurrence found that does not intersect any of the true_occurrences.
#
# What this cell does, per document:
#   * find every mention of a jurisdiction anchor in the text;
#   * for each mention take the first not-yet-used sentence that overlaps it and emit
#     "previous sentence + that sentence" as one candidate;
#   * label the candidate 1 when the sentence overlaps an annotated governing-law span
#     and contains one of the document's answer jurisdictions, otherwise 0.
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# NOTE(review): the anchors are joined unescaped, without word boundaries and in set
# iteration order, so e.g. 'Kansas' also hits inside 'Arkansas' - confirm this is intended.
pattern_glaw_search = '|'.join(glaw_search_anchors)
regex_glaw_search = re.compile(pattern_glaw_search, flags=re.IGNORECASE)
candidates_list = []
labels_list = []
# `.loc[0:]` selects every row from index label 0 on (presumably kept so the loop can be
# restarted from a later label while debugging).
for idx, row in df_main.loc[0:].iterrows():
    text = row['text']
    true_occurrences = row['Governing Law Occurrences']
    # Work on a copy: the cell holds the very list object stored in df_main (and in
    # jurisdiction_splitted), so appending aliases to it directly would alter that
    # stored data and add the aliases again on every re-run of this cell.
    true_anchors = list(row['Governing Law-Answer Splitted'])
    # crutches: alternative spellings that the answers do not contain
    if "People's Republic of China" in true_anchors:
        true_anchors.append('PRC')
    if 'England' in true_anchors:
        true_anchors.append('English')
    if 'Italy' in true_anchors:
        true_anchors.append('Italian')
    if 'Switzerland' in true_anchors:
        true_anchors.append('Swiss')

    # (start, end) character span of every sentence, plus a flag so that each sentence
    # yields at most one candidate.
    sentences = list(sentence_tokenizer.span_tokenize(text))
    sentences_used = [False] * len(sentences)

    candidate_text_list = []
    candidate_label_list = []
    for match in regex_glaw_search.finditer(text):
        span = match.start(), match.end()
        for s_idx, s in enumerate(sentences):
            if has_intersection(span, s) and not sentences_used[s_idx]:
                sentences_used[s_idx] = True
                candidate_text = text[s[0]: s[1]]
                # Prepend the preceding sentence as context.
                if s_idx > 0:
                    prev_s = sentences[s_idx - 1]
                    candidate_text = text[prev_s[0]: prev_s[1]] + ' ' + candidate_text

                # Positive only if the sentence overlaps an annotated span AND mentions
                # one of this document's answer jurisdictions.
                # NOTE(review): true_a is used as a regex pattern without escaping - confirm.
                is_true = False
                for true_s in true_occurrences:
                    if has_intersection(s, true_s):
                        for true_a in true_anchors:
                            if re.search(true_a, text[s[0]: s[1]], flags=re.IGNORECASE):
                                is_true = True
                                break

                candidate_text_list.append(candidate_text)
                candidate_label_list.append(int(is_true))
                # One candidate per anchor mention: stop at the first matching sentence.
                break

    # workarounds: hand-made corrections for individual documents, keyed by the df_main
    # index label and the position of the candidate within that document
    if idx == 252:
        candidate_label_list[3] = 0
    elif idx == 326:
        del candidate_text_list[3]
        del candidate_label_list[3]
    elif idx == 382:
        candidate_label_list[6] = 0
    elif idx == 448:
        candidate_label_list[2] = 0
    doc_exception_idx_list = [145]  # these docs checked, such behavior is correct
    # Sanity check: the number of positive candidates must equal the number of annotated
    # spans; otherwise dump the details and stop.
    if len(true_occurrences) != sum(candidate_label_list) and idx not in doc_exception_idx_list:
        for s in true_occurrences:
            print(text[s[0]: s[1]])
            print()
        print(true_occurrences)
        print(candidate_text_list[2:])
        print(candidate_label_list)
        raise Exception(f'Error at document {idx}')

    candidates_list.append(candidate_text_list)
    labels_list.append(candidate_label_list)

df_main['candidates'] = candidates_list
df_main['labels'] = labels_list
df_main.head()

Unnamed: 0,Filename,text_file_name,text,Governing Law,Governing Law-Answer,Governing Law Occurrences,Governing Law-Answer Splitted,candidates,labels
0,2ThemartComInc_19990826_10-12G_EX-10.10_670028...,2ThemartComInc_19990826_10-12G_EX-10.10_670028...,CO-BRANDING AND ADVERTISING AGREEMENT THIS CO-...,[This Agreement will be governed and construed...,California,"((21792, 21946, GOVERNING_LAW),)",[California],[CO-BRANDING AND ADVERTISING AGREEMENT THIS CO...,"[0, 0, 0, 1, 0]"
1,ABILITYINC_06_15_2020-EX-4.25-SERVICES AGREEME...,ABILITYINC_06_15_2020-EX-4.25-SERVICES AGREEME...,EXHIBIT 4.25 INFORMATION IN THIS EXHIBIT IDENT...,"[This Agreement and any claim, controversy or ...",Israel,"((21938, 22677, GOVERNING_LAW),)",[Israel],[EXHIBIT 4.25 INFORMATION IN THIS EXHIBIT IDEN...,"[0, 0, 1, 0]"
2,ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-...,ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-...,EXHIBIT 10.13 JOINT VENTURE AGREEMENT Collecti...,[The Joint Venturers declare that in entering ...,Pennsylvania,"((9526, 9952, GOVERNING_LAW),)",[Pennsylvania],[Offices of the Joint Venture. The principal p...,"[0, 0, 1]"
3,ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTOR AGR...,ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTOR AGR...,Exhibit 10.31 PURSUANT TO 17 C.F.R. § 240.24B-...,"[This Agreement shall be governed by, and cons...",Germany,"((61688, 61949, GOVERNING_LAW),)",[Germany],[Exhibit 10.31 PURSUANT TO 17 C.F.R. § 240.24B...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
4,ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENT A...,ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENT A...,REDACTED COPY CONFIDENTIAL TREATMENT REQUESTED...,[This Agreement shall be governed and construe...,Kansas,"((18573, 18665, GOVERNING_LAW),)",[Kansas],[25. APPLICABLE LAW This Agreement shall be go...,"[1, 0, 0, 0]"


In [205]:
# Write only the 'candidates' and 'labels' columns (plus the index) to PATH_OUTPUT.
# NOTE(review): both columns hold Python lists, so each CSV cell receives the list's
# string form - confirm the consumer parses it back.
df_main.to_csv(PATH_OUTPUT, columns=['candidates', 'labels'])