# 1. Dataset + schema (CUAD → normalized tables + document model)

In [1]:
from datasets import load_dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the CUAD-QA dataset from the Hugging Face Hub, pinned to one commit via
# `revision`, with its files cached under ./hf_cache_cuadqa.
qa = load_dataset(
    "theatticusproject/cuad-qa",
    revision="53fc9be1de79a35a82ac36f33198a753df949523",  # commit where script was deleted
    # NOTE(review): "force_redownload" asks for a fresh download on every run of
    # this cell, even when the cache directory is already populated — confirm it
    # is still needed; dropping it would make Restart & Run All much cheaper.
    download_mode="force_redownload",
    cache_dir="./hf_cache_cuadqa",
)

Generating train split: 100%|██████████| 22450/22450 [00:27<00:00, 802.70 examples/s]
Generating test split: 100%|██████████| 4182/4182 [00:05<00:00, 804.34 examples/s]


In [3]:
qa

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 22450
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 4182
    })
})

### We'll use 2 dataset sources

- CUAD-QA (Parquet): for learning schema, labels, evaluation

- Raw CUAD PDFs (from theatticusproject/cuad files): for ingestion pipeline with pypdf

In [4]:
qa["train"].column_names

['id', 'title', 'context', 'question', 'answers']

We create:

- a documents table (one row per contract)

- an annotations table (one row per (contract, question))

- a stable doc_id you can reuse later for PDFs, Pinecone, SQLite, etc.

In [5]:
import pandas as pd

train_df = qa["train"].to_pandas()
train_df.head(3)


Unnamed: 0,id,title,context,question,answers
0,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,EXHIBIT 10.6\n\n ...,Highlight the parts (if any) of this contract ...,"{'text': ['DISTRIBUTOR AGREEMENT'], 'answer_st..."
1,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,EXHIBIT 10.6\n\n ...,Highlight the parts (if any) of this contract ...,"{'text': ['Distributor'], 'answer_start': [244]}"
2,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,EXHIBIT 10.6\n\n ...,Highlight the parts (if any) of this contract ...,"{'text': ['Electric City of Illinois L.L.C.'],..."


In [6]:
train_df.loc[2,"question"]

'Highlight the parts (if any) of this contract related to "Parties" that should be reviewed by a lawyer. Details: The two or more parties who signed the contract'

In [7]:
train_df.loc[2,"answers"]

{'text': array(['Electric City of Illinois L.L.C.'], dtype=object),
 'answer_start': array([49574], dtype=int32)}

In [56]:
train_df.loc[2,"context"]

'EXHIBIT 10.6\n\n                              DISTRIBUTOR AGREEMENT\n\n         THIS  DISTRIBUTOR  AGREEMENT (the  "Agreement")  is made by and between Electric City Corp.,  a Delaware  corporation  ("Company")  and Electric City of Illinois LLC ("Distributor") this 7th day of September, 1999.\n\n                                    RECITALS\n\n         A. The  Company\'s  Business.  The Company is  presently  engaged in the business  of selling an energy  efficiency  device,  which is  referred to as an "Energy  Saver"  which may be improved  or  otherwise  changed  from its present composition (the "Products").  The Company may engage in the business of selling other  products  or  other  devices  other  than  the  Products,  which  will be considered  Products if Distributor  exercises its options pursuant to Section 7 hereof.\n\n         B. Representations.  As an inducement to the Company to enter into this Agreement,  the  Distributor  has  represented  that  it has or  will  hav

In [8]:
print("rows:", len(train_df))
print("unique titles:", train_df["title"].nunique())
#Each contract appears many times because each row is a different question/label asked about the same contract text.

rows: 22450
unique titles: 408


In [9]:
contexts_per_title = train_df.groupby("title")["context"].nunique().sort_values(ascending=False)
contexts_per_title.head(10)


title
2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-Branding Agreement_ Agency Agreement    1
ABILITYINC_06_15_2020-EX-4.25-SERVICES AGREEMENT                                                    1
ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTOR AGREEMENT                                                1
ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENT AGREEMENT                                              1
ADAPTIMMUNETHERAPEUTICSPLC_04_06_2017-EX-10.11-STRATEGIC ALLIANCE AGREEMENT                         1
ADIANUTRITION,INC_04_01_2005-EX-10.D2-RESELLER AGREEMENT                                            1
ADMA BioManufacturing, LLC -  Amendment #3 to Manufacturing Agreement                               1
ADUROBIOTECH,INC_06_02_2020-EX-10.7-CONSULTING AGREEMENT                                            1
ADUROBIOTECH,INC_06_02_2020-EX-10.7-CONSULTING AGREEMENT(1)                                         1
AIRTECHINTERNATIONALGROUPINC_05_08_2000-EX-10.4-FRANCHISE AGREEMENT         

### Create doc_id for each contract

In [10]:
"hello".encode("utf-8")

b'hello'

In [11]:
import hashlib
def make_doc_id(title):
    """Return a short, deterministic document ID derived from a contract title.

    The title is UTF-8 encoded and hashed with SHA-256; only the first 16 hex
    characters of the digest are kept, so the same title always maps to the
    same ID.
    """
    hasher = hashlib.sha256()
    hasher.update(title.encode("utf-8"))
    hex_digest = hasher.hexdigest()
    return hex_digest[:16]

# Documents table: one row per contract title. When a title occurs on several
# rows, only its first occurrence is kept, and the contract body column
# "context" is exposed as "text".
docs_df = (
    train_df.loc[:, ["title", "context"]]
    .drop_duplicates(subset="title", keep="first")
    .rename(columns={"context": "text"})
    .copy()
)
docs_df

Unnamed: 0,title,text
0,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,EXHIBIT 10.6\n\n ...
68,"WHITESMOKE,INC_11_08_2011-EX-10.26-PROMOTION A...",Exhibit 10.26 CONFIDENTIAL TREATMENT HAS BE...
124,NELNETINC_04_08_2020-EX-1-JOINT FILING AGREEMENT,Exhibit 1\n\nJOINT FILING AGREEMENT\n\nThe und...
166,ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENT A...,REDACTED COPY\n\nCONFIDENTIAL TREATMENT REQUES...
217,"KIROMICBIOPHARMA,INC_05_11_2020-EX-10.23-CONSU...",Exhibit 10.23 Corporate Address Fannin South P...
...,...,...
22172,CcRealEstateIncomeFundadv_20181205_POS 8C_EX-9...,Exhibit 99(h)(3) WHOLESALE MARKETING AGREEMENT...
22218,"BLUEROCKRESIDENTIALGROWTHREIT,INC_06_01_2016-E...","Exhibit 1.1 400,000 Shares BLUEROCK RESIDE..."
22267,"TALLGRASSENERGY,LP_02_20_2020-EX-99.26-JOINT F...",Exhibit 26\n\nJOINT FILING AGREEMENT\n\nPursua...
22317,KINGPHARMACEUTICALSINC_08_09_2006-EX-10.1-PROM...,Exhibit 10.1\n\n\n\nPROMOTION AGREEMENT\n\nby ...


In [12]:
docs_df["doc_id"] = docs_df["title"].apply(make_doc_id)
docs_df["source"] = "cuad-qa"
docs_df.head()

Unnamed: 0,title,text,doc_id,source
0,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,EXHIBIT 10.6\n\n ...,f71b4149a185d016,cuad-qa
68,"WHITESMOKE,INC_11_08_2011-EX-10.26-PROMOTION A...",Exhibit 10.26 CONFIDENTIAL TREATMENT HAS BE...,43ab152a17a15599,cuad-qa
124,NELNETINC_04_08_2020-EX-1-JOINT FILING AGREEMENT,Exhibit 1\n\nJOINT FILING AGREEMENT\n\nThe und...,644b67c819fbca9c,cuad-qa
166,ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENT A...,REDACTED COPY\n\nCONFIDENTIAL TREATMENT REQUES...,2d790a4a4132cc53,cuad-qa
217,"KIROMICBIOPHARMA,INC_05_11_2020-EX-10.23-CONSU...",Exhibit 10.23 Corporate Address Fannin South P...,b9d11c50bada2ade,cuad-qa


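We will use an SQLite DB, so we want a stable ID for each (document, question) pair. Note that rows sharing the same document and question get the same ID, so on its own it is not a unique per-row key.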

In [13]:
def make_ann_id(doc_id: str, question: str) -> str:
    """
    Build a deterministic 16-character ID for a (doc_id, question) pair.

    The two parts are joined with "::", UTF-8 encoded and hashed with SHA-256;
    only the leading 16 hex characters of the digest are returned.
    """
    pair_key = "::".join((doc_id, question))
    digest = hashlib.sha256(pair_key.encode("utf-8"))
    return digest.hexdigest()[:16]


In [14]:
ann_df = train_df[["title", "question", "answers"]].copy()
ann_df.head(2)


Unnamed: 0,title,question,answers
0,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,Highlight the parts (if any) of this contract ...,"{'text': ['DISTRIBUTOR AGREEMENT'], 'answer_st..."
1,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,Highlight the parts (if any) of this contract ...,"{'text': ['Distributor'], 'answer_start': [244]}"


In [15]:
# Derive every annotation row's doc_id from its contract title, in row order.
doc_ids = [make_doc_id(contract_title) for contract_title in ann_df["title"]]

ann_df["doc_id"] = doc_ids
ann_df.head(2)

Unnamed: 0,title,question,answers,doc_id
0,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,Highlight the parts (if any) of this contract ...,"{'text': ['DISTRIBUTOR AGREEMENT'], 'answer_st...",f71b4149a185d016
1,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,Highlight the parts (if any) of this contract ...,"{'text': ['Distributor'], 'answer_start': [244]}",f71b4149a185d016


In [16]:
ann_df['answers'][9]

{'text': array(['The term of this  Agreement  shall be ten (10)                            years (the "Term")  which shall  commence on the date                            upon which the Company  delivers to  Distributor  the                            last Sample, as defined  hereinafter.'],
       dtype=object),
 'answer_start': array([5268], dtype=int32)}

In [17]:
def extract_answer_texts(answers_obj):
    """Return the value stored under "text" in an answers dict.

    Anything that is not a dict, or a dict without a "text" key, yields an
    empty list instead.
    """
    if not isinstance(answers_obj, dict):
        return []
    if "text" not in answers_obj:
        return []
    return answers_obj["text"]

def extract_answer_starts(answers_obj):
    """Return the value stored under "answer_start" in an answers dict.

    Anything that is not a dict, or a dict without an "answer_start" key,
    yields an empty list instead.
    """
    if not isinstance(answers_obj, dict):
        return []
    if "answer_start" not in answers_obj:
        return []
    return answers_obj["answer_start"]

In [18]:
extract_answer_texts(ann_df['answers'][9])

array(['The term of this  Agreement  shall be ten (10)                            years (the "Term")  which shall  commence on the date                            upon which the Company  delivers to  Distributor  the                            last Sample, as defined  hereinafter.'],
      dtype=object)

In [19]:
# Split each raw answers object into two parallel per-row values: the answer
# strings and their start offsets.
answer_texts_col = [extract_answer_texts(raw_answers) for raw_answers in ann_df["answers"]]
answer_starts_col = [extract_answer_starts(raw_answers) for raw_answers in ann_df["answers"]]

ann_df["answer_texts"] = answer_texts_col
ann_df["answer_starts"] = answer_starts_col

ann_df.head(2)

Unnamed: 0,title,question,answers,doc_id,answer_texts,answer_starts
0,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,Highlight the parts (if any) of this contract ...,"{'text': ['DISTRIBUTOR AGREEMENT'], 'answer_st...",f71b4149a185d016,[DISTRIBUTOR AGREEMENT],[44]
1,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,Highlight the parts (if any) of this contract ...,"{'text': ['Distributor'], 'answer_start': [244]}",f71b4149a185d016,[Distributor],[244]


In [20]:
# make annotation id (doc_id, question)
# Build one annotation ID per row from its (doc_id, question) values.
#
# The column values are iterated directly rather than looked up with
# ann_df.loc[i, ...] for i in range(len(ann_df)): .loc is label-based, so that
# form only works while the index is exactly 0..n-1, and per-cell lookups are
# slow on a frame of this size.
annotation_ids = [
    make_ann_id(row_doc_id, row_question)
    for row_doc_id, row_question in zip(ann_df["doc_id"], ann_df["question"])
]

# NOTE: the ID depends only on (doc_id, question), so rows that share both
# values receive the same annotation_id — it is not a unique per-row key.
ann_df["annotation_id"] = annotation_ids
ann_df.head(2)

Unnamed: 0,title,question,answers,doc_id,answer_texts,answer_starts,annotation_id
0,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,Highlight the parts (if any) of this contract ...,"{'text': ['DISTRIBUTOR AGREEMENT'], 'answer_st...",f71b4149a185d016,[DISTRIBUTOR AGREEMENT],[44],eadada7bb423fc45
1,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,Highlight the parts (if any) of this contract ...,"{'text': ['Distributor'], 'answer_start': [244]}",f71b4149a185d016,[Distributor],[244],81d97df4a77d9de8


In [21]:
ann_df = ann_df.rename(columns={"question": "label"})
ann_df = ann_df[["annotation_id", "doc_id", "label", "answer_texts", "answer_starts"]]
ann_df.head()

Unnamed: 0,annotation_id,doc_id,label,answer_texts,answer_starts
0,eadada7bb423fc45,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[DISTRIBUTOR AGREEMENT],[44]
1,81d97df4a77d9de8,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Distributor],[244]
2,81d97df4a77d9de8,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Electric City of Illinois L.L.C.],[49574]
3,81d97df4a77d9de8,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Electric City of Illinois LLC],[212]
4,81d97df4a77d9de8,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Company],[197]


In [22]:
# Check one row: starts should match texts count
row = ann_df.iloc[0]
print(len(row["answer_texts"]), len(row["answer_starts"]))
print(row["label"])
print(row["answer_texts"][:2])
print(row["answer_starts"][:2])


1 1
Highlight the parts (if any) of this contract related to "Document Name" that should be reviewed by a lawyer. Details: The name of the contract
['DISTRIBUTOR AGREEMENT']
[44]


### verify the substring actually matches the context

In [23]:
# get the corresponding raw row from train_df so we can access the original context
raw = train_df.iloc[0]
context = raw["context"]

answer = row["answer_texts"][0]
start = row["answer_starts"][0]
end = start + len(answer)

print("answer:", repr(answer))
print("start:", start, "end:", end)
print("context slice:", repr(context[start:end]))
print("match:", context[start:end] == answer)


answer: 'DISTRIBUTOR AGREEMENT'
start: 44 end: 65
context slice: 'DISTRIBUTOR AGREEMENT'
match: True


## Chunking: start with chunking one document

In [24]:
doc = docs_df.iloc[0]
doc_id = doc["doc_id"]
text = doc["text"]
print("doc_id:", doc_id)
print("chars:", len(text))
print(text[:400])

doc_id: f71b4149a185d016
chars: 54290
EXHIBIT 10.6

                              DISTRIBUTOR AGREEMENT

         THIS  DISTRIBUTOR  AGREEMENT (the  "Agreement")  is made by and between Electric City Corp.,  a Delaware  corporation  ("Company")  and Electric City of Illinois LLC ("Distributor") this 7th day of September, 1999.

                                    RECITALS

         A. The  Company's  Business.  The Company is  present


In [25]:
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 150):
    """Split ``text`` into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so each
    chunk repeats the last ``overlap`` characters of the previous one. The
    final chunk is cut at the end of the text and may be shorter.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must be
            non-negative and smaller than ``chunk_size``.

    Returns:
        A list of dicts in document order, each with the keys ``chunk_index``
        (0-based position), ``start_char`` / ``end_char`` (half-open character
        range into ``text``) and ``text`` (the slice itself). An empty ``text``
        gives an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive, ``overlap`` is negative,
            or ``overlap`` is not smaller than ``chunk_size``.
    """
    # Reject sizes that would produce empty chunks, negative slice bounds or
    # gaps between chunks instead of silently returning nonsense.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap < 0:
        raise ValueError("overlap must not be negative")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    # The checks above guarantee stride >= 1, so the loop always advances.
    stride = chunk_size - overlap
    n = len(text)

    chunks = []
    start = 0
    while start < n:
        end = min(start + chunk_size, n)
        chunks.append({
            "chunk_index": len(chunks),
            "start_char": start,
            "end_char": end,
            "text": text[start:end],
        })

        # The chunk that reaches the end of the text is the last one; stepping
        # further would only emit a tail already covered by this chunk.
        if end == n:
            break

        start += stride

    return chunks


In [26]:
chunks_doc1 = chunk_text(text, chunk_size=1000, overlap=150)
print('num chunks', len(chunks_doc1))
print("num chunks:", len(chunks_doc1))
print("first chunk range:", chunks_doc1[0]["start_char"], chunks_doc1[0]["end_char"])
print("last chunk range:", chunks_doc1[-1]["start_char"], chunks_doc1[-1]["end_char"])


num chunks 64
num chunks: 64
first chunk range: 0 1000
last chunk range: 53550 54290


In [28]:
ranges_ok = all(chunks_doc1[i]["start_char"] < chunks_doc1[i+1]["start_char"] for i in range(len(chunks_doc1)-1))
print("ranges increasing:", ranges_ok)


ranges increasing: True


### Map answer span to the chunk that contains it (find which chunk(s) cover that [start, end) range.)

In [None]:
row = ann_df.iloc[0]
answer_text = row["answer_texts"][0]
answer_start = row["answer_starts"][0]
answer_end = answer_start + len(answer_text)

print("answer_text:", repr(answer_text))
print("answer_start:", answer_start)
print("answer_end:", answer_end)

answer_text: 'DISTRIBUTOR AGREEMENT'
answer_start: 44
answer_end: 65


In [33]:
doc_id = row["doc_id"]
print(docs_df["doc_id"] == doc_id)
docs_df.loc[docs_df["doc_id"] == doc_id, "text"]

0         True
68       False
124      False
166      False
217      False
         ...  
22172    False
22218    False
22267    False
22317    False
22401    False
Name: doc_id, Length: 408, dtype: bool


0    EXHIBIT 10.6\n\n                              ...
Name: text, dtype: object

In [34]:
doc_text = docs_df.loc[docs_df["doc_id"] == doc_id, "text"].iloc[0]
print("doc chars:", len(doc_text))
print(doc_text[:120])


doc chars: 54290
EXHIBIT 10.6

                              DISTRIBUTOR AGREEMENT

         THIS  DISTRIBUTOR  AGREEMENT (the  "Agreemen


In [35]:
chunks = chunk_text(doc_text, chunk_size=1000, overlap=150)
print("num chunks:", len(chunks))

num chunks: 64


In [36]:
chunks

[{'chunk_index': 0,
  'start_char': 0,
  'end_char': 1000,
  'text': 'EXHIBIT 10.6\n\n                              DISTRIBUTOR AGREEMENT\n\n         THIS  DISTRIBUTOR  AGREEMENT (the  "Agreement")  is made by and between Electric City Corp.,  a Delaware  corporation  ("Company")  and Electric City of Illinois LLC ("Distributor") this 7th day of September, 1999.\n\n                                    RECITALS\n\n         A. The  Company\'s  Business.  The Company is  presently  engaged in the business  of selling an energy  efficiency  device,  which is  referred to as an "Energy  Saver"  which may be improved  or  otherwise  changed  from its present composition (the "Products").  The Company may engage in the business of selling other  products  or  other  devices  other  than  the  Products,  which  will be considered  Products if Distributor  exercises its options pursuant to Section 7 hereof.\n\n         B. Representations.  As an inducement to the Company to enter into this Agree

### Find which chunk contains the answer span: an answer span is inside a chunk if
`chunk_start <= answer_start and answer_end <= chunk_end`

In [37]:
# Keep only the chunks whose [start_char, end_char) range fully covers the
# answer span [answer_start, answer_end).
containing = [
    candidate
    for candidate in chunks
    if candidate["start_char"] <= answer_start and answer_end <= candidate["end_char"]
]

In [39]:
print("chunks containing span:", len(containing))
if containing:
    c = containing[0]
    print("chunk_index:", c["chunk_index"])
    print("chunk range:", c["start_char"], c["end_char"])

chunks containing span: 1
chunk_index: 0
chunk range: 0 1000


### Highlight answer in chunk
convert doc coordinates -> chunk coordinates

relative_start = answer_start - chunk_start

relative_end = answer_end - chunk_start

then print

`chunk_text_str[:relative_start] + "[[" + ANSWER + "]]" + chunk_text_str[relative_end:]`


In [None]:
print("answer_start:", answer_start, "answer_end:", answer_end)
print("num chunks:", len(chunks))
print("chunks containing span:", len(containing))

answer_start: 44 answer_end: 65
num chunks: 64
chunks containing span: 1


In [44]:
chunk = containing[0]
chunk_start = chunk["start_char"]
chunk_end = chunk["end_char"]
print(f"chunk_start: {chunk_start}, chunk_end: {chunk_end}")
relative_start = answer_start - chunk_start
relative_end = answer_end - chunk_start
print(f"relative start: {relative_start}, relative_end: {relative_end}")
print("chunk_start:", chunk_start, "chunk_end:", chunk_end)
print("relative_start:", relative_start, "relative_end:", relative_end)
print("chunk length:", len(chunk["text"]))

chunk_start: 0, chunk_end: 1000
relative start: 44, relative_end: 65
chunk_start: 0 chunk_end: 1000
relative_start: 44 relative_end: 65
chunk length: 1000


In [42]:
chunk_text_str = chunk["text"]

highlighted = (
    chunk_text_str[:relative_start]
    + "[["
    + chunk_text_str[relative_start:relative_end]
    + "]]"
    + chunk_text_str[relative_end:]
)

print(highlighted)

EXHIBIT 10.6

                              [[DISTRIBUTOR AGREEMENT]]

         THIS  DISTRIBUTOR  AGREEMENT (the  "Agreement")  is made by and between Electric City Corp.,  a Delaware  corporation  ("Company")  and Electric City of Illinois LLC ("Distributor") this 7th day of September, 1999.

                                    RECITALS

         A. The  Company's  Business.  The Company is  presently  engaged in the business  of selling an energy  efficiency  device,  which is  referred to as an "Energy  Saver"  which may be improved  or  otherwise  changed  from its present composition (the "Products").  The Company may engage in the business of selling other  products  or  other  devices  other  than  the  Products,  which  will be considered  Products if Distributor  exercises its options pursuant to Section 7 hereof.

         B. Representations.  As an inducement to the Company to enter into this Agreement,  the  Distributor  has  represented  that  it has or  will  have  the f

In [43]:
WINDOW = 80
left = max(0, relative_start - WINDOW)
right = min(len(chunk_text_str), relative_end + WINDOW)
snippet = (
    chunk_text_str[left:relative_start]
    + "[["
    + chunk_text_str[relative_start:relative_end]
    + "]]"
    + chunk_text_str[relative_end:right]
)

print(snippet)


EXHIBIT 10.6

                              [[DISTRIBUTOR AGREEMENT]]

         THIS  DISTRIBUTOR  AGREEMENT (the  "Agreement")  is made by and betwe


In [45]:
def make_chunk_id(doc_id, chunk_index, start_char, end_char):
    """Return a deterministic 16-character ID for one chunk of a document.

    The document ID, the chunk's position and its character range are combined
    into a single key string, which is UTF-8 encoded and hashed with SHA-256;
    the first 16 hex characters of the digest form the ID.
    """
    key_bytes = f"{doc_id}::chunk{chunk_index}::{start_char}-{end_char}".encode("utf-8")
    hex_digest = hashlib.sha256(key_bytes).hexdigest()
    return hex_digest[:16]

In [46]:
for c in chunks:
    c["doc_id"] = doc_id
    c["chunk_id"] = make_chunk_id(doc_id, c["chunk_index"], c["start_char"], c["end_char"])

In [47]:
chunks[0]

{'chunk_index': 0,
 'start_char': 0,
 'end_char': 1000,
 'text': 'EXHIBIT 10.6\n\n                              DISTRIBUTOR AGREEMENT\n\n         THIS  DISTRIBUTOR  AGREEMENT (the  "Agreement")  is made by and between Electric City Corp.,  a Delaware  corporation  ("Company")  and Electric City of Illinois LLC ("Distributor") this 7th day of September, 1999.\n\n                                    RECITALS\n\n         A. The  Company\'s  Business.  The Company is  presently  engaged in the business  of selling an energy  efficiency  device,  which is  referred to as an "Energy  Saver"  which may be improved  or  otherwise  changed  from its present composition (the "Products").  The Company may engage in the business of selling other  products  or  other  devices  other  than  the  Products,  which  will be considered  Products if Distributor  exercises its options pursuant to Section 7 hereof.\n\n         B. Representations.  As an inducement to the Company to enter into this Agreement

In [48]:
chunk

{'chunk_index': 0,
 'start_char': 0,
 'end_char': 1000,
 'text': 'EXHIBIT 10.6\n\n                              DISTRIBUTOR AGREEMENT\n\n         THIS  DISTRIBUTOR  AGREEMENT (the  "Agreement")  is made by and between Electric City Corp.,  a Delaware  corporation  ("Company")  and Electric City of Illinois LLC ("Distributor") this 7th day of September, 1999.\n\n                                    RECITALS\n\n         A. The  Company\'s  Business.  The Company is  presently  engaged in the business  of selling an energy  efficiency  device,  which is  referred to as an "Energy  Saver"  which may be improved  or  otherwise  changed  from its present composition (the "Products").  The Company may engage in the business of selling other  products  or  other  devices  other  than  the  Products,  which  will be considered  Products if Distributor  exercises its options pursuant to Section 7 hereof.\n\n         B. Representations.  As an inducement to the Company to enter into this Agreement

In [50]:
ann_df.index[9]

9

In [51]:
ann_df.head()

Unnamed: 0,annotation_id,doc_id,label,answer_texts,answer_starts
0,eadada7bb423fc45,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[DISTRIBUTOR AGREEMENT],[44]
1,81d97df4a77d9de8,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Distributor],[244]
2,81d97df4a77d9de8,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Electric City of Illinois L.L.C.],[49574]
3,81d97df4a77d9de8,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Electric City of Illinois LLC],[212]
4,81d97df4a77d9de8,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Company],[197]


In [52]:
evidence_ids = [chunk["chunk_id"]]
print("evidence_ids:", evidence_ids)

evidence_ids: ['74d4359635077bf8']


In [53]:
# Create the target column up front with object dtype, then write the list into
# a single cell. Without this, the first assignment depends on pandas creating
# the column on the fly while being handed a list-like value; an existing
# object column stores the list as-is, whatever its length.
if "evidence_chunk_ids" not in ann_df.columns:
    ann_df["evidence_chunk_ids"] = pd.Series(float("nan"), index=ann_df.index, dtype="object")

# Attach the evidence chunk IDs to the first annotation row only.
ann_df.at[ann_df.index[0], "evidence_chunk_ids"] = evidence_ids
ann_df.iloc[0][["label", "answer_texts", "evidence_chunk_ids"]]

label                 Highlight the parts (if any) of this contract ...
answer_texts                                    [DISTRIBUTOR AGREEMENT]
evidence_chunk_ids                                   [74d4359635077bf8]
Name: 0, dtype: object

In [54]:
ann_df.head()

Unnamed: 0,annotation_id,doc_id,label,answer_texts,answer_starts,evidence_chunk_ids
0,eadada7bb423fc45,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[DISTRIBUTOR AGREEMENT],[44],[74d4359635077bf8]
1,81d97df4a77d9de8,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Distributor],[244],
2,81d97df4a77d9de8,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Electric City of Illinois L.L.C.],[49574],
3,81d97df4a77d9de8,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Electric City of Illinois LLC],[212],
4,81d97df4a77d9de8,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Company],[197],


In [60]:
chunks_df = pd.DataFrame(chunks)[["chunk_id", "doc_id", "chunk_index", "start_char", "end_char", "text"]]

Given a chunk_id we want to fetch the chunk text

In [61]:
def get_chunk_text(chunks_df: pd.DataFrame, chunk_id: str) -> str:
    """Look up the text of the chunk identified by ``chunk_id``.

    Args:
        chunks_df: Frame with at least a ``chunk_id`` and a ``text`` column.
        chunk_id: The ID to search for in the ``chunk_id`` column.

    Returns:
        The ``text`` value of the single matching row.

    Raises:
        KeyError: If no row has this ``chunk_id``.
        ValueError: If more than one row has this ``chunk_id``.
    """
    is_hit = chunks_df["chunk_id"] == chunk_id
    hits = chunks_df.loc[is_hit]
    hit_count = len(hits)

    if hit_count > 1:
        raise ValueError(f"duplicate chunk_id found: {chunk_id}")
    if hit_count == 0:
        raise KeyError(f"chunk_id not found: {chunk_id}")

    first_hit = hits.iloc[0]
    return first_hit["text"]


In [62]:
get_chunk_text(chunks_df, chunk["chunk_id"])

'EXHIBIT 10.6\n\n                              DISTRIBUTOR AGREEMENT\n\n         THIS  DISTRIBUTOR  AGREEMENT (the  "Agreement")  is made by and between Electric City Corp.,  a Delaware  corporation  ("Company")  and Electric City of Illinois LLC ("Distributor") this 7th day of September, 1999.\n\n                                    RECITALS\n\n         A. The  Company\'s  Business.  The Company is  presently  engaged in the business  of selling an energy  efficiency  device,  which is  referred to as an "Energy  Saver"  which may be improved  or  otherwise  changed  from its present composition (the "Products").  The Company may engage in the business of selling other  products  or  other  devices  other  than  the  Products,  which  will be considered  Products if Distributor  exercises its options pursuant to Section 7 hereof.\n\n         B. Representations.  As an inducement to the Company to enter into this Agreement,  the  Distributor  has  represented  that  it has or  will  hav

A chunk overlaps the answer if these intervals overlap at all:

chunk_start < answer_end AND chunk_end > answer_start

In [63]:
def find_overlapping_chunks(chunks, answer_start, answer_end):
    """
    Select the chunks that share at least one position with an answer span.

    A chunk covering [start_char, end_char) is kept when it starts before the
    span ends and ends after the span starts. The matching chunk dicts are
    returned in their original order, as the same objects found in ``chunks``.
    """
    def _misses_span(piece):
        # A chunk misses the span when it begins at/after the span's end or
        # finishes at/before the span's start.
        return piece["start_char"] >= answer_end or piece["end_char"] <= answer_start

    return [piece for piece in chunks if not _misses_span(piece)]


In [64]:
row = ann_df.iloc[0]
answer_text = row["answer_texts"][0]
answer_start = row["answer_starts"][0]
answer_end = answer_start + len(answer_text)

doc_id = row["doc_id"]
doc_text = docs_df.loc[docs_df["doc_id"] == doc_id, "text"].iloc[0]

chunks = chunk_text(doc_text, chunk_size=1000, overlap=150)

overlapping = find_overlapping_chunks(chunks, answer_start, answer_end)

print("num overlapping chunks:", len(overlapping))
if overlapping:
    print("chunk indexes:", [c["chunk_index"] for c in overlapping])


num overlapping chunks: 1
chunk indexes: [0]


We’ll create an artificial span that crosses from chunk 0 into chunk 1.

chunk 0 ends at 1000

so we’ll start at 990 and end at 1010

In [65]:
fake_start = 990
fake_end = 1010

overlapping_fake = find_overlapping_chunks(chunks, fake_start, fake_end)

print("fake span:", fake_start, fake_end)
print("num overlapping chunks:", len(overlapping_fake))
print("chunk indexes:", [c["chunk_index"] for c in overlapping_fake])


fake span: 990 1010
num overlapping chunks: 2
chunk indexes: [0, 1]


In [66]:
chunks

[{'chunk_index': 0,
  'start_char': 0,
  'end_char': 1000,
  'text': 'EXHIBIT 10.6\n\n                              DISTRIBUTOR AGREEMENT\n\n         THIS  DISTRIBUTOR  AGREEMENT (the  "Agreement")  is made by and between Electric City Corp.,  a Delaware  corporation  ("Company")  and Electric City of Illinois LLC ("Distributor") this 7th day of September, 1999.\n\n                                    RECITALS\n\n         A. The  Company\'s  Business.  The Company is  presently  engaged in the business  of selling an energy  efficiency  device,  which is  referred to as an "Energy  Saver"  which may be improved  or  otherwise  changed  from its present composition (the "Products").  The Company may engage in the business of selling other  products  or  other  devices  other  than  the  Products,  which  will be considered  Products if Distributor  exercises its options pursuant to Section 7 hereof.\n\n         B. Representations.  As an inducement to the Company to enter into this Agree

In [67]:
for c in chunks:
    if "chunk_id" not in c:
        c["doc_id"] = doc_id
        c["chunk_id"] = make_chunk_id(doc_id, c["chunk_index"], c["start_char"], c["end_char"])


In [68]:
chunks[0]

{'chunk_index': 0,
 'start_char': 0,
 'end_char': 1000,
 'text': 'EXHIBIT 10.6\n\n                              DISTRIBUTOR AGREEMENT\n\n         THIS  DISTRIBUTOR  AGREEMENT (the  "Agreement")  is made by and between Electric City Corp.,  a Delaware  corporation  ("Company")  and Electric City of Illinois LLC ("Distributor") this 7th day of September, 1999.\n\n                                    RECITALS\n\n         A. The  Company\'s  Business.  The Company is  presently  engaged in the business  of selling an energy  efficiency  device,  which is  referred to as an "Energy  Saver"  which may be improved  or  otherwise  changed  from its present composition (the "Products").  The Company may engage in the business of selling other  products  or  other  devices  other  than  the  Products,  which  will be considered  Products if Distributor  exercises its options pursuant to Section 7 hereof.\n\n         B. Representations.  As an inducement to the Company to enter into this Agreement

In [69]:
overlapping_fake

[{'chunk_index': 0,
  'start_char': 0,
  'end_char': 1000,
  'text': 'EXHIBIT 10.6\n\n                              DISTRIBUTOR AGREEMENT\n\n         THIS  DISTRIBUTOR  AGREEMENT (the  "Agreement")  is made by and between Electric City Corp.,  a Delaware  corporation  ("Company")  and Electric City of Illinois LLC ("Distributor") this 7th day of September, 1999.\n\n                                    RECITALS\n\n         A. The  Company\'s  Business.  The Company is  presently  engaged in the business  of selling an energy  efficiency  device,  which is  referred to as an "Energy  Saver"  which may be improved  or  otherwise  changed  from its present composition (the "Products").  The Company may engage in the business of selling other  products  or  other  devices  other  than  the  Products,  which  will be considered  Products if Distributor  exercises its options pursuant to Section 7 hereof.\n\n         B. Representations.  As an inducement to the Company to enter into this Agree

In [70]:
# Collect the chunk_id of every chunk the artificial span touches, preserving
# the order of overlapping_fake.
evidence_ids_fake = [touched_chunk["chunk_id"] for touched_chunk in overlapping_fake]

print("evidence_ids_fake:", evidence_ids_fake)


evidence_ids_fake: ['74d4359635077bf8', '6555a0b244453dc3']


In [72]:
ann_df.head()

Unnamed: 0,annotation_id,doc_id,label,answer_texts,answer_starts,evidence_chunk_ids
0,eadada7bb423fc45,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[DISTRIBUTOR AGREEMENT],[44],[74d4359635077bf8]
1,81d97df4a77d9de8,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Distributor],[244],
2,81d97df4a77d9de8,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Electric City of Illinois L.L.C.],[49574],
3,81d97df4a77d9de8,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Electric City of Illinois LLC],[212],
4,81d97df4a77d9de8,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Company],[197],
