# 1. Dataset + schema (CUAD → normalized tables + document model)

In [132]:
from datasets import load_dataset
import pandas as pd

### If you are downloading the dataset for the first time, run the cell below

In [133]:
qa = load_dataset(
    "theatticusproject/cuad-qa",
    revision="53fc9be1de79a35a82ac36f33198a753df949523",  # commit where script was deleted
    download_mode="force_redownload",
    cache_dir="./hf_cache_cuadqa",
)

KeyboardInterrupt: 

### If you have already downloaded the dataset, load it from the local cache with the cell below

In [None]:
qa = load_dataset(
    "theatticusproject/cuad-qa",
    revision="53fc9be1de79a35a82ac36f33198a753df949523",
    cache_dir="./hf_cache_cuadqa",
)

In [None]:
qa

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 22450
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 4182
    })
})

### We'll use 2 dataset sources

- CUAD-QA (Parquet): for learning schema, labels, evaluation

- Raw CUAD PDFs (from theatticusproject/cuad files): for ingestion pipeline with pypdf

In [None]:
qa["train"].column_names

['id', 'title', 'context', 'question', 'answers']

We create:

- a documents table (one row per contract)

- an annotations table (one row per (contract, question))

- a stable doc_id you can reuse later for PDFs, Pinecone, SQLite, etc.

In [None]:
import pandas as pd

train_df = qa["train"].to_pandas()
train_df.head(3)


Unnamed: 0,id,title,context,question,answers
0,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,EXHIBIT 10.6\n\n ...,Highlight the parts (if any) of this contract ...,"{'text': ['DISTRIBUTOR AGREEMENT'], 'answer_st..."
1,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,EXHIBIT 10.6\n\n ...,Highlight the parts (if any) of this contract ...,"{'text': ['Distributor'], 'answer_start': [244]}"
2,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,EXHIBIT 10.6\n\n ...,Highlight the parts (if any) of this contract ...,"{'text': ['Electric City of Illinois L.L.C.'],..."


In [None]:
train_df.loc[2,"question"]

'Highlight the parts (if any) of this contract related to "Parties" that should be reviewed by a lawyer. Details: The two or more parties who signed the contract'

In [None]:
print(train_df.loc[21,"question"])
print(train_df.loc[21,"answers"]["text"])

Highlight the parts (if any) of this contract related to "No-Solicit Of Employees" that should be reviewed by a lawyer. Details: Is there a restriction on a party’s soliciting or hiring employees and/or contractors from the  counterparty, whether during the contract or after the contract ends (or both)?
['During the Term of this Agreement and for a period of                            twelve (12) months  thereafter,  the  Distributor (on                            behalf of itself,  each of its affiliates and each of                            their respective representatives) agrees that it will                            not  directly  or  indirectly  solicit  or  hire  any                            executive,  managerial  or technical  employee of the                            Company or any of its affiliates.']


In [None]:
train_df.loc[2,"answers"]

{'text': array(['Electric City of Illinois L.L.C.'], dtype=object),
 'answer_start': array([49574], dtype=int32)}

In [None]:
train_df.loc[2,"context"]

'EXHIBIT 10.6\n\n                              DISTRIBUTOR AGREEMENT\n\n         THIS  DISTRIBUTOR  AGREEMENT (the  "Agreement")  is made by and between Electric City Corp.,  a Delaware  corporation  ("Company")  and Electric City of Illinois LLC ("Distributor") this 7th day of September, 1999.\n\n                                    RECITALS\n\n         A. The  Company\'s  Business.  The Company is  presently  engaged in the business  of selling an energy  efficiency  device,  which is  referred to as an "Energy  Saver"  which may be improved  or  otherwise  changed  from its present composition (the "Products").  The Company may engage in the business of selling other  products  or  other  devices  other  than  the  Products,  which  will be considered  Products if Distributor  exercises its options pursuant to Section 7 hereof.\n\n         B. Representations.  As an inducement to the Company to enter into this Agreement,  the  Distributor  has  represented  that  it has or  will  hav

In [None]:
print("rows:", len(train_df))
print("unique titles:", train_df["title"].nunique())
#Each contract appears many times because each row is a different question/label asked about the same contract text.

rows: 22450
unique titles: 408


In [None]:
contexts_per_title = train_df.groupby("title")["context"].nunique().sort_values(ascending=False)
contexts_per_title.head(10)


title
2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-Branding Agreement_ Agency Agreement    1
ABILITYINC_06_15_2020-EX-4.25-SERVICES AGREEMENT                                                    1
ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTOR AGREEMENT                                                1
ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENT AGREEMENT                                              1
ADAPTIMMUNETHERAPEUTICSPLC_04_06_2017-EX-10.11-STRATEGIC ALLIANCE AGREEMENT                         1
ADIANUTRITION,INC_04_01_2005-EX-10.D2-RESELLER AGREEMENT                                            1
ADMA BioManufacturing, LLC -  Amendment #3 to Manufacturing Agreement                               1
ADUROBIOTECH,INC_06_02_2020-EX-10.7-CONSULTING AGREEMENT                                            1
ADUROBIOTECH,INC_06_02_2020-EX-10.7-CONSULTING AGREEMENT(1)                                         1
AIRTECHINTERNATIONALGROUPINC_05_08_2000-EX-10.4-FRANCHISE AGREEMENT         

### Create doc_id for each contract

In [None]:
"hello".encode("utf-8")

b'hello'

In [None]:
import hashlib
def make_doc_id(title):
    """Derive a stable 16-hex-character document ID from a contract title.

    The title is UTF-8 encoded and hashed with SHA-256; the ID is the first
    16 characters of the hex digest, so the same title always yields the
    same ID.
    """
    title_bytes = title.encode("utf-8")
    digest_hex = hashlib.sha256(title_bytes).hexdigest()
    return digest_hex[:16]

docs_df = (
    train_df[["title", "context"]]
    # This removes duplicate rows from the DataFrame, keeping only the first occurrence of each unique 'title' value.
    .drop_duplicates(subset=["title"])
    .rename(columns={"context": "text"})
    .copy()
)
docs_df

Unnamed: 0,title,text
0,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,EXHIBIT 10.6\n\n ...
68,"WHITESMOKE,INC_11_08_2011-EX-10.26-PROMOTION A...",Exhibit 10.26 CONFIDENTIAL TREATMENT HAS BE...
124,NELNETINC_04_08_2020-EX-1-JOINT FILING AGREEMENT,Exhibit 1\n\nJOINT FILING AGREEMENT\n\nThe und...
166,ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENT A...,REDACTED COPY\n\nCONFIDENTIAL TREATMENT REQUES...
217,"KIROMICBIOPHARMA,INC_05_11_2020-EX-10.23-CONSU...",Exhibit 10.23 Corporate Address Fannin South P...
...,...,...
22172,CcRealEstateIncomeFundadv_20181205_POS 8C_EX-9...,Exhibit 99(h)(3) WHOLESALE MARKETING AGREEMENT...
22218,"BLUEROCKRESIDENTIALGROWTHREIT,INC_06_01_2016-E...","Exhibit 1.1 400,000 Shares BLUEROCK RESIDE..."
22267,"TALLGRASSENERGY,LP_02_20_2020-EX-99.26-JOINT F...",Exhibit 26\n\nJOINT FILING AGREEMENT\n\nPursua...
22317,KINGPHARMACEUTICALSINC_08_09_2006-EX-10.1-PROM...,Exhibit 10.1\n\n\n\nPROMOTION AGREEMENT\n\nby ...


In [None]:
docs_df["doc_id"] = docs_df["title"].apply(make_doc_id)
docs_df["source"] = "cuad-qa"
docs_df.head()

Unnamed: 0,title,text,doc_id,source
0,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,EXHIBIT 10.6\n\n ...,f71b4149a185d016,cuad-qa
68,"WHITESMOKE,INC_11_08_2011-EX-10.26-PROMOTION A...",Exhibit 10.26 CONFIDENTIAL TREATMENT HAS BE...,43ab152a17a15599,cuad-qa
124,NELNETINC_04_08_2020-EX-1-JOINT FILING AGREEMENT,Exhibit 1\n\nJOINT FILING AGREEMENT\n\nThe und...,644b67c819fbca9c,cuad-qa
166,ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENT A...,REDACTED COPY\n\nCONFIDENTIAL TREATMENT REQUES...,2d790a4a4132cc53,cuad-qa
217,"KIROMICBIOPHARMA,INC_05_11_2020-EX-10.23-CONSU...",Exhibit 10.23 Corporate Address Fannin South P...,b9d11c50bada2ade,cuad-qa


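We will store the annotations in an SQLite database, so every annotation row needs a unique ID. The first attempt (kept commented out below) hashed the (document, question) pair, but that pair can occur more than once, so the ID actually used is derived from the `doc_id` and the dataset's row `id`.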

In [None]:
# def make_ann_id(doc_id: str, question: str) -> str:
#     """
#     Create a stable ID for the pair (doc_id, question).

#     We combine them into one string, hash it, and keep the first 16 hex characters.
#     """
#     raw = doc_id + "::" + question      # combine into one string
#     raw_bytes = raw.encode("utf-8")     # convert to bytes (required for hashing)
#     full_hash = hashlib.sha256(raw_bytes).hexdigest()  # long hex string
#     short_id = full_hash[:16]           # shorten it (still plenty unique for our use)
#     return short_id


In [None]:
# make annotation id from combination of doc_id, row_id because we can have duplicates from the (doc_id, question) pair
def make_ann_id_from_row(doc_id: str, row_id: str) -> str:
    """Build a stable 16-hex-character annotation ID from a document ID and a row ID.

    The two parts are joined as "<doc_id>::row::<row_id>", UTF-8 encoded,
    hashed with SHA-256, and the hex digest is cut to its first 16 characters.
    """
    key = "{}::row::{}".format(doc_id, row_id)
    hasher = hashlib.sha256()
    hasher.update(key.encode("utf-8"))
    return hasher.hexdigest()[:16]

In [None]:
ann_df = train_df[["id", "title", "question", "answers"]].copy()
ann_df.head(2)


Unnamed: 0,id,title,question,answers
0,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,Highlight the parts (if any) of this contract ...,"{'text': ['DISTRIBUTOR AGREEMENT'], 'answer_st..."
1,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,Highlight the parts (if any) of this contract ...,"{'text': ['Distributor'], 'answer_start': [244]}"


In [None]:
# Derive doc_id from the title with the same helper and the same .apply idiom
# that builds docs_df["doc_id"], so both tables are keyed identically.
ann_df["doc_id"] = ann_df["title"].apply(make_doc_id)
ann_df.head(2)

Unnamed: 0,id,title,question,answers,doc_id
0,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,Highlight the parts (if any) of this contract ...,"{'text': ['DISTRIBUTOR AGREEMENT'], 'answer_st...",f71b4149a185d016
1,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,Highlight the parts (if any) of this contract ...,"{'text': ['Distributor'], 'answer_start': [244]}",f71b4149a185d016


In [None]:
ann_df['answers'][9]

{'text': array(['The term of this  Agreement  shall be ten (10)                            years (the "Term")  which shall  commence on the date                            upon which the Company  delivers to  Distributor  the                            last Sample, as defined  hereinafter.'],
       dtype=object),
 'answer_start': array([5268], dtype=int32)}

In [None]:
def extract_answer_texts(answers_obj):
    """Return the "text" entry of an answers dict.

    Anything that is not a dict, or a dict without a "text" key, yields an
    empty list.
    """
    if not isinstance(answers_obj, dict):
        return []
    if "text" not in answers_obj:
        return []
    return answers_obj["text"]

def extract_answer_starts(answers_obj):
    """Return the "answer_start" entry of an answers dict.

    Anything that is not a dict, or a dict without an "answer_start" key,
    yields an empty list.
    """
    if not isinstance(answers_obj, dict):
        return []
    if "answer_start" not in answers_obj:
        return []
    return answers_obj["answer_start"]

In [None]:
extract_answer_texts(ann_df['answers'][9])

array(['The term of this  Agreement  shall be ten (10)                            years (the "Term")  which shall  commence on the date                            upon which the Company  delivers to  Distributor  the                            last Sample, as defined  hereinafter.'],
      dtype=object)

In [None]:
# Split every answers dict into two parallel columns: the answer strings and
# their start offsets.
answer_texts_col = [extract_answer_texts(obj) for obj in ann_df["answers"]]
answer_starts_col = [extract_answer_starts(obj) for obj in ann_df["answers"]]

ann_df["answer_texts"] = answer_texts_col
ann_df["answer_starts"] = answer_starts_col

ann_df.head(2)

Unnamed: 0,id,title,question,answers,doc_id,answer_texts,answer_starts
0,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,Highlight the parts (if any) of this contract ...,"{'text': ['DISTRIBUTOR AGREEMENT'], 'answer_st...",f71b4149a185d016,[DISTRIBUTOR AGREEMENT],[44]
1,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,Highlight the parts (if any) of this contract ...,"{'text': ['Distributor'], 'answer_start': [244]}",f71b4149a185d016,[Distributor],[244]


In [None]:
# Build one annotation_id per row from (doc_id, dataset row id). The row id is
# used rather than the question text because a (doc_id, question) pair can
# occur more than once.
# Walking the two columns together avoids label-based .loc lookups with a
# positional counter (only valid on a default RangeIndex) and avoids rebinding
# the builtin `id`.
ann_df["annotation_id"] = [
    make_ann_id_from_row(doc_id_value, row_id)
    for doc_id_value, row_id in zip(ann_df["doc_id"], ann_df["id"])
]
ann_df.head(2)

Unnamed: 0,id,title,question,answers,doc_id,answer_texts,answer_starts,annotation_id
0,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,Highlight the parts (if any) of this contract ...,"{'text': ['DISTRIBUTOR AGREEMENT'], 'answer_st...",f71b4149a185d016,[DISTRIBUTOR AGREEMENT],[44],85ec9ab84c06ae69
1,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,Highlight the parts (if any) of this contract ...,"{'text': ['Distributor'], 'answer_start': [244]}",f71b4149a185d016,[Distributor],[244],27956857c285b923


In [None]:
ann_df = ann_df.rename(columns={"question": "label"})
ann_df = ann_df[["annotation_id", "doc_id", "label", "answer_texts", "answer_starts"]]
ann_df.head()

Unnamed: 0,annotation_id,doc_id,label,answer_texts,answer_starts
0,85ec9ab84c06ae69,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[DISTRIBUTOR AGREEMENT],[44]
1,27956857c285b923,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Distributor],[244]
2,8951f023dfe64525,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Electric City of Illinois L.L.C.],[49574]
3,c66875b38edac636,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Electric City of Illinois LLC],[212]
4,f4aa8381671b625f,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Company],[197]


In [None]:
# ensure evidence column exists
if "evidence_chunk_ids" not in ann_df.columns:
    ann_df["evidence_chunk_ids"] = [[] for _ in range(len(ann_df))]


In [None]:
# Check one row: starts should match texts count
row = ann_df.iloc[0]
print(len(row["answer_texts"]), len(row["answer_starts"]))
print(row["label"])
print(row["answer_texts"][:2])
print(row["answer_starts"][:2])


1 1
Highlight the parts (if any) of this contract related to "Document Name" that should be reviewed by a lawyer. Details: The name of the contract
['DISTRIBUTOR AGREEMENT']
[44]


### Normalize evidence so it’s always a list (never NaN)

In [None]:
# Guarantee every evidence_chunk_ids cell holds a list: any other value
# (e.g. NaN) is replaced with a fresh empty list, and replacements are counted.
fixed = 0
for i, row_label in enumerate(ann_df.index):
    val = ann_df.at[row_label, "evidence_chunk_ids"]
    if isinstance(val, list):
        continue
    ann_df.at[row_label, "evidence_chunk_ids"] = []
    fixed += 1
print("fixed evidence cells:", fixed)

fixed evidence cells: 0


### verify the substring actually matches the context

In [None]:
# get the corresponding raw row from train_df so we can access the original context
raw = train_df.iloc[0]
context = raw["context"]

answer = row["answer_texts"][0]
start = row["answer_starts"][0]
end = start + len(answer)

print("answer:", repr(answer))
print("start:", start, "end:", end)
print("context slice:", repr(context[start:end]))
print("match:", context[start:end] == answer)


answer: 'DISTRIBUTOR AGREEMENT'
start: 44 end: 65
context slice: 'DISTRIBUTOR AGREEMENT'
match: True


## Chunking: start with chunking one document

In [None]:
doc = docs_df.iloc[0]
doc_id = doc["doc_id"]
text = doc["text"]
print("doc_id:", doc_id)
print("chars:", len(text))
print(text[:400])

doc_id: f71b4149a185d016
chars: 54290
EXHIBIT 10.6

                              DISTRIBUTOR AGREEMENT

         THIS  DISTRIBUTOR  AGREEMENT (the  "Agreement")  is made by and between Electric City Corp.,  a Delaware  corporation  ("Company")  and Electric City of Illinois LLC ("Distributor") this 7th day of September, 1999.

                                    RECITALS

         A. The  Company's  Business.  The Company is  present


In [None]:
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 150):
    """Split text into fixed-size character windows that overlap.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must be
            non-negative and smaller than chunk_size.

    Returns:
        A list of dicts in document order, each with the keys "chunk_index",
        "start_char", "end_char" (exclusive) and "text", where
        text == original[start_char:end_char]. Empty input returns [].

    Raises:
        ValueError: If chunk_size is not positive, overlap is negative, or
            overlap is not smaller than chunk_size.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap < 0:
        # A negative overlap would leave gaps, silently dropping text.
        raise ValueError("overlap must be non-negative")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    n = len(text)
    stride = chunk_size - overlap  # always >= 1 thanks to the checks above

    chunks = []
    for chunk_index, start in enumerate(range(0, n, stride)):
        end = min(start + chunk_size, n)
        chunks.append({
            "chunk_index": chunk_index,
            "start_char": start,
            "end_char": end,
            "text": text[start:end],
        })

        # Once a chunk reaches the end of the text, later windows would only
        # repeat its tail, so stop here.
        if end == n:
            break

    return chunks


In [None]:
# Chunk the first document and report how many chunks were produced plus the
# character ranges of the first and last chunk.
chunks_doc1 = chunk_text(text, chunk_size=1000, overlap=150)
print("num chunks:", len(chunks_doc1))
print("first chunk range:", chunks_doc1[0]["start_char"], chunks_doc1[0]["end_char"])
print("last chunk range:", chunks_doc1[-1]["start_char"], chunks_doc1[-1]["end_char"])


num chunks 64
num chunks: 64
first chunk range: 0 1000
last chunk range: 53550 54290


In [None]:
ranges_ok = all(chunks_doc1[i]["start_char"] < chunks_doc1[i+1]["start_char"] for i in range(len(chunks_doc1)-1))
print("ranges increasing:", ranges_ok)


ranges increasing: True


### Map answer span to the chunk that contains it (find which chunk(s) cover that [start, end) range.)

In [None]:
row = ann_df.iloc[0]
answer_text = row["answer_texts"][0]
answer_start = row["answer_starts"][0]
answer_end = answer_start + len(answer_text)

print("answer_text:", repr(answer_text))
print("answer_start:", answer_start)
print("answer_end:", answer_end)

answer_text: 'DISTRIBUTOR AGREEMENT'
answer_start: 44
answer_end: 65


In [None]:
doc_id = row["doc_id"]
print(docs_df["doc_id"] == doc_id)
docs_df.loc[docs_df["doc_id"] == doc_id, "text"]

0         True
68       False
124      False
166      False
217      False
         ...  
22172    False
22218    False
22267    False
22317    False
22401    False
Name: doc_id, Length: 408, dtype: bool


0    EXHIBIT 10.6\n\n                              ...
Name: text, dtype: object

In [None]:
doc_text = docs_df.loc[docs_df["doc_id"] == doc_id, "text"].iloc[0]
print("doc chars:", len(doc_text))
print(doc_text[:120])


doc chars: 54290
EXHIBIT 10.6

                              DISTRIBUTOR AGREEMENT

         THIS  DISTRIBUTOR  AGREEMENT (the  "Agreemen


In [None]:
chunks = chunk_text(doc_text, chunk_size=1000, overlap=150)
print("num chunks:", len(chunks))

num chunks: 64


In [None]:
chunks

[{'chunk_index': 0,
  'start_char': 0,
  'end_char': 1000,
  'text': 'EXHIBIT 10.6\n\n                              DISTRIBUTOR AGREEMENT\n\n         THIS  DISTRIBUTOR  AGREEMENT (the  "Agreement")  is made by and between Electric City Corp.,  a Delaware  corporation  ("Company")  and Electric City of Illinois LLC ("Distributor") this 7th day of September, 1999.\n\n                                    RECITALS\n\n         A. The  Company\'s  Business.  The Company is  presently  engaged in the business  of selling an energy  efficiency  device,  which is  referred to as an "Energy  Saver"  which may be improved  or  otherwise  changed  from its present composition (the "Products").  The Company may engage in the business of selling other  products  or  other  devices  other  than  the  Products,  which  will be considered  Products if Distributor  exercises its options pursuant to Section 7 hereof.\n\n         B. Representations.  As an inducement to the Company to enter into this Agree

### Find which chunk contains the answer span

An answer span lies inside a chunk if `chunk_start <= answer_start` and `answer_end <= chunk_end`.

In [None]:
containing = []
for c in chunks:
    c_start = c["start_char"]
    c_end = c["end_char"]
    if c_start <= answer_start and answer_end <= c_end:
        containing.append(c)

In [None]:
print("chunks containing span:", len(containing))
if containing:
    c = containing[0]
    print("chunk_index:", c["chunk_index"])
    print("chunk range:", c["start_char"], c["end_char"])

chunks containing span: 1
chunk_index: 0
chunk range: 0 1000


### Highlight answer in chunk
Convert document coordinates to chunk coordinates:

relative_start = answer_start - chunk_start

relative_end = answer_end - chunk_start

then print

`chunk_text_str[:relative_start] + "[[" + chunk_text_str[relative_start:relative_end] + "]]" + chunk_text_str[relative_end:]`


In [None]:
print("answer_start:", answer_start, "answer_end:", answer_end)
print("num chunks:", len(chunks))
print("chunks containing span:", len(containing))

answer_start: 44 answer_end: 65
num chunks: 64
chunks containing span: 1


In [None]:
# Translate the answer span from document coordinates into coordinates inside
# the first containing chunk by subtracting that chunk's start offset.
chunk = containing[0]
chunk_start = chunk["start_char"]
chunk_end = chunk["end_char"]
relative_start = answer_start - chunk_start
relative_end = answer_end - chunk_start
print("chunk_start:", chunk_start, "chunk_end:", chunk_end)
print("relative_start:", relative_start, "relative_end:", relative_end)
print("chunk length:", len(chunk["text"]))

chunk_start: 0, chunk_end: 1000
relative start: 44, relative_end: 65
chunk_start: 0 chunk_end: 1000
relative_start: 44 relative_end: 65
chunk length: 1000


In [None]:
chunk_text_str = chunk["text"]

highlighted = (
    chunk_text_str[:relative_start]
    + "[["
    + chunk_text_str[relative_start:relative_end]
    + "]]"
    + chunk_text_str[relative_end:]
)

print(highlighted)

EXHIBIT 10.6

                              [[DISTRIBUTOR AGREEMENT]]

         THIS  DISTRIBUTOR  AGREEMENT (the  "Agreement")  is made by and between Electric City Corp.,  a Delaware  corporation  ("Company")  and Electric City of Illinois LLC ("Distributor") this 7th day of September, 1999.

                                    RECITALS

         A. The  Company's  Business.  The Company is  presently  engaged in the business  of selling an energy  efficiency  device,  which is  referred to as an "Energy  Saver"  which may be improved  or  otherwise  changed  from its present composition (the "Products").  The Company may engage in the business of selling other  products  or  other  devices  other  than  the  Products,  which  will be considered  Products if Distributor  exercises its options pursuant to Section 7 hereof.

         B. Representations.  As an inducement to the Company to enter into this Agreement,  the  Distributor  has  represented  that  it has or  will  have  the f

In [None]:
WINDOW = 80
left = max(0, relative_start - WINDOW)
right = min(len(chunk_text_str), relative_end + WINDOW)
snippet = (
    chunk_text_str[left:relative_start]
    + "[["
    + chunk_text_str[relative_start:relative_end]
    + "]]"
    + chunk_text_str[relative_end:right]
)

print(snippet)


EXHIBIT 10.6

                              [[DISTRIBUTOR AGREEMENT]]

         THIS  DISTRIBUTOR  AGREEMENT (the  "Agreement")  is made by and betwe


In [None]:
def make_chunk_id(doc_id, chunk_index, start_char, end_char):
    """Build a stable 16-hex-character ID for one chunk of a document.

    The key "<doc_id>::chunk<chunk_index>::<start_char>-<end_char>" is UTF-8
    encoded and hashed with SHA-256; the ID is the first 16 characters of the
    hex digest.
    """
    key = "{}::chunk{}::{}-{}".format(doc_id, chunk_index, start_char, end_char)
    hasher = hashlib.sha256()
    hasher.update(key.encode("utf-8"))
    return hasher.hexdigest()[:16]

In [None]:
# Tag every chunk dict in place with its parent doc_id and a chunk_id derived
# from (doc_id, chunk_index, start_char, end_char). The dicts are mutated, not
# copied, so any other reference to the same dict objects (for example the
# entries collected in `containing`) sees the new keys as well.
for c in chunks:
    c["doc_id"] = doc_id
    c["chunk_id"] = make_chunk_id(doc_id, c["chunk_index"], c["start_char"], c["end_char"])

In [None]:
chunks[0]

{'chunk_index': 0,
 'start_char': 0,
 'end_char': 1000,
 'text': 'EXHIBIT 10.6\n\n                              DISTRIBUTOR AGREEMENT\n\n         THIS  DISTRIBUTOR  AGREEMENT (the  "Agreement")  is made by and between Electric City Corp.,  a Delaware  corporation  ("Company")  and Electric City of Illinois LLC ("Distributor") this 7th day of September, 1999.\n\n                                    RECITALS\n\n         A. The  Company\'s  Business.  The Company is  presently  engaged in the business  of selling an energy  efficiency  device,  which is  referred to as an "Energy  Saver"  which may be improved  or  otherwise  changed  from its present composition (the "Products").  The Company may engage in the business of selling other  products  or  other  devices  other  than  the  Products,  which  will be considered  Products if Distributor  exercises its options pursuant to Section 7 hereof.\n\n         B. Representations.  As an inducement to the Company to enter into this Agreement

In [None]:
chunk

{'chunk_index': 0,
 'start_char': 0,
 'end_char': 1000,
 'text': 'EXHIBIT 10.6\n\n                              DISTRIBUTOR AGREEMENT\n\n         THIS  DISTRIBUTOR  AGREEMENT (the  "Agreement")  is made by and between Electric City Corp.,  a Delaware  corporation  ("Company")  and Electric City of Illinois LLC ("Distributor") this 7th day of September, 1999.\n\n                                    RECITALS\n\n         A. The  Company\'s  Business.  The Company is  presently  engaged in the business  of selling an energy  efficiency  device,  which is  referred to as an "Energy  Saver"  which may be improved  or  otherwise  changed  from its present composition (the "Products").  The Company may engage in the business of selling other  products  or  other  devices  other  than  the  Products,  which  will be considered  Products if Distributor  exercises its options pursuant to Section 7 hereof.\n\n         B. Representations.  As an inducement to the Company to enter into this Agreement

In [None]:
ann_df.index[9]

9

In [None]:
ann_df.head()

Unnamed: 0,annotation_id,doc_id,label,answer_texts,answer_starts,evidence_chunk_ids
0,85ec9ab84c06ae69,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[DISTRIBUTOR AGREEMENT],[44],[]
1,27956857c285b923,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Distributor],[244],[]
2,8951f023dfe64525,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Electric City of Illinois L.L.C.],[49574],[]
3,c66875b38edac636,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Electric City of Illinois LLC],[212],[]
4,f4aa8381671b625f,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Company],[197],[]


In [None]:
evidence_ids = [chunk["chunk_id"]]
print("evidence_ids:", evidence_ids)

evidence_ids: ['74d4359635077bf8']


In [None]:
ann_df.at[ann_df.index[0], "evidence_chunk_ids"] = evidence_ids
ann_df.iloc[0][["label", "answer_texts", "evidence_chunk_ids"]]

label                 Highlight the parts (if any) of this contract ...
answer_texts                                    [DISTRIBUTOR AGREEMENT]
evidence_chunk_ids                                   [74d4359635077bf8]
Name: 0, dtype: object

In [None]:
ann_df.head()

Unnamed: 0,annotation_id,doc_id,label,answer_texts,answer_starts,evidence_chunk_ids
0,85ec9ab84c06ae69,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[DISTRIBUTOR AGREEMENT],[44],[74d4359635077bf8]
1,27956857c285b923,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Distributor],[244],[]
2,8951f023dfe64525,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Electric City of Illinois L.L.C.],[49574],[]
3,c66875b38edac636,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Electric City of Illinois LLC],[212],[]
4,f4aa8381671b625f,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Company],[197],[]


In [None]:
chunks_df = pd.DataFrame(chunks)[["chunk_id", "doc_id", "chunk_index", "start_char", "end_char", "text"]]

Given a chunk_id we want to fetch the chunk text

In [None]:
def get_chunk_text(chunks_df: pd.DataFrame, chunk_id: str) -> str:
    """Look up the text of exactly one chunk by its chunk_id.

    Args:
        chunks_df: Frame with at least the columns "chunk_id" and "text".
        chunk_id: The ID to look up.

    Returns:
        The "text" value of the single matching row.

    Raises:
        KeyError: No row has this chunk_id.
        ValueError: More than one row has this chunk_id.
    """
    is_match = chunks_df["chunk_id"] == chunk_id
    hit_count = int(is_match.sum())
    if hit_count == 0:
        raise KeyError(f"chunk_id not found: {chunk_id}")
    if hit_count > 1:
        raise ValueError(f"duplicate chunk_id found: {chunk_id}")
    return chunks_df.loc[is_match, "text"].iloc[0]


In [None]:
get_chunk_text(chunks_df, chunk["chunk_id"])

'EXHIBIT 10.6\n\n                              DISTRIBUTOR AGREEMENT\n\n         THIS  DISTRIBUTOR  AGREEMENT (the  "Agreement")  is made by and between Electric City Corp.,  a Delaware  corporation  ("Company")  and Electric City of Illinois LLC ("Distributor") this 7th day of September, 1999.\n\n                                    RECITALS\n\n         A. The  Company\'s  Business.  The Company is  presently  engaged in the business  of selling an energy  efficiency  device,  which is  referred to as an "Energy  Saver"  which may be improved  or  otherwise  changed  from its present composition (the "Products").  The Company may engage in the business of selling other  products  or  other  devices  other  than  the  Products,  which  will be considered  Products if Distributor  exercises its options pursuant to Section 7 hereof.\n\n         B. Representations.  As an inducement to the Company to enter into this Agreement,  the  Distributor  has  represented  that  it has or  will  hav

A chunk overlaps the answer if these intervals overlap at all:

chunk_start < answer_end AND chunk_end > answer_start

In [None]:
def find_overlapping_chunks(chunks, answer_start, answer_end):
    """
    Select the chunk dicts that share at least one character with an answer span.

    Both the chunk range [start_char, end_char) and the answer range
    [answer_start, answer_end) are half-open, so ranges that only touch at a
    boundary do not count as overlapping. Input order is preserved.
    """
    # Read both offsets for every chunk up front, then filter on the overlap test.
    spans = ((chunk, chunk["start_char"], chunk["end_char"]) for chunk in chunks)
    return [
        chunk
        for chunk, lo, hi in spans
        if lo < answer_end and hi > answer_start
    ]


In [None]:
# Worked example: which chunks contain the first answer of the first annotation?
row = ann_df.iloc[0]
answer_text = row["answer_texts"][0]
answer_start = row["answer_starts"][0]
# End offset is exclusive: start offset plus the length of the answer text.
answer_end = answer_start + len(answer_text)

# Full text of the document this annotation belongs to.
doc_id = row["doc_id"]
doc_text = docs_df.loc[docs_df["doc_id"] == doc_id, "text"].iloc[0]

# chunk_text is defined in an earlier cell; each dict it returns carries
# chunk_index / start_char / end_char / text.
chunks = chunk_text(doc_text, chunk_size=1000, overlap=150)

overlapping = find_overlapping_chunks(chunks, answer_start, answer_end)

print("num overlapping chunks:", len(overlapping))
if overlapping:
    print("chunk indexes:", [c["chunk_index"] for c in overlapping])


num overlapping chunks: 1
chunk indexes: [0]


We’ll create an artificial span that crosses from chunk 0 into chunk 1.

chunk 0 ends at 1000

so we’ll start at 990 and end at 1010

In [None]:
# Synthetic span that straddles the end of chunk 0 (character 1000), used to
# check that a span crossing a chunk boundary is reported for both chunks.
fake_start = 990
fake_end = 1010

overlapping_fake = find_overlapping_chunks(chunks, fake_start, fake_end)

print("fake span:", fake_start, fake_end)
print("num overlapping chunks:", len(overlapping_fake))
print("chunk indexes:", [c["chunk_index"] for c in overlapping_fake])


fake span: 990 1010
num overlapping chunks: 2
chunk indexes: [0, 1]


In [None]:
chunks

[{'chunk_index': 0,
  'start_char': 0,
  'end_char': 1000,
  'text': 'EXHIBIT 10.6\n\n                              DISTRIBUTOR AGREEMENT\n\n         THIS  DISTRIBUTOR  AGREEMENT (the  "Agreement")  is made by and between Electric City Corp.,  a Delaware  corporation  ("Company")  and Electric City of Illinois LLC ("Distributor") this 7th day of September, 1999.\n\n                                    RECITALS\n\n         A. The  Company\'s  Business.  The Company is  presently  engaged in the business  of selling an energy  efficiency  device,  which is  referred to as an "Energy  Saver"  which may be improved  or  otherwise  changed  from its present composition (the "Products").  The Company may engage in the business of selling other  products  or  other  devices  other  than  the  Products,  which  will be considered  Products if Distributor  exercises its options pursuant to Section 7 hereof.\n\n         B. Representations.  As an inducement to the Company to enter into this Agree

In [None]:
# Attach doc_id and chunk_id in place to every chunk dict that has no
# chunk_id yet; chunks that already carry one are left untouched, so
# re-running this cell does not overwrite existing ids.
# make_chunk_id is defined in an earlier cell.
for c in chunks:
    if "chunk_id" not in c:
        c["doc_id"] = doc_id
        c["chunk_id"] = make_chunk_id(doc_id, c["chunk_index"], c["start_char"], c["end_char"])


In [None]:
chunks[0]

{'chunk_index': 0,
 'start_char': 0,
 'end_char': 1000,
 'text': 'EXHIBIT 10.6\n\n                              DISTRIBUTOR AGREEMENT\n\n         THIS  DISTRIBUTOR  AGREEMENT (the  "Agreement")  is made by and between Electric City Corp.,  a Delaware  corporation  ("Company")  and Electric City of Illinois LLC ("Distributor") this 7th day of September, 1999.\n\n                                    RECITALS\n\n         A. The  Company\'s  Business.  The Company is  presently  engaged in the business  of selling an energy  efficiency  device,  which is  referred to as an "Energy  Saver"  which may be improved  or  otherwise  changed  from its present composition (the "Products").  The Company may engage in the business of selling other  products  or  other  devices  other  than  the  Products,  which  will be considered  Products if Distributor  exercises its options pursuant to Section 7 hereof.\n\n         B. Representations.  As an inducement to the Company to enter into this Agreement

In [None]:
overlapping_fake

[{'chunk_index': 0,
  'start_char': 0,
  'end_char': 1000,
  'text': 'EXHIBIT 10.6\n\n                              DISTRIBUTOR AGREEMENT\n\n         THIS  DISTRIBUTOR  AGREEMENT (the  "Agreement")  is made by and between Electric City Corp.,  a Delaware  corporation  ("Company")  and Electric City of Illinois LLC ("Distributor") this 7th day of September, 1999.\n\n                                    RECITALS\n\n         A. The  Company\'s  Business.  The Company is  presently  engaged in the business  of selling an energy  efficiency  device,  which is  referred to as an "Energy  Saver"  which may be improved  or  otherwise  changed  from its present composition (the "Products").  The Company may engage in the business of selling other  products  or  other  devices  other  than  the  Products,  which  will be considered  Products if Distributor  exercises its options pursuant to Section 7 hereof.\n\n         B. Representations.  As an inducement to the Company to enter into this Agree

In [None]:
# Evidence for the synthetic span = the ids of every chunk it overlaps,
# in chunk order.
evidence_ids_fake = [chunk["chunk_id"] for chunk in overlapping_fake]

print("evidence_ids_fake:", evidence_ids_fake)


evidence_ids_fake: ['74d4359635077bf8', '6555a0b244453dc3']


### Let's process only one document

In [None]:
# Pick one document at random; the fixed random_state makes the pick repeatable.
# .iloc[0] unwraps the one-element Series into the bare doc_id value.
sample_doc_id = docs_df.sample(1, random_state=0)["doc_id"].iloc[0]
print("sample_doc_id:", sample_doc_id)

sample_doc_id: b26c38227ce11a4e


In [None]:
# Get all annotations for that doc
doc_ann = ann_df.loc[ann_df["doc_id"] == sample_doc_id]
doc_ann.head()

Unnamed: 0,annotation_id,doc_id,label,answer_texts,answer_starts,evidence_chunk_ids
13446,a230d95a2692f1fe,b26c38227ce11a4e,Highlight the parts (if any) of this contract ...,[MANUFACTURING AND SUPPLY AGREEMENT (DA-9801 L...,[510],[]
13447,ff594586c9b0ed7c,b26c38227ce11a4e,Highlight the parts (if any) of this contract ...,[Dong-A],[717],[]
13448,0f1e9bc9ae15c22c,b26c38227ce11a4e,Highlight the parts (if any) of this contract ...,[NeuroBo],[932],[]
13449,9243e126f05b602d,b26c38227ce11a4e,Highlight the parts (if any) of this contract ...,"[Dong-A ST Co., Ltd.,]",[717],[]
13450,91d9cd636c1d1043,b26c38227ce11a4e,Highlight the parts (if any) of this contract ...,"[NeuroBo Pharmaceuticals, Inc.,]",[932],[]


ann_df.head()

In [None]:
doc_ann.index

Index([13446, 13447, 13448, 13449, 13450, 13451, 13452, 13453, 13454, 13455,
       13456, 13457, 13458, 13459, 13460, 13461, 13462, 13463, 13464, 13465,
       13466, 13467, 13468, 13469, 13470, 13471, 13472, 13473, 13474, 13475,
       13476, 13477, 13478, 13479, 13480, 13481, 13482, 13483, 13484, 13485,
       13486, 13487, 13488, 13489],
      dtype='int64')

In [None]:
doc_ann.loc[doc_ann.index[3], "answer_texts"]

array(['Dong-A ST Co., Ltd.,'], dtype=object)

In [None]:
# chunk that document
doc_text = docs_df.loc[docs_df["doc_id"] == sample_doc_id, "text"].iloc[0]
chunks = chunk_text(doc_text, chunk_size=1000, overlap=150)
chunks

[{'chunk_index': 0,
  'start_char': 0,
  'end_char': 1000,
  'text': 'Exhibit 10.36 [Pursuant to Item 601(b)(10) of Regulation S-K, certain confidential portions of this exhibit have been omitted by means of marking such portions with asterisks as the identified confidential portions (i) are not material and (ii) would be competitively harmful if publicly disclosed.] MANUFACTURING AND SUPPLY AGREEMENT (DA-9801 Licensed Products) Between DONG-A ST CO., LTD. And NEUROBO PHARMACEUTICALS, INC. Dated: September 28, 2018\n\nSource: NEUROBO PHARMACEUTICALS, INC., S-4, 9/3/2019\n\n\n\n\n\nMANUFACTURING AND SUPPLY AGREEMENT (DA-9801 Licensed Products) This MANUFACTURING AND SUPPLY AGREEMENT (this "Agreement") is made and entered into as of September 28, 2018 ("Effective Date") by and between: Dong-A ST Co., Ltd., a corporation duly incorporated under the laws of the Republic of Korea, having its principal place of business at 64 Cheonho-daero, Dongdaemun-gu, Seoul 02587, Republic of Korea ("Don

In [None]:
# Stamp every chunk of the sample document with its doc_id and a chunk_id
# built from (doc_id, chunk_index, start_char, end_char).
# make_chunk_id is defined in an earlier cell.
for c in chunks:
    c["doc_id"] = sample_doc_id
    c["chunk_id"] = make_chunk_id(sample_doc_id, c["chunk_index"], c["start_char"], c["end_char"])

print("num chunks:", len(chunks))
print("first chunk_id:", chunks[0]["chunk_id"])

num chunks: 24
first chunk_id: ff7f3a9d475478f8


In [None]:
def is_empty_seq(x) -> bool:
    """
    Tell whether x should be treated as "no items".

    True for None, for anything whose len() is 0, and for objects that do not
    support len() at all. Using len() rather than truthiness keeps this safe
    for numpy arrays and pandas Series.
    """
    if x is None:
        return True
    try:
        size = len(x)
    except TypeError:
        # objects without a length are treated as empty
        return True
    return size == 0


def evidence_for_annotation_row(chunks, answer_texts, answer_starts):
    """
    Collect the chunk_ids of every chunk that overlaps at least one answer span.

    Span i runs from answer_starts[i] to answer_starts[i] + len(answer_texts[i]).
    If the two sequences differ in length, only the first min(len, len) pairs
    are used. Each chunk_id appears once, in first-seen order. Returns [] when
    either sequence is empty or None.
    """
    if is_empty_seq(answer_texts) or is_empty_seq(answer_starts):
        return []

    # dict keys give de-duplication and keep insertion order at the same time
    ordered_ids = {}
    n_spans = min(len(answer_texts), len(answer_starts))
    for k in range(n_spans):
        span_start = answer_starts[k]
        span_end = span_start + len(answer_texts[k])
        for hit in find_overlapping_chunks(chunks, span_start, span_end):
            ordered_ids.setdefault(hit["chunk_id"], None)

    return list(ordered_ids)


In [None]:
doc_ann.head(5)

Unnamed: 0,annotation_id,doc_id,label,answer_texts,answer_starts,evidence_chunk_ids
13446,a230d95a2692f1fe,b26c38227ce11a4e,Highlight the parts (if any) of this contract ...,[MANUFACTURING AND SUPPLY AGREEMENT (DA-9801 L...,[510],[]
13447,ff594586c9b0ed7c,b26c38227ce11a4e,Highlight the parts (if any) of this contract ...,[Dong-A],[717],[]
13448,0f1e9bc9ae15c22c,b26c38227ce11a4e,Highlight the parts (if any) of this contract ...,[NeuroBo],[932],[]
13449,9243e126f05b602d,b26c38227ce11a4e,Highlight the parts (if any) of this contract ...,"[Dong-A ST Co., Ltd.,]",[717],[]
13450,91d9cd636c1d1043,b26c38227ce11a4e,Highlight the parts (if any) of this contract ...,"[NeuroBo Pharmaceuticals, Inc.,]",[932],[]


In [None]:
ann_df.head(5)

Unnamed: 0,annotation_id,doc_id,label,answer_texts,answer_starts,evidence_chunk_ids
0,85ec9ab84c06ae69,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[DISTRIBUTOR AGREEMENT],[44],[74d4359635077bf8]
1,27956857c285b923,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Distributor],[244],[]
2,8951f023dfe64525,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Electric City of Illinois L.L.C.],[49574],[]
3,c66875b38edac636,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Electric City of Illinois LLC],[212],[]
4,f4aa8381671b625f,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Company],[197],[]


In [None]:
# Demo on the first 20 annotations of the sample document only.
# NOTE(review): the remaining annotation rows of this document keep whatever
# evidence_chunk_ids they already had in ann_df.
subset = doc_ann.head(20)

updated = 0
# doc_ann was filtered out of ann_df, so its index labels address the same
# rows in ann_df; values are read from and written back to ann_df.
for idx in subset.index:
    answer_texts = ann_df.at[idx, "answer_texts"]
    answer_starts = ann_df.at[idx, "answer_starts"]

    evidence_ids = evidence_for_annotation_row(chunks, answer_texts, answer_starts)

    ann_df.at[idx, "evidence_chunk_ids"] = evidence_ids
    updated += 1

print(updated)

20


In [None]:
ann_df.head(5)

Unnamed: 0,annotation_id,doc_id,label,answer_texts,answer_starts,evidence_chunk_ids
0,85ec9ab84c06ae69,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[DISTRIBUTOR AGREEMENT],[44],[74d4359635077bf8]
1,27956857c285b923,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Distributor],[244],[]
2,8951f023dfe64525,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Electric City of Illinois L.L.C.],[49574],[]
3,c66875b38edac636,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Electric City of Illinois LLC],[212],[]
4,f4aa8381671b625f,f71b4149a185d016,Highlight the parts (if any) of this contract ...,[Company],[197],[]


In [None]:
ann_df.loc[subset.index].head()

Unnamed: 0,annotation_id,doc_id,label,answer_texts,answer_starts,evidence_chunk_ids
13446,a230d95a2692f1fe,b26c38227ce11a4e,Highlight the parts (if any) of this contract ...,[MANUFACTURING AND SUPPLY AGREEMENT (DA-9801 L...,[510],[ff7f3a9d475478f8]
13447,ff594586c9b0ed7c,b26c38227ce11a4e,Highlight the parts (if any) of this contract ...,[Dong-A],[717],[ff7f3a9d475478f8]
13448,0f1e9bc9ae15c22c,b26c38227ce11a4e,Highlight the parts (if any) of this contract ...,[NeuroBo],[932],"[ff7f3a9d475478f8, c7061889511944bd]"
13449,9243e126f05b602d,b26c38227ce11a4e,Highlight the parts (if any) of this contract ...,"[Dong-A ST Co., Ltd.,]",[717],[ff7f3a9d475478f8]
13450,91d9cd636c1d1043,b26c38227ce11a4e,Highlight the parts (if any) of this contract ...,"[NeuroBo Pharmaceuticals, Inc.,]",[932],"[ff7f3a9d475478f8, c7061889511944bd]"


### Normalize columns so everything is a plain Python list (we know the columns contain `np.array` values from the earlier error that led us to write `is_empty_seq`)

In [None]:
def to_py_list(x):
    """
    Convert list/tuple/numpy array/pandas Series to a plain Python list.

    Always returns a list:
    - None -> []
    - list -> returned as-is (same object, not copied)
    - tuple -> list(x)
    - str / bytes -> [x] (one value, not a sequence of characters)
    - objects with .tolist() (numpy arrays, numpy scalars, pandas Series)
      -> the .tolist() result, wrapped in a list if it is a bare scalar
    - any other iterable -> list(x)
    - anything else -> []
    """
    if x is None:
        return []

    # already a normal list
    if isinstance(x, list):
        return x

    # tuples are fine
    if isinstance(x, tuple):
        return list(x)

    # a bare string is a single answer; list("abc") would explode it into
    # ['a', 'b', 'c']
    if isinstance(x, (str, bytes)):
        return [x]

    # numpy arrays / pandas Series often have .tolist()
    if hasattr(x, "tolist"):
        converted = x.tolist()
        # numpy scalars and 0-d arrays return a bare Python scalar from
        # .tolist(); wrap it so callers can always iterate the result
        return converted if isinstance(converted, list) else [converted]

    # fallback: try to iterate
    try:
        return list(x)
    except TypeError:
        return []


In [None]:
# Rebuild both answer columns as plain Python containers: every cell becomes
# a list, and every start offset becomes a built-in int.
new_texts = [to_py_list(cell) for cell in ann_df["answer_texts"]]
new_starts = [
    [int(offset) for offset in to_py_list(cell)]
    for cell in ann_df["answer_starts"]
]

ann_df["answer_texts"] = new_texts
ann_df["answer_starts"] = new_starts

print("done normalizing")


done normalizing


### Create sqlite DB with SQLAlchemy

In [None]:
from pathlib import Path
from sqlalchemy import create_engine

# The database path is relative, so it resolves against the kernel's current
# working directory (the absolute location is printed below).
db_path = Path("data/contractiq.db")
db_path.parent.mkdir(parents=True, exist_ok=True)  # makes the "data/" folder if missing

# SQLite URL for the file above; this engine is reused by every later DB cell.
engine = create_engine(f"sqlite:///{db_path}")
print("DB file will be at:", db_path.resolve())


DB file will be at: /mnt/c/Ubuntu/my_github_repos/ContractIQ/notebooks/data/contractiq.db


### Define schema

In [None]:
from sqlalchemy import (
    MetaData, Table, Column,
    String, Integer, Text, ForeignKey
)

# One MetaData registry collects the three table definitions below so they
# can be created together with metadata.create_all(engine).
metadata = MetaData()

# One row per contract, keyed by doc_id; holds the full contract text.
documents = Table(
    "documents", metadata,
    Column("doc_id", String, primary_key=True),
    Column("title", Text, nullable=False),
    Column("source", String, nullable=False),
    Column("text", Text, nullable=False),
)

# One row per text chunk of a document, with its character offsets and text.
# NOTE(review): this rebinds the notebook-level name `chunks`, which earlier
# cells use for the list of chunk dicts returned by chunk_text(). Earlier
# cells that iterate `chunks` will break if re-run after this cell until the
# list is rebuilt.
chunks = Table(
    "chunks", metadata,
    Column("chunk_id", String, primary_key=True),
    Column("doc_id", String, ForeignKey("documents.doc_id"), nullable=False),
    Column("chunk_index", Integer, nullable=False),
    Column("start_char", Integer, nullable=False),
    Column("end_char", Integer, nullable=False),
    Column("text", Text, nullable=False),
)

# One row per annotation of a document. The list-valued fields (answer
# texts, answer start offsets, evidence chunk ids) are stored as JSON text
# in the *_json columns.
annotations = Table(
    "annotations", metadata,
    Column("annotation_id", String, primary_key=True),
    Column("doc_id", String, ForeignKey("documents.doc_id"), nullable=False),
    Column("label", Text, nullable=False),
    Column("answer_texts_json", Text, nullable=False),
    Column("answer_starts_json", Text, nullable=False),
    Column("evidence_chunk_ids_json", Text, nullable=False),
)


### Create the tables

In [None]:
# Creates the tables registered on `metadata` (documents, chunks, annotations).
# NOTE(review): tables that already exist are presumably left as they are, so
# a schema change would need the old table dropped first — confirm.
metadata.create_all(engine)
print("tables created (if not already).")


tables created (if not already).


In [None]:
# Without .iloc[0] the sample is still a one-element Series (index label + value).
sample_doc_id = docs_df.sample(1, random_state=0)["doc_id"]
print(sample_doc_id)
# With .iloc[0] we get the bare doc_id; the same random_state picks the same row again.
sample_doc_id = docs_df.sample(1, random_state=0)["doc_id"].iloc[0]
# Boolean filtering returns a one-row DataFrame ...
doc_row = docs_df.loc[docs_df["doc_id"] == sample_doc_id]
print('doc_row')
print(doc_row)
# ... and .iloc[0] turns it into a Series keyed by column name, which is the
# form the later cells index into (doc_row["text"], doc_row["title"], ...).
doc_row = docs_df.loc[docs_df["doc_id"] == sample_doc_id].iloc[0]
print('doc_row first row', doc_row)

13446    b26c38227ce11a4e
Name: doc_id, dtype: object
doc_row
                                                   title  \
13446  NeuroboPharmaceuticalsInc_20190903_S-4_EX-10.3...   

                                                    text            doc_id  \
13446  Exhibit 10.36 [Pursuant to Item 601(b)(10) of ...  b26c38227ce11a4e   

        source  
13446  cuad-qa  
doc_row first row title     NeuroboPharmaceuticalsInc_20190903_S-4_EX-10.3...
text      Exhibit 10.36 [Pursuant to Item 601(b)(10) of ...
doc_id                                     b26c38227ce11a4e
source                                              cuad-qa
Name: 13446, dtype: object


### Build chunks for that document

In [None]:
# Chunk the sample document's text. The result goes into `chunks_list`
# because the name `chunks` now refers to the SQLAlchemy table.
doc_text = doc_row["text"]
chunks_list = chunk_text(doc_text, chunk_size=1000, overlap=150)

# Stamp each chunk with its doc_id and a chunk_id built from
# (doc_id, chunk_index, start_char, end_char); make_chunk_id is defined in an
# earlier cell.
for c in chunks_list:
    c["doc_id"] = sample_doc_id
    c["chunk_id"] = make_chunk_id(sample_doc_id, c["chunk_index"], c["start_char"], c["end_char"])

print("num chunks:", len(chunks_list))
print("first chunk_id:", chunks_list[0]["chunk_id"])


num chunks: 24
first chunk_id: ff7f3a9d475478f8


### collect annotations for that document

In [None]:
# Independent copy of this document's annotation rows. Its index labels match
# the corresponding rows of ann_df, which is how later cells address them.
doc_annotations = ann_df[ann_df["doc_id"] == sample_doc_id].copy()
print("num annotation rows for doc:", len(doc_annotations))
doc_annotations.head(3)


num annotation rows for doc: 44


Unnamed: 0,annotation_id,doc_id,label,answer_texts,answer_starts,evidence_chunk_ids
13446,a230d95a2692f1fe,b26c38227ce11a4e,Highlight the parts (if any) of this contract ...,[MANUFACTURING AND SUPPLY AGREEMENT (DA-9801 L...,[510],[ff7f3a9d475478f8]
13447,ff594586c9b0ed7c,b26c38227ce11a4e,Highlight the parts (if any) of this contract ...,[Dong-A],[717],[ff7f3a9d475478f8]
13448,0f1e9bc9ae15c22c,b26c38227ce11a4e,Highlight the parts (if any) of this contract ...,[NeuroBo],[932],"[ff7f3a9d475478f8, c7061889511944bd]"


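### Let's normalize this document's rows in `ann_df` so that no numpy arrays are left (`doc_annotations` is only used for its index; the values are read from and written back to `ann_df`)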

In [None]:
# For every annotation row of the sample document, rewrite the two answer
# columns in ann_df as plain Python lists (start offsets as built-in ints).
fixed = 0
for idx in doc_annotations.index:
    ann_df.at[idx, "answer_texts"] = to_py_list(ann_df.at[idx, "answer_texts"])
    ann_df.at[idx, "answer_starts"] = [
        int(offset) for offset in to_py_list(ann_df.at[idx, "answer_starts"])
    ]
    fixed += 1

print("normalized rows:", fixed)


normalized rows: 44


In [None]:
# create JSON converter so that the SQLite can store lists in JSON as opposed to normal lists
import json

def dumps_json(x) -> str:
    """
    Serialize x to a JSON string for storage in a TEXT column.

    Non-ASCII characters are written as-is (ensure_ascii=False). Values the
    json module cannot encode natively but that expose ``.tolist()`` (numpy
    arrays, numpy scalars, pandas Series) are converted through it, so a
    stray array or numpy integer no longer aborts the serialization.

    Raises TypeError for any other unsupported type, as json.dumps does.
    """
    def _to_builtin(value):
        # json.dumps calls this only for objects it cannot encode itself
        to_list = getattr(value, "tolist", None)
        if callable(to_list):
            return to_list()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    return json.dumps(x, ensure_ascii=False, default=_to_builtin)

`documents.delete()` is equivalent to `DELETE FROM documents;`

`documents.c` means "columns of the documents table"

`documents.c.doc_id` is the `doc_id` column

so `documents.delete().where(documents.c.doc_id == sample_doc_id)` is equivalent to `DELETE FROM documents WHERE doc_id = sample_doc_id;`

We start with `delete` so the cell can be re-run: removing this document's existing rows first means the inserts never create duplicates or hit primary-key conflicts.

```
conn.execute(documents.insert().values(
    doc_id=sample_doc_id,
    title=str(doc_row["title"]),
    source=str(doc_row["source"]),
    text=str(doc_row["text"]),
))
```

is equivalent to 
```
INSERT INTO documents (doc_id, title, source, text)
VALUES (:doc_id, :title, :source, :text);
```

and 
```
conn.execute(chunks.insert(), chunk_rows)
```

is bulk insert where `chunk_rows` is a list of dictionaries, one dict per row. example of one dict:

```
{
  "chunk_id": "...",
  "doc_id": "...",
  "chunk_index": 0,
  "start_char": 0,
  "end_char": 1000,
  "text": "...."
}
```

In [None]:
# Write the sample document, its chunks and its annotations in one
# transaction: engine.begin() commits on success and rolls back on error.
with engine.begin() as conn:
    # Delete old rows for this doc (safe for reruns). Child tables go first:
    # chunks and annotations both reference documents.doc_id, so removing the
    # parent row first would violate those foreign keys on any connection
    # that enforces them.
    conn.execute(chunks.delete().where(chunks.c.doc_id == sample_doc_id))
    conn.execute(annotations.delete().where(annotations.c.doc_id == sample_doc_id))
    conn.execute(documents.delete().where(documents.c.doc_id == sample_doc_id))

    # insert the document (parent row before its children)
    conn.execute(documents.insert().values(
        doc_id=sample_doc_id,
        title=str(doc_row["title"]),
        source=str(doc_row["source"]),
        text=str(doc_row["text"]),
    ))

    # insert all chunks for this doc, coerced to built-in int/str
    chunk_rows = [
        {
            "chunk_id": c["chunk_id"],
            "doc_id": c["doc_id"],
            "chunk_index": int(c["chunk_index"]),
            "start_char": int(c["start_char"]),
            "end_char": int(c["end_char"]),
            "text": str(c["text"]),
        }
        for c in chunks_list
    ]
    # Guard against an empty batch rather than relying on how execute()
    # treats an empty parameter list.
    if chunk_rows:
        conn.execute(chunks.insert(), chunk_rows)

    # insert all annotations for this doc; list-valued fields are stored as JSON text
    # NOTE(review): evidence_chunk_ids is written exactly as stored in ann_df;
    # the earlier demo cell only filled it for the first 20 annotations of
    # this document, so the remaining rows may carry an empty list.
    ann_rows = [
        {
            "annotation_id": ann_df.at[idx, "annotation_id"],
            "doc_id": ann_df.at[idx, "doc_id"],
            "label": ann_df.at[idx, "label"],
            "answer_texts_json": dumps_json(ann_df.at[idx, "answer_texts"]),
            "answer_starts_json": dumps_json(ann_df.at[idx, "answer_starts"]),
            "evidence_chunk_ids_json": dumps_json(ann_df.at[idx, "evidence_chunk_ids"]),
        }
        for idx in doc_annotations.index
    ]
    if ann_rows:
        conn.execute(annotations.insert(), ann_rows)

print("inserted doc + chunks + annotations")


inserted doc + chunks + annotations


### Let's verify that the DB really contains what we think it contains. How many total rows are in each table? How many rows are tied to this one document?

```
select(func.count()).select_from(documents)
```

is equivalent to

```
SELECT count(*) FROM documents;
```

`.scalar_one()` returns one value (a single number)

In [None]:
from sqlalchemy import select, func

# Total row count per table: SELECT count(*) FROM <table>, one query each.
with engine.connect() as conn:
    doc_count, chunk_count, ann_count = [
        conn.execute(select(func.count()).select_from(table)).scalar_one()
        for table in (documents, chunks, annotations)
    ]

print("documents:", doc_count)
print("chunks:", chunk_count)
print("annotations:", ann_count)


documents: 1
chunks: 24
annotations: 44


### now count rows for just this doc

In [None]:
# Row counts restricted to the sample document, for the two child tables.
with engine.connect() as conn:
    chunk_count_doc, ann_count_doc = [
        conn.execute(
            select(func.count())
            .select_from(table)
            .where(table.c.doc_id == sample_doc_id)
        ).scalar_one()
        for table in (chunks, annotations)
    ]

print("chunks for doc:", chunk_count_doc)
print("annotations for doc:", ann_count_doc)


chunks for doc: 24
annotations for doc: 44


### We make a tiny “contract explorer” function that, given:

doc_id

a keyword like "Effective Date" or "Parties"

returns:

the matching annotation rows

the answer text(s)

a short snippet from the contract around each answer span

In [None]:
from sqlalchemy import select
import json

def fetch_document_text(conn, doc_id: str) -> str:
    """
    Return the stored text of one document.

    Runs SELECT text FROM documents WHERE doc_id = :doc_id on the given
    connection and returns the first row's text.
    Raises KeyError when no document has this doc_id.
    """
    stmt = select(documents.c.text).where(documents.c.doc_id == doc_id)
    hit = conn.execute(stmt).first()
    if hit is not None:
        return hit[0]
    raise KeyError(f"doc_id not found: {doc_id}")

### make a snippet around an answer span

In [None]:
def snippet_around_span(text: str, start: int, span_len: int, window: int = 80) -> str:
    """
    Returns a short snippet with [[...]] around the answer span.

    Parameters:
        text: the full document text.
        start: character offset where the span begins (coerced with int()).
        span_len: number of characters in the span (coerced with int()).
        window: how many characters of context to keep on each side.

    The span is clamped to the bounds of ``text`` before slicing, so an
    out-of-range or negative offset gives a shortened (possibly empty)
    [[...]] marker instead of Python's from-the-end slicing semantics.
    """
    raw_start = int(start)
    # a negative length is treated as an empty span
    raw_end = raw_start + max(0, int(span_len))

    text_len = len(text)
    # clamp into [0, len(text)]; without this a negative offset would make
    # text[left:start] slice from the end of the document
    begin = min(max(raw_start, 0), text_len)
    stop = min(max(raw_end, begin), text_len)

    left = max(0, begin - window)
    right = min(text_len, stop + window)

    return (
        text[left:begin]
        + "[["
        + text[begin:stop]
        + "]]"
        + text[stop:right]
    )


### Fetch annotations for a doc + optional keyword filter

In [None]:
def fetch_annotations_for_doc(conn, doc_id: str, label_contains: str | None = None, limit: int = 10):
    """
    Fetch up to ``limit`` annotation rows for one document.

    Each returned row holds, in order: annotation_id, label,
    answer_texts_json, answer_starts_json.

    Parameters:
        conn: an open SQLAlchemy connection.
        doc_id: the document whose annotations are wanted.
        label_contains: optional substring the label must contain. It is
            matched literally: ``%`` and ``_`` in the filter are escaped
            instead of acting as LIKE wildcards.
        limit: maximum number of rows to fetch from the result.
    """
    stmt = select(
        annotations.c.annotation_id,
        annotations.c.label,
        annotations.c.answer_texts_json,
        annotations.c.answer_starts_json,
    ).where(annotations.c.doc_id == doc_id)

    if label_contains is not None:
        # contains() renders label LIKE '%' || :value || '%'; autoescape=True
        # escapes wildcard characters inside the user-supplied value
        stmt = stmt.where(annotations.c.label.contains(label_contains, autoescape=True))

    return conn.execute(stmt).fetchmany(limit)


In [None]:
def show_answers_with_snippets(doc_id: str, label_contains: str, limit: int = 5):
    """
    Print the annotations of a document whose label matches a filter.

    For every matching annotation this prints its id and label, then at most
    two answer spans, each with its start offset and a snippet of the
    surrounding contract text. Annotations without answers are reported as
    "(no answer)". Nothing is returned.
    """
    with engine.connect() as conn:
        doc_text = fetch_document_text(conn, doc_id)
        rows = fetch_annotations_for_doc(conn, doc_id, label_contains=label_contains, limit=limit)

    print("doc_id:", doc_id)
    print("filter:", label_contains)
    print("rows returned:", len(rows))
    print("-" * 60)

    max_spans = 2  # show up to 2 spans per annotation
    for record in rows:
        # the two list-valued columns are stored as JSON text
        texts = json.loads(record[2])
        starts = json.loads(record[3])

        print("\nannotation_id:", record[0])
        print("label:", record[1])

        if len(texts) == 0:
            print("answer: (no answer)")
            continue

        shown = min(max_spans, len(texts), len(starts))
        for ans, st in zip(texts[:shown], starts[:shown]):
            print("\nanswer:", ans)
            print("start:", st)
            print("snippet:\n", snippet_around_span(doc_text, st, len(ans), window=80))


In [None]:
show_answers_with_snippets(sample_doc_id, label_contains="Effective Date", limit=5)

doc_id: b26c38227ce11a4e
filter: Effective Date
rows returned: 1
------------------------------------------------------------

annotation_id: 099852557f935b9c
label: Highlight the parts (if any) of this contract related to "Effective Date" that should be reviewed by a lawyer. Details: The date when the contract is effective

answer: September 28, 2018
start: 432
snippet:
  Products) Between DONG-A ST CO., LTD. And NEUROBO PHARMACEUTICALS, INC. Dated: [[September 28, 2018]]

Source: NEUROBO PHARMACEUTICALS, INC., S-4, 9/3/2019





MANUFACTURING AND SU


In [None]:
def search_annotations(doc_id: str, label_contains: str, limit: int = 5):
    """
    Return the annotations of a document whose label matches a filter.

    The result is a dict with the keys "doc_id", "filter" and "results".
    Each entry of "results" has "annotation_id", "label" and "answers";
    every answer carries its "text", its integer "start_char" and a
    "snippet" of the surrounding contract text. "answers" is an empty list
    when the annotation has no answer.
    """
    with engine.connect() as conn:
        doc_text = fetch_document_text(conn, doc_id)
        rows = fetch_annotations_for_doc(conn, doc_id, label_contains=label_contains, limit=limit)

    def _answers_for(record):
        # texts and starts are stored as JSON; zip stops at the shorter list
        texts = json.loads(record[2])
        starts = json.loads(record[3])
        return [
            {
                "text": ans,
                "start_char": int(st),
                "snippet": snippet_around_span(text=doc_text, start=int(st), span_len=len(ans), window=80),
            }
            for ans, st in zip(texts, starts)
        ]

    results = [
        {
            "annotation_id": record[0],
            "label": record[1],
            "answers": _answers_for(record),   # empty list if none
        }
        for record in rows
    ]

    return {
        "doc_id": doc_id,
        "filter": label_contains,
        "results": results,
    }


In [None]:
out = search_annotations(sample_doc_id, "Effective Date", limit=3)
out


{'doc_id': 'b26c38227ce11a4e',
 'filter': 'Effective Date',
 'results': [{'annotation_id': '099852557f935b9c',
   'label': 'Highlight the parts (if any) of this contract related to "Effective Date" that should be reviewed by a lawyer. Details: The date when the contract is effective',
   'answers': [{'text': 'September 28, 2018',
     'start_char': 432,
     'snippet': ' Products) Between DONG-A ST CO., LTD. And NEUROBO PHARMACEUTICALS, INC. Dated: [[September 28, 2018]]\n\nSource: NEUROBO PHARMACEUTICALS, INC., S-4, 9/3/2019\n\n\n\n\n\nMANUFACTURING AND SU'}]}]}

In [None]:
out["results"][0]

{'annotation_id': '099852557f935b9c',
 'label': 'Highlight the parts (if any) of this contract related to "Effective Date" that should be reviewed by a lawyer. Details: The date when the contract is effective',
 'answers': [{'text': 'September 28, 2018',
   'start_char': 432,
   'snippet': ' Products) Between DONG-A ST CO., LTD. And NEUROBO PHARMACEUTICALS, INC. Dated: [[September 28, 2018]]\n\nSource: NEUROBO PHARMACEUTICALS, INC., S-4, 9/3/2019\n\n\n\n\n\nMANUFACTURING AND SU'}]}

In [None]:
out = search_annotations(sample_doc_id, 'related to "Effective Date"', limit=3)
out


{'doc_id': 'b26c38227ce11a4e',
 'filter': 'related to "Effective Date"',
 'results': [{'annotation_id': '099852557f935b9c',
   'label': 'Highlight the parts (if any) of this contract related to "Effective Date" that should be reviewed by a lawyer. Details: The date when the contract is effective',
   'answers': [{'text': 'September 28, 2018',
     'start_char': 432,
     'snippet': ' Products) Between DONG-A ST CO., LTD. And NEUROBO PHARMACEUTICALS, INC. Dated: [[September 28, 2018]]\n\nSource: NEUROBO PHARMACEUTICALS, INC., S-4, 9/3/2019\n\n\n\n\n\nMANUFACTURING AND SU'}]}]}

In [None]:
def show_first_result(out):
    """
    Print the label, first answer and snippet of the first search result.

    ``out`` is the dict returned by search_annotations. Prints "No results"
    when there is no result at all, and "answer: (no answer)" when the first
    result has no answers.
    """
    hits = out["results"]
    if len(hits) == 0:
        print("No results")
        return

    first_hit = hits[0]
    print("label:", first_hit["label"])

    spans = first_hit["answers"]
    if len(spans) == 0:
        print("answer: (no answer)")
        return

    top_span = spans[0]
    for field in ("text", "snippet"):
        # "text" is shown under the heading "answer:"
        heading = "answer:" if field == "text" else "snippet:"
        print(heading, top_span[field])

show_first_result(out)

label: Highlight the parts (if any) of this contract related to "Effective Date" that should be reviewed by a lawyer. Details: The date when the contract is effective
answer: September 28, 2018
snippet:  Products) Between DONG-A ST CO., LTD. And NEUROBO PHARMACEUTICALS, INC. Dated: [[September 28, 2018]]

Source: NEUROBO PHARMACEUTICALS, INC., S-4, 9/3/2019





MANUFACTURING AND SU


the 'list_labels_for_doc' should show the answer to “Show me all the questions CUAD asked about this document.”

In [None]:
def list_labels_for_doc(doc_id: str, limit: int = 500):
    """
    List the distinct annotation labels stored for a document.

    At most ``limit`` annotation rows are fetched; duplicates among them are
    dropped and the labels are returned in first-seen order.
    """
    stmt = select(annotations.c.label).where(annotations.c.doc_id == doc_id)
    with engine.connect() as conn:
        rows = conn.execute(stmt).fetchmany(limit)

    # dict.fromkeys drops duplicates while keeping the first-seen order
    return list(dict.fromkeys(record[0] for record in rows))


In [None]:
labels = list_labels_for_doc(sample_doc_id)
print("num labels:", len(labels))
print(labels[0])


num labels: 41
Highlight the parts (if any) of this contract related to "Document Name" that should be reviewed by a lawyer. Details: The name of the contract


In [None]:
chosen_label = labels[0]   
out = search_annotations(sample_doc_id, chosen_label, limit=3)  # current function uses LIKE
out


{'doc_id': 'b26c38227ce11a4e',
 'filter': 'Highlight the parts (if any) of this contract related to "Document Name" that should be reviewed by a lawyer. Details: The name of the contract',
 'results': [{'annotation_id': 'a230d95a2692f1fe',
   'label': 'Highlight the parts (if any) of this contract related to "Document Name" that should be reviewed by a lawyer. Details: The name of the contract',
   'answers': [{'text': 'MANUFACTURING AND SUPPLY AGREEMENT (DA-9801 Licensed Products)',
     'start_char': 510,
     'snippet': ': September 28, 2018\n\nSource: NEUROBO PHARMACEUTICALS, INC., S-4, 9/3/2019\n\n\n\n\n\n[[MANUFACTURING AND SUPPLY AGREEMENT (DA-9801 Licensed Products)]] This MANUFACTURING AND SUPPLY AGREEMENT (this "Agreement") is made and entered '}]}]}

In [None]:
show_first_result(out)

label: Highlight the parts (if any) of this contract related to "Document Name" that should be reviewed by a lawyer. Details: The name of the contract
answer: MANUFACTURING AND SUPPLY AGREEMENT (DA-9801 Licensed Products)
snippet: : September 28, 2018

Source: NEUROBO PHARMACEUTICALS, INC., S-4, 9/3/2019





[[MANUFACTURING AND SUPPLY AGREEMENT (DA-9801 Licensed Products)]] This MANUFACTURING AND SUPPLY AGREEMENT (this "Agreement") is made and entered 


In [None]:
def search_labels(labels: list[str], query: str, max_results: int = 10):
    """
    Return the labels that contain `query` as a case-insensitive substring.

    labels: candidate label strings, scanned in the given order
    query: text to look for; leading/trailing whitespace is ignored
    max_results: maximum number of labels to return
    returns: list of matching labels in their original order; empty when the
             query is blank or max_results is not positive
    """
    q = query.strip().lower()
    # A non-positive cap means "return nothing"; without this guard the loop
    # below would still append one match before noticing the cap.
    if q == "" or max_results <= 0:
        return []

    matches = []
    for label in labels:
        if q in label.lower():
            matches.append(label)
            # Only a new match can reach the cap, so check it right here.
            if len(matches) >= max_results:
                break

    return matches


In [None]:
# Re-fetch the labels for the sample document and display the full list.
labels = list_labels_for_doc(sample_doc_id)
labels

['Highlight the parts (if any) of this contract related to "Document Name" that should be reviewed by a lawyer. Details: The name of the contract',
 'Highlight the parts (if any) of this contract related to "Parties" that should be reviewed by a lawyer. Details: The two or more parties who signed the contract',
 'Highlight the parts (if any) of this contract related to "Agreement Date" that should be reviewed by a lawyer. Details: The date of the contract',
 'Highlight the parts (if any) of this contract related to "Effective Date" that should be reviewed by a lawyer. Details: The date when the contract is effective',
 'Highlight the parts (if any) of this contract related to "Expiration Date" that should be reviewed by a lawyer. Details: On what date will the contract\'s initial term expire?',
 'Highlight the parts (if any) of this contract related to "Renewal Term" that should be reviewed by a lawyer. Details: What is the renewal term after the initial term expires? This includes aut

In [None]:
# Substring search over the labels. Note that "date" also hits the
# "Liquidated Damages" label because "liquidated" contains "date".
matches = search_labels(labels, query="date", max_results=10)
print("matches:", len(matches))
for m in matches:
    print("-", m)


matches: 4
- Highlight the parts (if any) of this contract related to "Agreement Date" that should be reviewed by a lawyer. Details: The date of the contract
- Highlight the parts (if any) of this contract related to "Effective Date" that should be reviewed by a lawyer. Details: The date when the contract is effective
- Highlight the parts (if any) of this contract related to "Expiration Date" that should be reviewed by a lawyer. Details: On what date will the contract's initial term expire?
- Highlight the parts (if any) of this contract related to "Liquidated Damages" that should be reviewed by a lawyer. Details: Does the contract contain a clause that would award either party liquidated damages for breach or a fee upon the termination of a contract (termination fee)?


In [None]:
# Drill into the first matched label (if there is one) and print its first result.
if len(matches) > 0:
    chosen_label = matches[0]
    out = search_annotations(sample_doc_id, chosen_label, limit=3)
    show_first_result(out)
else:
    print("No label matches found.")


label: Highlight the parts (if any) of this contract related to "Agreement Date" that should be reviewed by a lawyer. Details: The date of the contract
answer: September 28, 2018
snippet:  Products) Between DONG-A ST CO., LTD. And NEUROBO PHARMACEUTICALS, INC. Dated: [[September 28, 2018]]

Source: NEUROBO PHARMACEUTICALS, INC., S-4, 9/3/2019





MANUFACTURING AND SU


### return json-ready data

In [None]:
def explore_doc(doc_id: str, query: str, max_labels: int = 10, max_answers_per_label: int = 2):
    """
    Search one document's labels for `query` and return JSON-ready results.

    For every matched label only the first annotation row that comes back is
    used; at most `max_answers_per_label` of its answers are returned, each
    with a snippet of the surrounding document text.

    returns: dict with keys "doc_id", "query", "matched_labels", "results"
    """
    # Labels of the document, narrowed down to those matching the query.
    matched_labels = search_labels(
        list_labels_for_doc(doc_id), query=query, max_results=max_labels
    )

    results = []
    with engine.connect() as conn:
        doc_text = fetch_document_text(conn, doc_id)

        for matched in matched_labels:
            found = fetch_annotations_for_doc(conn, doc_id, label_contains=matched, limit=10)
            if len(found) == 0:
                continue

            # Several rows may come back; only the first one is reported.
            first = found[0]
            texts = json.loads(first[2])
            starts = json.loads(first[3])

            # Never read past the shorter of the two parallel lists.
            count = min(max_answers_per_label, len(texts), len(starts))
            picked = [(texts[i], int(starts[i])) for i in range(count)]

            results.append({
                "annotation_id": first[0],
                "label": first[1],
                "answers": [
                    {
                        "text": text,
                        "start_char": start,
                        "snippet": snippet_around_span(doc_text, start, len(text), window=80),
                    }
                    for text, start in picked
                ],
            })

    return {
        "doc_id": doc_id,
        "query": query,
        "matched_labels": matched_labels,
        "results": results,
    }


In [None]:
# Run the combined label search and show the matched labels plus the snippet
# of the first answer of the first result.
resp = explore_doc(sample_doc_id, "date", max_labels=5)
print(resp["matched_labels"])
print(resp["results"][0]["answers"][0]["snippet"])


['Highlight the parts (if any) of this contract related to "Agreement Date" that should be reviewed by a lawyer. Details: The date of the contract', 'Highlight the parts (if any) of this contract related to "Effective Date" that should be reviewed by a lawyer. Details: The date when the contract is effective', 'Highlight the parts (if any) of this contract related to "Expiration Date" that should be reviewed by a lawyer. Details: On what date will the contract\'s initial term expire?', 'Highlight the parts (if any) of this contract related to "Liquidated Damages" that should be reviewed by a lawyer. Details: Does the contract contain a clause that would award either party liquidated damages for breach or a fee upon the termination of a contract (termination fee)?']
 Products) Between DONG-A ST CO., LTD. And NEUROBO PHARMACEUTICALS, INC. Dated: [[September 28, 2018]]

Source: NEUROBO PHARMACEUTICALS, INC., S-4, 9/3/2019





MANUFACTURING AND SU


### one label can appear in multiple rows, so merge answers across all rows for that label

In [None]:
def merge_answer_spans(rows):
    """
    Combine the answer spans of several annotation rows into one list.

    rows: list of tuples (annotation_id, label, answer_texts_json, answer_starts_json)
    returns: list of dicts [{"text": ..., "start_char": ...}, ...] with
             duplicate (start, text) pairs removed, ordered by start_char
    """
    # Keyed by (start, text); dict insertion order keeps first-seen order.
    unique = {}

    for row in rows:
        texts = json.loads(row[2])
        starts = json.loads(row[3])

        # zip stops at the shorter list, so unpaired entries are ignored.
        for text, raw_start in zip(texts, starts):
            start = int(raw_start)
            key = (start, text)
            if key not in unique:
                unique[key] = {"text": text, "start_char": start}

    # Order by position in the contract; the sort is stable, so spans that
    # share a start keep their first-seen order.
    return sorted(unique.values(), key=lambda span: span["start_char"])


In [137]:
def extract_field_name(label: str) -> str | None:
    first = label.find('"')#find returns the index in the string
    if first == -1:
        return None
    second = label.find('"', first + 1)#second argument of find tell us to start search at index first+1
    if second == -1:
        return None
    return label[first + 1:second]


In [140]:
def explore_doc_all_rows(doc_id: str, query: str, max_labels: int = 10, max_answers_per_label: int = 2):
    """
    Like a label search over one document, but answers are merged across all
    annotation rows fetched for each matched label.

    Each result carries the annotation id of the first row, the quoted field
    name taken from the label, up to `max_answers_per_label` answers with
    snippets, and counts of merged spans and rows.

    returns: dict with keys "doc_id", "query", "matched_labels", "results"
    """
    matched_labels = search_labels(
        list_labels_for_doc(doc_id), query=query, max_results=max_labels
    )

    results = []

    with engine.connect() as conn:
        doc_text = fetch_document_text(conn, doc_id)

        for matched in matched_labels:
            label_rows = fetch_annotations_for_doc(conn, doc_id, label_contains=matched, limit=50)
            if len(label_rows) == 0:
                continue

            # The first row supplies the id and label reported for the group.
            first_row = label_rows[0]
            label_str = first_row[1]
            all_spans = merge_answer_spans(label_rows)

            # Only the leading spans are expanded into answers with snippets.
            answers = [
                {
                    "text": span["text"],
                    "start_char": span["start_char"],
                    "snippet": snippet_around_span(
                        doc_text, span["start_char"], len(span["text"]), window=80
                    ),
                }
                for span in all_spans[:max_answers_per_label]
            ]

            results.append({
                "annotation_id": first_row[0],
                "field": extract_field_name(label_str),
                "label": label_str,
                "answers": answers,
                "num_spans_found": len(all_spans),
                "num_rows_merged": len(label_rows),
            })

    return {
        "doc_id": doc_id,
        "query": query,
        "matched_labels": matched_labels,
        "results": results,
    }


In [141]:
# Merge answers across rows for each matched label, then print the field name
# and, when the label has any answer, the first one (answers are ordered by
# start position, so this is the earliest span in the contract).
resp = explore_doc_all_rows(sample_doc_id, query="date", max_labels=5, max_answers_per_label=2)

for r in resp["results"]:
    print("field:", r["field"])
    if r["answers"]:
        print("  answer:", r["answers"][0]["text"])


field: Agreement Date
  answer: September 28, 2018
field: Effective Date
  answer: September 28, 2018
field: Expiration Date
  answer: This Agreement shall commence on the Effective Date and, unless earlier terminated, shall continue in full force and effect for a period of [***] years thereafter.
field: Liquidated Damages


In [142]:
def list_documents(limit: int = 20):
    """
    Return up to `limit` stored documents.

    returns: list of dicts with keys "doc_id", "title" and "source"
    """
    stmt = select(documents.c.doc_id, documents.c.title, documents.c.source)

    with engine.connect() as conn:
        fetched = conn.execute(stmt).fetchmany(limit)

    # Turn the positional rows into plain dicts.
    return [
        {"doc_id": row[0], "title": row[1], "source": row[2]}
        for row in fetched
    ]

# List up to 10 stored documents and print each id with the first 80
# characters of its title.
docs = list_documents(limit=10)
print("num docs:", len(docs))
for d in docs:
    print(d["doc_id"], "-", d["title"][:80])


num docs: 1
b26c38227ce11a4e - NeuroboPharmaceuticalsInc_20190903_S-4_EX-10.36_11802165_EX-10.36_Manufacturing 


### Let's insert 5 documents into the DB

In [143]:
# Pick the first N distinct doc_ids from docs_df as the batch to insert.
N = 5
doc_ids_to_insert = docs_df["doc_id"].drop_duplicates().head(N).tolist()

print("doc_ids_to_insert:", len(doc_ids_to_insert))
doc_ids_to_insert


doc_ids_to_insert: 5


['f71b4149a185d016',
 '43ab152a17a15599',
 '644b67c819fbca9c',
 '2d790a4a4132cc53',
 'b9d11c50bada2ade']

In [144]:
def insert_one_doc(conn, doc_id: str, chunk_size: int = 1000, overlap: int = 150):
    """
    Replace one document, its chunks and its annotations in the database.

    Existing rows for `doc_id` are deleted first, so the function can be
    re-run for the same document. Nothing is committed here; that is left to
    whoever opened `conn`.

    conn: open SQLAlchemy connection
    doc_id: id of the document to copy from docs_df / ann_df
    chunk_size, overlap: forwarded to chunk_text
    returns: (number of chunks, number of annotation rows) for this document
    raises: IndexError if doc_id is not present in docs_df
    """
    # fetch the document row; fail with a clear message before touching the
    # database when the id is unknown
    doc_matches = docs_df.loc[docs_df["doc_id"] == doc_id]
    if doc_matches.empty:
        raise IndexError(f"doc_id {doc_id!r} not found in docs_df")
    doc_row = doc_matches.iloc[0]
    doc_text = doc_row["text"]

    # chunk it
    chunks_list = chunk_text(doc_text, chunk_size=chunk_size, overlap=overlap)
    for c in chunks_list:
        c["doc_id"] = doc_id
        c["chunk_id"] = make_chunk_id(doc_id, c["chunk_index"], c["start_char"], c["end_char"])

    # annotations for this doc
    doc_annotations = ann_df[ann_df["doc_id"] == doc_id]

    # delete old rows for this doc (safe reruns)
    conn.execute(chunks.delete().where(chunks.c.doc_id == doc_id))
    conn.execute(annotations.delete().where(annotations.c.doc_id == doc_id))
    conn.execute(documents.delete().where(documents.c.doc_id == doc_id))

    # insert document
    conn.execute(documents.insert().values(
        doc_id=doc_id,
        title=str(doc_row["title"]),
        source=str(doc_row["source"]),
        text=str(doc_row["text"]),
    ))

    # insert chunks
    chunk_rows = []
    for c in chunks_list:
        chunk_rows.append({
            "chunk_id": c["chunk_id"],
            "doc_id": c["doc_id"],
            "chunk_index": int(c["chunk_index"]),
            "start_char": int(c["start_char"]),
            "end_char": int(c["end_char"]),
            "text": str(c["text"]),
        })
    # Skip the statement when there is nothing to write: an empty parameter
    # list is not treated as "insert zero rows" by Connection.execute.
    if chunk_rows:
        conn.execute(chunks.insert(), chunk_rows)

    # insert annotations (no evidence ids stored)
    ann_rows = []
    for idx in doc_annotations.index:
        ann_rows.append({
            "annotation_id": ann_df.at[idx, "annotation_id"],
            "doc_id": ann_df.at[idx, "doc_id"],
            "label": ann_df.at[idx, "label"],
            "answer_texts_json": dumps_json(ann_df.at[idx, "answer_texts"]),
            "answer_starts_json": dumps_json(ann_df.at[idx, "answer_starts"]),
            "evidence_chunk_ids_json": dumps_json([]),
        })
    # Same guard as for chunks: a document without annotations inserts none.
    if ann_rows:
        conn.execute(annotations.insert(), ann_rows)

    return len(chunks_list), len(doc_annotations)


In [145]:
# Insert every selected document inside a single engine.begin() block, so the
# whole batch shares one transaction (committed when the block exits cleanly).
inserted = 0

with engine.begin() as conn:
    for doc_id in doc_ids_to_insert:
        num_chunks, num_anns = insert_one_doc(conn, doc_id)
        inserted += 1
        print("inserted doc", inserted, "/", len(doc_ids_to_insert), "| chunks:", num_chunks, "| anns:", num_anns)

print("done")


inserted doc 1 / 5 | chunks: 64 | anns: 68
inserted doc 2 / 5 | chunks: 83 | anns: 56
inserted doc 3 / 5 | chunks: 2 | anns: 42
inserted doc 4 / 5 | chunks: 29 | anns: 51
inserted doc 5 / 5 | chunks: 22 | anns: 51
done


In [146]:
# Sanity check: count the rows now stored in each of the three tables.
with engine.connect() as conn:
    doc_count = conn.execute(select(func.count()).select_from(documents)).scalar_one()
    chunk_count = conn.execute(select(func.count()).select_from(chunks)).scalar_one()
    ann_count = conn.execute(select(func.count()).select_from(annotations)).scalar_one()

print("documents:", doc_count)
print("chunks:", chunk_count)
print("annotations:", ann_count)


documents: 6
chunks: 224
annotations: 312


-----
### We are done with this notebook, but here is TF-IDF as a sanity check that Pinecone does the right thing

In [152]:
from sqlalchemy import select

def load_chunks_for_doc(doc_id: str):
    """
    Load the stored chunks of one document, ordered by chunk_index.

    doc_id: id of the document whose chunks are wanted
    returns: list of dicts with keys "chunk_id", "chunk_index" (int) and
             "text"; empty when the document has no stored chunks
    """
    stmt = (
        select(
            chunks.c.chunk_id,
            chunks.c.chunk_index,
            chunks.c.text
        )
        .where(chunks.c.doc_id == doc_id)
        # Without ORDER BY the database may return rows in any order; callers
        # index into the list, so make the document order explicit.
        .order_by(chunks.c.chunk_index)
    )

    with engine.connect() as conn:
        rows = conn.execute(stmt).all()

    out = []
    for r in rows:
        out.append({
            "chunk_id": r[0],
            "chunk_index": int(r[1]),
            "text": r[2],
        })
    return out

# Load the sample document's chunks and peek at the id and index of the
# first one in the returned list.
doc_id = sample_doc_id
doc_chunks = load_chunks_for_doc(doc_id)
print("chunks loaded:", len(doc_chunks))
print(doc_chunks[0]["chunk_id"], doc_chunks[0]["chunk_index"])


chunks loaded: 24
ff7f3a9d475478f8 0


In [153]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def build_tfidf_index(chunks_list):
    """
    Fit a TF-IDF vectorizer on the chunk texts.

    chunks_list: list of dicts, each with "text" and "chunk_id"
    returns: (fitted vectorizer, TF-IDF matrix, chunk ids in the same order
             as the texts that were vectorized)
    """
    # Single pass over the chunks, then split into the two parallel lists.
    pairs = [(chunk["text"], chunk["chunk_id"]) for chunk in chunks_list]
    corpus = [text for text, _ in pairs]
    chunk_ids = [chunk_id for _, chunk_id in pairs]

    tfidf = TfidfVectorizer(stop_words="english", max_features=50000)
    matrix = tfidf.fit_transform(corpus)
    return tfidf, matrix, chunk_ids


In [154]:
# Build the TF-IDF index over the loaded chunks and inspect the resulting
# sparse matrix (its first dimension equals the number of chunks).
vectorizer, X, chunk_ids = build_tfidf_index(doc_chunks)
print("TF-IDF matrix shape:", X.shape)
print(X)

TF-IDF matrix shape: (24, 533)
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1218 stored elements and shape (24, 533)>
  Coords	Values
  (0, 197)	0.13431902441632182
  (0, 2)	0.1843893898460789
  (0, 12)	0.10417500732623265
  (0, 389)	0.083694527999056
  (0, 268)	0.10417500732623265
  (0, 13)	0.10417500732623265
  (0, 407)	0.083694527999056
  (0, 83)	0.09219469492303944
  (0, 107)	0.2083500146524653
  (0, 361)	0.31252502197869797
  (0, 332)	0.10417500732623265
  (0, 307)	0.10417500732623265
  (0, 301)	0.10417500732623265
  (0, 57)	0.10417500732623265
  (0, 243)	0.10417500732623265
  (0, 303)	0.09219469492303944
  (0, 244)	0.05973390319266958
  (0, 98)	0.10417500732623265
  (0, 232)	0.10417500732623265
  (0, 386)	0.10417500732623265
  (0, 168)	0.10417500732623265
  (0, 300)	0.21514264678758838
  (0, 479)	0.14660612251367178
  (0, 35)	0.15062358012393204
  (0, 139)	0.1843893898460789
  :	:
  (22, 191)	0.2353830349447424
  (22, 87)	0.1176915174723712
  (22, 82)	0.235383034

In [155]:
def retrieve_top_k(vectorizer, X, chunk_ids, chunks_list, query: str, k: int = 5):
    """
    Rank the indexed chunks by cosine similarity to `query`.

    vectorizer, X, chunk_ids: the values returned by build_tfidf_index
    chunks_list: the chunk dicts the index was built from, in the same order
    returns: list of up to k dicts with "chunk_id", "score" (float) and
             "text", highest score first
    """
    query_vector = vectorizer.transform([query])
    similarities = cosine_similarity(query_vector, X).ravel()

    # argsort is ascending, so reverse it and keep the leading k positions
    ranked_positions = similarities.argsort()[::-1][:k]

    return [
        {
            "chunk_id": chunk_ids[pos],
            "score": float(similarities[pos]),
            "text": chunks_list[pos]["text"],
        }
        for pos in ranked_positions
    ]


In [157]:
# Retrieve the 3 chunks most similar to the question and print the score,
# chunk id and text of each.
query = "What is the effective date of this agreement?"
top = retrieve_top_k(vectorizer, X, chunk_ids, doc_chunks, query=query, k=3)

for r in top:
    print("\nscore:", r["score"], "chunk_id:", r["chunk_id"])
    print(r["text"])



score: 0.24419133205233917 chunk_id: acea9d64feb6681a
oducts and their matching placebo for the purpose of research and development of the Licensed Products, including the use in phase III clinical trials to be conducted by NeuroBo for the purpose of obtaining the NDA in the Territory, pursuant to the License Agreement; WHEREAS, NeuroBo wishes that Dong-A supply to NeuroBo the Licensed Products and their matching placebo already manufactured by Dong-A as of the Effective Date in compliance with the Korea Good Manufacturing Practices promulgated by the Governmental Authority in the Republic of Korea (the "KGMP") and the Licensed Products and/or their matching placebo to be manufactured by Dong-A after the Effective Date in compliance with the KGMP; and WHEREAS, Dong-A agrees (i) to supply to NeuroBo the Licensed Products and their matching placebo already manufactured by Dong-A as of the Effective Date in compliance with the KGMP, and (ii) to manufacture and supply to NeuroBo the Licen