In [1]:
import sys
import os

In [7]:
from config import settings

# Echo the storage / index settings currently in effect, one labelled line per value.
for label, attr_name in (
    ("sqlite_path:", "sqlite_path"),
    ("pinecone_index:", "pinecone_index_name"),
    ("pinecone_namespace:", "pinecone_namespace"),
):
    print(label, getattr(settings, attr_name))


sqlite_path: /mnt/c/Ubuntu/my_github_repos/ContractIQ/data/contractrag.db
pinecone_index: contractiq-384
pinecone_namespace: cuad-chunks-v2


In [8]:
from pinecone import Pinecone
from config import settings

# Open the configured Pinecone index and fetch its statistics.
pc = Pinecone(api_key=settings.pinecone_api_key)
idx = pc.Index(settings.pinecone_index_name)
stats = idx.describe_index_stats()

# Print every namespace name in the index, then the stats entry for the
# namespace named in settings (an empty dict if that namespace is absent).
namespaces = stats.get("namespaces", {})
print(namespaces.keys())
print(namespaces.get(settings.pinecone_namespace, {}))


dict_keys(['cuad-chunks-v2', 'cuad-chunks-v1'])
{'vector_count': 11185}


In [11]:
# Make the sibling `src` directory importable from this notebook.
# The path is resolved relative to the current working directory, so the
# result depends on where the kernel was started.
module_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
print(module_path)
# Guard the append so re-running this cell does not stack duplicate entries
# onto sys.path.
if module_path not in sys.path:
    sys.path.append(module_path)

/mnt/c/Ubuntu/my_github_repos/ContractIQ/src


In [12]:
from sqlalchemy import text
from db import get_conn
from config import settings

print("sqlite_path used NOW:", settings.sqlite_path)

# Count the rows of each table over a single connection.
row_counts = {}
with get_conn() as conn:
    for table_name in ("documents", "chunks"):
        count_row = conn.execute(text(f"SELECT COUNT(*) FROM {table_name}")).fetchone()
        row_counts[table_name] = count_row[0]

d, c = row_counts["documents"], row_counts["chunks"]
print("documents:", d, "chunks:", c)


sqlite_path used NOW: /mnt/c/Ubuntu/my_github_repos/ContractIQ/data/contractrag.db
documents: 200 chunks: 11185


In [13]:
from sqlalchemy import text
from db import get_conn

# NOTE(review): `doc_id` is not assigned in this cell; it must already exist in
# the kernel. In the visible notebook it is only assigned by a later cell
# (`doc_id = docs[1]["doc_id"]`), so that cell has to be executed first.
needle = "govern"
like_params = {"doc_id": doc_id, "pat": f"%{needle}%"}
needle_query = text("""
        SELECT chunk_id, chunk_index, text
        FROM chunks
        WHERE doc_id = :doc_id AND lower(text) LIKE :pat
        ORDER BY chunk_index
        LIMIT 10
    """)

with get_conn() as conn:
    rows = conn.execute(needle_query, like_params).fetchall()

print("govern-hits:", len(rows))
# Preview only the first three hits, truncating each chunk text to 400 characters.
for chunk_id, chunk_index, chunk_text in rows[:3]:
    print("chunk_id:", chunk_id, "chunk_index:", chunk_index)
    print(chunk_text[:400])


govern-hits: 10
chunk_id: b68ec11312678911 chunk_index: 6
 agreed. Updated price lists shall not apply to  valid Quotes      2





     issued by Accuray and subject to acceptance by Distributor prior to the effective date of such updated price lists.     2.2.2. Notwithstanding the foregoing or anything to the contrary contained in this Agreement, Distributor may present for approval  to Accuray opportunities for sales of Products and Services at prices
chunk_id: 1738eaa34f123b70 chunk_index: 9
s based on a Quote provided by Accuray, Distributor will issue a purchase order,  which shall include specific references to the quote number of such Quote (the "Purchase Order"). Accuray shall either  accept or reject such Purchase Order within two weeks after receipt thereof, with any failure to approve or disapprove of  such Purchase Order in such period constituting disapproval. Each purchase 
chunk_id: a91fab4f3cb6379b chunk_index: 15
stomer. For the  avoidance of doubt, (i) the obligation

In [14]:
# 1) Collect the chunk_ids that contain 'govern' according to SQLite
# NOTE(review): `rows` is whatever the most recently executed SQL cell left in
# the kernel, and every SQL cell in this notebook applies a LIMIT, so this list
# may be only a subset of the chunks that actually match.
govern_chunk_ids = [r[0] for r in rows]
print("num govern_chunk_ids:", len(govern_chunk_ids))
print("first 5 govern_chunk_ids:", govern_chunk_ids[:5])

# 2) Retrieve from Pinecone for the same doc_id and see overlap
from retrieval import pinecone_query

res = pinecone_query("governing law governed and construed", top_k=60, doc_id=doc_id)
matches = res.get("matches", [])
retrieved_ids = [m["id"] for m in matches]

print("retrieved:", len(retrieved_ids))
overlap = set(retrieved_ids) & set(govern_chunk_ids)
print("overlap_count:", len(overlap))
# A set has no stable iteration order (string hashing varies between kernel
# sessions), so sort before slicing to keep the printed ids reproducible.
print("overlap_ids:", sorted(overlap)[:10])


num govern_chunk_ids: 10
first 5 govern_chunk_ids: ['b68ec11312678911', '1738eaa34f123b70', 'a91fab4f3cb6379b', '4e5663cd0eeb0a15', '286ab767af9374c1']
retrieved: 60
overlap_count: 7
overlap_ids: ['286ab767af9374c1', '1b0b81a0f315bf4e', '870e3cce5ca2b1bf', '74b26aaf9d3a0816', '4e5663cd0eeb0a15', '96c4ea882db9f03b', '09aafca20fb69b43']


In [15]:
from documents import fetch_chunks_by_ids

cid = "286ab767af9374c1"  # pick one from overlap_ids
chunk = fetch_chunks_by_ids([cid])[0]
txt = chunk["text"]

# Locate the first case-insensitive occurrence of "govern" in the chunk text.
pos = txt.lower().find("govern")
print("pos:", pos)
if pos == -1:
    # str.find returns -1 on a miss; slicing with that value would print the
    # first 449 characters of the chunk as if they surrounded a match.
    print("'govern' not found in chunk", cid)
else:
    # Show up to 250 characters before the hit and 450 from the hit onward.
    print(txt[max(0, pos-250):pos+450])


pos: 1180
ation furnished to a party or its Affiliates, employees,  consultants, and advisors in connection with this Agreement will      15





     be subject to and the parties' rights and obligations with respect to such Confidential Information shall be governed by the Conf


In [20]:
from sqlalchemy import text
from db import get_conn

# LIKE patterns to probe: common governing-law phrasings first, then broader
# related terms.
patterns = [
    "%governed by the laws%",
    "%governed and construed%",
    "%governed by and construed%",
    "%laws of the state%",
    "%laws of%",
    "%jurisdiction%",
    "%venue%",
]

# One statement reused for every pattern; only the bound `pat` value changes.
pattern_query = text("""
            SELECT chunk_id, chunk_index, text
            FROM chunks
            WHERE doc_id = :doc_id AND lower(text) LIKE :pat
            ORDER BY chunk_index
            LIMIT 3
        """)

# NOTE(review): `doc_id` must already be defined in the kernel, and `rows` is
# reassigned on every iteration, replacing any `rows` left by an earlier cell.
with get_conn() as conn:
    for pat in patterns:
        rows = conn.execute(pattern_query, {"doc_id": doc_id, "pat": pat}).fetchall()

        print("\nPATTERN:", pat, "| hits:", len(rows))
        for chunk_id, chunk_index, chunk_text in rows:
            # Short single-line snippet, enough to judge whether the hit is the
            # actual governing law clause.
            snippet = chunk_text[:250].replace("\n"," ")
            print("  -", chunk_id, "idx", chunk_index, "|", snippet)



PATTERN: %governed by the laws% | hits: 0

PATTERN: %governed and construed% | hits: 0

PATTERN: %governed by and construed% | hits: 0

PATTERN: %laws of the state% | hits: 0

PATTERN: %laws of% | hits: 3
  - cc569af6f11d56c2 idx 0 | Exhibit 10.31    PURSUANT TO 17 C.F.R. ยง 240.24B-2, CONFIDENTIAL INFORMATION (INDICATED BY {*****}) HAS BEEN OMITTED FROM THIS  DOCUMENT AND HAS BEEN FILED SEPARATELY WITH THE SECURITIES AND EXCHANGE COMMISSION PURSUANT TO A  CONFIDENTIAL TREATMENT A
  - 4e5663cd0eeb0a15 idx 16 |  any extension thereof the ability  to distribute, market and sell the Products and Services in accordance with the terms of this Agreement, in full compliance  with all governmental, regulatory and other requirements under any applicable law. Furthe
  - 96c4ea882db9f03b idx 62 |   Purchase Order or other agreement entered into in connection with this Agreement) (a "Claim") shall be limited to the aggregate  amount of the purchase prices paid by Distributor to Accuray for Produc

In [16]:
from documents import list_documents
from rag import rag_answer

# Load a small batch of document records (limit=5). A later cell picks its
# target `doc_id` from this list and calls the `rag_answer` imported above.
docs = list_documents(limit=5)

In [17]:
# Print each listed document record on its own line.
for doc_record in docs:
    print(doc_record)

{'doc_id': '77ab0faa303507ad', 'title': 'ABILITYINC_06_15_2020-EX-4.25-SERVICES AGREEMENT', 'source': 'cuad-v1', 'raw_path': '/mnt/c/Ubuntu/my_github_repos/ContractIQ/data/raw/theatticusproject__cuad/CUAD_v1/full_contract_txt/Part_I/ABILITYINC_06_15_2020-EX-4.25-SERVICES AGREEMENT.txt'}
{'doc_id': '424083274037f63d', 'title': 'ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTOR AGREEMENT', 'source': 'cuad-v1', 'raw_path': '/mnt/c/Ubuntu/my_github_repos/ContractIQ/data/raw/theatticusproject__cuad/CUAD_v1/full_contract_txt/Part_I/ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTOR AGREEMENT.txt'}
{'doc_id': '2d790a4a4132cc53', 'title': 'ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENT AGREEMENT', 'source': 'cuad-v1', 'raw_path': '/mnt/c/Ubuntu/my_github_repos/ContractIQ/data/raw/theatticusproject__cuad/CUAD_v1/full_contract_txt/Part_I/ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENT AGREEMENT.txt'}
{'doc_id': 'a7559c456c1cac33', 'title': 'AIRSPANNETWORKSINC_04_11_2000-EX-10.5-Distributor Agreement', 'source': 'cu

In [None]:
# Select the second entry of `docs` as the document to query.
doc_id = docs[1]["doc_id"]
print(doc_id)

question = "What is the effective date of this agreement?"
resp = rag_answer(question, doc_id=doc_id, top_k=12, debug=False)

# Cell result: the answer text plus at most the first three retrieved chunk ids
# (an empty list when the response has no "retrieved_chunk_ids" key).
(resp["answer"], resp.get("retrieved_chunk_ids", [])[:3])

424083274037f63d


('I cannot find the effective date of this agreement in the provided text.',
 [])

In [13]:
from retrieval import pinecone_query

# Query with the doc_id argument, then check which doc_ids the returned
# matches actually carry in their metadata.
res = pinecone_query("governing law", top_k=12, doc_id=doc_id)
matches = res.get("matches", [])
print("matches:", len(matches))

# A match with missing or None metadata contributes None instead of a doc_id.
doc_ids = [(m.get("metadata", {}) or {}).get("doc_id") for m in matches]

# Plain sorted() raises TypeError when None is mixed with strings, so sort with
# a key that orders real ids as before and places None last.
distinct_doc_ids = sorted(set(doc_ids), key=lambda v: (v is None, str(v)))
print("distinct metadata doc_ids:", distinct_doc_ids)


matches: 12
distinct metadata doc_ids: ['77ab0faa303507ad']


In [16]:
# Display the raw response object returned by the last executed `pinecone_query` call.
res

QueryResponse(matches=[{'id': 'c7572ef1bf3d63ce',
 'metadata': {'chunk_index': 0,
              'doc_id': '77ab0faa303507ad',
              'end_char': 1200,
              'source': 'cuad-v1',
              'start_char': 0},
 'score': 0.291097164,
 'values': []}, {'id': 'f29321c77e87426b',
 'metadata': {'chunk_index': 6,
              'doc_id': '77ab0faa303507ad',
              'end_char': 7200,
              'source': 'cuad-v1',
              'start_char': 6000},
 'score': 0.268082142,
 'values': []}, {'id': '8999198349fd627d',
 'metadata': {'chunk_index': 2,
              'doc_id': '77ab0faa303507ad',
              'end_char': 3200,
              'source': 'cuad-v1',
              'start_char': 2000},
 'score': 0.264698029,
 'values': []}, {'id': 'ede11ed934901639',
 'metadata': {'chunk_index': 22,
              'doc_id': '77ab0faa303507ad',
              'end_char': 23200,
              'source': 'cuad-v1',
              'start_char': 22000},
 'score': 0.258494377,
 'values': []}, {

In [14]:
from documents import fetch_chunks_by_ids

# Look up the stored chunk rows for the ids of the current `matches`.
retrieved_ids = [match["id"] for match in matches]
chunks = fetch_chunks_by_ids(retrieved_ids)

print("got chunks:", len(chunks))
# Preview the first three returned chunks, 600 characters of text each.
# NOTE(review): the order shown is whatever fetch_chunks_by_ids returns —
# confirm whether it preserves the order of `retrieved_ids`.
preview = chunks[:3]
for position, chunk_row in enumerate(preview):
    print("\n--- chunk", position, chunk_row["chunk_id"], "chunk_index", chunk_row["chunk_index"])
    print(chunk_row["text"][:600])


got chunks: 12

--- chunk 0 2df7e35f0c06a455 chunk_index 25
the Parties shall negotiate in good faith to replace invalid or unenforceable provisions with valid provisions, the economic effect of which comes as close as possible to that of the invalid or unenforceable provisions. 7.7 Construction. The Parties have participated jointly in the negotiation and drafting of this Agreement. In the event an ambiguity or question of intent or interpretation arises, this Agreement shall be construed as if drafted jointly by the Parties and no presumption or burden of proof shall arise favoring or disfavoring any Party by virtue of the authorship of any of the pr

--- chunk 1 333fb0d616d40dc5 chunk_index 16
uch information for the Permitted Purpose and who have been advised of the terms of this Section 4.1 and the Receiving Party shall be liable for any breach of these confidentiality provisions by such Persons; provided, however, that any Receiving Party may disclose such Confidential Informatio

In [15]:
from sqlalchemy import text
from db import get_conn

# NOTE(review): `doc_id` is not assigned in this cell; it must already be
# defined in the kernel by a previously executed cell.
needle = "govern"
like_params = {"doc_id": doc_id, "pat": f"%{needle}%"}
needle_query = text("""
        SELECT chunk_id, chunk_index, text
        FROM chunks
        WHERE doc_id = :doc_id AND lower(text) LIKE :pat
        ORDER BY chunk_index
        LIMIT 5
    """)

with get_conn() as conn:
    rows = conn.execute(needle_query, like_params).fetchall()

print("hits:", len(rows))
# For every hit, print its id and index followed by the first 500 characters of text.
for chunk_id, chunk_index, chunk_text in rows:
    print(chunk_id, "idx", chunk_index)
    print(chunk_text[:500])


hits: 4
5fe3ecf5d14b7050 idx 13
r is prevented from or delayed in complying, either totally or in part, with any of the terms or provisions of this Agreement by reason of fire, flood, storm, strike, lockout or other labor trouble or shortage, delays by unaffiliated suppliers or carriers, shortages of fuel, power, raw materials or components, any law, order, proclamation, regulation, ordinance, demand, seizure or requirement of any governmental authority, riot, civil commotion, war, rebellion, acts of terrorism, nuclear acciden
f0138be3ab6114e0 idx 21
 * * * ] With a copy to: N/A If to Recipient: TELCOSTAR PTE. LTD 6 Eu Tong Sen Street Tel Aviv, Israel, 6770007 #10-15 The Central Singapore 059817 Email: avi@ability.co.il Attention: Avi Levin With a copy to: McDermott Will & Emery LLP 340 Madison Avenue New York, NY 10173-1922 Telephone: (212) 547-5541 Facsimile: (212) 547-5444 EMAIL: GEMMANUEL@MWE.COM Attention: Gary Emmanuel

7





Any Party may change the address to which notices, re