In [1]:
##################################
# Notebook Cell 1: 安装并导入依赖
##################################
# %pip install --quiet langchain langchain-openai langchain-community langgraph langsmith chromadb python-dotenv sqlalchemy pandas

import asyncio
import getpass
import json
import os

import langsmith
import pandas as pd
from dotenv import load_dotenv
from langgraph.prebuilt import create_react_agent

# Enable automatic trace reporting to LangSmith.
os.environ.update(
    {
        "LANGSMITH_TRACING": "true",
        "LANGCHAIN_PROJECT": "Patent_Project_Local_Text2Sql_Test",
    }
)

load_dotenv()

# Prompt for the OpenAI key only when it is missing (or empty) after loading .env.
has_openai_key = bool(os.environ.get("OPENAI_API_KEY"))
if not has_openai_key:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")


In [24]:
# Show every column and never truncate cell text when rendering DataFrames.
for _display_option in ("display.max_columns", "display.max_colwidth"):
    pd.set_option(_display_option, None)

In [3]:
#########################################
# Notebook Cell 2: 连接SQLite数据库
#########################################
from sqlalchemy import create_engine, text
from sqlalchemy.pool import StaticPool

# Location of the SQLite file. Defaults to the original Desktop path; set the
# PATENT_DB_PATH environment variable to point at a different copy.
db_path = os.path.expanduser(
    os.environ.get("PATENT_DB_PATH", "~/Desktop/patents/data/patent.db")
)

# SQLite creates a new, empty database when the target file does not exist,
# which would make the "Successfully connected" message below misleading.
# Fail fast with a clear error instead.
if not os.path.isfile(db_path):
    raise FileNotFoundError(f"SQLite database not found at {db_path}")

engine = create_engine(
    f"sqlite:///{db_path}",
    connect_args={"check_same_thread": False},
    poolclass=StaticPool,
)

# Read-only probe listing the tables; no write transaction is required.
with engine.connect() as conn:
    table_rows = conn.execute(
        text("SELECT name FROM sqlite_master WHERE type='table';")
    ).fetchall()
print(f"Successfully connected to database at {db_path}")
print("Detected tables:", table_rows)

Successfully connected to database at /Users/yuxiangwang/Desktop/patents/data/patent.db
Detected tables: [('patents',), ('inventors',), ('sqlite_sequence',), ('assignees',), ('prior_art_keywords',), ('events',), ('external_links',), ('images',), ('classifications',), ('claims',), ('applications_claiming_priority',), ('worldwide_applications',), ('patent_citations',), ('cited_by',), ('legal_events',), ('concepts',), ('child_applications',), ('parent_applications',), ('priority_applications',), ('non_patent_citations',), ('similar_documents',), ('error_logs',)]


In [5]:
#########################################
# Notebook Cell 3: 构建 SQLDatabase/Toolkit
#########################################
from langchain_community.agent_toolkits.sql.toolkit import SQLDatabaseToolkit
from langchain_community.utilities.sql_database import SQLDatabase
from langchain_openai import ChatOpenAI

# Wrap the SQLAlchemy engine so LangChain tools can query it.
db = SQLDatabase(engine)
print("SQLDatabase object:", type(db))

# Deterministic chat model shared by the toolkit and the agents
# (alternative model: "gpt-3.5-turbo").
llm = ChatOpenAI(model_name="gpt-4o", temperature=0.0)
print("LLM init done.")

# Default SQL tools shipped with the toolkit.
toolkit = SQLDatabaseToolkit(db=db, llm=llm)
tools = toolkit.get_tools()
print("\nDefault SQL Tools from SQLDatabaseToolkit:")
for sql_tool in tools:
    print(" -", sql_tool.name, ":", sql_tool.description)


SQLDatabase object: <class 'langchain_community.utilities.sql_database.SQLDatabase'>
LLM init done.

Default SQL Tools from SQLDatabaseToolkit:
 - sql_db_query : Input to this tool is a detailed and correct SQL query, output is a result from the database. If the query is not correct, an error message will be returned. If an error is returned, rewrite the query, check the query, and try again. If you encounter an issue with Unknown column 'xxxx' in 'field list', use sql_db_schema to query the correct table fields.
 - sql_db_schema : Input to this tool is a comma-separated list of tables, output is the schema and sample rows for those tables. Be sure that the tables actually exist by calling sql_db_list_tables first! Example Input: table1, table2, table3
 - sql_db_list_tables : Input is an empty string, output is a comma-separated list of tables in the database.
 - sql_db_query_checker : Use this tool to double check if your query is correct before executing it. Always use this tool befo

In [145]:
# Save the schema documentation dictionary as a JSON file.
# NOTE: SCHEMA_DOCS is assigned in "Notebook Cell 4" further down; run that cell first.
with open("schema_docs.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    schema_json = json.dumps(SCHEMA_DOCS, indent=4, ensure_ascii=False)
    f.write(schema_json)

In [141]:
#########################################
# Notebook Cell 4: 构建 RAG 向量索引
#########################################
# Put your existing SCHEMA_DOCS here.
#
# Hard-coded documentation for the patent database, keyed by table name.
# Each entry has the shape:
#     {"table_comment": <description of the table>,
#      "columns": {<column name>: <description of the column>, ...}}
# create_schema_texts() below flattens every entry into one text chunk.

SCHEMA_DOCS = {
    "patents": {
        "table_comment": "Stores the core patent record using a text-based primary key (e.g., US20180044418A1). "
                         "Contains fundamental information like title, type, and key dates.",
        "columns": {
            "patent_id": "Text-based unique primary key for the patent (e.g., US20180044418A1).",
            "title": "Full title of the patent, used for quick references.",
            "type": "Type of patent (e.g., 'patent', 'utility', etc.).",
            "pdf_link": "URL to the patent PDF. Useful for immediate access to the full document.",
            "publication_number": "Official publication identifier (often same as patent_id).",
            "country": "Country/region of origin (e.g., 'United States').",
            "application_number": "Number assigned when the application was filed.",
            "priority_date": "Earliest filing/priority date in YYYY-MM-DD format.",
            "filing_date": "The date the application was filed (YYYY-MM-DD).",
            "publication_date": "Official publication date in YYYY-MM-DD format.",
            "prior_art_date": "Date used for prior art cutoff, if applicable (YYYY-MM-DD).",
            "family_id": "Identifier for the patent family, if provided.",
            "abstract": "Summary of the patent's main technical contribution.",
            "description_link": "Link to a more detailed description or text of the patent."
        }
    },

    "inventors": {
        "table_comment": "Lists inventors associated with a patent. One patent can have multiple inventors.",
        "columns": {
            "id": "Auto-increment primary key for internal tracking.",
            "patent_id": "References patents(patent_id) to show which patent this inventor relates to.",
            "inventor_name": "Full name of the inventor (e.g., 'John Doe').",
            "link": "Optional URL with additional details about the inventor.",
            "serpapi_link": "Optional SerpApi link specific to this inventor."
        }
    },

    "assignees": {
        "table_comment": "Captures the entities or organizations that hold the rights to the patent.",
        "columns": {
            "id": "Auto-increment primary key for reference.",
            "patent_id": "Links to the primary patent record via patents(patent_id).",
            "name": "Name of the assignee (e.g., 'Merck Sharp & Dohme LLC')."
        }
    },

    "prior_art_keywords": {
        "table_comment": "Contains keywords relevant to prior art, often used for classification or quick searching.",
        "columns": {
            "id": "Auto-increment primary key.",
            "patent_id": "References patents(patent_id).",
            "keyword": "A single keyword string (e.g., 'cancer')."
        }
    },

    "events": {
        "table_comment": "Tracks major patent-related events (e.g., filings, publications, assignments).",
        "columns": {
            "id": "Primary key (auto-increment).",
            "patent_id": "Reference to patents(patent_id).",
            "event_date": "Date the event occurred (YYYY-MM-DD).",
            "title": "Short description of the event (e.g., 'Application filed').",
            "type": "Event category (e.g., 'filed', 'publication', 'legal-status').",
            "critical": "Flag indicating high significance (0 or 1).",
            "assignee_search": "Info about the assignee if relevant to the event.",
            "description": "Extended text or notes about the event (could be combined from a list)."
        }
    },

    "external_links": {
        "table_comment": "External resources or references (e.g., official patent office links).",
        "columns": {
            "id": "Auto-increment primary key.",
            "patent_id": "References patents(patent_id).",
            "text": "Display label for the link (e.g., 'USPTO').",
            "link": "Actual URL to the external resource."
        }
    },

    "images": {
        "table_comment": "Image links for figures or diagrams associated with the patent.",
        "columns": {
            "id": "Auto-increment primary key.",
            "patent_id": "References patents(patent_id).",
            "image_url": "Direct URL to the patent image."
        }
    },

    "classifications": {
        "table_comment": "CPC, IPC, or other classification codes describing the technology domain.",
        "columns": {
            "id": "Auto-increment key for each classification entry.",
            "patent_id": "References patents(patent_id).",
            "code": "Classification code (e.g., 'C07K16/2818').",
            "description": "Textual explanation of what the code signifies.",
            "leaf": "Indicates whether this code is at the lowest (leaf) level (0/1).",
            "first_code": "Flag if it's the first listed classification (0/1).",
            "is_cpc": "Flag indicating if the code is a CPC code (0/1).",
            "additional": "Marks if it's an additional classification (0/1)."
        }
    },

    "claims": {
        "table_comment": "Contains the individual claims text for each patent.",
        "columns": {
            "id": "Auto-increment primary key.",
            "patent_id": "References patents(patent_id).",
            "claim_no": "The claim's index or number (integer).",
            "claim_txt": "Full text of this particular claim."
        }
    },

    "applications_claiming_priority": {
        "table_comment": "Tracks applications that claim priority from the current patent.",
        "columns": {
            "id": "Auto-increment primary key.",
            "patent_id": "References patents(patent_id).",
            "application_number": "Application identifier for the priority claimant.",
            "priority_date": "Priority date in YYYY-MM-DD.",
            "filing_date": "Filing date in YYYY-MM-DD.",
            "representative_publication": "Representative publication number, if any.",
            "primary_language": "Language code (e.g., 'en').",
            "title": "Title of the priority-claiming application."
        }
    },

    "worldwide_applications": {
        "table_comment": "Shows international (family) applications corresponding to the patent.",
        "columns": {
            "id": "Auto-increment primary key.",
            "patent_id": "References patents(patent_id).",
            "year": "Year of the application (integer).",
            "application_number": "Unique app number for this worldwide filing.",
            "country_code": "Two-letter country/region code (e.g., 'US').",
            "document_id": "Document identifier if different from application_number.",
            "filing_date": "Date the worldwide application was filed (YYYY-MM-DD).",
            "legal_status": "Short descriptor of the legal status (e.g., 'Active').",
            "legal_status_cat": "Categorized status (e.g., 'active', 'not_active').",
            "this_app": "Boolean-like flag (0/1) indicating if this corresponds to the same patent application."
        }
    },

    "patent_citations": {
        "table_comment": "Lists other patents that the current patent references (cites).",
        "columns": {
            "id": "Auto-increment primary key.",
            "patent_id": "References patents(patent_id).",
            "is_family_to_family": "Marks if the citation is family-to-family (0/1).",
            "publication_number": "Publication number of the cited patent.",
            "primary_language": "Language of the cited document.",
            "examiner_cited": "Marks if an examiner cited it (0/1).",
            "priority_date": "Priority date of the cited patent (YYYY-MM-DD).",
            "publication_date": "Publication date of the cited patent (YYYY-MM-DD).",
            "assignee_original": "Original assignee name, if available.",
            "title": "Title of the cited patent.",
            "serpapi_link": "URL from SerpApi for further info.",
            "patent_id_ref": "Raw reference to the cited patent ID (e.g., 'patent/WO2015035112A1/en')."
        }
    },

    "cited_by": {
        "table_comment": "Reverse citation data: shows which patents cite this one.",
        "columns": {
            "id": "Auto-increment primary key.",
            "patent_id": "References patents(patent_id).",
            "is_family_to_family": "Flag for family-to-family citation (0/1).",
            "publication_number": "Publication number of the citing patent.",
            "primary_language": "Main language of the citing document.",
            "examiner_cited": "Indicator if it was examiner-cited (0/1).",
            "priority_date": "Priority date of the citing patent (YYYY-MM-DD).",
            "publication_date": "Publication date of the citing patent (YYYY-MM-DD).",
            "assignee_original": "Original assignee for the citing patent.",
            "title": "Title of the citing patent.",
            "serpapi_link": "Link to SerpApi record for further details.",
            "patent_id_ref": "ID reference for the citing patent."
        }
    },

    "legal_events": {
        "table_comment": "Tracks legal changes or official updates (assignments, status changes, etc.).",
        "columns": {
            "id": "Auto-increment primary key.",
            "patent_id": "References patents(patent_id).",
            "date": "When the legal event was recorded (YYYY-MM-DD).",
            "code": "Abbreviated code (e.g., 'AS', 'STPP').",
            "title": "Short descriptive label (e.g., 'Assignment').",
            "attributes_json": "JSON with additional attributes for the event."
        }
    },

    "concepts": {
        "table_comment": "Stores recognized concepts or compounds extracted from the patent data (title, claims, etc.).",
        "columns": {
            "id": "Auto-increment primary key.",
            "patent_id": "References patents(patent_id).",
            "concept_id": "Unique identifier from the data (e.g., 'CUI' or similar).",
            "domain": "Category or domain (e.g., 'Diseases').",
            "name": "Concept/compound name (e.g., 'Neoplasm').",
            "similarity": "Numerical score indicating relevance or match strength.",
            "sections": "Combined list of sections where the concept was found (e.g., 'title;claims').",
            "count": "Approximate count of how many times the concept appears.",
            "inchi_key": "InChI key for chemical compounds, if applicable.",
            "smiles": "SMILES string for the compound, if applicable."
        }
    },

    "child_applications": {
        "table_comment": "Any child or continuation applications derived from this patent.",
        "columns": {
            "id": "Auto-increment primary key.",
            "patent_id": "References patents(patent_id).",
            "application_number": "Number assigned to the child application.",
            "relation_type": "Relationship type (e.g., 'Continuation').",
            "representative_publication": "Representative publication number for this child.",
            "primary_language": "Language of the child application (e.g., 'en').",
            "priority_date": "Earliest priority date (YYYY-MM-DD).",
            "filing_date": "Filing date (YYYY-MM-DD).",
            "title": "Title of the child application."
        }
    },

    "parent_applications": {
        "table_comment": "Any parent or previous continuation applications related to this patent.",
        "columns": {
            "id": "Auto-increment primary key.",
            "patent_id": "References patents(patent_id).",
            "application_number": "Parent application number.",
            "relation_type": "Type of parent relationship (e.g., 'Continuation').",
            "representative_publication": "Representative publication for the parent app.",
            "primary_language": "Parent application's language code.",
            "priority_date": "Parent's priority date (YYYY-MM-DD).",
            "filing_date": "Parent's filing date (YYYY-MM-DD).",
            "title": "Title of the parent application."
        }
    },

    "priority_applications": {
        "table_comment": "List of priority applications that the patent claims (besides the earliest).",
        "columns": {
            "id": "Auto-increment primary key.",
            "patent_id": "References patents(patent_id).",
            "application_number": "Application number for the claimed priority.",
            "representative_publication": "Representative publication number.",
            "primary_language": "Language of the priority application.",
            "priority_date": "Priority date claimed (YYYY-MM-DD).",
            "filing_date": "Filing date (YYYY-MM-DD).",
            "title": "Title of the priority application."
        }
    },

    "non_patent_citations": {
        "table_comment": "Citations to non-patent literature (journals, articles, etc.).",
        "columns": {
            "id": "Auto-increment primary key.",
            "patent_id": "References patents(patent_id).",
            "citation_title": "Title or short reference of the cited non-patent work.",
            "examiner_cited": "Indicates if the citation came from the examiner (0/1)."
        }
    },

    "similar_documents": {
        "table_comment": "Documents (patent or otherwise) deemed similar to this patent.",
        "columns": {
            "id": "Auto-increment primary key.",
            "patent_id": "References patents(patent_id).",
            "is_patent": "Flag indicating if it's a patent document (0/1).",
            "doc_patent_id": "If patent, internal ID (e.g., 'patent/US11734097B1/en').",
            "serpapi_link": "SerpApi link for additional data.",
            "publication_number": "Official publication number of the similar document.",
            "primary_language": "Language code of the document.",
            "publication_date": "Publication date (YYYY-MM-DD).",
            "title": "Title of the similar document."
        }
    },

    "error_logs": {
        "table_comment": "Collects error messages for debugging or troubleshooting insertion/processing issues.",
        "columns": {
            "id": "Auto-increment primary key.",
            "error_message": "Short description of the error encountered.",
            "stack_trace": "Detailed stack trace or diagnostic info, if available.",
            "created_at": "Timestamp in YYYY-MM-DD HH:MM:SS marking when the error was logged."
        }
    }
}

def create_schema_texts(schema_docs: dict) -> list:
    """Flatten per-table schema documentation into text chunks for embedding.

    Args:
        schema_docs: Mapping of table name to a dict with the optional keys
            ``"table_comment"`` (str) and ``"columns"`` (mapping of column
            name to description).

    Returns:
        One ``{"text": ..., "metadata": {"table_name": ...}}`` dict per table,
        in the iteration order of ``schema_docs``. A table without a comment or
        without column docs gets an empty comment / column section.
    """
    docs = []
    for table_name, info in schema_docs.items():
        info = info or {}
        table_comment = info.get("table_comment", "")
        # "columns" is treated as optional, exactly like "table_comment", so a
        # sparsely documented table no longer aborts the run with a KeyError.
        columns = info.get("columns") or {}
        col_section = "\n".join(f"{col}: {desc}" for col, desc in columns.items())

        docs.append(
            {
                "text": f"Table: {table_name}\nComment: {table_comment}\nColumns:\n{col_section}",
                "metadata": {"table_name": table_name},
            }
        )
    return docs

# Build the per-table text chunks and echo each one with its metadata.
schema_list = create_schema_texts(SCHEMA_DOCS)
for schema_entry in schema_list:
    chunk_text, chunk_meta = schema_entry["text"], schema_entry["metadata"]
    print("\n--- SCHEMA TEXT ---\n", chunk_text, chunk_meta)




--- SCHEMA TEXT ---
 Table: patents
Comment: Stores the core patent record using a text-based primary key (e.g., US20180044418A1). Contains fundamental information like title, type, and key dates.
Columns:
patent_id: Text-based unique primary key for the patent (e.g., US20180044418A1).
title: Full title of the patent, used for quick references.
type: Type of patent (e.g., 'patent', 'utility', etc.).
pdf_link: URL to the patent PDF. Useful for immediate access to the full document.
publication_number: Official publication identifier (often same as patent_id).
country: Country/region of origin (e.g., 'United States').
application_number: Number assigned when the application was filed.
priority_date: Earliest filing/priority date in YYYY-MM-DD format.
filing_date: The date the application was filed (YYYY-MM-DD).
publication_date: Official publication date in YYYY-MM-DD format.
prior_art_date: Date used for prior art cutoff, if applicable (YYYY-MM-DD).
family_id: Identifier for the patent

In [9]:
#########################################
# Notebook Cell 5: 向量化 & 存储 (Chroma)
#########################################
# Import from langchain_community (already used elsewhere in this notebook)
# rather than the deprecated `langchain.vectorstores` / `langchain.docstore` paths.
from langchain_community.docstore.document import Document
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings()  # requires OPENAI_API_KEY

documents = [
    Document(page_content=x["text"], metadata=x["metadata"])
    for x in schema_list
]
# One stable id per table. Without explicit ids every run gets fresh random
# ids, so re-running this cell in the same kernel would add the same chunks
# to "schema_docs_collection" again and produce duplicate search hits.
document_ids = [x["metadata"]["table_name"] for x in schema_list]

vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedding,
    ids=document_ids,
    collection_name="schema_docs_collection",
)
print("Built Chroma vectorstore for schema docs.")

Built Chroma vectorstore for schema docs.


In [61]:
# Fetch the stored chunks, embeddings and metadata through the public
# Chroma.get API instead of the private `_collection` attribute.
data = vectorstore.get(
    include=["documents", "embeddings", "metadatas"]
)

In [67]:
# Tabulate the stored chunks; only the first 3 embedding dimensions are shown.
embedding_heads = [vector[:3] for vector in data["embeddings"]]
df = pd.DataFrame(
    {
        "documents": data["documents"],
        "metadatas": data["metadatas"],
        "embedding(first 3)": embedding_heads,
    }
)
df

Unnamed: 0,documents,metadatas,embedding(first 3)
0,"Table: patents\nComment: Stores the core patent record using a text-based primary key (e.g., US20180044418A1). Contains fundamental information like title, type, and key dates.\nColumns:\npatent_id: Text-based unique primary key for the patent (e.g., US20180044418A1).\ntitle: Full title of the patent, used for quick references.\ntype: Type of patent (e.g., 'patent', 'utility', etc.).\npdf_link: URL to the patent PDF. Useful for immediate access to the full document.\npublication_number: Official publication identifier (often same as patent_id).\ncountry: Country/region of origin (e.g., 'United States').\napplication_number: Number assigned when the application was filed.\npriority_date: Earliest filing/priority date in YYYY-MM-DD format.\nfiling_date: The date the application was filed (YYYY-MM-DD).\npublication_date: Official publication date in YYYY-MM-DD format.\nprior_art_date: Date used for prior art cutoff, if applicable (YYYY-MM-DD).\nfamily_id: Identifier for the patent family, if provided.\nabstract: Summary of the patent's main technical contribution.\ndescription_link: Link to a more detailed description or text of the patent.",{'table_name': 'patents'},"[-0.007805537432432175, 0.036302920430898666, 0.013617009855806828]"
1,"Table: inventors\nComment: Lists inventors associated with a patent. One patent can have multiple inventors.\nColumns:\nid: Auto-increment primary key for internal tracking.\npatent_id: References patents(patent_id) to show which patent this inventor relates to.\ninventor_name: Full name of the inventor (e.g., 'John Doe').\nlink: Optional URL with additional details about the inventor.\nserpapi_link: Optional SerpApi link specific to this inventor.",{'table_name': 'inventors'},"[0.0018644628580659628, 0.01769978739321232, 0.004226563964039087]"
2,"Table: assignees\nComment: Captures the entities or organizations that hold the rights to the patent.\nColumns:\nid: Auto-increment primary key for reference.\npatent_id: Links to the primary patent record via patents(patent_id).\nname: Name of the assignee (e.g., 'Merck Sharp & Dohme LLC').",{'table_name': 'assignees'},"[-0.008756179362535477, -0.008181208744645119, 0.01623772457242012]"
3,"Table: prior_art_keywords\nComment: Contains keywords relevant to prior art, often used for classification or quick searching.\nColumns:\nid: Auto-increment primary key.\npatent_id: References patents(patent_id).\nkeyword: A single keyword string (e.g., 'cancer').",{'table_name': 'prior_art_keywords'},"[0.00570357171818614, 0.016940034925937653, -0.008598027750849724]"
4,"Table: events\nComment: Tracks major patent-related events (e.g., filings, publications, assignments).\nColumns:\nid: Primary key (auto-increment).\npatent_id: Reference to patents(patent_id).\nevent_date: Date the event occurred (YYYY-MM-DD).\ntitle: Short description of the event (e.g., 'Application filed').\ntype: Event category (e.g., 'filed', 'publication', 'legal-status').\ncritical: Flag indicating high significance (0 or 1).\nassignee_search: Info about the assignee if relevant to the event.\ndescription: Extended text or notes about the event (could be combined from a list).",{'table_name': 'events'},"[-0.012615074403584003, 0.0009164944058284163, 0.006033445242792368]"
5,"Table: external_links\nComment: External resources or references (e.g., official patent office links).\nColumns:\nid: Auto-increment primary key.\npatent_id: References patents(patent_id).\ntext: Display label for the link (e.g., 'USPTO').\nlink: Actual URL to the external resource.",{'table_name': 'external_links'},"[0.0024545672349631786, 0.025752386078238487, -0.0025916937738656998]"
6,Table: images\nComment: Image links for figures or diagrams associated with the patent.\nColumns:\nid: Auto-increment primary key.\npatent_id: References patents(patent_id).\nimage_url: Direct URL to the patent image.,{'table_name': 'images'},"[-0.010977236554026604, 0.021750815212726593, 0.016944454982876778]"
7,"Table: classifications\nComment: CPC, IPC, or other classification codes describing the technology domain.\nColumns:\nid: Auto-increment key for each classification entry.\npatent_id: References patents(patent_id).\ncode: Classification code (e.g., 'C07K16/2818').\ndescription: Textual explanation of what the code signifies.\nleaf: Indicates whether this code is at the lowest (leaf) level (0/1).\nfirst_code: Flag if it's the first listed classification (0/1).\nis_cpc: Flag indicating if the code is a CPC code (0/1).\nadditional: Marks if it's an additional classification (0/1).",{'table_name': 'classifications'},"[0.012523122131824493, 0.03499426692724228, 0.02229495346546173]"
8,Table: claims\nComment: Contains the individual claims text for each patent.\nColumns:\nid: Auto-increment primary key.\npatent_id: References patents(patent_id).\nclaim_no: The claim's index or number (integer).\nclaim_txt: Full text of this particular claim.,{'table_name': 'claims'},"[-0.01166535448282957, 0.009311757981777191, 0.01598256267607212]"
9,"Table: applications_claiming_priority\nComment: Tracks applications that claim priority from the current patent.\nColumns:\nid: Auto-increment primary key.\npatent_id: References patents(patent_id).\napplication_number: Application identifier for the priority claimant.\npriority_date: Priority date in YYYY-MM-DD.\nfiling_date: Filing date in YYYY-MM-DD.\nrepresentative_publication: Representative publication number, if any.\nprimary_language: Language code (e.g., 'en').\ntitle: Title of the priority-claiming application.",{'table_name': 'applications_claiming_priority'},"[-0.0044761961326003075, 0.0025820299051702023, 0.021835433319211006]"


In [11]:
#########################################
# Notebook Cell 6: 定义 RAG Tool
#########################################
from langchain.tools import BaseTool

class SchemaDocRAGTool(BaseTool):
    """Semantic-search tool over the embedded schema documentation.

    Every hit is rendered with its raw score, its table name and the full
    chunk text so the agent can quote table/column descriptions.
    """

    name: str = "schema_doc_rag_tool"
    description: str = (
        "Do a semantic search over the large schema doc. "
        "Input: any question about tables or columns. Output: relevant snippet(s)."
    )

    # Vector store that holds the schema chunks.
    vectorstore: Chroma
    # Number of snippets returned per query (previously hard-coded to 3).
    k: int = 3

    def _run(self, query: str) -> str:
        """Return the top ``k`` schema snippets for ``query`` as one string.

        Args:
            query: Free-text question about tables or columns.

        Returns:
            Snippets separated by blank lines, or a fixed "not found" message
            when the store returns no hits.
        """
        docs_and_scores = self.vectorstore.similarity_search_with_score(query, k=self.k)
        if not docs_and_scores:
            return "No relevant schema snippet found."

        # NOTE(review): the score is printed exactly as the store returns it;
        # with Chroma this is presumably a distance (lower = closer) - confirm.
        return "\n\n".join(
            f"score={score:.2f}, table_name={doc.metadata.get('table_name', '')}\nContent:\n{doc.page_content}"
            for doc, score in docs_and_scores
        )

    async def _arun(self, query: str) -> str:
        """Async variant of ``_run``.

        The search (embedding request + vector lookup) is blocking, so it is
        pushed to the default executor instead of stalling the event loop.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self._run, query)

# Register the RAG tool next to the default SQL toolkit tools.
rag_tool = SchemaDocRAGTool(vectorstore=vectorstore)
all_tools = [*tools, rag_tool]
print("\nTools after adding RAG tool:")
for registered_tool in all_tools:
    print(" -", registered_tool.name)


Tools after adding RAG tool:
 - sql_db_query
 - sql_db_schema
 - sql_db_list_tables
 - sql_db_query_checker
 - schema_doc_rag_tool


In [96]:
from langchain.tools import BaseTool

class ExtendedSQLSchemaTool(BaseTool):
    """
    Combine real DB schema from `sql_db_schema` with additional doc from SCHEMA_DOCS or RAG.

    The output is the wrapped schema tool's text followed by an
    "Additional Business Doc" section built from ``doc_source``.
    """
    name: str = "extended_sql_db_schema"
    description: str = (
        "Call this tool to get the real DB schema + extra doc for a table. "
        "Input is a single table name, output is both CREATE TABLE info plus business usage info."
    )

    # Tool that returns the physical schema for a table name.
    sql_schema_tool: BaseTool
    # Mapping of table name -> {"table_comment": ..., "columns": {...}}.
    doc_source: dict  # or your RAG vectorstore

    def _format_business_doc(self, table_name: str) -> str:
        """Render the hard-coded documentation for ``table_name``."""
        if table_name not in self.doc_source:
            return f"(No extended doc found for table '{table_name}')."

        info = self.doc_source[table_name] or {}
        # Fixed heading: the original f-string was misspelled and glued the
        # table name straight onto the word ("Table Explainationinventors").
        lines = [f"Table Explanation {table_name}: {info.get('table_comment', '')}"]
        lines.append("列含义：")
        # "columns" is optional, like "table_comment".
        for col, desc in (info.get("columns") or {}).items():
            lines.append(f" - {col}: {desc}")
        return "\n".join(lines)

    def _run(self, table_name: str) -> str:
        """Return the physical schema plus business documentation for one table.

        Args:
            table_name: Name of a single table; surrounding whitespace is ignored.

        Returns:
            Schema text from ``sql_schema_tool`` followed by the business doc section.

        Raises:
            RuntimeError: If the wrapped schema tool fails; the original
                exception is chained as ``__cause__``.
        """
        table_name = table_name.strip()
        # 1) Ask the wrapped tool for the actual database schema.
        try:
            db_schema_str = self.sql_schema_tool.run(table_name)
        except Exception as e:
            raise RuntimeError(f"Error calling sql_db_schema: {e}") from e

        # 2) Look up the hard-coded docs (or RAG).
        doc_part = self._format_business_doc(table_name)

        # 3) Merge both parts.
        return f"{db_schema_str}\n\n=== Additional Business Doc ===\n{doc_part}"

    async def _arun(self, table_name: str) -> str:
        """Async variant of ``_run``; runs the blocking lookup in the default executor."""
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self._run, table_name)

In [98]:
# Locate the toolkit's `sql_db_schema` tool.
# Do not rebind `all_tools` here: it was built as the toolkit tools plus the
# RAG tool, and overwriting it with the bare toolkit list would drop the RAG tool.
schema_tool = next(
    (candidate for candidate in toolkit.get_tools() if candidate.name == "sql_db_schema"),
    None,
)
if schema_tool is None:
    # Fail here with a clear message rather than passing None on to later cells.
    raise RuntimeError("SQLDatabaseToolkit did not provide a 'sql_db_schema' tool")

In [100]:
from langchain.tools import BaseTool

# Combine the toolkit's schema tool with the hard-coded table docs.
ext_schema_tool = ExtendedSQLSchemaTool(doc_source=SCHEMA_DOCS, sql_schema_tool=schema_tool)

In [110]:
system_message_with_tool = """
You are an advanced SQL agent. 
You have a custom tool named 'extended_sql_db_schema' which merges DB's real schema 
and hard-coded or RAG doc about each table. 
If the user specifically wants the usage/meaning of columns in a table, 
call 'extended_sql_db_schema' to get both physical structure + business doc.

If the user wants raw database create statements, you can also use 'extended_sql_db_schema' 
or any other relevant tool.
"""

# NOTE(review): `new_tools` was not defined in any visible cell, so this cell
# raised NameError on a fresh kernel. It is reconstructed here as the default
# SQL toolkit tools plus the extended schema tool the prompt describes - confirm
# this matches the intended tool set.
new_tools = [*toolkit.get_tools(), ext_schema_tool]

agent_executor = create_react_agent(
    model=llm,
    tools=new_tools,
    prompt=system_message_with_tool,
)

In [114]:
#########################################
# Notebook Cell 7: 创建Agent + 测试 + 记录token
#########################################
from langgraph.prebuilt import create_react_agent

# Token/cost tracking callback, imported from langchain_community instead of
# the deprecated `langchain.callbacks` path.
from langchain_community.callbacks import get_openai_callback

system_message_with_tool = """
You are an SQL agent with knowledge of a large schema doc. 
When you have questions about table or column details, call the 'schema_doc_rag_tool'.
If the user asks "what is the 'country' column of the 'patents' table", you can retrieve the snippet from that tool.
"""

# Build the tool list explicitly so the agent always has the RAG tool named in
# the prompt, regardless of what `all_tools` currently holds in the kernel.
rag_agent_tools = [*tools, rag_tool]

agent_executor = create_react_agent(
    model=llm,
    tools=rag_agent_tools,
    prompt=system_message_with_tool,
)

test_query = "在 'inventors' 表里, inventor_name 是什么含义？"
print("\n===== Now asking the Agent =====\n")

# Record token usage for the whole agent run with get_openai_callback().
with get_openai_callback() as cb:
    events = agent_executor.stream(
        {"messages": [("user", test_query)]},
        stream_mode="values",
    )
    # stream_mode="values" yields the state after each step; print its newest message.
    for event in events:
        msg = event["messages"][-1]
        msg.pretty_print()

# Print the token usage.
print("\n===== Token Usage =====")
print(f"Prompt tokens:      {cb.prompt_tokens}")
print(f"Completion tokens:  {cb.completion_tokens}")
print(f"Total tokens:       {cb.total_tokens}")
print(f"Estimated cost (USD): {cb.total_cost}")


===== Now asking the Agent =====


在 'inventors' 表里, inventor_name 是什么含义？
Tool Calls:
  sql_db_list_tables (call_qDGDOrD1LsvifS7RuCvrmN6t)
 Call ID: call_qDGDOrD1LsvifS7RuCvrmN6t
  Args:
Name: sql_db_list_tables

applications_claiming_priority, assignees, child_applications, cited_by, claims, classifications, concepts, error_logs, events, external_links, images, inventors, legal_events, non_patent_citations, parent_applications, patent_citations, patents, prior_art_keywords, priority_applications, similar_documents, sqlite_sequence, worldwide_applications
Tool Calls:
  sql_db_schema (call_3K89CNZUejvd52ZxsGXi4JU4)
 Call ID: call_3K89CNZUejvd52ZxsGXi4JU4
  Args:
    table_names: inventors
Name: sql_db_schema


CREATE TABLE inventors (
	id INTEGER, 
	patent_id TEXT NOT NULL, 
	inventor_name TEXT, 
	link TEXT, 
	serpapi_link TEXT, 
	PRIMARY KEY (id), 
	FOREIGN KEY(patent_id) REFERENCES patents (patent_id) ON DELETE CASCADE
)

/*
3 rows from inventors table:
id	patent_id	inventor_name	lin

In [116]:
import pandas as pd
import json

# JSON array of evaluation questions for the agent. Each entry carries a short
# "purpose" label and the natural-language "question" text. It is kept as a raw
# string here and decoded with json.loads in a following cell.
json_str = """
[
  {
    "purpose": "Simple Table Purpose",
    "question": "What does the `patents` table store?"
  },
  {
    "purpose": "Column Meaning",
    "question": "Which column in the `inventors` table links each inventor to a specific patent?"
  },
  {
    "purpose": "Schema Relationship",
    "question": "Which table lists the companies or organizations that own the rights to a patent, and how does it link to the `patents` table?"
  },
  {
    "purpose": "Listing Columns",
    "question": "Show me all column names and their meanings from the `prior_art_keywords` table."
  },
  {
    "purpose": "Single-Table Query",
    "question": "Which `title` in the `patents` table has the earliest `publication_date`?"
  },
  {
    "purpose": "Text Condition",
    "question": "Find all patents in the `patents` table whose `abstract` contains the word ‘cancer’."
  },
  {
    "purpose": "Counting Records",
    "question": "How many distinct inventors are stored in the `inventors` table?"
  },
  {
    "purpose": "Joining for Detailed Information",
    "question": "Retrieve each `patent_id` along with all associated `inventor_name` values by joining the `patents` and `inventors` tables."
  },
  {
    "purpose": "Filtering by Flag",
    "question": "List all `events` in the `events` table that are marked as `critical = 1` for a specific `patent_id`."
  },
  {
    "purpose": "Aggregate Query",
    "question": "How many total claims does each patent have? (Join `patents` with `claims` and group by `patent_id`.)"
  },
  {
    "purpose": "Multiple Joins",
    "question": "Show the patent title, the assignee name, and the date of each `legal_events` record in one combined result set."
  },
  {
    "purpose": "Check for Missing Relationships",
    "question": "Which `patent_id` values in the `patents` table do not appear in the `inventors` table?"
  },
  {
    "purpose": "Subquery or Join Condition",
    "question": "Find all patents that have at least one worldwide application with `legal_status` = 'Active' in the `worldwide_applications` table."
  },
  {
    "purpose": "Complex Filter",
    "question": "List all records in `patent_citations` where `is_family_to_family` = 1 and `examiner_cited` = 1, along with the corresponding `publication_number`."
  },
  {
    "purpose": "Parent-Child Relationship",
    "question": "How many child applications does each patent have, based on the `child_applications` table?"
  },
  {
    "purpose": "Multi-Column Condition",
    "question": "Which patents have `publication_date` after '2021-01-01' and also have at least one event of type 'legal-status' in the `events` table?"
  },
  {
    "purpose": "Join + Group + Sort",
    "question": "Which 3 patents have the highest total number of references in the `patent_citations` table?"
  },
  {
    "purpose": "Examining Classification",
    "question": "List each patent’s `title` along with every `code` in the `classifications` table (join on `patent_id`)."
  },
  {
    "purpose": "Multi-Table Link for Priority",
    "question": "Find all `applications_claiming_priority` that reference a specific patent_id (e.g., 'US20180044418A1') and show their `title` and `filing_date`."
  },
  {
    "purpose": "Error Handling Table",
    "question": "What columns exist in the `error_logs` table, and how can they help debug database insertion problems?"
  }
]"""

In [117]:
# Decode the JSON text into a list of dicts, then tabulate it as a DataFrame
# (one row per entry, one column per key).
data_list = json.loads(json_str)
df = pd.DataFrame(data=data_list)

In [167]:
# Print each question on its own line.
for question_text in df["question"]:
    print(question_text)

What does the `patents` table store?
Which column in the `inventors` table links each inventor to a specific patent?
Which table lists the companies or organizations that own the rights to a patent, and how does it link to the `patents` table?
Show me all column names and their meanings from the `prior_art_keywords` table.
Which `title` in the `patents` table has the earliest `publication_date`?
Find all patents in the `patents` table whose `abstract` contains the word ‘cancer’.
How many distinct inventors are stored in the `inventors` table?
Retrieve each `patent_id` along with all associated `inventor_name` values by joining the `patents` and `inventors` tables.
List all `events` in the `events` table that are marked as `critical = 1` for a specific `patent_id`.
How many total claims does each patent have? (Join `patents` with `claims` and group by `patent_id`.)
Show the patent title, the assignee name, and the date of each `legal_events` record in one combined result set.
Which `pat

In [156]:
# Show the questions DataFrame as this cell's output.
display(df)

Unnamed: 0,purpose,question
0,Simple Table Purpose,What does the `patents` table store?
1,Column Meaning,Which column in the `inventors` table links each inventor to a specific patent?
2,Schema Relationship,"Which table lists the companies or organizations that own the rights to a patent, and how does it link to the `patents` table?"
3,Listing Columns,Show me all column names and their meanings from the `prior_art_keywords` table.
4,Single-Table Query,Which `title` in the `patents` table has the earliest `publication_date`?
5,Text Condition,Find all patents in the `patents` table whose `abstract` contains the word ‘cancer’.
6,Counting Records,How many distinct inventors are stored in the `inventors` table?
7,Joining for Detailed Information,Retrieve each `patent_id` along with all associated `inventor_name` values by joining the `patents` and `inventors` tables.
8,Filtering by Flag,List all `events` in the `events` table that are marked as `critical = 1` for a specific `patent_id`.
9,Aggregate Query,How many total claims does each patent have? (Join `patents` with `claims` and group by `patent_id`.)


In [123]:
# Ad-hoc natural-language prompts used to exercise the agent by hand.
(
    query,
    query1,
    query2,
    query3,
    query4,
    query5,
) = (
    "help me find the patent document for US20190160148A1",
    "How many total claims does each patent have?",
    "whats the claims of US20220380469A1?",
    "how many compositions mentioned in patent of US20220380469A1",
    "summary the claims of US20220380469A1",
    "tell me everything about patent of US20220380469A1",
)

In [125]:
# get_openai_callback is imported from langchain_community; the older
# `langchain.callbacks` import path is deprecated.
from langchain_community.callbacks.manager import get_openai_callback

print("\n===== Now asking the Agent =====\n")

# Record token usage for everything that runs inside the `with` block.
with get_openai_callback() as cb:
    events = agent_executor.stream(
        {"messages": [("user", query5)]},
        stream_mode="values",
    )
    for event in events:
        # Pretty-print only the most recent message of each streamed event.
        msg = event["messages"][-1]
        msg.pretty_print()

# Report the token usage collected by the callback.
print("\n===== Token Usage =====")
print(f"Prompt tokens:      {cb.prompt_tokens}")
print(f"Completion tokens:  {cb.completion_tokens}")
print(f"Total tokens:       {cb.total_tokens}")
print(f"Estimated cost (USD): {cb.total_cost}")


===== Now asking the Agent =====


tell me everything about patent of US20220380469A1
Tool Calls:
  sql_db_list_tables (call_PD6w9kem0Sd9dcn2cC4qMx3y)
 Call ID: call_PD6w9kem0Sd9dcn2cC4qMx3y
  Args:
Name: sql_db_list_tables

applications_claiming_priority, assignees, child_applications, cited_by, claims, classifications, concepts, error_logs, events, external_links, images, inventors, legal_events, non_patent_citations, parent_applications, patent_citations, patents, prior_art_keywords, priority_applications, similar_documents, sqlite_sequence, worldwide_applications
Tool Calls:
  sql_db_schema (call_xjbLSYsJMcWiT0D0o2eGGKHh)
 Call ID: call_xjbLSYsJMcWiT0D0o2eGGKHh
  Args:
    table_names: patents
  sql_db_schema (call_mJeLfF4jZsGppQ22eibdo6NU)
 Call ID: call_mJeLfF4jZsGppQ22eibdo6NU
  Args:
    table_names: assignees
  sql_db_schema (call_oBqo55vgx6pK4IVrWDVF6se3)
 Call ID: call_oBqo55vgx6pK4IVrWDVF6se3
  Args:
    table_names: inventors
  sql_db_schema (call_Wa8VDyPen2A4BnkZvWTsBYgm