#Get Patent Data

In [1]:
import os
import tarfile
import argparse # Import argparse for command-line arguments
from huggingface_hub import hf_hub_download

def download_and_extract_hupd(repo_id: str, filename: str, download_dir: str, extract_dir: str):
    """
    Downloads a file from Hugging Face Hub and extracts it if it's a tar.gz.

    Args:
        repo_id (str): The Hugging Face repository ID (e.g., "HUPD/hupd").
        filename (str): The specific file to download (e.g., "data/2018.tar.gz").
        download_dir (str): Directory to save the downloaded file.
        extract_dir (str): Directory to extract the contents.
    """
    # Create directories if they don't exist
    os.makedirs(download_dir, exist_ok=True)
    os.makedirs(extract_dir, exist_ok=True)

    print(f"Downloading {filename} from {repo_id} using huggingface_hub...")
    try:
        downloaded_filepath = hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            cache_dir=download_dir,
            repo_type="dataset"
        )
        print(f"File downloaded to: {downloaded_filepath}")
    except Exception as e:
        print(f"Error downloading file from Hugging Face Hub: {e}")
        return

    print(f"Extracting {downloaded_filepath} to {extract_dir}...")
    try:
        with tarfile.open(downloaded_filepath, "r:gz") as tar:
            tar.extractall(path=extract_dir) # Extract into the base directory
        print("Extraction complete.")

        # Construct the expected path for the extracted JSON files
        # This assumes the tarball contains a top-level folder named after the year (e.g., "2018/")
        year_folder_name = filename.split('/')[1].split('.')[0]
        final_json_path = os.path.join(extract_dir, year_folder_name)
        print(f"JSON files should now be in: {final_json_path}")

        # Optional: Verify by listing some contents
        if os.path.exists(final_json_path) and os.listdir(final_json_path):
            print(f"First 5 items in {final_json_path}: {os.listdir(final_json_path)[:5]}")
        else:
            print(f"Warning: Expected directory {final_json_path} either doesn't exist or is empty after extraction.")

    except tarfile.ReadError as e:
        print(f"Error: Could not read tar.gz file. It might be corrupted: {e}")
    except Exception as e:
        print(f"An unexpected error occurred during extraction: {e}")

HUGGINGFACE_REPO_ID = "HUPD/hupd"

# Use the year from the command-line argument
year_to_download = 2018 #args.year
TAR_FILENAME = f"data/{year_to_download}.tar.gz"

DOWNLOAD_CACHE_DIR = "hf_cache"
# Extract directly into 'hupd_extracted'. The tarball itself will create the year subdirectory.
EXTRACT_DIR = "hupd_extracted"

download_and_extract_hupd(HUGGINGFACE_REPO_ID, TAR_FILENAME, DOWNLOAD_CACHE_DIR, EXTRACT_DIR)

Downloading data/2018.tar.gz from HUPD/hupd using huggingface_hub...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


2018.tar.gz:   0%|          | 0.00/513M [00:00<?, ?B/s]

File downloaded to: hf_cache/datasets--HUPD--hupd/snapshots/f570a84b03663180b6034c1f7f4c15864f94385e/data/2018.tar.gz
Extracting hf_cache/datasets--HUPD--hupd/snapshots/f570a84b03663180b6034c1f7f4c15864f94385e/data/2018.tar.gz to hupd_extracted...
Extraction complete.
JSON files should now be in: hupd_extracted/2018
First 5 items in hupd_extracted/2018: ['15759363.json', '15892753.json', '15907738.json', '15859703.json', '15861362.json']


# Create MD files

In [2]:

import os
import json
import argparse

def convert_json_to_markdown(json_data, indent=0):
    """
    Converts a JSON object or array into a Markdown formatted string,
    using proper subheading levels and spacing for better readability
    and LLM understanding. Skips the 'full_description' and 'claims' fields.
    """
    markdown_output = []
    # Indentation for nested items
    indent_str = "    " * indent
    # Fields to skip
    # FIELDS_TO_SKIP = {"full_description", "claims", "background"} # Using a set for efficient lookup
    FIELDS_TO_SKIP = {
        "full_description", "claims", "background",
        "abandon_date", "date_produced", "date_published", "application_number",
        "main_ipcr_label",
        "uspc_class", "uspc_subclass", "examiner_id",
        "examiner_name_last", "examiner_name_first", "examiner_name_middle",
    }


    if isinstance(json_data, dict):
        for key, value in json_data.items():
            # Skip the specified fields
            if key in FIELDS_TO_SKIP:
                continue

            # Determine the heading level, capping at H6
            heading_level = min(6, indent + 1)
            heading_prefix = "#" * heading_level

            # Add a newline for spacing before each new section/key-value pair
            if indent == 0 or (indent > 0 and isinstance(value, (dict, list))):
                markdown_output.append("\n") # Extra newline for spacing sections

            # Use subheadings for nested objects/arrays, or bold for simple key-values
            if isinstance(value, (dict, list)):
                markdown_output.append(f"{indent_str}{heading_prefix} {key.replace('_', ' ').title()}\n")
                # Recurse for nested dictionaries or lists
                markdown_output.append(convert_json_to_markdown(value, indent + 1))
            else:
                # For simple key-value pairs, use bold key and its value
                markdown_output.append(f"{indent_str}**{key.replace('_', ' ').title()}:** {value}\n")

    elif isinstance(json_data, list):
        # Add a newline for spacing before a new list section
        if indent > 0:
            markdown_output.append("\n")

        for i, item in enumerate(json_data):
            # For each item in the list, use a list item prefix
            markdown_output.append(f"{indent_str}- ")
            if isinstance(item, (dict, list)):
                # If the list item is a nested object or list, recurse
                # No extra newline for the current list item, it's handled inside recursion
                markdown_output.append(convert_json_to_markdown(item, indent + 1))
            else:
                # For simple list items
                markdown_output.append(f"{item}\n")

    else:
        # For simple values that might be passed directly (e.g., from a list)
        markdown_output.append(f"{json_data}\n")

    return "".join(markdown_output)

input_path = "/content/hupd_extracted/2018"
output_path = "/content/Documents"
max_files_to_process = 5

# Create output folder if it doesn't exist
os.makedirs(output_path, exist_ok=True)

processed_count = 0
skipped_count = 0
print(f"Processing JSON files from: {input_path}")
print(f"Saving Markdown files to: {output_path}")
print(f"Maximum files to process: {max_files_to_process}")

try:
    # List all files in the input folder
    files = [f for f in os.listdir(input_path) if f.endswith('.json')]
    files.sort() # Ensure consistent order for processing

    for filename in files:
        if processed_count >= max_files_to_process:
            print(f"Reached maximum file limit of {max_files_to_process}. Stopping.")
            break

        json_filepath = os.path.join(input_path, filename)
        markdown_filename = os.path.splitext(filename)[0] + ".md"
        markdown_filepath = os.path.join(output_path, markdown_filename)

        print(f"Attempting to process '{filename}'...")

        try:
            with open(json_filepath, 'r', encoding='utf-8') as f:
                json_data = json.load(f)

            # Check the 'decision' field
            decision = json_data.get('decision')
            if decision in ["ACCEPTED", "REJECTED"]:
                markdown_content = convert_json_to_markdown(json_data)

                with open(markdown_filepath, 'w', encoding='utf-8') as f:
                    f.write(markdown_content)

                processed_count += 1
                print(f"Successfully converted '{filename}' (Decision: {decision}).")
            else:
                skipped_count += 1
                print(f"Skipping '{filename}' due to decision '{decision}'. Only 'ACCEPTED' or 'REJECTED' are processed.")

        except json.JSONDecodeError:
            print(f"Error: Could not decode JSON from '{filename}'. Skipping.")
        except FileNotFoundError:
            print(f"Error: File '{filename}' not found. Skipping.")
        except Exception as e:
            print(f"An unexpected error occurred while processing '{filename}': {e}. Skipping.")

except FileNotFoundError:
    print(f"Error: Input folder '{input_path}' not found.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

print(f"\nFinished processing. Converted {processed_count} JSON files to Markdown. Skipped {skipped_count} files.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Skipping '15763767.json' due to decision 'PENDING'. Only 'ACCEPTED' or 'REJECTED' are processed.
Attempting to process '15763769.json'...
Skipping '15763769.json' due to decision 'PENDING'. Only 'ACCEPTED' or 'REJECTED' are processed.
Attempting to process '15763770.json'...
Skipping '15763770.json' due to decision 'PENDING'. Only 'ACCEPTED' or 'REJECTED' are processed.
Attempting to process '15763771.json'...
Skipping '15763771.json' due to decision 'PENDING'. Only 'ACCEPTED' or 'REJECTED' are processed.
Attempting to process '15763772.json'...
Skipping '15763772.json' due to decision 'PENDING'. Only 'ACCEPTED' or 'REJECTED' are processed.
Attempting to process '15763773.json'...
Skipping '15763773.json' due to decision 'PENDING'. Only 'ACCEPTED' or 'REJECTED' are processed.
Attempting to process '15763774.json'...
Skipping '15763774.json' due to decision 'PENDING'. Only 'ACCEPTED' or 'REJECTED' are processed.
Attempting

# Dependencies required

In [3]:
%pip install --upgrade langchain langchain-experimental langchain-openai python-dotenv pyvis json_repair

Collecting langchain-experimental
  Downloading langchain_experimental-0.3.4-py3-none-any.whl.metadata (1.7 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.3.28-py3-none-any.whl.metadata (2.3 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting pyvis
  Downloading pyvis-0.3.2-py3-none-any.whl.metadata (1.7 kB)
Collecting json_repair
  Downloading json_repair-0.47.7-py3-none-any.whl.metadata (12 kB)
Collecting langchain-community<0.4.0,>=0.3.0 (from langchain-experimental)
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting jedi>=0.16 (from ipython>=5.3.0->pyvis)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community<0.4.0,>=0.3.0->langchain-experimental)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community<0.4.0,>=0.3.0->

# Create Nodes and Relationships

In [13]:
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.documents import Document
from langchain_community.chat_models import ChatOllama, ChatOpenAI  # ✅ Ollama wrapper
from google.colab import userdata

llm = ChatOpenAI(
    model="llama-3.1-8b-instant",  # or other Groq-supported models like mixtral-8x7b
    temperature=0,
    openai_api_key=userdata.get('api_key'),
    openai_api_base="https://api.groq.com/openai/v1"
)

allowed_nodes = ["Patent","PatentCategory","CPC","Decision","Inventor","Summary","Year"]
allowed_relationships = [
    ("Patent", "HAS_CATEGORY", "PatentCategory"),
    ("Patent", "HAS_MAIN_CPC", "CPC"),
    ("Patent", "HAS_CPC", "CPC"),
    ("Patent", "HAS_DECISION", "Decision"),
    ("Patent", "HAS_INVENTOR", "Inventor"),
    ("Patent", "HAS_SUMMARY", "Summary"),
    ("Patent", "Filing_IN", "Year")
]
graph_transformer_rel_defined = LLMGraphTransformer(
  llm=llm,
  allowed_nodes=allowed_nodes,
  allowed_relationships=allowed_relationships
)

with open("/content/Documents/14907967.md", "r", encoding="utf-8") as file:
    content = file.read()

    doc = Document(page_content=content)
    graph_documents_rel_defined = await graph_transformer_rel_defined.aconvert_to_graph_documents([doc])

    print(f"Nodes:{graph_documents_rel_defined[0].nodes}")
    print(f"Relationships:{graph_documents_rel_defined[0].relationships}")

Nodes:[Node(id='The device can further detect a short circuit defects present between rows of touch electrodes in a touch electrode matrix, and the device has a simple structure and high reliability.', type='Summary', properties={}), Node(id='Preferably, the cutoff unit is activated by a first voltage signal of the touch electrode, and deactivated by a second voltage signal of the touch electrode.', type='Summary', properties={}), Node(id='GONG Qiang', type='Inventor', properties={}), Node(id='In order to solve the above technical problem, in an embodiment of the present disclosure, a device for detecting defects in a self-capacitive touch panel is first provided, which includes a plurality of drive circuits respectively connected to a plurality of rows of touch electrodes, the drive circuit comprising: a pre-charging unit, for generating a charge control signal for simultaneously presetting a first voltage for the touch electrodes; a synchronization unit, for generating a charge contr

# Store Nodes & relationships in Graph DB

In [8]:
pip install langchain-community



In [9]:
pip install neo4j

Collecting neo4j
  Downloading neo4j-5.28.1-py3-none-any.whl.metadata (5.9 kB)
Downloading neo4j-5.28.1-py3-none-any.whl (312 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/312.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m204.8/312.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m312.3/312.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neo4j
Successfully installed neo4j-5.28.1


In [14]:
from langchain_core.documents import Document
from langchain_community.graphs.graph_document import GraphDocument, Node, Relationship
from neo4j import GraphDatabase
from typing import Optional


def add_graphdocument_to_neo4j(
    graph_doc: GraphDocument,
    uri: str,
    user: str,
    password: str,
    database: Optional[str] = "neo4j"
):
    driver = GraphDatabase.driver(uri, auth=(user, password))

    def add_graph(tx, graph_doc: GraphDocument):
        # Create nodes
        for node in graph_doc.nodes:
            tx.run(
                f"""
                MERGE (n:{node.type} {{id: $id}})
                SET n += $properties
                """,
                id=node.id,
                properties=node.properties
            )

        # Create relationships
        for rel in graph_doc.relationships:
            tx.run(
                f"""
                MATCH (source:{rel.source.type} {{id: $source_id}})
                MATCH (target:{rel.target.type} {{id: $target_id}})
                MERGE (source)-[r:{rel.type}]->(target)
                SET r += $properties
                """,
                source_id=rel.source.id,
                target_id=rel.target.id,
                properties=rel.properties
            )

    # Run transaction
    with driver.session(database=database) as session:
        session.write_transaction(add_graph, graph_doc)

    driver.close()

# Create graph document
graph_doc = GraphDocument(
    nodes=graph_documents_rel_defined[0].nodes,
    relationships=graph_documents_rel_defined[0].relationships,
    source=doc
)

# Upload to Neo4j
add_graphdocument_to_neo4j(
    graph_doc,
    uri=userdata.get('NEO4J_URI'),
    user="neo4j",
    password=userdata.get("NEO4J_PASSWORD")
)

  session.write_transaction(add_graph, graph_doc)


In [8]:
from dotenv import load_dotenv
load_dotenv() 

import requests
import os

api_key = os.getenv("GROQ_API_KEY")

url = "https://api.groq.com/openai/v1/models"

headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json"
}

response = requests.get(url, headers=headers)

print(response.json())

{'object': 'list', 'data': [{'id': 'compound-beta-mini', 'object': 'model', 'created': 1742953279, 'owned_by': 'Groq', 'active': True, 'context_window': 131072, 'public_apps': None, 'max_completion_tokens': 8192}, {'id': 'gemma2-9b-it', 'object': 'model', 'created': 1693721698, 'owned_by': 'Google', 'active': True, 'context_window': 8192, 'public_apps': None, 'max_completion_tokens': 8192}, {'id': 'llama3-70b-8192', 'object': 'model', 'created': 1693721698, 'owned_by': 'Meta', 'active': True, 'context_window': 8192, 'public_apps': None, 'max_completion_tokens': 8192}, {'id': 'allam-2-7b', 'object': 'model', 'created': 1737672203, 'owned_by': 'SDAIA', 'active': True, 'context_window': 4096, 'public_apps': None, 'max_completion_tokens': 4096}, {'id': 'meta-llama/llama-4-maverick-17b-128e-instruct', 'object': 'model', 'created': 1743877158, 'owned_by': 'Meta', 'active': True, 'context_window': 131072, 'public_apps': None, 'max_completion_tokens': 8192}, {'id': 'playai-tts-arabic', 'object