In [5]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.runnables import RunnableParallel, RunnableLambda
from pydantic import BaseModel, Field
from typing import Dict, List, Any, Optional
from collections import Counter
from difflib import SequenceMatcher
import os
import json
import asyncio
import uuid
import logging
from tqdm import tqdm
import time
from dotenv import load_dotenv
import os
import json
import time
import logging
from tqdm import tqdm
import concurrent.futures
from aperturedb import Connector


In [2]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [3]:
# Load environment variables
load_dotenv(override=True) # Add override=True
google_api_key = os.getenv("GOOGLE_API_KEY")  
db_host = os.getenv("APERTUREDB_HOST")
db_password = os.getenv("APERTUREDB_PASSWORD")

In [4]:
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-preview-04-17",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [5]:
def load_pdf_content(pdf_path, return_single_string=True, extract_metadata=False):
    """
    Load and parse a PDF document, returning its text content.
    
    Args:
        pdf_path (str): Path to the PDF file
        return_single_string (bool): If True, returns the entire PDF content as a single string.
                                    If False, returns a list of strings (one per page).
        extract_metadata (bool): If True, returns metadata along with content
    
    Returns:
        If return_single_string is True and extract_metadata is False:
            str: The entire text content of the PDF
        If return_single_string is False and extract_metadata is False:
            list: List of strings, one for each page
        If extract_metadata is True:
            tuple: (content, metadata) where content is either a string or list based on return_single_string
    """
    
    # Check if the file exists
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found at: {pdf_path}")
    
    # Initialize the loader with the appropriate mode
    mode = "single" if return_single_string else "elements"
    loader = PyPDFLoader(pdf_path, mode=mode)
    
    # Load the documents
    docs = loader.load()
    
    if return_single_string:
        # With mode="single", there should only be one document containing all pages
        content = docs[0].page_content if docs else ""
        metadata = docs[0].metadata if docs else {}
    else:
        # With default mode, each document is a page
        content = [doc.page_content for doc in docs]
        metadata = [doc.metadata for doc in docs]
    
    if extract_metadata:
        return content, metadata
    else:
        return content

In [6]:
doc = load_pdf_content("Cloud Computing Copy Lecture Notes.pdf")

In [7]:
print(doc[:1000])  # Print the first 1000 characters of the loaded document

Cloud Computing Lecture Notes 
Distributed Computing/Systems 
Definition: 
Distributed computing refers to a system where computing resources are distributed 
across multiple locations rather than being centralized in a single system. This enables 
task distribution and efficient resource utilization. 
Why Use Distributed Systems? 
• Scalability Issues: Traditional computing faces bottlenecks due to hardware 
limitations, whereas distributed systems allow for hardware scaling. 
• Connected Devices: In a networked system, connected devices communicate, but 
this does not necessarily make them distributed. 
• IoT (Internet of Things): IoT is one of the largest examples of distributed computing. 
• Multi-layered System Design: Distributed computing enables systems to function 
in multiple layers, with each layer acting as a distributed entity. 
• User Perspective: Although the system consists of multiple machines, distributed 
computing presents a unified system to users. 
 
Parallel Comp

# Step 1

In [8]:
# Define Pydantic model for entity schema parser
class EntitySchema(BaseModel):
    """Entity types and their properties."""
    entities: Dict[str, List[str]] = Field(
        description="Dictionary mapping entity types to their possible properties"
    )

In [9]:
# Create entity extraction chain
def create_entity_extraction_chain():
    parser = JsonOutputParser(pydantic_object=EntitySchema)
    
    # Prompt template
    prompt = PromptTemplate(
        template="""
    You are the first agent in a multi-step workflow to build a Knowledge Graph from raw text.

    Workflow Steps Overview:
    1. Extract high-level entity types and their properties from the text. [CURRENT STEP]
    2. Extract specific instances of entities and their properties based on the identified types.
    3. Deduplicate extracted instances and assign them unique identifiers.
    4. Identify and define relationships between the instances of entities.
    5. Create a structured knowledge graph using the extracted entities and relationships.

    You are the FIRST agent in this workflow.


    YOUR TASK:
    - Identify high-level, general entity types (e.g., Person, Company, Location, Event).
    - For each entity type, list all the possible (available) properties it might have.
    - Focus on information that would be useful for structuring a knowledge graph.
    - Stay general — do not extract specific names, examples, or relationships.
    - Avoid unnecessary details or context-specific examples.

    FORMAT:
    - Return a valid JSON object.
    - Keys = entity types (strings).
    - Values = lists of property names (strings).
    - Use double quotes for all keys and string values.
    - No extra explanation, text, or markdown formatting.

    EXAMPLES:
    {{
        "Person": ["name", "age", "email", "address"],
        "Company": ["name", "industry", "founded_date"],
        "Location": ["name", "coordinates", "population"]
    }}

    Text to process: {input}

    {format_instructions}

    Response:
    """,
        input_variables=["input"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )

    
    # Build the chain
    chain = prompt | llm | parser
    
    return chain


In [10]:
# Function to extract entities from text with retry logic
def extract_entity_schema(text, max_retries=3):
    """
    Extract entity types and their properties from input text with retry logic.
    
    Args:
        text (str): Input text to analyze
        max_retries (int): Maximum number of retry attempts
        
    Returns:
        dict: Dictionary mapping entity types to lists of properties
    """
    chain = create_entity_extraction_chain()
    
    for attempt in range(max_retries):
        try:
            result = chain.invoke({"input": text})
            # The result is the entities dictionary from the Pydantic model
            return result.get("entities", {})
        except Exception as e:
            if attempt < max_retries - 1:
                print(f"Attempt {attempt + 1} failed. Retrying... Error: {str(e)[:100]}...")
            else:
                print(f"All {max_retries} attempts failed. Last error: {str(e)[:100]}...")
                # Return empty dict as fallback
                return {}

In [11]:
sample_text = """
John Doe, a 35-year-old software engineer, works at Google in Mountain View.
He graduated from MIT with a degree in Computer Science and has been with the company for 5 years.
Google, founded in 1998, is a technology company specializing in internet services and products.
John lives in San Francisco and commutes to work daily. His email is john.doe@example.com.
"""

entities = extract_entity_schema(sample_text)
print(entities)

print("Extracted Entity Schema:")
for entity_type, properties in entities.items():
    print(f"\n{entity_type}:")
    for prop in properties:
        print(f"- {prop}")

{'Person': ['name', 'age', 'occupation', 'employer', 'education', 'residence', 'email', 'tenure'], 'Company': ['name', 'location', 'founded_date', 'industry', 'specialization'], 'Location': ['name'], 'Educational Institution': ['name'], 'Field of Study': ['name']}
Extracted Entity Schema:

Person:
- name
- age
- occupation
- employer
- education
- residence
- email
- tenure

Company:
- name
- location
- founded_date
- industry
- specialization

Location:
- name

Educational Institution:
- name

Field of Study:
- name


In [12]:
# Extract entities from the loaded PDF document
entities = extract_entity_schema(doc)

print("\nExtracted Entity Schema:")
for entity_type, properties in entities.items():
    print(f"\n{entity_type}:")
    for prop in properties:
        print(f"- {prop}")


Extracted Entity Schema:

Computing Concept:
- definition
- characteristics
- use_cases
- limitations
- aspects
- related_concepts

System Architecture:
- description
- characteristics
- components
- use_cases
- comparison_aspects

Platform:
- overview
- purpose
- architecture
- components
- service_offerings
- deployment_aspects
- management_aspects
- security_aspects
- scalability_aspects
- reliability_aspects
- cost_aspects
- features

Resource:
- description
- characteristics
- management_aspects
- lifecycle_aspects
- allocation_aspects
- pricing_aspects
- type

Storage Type:
- description
- characteristics
- use_cases
- pricing_models
- management_aspects

Database Type:
- description
- characteristics
- use_cases
- management_aspects
- migration_aspects

Network Entity:
- definition
- purpose
- characteristics
- components
- management_aspects
- security_aspects
- type

Service Model:
- definition
- characteristics
- responsibility_division

Deployment Model:
- definition
- char

In [13]:
  # Save entity schema for next step
with open("entity_schema.json", "w") as f:
    json.dump({"entities": entities}, f, indent=2)

# Step 2

In [14]:
# Function to split text into chunks
def split_text_into_chunks(text, chunk_size=5000, chunk_overlap=500):
    """
    Split the input text into manageable chunks using RecursiveCharacterTextSplitter.
    
    Args:
        text: The input text to be split
        chunk_size: Maximum size of each chunk in characters
        chunk_overlap: Overlap between consecutive chunks
        
    Returns:
        list: List of Document objects
    """
    logger.info(f"Splitting text into chunks (size={chunk_size}, overlap={chunk_overlap})")
    
    # Initialize the splitter with paragraph-focused splitting
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],  # Try to split at paragraph boundaries first
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        keep_separator=True,
        add_start_index=True  # Add start position metadata
    )
    
    # Split the text into chunks
    chunks = splitter.create_documents([text])
    
    # Add chunk index as metadata
    for i, chunk in enumerate(chunks):
        chunk.metadata["chunk_id"] = i
        chunk.metadata["total_chunks"] = len(chunks)
    
    logger.info(f"Text split into {len(chunks)} chunks")
    return chunks

In [15]:
# Define Pydantic models for entity instance extraction
class EntityInstances(BaseModel):
    """Instances of a specific entity type."""
    Entity: str = Field(description="The entity type name")
    Instances: Dict[str, Dict[str, Any]] = Field(
        description="Dictionary mapping instance names to their properties"
    )

class ChunkExtractionResult(BaseModel):
    """Result of entity extraction from a single chunk."""
    entities: List[EntityInstances] = Field(
        description="List of entity types and their instances found in this chunk"
    )

In [16]:
# Function to create entity instance extraction chain
def create_entity_instance_extraction_chain():
    """
    Create a chain for extracting entity instances from text chunks.
    
    Returns:
        Chain: A chain that extracts entity instances from text chunks
    """
    # Entity instance extraction result parser
    parser = JsonOutputParser(pydantic_object=ChunkExtractionResult)
    
    # Create prompt template for entity instance extraction
    prompt = PromptTemplate(
        template="""
        You are part of a multi-step workflow to build a Knowledge Graph from raw text.

        Workflow Steps Overview:
        1. Extract high-level entity types and their properties from the text. [COMPLETED]
        2. Extract specific instances of entities and their properties from text chunks. [CURRENT STEP]
        3. Deduplicate extracted instances and assign them unique identifiers.
        4. Identify and define relationships between the instances of entities.
        5. Create a structured knowledge graph using the extracted entities and relationships.

        YOUR TASK:
        You are processing a CHUNK of the full text. Focus ONLY on extracting CONCRETE INSTANCES of entities found in this chunk.

        GIVEN:
        1. A chunk of text
        2. A schema of entity types and their possible properties

        INSTRUCTIONS:
        - Extract ALL instances of the predefined entity types found in this chunk
        - For each instance, extract values for as many properties as are mentioned in the text
        - Be precise - only extract information explicitly stated in this chunk
        - Do NOT make up or infer missing properties
        - If a property is not mentioned, omit it from the output (don't include it with null/empty values)

        INPUT TEXT CHUNK:
        {chunk}

        ENTITY TYPES AND THEIR PROPERTIES:
        {entity_schema}

        FORMAT YOUR RESPONSE AS FOLLOWS:
        - Return a valid JSON object
        - For each entity type found, include its name and an "Instances" object
        - "Instances" should be a dictionary where:
          - Keys are the instance names 
          - Values are objects containing the (available) instance properties 
        - Properties not mentioned should be omitted entirely
        - If no instances of a particular entity type are found, do not include that entity type

        {format_instructions}

        EXAMPLE RESPONSE FOR A CHUNK ABOUT PEOPLE AND COMPANIES:
        {{
        "entities": [
            {{
            "Entity": "Person",
            "Instances": {{
                "John Doe": {{
                    "name": "John Doe",
                    "age": 35,
                    "email": "john@example.com"
                }},
                "Jane Smith": {{
                    "name": "Jane Smith",
                    "email": "jane@example.com"
                }}
            }}
            }},
            {{
            "Entity": "Company",
            "Instances": {{
                "Google": {{
                    "industry": "Technology",
                    "founded": 1998
                }}
            }}
            }}
        ]
        }}
        Begin your extraction now: """, 
    input_variables=["chunk", "entity_schema", "chunk_id"], 
    partial_variables={"format_instructions": parser.get_format_instructions()}, 
    )

    # Build the chain
    chain = prompt | llm | parser

    return chain

In [17]:
# Function to process a single chunk with retry logic
def process_chunk(inputs, max_retries=3):
    """
    Process a single text chunk to extract entity instances with retry logic.
    
    Args:
        inputs: Dictionary containing chunk and entity_schema
        max_retries: Maximum number of retry attempts
        
    Returns:
        dict: Extraction results
    """
    chunk = inputs["chunk"]
    entity_schema = inputs["entity_schema"]
    chunk_id = chunk.metadata.get("chunk_id", 0)
    
    for attempt in range(max_retries):
        try:
            chain = create_entity_instance_extraction_chain()
            result = chain.invoke({
                "chunk": chunk.page_content,
                "entity_schema": json.dumps(entity_schema, indent=2),
                "chunk_id": chunk_id
            })
            logger.info(f"Successfully processed chunk {chunk_id}")
            return result
        except Exception as e:
            if attempt < max_retries - 1:
                logger.warning(f"Attempt {attempt + 1} failed for chunk {chunk_id}. Retrying... Error: {str(e)[:100]}...")
            else:
                logger.error(f"All {max_retries} attempts failed for chunk {chunk_id}. Error: {str(e)[:100]}...")
                return {
                    "entities": [],
                    "chunk_id": chunk_id,
                    "error": str(e)[:200]
                }

In [18]:
# Modify the extract_entity_instances_parallel function to stop saving intermediate files
def extract_entity_instances_parallel(document, entity_schema, max_concurrency=6):
    """
    Extract entity instances from document chunks in parallel using RunnableParallel.
    
    Args:
        document: The full text document
        entity_schema: Dictionary of entity types and their properties
        max_concurrency: Maximum number of chunks to process in parallel
        
    Returns:
        list: List of entity instances extracted from all chunks
    """
    start_time = time.time()
    logger.info("Beginning entity instance extraction (parallel)")
    
    # Split the document into chunks
    chunks = split_text_into_chunks(document, chunk_size=5000, chunk_overlap=500)
    logger.info(f"Document split into {len(chunks)} chunks")
    
    # Prepare inputs for each chunk
    inputs = [{"chunk": chunk, "entity_schema": entity_schema} for chunk in chunks]
    
    # Create a RunnableLambda for chunk processing
    chunk_processor = RunnableLambda(process_chunk)
    
    # Process chunks in batches with progress tracking
    all_results = []
    batch_size = min(max_concurrency, len(chunks))
    
    # Use tqdm for progress tracking in batches
    with tqdm(total=len(chunks), desc="Processing chunks") as progress_bar:
        for i in range(0, len(inputs), batch_size):
            batch_inputs = inputs[i:i+batch_size]
            
            # Process the batch in parallel
            batch_results = chunk_processor.batch(batch_inputs, config={"max_concurrency": max_concurrency})
            all_results.extend(batch_results)
            
            # Update progress bar
            progress_bar.update(len(batch_inputs))
            
            # Log progress without saving intermediate files
            logger.info(f"Processed {min(i+batch_size, len(inputs))}/{len(inputs)} chunks ({min((i+batch_size)/len(inputs), 1.0)*100:.1f}%)")
    
    end_time = time.time()
    logger.info(f"Entity instance extraction completed in {end_time - start_time:.2f} seconds")
    
    return all_results

In [19]:
def merge_chunk_results(chunk_results):
    """
    Merge the results from all chunks.
    
    Args:
        chunk_results: List of extraction results from chunks
        
    Returns:
        list: Combined list of entity instances
    """
    merged_results = []
    
    # Check for errors
    errors = [r for r in chunk_results if r.get("error")]
    if errors:
        logger.warning(f"{len(errors)} chunks had errors during processing")
    
    # Group by entity type
    entity_instances = {}
    for result in chunk_results:
        if "entities" not in result:
            continue
            
        for entity_data in result["entities"]:
            entity_type = entity_data.get("Entity")
            instances = entity_data.get("Instances", {})
            
            if entity_type not in entity_instances:
                entity_instances[entity_type] = {}
                
            # Merge instances from this chunk into the collected instances
            # If instance already exists, update with any new properties
            for instance_name, instance_props in instances.items():
                if instance_name in entity_instances[entity_type]:
                    # Add any new properties from this instance
                    entity_instances[entity_type][instance_name].update(instance_props)
                else:
                    # Add the new instance
                    entity_instances[entity_type][instance_name] = instance_props
    
    # Convert to the expected format
    for entity_type, instances in entity_instances.items():
        merged_results.append({
            "Entity": entity_type,
            "Instances": instances
        })
    
    logger.info(f"Merged results for {len(entity_instances)} entity types")
    return merged_results

In [20]:
# Load the entity schema
with open("entity_schema.json", "r") as f:
    entity_schema_data = json.load(f)
    entity_schema = entity_schema_data.get("entities", {})

# Extract entity instances using parallel processing
chunk_results = extract_entity_instances_parallel(
    document=doc,
    entity_schema=entity_schema,
    max_concurrency=6  # Process 6 chunks in parallel
)

# Merge results from all chunks
merged_results = merge_chunk_results(chunk_results)

# Save only one final result file
output_filepath = "entity_instances.json"
with open(output_filepath, "w") as f:
    json.dump(merged_results, f, indent=2)

print(f"\nExtracted {sum(len(entity['Instances']) for entity in merged_results)} instances across {len(merged_results)} entity types")
print(f"Results saved to {output_filepath}")

# Print sample results
print("\nSample instances:")
for entity in merged_results:
    entity_type = entity["Entity"]
    instances = entity["Instances"]
    print(f"\n{entity_type} ({len(instances)} instances):")
    
    # Get list of instance keys (names) and take first 3
    instance_keys = list(instances.keys())[:3]
    
    # Display up to 3 instances
    for i, instance_name in enumerate(instance_keys):
        instance_data = instances[instance_name]
        print(f"  Instance {i+1}: {instance_name} - {str(instance_data)[:100]}...")
    
    if len(instances) > 3:
        print(f"  ... and {len(instances) - 3} more")

2025-05-21 13:11:39,222 - INFO - Beginning entity instance extraction (parallel)
2025-05-21 13:11:39,224 - INFO - Splitting text into chunks (size=5000, overlap=500)
2025-05-21 13:11:39,229 - INFO - Text split into 11 chunks
2025-05-21 13:11:39,231 - INFO - Document split into 11 chunks


Processing chunks:   0%|          | 0/11 [00:00<?, ?it/s]2025-05-21 13:12:07,798 - INFO - Successfully processed chunk 0
2025-05-21 13:12:10,587 - INFO - Successfully processed chunk 3
2025-05-21 13:12:18,397 - INFO - Successfully processed chunk 2
2025-05-21 13:12:18,490 - INFO - Successfully processed chunk 1
2025-05-21 13:12:25,341 - INFO - Successfully processed chunk 4
2025-05-21 13:12:27,146 - INFO - Successfully processed chunk 5
Processing chunks:  55%|█████▍    | 6/11 [00:47<00:39,  7.98s/it]2025-05-21 13:12:27,175 - INFO - Processed 6/11 chunks (54.5%)
2025-05-21 13:12:52,975 - INFO - Successfully processed chunk 6
2025-05-21 13:13:10,845 - INFO - Successfully processed chunk 10
2025-05-21 13:13:16,781 - INFO - Successfully processed chunk 8
2025-05-21 13:13:27,128 - INFO - Successfully processed chunk 9
2025-05-21 13:13:47,176 - INFO - Successfully processed chunk 7
Processing chunks: 100%|██████████| 11/11 [02:07<00:00, 12.34s/it]2025-05-21 13:13:47,186 - INFO - Processed 1


Extracted 538 instances across 12 entity types
Results saved to entity_instances.json

Sample instances:

Computing Concept (157 instances):
  Instance 1: Distributed Computing - {'definition': 'a system where computing resources are distributed across multiple locations rather ...
  Instance 2: Traditional computing - {'limitations': ['faces bottlenecks due to hardware limitations']}...
  Instance 3: Parallel Computing - {'definition': 'involves executing multiple processes simultaneously to enhance speed and efficiency...
  ... and 154 more

System Architecture (31 instances):
  Instance 1: Distributed Systems - {'description': 'a system where computing resources are distributed across multiple locations rather...
  Instance 2: Clusters - {'description': 'consist of multiple machines with similar hardware and operating systems, working t...
  Instance 3: Grids - {'description': 'consist of heterogeneous systems that may have different hardware, OS, and configur...
  ... and 28 more


# Step 3

In [21]:
def deduplicate_and_assign_ids(entity_data):
    """
    Remove duplicate entity instances and assign incremental integer IDs to each remaining instance.
    
    Args:
        entity_data: List of dictionaries containing entity types and their instances
        
    Returns:
        list: Deduplicated entity instances with unique integer IDs assigned
    """
    logger.info("Beginning deduplication and ID assignment")
    start_time = time.time()
    
    # Create a deep copy of the input data to avoid modifying it
    deduplicated_data = []
    
    # Track statistics
    total_instances_before = 0
    total_instances_after = 0
    duplicates_found = 0
    
    # Use a counter for assigning incremental IDs
    id_counter = 1
    
    # Process each entity type
    for entity in entity_data:
        entity_type = entity["Entity"]
        instances = entity["Instances"]
        total_instances_before += len(instances)
        
        # Create a new dictionary for the deduplicated instances
        deduplicated_instances = {}
        
        # For each instance, check if it already exists and merge if needed
        for instance_name, instance_props in instances.items():
            # If this instance name already exists, merge properties
            if instance_name in deduplicated_instances:
                duplicates_found += 1
                existing_props = deduplicated_instances[instance_name]
                
                # Merge properties, keeping all unique properties
                for key, value in instance_props.items():
                    if key not in existing_props:
                        existing_props[key] = value
                        
                # Log information about the merge
                logger.debug(f"Merged duplicate instance '{instance_name}' in entity type '{entity_type}'")
            else:
                # Add this instance to the deduplicated set
                deduplicated_instances[instance_name] = instance_props.copy()
        
        # Now assign unique integer IDs to each instance
        for instance_name, props in deduplicated_instances.items():
            # Assign an incremental integer ID
            props["id"] = id_counter
            id_counter += 1
        
        # Add the deduplicated entity to the result
        deduplicated_data.append({
            "Entity": entity_type,
            "Instances": deduplicated_instances
        })
        
        total_instances_after += len(deduplicated_instances)
        
        logger.info(f"Processed entity type '{entity_type}': {len(instances)} instances → {len(deduplicated_instances)} unique instances")
    
    end_time = time.time()
    logger.info(f"Deduplication completed in {end_time - start_time:.2f} seconds")
    logger.info(f"Total instances before: {total_instances_before}, after: {total_instances_after}")
    logger.info(f"Removed {duplicates_found} duplicate instances")
    logger.info(f"Assigned IDs from 1 to {id_counter-1}")
    
    return deduplicated_data

In [22]:
def process_step3():
    """
    Execute Step 3: Deduplicate entity instances and assign unique IDs.
    """
    # Load data from Step 2
    logger.info("Loading entity instances from Step 2")
    try:
        with open("entity_instances.json", "r") as f:
            entity_instances = json.load(f)
    except FileNotFoundError:
        logger.error("entity_instances.json not found. Please complete Step 2 first.")
        return None
    
    # Deduplicate and assign IDs
    deduplicated_data = deduplicate_and_assign_ids(entity_instances)
    
    # Save the deduplicated data
    output_filepath = "deduplicated_entities_2.json"
    with open(output_filepath, "w") as f:
        json.dump(deduplicated_data, f, indent=2)
    
    logger.info(f"Deduplicated entities saved to {output_filepath}")
    
    # Print summary
    total_instances = sum(len(entity["Instances"]) for entity in deduplicated_data)
    print(f"\nDeduplication complete: {total_instances} unique instances across {len(deduplicated_data)} entity types")
    print(f"Results saved to {output_filepath}")
    
    # Print sample results
    print("\nSample deduplicated instances:")
    for entity in deduplicated_data:
        entity_type = entity["Entity"]
        instances = entity["Instances"]
        print(f"\n{entity_type} ({len(instances)} instances):")
        
        # Get list of instance keys (names) and take first 3
        instance_keys = list(instances.keys())[:3]
        
        # Display up to 3 instances
        for i, instance_name in enumerate(instance_keys):
            instance_data = instances[instance_name]
            # Show ID and a few other properties 
            props_preview = {k: v for k, v in list(instance_data.items())[:4]}
            print(f"  Instance {i+1}: {instance_name} - {props_preview}")
        
        if len(instances) > 3:
            print(f"  ... and {len(instances) - 3} more")
    
    return deduplicated_data


In [23]:
# Execute Step 3
deduplicated_entities = process_step3()

2025-05-21 13:14:19,603 - INFO - Loading entity instances from Step 2
2025-05-21 13:14:19,607 - INFO - Beginning deduplication and ID assignment
2025-05-21 13:14:19,608 - INFO - Processed entity type 'Computing Concept': 157 instances → 157 unique instances
2025-05-21 13:14:19,609 - INFO - Processed entity type 'System Architecture': 31 instances → 31 unique instances
2025-05-21 13:14:19,611 - INFO - Processed entity type 'Resource': 92 instances → 92 unique instances
2025-05-21 13:14:19,612 - INFO - Processed entity type 'Storage Type': 16 instances → 16 unique instances
2025-05-21 13:14:19,613 - INFO - Processed entity type 'Network Entity': 60 instances → 60 unique instances
2025-05-21 13:14:19,614 - INFO - Processed entity type 'Role': 28 instances → 28 unique instances
2025-05-21 13:14:19,615 - INFO - Processed entity type 'Organization': 23 instances → 23 unique instances
2025-05-21 13:14:19,616 - INFO - Processed entity type 'Software Component': 66 instances → 66 unique instanc


Deduplication complete: 538 unique instances across 12 entity types
Results saved to deduplicated_entities_2.json

Sample deduplicated instances:

Computing Concept (157 instances):
  Instance 1: Distributed Computing - {'definition': 'a system where computing resources are distributed across multiple locations rather than being centralized in a single system.', 'aspects': ['enables task distribution and efficient resource utilization', 'enables systems to function in multiple layers, with each layer acting as a distributed entity', 'presents a unified system to users'], 'use_cases': ['IoT (Internet of Things)'], 'related_concepts': ['Parallel Computing', 'Cloud Computing']}
  Instance 2: Traditional computing - {'limitations': ['faces bottlenecks due to hardware limitations'], 'id': 2}
  Instance 3: Parallel Computing - {'definition': 'involves executing multiple processes simultaneously to enhance speed and efficiency.', 'aspects': ['Distribution & Speed', 'Core Objective'], 'use_ca

# Step 4

In [31]:
# Define Pydantic models for relationship extraction
class EntityReference(BaseModel):
    """Reference to an entity instance"""
    entity: str = Field(description="Entity type name")
    id: int = Field(description="Unique ID of the entity instance")  # Changed from str to int

class Relationship(BaseModel):
    """Relationship between two entity instances"""
    relationship: str = Field(description="Type or name of the relationship")
    source: EntityReference = Field(description="Source entity of the relationship")
    destination: EntityReference = Field(description="Destination entity of the relationship")

class RelationshipExtractionResult(BaseModel):
    """Result of relationship extraction"""
    relationships: List[Relationship] = Field(description="List of extracted relationships between entities")

In [32]:
def create_relationship_extraction_chain():
    # Relationship extraction result parser
    parser = JsonOutputParser(pydantic_object=RelationshipExtractionResult)
    
    # Create prompt template for relationship extraction
    prompt = PromptTemplate(
        template="""
        You are the fourth agent in a multi-step workflow to build a Knowledge Graph from raw text.

        Workflow Steps Overview:
        1. Extract high-level entity types and their properties from the text. [COMPLETED]
        2. Extract specific instances of entities and their properties from text chunks. [COMPLETED]
        3. Deduplicate extracted instances and assign them unique identifiers. [COMPLETED]
        4. Identify and define relationships between the instances of entities. [CURRENT STEP]
        5. Create a structured knowledge graph using the extracted entities and relationships.

        YOUR TASK:
        Extract relationships between the entity instances that have already been identified in the text.

        GIVEN:
        1. Original text document
        2. List of entity types and their instances with unique IDs

        INSTRUCTIONS:
        - Identify relationships between entity instances
        - Focus on explicit relationships mentioned in the text
        - Be precise - only extract relationships clearly stated in the text
        - Use common relationship types like "works_at", "part_of", "authored", "connected_to", etc.
        - Ensure all relationships reference valid entity instances with their correct IDs
        - IMPORTANT: All entity IDs are integers, not strings

        ORIGINAL TEXT:
        {input_text}

        ENTITY INSTANCES:
        {entity_instances}

        FORMAT YOUR RESPONSE:
        Return a list of relationship objects, where each relationship contains:
        1. The relationship type/name (e.g., "works_at", "located_in")
        2. Source entity reference (entity type and instance ID as an integer)
        3. Destination entity reference (entity type and instance ID as an integer)

        {format_instructions}

        EXAMPLE RESPONSE:
        {{
          "relationships": [
            {{
              "relationship": "works_at",
              "source": {{"entity": "Person", "id": 1}},
              "destination": {{"entity": "Company", "id": 3}}
            }},
            {{
              "relationship": "located_in",
              "source": {{"entity": "Company", "id": 3}},
              "destination": {{"entity": "Location", "id": 76}}
            }}
          ]
        }}

        Begin your extraction now:
        """, 
        input_variables=["input_text", "entity_instances"], 
        partial_variables={"format_instructions": parser.get_format_instructions()}, 
    )

    # Build the chain
    chain = prompt | llm | parser

    return chain

In [33]:
def process_chunk_for_relationships(inputs, max_retries=3):
    """
    Process a single text chunk to extract relationships with retry logic.
    
    Args:
        inputs: Dictionary containing chunk, entity_instances, and other parameters
        max_retries: Maximum number of retry attempts
        
    Returns:
        dict: Extracted relationships from this chunk
    """
    chunk = inputs["chunk"]
    entity_instances = inputs["entity_instances"]
    chunk_id = chunk.metadata.get("chunk_id", 0)
    
    for attempt in range(max_retries):
        try:
            chain = create_relationship_extraction_chain()
            
            # Create a simplified version of entities for the prompt
            simplified_entities = []
            entity_id_map = {}
            
            for entity_data in entity_instances:
                entity_type = entity_data["Entity"]
                instances = []
                
                for instance_name, props in entity_data["Instances"].items():
                    instance_id = props.get("id")
                    if instance_id is not None:
                        # Ensure ID is an integer
                        instance_id = int(instance_id)
                        instances.append({
                            "name": instance_name,
                            "id": instance_id
                        })
                        entity_id_map[instance_id] = {"entity_type": entity_type}
                
                simplified_entities.append({
                    "Entity": entity_type,
                    "Instances": instances
                })
            
            result = chain.invoke({
                "input_text": chunk.page_content,
                "entity_instances": json.dumps(simplified_entities, indent=2)
            })
            
            # Validate the relationships against our entity ID map
            valid_relationships = []
            for rel in result.get("relationships", []):
                # Ensure IDs are integers
                source_id = int(rel["source"]["id"])
                dest_id = int(rel["destination"]["id"])
                
                if source_id in entity_id_map and dest_id in entity_id_map:
                    # Add chunk ID metadata for debugging/analysis
                    rel["chunk_id"] = chunk_id
                    valid_relationships.append(rel)
            
            logger.info(f"Successfully processed chunk {chunk_id}, found {len(valid_relationships)} relationships")
            return {"relationships": valid_relationships, "chunk_id": chunk_id}
            
        except Exception as e:
            if attempt < max_retries - 1:
                logger.warning(f"Attempt {attempt + 1} failed for chunk {chunk_id}. Retrying... Error: {str(e)[:100]}...")
            else:
                logger.error(f"All {max_retries} attempts failed for chunk {chunk_id}. Error: {str(e)[:100]}...")
                return {
                    "relationships": [],
                    "chunk_id": chunk_id,
                    "error": str(e)[:200]
                }

In [34]:
def merge_relationship_results(chunk_results):
    """
    Merge relationship extraction results from all chunks.
    
    Args:
        chunk_results: List of relationship extraction results from chunks
        
    Returns:
        dict: Merged relationship extraction results
    """
    # Create a set to track unique relationships and avoid duplicates
    unique_relationships = set()
    merged_relationships = []
    
    # Check for errors
    errors = [r for r in chunk_results if r.get("error")]
    if errors:
        logger.warning(f"{len(errors)} chunks had errors during relationship extraction")
    
    # Merge relationships from all chunks
    for result in chunk_results:
        relationships = result.get("relationships", [])
        
        for rel in relationships:
            # Create a tuple that uniquely identifies this relationship
            rel_key = (
                rel["relationship"], 
                rel["source"]["entity"], 
                int(rel["source"]["id"]),  # Ensure integer
                rel["destination"]["entity"],
                int(rel["destination"]["id"])  # Ensure integer
            )
            
            # Only add if we haven't seen this exact relationship before
            if rel_key not in unique_relationships:
                unique_relationships.add(rel_key)
                merged_relationships.append(rel)
    
    logger.info(f"Merged {len(merged_relationships)} unique relationships from all chunks")
    return {"relationships": merged_relationships}

In [28]:
def extract_relationships_parallel(input_text, entity_instances, max_concurrency=6):
    """
    Extract relationships from document chunks in parallel using RunnableParallel.
    
    Args:
        input_text: The full text document
        entity_instances: Complete list of deduplicated entity instances
        max_concurrency: Maximum number of chunks to process in parallel
        
    Returns:
        dict: Dictionary containing extracted relationships from all chunks
    """
    start_time = time.time()
    logger.info("Beginning relationship extraction (parallel)")
    
    # Split the document into chunks (reusing the function from Step 2)
    chunks = split_text_into_chunks(input_text, chunk_size=5000, chunk_overlap=500)
    logger.info(f"Document split into {len(chunks)} chunks for relationship extraction")
    
    # Prepare inputs for each chunk
    inputs = [{"chunk": chunk, "entity_instances": entity_instances} for chunk in chunks]
    
    # Create a RunnableLambda for chunk processing
    chunk_processor = RunnableLambda(process_chunk_for_relationships)
    
    # Process chunks in batches with progress tracking
    all_results = []
    batch_size = min(max_concurrency, len(chunks))
    
    # Use tqdm for progress tracking in batches
    with tqdm(total=len(chunks), desc="Processing chunks") as progress_bar:
        for i in range(0, len(inputs), batch_size):
            batch_inputs = inputs[i:i+batch_size]
            
            # Process the batch in parallel
            batch_results = chunk_processor.batch(batch_inputs, config={"max_concurrency": max_concurrency})
            all_results.extend(batch_results)
            
            # Update progress bar
            progress_bar.update(len(batch_inputs))
            
            # Log progress
            logger.info(f"Processed {min(i+batch_size, len(inputs))}/{len(inputs)} chunks ({min((i+batch_size)/len(inputs), 1.0)*100:.1f}%)")
    
    end_time = time.time()
    logger.info(f"Relationship extraction completed in {end_time - start_time:.2f} seconds")
    
    # Merge results from all chunks
    merged_results = merge_relationship_results(all_results)
    
    return merged_results

In [35]:
def process_step4():
    """
    Execute Step 4: Extract relationships between entity instances using parallel processing.
    
    Returns:
        dict: Dictionary containing extracted relationships
    """
    # Load data from Step 3
    logger.info("Loading deduplicated entities from Step 3")
    try:
        with open("deduplicated_entities_2.json", "r") as f:
            entity_instances = json.load(f)
    except FileNotFoundError:
        logger.error("Deduplicated entities file not found. Please complete Step 3 first.")
        return None
    
    # Load original text from PDF
    try:
        input_text = load_pdf_content("Cloud Computing Copy Lecture Notes.pdf")
        logger.info(f"Loaded original text: {len(input_text)} characters")
    except Exception as e:
        logger.error(f"Failed to load PDF content: {str(e)}")
        return None
    
    # Extract relationships using parallel processing
    relationships = extract_relationships_parallel(input_text, entity_instances, max_concurrency=6)
    
    # Save the relationships
    output_filepath = "entity_relationships.json"
    with open(output_filepath, "w") as f:
        json.dump(relationships, f, indent=2)
    
    logger.info(f"Extracted relationships saved to {output_filepath}")
    
    # Print summary
    print(f"\nRelationship extraction complete: {len(relationships['relationships'])} relationships found")
    print(f"Results saved to {output_filepath}")
    
    # Print sample results
    print("\nSample relationships:")
    for i, rel in enumerate(relationships["relationships"][:5]):  # Show up to 5 relationships
        print(f"\nRelationship {i+1}:")
        print(f"  Type: {rel['relationship']}")
        print(f"  Source: {rel['source']['entity']} - (ID: {rel['source']['id']})")
        print(f"  Destination: {rel['destination']['entity']} - (ID: {rel['destination']['id']})")
    
    if len(relationships["relationships"]) > 5:
        print(f"\n... and {len(relationships['relationships']) - 5} more relationships")
    
    return relationships

In [36]:
# Execute Step 4 with parallel processing
relationships = process_step4()

2025-05-21 13:21:02,784 - INFO - Loading deduplicated entities from Step 3
2025-05-21 13:21:05,123 - INFO - Loaded original text: 48269 characters
2025-05-21 13:21:05,124 - INFO - Beginning relationship extraction (parallel)
2025-05-21 13:21:05,125 - INFO - Splitting text into chunks (size=5000, overlap=500)
2025-05-21 13:21:05,130 - INFO - Text split into 11 chunks
2025-05-21 13:21:05,132 - INFO - Document split into 11 chunks for relationship extraction
Processing chunks:   0%|          | 0/11 [00:00<?, ?it/s]2025-05-21 13:22:04,673 - INFO - Successfully processed chunk 1, found 70 relationships
2025-05-21 13:22:09,979 - INFO - Successfully processed chunk 3, found 86 relationships
2025-05-21 13:22:21,726 - INFO - Successfully processed chunk 4, found 109 relationships
2025-05-21 13:22:39,737 - INFO - Successfully processed chunk 5, found 116 relationships
2025-05-21 13:23:07,745 - INFO - Successfully processed chunk 2, found 89 relationships
2025-05-21 13:24:55,496 - INFO - Successf


Relationship extraction complete: 1170 relationships found
Results saved to entity_relationships.json

Sample relationships:

Relationship 1:
  Type: implemented_by
  Source: Computing Concept - (ID: 1)
  Destination: System Architecture - (ID: 181)

Relationship 2:
  Type: enables
  Source: Computing Concept - (ID: 1)
  Destination: Computing Concept - (ID: 37)

Relationship 3:
  Type: presents_to
  Source: Computing Concept - (ID: 1)
  Destination: Role - (ID: 358)

Relationship 4:
  Type: required_for
  Source: Resource - (ID: 189)
  Destination: Computing Concept - (ID: 3)

Relationship 5:
  Type: bridges
  Source: Software Component - (ID: 408)
  Destination: Software Component - (ID: 413)

... and 1165 more relationships


In [38]:
import networkx as nx
from pyvis.network import Network
import json
import os
from IPython.display import IFrame, display

def load_graph_data():
    """Load entity and relationship data from JSON files"""
    # Load entities (nodes)
    with open("deduplicated_entities_2.json", "r") as f:
        entities = json.load(f)
    
    # Load relationships (edges)
    with open("entity_relationships.json", "r") as f:
        relationships = json.load(f)
    
    return entities, relationships["relationships"]

def create_knowledge_graph(entities, relationships, max_nodes=None):
    """Create a NetworkX graph from entities and relationships"""
    # Create a new graph
    G = nx.Graph()
    
    # Track entity types for coloring
    entity_types = set()
    entity_type_map = {}
    
    # Add nodes with properties
    node_count = 0
    for entity_group in entities:
        entity_type = entity_group["Entity"]
        entity_types.add(entity_type)
        
        for instance_name, props in entity_group["Instances"].items():
            if max_nodes and node_count >= max_nodes:
                break
                
            # Add node with properties
            node_id = props["id"]
            G.add_node(node_id, 
                       label=instance_name, 
                       title=f"<b>{entity_type}: {instance_name}</b><br>" + 
                             "<br>".join([f"{k}: {v}" for k, v in props.items() if k != "id"]),
                       group=entity_type,
                       entity_type=entity_type)
            
            entity_type_map[node_id] = entity_type
            node_count += 1
    
    # Add edges with relationship types
    edge_count = 0
    for rel in relationships:
        source_id = rel["source"]["id"]
        target_id = rel["destination"]["id"]
        rel_type = rel["relationship"]
        
        # Only add edges between nodes that exist in the graph
        if source_id in G.nodes and target_id in G.nodes:
            G.add_edge(source_id, target_id, 
                       title=rel_type,
                       label=rel_type[:10] + "..." if len(rel_type) > 10 else rel_type)
            edge_count += 1
    
    print(f"Created graph with {len(G.nodes)} nodes and {len(G.edges)} edges")
    return G, entity_types, entity_type_map

def visualize_with_pyvis(G, entity_types, output_path="knowledge_graph.html", height=800, width=1000):
    """Create an interactive visualization using PyVis"""
    # Set up colors for entity types
    color_palette = ["#3da4ab", "#f15025", "#094074", "#ffcc00", "#7b2cbf", "#219ebc", 
                     "#fb8b24", "#8338ec", "#06d6a0", "#ef476f", "#073b4c", "#118ab2"]
    
    color_map = {}
    for i, entity_type in enumerate(entity_types):
        color_map[entity_type] = color_palette[i % len(color_palette)]
    
    # Create PyVis network
    net = Network(height=f"{height}px", width=f"{width}px", 
                 bgcolor="#222222", font_color="white")
    
    # Configure physics for better layout
    net.barnes_hut(gravity=-80000, central_gravity=0.3, spring_length=250, spring_strength=0.001)
    
    # Add nodes and edges from NetworkX graph
    net.from_nx(G)
    
    # Set node colors based on entity type and configure nodes
    for node in net.nodes:
        entity_type = node["group"]
        node["color"] = color_map.get(entity_type, "#666666")
        node["size"] = 15  # Base size
        
        # Adjust size based on node connectivity (larger for more connected nodes)
        if "physics" not in node:
            node["physics"] = True
    
    # Configure edges
    for edge in net.edges:
        edge["color"] = {"color": "#ffffff", "opacity": 0.5}
        edge["width"] = 1.5
    
    # Add legend as HTML
    legend_html = """
    <div style="position: absolute; top: 10px; left: 10px; z-index: 1; background-color: rgba(0,0,0,0.7); padding: 10px; border-radius: 5px;">
        <h3 style="color: white; margin-bottom: 5px;">Entity Types</h3>
    """
    
    for entity_type, color in color_map.items():
        legend_html += f"""
        <div style="margin: 5px 0;">
            <span style="display: inline-block; width: 15px; height: 15px; border-radius: 50%; background-color: {color};"></span>
            <span style="color: white; margin-left: 5px;">{entity_type}</span>
        </div>
        """
    
    legend_html += """
    <div style="margin-top: 10px; color: white;">
        <p><b>Controls:</b><br>
        - Scroll to zoom<br>
        - Drag to pan<br>
        - Click node to focus</p>
    </div>
    </div>
    """
    
    # Add navigation buttons
    buttons_html = """
    <div style="position: absolute; bottom: 10px; left: 10px; z-index: 1;">
        <button id="zoom-in" style="background: #444; color: white; border: none; padding: 5px 10px; margin-right: 5px; cursor: pointer;">Zoom In</button>
        <button id="zoom-out" style="background: #444; color: white; border: none; padding: 5px 10px; margin-right: 5px; cursor: pointer;">Zoom Out</button>
        <button id="reset" style="background: #444; color: white; border: none; padding: 5px 10px; cursor: pointer;">Reset View</button>
    </div>
    
    <script>
        document.getElementById('zoom-in').addEventListener('click', function() {
            network.zoomIn();
        });
        document.getElementById('zoom-out').addEventListener('click', function() {
            network.zoomOut();
        });
        document.getElementById('reset').addEventListener('click', function() {
            network.fit();
        });
    </script>
    """
    
    # Customize the visualization
    net.set_options("""
    {
      "nodes": {
        "borderWidth": 2,
        "borderWidthSelected": 4,
        "font": {
          "size": 12,
          "face": "Tahoma",
          "strokeWidth": 3,
          "strokeColor": "#222222"
        },
        "shadow": true
      },
      "edges": {
        "arrows": {
          "to": {
            "enabled": true,
            "scaleFactor": 0.5
          }
        },
        "smooth": {
          "enabled": true,
          "type": "dynamic",
          "roundness": 0.5
        },
        "font": {
          "size": 10,
          "color": "#ffffff",
          "strokeWidth": 3,
          "strokeColor": "#222222"
        }
      },
      "physics": {
        "stabilization": {
          "iterations": 100
        }
      },
      "interaction": {
        "hover": true,
        "navigationButtons": true,
        "keyboard": {
          "enabled": true
        }
      }
    }
    """)
    
    # Save the visualization to an HTML file
    net.save_graph(output_path)
    
    # Add legend and buttons to the HTML file
    with open(output_path, "r") as f:
        html_content = f.read()
    
    # Inject legend and buttons HTML
    modified_html = html_content.replace("</body>", f"{legend_html}{buttons_html}</body>")
    
    with open(output_path, "w") as f:
        f.write(modified_html)
    
    print(f"Interactive visualization saved to {output_path}")
    
    # Display in notebook
    return IFrame(output_path, width=width, height=height)

def visualize_knowledge_graph():
    """Main function to visualize the knowledge graph"""
    # Load data
    entities, relationships = load_graph_data()
    
    # Create main visualization (limiting to a manageable number of nodes)
    G, entity_types, entity_type_map = create_knowledge_graph(entities, relationships, max_nodes=200)
    
    # Create interactive visualization - only generating the HTML file
    vis = visualize_with_pyvis(G, entity_types)
    
    # Display the interactive visualization
    return vis

# Run the visualization
interactive_graph = visualize_knowledge_graph()
display(interactive_graph)

Created graph with 200 nodes and 189 edges
Interactive visualization saved to knowledge_graph.html


# Step 5

In [7]:


# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# ApertureDB connection parameters
db_host = os.getenv("APERTUREDB_HOST")
db_password = os.getenv("APERTUREDB_PASSWORD")
db_user = "admin"

# Create ApertureDB connector
def create_db_connection():
    return Connector.Connector(
        host=db_host,
        user=db_user,
        password=db_password
    )

In [57]:
def load_entities():
    """Load deduplicated entities from JSON file"""
    with open("deduplicated_entities_2.json", "r") as f:
        entities = json.load(f)
    return entities

def ensure_indexes(client, entity_types):
    """Create indexes for each entity type's id property"""
    for entity_type in entity_types:
        # Create index for the "id" property for this entity type
        query = [{
            "CreateIndex": {
                "class_name": entity_type,
                "property_name": "id",
                "index_type": "integer"  # Using integer index for ID values
            }
        }]
        
        try:
            response, _ = client.query(query)
            if "CreateIndex" in response[0] and response[0]["CreateIndex"]["status"] == 0:
                logger.info(f"Created index for {entity_type}.id")
            else:
                logger.warning(f"Index creation response: {response[0]}")
        except Exception as e:
            logger.warning(f"Index for {entity_type}.id may already exist: {str(e)}")

In [58]:
def process_entity(args):
    """
    Process and insert a single entity instance into ApertureDB
    
    Args:
        args: Tuple containing (entity_type, instance_name, properties)
        
    Returns:
        dict: Result of the operation
    """
    entity_type, instance_name, properties = args
    
    try:
        # Create a new connection for this thread
        client = create_db_connection()
        
        # Extract the integer ID
        entity_id = properties.get("id")
        if not entity_id:
            return {
                "status": "error", 
                "entity_type": entity_type, 
                "name": instance_name, 
                "error": "Missing ID"
            }
        
        # Convert all property values to strings - keep the ID as an integer
        string_properties = {}
        for key, value in properties.items():
            if value is not None:
                if key == "id":
                    # Keep ID as an integer
                    string_properties[key] = int(value)
                else:
                    # Convert other properties to strings
                    string_properties[key] = str(value)
        
        # Ensure name property is set
        if "name" not in string_properties and instance_name:
            string_properties["name"] = str(instance_name)
        
        # Construct query for adding the entity - ID is now in properties
        query = [{
            "AddEntity": {
                "class": entity_type,
                "properties": string_properties,
                "if_not_found": {
                    "id": ["==", entity_id]  # Use id for deduplication
                }
            }
        }]
        
        # Execute query
        response, _ = client.query(query)
        
        # Check response
        if "AddEntity" in response[0] and response[0]["AddEntity"]["status"] == 0:
            return {
                "status": "success", 
                "entity_type": entity_type, 
                "id": entity_id,
                "name": instance_name
            }
        else:
            return {
                "status": "error", 
                "entity_type": entity_type, 
                "id": entity_id,
                "name": instance_name, 
                "error": str(response)
            }
    
    except Exception as e:
        return {
            "status": "exception", 
            "entity_type": entity_type, 
            "id": properties.get("id", ""),
            "name": instance_name, 
            "error": str(e)
        }

In [59]:
def insert_entities_parallel(entities, max_workers=10, batch_size=100):
    """
    Insert entities into ApertureDB using parallel processing
    
    Args:
        entities: List of entity data from deduplicated_entities_2.json
        max_workers: Maximum number of parallel workers
        batch_size: Entities to process in each batch
        
    Returns:
        tuple: (success_count, error_count, total_time)
    """
    start_time = time.time()
    logger.info(f"Beginning entity insertion with {max_workers} workers")
    
    # Extract unique entity types
    entity_types = [entity["Entity"] for entity in entities]
    
    # Create main client and ensure indexes
    main_client = create_db_connection()
    ensure_indexes(main_client, entity_types)
    
    # Prepare entity instances for processing
    entity_instances = []
    for entity_data in entities:
        entity_type = entity_data["Entity"]
        instances = entity_data["Instances"]
        
        for instance_name, properties in instances.items():
            entity_instances.append((entity_type, instance_name, properties))
    
    total_entities = len(entity_instances)
    logger.info(f"Found {total_entities} entities to insert")
    
    # Process entities in batches to avoid overwhelming the system
    success_count = 0
    error_count = 0
    
    # Create a map to track UUID to ApertureDB _uniqueid mapping
    uuid_to_uniqueid_map = {}
    
    with tqdm(total=total_entities, desc="Inserting entities") as progress:
        for i in range(0, total_entities, batch_size):
            batch = entity_instances[i:i+batch_size]
            
            # Process batch with parallel workers
            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                results = list(executor.map(process_entity, batch))
            
            # Update counts
            batch_success = sum(1 for r in results if r["status"] == "success")
            batch_errors = len(batch) - batch_success
            
            success_count += batch_success
            error_count += batch_errors
            
            # Update progress
            progress.update(len(batch))
            
            # Log errors if any
            if batch_errors > 0:
                errors = [r for r in results if r["status"] != "success"]
                for error in errors[:5]:  # Show first 5 errors
                    logger.warning(f"Error: {error['entity_type']} - {error['name']}: {error.get('error', 'Unknown error')}")
                
                if len(errors) > 5:
                    logger.warning(f"... and {len(errors) - 5} more errors")
            
            logger.info(f"Processed {min(i+batch_size, total_entities)}/{total_entities} entities - Success: {batch_success}, Errors: {batch_errors}")
    
    end_time = time.time()
    total_time = end_time - start_time
    
    logger.info(f"Entity insertion completed in {total_time:.2f} seconds")
    logger.info(f"Total: {total_entities}, Success: {success_count}, Errors: {error_count}")
    
    # Save the UUID mapping file for later use in creating connections
    with open("uuid_mapping.json", "w") as f:
        json.dump(uuid_to_uniqueid_map, f, indent=2)
    
    return success_count, error_count, total_time

In [60]:
def verify_entity_insertion():
    """Verify that entities were properly inserted by checking counts"""
    client = create_db_connection()
    
    # Load original entities for comparison
    entities = load_entities()
    
    # Check a few entity types
    check_types = [entities[i]["Entity"] for i in range(min(3, len(entities)))]
    
    print("\n----- Verification -----")
    for entity_type in check_types:
        # Count entities of this type in ApertureDB
        query = [{
            "FindEntity": {
                "with_class": entity_type,
                "results": {
                    "count": True,
                    "list": ["id"]  # Retrieve id values to verify they exist
                }
            }
        }]
        
        try:
            response, _ = client.query(query)
            if "FindEntity" in response[0] and response[0]["FindEntity"]["status"] == 0:
                count = response[0]["FindEntity"].get("count", 0)
                print(f"{entity_type}: {count} entities found in database")
                
                # If we got entities back, check a few id values
                if "entities" in response[0]["FindEntity"] and len(response[0]["FindEntity"]["entities"]) > 0:
                    sample = response[0]["FindEntity"]["entities"][:2]  # Show first 2 entities
                    print(f"  Sample id values: {[e.get('properties', {}).get('id', 'missing') for e in sample]}")
                
            else:
                print(f"{entity_type}: Failed to get count - {response[0]}")
        except Exception as e:
            print(f"{entity_type}: Error during verification - {str(e)}")

In [61]:
def process_step5():
    """Execute Step 5: Insert all entities into ApertureDB"""
    logger.info("Starting Step 5: Entity insertion into ApertureDB")
    
    # Load entities from JSON
    entities = load_entities()
    if not entities:
        logger.error("Failed to load entities from deduplicated_entities_2.json")
        return
    
    # Count total entities for reporting
    total_instances = sum(len(entity["Instances"]) for entity in entities)
    logger.info(f"Loaded {total_instances} entities across {len(entities)} entity types")
    
    # Insert entities in parallel
    success, errors, duration = insert_entities_parallel(
        entities,
        max_workers=10,  # Adjust based on your system capabilities and API limits
        batch_size=50    # Adjust based on entity complexity and API limits
    )
    
    # Report results
    print("\n----- Entity Insertion Results -----")
    print(f"Total entities: {total_instances}")
    print(f"Successfully inserted: {success}")
    print(f"Errors: {errors}")
    print(f"Total time: {duration:.2f} seconds")
    
    # Verify some data was inserted by checking a few entity types
    if success > 0:
        verify_entity_insertion()
    
    return success, errors, duration

def verify_entity_insertion():
    """Verify that entities were properly inserted by checking counts"""
    client = create_db_connection()
    
    # Load original entities for comparison
    entities = load_entities()
    
    # Check a few entity types
    check_types = [entities[i]["Entity"] for i in range(min(3, len(entities)))]
    
    print("\n----- Verification -----")
    for entity_type in check_types:
        # Count entities of this type in ApertureDB
        query = [{
            "FindEntity": {
                "with_class": entity_type,
                "results": {
                    "count": True,
                    "list": ["_ref"]  # Also retrieve _ref values to verify they exist
                }
            }
        }]
        
        try:
            response, _ = client.query(query)
            if "FindEntity" in response[0] and response[0]["FindEntity"]["status"] == 0:
                count = response[0]["FindEntity"].get("count", 0)
                print(f"{entity_type}: {count} entities found in database")
                
                # If we got entities back, check a few _ref values
                if "entities" in response[0]["FindEntity"] and len(response[0]["FindEntity"]["entities"]) > 0:
                    sample = response[0]["FindEntity"]["entities"][:2]  # Show first 2 entities
                    print(f"  Sample _ref values: {[e.get('_ref', 'missing') for e in sample]}")
                
            else:
                print(f"{entity_type}: Failed to get count - {response[0]}")
        except Exception as e:
            print(f"{entity_type}: Error during verification - {str(e)}")

In [62]:
# Execute Step 5
success, errors, duration = process_step5()

2025-05-21 14:02:41,776 - INFO - Starting Step 5: Entity insertion into ApertureDB
2025-05-21 14:02:41,780 - INFO - Loaded 538 entities across 12 entity types
2025-05-21 14:02:41,782 - INFO - Beginning entity insertion with 10 workers
2025-05-21 14:02:46,519 - INFO - Found 538 entities to insert
Inserting entities:   9%|▉         | 50/538 [00:12<02:01,  4.03it/s]2025-05-21 14:02:58,948 - INFO - Processed 50/538 entities - Success: 50, Errors: 0
Inserting entities:  19%|█▊        | 100/538 [00:28<02:07,  3.45it/s]2025-05-21 14:03:14,934 - INFO - Processed 100/538 entities - Success: 50, Errors: 0
Inserting entities:  28%|██▊       | 150/538 [00:41<01:48,  3.58it/s]2025-05-21 14:03:28,208 - INFO - Processed 150/538 entities - Success: 50, Errors: 0
Inserting entities:  37%|███▋      | 200/538 [00:54<01:30,  3.72it/s]2025-05-21 14:03:40,837 - INFO - Processed 200/538 entities - Success: 50, Errors: 0
Inserting entities:  46%|████▋     | 250/538 [01:06<01:15,  3.83it/s]2025-05-21 14:03:53,


----- Entity Insertion Results -----
Total entities: 538
Successfully inserted: 538
Errors: 0
Total time: 140.98 seconds

----- Verification -----
Computing Concept: 157 entities found in database
  Sample _ref values: [None, None]
System Architecture: 31 entities found in database
  Sample _ref values: [None, None]
Resource: 92 entities found in database
  Sample _ref values: [None, None]


# Step 6

In [8]:
def process_relationship(args):
    """
    Process and create a single relationship between entities in ApertureDB
    
    Args:
        args: Tuple containing (relationship_data, total_relationships, index)
        
    Returns:
        dict: Result of the operation
    """
    relationship, total_relationships, index = args
    
    try:
        # Create a connection for this thread
        client = create_db_connection()
        
        # Extract relationship information
        rel_type = relationship["relationship"]
        source_entity_type = relationship["source"]["entity"]
        source_entity_id = relationship["source"]["id"]
        dest_entity_type = relationship["destination"]["entity"]
        dest_entity_id = relationship["destination"]["id"]
        
        # Create a query that:
        # 1. Finds the source entity by class and ID
        # 2. Finds the destination entity by class and ID
        # 3. Creates a connection between them with relationship type as class
        query = [
            # Find source entity and assign temporary reference 1
            {
                "FindEntity": {
                    "with_class": source_entity_type,
                    "constraints": {
                        "id": ["==", source_entity_id]
                    },
                    "_ref": 1,  # Assign temporary reference 1
                    "results": {
                        "count": True,
                        "list": ["id"]  # Just need minimal data
                    }
                }
            },
            # Find destination entity and assign temporary reference 2
            {
                "FindEntity": {
                    "with_class": dest_entity_type,
                    "constraints": {
                        "id": ["==", dest_entity_id]
                    },
                    "_ref": 2,  # Assign temporary reference 2
                    "results": {
                        "count": True,
                        "list": ["id"]  # Just need minimal data
                    }
                }
            },
            # Create connection from source to destination
            {
                "AddConnection": {
                    "class": rel_type,  # Use relationship type as connection class
                    "src": 1,  # Reference to source entity
                    "dst": 2,  # Reference to destination entity
                    # Optional properties for the connection could be added here
                    "properties": {
                        "created_at": time.strftime("%Y-%m-%d %H:%M:%S")
                    }
                }
            }
        ]
        
        # Execute the query
        response, _ = client.query(query)
        
        # Check for success
        source_found = False
        dest_found = False
        connection_created = False
        
        if len(response) >= 3:
            # Check if source entity was found
            if "FindEntity" in response[0] and response[0]["FindEntity"]["status"] == 0:
                source_count = response[0]["FindEntity"].get("count", 0)
                source_found = source_count > 0
            
            # Check if destination entity was found
            if "FindEntity" in response[1] and response[1]["FindEntity"]["status"] == 0:
                dest_count = response[1]["FindEntity"].get("count", 0)
                dest_found = dest_count > 0
            
            # Check if connection was created
            if "AddConnection" in response[2] and response[2]["AddConnection"]["status"] == 0:
                connection_created = True
        
        # Log progress periodically
        if index % 10 == 0 or index == total_relationships - 1:
            logger.info(f"Processed relationship {index + 1}/{total_relationships} ({(index + 1) / total_relationships * 100:.1f}%)")
            
        if source_found and dest_found and connection_created:
            return {
                "status": "success",
                "relationship": rel_type,
                "source": f"{source_entity_type}:{source_entity_id}",
                "destination": f"{dest_entity_type}:{dest_entity_id}"
            }
        else:
            error_details = {
                "source_found": source_found,
                "dest_found": dest_found,
                "connection_created": connection_created
            }
            return {
                "status": "error",
                "relationship": rel_type,
                "source": f"{source_entity_type}:{source_entity_id}",
                "destination": f"{dest_entity_type}:{dest_entity_id}",
                "error": f"Failed to create connection: {error_details}"
            }
            
    except Exception as e:
        return {
            "status": "exception",
            "relationship": rel_type if 'rel_type' in locals() else "unknown",
            "source": f"{source_entity_type}:{source_entity_id}" if 'source_entity_type' in locals() else "unknown",
            "destination": f"{dest_entity_type}:{dest_entity_id}" if 'dest_entity_type' in locals() else "unknown",
            "error": str(e)
        }

def create_relationships_parallel(relationships, max_workers=10, batch_size=20):
    """
    Create relationships between entities in ApertureDB using parallel processing
    
    Args:
        relationships: List of relationships from entity_relationships.json
        max_workers: Maximum number of parallel workers
        batch_size: Relationships to process in each batch
        
    Returns:
        tuple: (success_count, error_count, total_time)
    """
    start_time = time.time()
    logger.info(f"Beginning relationship creation with {max_workers} workers")
    
    total_relationships = len(relationships)
    logger.info(f"Found {total_relationships} relationships to create")
    
    # Process relationships in batches
    success_count = 0
    error_count = 0
    
    # Prepare relationship data with indices
    relationship_data = [(rel, total_relationships, idx) for idx, rel in enumerate(relationships)]
    
    with tqdm(total=total_relationships, desc="Creating relationships") as progress:
        for i in range(0, total_relationships, batch_size):
            batch = relationship_data[i:i+batch_size]
            
            # Process batch with parallel workers
            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                results = list(executor.map(process_relationship, batch))
            
            # Update counts
            batch_success = sum(1 for r in results if r["status"] == "success")
            batch_errors = len(batch) - batch_success
            
            success_count += batch_success
            error_count += batch_errors
            
            # Update progress
            progress.update(len(batch))
            
            # Log errors if any
            if batch_errors > 0:
                errors = [r for r in results if r["status"] != "success"]
                for error in errors[:3]:  # Show first 3 errors
                    logger.warning(f"Error: {error.get('relationship', 'unknown')} - {error.get('source', 'unknown')} → {error.get('destination', 'unknown')}: {error.get('error', 'Unknown error')}")
                
                if len(errors) > 3:
                    logger.warning(f"... and {len(errors) - 3} more errors")
    
    end_time = time.time()
    total_time = end_time - start_time
    
    logger.info(f"Relationship creation completed in {total_time:.2f} seconds")
    logger.info(f"Total: {total_relationships}, Success: {success_count}, Errors: {error_count}")
    
    return success_count, error_count, total_time

def verify_relationships():
    """Verify that relationships were created by checking counts"""
    client = create_db_connection()
    
    # Load relationships data
    with open("entity_relationships.json", "r") as f:
        relationships_data = json.load(f)
        relationships = relationships_data.get("relationships", [])
    
    # Get list of unique relationship types
    relationship_types = set(rel["relationship"] for rel in relationships)
    sample_types = list(relationship_types)[:5]  # Check up to 5 types
    
    print("\n----- Relationship Verification -----")
    for rel_type in sample_types:
        # Count connections of this type in ApertureDB
        query = [{
            "FindConnection": {
                "with_class": rel_type,
                "results": {
                    "count": True
                }
            }
        }]
        
        try:
            response, _ = client.query(query)
            if "FindConnection" in response[0] and response[0]["FindConnection"]["status"] == 0:
                count = response[0]["FindConnection"].get("count", 0)
                print(f"Relationship '{rel_type}': {count} connections found in database")
            else:
                print(f"Relationship '{rel_type}': Failed to get count - {response[0]}")
        except Exception as e:
            print(f"Relationship '{rel_type}': Error during verification - {str(e)}")

def process_step5_2():
    """Execute Step 5.2: Create relationships between entities in ApertureDB"""
    logger.info("Starting Step 5.2: Relationship creation in ApertureDB")
    
    # Load relationships from JSON
    try:
        with open("entity_relationships.json", "r") as f:
            relationships_data = json.load(f)
            relationships = relationships_data.get("relationships", [])
    except FileNotFoundError:
        logger.error("entity_relationships.json not found. Please complete Step 4 first.")
        return None
    
    if not relationships:
        logger.error("No relationships found in entity_relationships.json")
        return None
    
    # Create relationships in parallel
    success, errors, duration = create_relationships_parallel(
        relationships,
        max_workers=10,  # Adjust based on system capabilities and API limits
        batch_size=20    # Smaller batch size for more complex operations
    )
    
    # Report results
    print("\n----- Relationship Creation Results -----")
    print(f"Total relationships: {len(relationships)}")
    print(f"Successfully created: {success}")
    print(f"Errors: {errors}")
    print(f"Total time: {duration:.2f} seconds")
    
    # Verify some relationships were created
    if success > 0:
        verify_relationships()
    
    return success, errors, duration

# Execute Step 5.2
success_rel, errors_rel, duration_rel = process_step5_2()

2025-05-21 14:50:39,113 - INFO - Starting Step 5.2: Relationship creation in ApertureDB
2025-05-21 14:50:39,128 - INFO - Beginning relationship creation with 10 workers
2025-05-21 14:50:39,131 - INFO - Found 1170 relationships to create
Creating relationships:   0%|          | 0/1170 [00:00<?, ?it/s]

2025-05-21 14:50:42,087 - INFO - Processed relationship 1/1170 (0.1%)
2025-05-21 14:50:43,983 - INFO - Processed relationship 11/1170 (0.9%)
Creating relationships:   2%|▏         | 20/1170 [00:05<05:05,  3.77it/s]2025-05-21 14:50:46,848 - INFO - Processed relationship 21/1170 (1.8%)
2025-05-21 14:50:48,965 - INFO - Processed relationship 31/1170 (2.6%)
Creating relationships:   3%|▎         | 40/1170 [00:10<04:50,  3.89it/s]2025-05-21 14:50:52,165 - INFO - Processed relationship 41/1170 (3.5%)
2025-05-21 14:50:54,059 - INFO - Processed relationship 51/1170 (4.4%)
Creating relationships:   5%|▌         | 60/1170 [00:15<04:41,  3.94it/s]2025-05-21 14:50:56,867 - INFO - Processed relationship 61/1170 (5.2%)
2025-05-21 14:50:59,063 - INFO - Processed relationship 71/1170 (6.1%)
Creating relationships:   7%|▋         | 80/1170 [00:20<04:36,  3.95it/s]2025-05-21 14:51:01,967 - INFO - Processed relationship 81/1170 (6.9%)
2025-05-21 14:51:04,129 - INFO - Processed relationship 91/1170 (7.8%)


----- Relationship Creation Results -----
Total relationships: 1170
Successfully created: 1154
Errors: 16
Total time: 365.48 seconds

----- Relationship Verification -----
Relationship 'depicts': 3 connections found in database
Relationship 'helps_with': 1 connections found in database
Relationship 'allows_to_operate': 1 connections found in database
Relationship 'enforces_for': 2 connections found in database
Relationship 'utilized_for': 1 connections found in database
