In [36]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.runnables import RunnableParallel, RunnableLambda
from pydantic import BaseModel, Field
from typing import Dict, List, Any, Optional
from collections import Counter
from difflib import SequenceMatcher
import os
import json
import asyncio
import uuid
import logging
from tqdm.notebook import tqdm
import time
from dotenv import load_dotenv


In [14]:
# Configure logging once for the whole notebook; later cells report progress via `logger`.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

In [15]:
# Merge variables from a local .env file into the process environment,
# then read the Google API key (None when it is not set).
load_dotenv()
google_api_key = os.environ.get("GOOGLE_API_KEY")

In [16]:
# Shared chat model: both extraction chains below pipe their prompt into this object.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-preview-04-17",
    max_retries=2,
    timeout=None,
    max_tokens=None,
    temperature=0,
)

In [17]:
def load_pdf_content(pdf_path, return_single_string=True, extract_metadata=False):
    """
    Load and parse a PDF document, returning its text content.

    Args:
        pdf_path (str): Path to the PDF file
        return_single_string (bool): If True, returns the entire PDF content as a single string.
                                    If False, returns a list of strings (one per page).
        extract_metadata (bool): If True, returns metadata along with content

    Returns:
        If return_single_string is True and extract_metadata is False:
            str: The entire text content of the PDF ("" when the loader yields no documents)
        If return_single_string is False and extract_metadata is False:
            list: List of strings, one for each page
        If extract_metadata is True:
            tuple: (content, metadata). metadata is a dict in single-string mode and a
            list of dicts (one per page) otherwise.

    Raises:
        FileNotFoundError: If nothing exists at ``pdf_path``.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found at: {pdf_path}")

    # PyPDFLoader only accepts mode="single" (whole file as one document) or
    # mode="page" (one document per page). "elements" belongs to the Unstructured
    # loaders and is rejected here, which broke the per-page branch.
    mode = "single" if return_single_string else "page"
    docs = PyPDFLoader(pdf_path, mode=mode).load()

    if return_single_string:
        # mode="single" yields at most one document holding all pages.
        content = docs[0].page_content if docs else ""
        metadata = docs[0].metadata if docs else {}
    else:
        # mode="page" yields one document per page, in page order.
        content = [page.page_content for page in docs]
        metadata = [page.metadata for page in docs]

    return (content, metadata) if extract_metadata else content

In [18]:
# Source document for the whole pipeline, loaded as one string.
PDF_PATH = "Cloud Computing Copy Lecture Notes.pdf"
doc = load_pdf_content(PDF_PATH)

In [19]:
# Sanity check: show only the first 1000 characters of the loaded document
preview = doc[:1000]
print(preview)

Cloud Computing Lecture Notes 
Distributed Computing/Systems 
Definition: 
Distributed computing refers to a system where computing resources are distributed 
across multiple locations rather than being centralized in a single system. This enables 
task distribution and efficient resource utilization. 
Why Use Distributed Systems? 
• Scalability Issues: Traditional computing faces bottlenecks due to hardware 
limitations, whereas distributed systems allow for hardware scaling. 
• Connected Devices: In a networked system, connected devices communicate, but 
this does not necessarily make them distributed. 
• IoT (Internet of Things): IoT is one of the largest examples of distributed computing. 
• Multi-layered System Design: Distributed computing enables systems to function 
in multiple layers, with each layer acting as a distributed entity. 
• User Perspective: Although the system consists of multiple machines, distributed 
computing presents a unified system to users. 
 
Parallel Comp

# Step 1: Extract entity types and their properties (entity schema)

In [20]:
# Define Pydantic model for entity schema parser
class EntitySchema(BaseModel):
    """Entity types and their properties."""
    # Output schema for Step 1; passed to JsonOutputParser(pydantic_object=...) when
    # the entity extraction chain is built.
    # NOTE(review): the docstring above and the Field description below are presumably
    # embedded in the parser's format instructions (i.e. they are prompt text) —
    # confirm before rewording either of them.

    # Maps each entity type name (e.g. "Person") to the list of property names it may have.
    entities: Dict[str, List[str]] = Field(
        description="Dictionary mapping entity types to their possible properties"
    )

In [21]:
# Create entity extraction chain
def create_entity_extraction_chain():
    """
    Assemble the Step 1 chain: prompt -> shared LLM -> JSON output parser.

    Returns:
        A runnable that expects ``{"input": <text>}``. The parser's format
        instructions (derived from EntitySchema) are pre-filled into the prompt.
    """
    schema_parser = JsonOutputParser(pydantic_object=EntitySchema)

    # Literal braces in the JSON example are doubled so the f-string style
    # template does not treat them as placeholders.
    extraction_template = """
    You are the first agent in a multi-step workflow to build a Knowledge Graph from raw text.

    Workflow Steps Overview:
    1. Extract high-level entity types and their properties from the text. [CURRENT STEP]
    2. Extract specific instances of entities and their properties based on the identified types.
    3. Deduplicate extracted instances and assign them unique identifiers.
    4. Identify and define relationships between the instances of entities.
    5. Create a structured knowledge graph using the extracted entities and relationships.

    You are the FIRST agent in this workflow.


    YOUR TASK:
    - Identify high-level, general entity types (e.g., Person, Company, Location, Event).
    - For each entity type, list all the possible (available) properties it might have.
    - Focus on information that would be useful for structuring a knowledge graph.
    - Stay general — do not extract specific names, examples, or relationships.
    - Avoid unnecessary details or context-specific examples.

    FORMAT:
    - Return a valid JSON object.
    - Keys = entity types (strings).
    - Values = lists of property names (strings).
    - Use double quotes for all keys and string values.
    - No extra explanation, text, or markdown formatting.

    EXAMPLES:
    {{
        "Person": ["name", "age", "email", "address"],
        "Company": ["name", "industry", "founded_date"],
        "Location": ["name", "coordinates", "population"]
    }}

    Text to process: {input}

    {format_instructions}

    Response:
    """

    extraction_prompt = PromptTemplate(
        template=extraction_template,
        input_variables=["input"],
        partial_variables={"format_instructions": schema_parser.get_format_instructions()},
    )

    return extraction_prompt | llm | schema_parser


In [22]:
# Function to extract entities from text with retry logic
def _normalize_entity_schema(result):
    """Pull the ``{entity_type: [properties]}`` mapping out of a parsed LLM reply."""
    if not isinstance(result, dict):
        raise ValueError(f"Expected a JSON object, got {type(result).__name__}")

    if "entities" in result:
        entities = result["entities"]
        if not isinstance(entities, dict):
            raise ValueError(f"'entities' must be a JSON object, got {type(entities).__name__}")
        return entities

    # The prompt's own example shows a flat {type: [props]} object without the
    # "entities" wrapper; accept that shape instead of silently discarding it.
    if result and all(isinstance(props, list) for props in result.values()):
        return result

    return {}


def extract_entity_schema(text, max_retries=3):
    """
    Extract entity types and their properties from input text with retry logic.

    Args:
        text (str): Input text to analyze
        max_retries (int): Maximum number of attempts; values below 1 make no
            LLM call and yield the empty fallback.

    Returns:
        dict: Dictionary mapping entity types to lists of properties. Always a
        dict; empty when every attempt failed or nothing usable came back.
    """
    chain = create_entity_extraction_chain()

    for attempt in range(max_retries):
        try:
            # A malformed reply raises here and is retried like any other failure.
            return _normalize_entity_schema(chain.invoke({"input": text}))
        except Exception as e:
            if attempt < max_retries - 1:
                print(f"Attempt {attempt + 1} failed. Retrying... Error: {str(e)[:100]}...")
            else:
                print(f"All {max_retries} attempts failed. Last error: {str(e)[:100]}...")

    # Reached after the final failed attempt, or immediately when max_retries < 1
    # (the original fell through and returned None in that case).
    return {}

In [23]:
# Smoke-test the Step 1 extractor on a short hand-written paragraph.
sample_text = """
John Doe, a 35-year-old software engineer, works at Google in Mountain View.
He graduated from MIT with a degree in Computer Science and has been with the company for 5 years.
Google, founded in 1998, is a technology company specializing in internet services and products.
John lives in San Francisco and commutes to work daily. His email is john.doe@example.com.
"""

entities = extract_entity_schema(sample_text)
print(entities)

# One heading per entity type, followed by a bullet per property.
print("Extracted Entity Schema:")
for entity_type in entities:
    properties = entities[entity_type]
    print(f"\n{entity_type}:")
    for prop in properties:
        print(f"- {prop}")

{'Person': ['name', 'age', 'occupation', 'employer', 'education', 'residence', 'email', 'tenure'], 'Company': ['name', 'location', 'founded_date', 'industry', 'specialization'], 'Location': ['name'], 'Educational Institution': ['name'], 'Field of Study': ['name']}
Extracted Entity Schema:

Person:
- name
- age
- occupation
- employer
- education
- residence
- email
- tenure

Company:
- name
- location
- founded_date
- industry
- specialization

Location:
- name

Educational Institution:
- name

Field of Study:
- name


In [24]:
# Run Step 1 on the real document; this rebinds `entities` from the sample run above.
entities = extract_entity_schema(doc)

# One heading per entity type, followed by a bullet per property.
print("\nExtracted Entity Schema:")
for entity_type in entities:
    properties = entities[entity_type]
    print(f"\n{entity_type}:")
    for prop in properties:
        print(f"- {prop}")


Extracted Entity Schema:

Computing Concept:
- definition
- characteristics
- use_cases
- limitations
- aspects
- related_concepts

System Architecture:
- description
- characteristics
- components
- use_cases
- comparison_aspects

Platform:
- overview
- purpose
- architecture
- components
- service_offerings
- deployment_aspects
- management_aspects
- security_aspects
- scalability_aspects
- reliability_aspects
- cost_aspects
- features

Resource:
- description
- characteristics
- management_aspects
- lifecycle_aspects
- allocation_aspects
- pricing_aspects
- type

Storage Type:
- description
- characteristics
- use_cases
- pricing_models
- management_aspects

Database Type:
- description
- characteristics
- use_cases
- management_aspects
- migration_aspects

Network Entity:
- definition
- purpose
- characteristics
- components
- management_aspects
- security_aspects
- type

Service Model:
- definition
- characteristics
- responsibility_division

Deployment Model:
- definition
- char

In [25]:
  # Save entity schema for next step
# Persist the schema under an "entities" key; Step 2 reads it back from this file.
schema_payload = {"entities": entities}
with open("entity_schema.json", "w") as schema_file:
    json.dump(schema_payload, schema_file, indent=2)

# Step 2: Extract entity instances from text chunks

In [26]:
# Function to split text into chunks
def split_text_into_chunks(text, chunk_size=5000, chunk_overlap=500):
    """
    Break ``text`` into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        text: The input text to be split
        chunk_size: Maximum size of each chunk in characters
        chunk_overlap: Overlap between consecutive chunks

    Returns:
        list: Document objects; each one's metadata gains ``chunk_id`` (its
        position in the list) and ``total_chunks``.
    """
    logger.info(f"Splitting text into chunks (size={chunk_size}, overlap={chunk_overlap})")

    splitter_options = {
        # Separators are tried in order: paragraph, line, sentence, word, character.
        "separators": ["\n\n", "\n", ". ", " ", ""],
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "keep_separator": True,
        "add_start_index": True,  # splitter records each chunk's start position
    }
    documents = RecursiveCharacterTextSplitter(**splitter_options).create_documents([text])

    # Tag every chunk with its position and the overall chunk count.
    total = len(documents)
    for position in range(total):
        documents[position].metadata.update(chunk_id=position, total_chunks=total)

    logger.info(f"Text split into {total} chunks")
    return documents

In [28]:
# Define Pydantic models for entity instance extraction
class EntityInstances(BaseModel):
    """Instances of a specific entity type."""
    # One entry of a chunk's extraction result: a single entity type plus the instances found for it.
    # NOTE(review): the docstring and Field descriptions presumably end up in the
    # parser's format instructions sent to the LLM — confirm before rewording them.

    # Entity type name; the capitalized field names match the keys used in the
    # prompt's example response and read back when chunk results are merged.
    Entity: str = Field(description="The entity type name")
    # instance name -> {property name: value}
    Instances: Dict[str, Dict[str, Any]] = Field(
        description="Dictionary mapping instance names to their properties"
    )

class ChunkExtractionResult(BaseModel):
    """Result of entity extraction from a single chunk."""
    # Top-level output schema for Step 2; passed to JsonOutputParser(pydantic_object=...)
    # when the instance extraction chain is built.
    # NOTE(review): the docstring and Field description presumably end up in the
    # parser's format instructions sent to the LLM — confirm before rewording them.

    # One EntityInstances entry per entity type found in the chunk.
    entities: List[EntityInstances] = Field(
        description="List of entity types and their instances found in this chunk"
    )

In [29]:
# Function to create entity instance extraction chain
def create_entity_instance_extraction_chain():
    """
    Assemble the Step 2 chain: prompt -> shared LLM -> JSON output parser.

    Returns:
        Chain: A runnable that is invoked with ``chunk`` (text) and
        ``entity_schema`` (JSON string). The parser's format instructions
        (derived from ChunkExtractionResult) are pre-filled into the prompt.
    """
    result_parser = JsonOutputParser(pydantic_object=ChunkExtractionResult)

    # Literal braces in the JSON example are doubled so the f-string style
    # template does not treat them as placeholders.
    instance_template = """
        You are part of a multi-step workflow to build a Knowledge Graph from raw text.

        Workflow Steps Overview:
        1. Extract high-level entity types and their properties from the text. [COMPLETED]
        2. Extract specific instances of entities and their properties from text chunks. [CURRENT STEP]
        3. Deduplicate extracted instances and assign them unique identifiers.
        4. Identify and define relationships between the instances of entities.
        5. Create a structured knowledge graph using the extracted entities and relationships.

        YOUR TASK:
        You are processing a CHUNK of the full text. Focus ONLY on extracting CONCRETE INSTANCES of entities found in this chunk.

        GIVEN:
        1. A chunk of text
        2. A schema of entity types and their possible properties

        INSTRUCTIONS:
        - Extract ALL instances of the predefined entity types found in this chunk
        - For each instance, extract values for as many properties as are mentioned in the text
        - Be precise - only extract information explicitly stated in this chunk
        - Do NOT make up or infer missing properties
        - If a property is not mentioned, omit it from the output (don't include it with null/empty values)

        INPUT TEXT CHUNK:
        {chunk}

        ENTITY TYPES AND THEIR PROPERTIES:
        {entity_schema}

        FORMAT YOUR RESPONSE AS FOLLOWS:
        - Return a valid JSON object
        - For each entity type found, include its name and an "Instances" object
        - "Instances" should be a dictionary where:
          - Keys are the instance names 
          - Values are objects containing the (available) instance properties 
        - Properties not mentioned should be omitted entirely
        - If no instances of a particular entity type are found, do not include that entity type

        {format_instructions}

        EXAMPLE RESPONSE FOR A CHUNK ABOUT PEOPLE AND COMPANIES:
        {{
        "entities": [
            {{
            "Entity": "Person",
            "Instances": {{
                "John Doe": {{
                    "name": "John Doe",
                    "age": 35,
                    "email": "john@example.com"
                }},
                "Jane Smith": {{
                    "name": "Jane Smith",
                    "email": "jane@example.com"
                }}
            }}
            }},
            {{
            "Entity": "Company",
            "Instances": {{
                "Google": {{
                    "industry": "Technology",
                    "founded": 1998
                }}
            }}
            }}
        ]
        }}
        Begin your extraction now: """

    # NOTE(review): "chunk_id" is declared as an input variable but the template
    # has no {chunk_id} placeholder; kept as-is because callers pass it.
    instance_prompt = PromptTemplate(
        template=instance_template,
        input_variables=["chunk", "entity_schema", "chunk_id"],
        partial_variables={"format_instructions": result_parser.get_format_instructions()},
    )

    return instance_prompt | llm | result_parser

In [30]:
# Function to process a single chunk with retry logic
def process_chunk(inputs, max_retries=3):
    """
    Process a single text chunk to extract entity instances with retry logic.

    Args:
        inputs: Dictionary with ``"chunk"`` (an object exposing ``page_content``
            and ``metadata``) and ``"entity_schema"`` (JSON-serialisable mapping).
        max_retries: Maximum number of attempts; values below 1 make no LLM call.

    Returns:
        dict: The parsed extraction result on success. On failure, a fallback
        ``{"entities": [], "chunk_id": ..., "error": ...}`` so downstream merging
        always receives a dict (the original returned None when
        ``max_retries < 1``).
    """
    chunk = inputs["chunk"]
    entity_schema = inputs["entity_schema"]
    chunk_id = chunk.metadata.get("chunk_id", 0)

    chain = None
    payload = None
    last_error = "max_retries must be at least 1; no extraction attempted"

    for attempt in range(max_retries):
        try:
            # Built lazily inside the try so a setup failure is still caught and
            # reported, but only once per chunk instead of once per retry.
            if payload is None:
                payload = {
                    "chunk": chunk.page_content,
                    "entity_schema": json.dumps(entity_schema, indent=2),
                    "chunk_id": chunk_id,
                }
            if chain is None:
                chain = create_entity_instance_extraction_chain()

            result = chain.invoke(payload)
            # The JSON parser can yield non-objects (list, None); treat those as
            # failed attempts rather than handing them to the merge step.
            if not isinstance(result, dict):
                raise ValueError(f"Expected a JSON object, got {type(result).__name__}")

            logger.info(f"Successfully processed chunk {chunk_id}")
            return result
        except Exception as e:
            last_error = str(e)
            if attempt < max_retries - 1:
                logger.warning(f"Attempt {attempt + 1} failed for chunk {chunk_id}. Retrying... Error: {str(e)[:100]}...")
            else:
                logger.error(f"All {max_retries} attempts failed for chunk {chunk_id}. Error: {str(e)[:100]}...")

    return {
        "entities": [],
        "chunk_id": chunk_id,
        "error": last_error[:200]
    }

In [31]:
# Run Step 2 over every chunk of the document, a bounded number of chunks at a time.
def extract_entity_instances_parallel(document, entity_schema, max_concurrency=6,
                                      chunk_size=5000, chunk_overlap=500):
    """
    Extract entity instances from document chunks in parallel batches.

    Args:
        document: The full text document
        entity_schema: Dictionary of entity types and their properties
        max_concurrency: Maximum number of chunks to process in parallel
            (values below 1 are treated as 1)
        chunk_size: Maximum chunk length in characters, forwarded to
            split_text_into_chunks
        chunk_overlap: Overlap between consecutive chunks, forwarded to
            split_text_into_chunks

    Returns:
        list: One extraction result per chunk, in chunk order. Empty when the
        document produces no chunks.
    """
    start_time = time.time()
    logger.info("Beginning entity instance extraction (parallel)")

    chunks = split_text_into_chunks(document, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    logger.info(f"Document split into {len(chunks)} chunks")

    # An empty document used to produce batch_size == 0, and range(..., 0) raises ValueError.
    if not chunks:
        logger.warning("No chunks to process; returning an empty result list")
        return []

    # Same zero-step problem for max_concurrency <= 0.
    max_concurrency = max(1, max_concurrency)

    inputs = [{"chunk": chunk, "entity_schema": entity_schema} for chunk in chunks]
    chunk_processor = RunnableLambda(process_chunk)

    all_results = []
    batch_size = min(max_concurrency, len(inputs))

    # Batching (rather than a single .batch call) exists to drive the progress
    # bar and the per-batch log line.
    with tqdm(total=len(inputs), desc="Processing chunks") as progress_bar:
        for start in range(0, len(inputs), batch_size):
            batch_inputs = inputs[start:start + batch_size]

            batch_results = chunk_processor.batch(batch_inputs, config={"max_concurrency": max_concurrency})
            all_results.extend(batch_results)

            progress_bar.update(len(batch_inputs))

            done = min(start + batch_size, len(inputs))
            logger.info(f"Processed {done}/{len(inputs)} chunks ({done / len(inputs) * 100:.1f}%)")

    end_time = time.time()
    logger.info(f"Entity instance extraction completed in {end_time - start_time:.2f} seconds")

    return all_results

In [32]:
def merge_chunk_results(chunk_results):
    """
    Merge the per-chunk extraction results into one entry per entity type.

    Instances with the same name under the same entity type are combined; when
    two chunks give the same property, the later chunk's value wins
    (``dict.update`` semantics). The input results are not modified.

    Args:
        chunk_results: List of extraction results from chunks. Entries that are
            not dicts (e.g. None) are skipped with a warning.

    Returns:
        list: ``{"Entity": <type>, "Instances": {<name>: {<props>}}}`` dicts,
        one per entity type, in first-seen order.
    """
    usable_results = [r for r in chunk_results if isinstance(r, dict)]
    skipped = len(chunk_results) - len(usable_results)
    if skipped:
        logger.warning(f"{skipped} chunk results were not dictionaries and were skipped")

    errors = [r for r in usable_results if r.get("error")]
    if errors:
        logger.warning(f"{len(errors)} chunks had errors during processing")

    # entity type -> {instance name -> merged properties}
    entity_instances = {}
    for result in usable_results:
        for entity_data in result.get("entities") or []:
            if not isinstance(entity_data, dict):
                logger.warning(f"Skipping malformed entity entry: {str(entity_data)[:100]}")
                continue

            entity_type = entity_data.get("Entity")
            instances = entity_data.get("Instances") or {}
            if not isinstance(instances, dict):
                logger.warning(f"Skipping non-dictionary Instances for entity type '{entity_type}'")
                continue

            collected = entity_instances.setdefault(entity_type, {})
            for instance_name, instance_props in instances.items():
                # Copy so later updates never mutate the caller's chunk results;
                # keep the instance (with no properties) if the model returned a
                # non-object for it.
                props = dict(instance_props) if isinstance(instance_props, dict) else {}
                if instance_name in collected:
                    collected[instance_name].update(props)
                else:
                    collected[instance_name] = props

    merged_results = [
        {"Entity": entity_type, "Instances": instances}
        for entity_type, instances in entity_instances.items()
    ]

    logger.info(f"Merged results for {len(entity_instances)} entity types")
    return merged_results

In [34]:
# Reload the Step 1 schema from disk
with open("entity_schema.json", "r") as f:
    entity_schema_data = json.load(f)
entity_schema = entity_schema_data.get("entities", {})

# Fan the chunks out to the LLM, six at a time
chunk_results = extract_entity_instances_parallel(doc, entity_schema, max_concurrency=6)

# Collapse the per-chunk results into one entry per entity type
merged_results = merge_chunk_results(chunk_results)

# Single output file for this step; Step 3 reads it back
output_filepath = "entity_instances.json"
with open(output_filepath, "w") as f:
    json.dump(merged_results, f, indent=2)

instance_total = sum(len(entity['Instances']) for entity in merged_results)
print(f"\nExtracted {instance_total} instances across {len(merged_results)} entity types")
print(f"Results saved to {output_filepath}")

# Preview at most three instances per entity type
print("\nSample instances:")
for entity in merged_results:
    entity_type = entity["Entity"]
    instances = entity["Instances"]
    print(f"\n{entity_type} ({len(instances)} instances):")

    for i, (instance_name, instance_data) in enumerate(list(instances.items())[:3]):
        print(f"  Instance {i+1}: {instance_name} - {str(instance_data)[:100]}...")

    remaining = len(instances) - 3
    if remaining > 0:
        print(f"  ... and {remaining} more")

2025-04-27 16:15:49,153 - INFO - Beginning entity instance extraction (parallel)
2025-04-27 16:15:49,155 - INFO - Splitting text into chunks (size=5000, overlap=500)
2025-04-27 16:15:49,164 - INFO - Text split into 11 chunks
2025-04-27 16:15:49,165 - INFO - Document split into 11 chunks


Processing chunks:   0%|          | 0/11 [00:00<?, ?it/s]

2025-04-27 16:16:23,535 - INFO - Successfully processed chunk 0
2025-04-27 16:16:24,912 - INFO - Successfully processed chunk 3
2025-04-27 16:16:33,934 - INFO - Successfully processed chunk 1
2025-04-27 16:16:35,341 - INFO - Successfully processed chunk 2
2025-04-27 16:16:42,185 - INFO - Successfully processed chunk 4
2025-04-27 16:16:43,083 - INFO - Successfully processed chunk 5
2025-04-27 16:16:43,097 - INFO - Processed 6/11 chunks (54.5%)
2025-04-27 16:17:11,445 - INFO - Successfully processed chunk 6
2025-04-27 16:17:33,340 - INFO - Successfully processed chunk 10
2025-04-27 16:17:37,597 - INFO - Successfully processed chunk 8
2025-04-27 16:17:53,346 - INFO - Successfully processed chunk 9
2025-04-27 16:18:10,034 - INFO - Successfully processed chunk 7
2025-04-27 16:18:10,042 - INFO - Processed 11/11 chunks (100.0%)
2025-04-27 16:18:10,045 - INFO - Entity instance extraction completed in 140.89 seconds
2025-04-27 16:18:10,051 - INFO - Merged results for 12 entity types



Extracted 538 instances across 12 entity types
Results saved to entity_instances.json

Sample instances:

Computing Concept (157 instances):
  Instance 1: Distributed Computing - {'definition': 'a system where computing resources are distributed across multiple locations rather ...
  Instance 2: Traditional computing - {'limitations': ['faces bottlenecks due to hardware limitations']}...
  Instance 3: Parallel Computing - {'definition': 'involves executing multiple processes simultaneously to enhance speed and efficiency...
  ... and 154 more

System Architecture (31 instances):
  Instance 1: Distributed Systems - {'description': 'a system where computing resources are distributed across multiple locations rather...
  Instance 2: Clusters - {'description': 'consist of multiple machines with similar hardware and operating systems, working t...
  Instance 3: Grids - {'description': 'consist of heterogeneous systems that may have different hardware, OS, and configur...
  ... and 28 more


# Step 3: Deduplicate entity instances and assign unique IDs

In [41]:
def _canonical_instance_name(name):
    """Case- and whitespace-insensitive key used to spot duplicate instance names."""
    return " ".join(str(name).split()).casefold()


def deduplicate_and_assign_ids(entity_data):
    """
    Remove duplicate entity instances and assign unique IDs to each remaining instance.

    Two instances of the same entity type are duplicates when their names match
    after case folding and whitespace normalisation (e.g. "Cloud Computing" and
    "cloud  computing"). The first-seen spelling is kept; properties of later
    duplicates are added only where the kept instance lacks them. The previous
    implementation compared raw dict keys, which are unique by construction, so
    it never merged anything.

    Args:
        entity_data: List of dictionaries containing entity types and their instances

    Returns:
        list: Deduplicated entity instances. Every instance gains a random UUID
        under ``"id"`` and has ``"name"`` set to its instance name. The input is
        not modified (property dicts are shallow-copied).
    """
    logger.info("Beginning deduplication and ID assignment")
    start_time = time.time()

    deduplicated_data = []

    # Track statistics
    total_instances_before = 0
    total_instances_after = 0
    duplicates_found = 0

    for entity in entity_data:
        entity_type = entity["Entity"]
        instances = entity["Instances"]
        total_instances_before += len(instances)

        deduplicated_instances = {}
        kept_name_by_key = {}  # canonical name -> first-seen spelling

        for instance_name, instance_props in instances.items():
            # Shallow copy so adding "id"/"name" below never touches the input.
            props = dict(instance_props) if isinstance(instance_props, dict) else {}
            key = _canonical_instance_name(instance_name)

            if key not in kept_name_by_key:
                kept_name_by_key[key] = instance_name
                deduplicated_instances[instance_name] = props
                continue

            duplicates_found += 1
            existing_props = deduplicated_instances[kept_name_by_key[key]]
            # First-seen values win; only fill in properties that are missing.
            for prop_name, value in props.items():
                existing_props.setdefault(prop_name, value)

            logger.debug(f"Merged duplicate instance '{instance_name}' in entity type '{entity_type}'")

        for instance_name, props in deduplicated_instances.items():
            props["id"] = str(uuid.uuid4())
            # The instance key is authoritative for the display name.
            props["name"] = instance_name

        deduplicated_data.append({
            "Entity": entity_type,
            "Instances": deduplicated_instances
        })

        total_instances_after += len(deduplicated_instances)

        logger.info(f"Processed entity type '{entity_type}': {len(instances)} instances → {len(deduplicated_instances)} unique instances")

    end_time = time.time()
    logger.info(f"Deduplication completed in {end_time - start_time:.2f} seconds")
    logger.info(f"Total instances before: {total_instances_before}, after: {total_instances_after}")
    logger.info(f"Removed {duplicates_found} duplicate instances")

    return deduplicated_data

In [42]:
def process_step3(input_filepath="entity_instances.json", output_filepath="deduplicated_entities.json"):
    """
    Execute Step 3: Deduplicate entity instances and assign unique IDs.

    Args:
        input_filepath: JSON file written by Step 2 (list of entity/instances dicts).
        output_filepath: Where the deduplicated result is written.

    Returns:
        list: The deduplicated entities, or None when the input file is missing
        or does not contain valid JSON (an error is logged in both cases).
    """
    logger.info("Loading entity instances from Step 2")
    try:
        with open(input_filepath, "r") as f:
            entity_instances = json.load(f)
    except FileNotFoundError:
        logger.error(f"{input_filepath} not found. Please complete Step 2 first.")
        return None
    except json.JSONDecodeError as e:
        # A truncated or corrupt file gets the same graceful exit as a missing one.
        logger.error(f"{input_filepath} is not valid JSON ({e}). Please re-run Step 2.")
        return None

    deduplicated_data = deduplicate_and_assign_ids(entity_instances)

    with open(output_filepath, "w") as f:
        json.dump(deduplicated_data, f, indent=2)

    logger.info(f"Deduplicated entities saved to {output_filepath}")

    # Print summary
    total_instances = sum(len(entity["Instances"]) for entity in deduplicated_data)
    print(f"\nDeduplication complete: {total_instances} unique instances across {len(deduplicated_data)} entity types")
    print(f"Results saved to {output_filepath}")

    # Preview at most three instances per entity type
    print("\nSample deduplicated instances:")
    for entity in deduplicated_data:
        entity_type = entity["Entity"]
        instances = entity["Instances"]
        print(f"\n{entity_type} ({len(instances)} instances):")

        for i, (instance_name, instance_data) in enumerate(list(instances.items())[:3]):
            # Only the first four properties, to keep the preview short.
            props_preview = dict(list(instance_data.items())[:4])
            print(f"  Instance {i+1}: {instance_name} - {props_preview}")

        if len(instances) > 3:
            print(f"  ... and {len(instances) - 3} more")

    return deduplicated_data


In [40]:
# Execute Step 3: reads entity_instances.json, writes deduplicated_entities.json,
# and keeps the deduplicated list in `deduplicated_entities` (None if the input file is missing).
deduplicated_entities = process_step3()

2025-04-27 16:58:31,603 - INFO - Loading entity instances from Step 2
2025-04-27 16:58:31,634 - INFO - Beginning deduplication and ID assignment
2025-04-27 16:58:31,640 - INFO - Processed entity type 'Computing Concept': 157 instances → 157 unique instances
2025-04-27 16:58:31,642 - INFO - Processed entity type 'System Architecture': 31 instances → 31 unique instances
2025-04-27 16:58:31,645 - INFO - Processed entity type 'Resource': 92 instances → 92 unique instances
2025-04-27 16:58:31,647 - INFO - Processed entity type 'Storage Type': 16 instances → 16 unique instances
2025-04-27 16:58:31,651 - INFO - Processed entity type 'Network Entity': 60 instances → 60 unique instances
2025-04-27 16:58:31,653 - INFO - Processed entity type 'Role': 28 instances → 28 unique instances
2025-04-27 16:58:31,655 - INFO - Processed entity type 'Organization': 23 instances → 23 unique instances
2025-04-27 16:58:31,659 - INFO - Processed entity type 'Software Component': 66 instances → 66 unique instanc


Deduplication complete: 538 unique instances across 12 entity types
Results saved to deduplicated_entities.json

Sample deduplicated instances:

Computing Concept (157 instances):
  Instance 1: Distributed Computing - {'definition': 'a system where computing resources are distributed across multiple locations rather than being centralized in a single system.', 'aspects': ['enables task distribution and efficient resource utilization', 'enables systems to function in multiple layers, with each layer acting as a distributed entity', 'presents a unified system to users'], 'use_cases': ['IoT (Internet of Things)'], 'related_concepts': ['Parallel Computing', 'Cloud Computing']}
  Instance 2: Traditional computing - {'limitations': ['faces bottlenecks due to hardware limitations'], 'id': 'aeb3fe06-4a88-4285-ba29-90cc292e4b99', 'name': 'Traditional computing'}
  Instance 3: Parallel Computing - {'definition': 'involves executing multiple processes simultaneously to enhance speed and efficien