In [26]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from pydantic import BaseModel, Field
from typing import Dict, List
import os
import json
from dotenv import load_dotenv


In [11]:
# Load environment variables (via python-dotenv), then look up the Google API key.
# google_api_key is not referenced again in the visible cells.
load_dotenv()
google_api_key = os.environ.get("GOOGLE_API_KEY")

In [12]:
# Shared chat model; create_entity_extraction_chain pipes its prompt into this object.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-preview-04-17",
    max_retries=2,
    temperature=0,
    timeout=None,     # no explicit timeout is configured here
    max_tokens=None,  # no explicit output-token cap is configured here
)

In [5]:
def load_pdf_content(pdf_path: str, return_single_string: bool = True, extract_metadata: bool = False):
    """
    Load a PDF with PyPDFLoader and return its text content.

    Args:
        pdf_path (str): Path to the PDF file.
        return_single_string (bool): If True, the loader is run with mode="single" and
                                    the text of the first returned document is returned
                                    as one string. If False, the loader is run with
                                    mode="page" and a list of strings is returned, one
                                    per document produced by the loader.
        extract_metadata (bool): If True, return the metadata alongside the content.

    Returns:
        If return_single_string is True and extract_metadata is False:
            str: The text content of the PDF ("" if the loader returned nothing)
        If return_single_string is False and extract_metadata is False:
            list: List of strings, one per loaded document
        If extract_metadata is True:
            tuple: (content, metadata). metadata is a dict when return_single_string is
            True ({} if nothing was loaded) and a list of dicts otherwise.

    Raises:
        FileNotFoundError: If pdf_path does not exist.
    """

    # Fail early with a clear message instead of letting the loader fail later
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found at: {pdf_path}")

    # PyPDFLoader accepts mode "single" or "page". The previous value "elements"
    # is not one of them, so the per-page branch could never work as documented.
    mode = "single" if return_single_string else "page"
    docs = PyPDFLoader(pdf_path, mode=mode).load()

    if return_single_string:
        # With mode="single" the whole PDF is expected in the first document;
        # fall back to empty values when the loader produced nothing.
        first_doc = docs[0] if docs else None
        content = first_doc.page_content if first_doc is not None else ""
        metadata = first_doc.metadata if first_doc is not None else {}
    else:
        # With mode="page" each returned document corresponds to one page
        content = [page.page_content for page in docs]
        metadata = [page.metadata for page in docs]

    if extract_metadata:
        return content, metadata
    return content

In [6]:
# Load the lecture notes as one string (return_single_string defaults to True)
doc = load_pdf_content(pdf_path="Cloud Computing Copy Lecture Notes.pdf")

In [9]:
# Preview only the first 1000 characters of the loaded document
print(doc[:1000])

Cloud Computing Lecture Notes 
Distributed Computing/Systems 
Definition: 
Distributed computing refers to a system where computing resources are distributed 
across multiple locations rather than being centralized in a single system. This enables 
task distribution and efficient resource utilization. 
Why Use Distributed Systems? 
• Scalability Issues: Traditional computing faces bottlenecks due to hardware 
limitations, whereas distributed systems allow for hardware scaling. 
• Connected Devices: In a networked system, connected devices communicate, but 
this does not necessarily make them distributed. 
• IoT (Internet of Things): IoT is one of the largest examples of distributed computing. 
• Multi-layered System Design: Distributed computing enables systems to function 
in multiple layers, with each layer acting as a distributed entity. 
• User Perspective: Although the system consists of multiple machines, distributed 
computing presents a unified system to users. 
 
Parallel Comp

In [15]:
# Define Pydantic model for entity schema parser
# create_entity_extraction_chain passes this class to JsonOutputParser(pydantic_object=...),
# and extract_entity_schema reads the "entities" key of the parsed result.
# NOTE(review): the class docstring and the Field description below are presumably
# emitted into the parser's format instructions (and therefore into the prompt) —
# confirm before rewording either string.
class EntitySchema(BaseModel):
    """Entity types and their properties."""
    # Maps an entity type name (e.g. "Person") to the list of property names for that type.
    entities: Dict[str, List[str]] = Field(
        description="Dictionary mapping entity types to their possible properties"
    )

In [37]:
# Create entity extraction chain
def create_entity_extraction_chain():
    """
    Build the entity-type extraction chain: prompt | llm | JSON parser.

    The parser is a JsonOutputParser bound to EntitySchema; its format
    instructions are injected into the prompt as a partial variable, so the
    only variable the caller supplies at invoke time is "input".

    The RESPONSE FORMAT bullet and the worked example in the prompt use the same
    {"entities": {...}} wrapper that EntitySchema declares. Previously they showed
    a bare {"Person": [...]} mapping, which contradicted the format instructions
    and the "entities" lookup performed on the parsed result.

    Relies on the module-level names `llm` and `EntitySchema`.

    Returns:
        The composed chain object; call .invoke({"input": text}) on it.
    """
    parser = JsonOutputParser(pydantic_object=EntitySchema)

    # Literal braces in the template are doubled ({{ }}) so that only {input}
    # and {format_instructions} are treated as template variables.
    prompt = PromptTemplate(
        template="""You are part of a multi-step agentic workflow that processes raw text to build a knowledge graph. Here's how the entire workflow functions:

WORKFLOW OVERVIEW:
1. [CURRENT STEP] Entity Type Extraction: Identify entity types and their possible properties
2. Entity Instance Extraction: Extract specific instances of these entities from text chunks
3. Deduplication & ID Assignment: Remove duplicates and assign unique IDs to entities
4. Relationship Extraction: Identify relationships between entity instances
5. Knowledge Graph Construction: Create the actual graph in ApertureDB

YOUR ROLE: You are the Preprocessing & Entity Type Extraction Agent (FIRST agent in this workflow). 
Your ONLY responsibility is to identify general entity types and their possible properties.

INSTRUCTIONS:
1. Identify general entity types (Person, Organization, Location, Event, etc.)
2. For each entity type, list ALL possible properties these entities might have
3. Be comprehensive - include all properties mentioned or implied in the text
4. Focus on properties that would be useful in a knowledge graph
5. Return ONLY a structured JSON object mapping entity types to their properties

RESPONSE FORMAT:
- Return ONLY a valid JSON object with a single top-level key "entities" whose value maps entity types to arrays of property names
- Do not include any explanatory text, markdown formatting, or code block markers
- Ensure your JSON uses double quotes for keys and string values
- The output should be directly parseable by Python's json.loads()

Text to process: {input}

{format_instructions}

EXAMPLES OF EXPECTED OUTPUT:
{{"entities": {{"Person": ["name", "age", "title", "email", "phone", "address", "skills", "education"],
"Company": ["name", "industry", "location", "founded_date", "employees", "revenue"],
"Location": ["name", "address", "coordinates", "type", "population"]}}}}

Response:""",
        input_variables=["input"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )

    # Build the chain: render prompt -> call the model -> parse the reply as JSON
    chain = prompt | llm | parser

    return chain


In [38]:
# Function to extract entities from text with retry logic
def _unwrap_entities(result):
    """Return the entity-type -> properties mapping from a parsed model reply.

    Accepts the wrapped shape {"entities": {...}} as well as a bare mapping
    whose values are all lists (e.g. {"Person": ["name"]}). Any other dict
    yields {}. A non-dict result raises TypeError so the caller can retry.
    """
    if not isinstance(result, dict):
        raise TypeError(
            f"Expected a JSON object from the parser, got {type(result).__name__}"
        )

    wrapped = result.get("entities")
    if isinstance(wrapped, dict):
        return wrapped

    # The model answered with the mapping itself and skipped the wrapper
    if result and all(isinstance(props, list) for props in result.values()):
        return result

    return {}


def extract_entity_schema(text: str, max_retries: int = 3) -> Dict[str, List[str]]:
    """
    Extract entity types and their properties from input text with retry logic.

    The chain from create_entity_extraction_chain() is invoked up to
    max_retries times; the first attempt that yields a usable result wins.
    Failures are printed (error text truncated to 100 characters) rather than
    raised, so this function is best-effort by design.

    Args:
        text (str): Input text to analyze
        max_retries (int): Maximum number of attempts

    Returns:
        dict: Dictionary mapping entity types to lists of properties. An empty
        dict is returned when every attempt fails, when the reply has no
        recognisable entity mapping, or when max_retries is 0 or negative
        (previously that last case returned None).
    """
    chain = create_entity_extraction_chain()

    for attempt in range(max_retries):
        try:
            result = chain.invoke({"input": text})
            return _unwrap_entities(result)
        except Exception as e:  # deliberate catch-all: any failure counts as one failed attempt
            if attempt < max_retries - 1:
                print(f"Attempt {attempt + 1} failed. Retrying... Error: {str(e)[:100]}...")
            else:
                print(f"All {max_retries} attempts failed. Last error: {str(e)[:100]}...")

    # Reached after the last failed attempt, or immediately if no attempt was allowed
    return {}

In [39]:
# Small hand-written passage used to smoke-test the extraction chain
sample_text = """
John Doe, a 35-year-old software engineer, works at Google in Mountain View.
He graduated from MIT with a degree in Computer Science and has been with the company for 5 years.
Google, founded in 1998, is a technology company specializing in internet services and products.
John lives in San Francisco and commutes to work daily. His email is john.doe@example.com.
"""

# Raw mapping first, then the same data as a readable listing
entities = extract_entity_schema(sample_text)
print(entities)

print("Extracted Entity Schema:")
for entity_type in entities:
    properties = entities[entity_type]
    print(f"\n{entity_type}:")
    for prop in properties:
        print(f"- {prop}")

{'Person': ['name', 'age', 'title', 'employer', 'education', 'tenure', 'residence', 'email'], 'Organization': ['name', 'location', 'founded_date', 'industry', 'specialization', 'type'], 'Location': ['name']}
Extracted Entity Schema:

Person:
- name
- age
- title
- employer
- education
- tenure
- residence
- email

Organization:
- name
- location
- founded_date
- industry
- specialization
- type

Location:
- name


In [40]:
# Run the schema extraction on the text loaded from the PDF
# (this rebinds `entities`, replacing the result from the sample-text cell)
entities = extract_entity_schema(doc)

print("\nExtracted Entity Schema:")
for entity_type in entities:
    properties = entities[entity_type]
    print(f"\n{entity_type}:")
    for prop in properties:
        print(f"- {prop}")


Extracted Entity Schema:

Distributed System:
- definition
- resource_distribution
- location_distribution
- task_distribution
- resource_utilization
- scalability_issues_addressed
- connected_devices_communication
- design_layers
- user_perspective
- common_characteristics

Parallel Computing:
- definition
- goal
- use_cases
- limitations
- infrastructure_needed

Middleware:
- definition
- role
- role_in_clusters
- role_in_grids
- role_in_cloud_computing
- role_in_distributed_systems

Cluster:
- definition
- key_features
- network_type
- geographical_proximity
- comparison_with_grid
- foundation_of

Grid:
- definition
- key_features
- use_case
- incentives
- resource_selling
- drawbacks
- comparison_with_cluster
- foundation_of

Cloud Computing:
- definition
- characteristics
- autoscaling
- supported_workloads
- middleware_role
- desired_attributes
- essential_characteristics
- common_characteristics
- scale
- homogeneity
- virtualization_role
- resilient_computing
- service_orienta

In [None]:
  # Save entity schema for next step
# Serialize first, then write: the file holds {"entities": ...} with 2-space indentation
with open("entity_schema.json", "w") as f:
    f.write(json.dumps({"entities": entities}, indent=2))