In [1]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from pydantic import BaseModel, Field
from typing import Dict, List
import os
import json
from dotenv import load_dotenv


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load environment variables (load_dotenv comes from python-dotenv)
load_dotenv()
# Look up the Gemini credential; the result is None when the variable is not set.
google_api_key = os.environ.get("GOOGLE_API_KEY")

In [3]:
# Chat model instance referenced by the extraction chain as the global `llm`.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-preview-04-17",
    max_retries=2,
    temperature=0,
    timeout=None,      # no explicit timeout configured
    max_tokens=None,   # no explicit output-token cap configured
)

In [4]:
def load_pdf_content(pdf_path, return_single_string=True, extract_metadata=False):
    """
    Load a PDF with PyPDFLoader and return its text content.

    Args:
        pdf_path (str): Path to the PDF file.
        return_single_string (bool): If True, the whole document is returned as a
            single string. If False, a list of strings is returned, one per page.
        extract_metadata (bool): If True, metadata is returned alongside the content.

    Returns:
        str | list[str]: The content, when extract_metadata is False.
        tuple: (content, metadata) when extract_metadata is True. metadata is one dict
            in single-string mode and a list of dicts (one per page) otherwise.

    Raises:
        FileNotFoundError: If nothing exists at pdf_path.
    """
    # Fail early with a clear message instead of a loader-specific error.
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found at: {pdf_path}")

    # PyPDFLoader accepts only "single" (one document for the whole file) or
    # "page" (one document per page). "elements" is an Unstructured-loader mode
    # and is rejected by PyPDFLoader, so the per-page path must use "page".
    mode = "single" if return_single_string else "page"
    docs = PyPDFLoader(pdf_path, mode=mode).load()

    if return_single_string:
        # "single" mode yields at most one document; an empty result maps to
        # empty content/metadata rather than an IndexError.
        content = docs[0].page_content if docs else ""
        metadata = docs[0].metadata if docs else {}
    else:
        # "page" mode yields one document per page, in order.
        content = [page.page_content for page in docs]
        metadata = [page.metadata for page in docs]

    if extract_metadata:
        return content, metadata
    return content

In [5]:
# Read the lecture-notes PDF into one string (return_single_string defaults to True).
doc = load_pdf_content(pdf_path="Cloud Computing Copy Lecture Notes.pdf")

In [6]:
# Sanity check: show only the leading 1000 characters of the loaded document.
print(doc[0:1000])

Cloud Computing Lecture Notes 
Distributed Computing/Systems 
Definition: 
Distributed computing refers to a system where computing resources are distributed 
across multiple locations rather than being centralized in a single system. This enables 
task distribution and efficient resource utilization. 
Why Use Distributed Systems? 
• Scalability Issues: Traditional computing faces bottlenecks due to hardware 
limitations, whereas distributed systems allow for hardware scaling. 
• Connected Devices: In a networked system, connected devices communicate, but 
this does not necessarily make them distributed. 
• IoT (Internet of Things): IoT is one of the largest examples of distributed computing. 
• Multi-layered System Design: Distributed computing enables systems to function 
in multiple layers, with each layer acting as a distributed entity. 
• User Perspective: Although the system consists of multiple machines, distributed 
computing presents a unified system to users. 
 
Parallel Comp

In [7]:
# Pydantic model passed to JsonOutputParser to describe the expected LLM output:
# a single top-level "entities" key holding {entity type -> list of property names}.
# NOTE(review): the class docstring and the Field description are presumably emitted
# in the JSON schema that the parser's format instructions embed in the prompt —
# confirm before rewording either, since that would change what the model is told.
class EntitySchema(BaseModel):
    """Entity types and their properties."""
    # No default is supplied for this field.
    entities: Dict[str, List[str]] = Field(
        description="Dictionary mapping entity types to their possible properties"
    )

In [19]:
# Create entity extraction chain
def create_entity_extraction_chain():
    """
    Build the prompt -> LLM -> JSON-parser chain for step 1 of the workflow.

    The chain takes {"input": <text>} and is expected to yield a dict of the form
    {"entities": {entity_type: [property, ...]}} as described by EntitySchema.
    It uses the module-level `llm`.

    Returns:
        The composed chain (prompt | llm | parser).
    """
    parser = JsonOutputParser(pydantic_object=EntitySchema)

    # Literal braces in the JSON example are doubled ({{ }}) so the template engine
    # does not treat them as variables. The FORMAT section and the example use the
    # same top-level "entities" wrapper as the parser's format instructions, so the
    # model is not given two conflicting output shapes.
    prompt = PromptTemplate(
        template="""
    You are the first agent in a multi-step workflow to build a Knowledge Graph from raw text.

    Workflow Steps Overview:
    1. Extract high-level entity types and their properties from the text. [CURRENT STEP]
    2. Extract specific instances of entities and their properties based on the identified types.
    3. Deduplicate extracted instances and assign them unique identifiers.
    4. Identify and define relationships between the instances of entities.
    5. Create a structured knowledge graph using the extracted entities and relationships.

    You are the FIRST agent in this workflow.


    YOUR TASK:
    - Identify high-level, general entity types (e.g., Person, Company, Location, Event).
    - For each entity type, list all the possible (available) properties it might have.
    - Focus on information that would be useful for structuring a knowledge graph.
    - Stay general — do not extract specific names, examples, or relationships.
    - Avoid unnecessary details or context-specific examples.

    FORMAT:
    - Return a valid JSON object with exactly one top-level key: "entities".
    - The value of "entities" is an object whose keys are entity types (strings).
    - Each entity type maps to a list of property names (strings).
    - Use double quotes for all keys and string values.
    - No extra explanation, text, or markdown formatting.

    EXAMPLES:
    {{
        "entities": {{
            "Person": ["name", "age", "email", "address"],
            "Company": ["name", "industry", "founded_date"],
            "Location": ["name", "coordinates", "population"]
        }}
    }}

    Text to process: {input}

    {format_instructions}

    Response:
    """,
        input_variables=["input"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )

    # Build the chain
    chain = prompt | llm | parser

    return chain


In [14]:
# Function to extract entities from text with retry logic
def extract_entity_schema(text, max_retries=3):
    """
    Extract entity types and their properties from input text with retry logic.

    Args:
        text (str): Input text to analyze
        max_retries (int): Maximum number of attempts. Values below 1 make no
            attempt at all.

    Returns:
        dict: Dictionary mapping entity types to lists of properties. An empty
        dict is returned when every attempt fails, when no attempt is made, or
        when the model's answer contains no usable schema.
    """
    chain = create_entity_extraction_chain()

    for attempt in range(max_retries):
        try:
            result = chain.invoke({"input": text})
            if not isinstance(result, dict):
                # Handled by the except below, so malformed output is retried.
                raise TypeError(
                    f"Expected a JSON object, got {type(result).__name__}"
                )
            # Expected shape: {"entities": {entity_type: [properties]}}
            if "entities" in result:
                return result["entities"]
            # The model may answer with the bare {entity_type: [properties]}
            # mapping (no wrapper); accept it instead of discarding it.
            if result and all(isinstance(props, list) for props in result.values()):
                return result
            return {}
        except Exception as e:
            if attempt < max_retries - 1:
                print(f"Attempt {attempt + 1} failed. Retrying... Error: {str(e)[:100]}...")
            else:
                print(f"All {max_retries} attempts failed. Last error: {str(e)[:100]}...")
                # Return empty dict as fallback
                return {}

    # Reached only when max_retries < 1; keep the documented dict return type.
    return {}

In [15]:
# Small hand-written passage used to smoke-test the extraction chain.
sample_text = """
John Doe, a 35-year-old software engineer, works at Google in Mountain View.
He graduated from MIT with a degree in Computer Science and has been with the company for 5 years.
Google, founded in 1998, is a technology company specializing in internet services and products.
John lives in San Francisco and commutes to work daily. His email is john.doe@example.com.
"""

# Raw result first, then a readable per-type listing.
entities = extract_entity_schema(sample_text)
print(entities)

print("Extracted Entity Schema:")
for entity_type in entities:
    properties = entities[entity_type]
    print(f"\n{entity_type}:")
    for prop in properties:
        print(f"- {prop}")

{'Person': ['name', 'age', 'occupation', 'email', 'residence', 'education'], 'Company': ['name', 'industry', 'founded_date', 'location'], 'Location': ['name'], 'Organization': ['name', 'type'], 'Degree': ['name', 'field_of_study', 'institution']}
Extracted Entity Schema:

Person:
- name
- age
- occupation
- email
- residence
- education

Company:
- name
- industry
- founded_date
- location

Location:
- name

Organization:
- name
- type

Degree:
- name
- field_of_study
- institution


In [20]:
# Run the schema extraction over the text loaded from the PDF (overwrites `entities`)
entities = extract_entity_schema(doc)

print("\nExtracted Entity Schema:")
for entity_type in entities:
    properties = entities[entity_type]
    print(f"\n{entity_type}:")
    for prop in properties:
        print(f"- {prop}")


Extracted Entity Schema:

Computing Concept:
- definition
- characteristics
- use_cases
- limitations
- aspects
- related_concepts

System Architecture:
- description
- characteristics
- components
- use_cases
- comparison_aspects

Platform:
- overview
- purpose
- architecture
- components
- service_offerings
- deployment_aspects
- management_aspects
- security_aspects
- scalability_aspects
- reliability_aspects
- cost_aspects
- features

Resource:
- description
- characteristics
- management_aspects
- lifecycle_aspects
- allocation_aspects
- pricing_aspects
- type

Storage Type:
- description
- characteristics
- use_cases
- pricing_models
- management_aspects

Database Type:
- description
- characteristics
- use_cases
- management_aspects
- migration_aspects

Network Entity:
- definition
- purpose
- characteristics
- components
- management_aspects
- security_aspects
- type

Service Model:
- definition
- characteristics
- responsibility_division

Deployment Model:
- definition
- char

In [None]:
  # Save entity schema for next step
with open("entity_schema.json", "w") as schema_file:
    # Serialise under a top-level "entities" key, pretty-printed with 2-space indent.
    schema_file.write(json.dumps({"entities": entities}, indent=2))