In [60]:
import os
from dotenv import load_dotenv
from typing import Dict, List
from pydantic import BaseModel, Field
import json
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.document_loaders import PyPDFLoader
from langchain_core.runnables import RunnableParallel
from langchain.text_splitter import RecursiveCharacterTextSplitter
import time


In [3]:
# Load variables from a .env file into the environment, then read the Gemini
# API key from it. The value is None when GOOGLE_API_KEY is not set.
load_dotenv()
google_api_key = os.environ.get("GOOGLE_API_KEY")

In [4]:
# Define Pydantic models for structured output
class EntityType(BaseModel):
    """Model for an entity type with its properties."""

    # Names of the properties that belong to this entity type.
    properties: List[str] = Field(
        description="List of properties for this entity type",
    )

class EntityExtractionOutput(BaseModel):
    """Model for the entity extraction output."""

    # Keys are entity type names; each value lists that type's property names.
    entity_types: Dict[str, List[str]] = Field(description="Dictionary mapping entity types to their properties")

    @classmethod
    def from_dict(cls, data: Dict[str, List[str]]) -> "EntityExtractionOutput":
        """Wrap a plain ``{entity_type: [property, ...]}`` mapping in this model.

        Args:
            data: Mapping stored as the ``entity_types`` field.

        Returns:
            A new model instance holding ``data``.
        """
        instance = cls(entity_types=data)
        return instance

In [5]:
# Model identifier sent to the Gemini API. This is Gemini 1.5 Flash; change the
# constant in this one place to switch models.
GEMINI_MODEL_NAME = "gemini-1.5-flash"

# Initialize the LLM. temperature=0 is used so repeated runs stay as
# consistent as the model allows.
llm = ChatGoogleGenerativeAI(
    model=GEMINI_MODEL_NAME,
    temperature=0,
    google_api_key=google_api_key,
)

# Parser that turns the model's reply into an EntityExtractionOutput and also
# supplies the format instructions embedded in the prompt.
parser = PydanticOutputParser(pydantic_object=EntityExtractionOutput)

# Prompt for the first agent (entity-type extraction). It contains two
# placeholders: {input} receives the text to analyze and {format_instructions}
# receives the output-format description produced by the parser.
template = """You are part of an agentic workflow that processes data input by a user step by step. 
The end result of the workflow is a detailed knowledge graph generated from the input data.

You are the Preprocessing & Entity Extraction Agent, which is the FIRST agent in this workflow. Your responsibilities are:
- Identify entity types and associated properties (e.g., name, age, industry)
- Return a global entity list for further processing

Given the following text, extract broad, general categories of entities and list the general types of properties associated with each category.

Focus on identifying general categories of entities and generalizable properties that could apply to those categories.

Text to process: {input}

{format_instructions}

Response:"""

# Build the chat prompt from the template string
prompt = ChatPromptTemplate.from_template(template=template)

# Inputs for the prompt: the raw chain input is forwarded unchanged as {input},
# and {format_instructions} is requested from the parser when the chain runs.
chain_inputs = {
    "input": RunnablePassthrough(),
    "format_instructions": lambda _: parser.get_format_instructions(),
}

# Pipeline: fill the prompt -> call the LLM -> parse the reply
chain = chain_inputs | prompt | llm | parser

In [6]:
def extract_entity_types(text: str) -> Dict[str, List[str]]:
    """Run the entity-extraction chain on ``text``.

    Args:
        text: The input text to process.

    Returns:
        The ``entity_types`` mapping of the parsed chain output, i.e. a
        dictionary from entity type to its list of properties.
    """
    return chain.invoke(text).entity_types

def workflow_step_1(text: str) -> Dict[str, List[str]]:
    """Entry point for step one of the knowledge graph workflow.

    Delegates to ``extract_entity_types`` and returns its result unchanged.

    Args:
        text: The input text to process.

    Returns:
        A dictionary mapping entity types to their properties.
    """
    entity_schema = extract_entity_types(text)
    return entity_schema

In [20]:
def extract_text_from_file(file_path: str) -> str:
    """Load a PDF or DOCX file and return its text as one string.

    Args:
        file_path: Path to the file. The extension, matched
            case-insensitively, selects the loader.

    Returns:
        The ``page_content`` of every loaded document, joined with newlines.

    Raises:
        ValueError: If the path ends in neither ``.pdf`` nor ``.docx``.
    """
    lowered_path = file_path.lower()
    if lowered_path.endswith('.pdf'):
        loader = PyPDFLoader(file_path)
    elif lowered_path.endswith('.docx'):
        # Imported here so that PDF-only use does not need the DOCX loader
        # (or its optional backend) to be installed.
        from langchain.document_loaders import Docx2txtLoader

        loader = Docx2txtLoader(file_path)
    else:
        raise ValueError("Unsupported file type. Only PDF and DOCX are supported.")

    # Load the document(s) and concatenate their text, one entry per line break
    documents = loader.load()
    return "\n".join(doc.page_content for doc in documents)

In [21]:
# Read the paper's text from the PDF and echo it for a visual check
pdf_path = "2202.04903v1.pdf"
extracted_text = extract_text_from_file(pdf_path)

print(extracted_text)

Investigating Explainability of Generative AI for Code through
Scenario-based Design
Jiao Sun∗
University of Southern California
Los Angeles, USA
jiaosun@usc.edu
Q. Vera Liao†
Microsoft Research
Montréal, Canada
veraliao@microsoft.com
Michael Muller
IBM Research AI
Yorktown Heights, USA
michael_muller@us.ibm.com
Mayank Agarwal
IBM Research AI
Yorktown Heights, USA
Mayank.Agarwal@ibm.com
Stephanie Houde
IBM Research AI
Yorktown Heights, USA
Stephanie.Houde@ibm.com
Kartik Talamadupula
IBM Research AI
Yorktown Heights, USA
krtalamad@us.ibm.com
Justin D. Weisz
IBM Research AI
Yorktown Heights, USA
jweisz@us.ibm.com
ABSTRACT
What does it mean for a generative AI model to be explainable?
The emergent discipline of explainable AI (XAI) has made great
strides in helping people understand discriminative models. Less
attention has been paid to generative models that produce arti-
facts, rather than decisions, as output. Meanwhile, generative AI
(GenAI) technologies are maturing and being applied

In [22]:

# Step 1: derive the entity schema from the extracted document text
result = workflow_step_1(extracted_text)

# Pretty-print the schema as indented JSON
result_json = json.dumps(result, indent=2)
print(result_json)

{
  "Person": [
    "name",
    "affiliation",
    "email",
    "location"
  ],
  "Organization": [
    "name",
    "location"
  ],
  "Publication": [
    "title",
    "authors",
    "date",
    "venue",
    "keywords",
    "abstract",
    "doi",
    "isbn"
  ],
  "AI_Model": [
    "name",
    "type",
    "application_domain",
    "training_data",
    "capabilities",
    "limitations",
    "performance_metrics"
  ],
  "Software_Engineering_Task": [
    "name",
    "description",
    "programming_languages",
    "input",
    "output"
  ],
  "Conference": [
    "name",
    "date",
    "location"
  ]
}


In [6]:
 # Example text
sample_text = """
John Smith works at Acme Corporation in New York. 
He is a senior software engineer with 10 years of experience.
Acme Corporation is a technology company founded in 2010.
"""

# Process the text. The schema is stored under its own name so this demo does
# not overwrite `result`, which the workflow-2 cells read as the entity list
# for the document.
sample_result = workflow_step_1(sample_text)

# Display the result
print(json.dumps(sample_result, indent=2))

{
  "Person": [
    "name",
    "occupation",
    "experience",
    "location"
  ],
  "Organization": [
    "name",
    "industry",
    "founding_year",
    "location"
  ]
}


In [7]:
# Another example
another_text = """
The Great Wall of China is one of the most impressive architectural achievements in history.
It was built during the Ming Dynasty (1368-1644) to protect China from invasions.
The wall stretches over 13,000 miles and was constructed using various materials including stone, brick, and wood.
Today, it is a UNESCO World Heritage site and attracts millions of tourists annually.
"""

# Process the text. The schema is stored under its own name so this demo does
# not overwrite `result`, which the workflow-2 cells read as the entity list
# for the document.
another_result = workflow_step_1(another_text)

# Display the result
print("\nSecond example:")
print(json.dumps(another_result, indent=2))


Second example:
{
  "ArchitecturalStructure": [
    "name",
    "location",
    "construction_materials",
    "length",
    "construction_period",
    "historical_significance",
    "status"
  ],
  "Dynasty": [
    "name",
    "start_year",
    "end_year",
    "rulers",
    "significant_events"
  ],
  "GeographicLocation": [
    "name",
    "type"
  ],
  "Organization": [
    "name",
    "type"
  ]
}


In [50]:
def split_text_into_chunks(text: str, chunk_size: int = 1000) -> List[str]:
    """Split ``text`` into pieces with a recursive character splitter.

    Args:
        text: The text to split.
        chunk_size: Chunk size handed to the splitter (default 1000).

    Returns:
        The list of chunks produced by the splitter; overlap between
        consecutive chunks is configured as 0.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=chunk_size)
    chunks = splitter.split_text(text)
    return chunks

In [51]:
class EntityInstance(BaseModel):
    # Groups every extracted instance of one entity type.
    # Documented with comments rather than a class docstring so the schema
    # generated from this model (and thus the prompt text) is unchanged.

    # Name of the entity type these instances belong to.
    entity: str = Field(
        description="The entity type",
    )
    # One dict per instance, mapping property name to its value as a string.
    instances: List[Dict[str, str]] = Field(
        description="List of instances with available properties",
    )

class EntityInstancesExtractionOutput(BaseModel):
    # Top-level structure the second agent's reply is parsed into.
    # Documented with comments rather than a class docstring so the schema
    # generated from this model (and thus the prompt text) is unchanged.

    # One entry per entity type that has instances in the processed chunk.
    entity_instances: List[EntityInstance] = Field(
        description="List of entity instances with properties",
    )

In [52]:
# Prompt for the second agent (entity-instance extraction). It contains three
# placeholders: {chunk} for the text chunk to analyze, {entity_list} for the
# entity schema from the previous step, and {format_instructions} for the
# output-format description.
workflow_2_template = """You are part of an agentic workflow that processes data input by a user step by step. 
The end result of the workflow is a detailed knowledge graph generated from the input data.

You are the Entity Instances Extraction Agent, which is the SECOND agent in this workflow. Your responsibilities are:
- Extract instances of entities and their available properties from the given text chunk
- Use the global entity list from the previous step to identify relevant entities and properties
- Only include instances of entities that are present in the text chunk, don't make up instances for entities that are not present in the text chunk
- Only include properties of instances that are present in the text chunk, don't make up properties for instances that are not present in the text chunk
- If any property of any instance is not present in the text chunk, don't include it in the response at all (even the key)

Given the following text chunk and entity list, extract instances of entities and their available properties.

Text chunk: {chunk}
Entity list: {entity_list}

{format_instructions}

Response:"""

In [56]:
# Parser that validates the model's reply against EntityInstancesExtractionOutput
workflow_2_parser = PydanticOutputParser(pydantic_object=EntityInstancesExtractionOutput)

workflow_2_prompt = ChatPromptTemplate.from_template(template=workflow_2_template)


def _workflow_2_chunk(chain_input):
    """Return the chunk text from a chain input.

    Accepts either a ``{"chunk": ..., "entity_list": ...}`` dict (what
    ``run_workflow_2_in_parallel`` sends) or a bare chunk string.
    """
    if isinstance(chain_input, dict) and "chunk" in chain_input:
        return chain_input["chunk"]
    return chain_input


def _workflow_2_entity_list(chain_input):
    """Return the entity list supplied with the input.

    Falls back to the notebook-level ``result`` only when the input carries no
    ``entity_list`` key, so callers that pass just the chunk text keep working.
    """
    if isinstance(chain_input, dict) and "entity_list" in chain_input:
        return chain_input["entity_list"]
    return result


# Create the chain for workflow 2. Each prompt variable is selected from the
# input explicitly, so {chunk} receives only the chunk text and {entity_list}
# receives the schema passed alongside it.
workflow_2_chain = (
    {
        "chunk": _workflow_2_chunk,
        "entity_list": _workflow_2_entity_list,
        "format_instructions": lambda _: workflow_2_parser.get_format_instructions(),
    }
    | workflow_2_prompt
    | llm
    | workflow_2_parser
)

In [61]:
def run_workflow_2_in_parallel(
    text: str,
    workflow_1_result: Dict[str, List[str]],
    parallelism: int = 3,
    delay_seconds: float = 2.0,
) -> List[EntityInstancesExtractionOutput]:
    """Run workflow 2 over every chunk of ``text`` in small, throttled batches.

    Args:
        text: Full text to split into chunks and process.
        workflow_1_result: Entity schema from workflow 1, sent with every chunk
            under the ``entity_list`` key.
        parallelism: Requested number of chunks per batch. It is clamped to the
            range 1..3 (3 is the cap chosen for the free API tier).
        delay_seconds: Pause between consecutive batches. Increase it if the
            API answers with 429 quota errors; 0 or less disables the pause.

    Returns:
        One chain result per chunk, in chunk order.
    """
    chunks = split_text_into_chunks(text)

    print(len(chunks))

    # Create inputs for each chunk
    inputs = [{"chunk": chunk, "entity_list": workflow_1_result} for chunk in chunks]

    # Clamp so a zero or negative `parallelism` cannot produce an invalid
    # range step, and the free-tier cap of 3 is never exceeded.
    batch_size = max(1, min(parallelism, 3))

    results = []
    for start in range(0, len(inputs), batch_size):
        batch = inputs[start:start + batch_size]
        # Concurrency is limited through the runnable config.
        batch_results = workflow_2_chain.batch(batch, config={"max_concurrency": batch_size})
        results.extend(batch_results)

        # Pause between batches, but not after the last one
        if delay_seconds > 0 and start + batch_size < len(inputs):
            time.sleep(delay_seconds)

    return results

In [62]:
# Step 2: extract entity instances from the document using the step-1 schema
result_2 = run_workflow_2_in_parallel(text=extracted_text, workflow_1_result=result)

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 33
}
].
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 33
}
].


ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 31
}
]