In [1]:
import os
from pypdf import PdfReader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma
# !pip install langchain_chroma


In [None]:
# Load the source PDF and split its text into chunks for embedding.
# This cell only needs to run when the Chroma DB has to be (re)built; it is
# not required once the persisted DB has been created.

# Source of data
document_dir = "../data"
filename = "Handbook of Plant Disease Identification and Management.pdf"
file_path = os.path.join(document_dir, filename)

reader = PdfReader(file_path)

# Join pages with a newline rather than "": otherwise the last line of one
# page is fused onto the first line of the next, and the "\n"-based splitter
# below can never break at a page boundary. `or ""` keeps the join safe for a
# page that yields no text.
text = "\n".join((page.extract_text() or "") for page in reader.pages)

# Bare `len(...)` expressions in the middle of a cell display nothing, so
# report the sizes explicitly.
print(f"Extracted {len(text)} characters from {len(reader.pages)} pages.")

# Split on line breaks into chunks of roughly 1000 characters with no overlap.
text_split = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=0)
docs = text_split.split_text(text)
print(f"Loaded and split document into {len(docs)} chunks.")

<h2> Setting Up Embeddings</h2>

In [None]:
# Read the OpenAI API key from the first line of a local key file.
with open("openai.txt", "r") as file:
    key = file.readline().strip()

# Initialize OpenAI Embeddings
model_name = "text-embedding-3-large"
embeddings = OpenAIEmbeddings(model=model_name, api_key=key)
print("OpenAI embeddings initialized.")

# Use one relative persistence directory for both building and loading the
# store, instead of building in one place and then loading from a
# user-specific absolute path.
persist_directory = "./chroma_db_oa"

# Only embed the document when no persisted store exists yet. Building
# unconditionally re-embeds every chunk on each run and requires `docs` from
# the PDF-loading cell, which is not meant to be re-run once the DB exists.
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
    print("Connected to pre-existing ChromaDB.")
else:
    db = Chroma.from_texts(docs, embeddings, persist_directory=persist_directory)
    print("ChromaDB created with document embeddings.")

## Retrieving Documents from ChromaDB

# Question used to probe the vector store
user_question = "What are the potato diseases names with dotted leaves "

# Fetch the ten chunks closest to the question
retrieved_docs = db.similarity_search(user_question, k=10)

# Preview only the three best matches, numbered from 1
for rank, hit in enumerate(retrieved_docs[:3], start=1):
    print(f"Document {rank}:\n{hit.page_content}\n{'-'*80}")

## Preparing Content for GenAI

def get_document_prompt(docs):
    """Concatenate retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a string ``page_content`` attribute.

    Returns:
        A string that starts with a newline and then contains, for each
        document in order, a ``Content:`` header line, the document text and
        a trailing blank line.
    """
    sections = ["\nContent:\n" + item.page_content + "\n\n" for item in docs]
    return "\n" + "".join(sections)

# Turn every retrieved chunk into one context string for the LLM prompt
formatted_context = get_document_prompt(docs=retrieved_docs)
print("Context formatted for GPT model.")

OpenAI embeddings initialized.
ChromaDB created with document embeddings.
Document 1:
Black dot disease of potato caused by the fungus Colletotrichum coccodes  is generally considered 
to be a weak root pathogen of potato. Recent studies have revealed, however, that this disease must 
be considered as part of the total disease complex affecting potato.
Although not as serious a tuber- or soilborne pathogen as black scurf (Rhizoctonia solani), silver 
scurf (H. solani), or common scab (S. scabies), Colletotrichum can cause severe rotting of below -
ground plant parts and early plant decline leading to discolored tubers and reduced yields. The 
same black dot organism causes anthracnose or ripe-fruit disease of tomato and can occur on other 
solanaceous crops and weed species.
7.8.1  Causal Organism
Species Associated Disease Phase Economic Importance
C. coccodes Small, Black, Dot-like Sclerotia on the Surface of Infected Stem, 
Stolon, and Tubers
Severe
7.8.2  s ymptOms
Black dot sympto

In [4]:
## Constructing the Query Prompt

# Uses `user_question` and `formatted_context` defined by earlier cells.
# The ''' markers inside the f-string are literal delimiters shown to the model
# around the retrieved context and the response template.
# The introduction sentence is spelled correctly here so it matches the wording
# used by the `ask_document_question` prompts.
prompt = f"""
## INTRODUCTION
You are a Chatbot designed to help answer technical questions about plant diseases based on the symptoms or specific plant asks.
The user asked: "{user_question}"

## CONTEXT
Technical Documentation:
'''
{formatted_context}
'''

## RESTRICTIONS
Refer to the plant specific diseases by their names.
Be clear, transparent, and factual: only state what is in the context without providing opinions or subjectivity.
Answer the question based solely on the context above; if you do not know the answer, be clear with the user that you do not know.

## TASK
First, answer directly to the user, if possible highlighting key steps.
Second, point the user in the right direction of the documentation.
Lastly, answer in Markdown format.

## RESPONSE STRUCTURE:
'''
# [Answer Title]
[answer text]

Source:
• From The Handbook of Plant Disease Identification and Management
'''
"""
print("Prompt constructed.")

Prompt constructed.


In [2]:
# Retrieve context for a question and build the LLM prompt in one cell.
# Requires `db` (the Chroma handle) to exist already: running this cell on a
# fresh kernel before the DB cell fails with NameError.

# User's query
user_question = "What are the potato diseases names with dotted leaves "

# Perform similarity search
retrieved_docs = db.similarity_search(user_question, k=10)

# Prepare the context for the LLM: one "Content:" section per retrieved chunk
formatted_context = "\n\n".join([f"Content:\n{doc.page_content}" for doc in retrieved_docs])

## Constructing the Query Prompt

# The ''' markers inside the f-string are literal delimiters shown to the model.
# The introduction sentence is spelled correctly here so it matches the wording
# used by the `ask_document_question` prompts.
prompt = f"""
## INTRODUCTION
You are a Chatbot designed to help answer technical questions about plant diseases based on the symptoms or specific plant asks.
The user asked: "{user_question}"

## CONTEXT
Technical Documentation:
'''
{formatted_context}
'''

## RESTRICTIONS
Refer to the plant specific diseases by their names.
Be clear, transparent, and factual: only state what is in the context without providing opinions or subjectivity.
Answer the question based solely on the context above; if you do not know the answer, be clear with the user that you do not know.

## TASK
First, answer directly to the user, if possible highlighting key steps.
Second, point the user in the right direction of the documentation.
Lastly, answer in Markdown format.

## RESPONSE STRUCTURE:
'''
# [Answer Title]
[answer text]

Source:
• From The Handbook of Plant Disease Identification and Management
'''
"""
print("Prompt constructed.")



NameError: name 'db' is not defined

In [7]:
import openai

# Build the OpenAI client from the key loaded earlier
client = openai.OpenAI(api_key=key)

# Single-turn request: the whole prompt travels as one user message
chat_messages = [{'role': 'user', 'content': prompt}]
completion = client.chat.completions.create(
    model='gpt-4o-mini',
    messages=chat_messages,
    temperature=0.01,
    max_tokens=3000,
)

# Take the text of the first choice and show it
answer = completion.choices[0].message.content
print("Generated response from GPT:")
print(answer)

Generated response from GPT:
# Potato Diseases with Dotted Leaves
The potato disease associated with dotted leaves is **Black Dot Disease**, caused by the fungus *Colletotrichum coccodes*. This disease manifests as small, black, dot-like sclerotia on the surface of infected stems, stolons, and tubers. Symptoms typically appear in mid to late summer, starting with yellowing and wilting of the leaves.

Source:
• From The Handbook of Plant Disease Identification and Management


In [21]:
import openai

# Create the OpenAI client with the previously loaded key
client = openai.OpenAI(api_key=key)

# Generation settings kept together so they are easy to spot
request_options = {
    'model': 'gpt-4o-mini',
    'temperature': 0.01,
    'max_tokens': 3000,
}

# Submit the prompt as a single user message
completion = client.chat.completions.create(
    messages=[{'role': 'user', 'content': prompt}],
    **request_options,
)

# Read the first choice and print it under a heading
first_choice = completion.choices[0]
answer = first_choice.message.content
print("Generated response from GPT:", answer, sep="\n")

Generated response from GPT:
# Potato Diseases with Dotted Leaves
The potato disease characterized by dotted leaves is known as **Black Dot Disease**, caused by the fungus *Colletotrichum coccodes*. This disease manifests as small, black, dot-like sclerotia on the surface of infected stems, stolons, and tubers. Symptoms typically appear in mid to late summer, starting with yellowing and wilting of the foliage.

Source:
• From The Handbook of Plant Disease Identification and Management


In [3]:
import openai
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings




# Lazily-built OpenAI client and Chroma handle, shared by every tool call.
_OPENAI_QA_COMPONENTS = {}


def _openai_qa_components():
    """Return ``(client, db)``, building them on first use and reusing them afterwards."""
    if not _OPENAI_QA_COMPONENTS:
        import os  # local import: this cell does not import os itself

        # Prefer the environment variable; otherwise read the first line of a
        # key file. The file location is configurable so that no user-specific
        # absolute path is baked into the tool.
        key = os.environ.get("OPENAI_API_KEY", "").strip()
        if not key:
            key_file = os.environ.get("OPENAI_KEY_FILE", "openai.txt")
            with open(key_file, "r") as file:
                key = file.readline().strip()

        # Initialize OpenAI Embeddings
        model_name = "text-embedding-3-large"
        embeddings = OpenAIEmbeddings(model=model_name, api_key=key)
        client = openai.OpenAI(api_key=key)

        # Load the existing ChromaDB from the persistence directory.
        # No documents are added here; it just connects to the existing collection.
        db = Chroma(persist_directory="./chroma_db_oa", embedding_function=embeddings)
        print("Connected to pre-existing ChromaDB.")

        _OPENAI_QA_COMPONENTS["client"] = client
        _OPENAI_QA_COMPONENTS["db"] = db
    return _OPENAI_QA_COMPONENTS["client"], _OPENAI_QA_COMPONENTS["db"]


def ask_document_question(question: str) -> str:
    """
    Answers a question by retrieving information from the local ChromaDB
    and generating a response using an LLM.

    The OpenAI client and the Chroma handle are created on the first call and
    reused afterwards. This assumes the ChromaDB has already been created and
    populated at './chroma_db_oa'.

    Args:
        question: The user's question about the document.

    Returns:
        A generated answer based on the document content. If the question is
        empty, a short message asking for one is returned without calling any
        service. If the LLM call fails, a string describing the error is
        returned.

    Raises:
        Exceptions raised while reading the API key, connecting to ChromaDB or
        running the similarity search are not caught and propagate to the caller.
    """

    print(f"Tool received question: '{question}'")

    # Nothing to embed or search for: answer immediately instead of spending
    # an embedding call and an LLM call on an empty query.
    if not question or not question.strip():
        return "No question was provided. Please describe the symptoms or ask a question."

    client, db = _openai_qa_components()

    # Step 4: Retrieve relevant documents from ChromaDB
    retrieved_docs = db.similarity_search(question, k=10)

    # Prepare the context for the LLM
    formatted_context = "\n\n".join([f"Content:\n{doc.page_content}" for doc in retrieved_docs])

    # Step 5: Construct the query prompt
    prompt = f"""
    ## INTRODUCTION
    You are a Chatbot designed to help answer technical questions about plant diseases based on the symptoms or specific plant asks.

    ## ROLE
    You are a symptom analysis specialist. Your primary role is to diagnose plant diseases and recommend management strategies based on a user's description of symptoms. You will use the provided document context to find relevant information.

    ## CONTEXT
    Technical Documentation:
    '''
    {formatted_context}
    '''

    ## USER QUERY
    The user has described the  symptoms or asked this question:{question}

    ## TASK
    Based *only* on the provided context:
    1.  Identify the potential plant disease(s) that match the described symptoms.
    2.  If the disease is identified, list its key characteristics and symptoms as described in the text.
    3.  If available, provide management or control recommendations.
    4.  If the context does not contain relevant information, state that the information is not available in the provided documentation.
    5. answer in Markdown format.

    ## RESTRICTIONS
    * Do not use any outside knowledge. Your entire response must be derived from the provided 'Technical Documentation'.
    * Present your findings in a structured, easy-to-read format.
    Refer to the plant-specific diseases by their names.
    * Be clear, transparent, and factual: only state what is in the context without providing opinions or subjectivity.
    * Answer the question based solely on the context above; if you do not know the answer, be clear with the user that you do not know.

    ## RESPONSE STRUCTURE
    '''
    ### [Disease Name or Diagnosis Title]

    **Symptoms:**
    - [Symptom 1]
    - [Symptom 2]
    ...

    **Management/Recommendations:**
    - [Recommendation 1]
    - [Recommendation 2]
    ...


    Source:
    • From The Handbook of Plant Disease Identification and Management
    '''
    """

    # Step 6: Call the LLM to generate the answer
    try:
        # Send the prompt to GPT
        completion = client.chat.completions.create(
            messages = [
            {'role': 'user', 'content': prompt}
            ],
            model = 'gpt-4o-mini',
            temperature = 0.01,
            max_tokens = 3000)

        # Extract the answer; the message content may be None, so fall back to
        # an empty string to honour the declared `str` return type.
        answer = completion.choices[0].message.content
        return answer or ""
    except Exception as e:
        # Best-effort tool behaviour: report the failure as text to the caller.
        return f"An error occurred while generating the answer: {e}"

In [4]:
# Prerequisite: the cell that defines `ask_document_question` has been executed.
# from .tools.document_qa_tool import ask_document_question
# import google.auth
# _, project_id = google.auth.default()
# os.environ.setdefault("GOOGLE_CLOUD_PROJECT", project_id)
# os.environ.setdefault("GOOGLE_CLOUD_LOCATION", "global")
# os.environ.setdefault("GOOGLE_GENAI_USE_VERTEXAI", "True")


# First sample question for the tool
test_question = "what Circular tan spots with dark rings and yellow halos diease in pepper?"

# Call the tool directly, the way the agent would
print("Simulating agent query...")
response = ask_document_question(test_question)

# Show what the agent would return
print("\n--- Agent Response ---", response, sep="\n")

# Second sample question
test_question_2 = "What causes black dot disease in potatoes?"

print("\nSimulating another agent query...")
response_2 = ask_document_question(test_question_2)

print("\n--- Agent Response ---", response_2, sep="\n")

Simulating agent query...
Tool received question: 'What are the symptoms and management for tomato diseases with pale green to brownish-black lesions?'
Connected to pre-existing ChromaDB.

--- Agent Response ---
### Potential Tomato Diseases with Pale Green to Brownish-Black Lesions

**Symptoms:**
- Lesions on the leaves begin as circular, water-soaked spots that become necrotic with a brown center and thin chlorotic borders.
- Spots may enlarge and develop straw-colored centers.
- Severely affected leaves turn yellow and drop, leading to defoliation.
- Fruit symptoms occur as raised, brown lesions that are wart-like in appearance.
- Narrow, elongated lesions or streaks may develop on stems, petioles, and fruits.

**Management/Recommendations:**
- Obtain the best-certified seed or transplants to prevent seedling infection.
- Practice crop rotation; avoid planting tomatoes in areas where susceptible vegetables have been grown in the previous three or four years.
- Destroy solanaceous we

## GCP

In [3]:
import os
import google.generativeai as genai
from langchain_chroma import Chroma
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_core.messages import HumanMessage

def ask_document_question(question: str) -> str:
    """
    Answers a question by retrieving information from the local ChromaDB
    and generating a response using the Gemini LLM.

    Args:
        question: The user's question about the document.

    Returns:
        A generated answer based on the document content. If the question is
        empty, a short message asking for one is returned without calling any
        service. If the LLM call fails, a string describing the error is
        returned.

    Raises:
        Exceptions raised while creating the Gemini components, connecting to
        ChromaDB or running the similarity search are not caught and propagate
        to the caller.
    """
    print(f"Tool received question: '{question}'")

    # Nothing to embed or search for: answer immediately instead of spending
    # an embedding call and an LLM call on an empty query.
    if not question or not question.strip():
        return "No question was provided. Please describe the symptoms or ask a question."

    # --- Initialize components using Google Gemini ---
    # The API key will be read from the GOOGLE_API_KEY environment variable by default.
    # You can also set it explicitly: genai.configure(api_key="YOUR_API_KEY")

    # Initialize Gemini Embeddings
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    # Initialize the Gemini Chat model
    llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0.01)

    # Load the existing ChromaDB from the persistence directory
    # Note: No documents are added here; it just connects to the existing collection.
    # NOTE(review): elsewhere in this notebook './chroma_db_oa' is populated with
    # OpenAI 'text-embedding-3-large' vectors. Querying it with Gemini embeddings
    # presumably mismatches the stored vectors - TODO confirm and, if so, build a
    # Gemini-specific collection or query with the original embedding model.
    db = Chroma(persist_directory="./chroma_db_oa", embedding_function=embeddings)
    print("Connected to pre-existing ChromaDB.")

    # Step 4: Retrieve relevant documents from ChromaDB
    retrieved_docs = db.similarity_search(question, k=10)

    # Prepare the context for the LLM
    formatted_context = "\n\n".join([f"Content:\n{doc.page_content}" for doc in retrieved_docs])

    # Step 5: Construct the query prompt
    prompt_text = f"""
    ## INTRODUCTION
    You are a Chatbot designed to help answer technical questions about plant diseases based on the symptoms or specific plant asks.

    ## ROLE
    You are a symptom analysis specialist. Your primary role is to diagnose plant diseases and recommend management strategies based on a user's description of symptoms. You will use the provided document context to find relevant information.

    ## CONTEXT
    Technical Documentation:
    '''
    {formatted_context}
    '''

    ## USER QUERY
    The user has described the symptoms or asked this question:{question}

    ## TASK
    Based *only* on the provided context:
    1.  Identify the potential plant disease(s) that match the described symptoms.
    2.  If the disease is identified, list its key characteristics and symptoms as described in the text.
    3.  If available, provide management or control recommendations.
    4.  If the context does not contain relevant information, state that the information is not available in the provided documentation.
    5. answer in Markdown format.

    ## RESTRICTIONS
    * Do not use any outside knowledge. Your entire response must be derived from the provided 'Technical Documentation'.
    * Present your findings in a structured, easy-to-read format.
    Refer to the plant-specific diseases by their names.
    * Be clear, transparent, and factual: only state what is in the context without providing opinions or subjectivity.
    * Answer the question based solely on the context above; if you do not know the answer, be clear with the user that you do not know.

    ## RESPONSE STRUCTURE
    '''
    ### [Disease Name or Diagnosis Title]

    **Symptoms:**
    - [Symptom 1]
    - [Symptom 2]
    ...

    **Management/Recommendations:**
    - [Recommendation 1]
    - [Recommendation 2]
    ...

    Source:
    • From The Handbook of Plant Disease Identification and Management
    '''
    """

    # Step 6: Call the LLM to generate the answer
    try:
        # Pass the formatted prompt to the Gemini LLM
        messages = [HumanMessage(content=prompt_text)]
        completion = llm.invoke(messages)
        answer = completion.content
        # Coerce to text so the declared `str` return type holds even if the
        # message content is not a plain string.
        return answer if isinstance(answer, str) else str(answer)
    except Exception as e:
        # Best-effort tool behaviour: report the failure as text to the caller.
        return f"An error occurred while generating the answer: {e}"

In [5]:
# Assuming you have already executed the cells that define and load the tool
# from .tools.document_qa_tool import ask_document_question
import google.auth

# Application Default Credentials lookup; only the project id is used here.
_, project_id = google.auth.default()

# The project id can be None when it cannot be determined from the
# environment. os.environ only accepts strings, so skip the default in that
# case instead of failing with a TypeError.
if project_id:
    os.environ.setdefault("GOOGLE_CLOUD_PROJECT", project_id)
os.environ.setdefault("GOOGLE_CLOUD_LOCATION", "global")
os.environ.setdefault("GOOGLE_GENAI_USE_VERTEXAI", "True")
# NOTE(review): the recorded run of this cell ended in a 403
# ACCESS_TOKEN_SCOPE_INSUFFICIENT error from the embedding call; presumably the
# credentials in use lacked the required scope or no GOOGLE_API_KEY was set -
# TODO confirm.



# Define a question to ask the agent
test_question = "what Circular tan spots with dark rings and yellow halos diease in pepper?"

# Simulate the agent's behavior by calling the tool directly with the question
print("Simulating agent query...")
response = ask_document_question(test_question)

# Print the response to see the agent's output
print("\n--- Agent Response ---")
print(response)

# Example of another question
test_question_2 = "What causes black dot disease in potatoes?"

print("\nSimulating another agent query...")
response_2 = ask_document_question(test_question_2)

print("\n--- Agent Response ---")
print(response_2)

I0000 00:00:1756528347.856888 5114107 fork_posix.cc:71] Other threads are currently calling into gRPC, skipping fork() handlers


Simulating agent query...
Tool received question: 'What are the symptoms and management for tomato diseases with pale green to brownish-black lesions?'


I0000 00:00:1756528348.206760 5114107 fork_posix.cc:71] Other threads are currently calling into gRPC, skipping fork() handlers
I0000 00:00:1756528348.669431 5114107 fork_posix.cc:71] Other threads are currently calling into gRPC, skipping fork() handlers
I0000 00:00:1756528349.032593 5114107 fork_posix.cc:71] Other threads are currently calling into gRPC, skipping fork() handlers


Connected to pre-existing ChromaDB.


GoogleGenerativeAIError: Error embedding content: 403 Request had insufficient authentication scopes. [reason: "ACCESS_TOKEN_SCOPE_INSUFFICIENT"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "generativelanguage.googleapis.com"
}
metadata {
  key: "method"
  value: "google.ai.generativelanguage.v1beta.GenerativeService.EmbedContent"
}
]