In [1]:
!pip install langchain-google-genai langchain_community tiktoken langchain
!pip install faiss-cpu wikipedia arxiv langchain-cohere



In [2]:
import pandas as pd

In [3]:
# Load the list of pages to scrape from the uploaded spreadsheet (column `url`).
df = pd.read_excel('/content/urls.xlsx')

# Blank spreadsheet rows are read as NaN; drop them and trim stray whitespace so that
# every entry handed to the scrapers below is a usable URL string.
urls = df['url'].dropna().astype(str).str.strip().tolist()

# Small hand-picked subset for quick experiments; uncomment to skip the full crawl.
# urls = ['https://plantix.net/en/library/plant-diseases/300037/aster-yellows-phytoplasma/',
#         'https://plantix.net/en/library/plant-diseases/100323/anthracnose/',
#         'https://plantix.net/en/library/plant-diseases/500008/yellow-vine-mite/',
#         'https://plantix.net/en/library/plant-diseases/100348/ring-spot-of-sugarcane/',
#         'https://plantix.net/en/library/plant-diseases/100213/lentil-rust/',
#         'https://plantix.net/en/library/plant-diseases/800049/physiological-leaf-spot/']

# Preview the first few URLs.
urls[:5]

['https://plantix.net/en/library/plant-diseases/100323/anthracnose/',
 'https://plantix.net/en/library/plant-diseases/500008/yellow-vine-mite/',
 'https://plantix.net/en/library/plant-diseases/100348/ring-spot-of-sugarcane/',
 'https://plantix.net/en/library/plant-diseases/100213/lentil-rust/',
 'https://plantix.net/en/library/plant-diseases/800049/physiological-leaf-spot/']

# web scraper
## 1st method: using AsyncHtmlLoader and BeautifulSoupTransformer

This approach extracts the full text of every page, but it can also pick up irrelevant content such as the page header and footer.
This noise in the data makes the search process more difficult.


In [4]:
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain.document_loaders import AsyncHtmlLoader



In [5]:
# Method 1: fetch the first five pages in one batch.
loader = AsyncHtmlLoader(urls[:5])
docs = loader.load()

# Page content of the first document; not referenced again in the cells shown here.
html = docs[0].page_content

# Pass the loaded documents through the BeautifulSoup transformer, restricted to these tags.
tags = ["p", "li", "div"]
bs_transformer = BeautifulSoupTransformer()
docs_transformed = bs_transformer.transform_documents(docs, tags_to_extract=tags)

Fetching pages: 100%|##########| 5/5 [00:02<00:00,  2.36it/s]


In [6]:
# Display the first entry of `docs`; its text still contains navigation and banner content.
docs[0]

Document(metadata={'source': 'https://plantix.net/en/library/plant-diseases/100323/anthracnose/', 'title': 'Anthracnose | Pests & Diseases', 'description': 'Water-soaked lesions on leaves, stems, pods or fruits. Oval lesions surrounded by vividly colored margin. The lower stem dark-brown and rough. Defoliation, lodging of plants or top die back of branches in severe cases. Crops Banana, Other, Mango, ...', 'language': 'en'}, page_content='App     Library     News     Company     B2B Solutions           Get the app      Anthracnose  Library   Pests & Diseases      Anthracnose  Guava      Anthracnose Colletotrichum spp. Fungus Heal your crop Take a picture See diagnosis Get medicine Get Plantix app In a Nutshell Water-soaked lesions on leaves, stems, pods or fruits. Oval lesions surrounded by vividly colored margin. The lower stem dark-brown and rough. Defoliation, lodging of plants or top die back of branches in severe cases.  Can also be found in  24 Crops  Almond   Apple     Apricot  

## 2nd method: using requests.get and BeautifulSoup library with a custom CSS Selector

A custom CSS selector was generated with the SelectorGadget extension for Chrome to extract only the informative sections.

Using a custom selector requires an explicit for loop, but it greatly improves the search results.

In [7]:
!pip install beautifulsoup4 requests



In [8]:
import requests
from bs4 import BeautifulSoup
from langchain_core.documents import Document

In [9]:
# CSS selector listing the page elements whose text is kept for each disease page.
# NOTE(review): `.zoo-cookie-consent-popup` looks like cookie-banner markup rather than
# disease content — confirm whether it belongs in this selector.
selector = ".trigger-card p, .trigger-card h2, .host-heading, .crop-info, .preventive-measures-list li, .product-recommendations-header, .product-recommendations, .symptoms p, .symptoms h2, #overview li, .in-a-nutshell-header, .class_text, .scientific-name, .zoo-cookie-consent-popup, .disease-name"

docs_transformed = []
failed_urls = []  # pages that could not be fetched or yielded no text, kept for inspection/retry

# estimated time of run: 5min. Use the 6 test urls if you need to see quick results.
# A single session reuses the underlying connection across requests.
with requests.Session() as session:
    for url in urls:
        try:
            response = session.get(url, timeout=10)
            # Treat 4xx/5xx responses as failures so error pages are never indexed as content.
            response.raise_for_status()
        except requests.RequestException as exc:
            # Best effort: one unreachable page must not abort the whole crawl.
            print(f"Skipping {url}: {exc}")
            failed_urls.append(url)
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        selected_elements = soup.select(selector)
        selected_texts = [el.get_text(strip=True) for el in selected_elements]
        transformed_content = "\n".join(selected_texts)

        if not transformed_content:
            # Nothing matched the selector (e.g. a redirect or placeholder page): do not
            # add an empty document to the corpus.
            print(f"Skipping {url}: no content matched the selector")
            failed_urls.append(url)
            continue

        docs_transformed.append(Document(page_content=transformed_content, metadata={"source": url}))

In [10]:
# Display the first document produced by the selector-based scraper.
docs_transformed[0]

Document(metadata={'source': 'https://plantix.net/en/library/plant-diseases/100323/anthracnose/'}, page_content='Anthracnose\nColletotrichum spp.\nFungus\nIn a Nutshell\nWater-soaked lesions on leaves, stems, pods or fruits.\nOval lesions surrounded by vividly colored margin.\nThe lower stem dark-brown and rough.\nDefoliation, lodging of plants or top die back of branches in severe cases.\nCan also be found in\nAlmond\nApple\nApricot\nBanana\nMore\nSymptoms\nType of crop, variety and environmental conditions will influence the severity of symptoms. Gray to tan-coloured lesions appear on leaves, stems, pods or fruits. These spots can be circular, oval or irregular in shape and with dark brown, reddish or purplish margins. Under favorable weather conditions, they become more numerous, enlarge and coalesce, turning dark brown or black in the process. Their center gradually becomes grayish and, in the later phases of the infection, it may show tiny dispersed black flecks. A reddish discolo

In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Split every scraped page into chunks (chunk_size=800, chunk_overlap=20) so that each
# chunk can be embedded and retrieved on its own.
# NOTE(review): the similarity-search output further down contains very short chunks
# (e.g. 'the citrus leprosis virus.'); consider revisiting these settings.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=20,
)

text_chunks = text_splitter.split_documents(docs_transformed)

In [13]:
import os
from google.colab import userdata
# Copy the API keys from Colab's secret store into environment variables.
# Maps environment variable name -> name of the Colab secret that holds its value.
_SECRET_FOR_ENV = {
    'GOOGLE_API_KEY': "GOOGLE_API_KEY",
    'COHERE_API_KEY': "CO_API_KEY",
}
for env_var, secret_name in _SECRET_FOR_ENV.items():
    os.environ[env_var] = userdata.get(secret_name)

In [14]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
# Embedding model passed to FAISS both when building the index and when loading it back.
embedding = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
)

In [15]:
from langchain.vectorstores import FAISS
# Embed all chunks and build the FAISS vector store from them.
vs = FAISS.from_documents(
    documents=text_chunks,
    embedding=embedding,
)

In [16]:
# Persist the index to Google Drive so it can be reused in a later Colab session.
# Drive must be mounted first: no visible cell mounts it, and without the mount the
# target path would presumably just be created on the temporary VM disk.
from google.colab import drive

drive.mount('/content/drive')
vs.save_local('/content/drive/MyDrive/plantix/plantix_faiss')

In [17]:
# Reload the saved index and run a sample query against it.
# SECURITY: allow_dangerous_deserialization=True opts in to loading pickled data.
# That is acceptable only because this notebook wrote the file itself; never enable it
# for an index obtained from an untrusted source.
vs = FAISS.load_local(
    '/content/drive/MyDrive/plantix/plantix_faiss',
    embedding,
    allow_dangerous_deserialization=True,
)

# Top three chunks with their relevance scores.
vs.similarity_search_with_relevance_scores('aster yellows phytoplasma', k=3)

[(Document(id='6edd0688-2bc8-47fd-95e7-85e4f0b68058', metadata={'source': 'https://plantix.net/en/library/plant-diseases/200026/citrus-leprosis/'}, page_content='the citrus leprosis virus.'),
  np.float32(0.8006729)),
 (Document(id='70eb5203-e3ab-480e-aa9b-0942b9ba4227', metadata={'source': 'https://plantix.net/en/library/plant-diseases/300037/aster-yellows-phytoplasma/'}, page_content='Aster Yellows Phytoplasma\nPhytoplasma asteris\nBacteria\nIn a Nutshell\nClearing of the leaf veins that extends to the rest of the leaf blade.\nDeformation and greening of flowers, development of leaf-like flower petals and formation of sterile flowers.\nOverall, plants have a reduced root system and a stunted appearance.\nCan also be found in\nBitter Gourd\nCabbage\nCarrot\nLettuce\nMore\nSymptoms'),
  np.float32(0.79328173)),
 (Document(id='871e854e-4453-44bc-8c42-5630d56d7e71', metadata={'source': 'https://plantix.net/en/library/plant-diseases/100144/anthracnose-of-almond/'}, page_content='bloom as 

In [18]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.tools.retriever import create_retriever_tool
from langchain.agents import AgentExecutor, create_react_agent
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.docstore.document import Document
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage # Import if needed, good for clarity


In [19]:
# Chat model used by the agent below, configured with temperature 0.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0,
)


In [20]:
# Wrap the FAISS index as a retriever tool named "plantix" for the agent.
# NOTE(review): the description contains the typo "deseases"; it is runtime text that is
# printed below, so it is left unchanged here.
retriever_tool = create_retriever_tool(
    vs.as_retriever(),
    name="plantix",
    description="Search for plant diseases and related information. Useful for when you need to answer questions about gardening, plants, and plant deseases.",
)

# Tools made available to the agent.
tools = [retriever_tool]

In [21]:
# Print the name and description of every registered tool.
for tool in tools:
    print("tool 's name: ", tool.name)
    print("tool 's description", tool.description, "\n")

tool 's name:  plantix
tool 's description Search for plant diseases and related information. Useful for when you need to answer questions about gardening, plants, and plant deseases. 



In [22]:
from langchain.agents import AgentExecutor, create_react_agent
from langchain import hub
# Pull the "hwchase17/react" prompt from the LangChain Hub and print its template text.
# The template exposes the {tools}, {tool_names}, {input} and {agent_scratchpad} slots.
prompt = hub.pull("hwchase17/react")
print(prompt.template)


Answer the following questions as best you can. You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!

Question: {input}
Thought:{agent_scratchpad}




In [23]:
# Build the ReAct agent from the chat model, the tool list and the hub prompt.
agent = create_react_agent(llm, tools, prompt)

# Executor that runs the Thought/Action/Observation loop.
# - verbose=True prints every intermediate step.
# - max_iterations=20 caps the number of loop steps.
# - handle_parsing_errors=True feeds a malformed model reply (one that does not follow the
#   ReAct format) back to the model as an observation instead of raising and aborting.
agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    verbose=True,
    max_iterations=20,
    handle_parsing_errors=True,
)

In [24]:
# Sample questions about aster yellows phytoplasma used to exercise the agent.
queries = [
    "What is the primary cause of aster yellows-phytoplasma in plants?",
    "Which plants are commonly affected by aster yellows phytoplasma?",
    "How does aster yellows-phytoplasma spread from plant to plant?",
    "Are there any specific symptoms or signs that indicate the presence of aster yellows-phytoplasma in plants?",
    " What are the potential economic impacts of aster yellows-phytoplasma on crop yields?",
    "what are the recommended control measures for managing aster yellows-phytoplasma in affected crops?",
]

# Index of the question to ask; change `j` to try a different one.
j = 1
query = queries[j]

In [25]:
# Run the agent on the selected question; the prompt's {input} slot receives `query`.
agent_executor.invoke({"input": query})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to find out which plants are commonly affected by aster yellows phytoplasma. I can use the plantix tool to search for this information.
Action: plantix
Action Input: "plants affected by aster yellows phytoplasma"[0m[36;1m[1;3mRemove and burn infected plants residues.

(discoloration). Overall, plants wilt and in severe cases, damping off can be observed.

the citrus leprosis virus.

spreads further, large yellow patches of dying plants can be observed from the distance in the field.[0m[32;1m[1;3mThe information returned by the plantix tool is not specific enough to answer the question. I need to refine my search to get a list of plants affected by aster yellows phytoplasma.
Action: plantix
Action Input: "aster yellows phytoplasma host plants"[0m[36;1m[1;3mthe citrus leprosis virus.

of alternative hosts in the cereal family.

Aster Yellows Phytoplasma
Phytoplasma asteris
Bacteria
In a Nutshell
Clearing of the 

{'input': 'Which plants are commonly affected by aster yellows phytoplasma?',
 'output': 'Bitter Gourd, Cabbage, Carrot, and Lettuce are some plants that can be affected by aster yellows phytoplasma.'}