In [1]:
# please enable if you are using a package manager like pip or conda
# Rajiv is using uv for package management hence these are commented out
# !pip install langchain_openai
# !pip install langchain_chroma
# !pip install langchain_groq
# !pip install langchain_community
# !pip install langchain_core
# !pip install langchain_ray
# !pip install chromadb


In [2]:
import pandas as pd
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_groq import ChatGroq
from langchain_community.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_community.vectorstores import chroma
from langchain_community.llms import openai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.llms import HuggingFacePipeline

# deprecated legacy library
# from langchain.embeddings import HuggingFaceEmbeddings
import ray
import os
import dask.dataframe as dd
import logging
import torch
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
import chromadb
from tqdm import tqdm


In [3]:
logging.basicConfig(
    level=logging.INFO,  # Set the minimum level of messages to display
    format="%(asctime)s - %(levelname)s - %(message)s",
)


In [4]:
load_dotenv()
LOCAL_MODEL_PATH = os.getenv("LOCAL_MODEL_PATH")
HUB_MODEL_ID = os.getenv("HUB_MODEL_ID")
LOCAL_VECTOR_DB_PATH = os.getenv("LOCAL_VECTOR_DB_PATH")


In [5]:
logging.info(f"LOCAL_MODEL_PATH: {LOCAL_MODEL_PATH}")
logging.info(f"HUB_MODEL_ID: {HUB_MODEL_ID}")
logging.info(f"LOCAL_VECTOR_DB_PATH: {LOCAL_VECTOR_DB_PATH}")


2025-09-12 14:27:20,038 - INFO - LOCAL_MODEL_PATH: ./multi-qa-MiniLM-L6-cos-v1-local
2025-09-12 14:27:20,039 - INFO - HUB_MODEL_ID: sentence-transformers/multi-qa-MiniLM-L6-cos-v1
2025-09-12 14:27:20,039 - INFO - LOCAL_VECTOR_DB_PATH: ./chroma_capstone_db_new_small


In [6]:
if torch.cuda.is_available():
    device = "cuda"
    logging.info("Found NVIDIA GPU, using 'cuda' as the device.")
    logging.info(f"CUDA device count: {torch.cuda.device_count()}")
elif torch.backends.mps.is_available():
    device = "mps"
    logging.info("Found Apple Silicon GPU, using 'mps' as the device.")
    logging.info(f"MPS device count: 1")
else:
    device = "cpu"
    logging.warning("No GPU found, falling back to 'cpu'. This will be slower.")
    logging.info(f"CPU device count: {torch.device_count()}")


2025-09-12 14:27:20,056 - INFO - Found Apple Silicon GPU, using 'mps' as the device.
2025-09-12 14:27:20,056 - INFO - MPS device count: 1


In [7]:
df = pd.read_csv("filtered_data.csv")


In [8]:
print(f"data frame shape: {df.shape} \n")
print(f"data frame columns: {df.columns} \n")


data frame shape: (154081, 21) 

data frame columns: Index(['Unnamed: 0', 'BlockName', 'Category', 'Year', 'Month', 'Day', 'Crop',
       'DistrictName', 'QueryType', 'Season', 'Sector', 'StateName',
       'QueryText', 'KccAns', 'latitude', 'longitude', 'Disease',
       'kcc_word_count', 'QueryText_word_count', 'Season English',
       'BlockNamenew'],
      dtype='object') 



In [9]:
df.head()


Unnamed: 0.1,Unnamed: 0,BlockName,Category,Year,Month,Day,Crop,DistrictName,QueryType,Season,...,StateName,QueryText,KccAns,latitude,longitude,Disease,kcc_word_count,QueryText_word_count,Season English,BlockNamenew
0,0,0,0,2010,6,23,Apple,ANANTNAG,0,JAYAD,...,JAMMU AND KASHMIR,control of alternaria leaf blotch in apple tree,keep 60 cm soil around trees trunks undisturde...,33.7461,75.1854,Alternaria_leaf_blotch,9,8,Summer Cropping Season,0
1,1,0,0,2010,6,18,Apple,BARAMULLA,0,JAYAD,...,JAMMU AND KASHMIR,farmer wants information about the control of ...,the control of alternaria leaf blotch in apple...,34.2087,74.3435,Alternaria_leaf_blotch,19,13,Summer Cropping Season,0
2,23,DACHNIPORA,Fruits,2020,7,31,Apple,ANANTNAG,Plant Protection,,...,JAMMU AND KASHMIR,farmer asked the query regarding problem alter...,it is advised to the farmer spray one of the f...,33.7461,75.1854,Alternaria_leaf_blotch,73,11,,DACHNIPORA
3,24,SHOPIAN,Fruits,2020,7,2,Apple,SHUPIYAN,Plant Protection,,...,JAMMU AND KASHMIR,farmer asked the query regarding problem of al...,-- it is advised to the farmer spray in case r...,33.6649,75.163,Alternaria_leaf_blotch,65,13,,SHOPIAN
4,25,SOPORE,Fruits,2022,7,6,Apple,BARAMULLA,Plant Protection,,...,JAMMU AND KASHMIR,farmer asked a query regarding alternaria leaf...,fruit develop ment-iv x 12-18 days after v...,34.2087,74.3435,Alternaria_leaf_blotch,70,12,,SOPORE


In [10]:
unique_crops = df["Crop"].unique()
print(unique_crops)


['Apple' 'Coconut' 'Paddy Dhan' 'Potato' 'Tomato']


In [11]:
df["Crop"] = df["Crop"].replace("Paddy Dhan", "Paddy_Dhan")


In [12]:
categories = df["Crop"].unique()
# categories[2] = categories[2].replace("Paddy Dhan","Paddy_Dhan")
category_groups = {cat: df[df["Crop"] == cat] for cat in categories}


In [13]:
category_groups


{'Apple':       Unnamed: 0   BlockName Category  Year  Month  Day   Crop DistrictName  \
 0              0        0           0  2010      6   23  Apple     ANANTNAG   
 1              1        0           0  2010      6   18  Apple    BARAMULLA   
 2             23  DACHNIPORA   Fruits  2020      7   31  Apple     ANANTNAG   
 3             24     SHOPIAN   Fruits  2020      7    2  Apple     SHUPIYAN   
 4             25      SOPORE   Fruits  2022      7    6  Apple    BARAMULLA   
 ...          ...         ...      ...   ...    ...  ...    ...          ...   
 5660       15872        TRAL   Fruits  2024     12   13  Apple      PULWAMA   
 5661       15873      KANGAN   Fruits  2024     12   10  Apple    GANDERBAL   
 5662       15884  DACHNIPORA   Fruits  2025      2   16  Apple     ANANTNAG   
 5663       15885      BUDGAM   Fruits  2025      2   15  Apple       BADGAM   
 5664       15886    CHADOORA   Fruits  2025      2   17  Apple       BADGAM   
 
              QueryType Seaso

In [14]:
df = df.rename(columns={"Season English": "Season_English"})


In [15]:
if os.path.isdir(LOCAL_MODEL_PATH):
    logging.info(f"✅ Loading model from local path: {LOCAL_MODEL_PATH}")
    model_to_use = LOCAL_MODEL_PATH
else:
    logging.warning(
        f"⚠️ Local model not found at {LOCAL_MODEL_PATH}. Downloading from Hub: {HUB_MODEL_ID}"
    )
    !uv add "huggingface_hub[cli]"
    # !hf download BAAI/bge-large-en-v1.5 --local-dir ./bge-large-en-v1.5-local
    !hf download {HUB_MODEL_ID} --local-dir {LOCAL_MODEL_PATH}
    model_to_use = LOCAL_MODEL_PATH
    # model_to_use = HUB_MODEL_ID

embedding = None
try:
    embedding = HuggingFaceEmbeddings(
        model_name=model_to_use, model_kwargs={"device": device}
    )
    logging.info("Embedding model loaded successfully.")
except Exception as e:
    logging.error(f"Failed to load embedding model: {e}")
    embedding = None
logging.info(f"embedding: {embedding}")




[2mResolved [1m223 packages[0m [2min 3ms[0m[0m
[2mAudited [1m202 packages[0m [2min 0.03ms[0m[0m
Fetching 29 files:   0%|                                 | 0/29 [00:00<?, ?it/s]Downloading 'model.safetensors' to 'multi-qa-MiniLM-L6-cos-v1-local/.cache/huggingface/download/xGOKKLRSlIhH692hSVvI1-gpoa8=.7bec4fd9eba43073d5c5dcf1b79b0a3397608fa063e6f626d6f8fd70a81f2d8c.incomplete'
Downloading 'data_config.json' to 'multi-qa-MiniLM-L6-cos-v1-local/.cache/huggingface/download/x2Rt2861-Bgu_eYvy93NIBMPOFg=.a3294881c9834ac6fb99d420008f30b741fe7e86.incomplete'
Downloading '1_Pooling/config.json' to 'multi-qa-MiniLM-L6-cos-v1-local/.cache/huggingface/download/1_Pooling/8_PA_wEVGiVa2goH2H4KQOQpvVY=.d1514c3162bbe87b343f565fadc62e6c06f04f03.incomplete'
Downloading '.gitattributes' to 'multi-qa-MiniLM-L6-cos-v1-local/.cache/huggingface/download/wPaCkH-WbT7GsmxMKKrNZTV4nSM=.8d1395b12188a31e8a22acea211f312d818744e9.incomplete'

data_config.json: 25.5kB [00:00, 142MB/s]
Download complete. Movi

2025-09-12 14:27:38,135 - INFO - Load pretrained SentenceTransformer: ./multi-qa-MiniLM-L6-cos-v1-local
2025-09-12 14:27:38,242 - INFO - Embedding model loaded successfully.
2025-09-12 14:27:38,243 - INFO - embedding: model_name='./multi-qa-MiniLM-L6-cos-v1-local' cache_folder=None model_kwargs={'device': 'mps'} encode_kwargs={} query_encode_kwargs={} multi_process=False show_progress=False


#### Renaming the calender names

In [16]:
import calendar

month_number = 3
month_name = calendar.month_name[month_number]


In [17]:
df["Month"] = df["Month"].apply(lambda x: calendar.month_name[x])
df["Month"][:5]


0    June
1    June
2    July
3    July
4    July
Name: Month, dtype: object

In [18]:
df[df["Crop"] == "Tomato"].head(5)


Unnamed: 0.1,Unnamed: 0,BlockName,Category,Year,Month,Day,Crop,DistrictName,QueryType,Season,...,StateName,QueryText,KccAns,latitude,longitude,Disease,kcc_word_count,QueryText_word_count,Season_English,BlockNamenew
140468,568214,0,0,2009,May,3,Tomato,PULWAMA,29,JAYAD,...,JAMMU AND KASHMIR,farmer wants to control over aphids in tomato,spray with endosulfan 35ec 3ml in 1litre of w...,33.949,75.0418,Aphids,9,8,Summer Cropping Season,0
140469,568215,0,0,2009,April,11,Tomato,BADGAM,29,KHARIF,...,JAMMU AND KASHMIR,how to control aphids in thge seedlings of tomato,spray melathion 50 ec 1 ml in 1 liter of water,34.0385,74.736,Aphids,11,9,Monsoon Cropping Season,0
140470,568217,0,0,2009,June,8,Tomato,BADGAM,29,JAYAD,...,JAMMU AND KASHMIR,information about the control of aphids in tomato,spray with endosulfan 35 ec 2mllt of water,34.0385,74.736,Aphids,8,8,Summer Cropping Season,0
140471,568218,0,0,2009,June,18,Tomato,BADGAM,29,JAYAD,...,JAMMU AND KASHMIR,information regarding control of aphids in tomato,spray malathion 35ec 25mllitre of water,34.0385,74.736,Aphids,6,7,Summer Cropping Season,0
140472,568220,0,0,2009,January,20,Tomato,JAMMU,29,KHARIF,...,JAMMU AND KASHMIR,control of aphids in tomato crop,spray metasystox 25ec 2-3mllt of water,32.7186,74.8581,Aphids,6,6,Monsoon Cropping Season,0


## Loading data to vector database

In [19]:
category_groups = {cat: df[df["Crop"] == cat] for cat in categories}


In [20]:
# chromadb.api.client.SharedSystemClient.clear_system_cache()


In [21]:
# Store documents per category
for cat, df_cat in tqdm(category_groups.items(), desc="Processing categories"):
    print(len(df_cat))
    docs = [
        Document(
            page_content=f"DistrictName:{row['DistrictName']}\nStateName:{row['StateName']}\nSeason_English:{row['Season_English']}\nMonth:{row['Month']}\nDisease:{row['Disease']}\nQueryText:{row['QueryText']}\nKccAns:{row['KccAns']}",
            metadata={"category": cat},
        )
        for _, row in df_cat.iterrows()
    ]

    chroma_collection = Chroma.from_documents(
        docs,
        embedding=embedding,
        persist_directory=LOCAL_VECTOR_DB_PATH,
        collection_name=cat,
    )

    # chroma_collection.persist()


Processing categories:   0%|          | 0/5 [00:00<?, ?it/s]2025-09-12 14:27:59,926 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


5665


Processing categories:  20%|██        | 1/5 [00:07<00:29,  7.30s/it]

7453


Processing categories:  40%|████      | 2/5 [00:15<00:23,  7.74s/it]

111609


Processing categories:  60%|██████    | 3/5 [01:57<01:41, 50.83s/it]

15741


Processing categories:  80%|████████  | 4/5 [02:11<00:36, 36.14s/it]

13613


Processing categories: 100%|██████████| 5/5 [02:22<00:00, 28.46s/it]


In [22]:
chroma_db = Chroma(
    persist_directory=LOCAL_VECTOR_DB_PATH,
    embedding_function=embedding,
    collection_name="Tomato",  # Specify which collection to load
)


In [23]:
print(chroma_db._collection.count())


13613


In [24]:
question = "give me the cure for tomato plant"
docs = chroma_db.max_marginal_relevance_search(question, k=3, fetch_k=6)


In [25]:
retriever = chroma_db.as_retriever(
    search_type="mmr", search_kwargs={"k": 3, "fetch_k": 6}
)


In [26]:
docs = retriever.invoke(question)
docs


[Document(id='e54f6267-aae6-4534-84a9-4fb8fa303559', metadata={'category': 'Tomato'}, page_content='DistrictName:CHITTOOR\nStateName:ANDHRA PRADESH\nSeason_English:nan\nMonth:February\nDisease:Leaf_miner\nQueryText:tomato leaf miner management\nKccAns:300    200         recommended to spray novaluron 525  emamectin benzoate 09barazide 320ml200 liter of wateracre'),
 Document(id='ba0415a0-e46c-4a2c-b38b-ead221767478', metadata={'category': 'Tomato'}, page_content='DistrictName:KEONJHAR\nStateName:ODISHA\nSeason_English:nan\nMonth:February\nDisease:Leaf_miner\nQueryText:leaf miner in tomato\nKccAns:recommended to spray copper oxychloride 50wp  500gm  plantomycin  200gm with 200 liter of water per acre for prevention of root rot in tomato'),
 Document(id='2ec5ed39-7b5f-4e74-8a0b-6f254acc8272', metadata={'category': 'Tomato'}, page_content='DistrictName:WARANGAL RURAL\nStateName:TELANGANA\nSeason_English:nan\nMonth:December\nDisease:Fruit_borer\nQueryText:tomato fruit borer management -\nK

### Performance test the chromadb on the embedding model that we are using

In [27]:
import time
from collections import deque
from datetime import datetime


class ChromaDBPerformanceMonitor:
    """
    Real-time performance monitoring for ChromaDB queries
    """

    def __init__(self, window_size=100):
        self.window_size = window_size
        self.query_times = deque(maxlen=window_size)
        self.query_history = []

    def timed_query(self, retriever, question, metadata=None):
        """
        Execute query with timing and store results
        """
        timestamp = datetime.now()
        start_time = time.perf_counter()

        docs = retriever.invoke(question)

        end_time = time.perf_counter()
        query_time = end_time - start_time

        # Store timing data
        self.query_times.append(query_time)

        query_record = {
            "timestamp": timestamp,
            "question": question,
            "query_time": query_time,
            "doc_count": len(docs),
            "metadata": metadata or {},
        }
        self.query_history.append(query_record)

        return docs, query_record

    def get_stats(self):
        """
        Get current performance statistics
        """
        if not self.query_times:
            return None

        recent_times = list(self.query_times)

        stats = {
            "total_queries": len(self.query_history),
            "recent_queries": len(recent_times),
            "avg_time": sum(recent_times) / len(recent_times),
            "min_time": min(recent_times),
            "max_time": max(recent_times),
            "queries_per_second": 1 / (sum(recent_times) / len(recent_times)),
            "last_query_time": recent_times[-1] if recent_times else None,
        }

        return stats

    def print_stats(self):
        """
        Print current performance statistics
        """
        stats = self.get_stats()
        if not stats:
            print("No queries executed yet")
            return

        print(
            f"\n📈 ChromaDB Performance Stats (last {stats['recent_queries']} queries)"
        )
        print("-" * 50)
        print(f"Average query time: {stats['avg_time']:.4f}s")
        print(f"Fastest query: {stats['min_time']:.4f}s")
        print(f"Slowest query: {stats['max_time']:.4f}s")
        print(f"Queries per second: {stats['queries_per_second']:.2f}")
        print(f"Total queries: {stats['total_queries']}")


In [28]:
# Usage
monitor = ChromaDBPerformanceMonitor()

# Execute queries with monitoring
questions = [
    "I dont know my state name. I have tomato plants with disease. What should I do?",
    "Coconut palm pest control",
    "Tomato disease treatment methods",
    "Paddy disease treatment methods",
    "It is summer in karnataka state. I have tomato plants with disease. What should I do?",
]

for question in questions:
    docs, record = monitor.timed_query(retriever, question)
    print(
        f"Query: {question[:40]}... ({record['query_time']:.4f}s, {record['doc_count']} docs)"
    )

# Get overall statistics
monitor.print_stats()


Query: I dont know my state name. I have tomato... (0.0647s, 3 docs)
Query: Coconut palm pest control... (0.0338s, 3 docs)
Query: Tomato disease treatment methods... (0.0085s, 3 docs)
Query: Paddy disease treatment methods... (0.0083s, 3 docs)
Query: It is summer in karnataka state. I have ... (0.0260s, 3 docs)

📈 ChromaDB Performance Stats (last 5 queries)
--------------------------------------------------
Average query time: 0.0283s
Fastest query: 0.0083s
Slowest query: 0.0647s
Queries per second: 35.38
Total queries: 5


## Call the LLM to get prescription

In [29]:
from langchain.schema.runnable import RunnableParallel, RunnablePassthrough
from langchain_huggingface import HuggingFaceEndpoint
from langchain_core.output_parsers import StrOutputParser


In [30]:
# Build prompt
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Always say "thanks for asking!" at the end of the answer.
{context}
Question: {question}
Helpful Answer:"""

QA_PROMPT = PromptTemplate(input_variables=["context", "question"], template=template)


In [31]:
retrieval = RunnableParallel(
    {
        "context": RunnablePassthrough(context=lambda x: x["question"] | retriever),
        "question": RunnablePassthrough(),
    }
)


In [32]:
from langchain_huggingface import HuggingFacePipeline

chat_llm = HuggingFacePipeline.from_model_id(
    model_id="Qwen/Qwen3-4B-Thinking-2507",
    task="text-generation",
    model_kwargs={
        # "max_new_tokens": 100,
        "top_k": 30,
        "temperature": 0.5,
        "repetition_penalty": 1.03,
    },
)


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use mps:0


In [33]:
# from langchain_community.chat_models import ChatHuggingFace
from langchain_huggingface import ChatHuggingFace
from langchain.chains import RetrievalQA


In [34]:
llm = ChatHuggingFace(llm=chat_llm)


In [35]:
question = "give me the cure for tomato plant in JAMMU AND KASHMIR for Aphids"

qa_chain = RetrievalQA.from_chain_type(
    llm, retriever=chroma_db.as_retriever(), return_source_documents=True
)

result = qa_chain.invoke({"query": question})


In [36]:
print(result["result"])


<|im_start|>system
Use the following pieces of context to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
DistrictName:BADGAM
StateName:JAMMU AND KASHMIR
Season_English:Monsoon Cropping Season
Month:April
Disease:Aphids
QueryText:how to control aphids in thge seedlings of tomato
KccAns:spray melathion 50 ec  1 ml in 1 liter of water

DistrictName:JAMMU
StateName:JAMMU AND KASHMIR
Season_English:Summer Cropping Season
Month:April
Disease:Aphids
QueryText:farmer wants information about the control of aphids in tomato
KccAns:spray melathion  250 ml in 100 lt of wateracre

DistrictName:JAMMU
StateName:JAMMU AND KASHMIR
Season_English:Summer Cropping Season
Month:April
Disease:Aphids
QueryText:information abot the control of aphids in tomato
KccAns:spray with endosulfan 35 ec 2mllt of water

DistrictName:PULWAMA
StateName:JAMMU AND KASHMIR
Season_English:Summer Cropping Season
Month:May
Disease:Aphids
Q

In [None]:
import shutil

# Zip the Chroma DB folder
shutil.make_archive("chroma_capstone_db_new", 'zip', "./chroma_capstone_db_new")


In [None]:
from google.colab import files
files.download("chroma_capstone_db_new.zip")


In [None]:
category_groups.keys()
