In [2]:
import pandas as pd
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_groq import ChatGroq
from langchain_community.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain_core.prompts import ChatPromptTemplate,PromptTemplate
from langchain_community.vectorstores import chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEndpoint
from langchain import HuggingFacePipeline
from transformers import pipeline
# RAG Chain
from langchain_core.output_parsers import StrOutputParser
from langchain.chains import RetrievalQA
from transformers import AutoModelForCausalLM,AutoTokenizer
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
from langchain import HuggingFacePipeline
import transformers
import torch  
import os
from langchain_huggingface import HuggingFaceEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


#### Defining the model

In [3]:
model = 'microsoft/phi-2'

tokenizer = AutoTokenizer.from_pretrained(model)

model = AutoModelForCausalLM.from_pretrained(model,torch_dtype="auto",device_map='auto')

pipe = pipeline("text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    max_new_tokens=256,
    temperature=0.5)

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it]
Some parameters are on the meta device because they were offloaded to the cpu.
Device set to use cuda:0


#### Embbeding and using vector database

In [4]:
embedding = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large-instruct",model_kwargs={"device": "cuda"})
chroma_db = Chroma(
    persist_directory="./chroma_capstone_db_new",
    embedding_function=embedding,
    collection_name="Tomato"  # Specify which collection to load
)

### Chromadb as Retriever using mmr

In [None]:
question = "give me the cure for tomato plant in Kashmir in the summer months"
chroma_retriever = chroma_db.as_retriever(search_type="mmr", search_kwargs={"k": 6, "fetch_k":12})

chroma_retriever.get_relevant_documents(question)

  chroma_retriever.get_relevant_documents(question)


[Document(id='3eb7f99e-c0ae-41a9-9b49-33183493dabb', metadata={'category': 'Tomato'}, page_content='DistrictName:JAMMU\nStateName:JAMMU AND KASHMIR\nSeason_English:Summer Cropping Season\nMonth:July\nDisease:Late_blight\nQueryText:inforation regarding control of late blight in tomato\nKccAns:spray indofil m45  25mllitre of water'),
 Document(id='294f8de4-bb06-4dc3-95f4-a176f171ee5c', metadata={'category': 'Tomato'}, page_content='DistrictName:BADGAM\nStateName:JAMMU AND KASHMIR\nSeason_English:nan\nMonth:June\nDisease:Late_blight\nQueryText:late blight in tomato\nKccAns:mancozeb 2gltr of water'),
 Document(id='54169f6e-84c5-4e53-9499-3dabbb200d97', metadata={'category': 'Tomato'}, page_content='DistrictName:BADGAM\nStateName:JAMMU AND KASHMIR\nSeason_English:Summer Cropping Season\nMonth:June\nDisease:Aphids\nQueryText:control for aphids in tomato\nKccAns:chlorpyriphos 2 ml in lt water'),
 Document(id='8fa90dce-6146-4834-8bbe-73b2febae6d1', metadata={'category': 'Tomato'}, page_content

#### Prompting

In [6]:
prompt_template = """
You are an agricultural assistant specialized in answering questions about plant diseases.  
Your task is to provide answers strictly based on the provided context when possible.  

Each document contains the following fields:  
- DistrictName  
- StateName  
- Season_English  
- Month  
- Disease  
- QueryText  
- KccAns (this is the official response section from source documents)

Guidelines for answering:
1. If a relevant answer is available in KccAns, use that with minimal changes.  
2. Use DistrictName, StateName, Season_English, Month, and Disease only to help interpret the question and select the correct KccAns, but **do not include these details in the final answer unless the question explicitly asks for them**.  
3. If the answer is not available in the context, then rely on your own agricultural knowledge to provide the best possible answer.  
4. Do not invent or assume information when KccAns is present; only fall back to your own knowledge when the context has no suitable answer.  

CONTEXT:
{context}

QUESTION:
{question}

OUTPUT:
"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)
chain_type_kwargs = {"prompt": PROMPT}

#### Retriever

In [8]:
h_retrieval_QA1 = RetrievalQA.from_chain_type(
    llm=HuggingFacePipeline(pipeline=pipe),
    chain_type="stuff",
    retriever=chroma_retriever,
    input_key="query",
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs
)

  llm=HuggingFacePipeline(pipeline=pipe),


In [9]:
h_retrieval_QA1.invoke({"query": question})["result"]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'\nYou are an agricultural assistant specialized in answering questions about plant diseases.  \nYour task is to provide answers strictly based on the provided context when possible.  \n\nEach document contains the following fields:  \n- DistrictName  \n- StateName  \n- Season_English  \n- Month  \n- Disease  \n- QueryText  \n- KccAns (this is the official response section from source documents)\n\nGuidelines for answering:\n1. If a relevant answer is available in KccAns, use that with minimal changes.  \n2. Use DistrictName, StateName, Season_English, Month, and Disease only to help interpret the question and select the correct KccAns, but **do not include these details in the final answer unless the question explicitly asks for them**.  \n3. If the answer is not available in the context, then rely on your own agricultural knowledge to provide the best possible answer.  \n4. Do not invent or assume information when KccAns is present; only fall back to your own knowledge when the conte