In [1]:
import getpass
import os

# Ask for the Gemini API key interactively only when the environment does not
# already provide a non-empty one; getpass keeps the key out of the cell output.
if not os.getenv("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass(
        prompt="Enter API key for Google Gemini: "
    )

In [2]:
from langchain.chat_models import init_chat_model
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Chat model that writes the final answer (Gemini through the google_genai provider).
llm = init_chat_model(
    "gemini-2.0-flash",
    model_provider="google_genai",
)

# Embedding model shared by document ingestion and query-time similarity search.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [None]:
from langchain_postgres import PGVector

# Connection string for the Postgres database backing the vector store.
# Mirror the API-key cell: use the environment when it is set, otherwise prompt
# (via getpass, because the URL embeds credentials) instead of failing with a
# bare KeyError.
connection = os.environ.get("POSTGRES_URL") or getpass.getpass(
    "Enter Postgres connection URL: "
)
collection_name = "my_docs"

# PGVector collection that stores the embedded PDF chunks; metadata is kept as JSONB.
vector_store = PGVector(
    embeddings=embeddings,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)

In [4]:
import bs4
from tqdm import tqdm
import requests
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [5]:
# Download the USDA product-specification index page and parse its HTML.
url = "https://www.ams.usda.gov/selling-food/product-specs"
# A timeout keeps the cell from hanging forever on a stalled connection.
source = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx: otherwise an error page would be parsed and the
# PDF list built from it would silently come out empty.
source.raise_for_status()
soup = bs4.BeautifulSoup(source.text, "html.parser")

In [6]:
# Collect the href of every <a> tag on the page that points at a PDF.
# Despite the name, this covers every PDF linked from the page, not only
# vegetable specifications.
# dict.fromkeys drops repeated hrefs while keeping first-seen order, so a PDF
# linked more than once is not downloaded and embedded twice later on.
vegetable_pdfs = list(dict.fromkeys(
    tag.get('href') for tag in soup.find_all("a")
    if tag.get('href', '').endswith('.pdf')
))
vegetable_pdfs

['/sites/default/files/media/qasp.pdf',
 '/sites/default/files/media/AnimalHandlingandWelfareApril2025.pdf',
 '/sites/default/files/media/FPPS_AHW_November2020.pdf',
 '/sites/default/files/media/BJENYKosherResource.pdf',
 '/sites/default/files/media/CommoditySpecificationforCannedFruitJune2023_0.pdf',
 '/sites/default/files/media/CommoditySpecificationforCannedFruitAugust2021.pdf',
 '/sites/default/files/media/Commodity_Specification_for_Canned_Fruit_May_2021.pdf',
 '/sites/default/files/media/Amendment_1_to_the_Commodity_Specification_for_Canned_Fruit_June_2021.pdf',
 '/sites/default/files/media/CommoditySpecificationforCannedFruitMarch2020.pdf',
 '/sites/default/files/media/CommoditySpecificationforCannedFruitMarch2019.pdf',
 '/sites/default/files/media/CommoditySpecificationforFrozenFruitFebruary2022.pdf',
 '/sites/default/files/media/CommoditySpecificationsforFrozenFruitApril2020.pdf',
 '/sites/default/files/media/FinalSection32SpecificationforMandarinOrangesandTangerines.pdf',
 '/

In [7]:
from langchain_community.document_loaders import PyPDFLoader

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, 
    chunk_overlap=200
)

for subdomain in tqdm(vegetable_pdfs):
    pages = []
    url = f"https://www.ams.usda.gov/{subdomain}"
    loader = PyPDFLoader(url)
    async for page in loader.alazy_load():
        pages.append(page)
    
    splits = text_splitter.split_documents(pages)
    
    _ = vector_store.add_documents(documents = splits)

100%|██████████| 158/158 [05:45<00:00,  2.19s/it]


In [8]:
# Pull the "rlm/rag-prompt" template from the LangChain Hub; `generate` invokes
# it with `question` and `context` inputs.
prompt = hub.pull("rlm/rag-prompt", api_url="https://api.smith.langchain.com")

In [9]:
class State(TypedDict):
    """State passed between the nodes of the RAG graph.

    `question` is supplied by the caller, `context` is written by `retrieve`,
    and `answer` is written by `generate`.
    """

    # The user's question, used as the similarity-search query.
    question: str
    # Documents returned by the vector store for the question.
    context: List[Document]
    # Text content of the model's reply.
    answer: str

def retrieve(state: State):
    """Look up documents similar to the question in the vector store.

    Reads ``state['question']`` and returns a partial state update holding the
    search results under ``context``.
    """
    query = state["question"]
    matches = vector_store.similarity_search(query)
    return dict(context=matches)

def generate(state: State):
    """Answer the question from the retrieved context.

    Joins the ``page_content`` of every document in ``state['context']`` with
    blank lines, fills the prompt with that text and the question, sends the
    result to the chat model, and returns the reply's ``content`` under
    ``answer``.
    """
    chunks = [doc.page_content for doc in state["context"]]
    prompt_inputs = {
        "question": state["question"],
        "context": "\n\n".join(chunks),
    }
    llm_reply = llm.invoke(prompt.invoke(prompt_inputs))
    return {"answer": llm_reply.content}

In [10]:
# Wire the two steps into a linear graph: START -> retrieve -> generate.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")

# Compile the builder into the runnable graph used to answer questions.
graph = graph_builder.compile()

In [17]:
from IPython.display import display, Markdown

# Read one question from the user, run it through the RAG graph, and render
# the answer as Markdown.
print("Enter your agricultural tech specification question:")
question = input("Question: ")
response = graph.invoke({"question": question})

answer_markdown = Markdown(f"**Answer:**\n\n{response['answer']}")
display(answer_markdown)

Enter your agricultural tech specification question:


**Answer:**

Shell egg packaging must use clean, new materials without objectionable odors. The packing materials must be tamper-evident and withstand humidity, temperature, handling, shipping, stacking, and storage. Final closure of fiberboard shipping containers requires a secure, tamper-evident seal using specified tapes applicable to refrigerated temperatures.