In [1]:
import getpass
import os

# Ask for the Gemini key interactively only when the environment does not
# already hold a non-empty value, so the secret never appears in the notebook.
if os.environ.get("GOOGLE_API_KEY", "") == "":
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter API key for Google Gemini: ")

In [2]:
from langchain.chat_models import init_chat_model
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Chat model that writes the final answer.
llm = init_chat_model(model="gemini-2.0-flash", model_provider="google_genai")

# Embedding model used to vectorise document chunks and incoming questions.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [3]:
from langchain_core.vectorstores import InMemoryVectorStore

# In-process vector index; its contents live only as long as the kernel does.
vector_store = InMemoryVectorStore(embedding=embeddings)

In [23]:
import bs4
import requests
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict

In [24]:
# Download the USDA AMS product-specification index page and parse its HTML.
url = "https://www.ams.usda.gov/selling-food/product-specs"
# Without a timeout, requests waits indefinitely on an unresponsive server.
source = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
source.raise_for_status()
soup = bs4.BeautifulSoup(source.text, "html.parser")

In [28]:
# Show only the start of the parsed page; displaying the whole document floods
# the notebook output and bloats the saved file.
print(str(soup)[:1000])

<!DOCTYPE html>

<html dir="ltr" lang="en" prefix="content: http://purl.org/rss/1.0/modules/content/  dc: http://purl.org/dc/terms/  foaf: http://xmlns.com/foaf/0.1/  og: http://ogp.me/ns#  rdfs: http://www.w3.org/2000/01/rdf-schema#  schema: http://schema.org/  sioc: http://rdfs.org/sioc/ns#  sioct: http://rdfs.org/sioc/types#  skos: http://www.w3.org/2004/02/skos/core#  xsd: http://www.w3.org/2001/XMLSchema# ">
<head>
<meta charset="utf-8"/>
<style>body #backtotop {left: 50%; margin-left: -50px;}</style>
<script async="" src="https://www.googletagmanager.com/gtag/js?id=G-4FZL5M3EFM"></script>
<script>window.dataLayer = window.dataLayer || [];function gtag(){dataLayer.push(arguments)};gtag("js", new Date());gtag("set", "developer_id.dMDhkMT", true);gtag("config", "G-4FZL5M3EFM", {"groups":"default","page_placeholder":"PLACEHOLDER_page_location"});</script>
<meta content="Drupal 10 (https://www.drupal.org)" name="Generator"/>
<meta content="width" name="MobileOptimized"/>
<meta content

In [30]:
# Collect the href of every link that points at a PDF whose path mentions
# "vegetable". Both checks run on the lower-cased href so that ".PDF" links are
# matched just like ".pdf" ones; the href itself is stored unchanged.
# Anchors without an href fall back to "" and are filtered out by the checks.
vegetable_pdfs = [
    href
    for href in (anchor.get("href", "") for anchor in soup.find_all("a"))
    if href.lower().endswith(".pdf") and "vegetable" in href.lower()
]
vegetable_pdfs

['/sites/default/files/media/CommoditySpecificationforCannedVegetablesJanuary2023.pdf',
 '/sites/default/files/media/Commodity_Specification_for_Canned_Vegetables_April_2021.pdf',
 '/sites/default/files/media/CommoditySpecificationforCannedVegetablesMarch2020.pdf',
 '/sites/default/files/media/CommoditySpecificationforCannedVegetablesJanuary2020.pdf',
 '/sites/default/files/media/CannedVegetablesAmendment1February2020.pdf',
 '/sites/default/files/media/CommoditySpecificationforFrozenStirFryVegetableBlend%28Broccoli%2CCarrots%2CGreen%20Beans%29September2023.pdf',
 '/sites/default/files/media/Commodity_Specification_for_Frozen_Vegetables_March_2022.pdf',
 '/sites/default/files/media/CommoditySpecificationforFrozenVegetablesMarch2020.pdf',
 '/sites/default/files/media/CommoditySpecificationforFreshVegetablesMay2019.pdf',
 '/sites/default/files/media/VegetableOilProductsJuly2023.pdf']

In [34]:
from langchain_community.document_loaders import PyPDFLoader

pages = []
for subdomain in vegetable_pdfs:
    url = f"https://www.ams.usda.gov/{subdomain}"
    loader = PyPDFLoader(url)
    async for page in loader.alazy_load():
        pages.append(page)

In [35]:
# Cut the loaded PDF pages into overlapping chunks sized for embedding.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(pages)

# Embed and index the chunks; the return value is assigned to `_` so it is not displayed.
_ = vector_store.add_documents(documents=splits)

# Fetch the shared RAG prompt template from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt", api_url="https://api.smith.langchain.com")

In [36]:
class State(TypedDict):
    """State dictionary shared by the `retrieve` and `generate` steps."""
    # The user's question; read by both `retrieve` and `generate`.
    question: str
    # Documents returned by `retrieve` and consumed by `generate`.
    context: List[Document]
    # Answer text returned by `generate`.
    answer: str

def retrieve(state: State):
    """Look up documents similar to the question in the vector store.

    Reads ``state['question']`` and returns a partial state update whose
    ``context`` entry holds the similarity-search results.
    """
    question = state['question']
    return {"context": vector_store.similarity_search(question)}

def generate(state: State):
    """Answer the question from the retrieved context.

    Joins the page content of every document in ``state['context']`` with
    blank lines, fills the prompt with that text and the question, invokes the
    chat model, and returns the reply's content under ``answer``.
    """
    context_text = "\n\n".join([document.page_content for document in state['context']])
    prompt_inputs = {'question': state['question'], 'context': context_text}
    reply = llm.invoke(prompt.invoke(prompt_inputs))
    return {'answer': reply.content}

In [38]:
# Wire the two steps into a linear pipeline: START -> retrieve -> generate.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")

# Compile the builder into a runnable graph.
graph = graph_builder.compile()

In [41]:
# Run the pipeline end to end. The question text is embedded verbatim for the
# similarity search, so its spelling matters ("labeing" corrected to "labeling").
response = graph.invoke({"question": "For frozen stir fry vegetables, what is the packaging and labeling criteria?"})
print(response["answer"])

The label must include at a minimum, two additional colors other than black and/or white. It must also incorporate a graphic depicting the product/contents on the principal display panel if the packages are not clear or contain an adequate viewing window. Products requiring Kosher or Halal certification must bear the corresponding certification symbols.
