In [25]:
import os
import uuid
from typing import List
from dotenv import load_dotenv
from tqdm import tqdm

import pymupdf4llm as pfllm
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec


def extract_markdown_from_pdf(pdf_path: str) -> str:
    """Convert the PDF at ``pdf_path`` to Markdown using pymupdf4llm.

    Args:
        pdf_path: Filesystem path of the PDF to convert.

    Returns:
        Whatever ``pymupdf4llm.to_markdown`` produces for that file.
    """
    markdown_text = pfllm.to_markdown(pdf_path)
    return markdown_text


def chunk_markdown(
    md_text: str,
    source_name: str = "sr28_doc.pdf",
    chunk_size: int = 300,
    chunk_overlap: int = 50,
    doc_type: str = "usda",
) -> List[Document]:
    """Split Markdown text into overlapping chunks tagged with provenance metadata.

    Args:
        md_text: The Markdown text to split.
        source_name: Value stored under the ``"source"`` metadata key of every chunk.
        chunk_size: Maximum chunk length passed to the splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the splitter.
        doc_type: Value stored under the ``"type"`` metadata key of every chunk.

    Returns:
        A list of ``Document`` objects, one per chunk, each carrying ``"source"``
        and ``"type"`` metadata.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Tried in order: paragraph, line, sentence, word, then single characters.
        separators=["\n\n", "\n", ".", " ", ""]
    )

    chunks = splitter.create_documents([md_text])

    # Attach metadata
    for chunk in chunks:
        chunk.metadata["source"] = source_name
        chunk.metadata["type"] = doc_type

    return chunks

def embed_chunks(chunks: List[Document]):
    """Embed the text of every chunk with the all-MiniLM-L6-v2 sentence transformer.

    Args:
        chunks: Documents whose ``page_content`` should be embedded.

    Returns:
        The result of ``SentenceTransformer.encode`` with ``convert_to_numpy=True``,
        computed over the chunk texts in the same order as ``chunks``.
    """
    encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    passages = []
    for doc in chunks:
        passages.append(doc.page_content)

    vectors = encoder.encode(passages, convert_to_numpy=True)
    return vectors












In [26]:
def upsert_to_pinecone(chunks: List[Document], embeddings, index_name="nutrition-index"):
    """Upsert chunk embeddings into a Pinecone serverless index, creating it if absent.

    Each vector's metadata is a copy of the chunk's metadata plus a ``"text"``
    entry holding the chunk content. LangChain's Pinecone vector store reads
    the page content from that key and skips any match that lacks it (the
    "Found document with no `text` key. Skipping." message), so omitting it
    makes retrieval return nothing.

    Vector IDs are derived deterministically from the chunk's source, position
    and content, so re-running the ingestion overwrites the same vectors
    instead of appending duplicates. Vectors written earlier under other IDs
    are not removed by this function.

    Args:
        chunks: Documents to store; ``page_content`` and ``metadata`` are used.
        embeddings: One embedding per chunk, in the same order. Each item must
            expose ``.tolist()`` (e.g. rows of a NumPy array).
        index_name: Name of the Pinecone index to write to.

    Raises:
        ValueError: If ``chunks`` and ``embeddings`` differ in length.
    """
    # zip() would silently drop the unmatched tail; fail loudly instead.
    if len(chunks) != len(embeddings):
        raise ValueError(
            f"Got {len(chunks)} chunks but {len(embeddings)} embeddings; "
            "they must align one-to-one."
        )

    load_dotenv(dotenv_path="FoodNutritionAssistant/.env")
    pinecone_api_key = os.getenv("PINECONE_API_KEY")

    pc = Pinecone(api_key=pinecone_api_key)

    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=384,  # must equal the length of each embedding vector
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1")
        )

    index = pc.Index(index_name)

    vectors = []
    for position, (chunk, embedding) in enumerate(
        tqdm(zip(chunks, embeddings), total=len(chunks))
    ):
        # Stable ID: the same chunk at the same position always maps to the
        # same vector, which makes repeated ingestion idempotent.
        id_seed = f"{chunk.metadata.get('source', '')}#{position}#{chunk.page_content}"
        vectors.append({
            "id": str(uuid.uuid5(uuid.NAMESPACE_URL, id_seed)),
            "values": embedding.tolist(),
            # Copy so the caller's Document metadata is left untouched.
            "metadata": {**chunk.metadata, "text": chunk.page_content}
        })

    # Batch upsert
    batch_size = 100
    for i in range(0, len(vectors), batch_size):
        batch = vectors[i:i+batch_size]
        res = index.upsert(vectors=batch)
        print(f"Upserted batch {i // batch_size + 1}: {res}")

    print("Index stats:", index.describe_index_stats())


In [27]:
if __name__ == "__main__":
    # The PDF location is machine-specific; set SR28_PDF_PATH to override it.
    # The original absolute path is kept as the fallback.
    pdf_path = os.environ.get(
        "SR28_PDF_PATH",
        "/Users/arvindranganathraghuraman/Desktop/Personal_Projects/Nutrition_Dataset/sr28_doc.pdf",
    )

    # Ingestion pipeline: PDF -> Markdown -> chunks -> embeddings -> Pinecone.
    md_text = extract_markdown_from_pdf(pdf_path)
    chunks = chunk_markdown(md_text)
    embeddings = embed_chunks(chunks)
    upsert_to_pinecone(chunks, embeddings)


100%|██████████| 1817/1817 [00:00<00:00, 52649.74it/s]


Upserted batch 1: {'upserted_count': 100}
Upserted batch 2: {'upserted_count': 100}
Upserted batch 3: {'upserted_count': 100}
Upserted batch 4: {'upserted_count': 100}
Upserted batch 5: {'upserted_count': 100}
Upserted batch 6: {'upserted_count': 100}
Upserted batch 7: {'upserted_count': 100}
Upserted batch 8: {'upserted_count': 100}
Upserted batch 9: {'upserted_count': 100}
Upserted batch 10: {'upserted_count': 100}
Upserted batch 11: {'upserted_count': 100}
Upserted batch 12: {'upserted_count': 100}
Upserted batch 13: {'upserted_count': 100}
Upserted batch 14: {'upserted_count': 100}
Upserted batch 15: {'upserted_count': 100}
Upserted batch 16: {'upserted_count': 100}
Upserted batch 17: {'upserted_count': 100}
Upserted batch 18: {'upserted_count': 100}
Upserted batch 19: {'upserted_count': 17}
Index stats: {'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 3506}},
 'total_vector_count': 3506}


In [33]:
# Aliased so this legacy import does not rebind `Pinecone`, the name that
# upsert_to_pinecone relies on for the Pinecone client class.
from langchain.vectorstores import Pinecone as LegacyPineconeVectorStore
from langchain_pinecone import Pinecone as PineconeVectorStore
from langchain_ollama.chat_models import ChatOllama
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings

# 1. LLM
llm = ChatOllama(model="llama3.2")

# 2. Embedding model (correct wrapper)
# Uses the same model name as embed_chunks, so query vectors and stored
# vectors come from the same encoder.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# 3. Pinecone vector store
vectorstore = PineconeVectorStore(
    index_name="nutrition-index",
    embedding=embedding_model
)

# 4. Retriever + RAG pipeline
retriever = vectorstore.as_retriever(search_type="similarity")
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # all retrieved chunks go into a single prompt
    retriever=retriever,
    return_source_documents=True
)

# 5. Ask your question
prompt = "What is the nutritional value of a banana?"
# `.invoke` replaces the deprecated direct call `qa({...})`.
result = qa.invoke({"query": prompt})
print("QA Response:", result["result"])


Found document with no `text` key. Skipping.
Found document with no `text` key. Skipping.
Found document with no `text` key. Skipping.
Found document with no `text` key. Skipping.


QA Response: A medium-sized banana typically contains:

* Calories: 105
* Carbohydrates: 26.9 grams (mainly starch and sugars)
* Fiber: 3.1 grams
* Protein: 1.3 grams
* Fat: 0.5 grams
* Potassium: 422 milligrams (about 12% of the Daily Value (DV))
* Vitamin C: 10.3 milligrams (about 17% of the DV)
* Vitamin B6: 0.5 milligrams (about 25% of the DV)

Please note that these values can vary depending on the ripeness, variety, and growing conditions of the banana.

Would you like to know more about a specific aspect of banana nutrition?
