In [1]:
# # FILE: RAG_Workflow.ipynb

# # Step 0: Activate environment if needed
# # Make sure your virtual environment is active:
# # .\venv\Scripts\Activate.ps1
# # and all dependencies are installed:
# !pip install -r requirements.txt
# # pip install huggingface-hub==0.16.4

In [1]:
# Step 1: Imports
import asyncio
from pathlib import Path
from dotenv import load_dotenv
import os
import json

# Import the modules from your app
from app.ingestion.ingest import ingest_texts
from app.retrieval.retriever import retrieve_top_k
from app.generation.generator import generate_answer

# Load environment variables from a .env file. This call is the cell's last
# expression, so its return value is what the notebook displays as output.
load_dotenv()  # expects .env in repo root




True

In [None]:
# Step 2: Prepare example text files
import random

# Folder to save documents
example_docs_folder = Path("example_docs")
example_docs_folder.mkdir(exist_ok=True)

# Corpus size and a fixed seed, so that "Restart Kernel -> Run All" regenerates
# exactly the same documents on every run.
NUM_DOCS = 100
DOC_SEED = 42
# Private generator: seeding it does not disturb the global `random` state.
doc_rng = random.Random(DOC_SEED)

# Sample ingredients, methods, contexts, and properties
ingredients = ["water", "glycerin", "sodium chloride", "sorbitol", "lecithin", "ethanol", "magnesium stearate",
               "ascorbic acid", "citric acid", "hydroxypropyl cellulose", "shea butter", "caffeine", "retinol"]

methods = ["stirring at room temperature", "high-shear mixing", "sonication", "emulsification", "freeze-drying",
           "spray drying", "homogenization", "pH adjustment", "solvent evaporation"]

contexts = ["pharmaceutical tablet", "cosmetic cream", "beverage formulation", "nutraceutical capsule",
            "topical gel", "oral suspension", "functional food", "encapsulated nutrient", "flavored syrup"]

properties = ["stability under heat", "viscosity optimization", "particle size distribution", "solubility enhancement",
              "shelf-life improvement", "bioavailability increase", "texture improvement", "homogeneity", "color stability"]

# Generate NUM_DOCS documents, each built from one random pick per category
for i in range(1, NUM_DOCS + 1):
    doc_text = (
        f"Document {i}: A study on the formulation of a {doc_rng.choice(contexts)} using {doc_rng.choice(ingredients)}, "
        f"prepared via {doc_rng.choice(methods)}. The formulation was tested for {doc_rng.choice(properties)} "
        f"and optimized for performance and safety."
    )

    # Save each document as a UTF-8 text file (write_text opens, writes and closes)
    file_path = example_docs_folder / f"document_{i}.txt"
    file_path.write_text(doc_text, encoding="utf-8")

print(f"✅ {NUM_DOCS} documents generated and saved in '{example_docs_folder}' folder.")

# Load every .txt file in the folder for ingestion.
# NOTE: this also picks up .txt files that were not written above (e.g. left over
# from earlier experiments), so len(docs) can be larger than NUM_DOCS.
# sorted() makes the order independent of the filesystem's listing order, and the
# explicit encoding matches the one used for writing.
doc_files = sorted(example_docs_folder.glob("*.txt"))
docs = [doc_file.read_text(encoding="utf-8") for doc_file in doc_files]
docs[:5]  # Display the first 5 documents

✅ 100 documents generated and saved in 'example_docs' folder.


['A typical formulation for hand cream includes water, glycerin, emulsifying wax, and preservatives.',
 'A standard shampoo formulation may contain surfactants like SLS, conditioning agents, and fragrance.',
 'Document 1: A study on the formulation of a nutraceutical capsule using shea butter, prepared via sonication. The formulation was tested for bioavailability increase and optimized for performance and safety.',
 'Document 10: A study on the formulation of a encapsulated nutrient using caffeine, prepared via freeze-drying. The formulation was tested for color stability and optimized for performance and safety.',
 'Document 100: A study on the formulation of a cosmetic cream using magnesium stearate, prepared via sonication. The formulation was tested for particle size distribution and optimized for performance and safety.']

In [15]:
# example_docs_folder = Path("example_docs")
# example_docs_folder.mkdir(exist_ok=True)

# # Example content
# (example_docs_folder / "doc1.txt").write_text(
#     "A typical formulation for hand cream includes water, glycerin, emulsifying wax, and preservatives."
# )
# (example_docs_folder / "doc2.txt").write_text(
#     "A standard shampoo formulation may contain surfactants like SLS, conditioning agents, and fragrance."
# )

# # Read all text files in the folder
# doc_files = list(example_docs_folder.glob("*.txt"))
# docs = [f.read_text() for f in doc_files]

# print(f"Loaded {len(docs)} documents for ingestion.")

In [21]:
# Step 3: Ingest documents asynchronously
async def ingest_docs():
    """Ingest the notebook-level `docs` list and report the result.

    Awaits `ingest_texts(docs)` and prints the value it returns as the
    number of ingested documents. Returns None.
    """
    ingested = await ingest_texts(docs)
    print(f"✅ Ingested {ingested} documents.")

# Top-level `await` relies on the notebook kernel's autoawait support;
# in a plain Python script this would need asyncio.run(ingest_docs()).
await ingest_docs()

✅ Ingested 102 documents.


In [3]:
# Step 4: Query RAG system
# Question handed to query_rag, which forwards it to the retriever and the generator.
query_text = "What are the formulations discussed here?"
# Passed as `k` to query_rag, which forwards it to retrieve_top_k.
top_k = 10

async def query_rag(q, k):
    """Run one retrieve-then-generate round trip and print both stages.

    Args:
        q: Query forwarded to both `retrieve_top_k` and `generate_answer`.
        k: Second argument forwarded to `retrieve_top_k` (presumably the
            number of documents to fetch; confirm against the retriever).

    Returns:
        None. The retrieved items and the generated answer are only printed.
    """
    # Retrieval stage: fetch the candidates and list them one per line
    hits = await retrieve_top_k(q, k)
    print("\nTop-k retrieved documents:")
    for hit in hits:
        print("-", hit)

    # Generation stage: answer the query using the retrieved items
    response = await generate_answer(q, hits)
    print("\nGenerated answer:", response, sep="\n")

# Top-level `await` relies on the notebook kernel's autoawait support;
# in a plain Python script this would need asyncio.run(query_rag(...)).
await query_rag(query_text, top_k)



Top-k retrieved documents:
- Document 93: A study on the formulation of a functional food using water, prepared via solvent evaporation. The formulation was tested for shelf-life improvement and optimized for performance and safety.
- Document 74: A study on the formulation of a beverage formulation using water, prepared via solvent evaporation. The formulation was tested for texture improvement and optimized for performance and safety.
- Document 43: A study on the formulation of a pharmaceutical tablet using water, prepared via solvent evaporation. The formulation was tested for shelf-life improvement and optimized for performance and safety.
- A typical formulation for hand cream includes water, glycerin, emulsifying wax, and preservatives.
- A typical formulation for hand cream includes water, glycerin, emulsifying wax, and preservatives.
- A typical formulation for hand cream includes water, glycerin, emulsifying wax, and preservatives.
- A standard shampoo formulation may contai