In [6]:
import requests
import pandas as pd
import time

# Your USDA API Key
API_KEY = "JoNfGEtoZbg4ryrpthXncEVY9DVONjHaPX6GbxUb"

# Base API URL
BASE_URL = "https://api.nal.usda.gov/fdc/v1/foods/search"

# List of food categories (expandable)
food_queries = [
    "chicken", "beef", "pork", "fish", "rice", "bread", "milk", "egg", "banana",
    "apple", "orange", "cheese", "butter", "yogurt", "potato", "carrot", "onion",
    "tomato", "chocolate", "coffee", "soda", "pasta", "beans", "peanuts", "almonds",
    "oatmeal", "lettuce", "cucumber", "mushroom", "strawberry", "blueberry", "watermelon"
]

# List to store food data
food_data = []

# Function to fetch food data with proper pagination handling
def fetch_food_data(query, max_pages=5):
    page_number = 1  # Start from page 1

    while page_number <= max_pages:  # Stop after max_pages
        try:
            # API Request
            url = f"{BASE_URL}?query={query}&pageNumber={page_number}&api_key={API_KEY}"
            response = requests.get(url)
            data = response.json()

            # Check if there are food items
            if "foods" not in data or not data["foods"]:
                print(f"✅ No more data for {query} (Page {page_number}). Stopping search.")
                break  # Stop fetching

            # Extract relevant fields
            for food in data["foods"]:
                nutrients = {n["nutrientName"]: n["value"] for n in food.get("foodNutrients", [])}

                # Store only useful nutrients
                food_data.append({
                    "Food Name": food.get("description", "Unknown"),
                    "Calories (kcal)": nutrients.get("Energy", None),
                    "Protein (g)": nutrients.get("Protein", None),
                    "Carbs (g)": nutrients.get("Carbohydrate, by difference", None),
                    "Fat (g)": nutrients.get("Total lipid (fat)", None),
                    "FDC ID": food.get("fdcId", None)  # Unique food ID
                })

            print(f"✅ Fetched {len(data['foods'])} items for {query} (Page {page_number})")
            page_number += 1  # Go to next page
            time.sleep(1)  # Avoid rate limiting

        except Exception as e:
            print(f"❌ Error fetching data for {query}: {e}")
            break

# Fetch food data for each query
for query in food_queries:
    fetch_food_data(query)

# Convert to DataFrame
df = pd.DataFrame(food_data)

# Drop duplicate foods
df.drop_duplicates(subset=["Food Name"], keep="first", inplace=True)

# Save to CSV
df.to_csv("usda_food_dataset.csv", index=False)

print("✅ Dataset saved successfully! Total foods collected:", len(df))

✅ Fetched 50 items for chicken (Page 1)
✅ Fetched 50 items for chicken (Page 2)
✅ Fetched 50 items for chicken (Page 3)
✅ Fetched 50 items for chicken (Page 4)
✅ Fetched 50 items for chicken (Page 5)
✅ Fetched 50 items for beef (Page 1)
✅ Fetched 50 items for beef (Page 2)
✅ Fetched 50 items for beef (Page 3)
✅ Fetched 50 items for beef (Page 4)
✅ Fetched 50 items for beef (Page 5)
✅ Fetched 50 items for pork (Page 1)
✅ Fetched 50 items for pork (Page 2)
✅ Fetched 50 items for pork (Page 3)
✅ Fetched 50 items for pork (Page 4)
✅ Fetched 50 items for pork (Page 5)
✅ Fetched 50 items for fish (Page 1)
✅ Fetched 50 items for fish (Page 2)
✅ Fetched 50 items for fish (Page 3)
❌ Error fetching data for fish: 'value'
✅ Fetched 50 items for rice (Page 1)
✅ Fetched 50 items for rice (Page 2)
✅ Fetched 50 items for rice (Page 3)
✅ Fetched 50 items for rice (Page 4)
✅ Fetched 50 items for rice (Page 5)
✅ Fetched 50 items for bread (Page 1)
✅ Fetched 50 items for bread (Page 2)
✅ Fetched 50 items

In [7]:
!pip install sentence-transformers



In [8]:
!pip install faiss-cpu



In [20]:
# import faiss
# import pandas as pd
# from sentence_transformers import SentenceTransformer
# import numpy as np

# # Load dataset
# df = pd.read_csv("/content/drive/MyDrive/usda_food_dataset.csv")

# # Load sentence transformer model
# model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# # Generate food embeddings
# food_names = df["Food Name"].tolist()
# embeddings = model.encode(food_names, convert_to_numpy=True)

# # Normalize embeddings for cosine similarity
# embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

# # Create FAISS index with Inner Product (Cosine Similarity)
# dimension = embeddings.shape[1]
# index = faiss.IndexFlatIP(dimension)  # IP (Inner Product) works for cosine similarity

# # Add embeddings to FAISS index
# index.add(embeddings)

# # Save FAISS index to Google Drive
# faiss.write_index(index, "/content/drive/MyDrive/faiss_food_index")

# print("✅ FAISS index rebuilt with corrected embeddings!")

import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/usda_food_dataset.csv")

# Load improved sentence transformer model
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Generate food embeddings
food_names = df["Food Name"].tolist()
embeddings = model.encode(food_names, convert_to_numpy=True)

# Normalize embeddings for cosine similarity
embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

# Create FAISS index with Inner Product (Cosine Similarity)
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)

# Save FAISS index to Google Drive
faiss.write_index(index, "/content/drive/MyDrive/faiss_food_index")

print("✅ FAISS index rebuilt with improved embeddings!")

✅ FAISS index rebuilt with improved embeddings!


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [21]:
# def search_food(query, top_k=5):
#     # Convert query to vector embedding
#     query_embedding = model.encode([query], convert_to_numpy=True)
#     query_embedding = query_embedding / np.linalg.norm(query_embedding)  # Normalize

#     # Search FAISS index
#     distances, indices = index.search(query_embedding, top_k)

#     # Retrieve top food names and nutrition info
#     results = []
#     for i in range(top_k):
#         idx = indices[0][i]
#         if idx >= 0:  # Valid result
#             food_name = df.iloc[idx]["Food Name"]
#             calories = df.iloc[idx]["Calories (kcal)"]
#             protein = df.iloc[idx]["Protein (g)"]
#             fat = df.iloc[idx]["Fat (g)"]
#             carbs = df.iloc[idx]["Carbs (g)"]
#             results.append({
#                 "Food": food_name,
#                 "Calories": calories,
#                 "Protein": protein,
#                 "Fat": fat,
#                 "Carbs": carbs
#             })

#     return results

def search_food(query, top_k=5, dietary_preference=None):
    # Convert query to vector embedding
    query_embedding = model.encode([query], convert_to_numpy=True)

    # Ensure the embedding has the correct shape
    query_embedding = np.asarray(query_embedding, dtype=np.float32).reshape(1, -1)

    # Normalize the query embedding (same as stored embeddings)
    query_embedding /= np.linalg.norm(query_embedding)

    # Verify that dimensions match before searching
    assert query_embedding.shape[1] == index.d, f"Dimension mismatch: Query {query_embedding.shape[1]}, Index {index.d}"

    # Search FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve top food names and nutrition info
    results = []
    for i in range(top_k):
        idx = indices[0][i]
        if idx >= 0:
            food_name = df.iloc[idx]["Food Name"]
            calories = df.iloc[idx]["Calories (kcal)"]
            protein = df.iloc[idx]["Protein (g)"]
            fat = df.iloc[idx]["Fat (g)"]
            carbs = df.iloc[idx]["Carbs (g)"]

            # Apply dietary preference filtering
            if dietary_preference == "low_calorie" and calories > 200:
                continue
            if dietary_preference == "high_protein" and protein < 10:
                continue

            results.append({
                "Food": food_name,
                "Calories": calories,
                "Protein": protein,
                "Fat": fat,
                "Carbs": carbs
            })

    return results if results else "No suitable options found. Try adjusting dietary preferences."

In [22]:
query = "grilled chicken"
results = search_food(query)

print("🔍 Closest Matches for:", query)
for r in results:
    print(f"{r['Food']} - {r['Calories']} kcal, Protein: {r['Protein']}g, Fat: {r['Fat']}g, Carbs: {r['Carbs']}g")

🔍 Closest Matches for: grilled chicken
Barbecue chicken - 167.0 kcal, Protein: 19.0g, Fat: 4.64g, Carbs: 12.23g
Chicken wing, grilled with sauce - 250.0 kcal, Protein: 19.03g, Fat: 15.49g, Carbs: 7.28g
Chicken fillet, grilled - 151.0 kcal, Protein: 22.31g, Fat: 5.81g, Carbs: 2.36g
Chicken, chicken roll, roasted - 164.0 kcal, Protein: 26.68g, Fat: 6.33g, Carbs: 0.0g
Chicken with gravy - 129.0 kcal, Protein: 18.88g, Fat: 5.21g, Carbs: 1.58g


In [23]:
import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the FAISS index
faiss_index_path = "/content/drive/MyDrive/faiss_food_index"
index = faiss.read_index(faiss_index_path)

# Load the food dataset
df = pd.read_csv("/content/drive/MyDrive/usda_food_dataset.csv")

# Load the sentence transformer model
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

In [13]:
!pip install groq

Collecting groq
  Downloading groq-0.20.0-py3-none-any.whl.metadata (15 kB)
Downloading groq-0.20.0-py3-none-any.whl (124 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.9/124.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.20.0


In [None]:
from groq import Groq

# Initialize the Groq client
groq_client = Groq(api_key="API_KEY")

In [25]:
# def detect_query_intent(query):
#     # Keywords for low-calorie queries
#     low_calorie_keywords = ["low-calorie", "low calorie", "healthy", "light", "diet"]

#     # Keywords for high-calorie queries
#     high_calorie_keywords = ["high-calorie", "high calorie", "most calories", "energy-dense", "calorie-dense"]

#     # Check for low-calorie intent
#     if any(keyword in query.lower() for keyword in low_calorie_keywords):
#         return "low_calorie"

#     # Check for high-calorie intent
#     elif any(keyword in query.lower() for keyword in high_calorie_keywords):
#         return "high_calorie"

#     # Default to low-calorie if intent is unclear
#     else:
#         return "low_calorie"

In [26]:
import numpy as np

def retrieve_food_items(query, top_k=5):
    # Convert query to vector embedding
    query_embedding = model.encode([query], convert_to_numpy=True)
    query_embedding = query_embedding / np.linalg.norm(query_embedding)  # Normalize

    # Search FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve top food items
    results = []
    for i in range(top_k):
        if indices[0][i] != -1:  # Ensure valid result
            food_name = df.iloc[indices[0][i]]["Food Name"]
            calories = df.iloc[indices[0][i]]["Calories (kcal)"]
            protein = df.iloc[indices[0][i]]["Protein (g)"]
            carbs = df.iloc[indices[0][i]]["Carbs (g)"]
            fat = df.iloc[indices[0][i]]["Fat (g)"]

            results.append({
                "Food": food_name,
                "Calories": calories,
                "Protein": protein,
                "Carbs": carbs,
                "Fat": fat
            })

    return results

In [27]:
def generate_response(query, retrieved_items):
    # Prepare the context for the Groq API
    context = "Here are some relevant food items based on your query:\n"
    for item in retrieved_items:
        context += f"- {item['Food']}: {item['Calories']} kcal, {item['Protein']}g protein, {item['Carbs']}g carbs, {item['Fat']}g fat\n"

    # Generate a response using the Groq API
    response = groq_client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "You are a specialized assistant that provides nutritional insights based on the provided food data. Use only the given context to answer user queries. Do not generate information beyond the provided dataset. Focus on evaluating the nutritional content of the retrieved food items, explaining their health benefits or concerns, comparing their macronutrient values."
            },
            {
                "role": "user",
                "content": f"{query}\n\n{context}"
            }
        ],
        model="llama-3.3-70b-versatile",  # Use the appropriate Groq model
    )

    return response.choices[0].message.content

In [28]:
def rag_agent(query, top_k=5):
    # Retrieve relevant food items
    retrieved_items = retrieve_food_items(query, top_k)

    # Generate a response using the Groq API
    response = generate_response(query, retrieved_items)

    return response

In [29]:
# Test the RAG agent with a query
query = "What are some healthy low-calorie food options?"
response = rag_agent(query)

print("Query:", query)
print("Response:", response)

Query: What are some healthy low-calorie food options?
Response: Based on the provided food items, here are some relatively healthy low-calorie food options:

1. Milk, chocolate, lowfat, reduced sugar: With 237.0 kcal, this option is relatively low in calories and also provides 3.43g of protein. The carb content is moderate, and the fat content is low.

2. Bread, reduced-calorie, oatmeal: This option has 210.0 kcal, which is relatively low in calories, and also provides 7.6g of protein. The carb content is moderate, but it is a good source of fiber due to the oatmeal content.

In comparison, the other options are higher in calories:
- Cookies, oatmeal, reduced fat: 1530.0 kcal (very high)
- Snacks, potato chips, reduced fat: 471.0 kcal (moderately high)
- Cookie, oatmeal, reduced fat, NS as to raisins: 365.0 kcal (moderately high)

Between the two relatively low-calorie options, the milk has a better macronutrient balance with lower carbs and fat, while the reduced-calorie oatmeal brea