In [1]:
import google.generativeai as genai
import os
from dotenv import load_dotenv

# Set your API key: load_dotenv() pulls variables from a local .env file (if
# one exists) into the process environment before the key is looked up.
load_dotenv()
gemini_api_key = os.getenv("GOOGLE_API_KEY")
if not gemini_api_key:
    # os.getenv returns None when the variable is missing; stop here with a
    # clear message instead of failing later at the first model call.
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; add it to your environment or .env file."
    )
genai.configure(api_key=gemini_api_key)

# Initialize the Gemini model
llm = genai.GenerativeModel('gemini-2.0-flash')

In [1]:
import pdfplumber
import pandas as pd

def extract_tables_from_pdf(pdf_path):
    """Extract every table found in a PDF and stack them into one DataFrame.

    Each page is scanned for tables. The first row of every table is used as
    its column header and the remaining rows become data rows. All tables are
    concatenated in page order with a fresh 0..n-1 index.

    Args:
        pdf_path: Path to the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        A ``pandas.DataFrame`` holding the rows of all tables, or an empty
        DataFrame when no table was found on any page.
    """
    frames = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_tables() yields all tables on the page; the singular
            # extract_table() returns only one, silently dropping the rest.
            for table in page.extract_tables():
                if table:
                    header, *rows = table
                    frames.append(pd.DataFrame(rows, columns=header))
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [2]:
# Location of the source PDF, given relative to the current working directory.
pdf_path = "data/pizza_ingredients.pdf"

In [3]:
# Collect the tables found in the PDF into a single DataFrame (empty if none were found).
df = extract_tables_from_pdf(pdf_path)

In [4]:
# Render each table row as a single " | "-separated line of text.
texts = [" | ".join(record.astype(str)) for _, record in df.iterrows()]

In [13]:
# One dict per table row, keyed by column header.
# NOTE(review): `json` is also the name of the standard-library module; later
# cells in this notebook refer to this list by that name.
json = df.to_dict(orient="records")

In [10]:
from embed_store import build_faiss_index, get_embeddings
import faiss

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
# Inspect the first record to check the extracted column names and values.
json[0]

{'Pizza Name': 'Margherita',
 'Crust\nType': 'Thin',
 'Sauce': 'Tomato',
 'Cheese': 'Mozzarella',
 'Toppings': 'Basil'}

In [20]:
# Embed the records with the helper imported from embed_store; the recorded
# output below is a float32 array.
get_embeddings(json)

array([[-0.0038396 ,  0.07903948, -0.11241125, ..., -0.01579538,
         0.0650669 ,  0.05390403],
       [-0.07182054,  0.02036707, -0.05242678, ...,  0.0403441 ,
         0.07116739,  0.0442771 ],
       [-0.01778399,  0.00353835, -0.07590371, ...,  0.01286497,
        -0.03862418, -0.00466059],
       ...,
       [-0.07004339, -0.06089054, -0.05413391, ...,  0.0301772 ,
         0.03380238,  0.06286311],
       [ 0.03541187,  0.04458203, -0.02035068, ...,  0.03373704,
         0.02461265, -0.04009247],
       [ 0.04753882, -0.0023236 , -0.0302921 , ...,  0.0099019 ,
         0.10350329, -0.00841073]], dtype=float32)

In [22]:
from rag_pipeline import retrieve_context, generate_answer

In [26]:
# Build the search index from the records via embed_store.build_faiss_index.
# NOTE(review): presumably this embeds each record internally — confirm in embed_store.
index = build_faiss_index(json)

In [31]:
# Query the index for the 3 nearest entries.
# NOTE(review): `[json]` wraps the entire record list as a single item, so only
# one query vector is produced (the recorded output has one row) — confirm this
# is intended rather than e.g. `[json[0]]`.
index.search(get_embeddings([json]), 3)

(array([[0.47110212, 0.34942874, 0.34083316]], dtype=float32),
 array([[1, 0, 6]], dtype=int64))

In [28]:
# Fetch the k=5 records retrieved for a sample question through the rag_pipeline helper.
retrieve_context("What are the main ingredients in a Margherita pizza?", index, json, k=5)

[{'Pizza Name': 'Margherita',
  'Crust\nType': 'Thin',
  'Sauce': 'Tomato',
  'Cheese': 'Mozzarella',
  'Toppings': 'Basil'},
 {'Pizza Name': 'Pepperoni',
  'Crust\nType': 'Classic',
  'Sauce': 'Tomato',
  'Cheese': 'Mozzarella',
  'Toppings': 'Pepperoni'},
 {'Pizza Name': 'Mushroom\nTruffle',
  'Crust\nType': 'Thin',
  'Sauce': 'White\nSauce',
  'Cheese': 'Mozzarella',
  'Toppings': 'Mushrooms, Truffle Oil'},
 {'Pizza Name': 'Veggie\nSupreme',
  'Crust\nType': 'Whole\nWheat',
  'Sauce': 'Tomato',
  'Cheese': 'Mozzarella',
  'Toppings': 'Bell Peppers, Olives, Mushrooms, Onions'},
 {'Pizza Name': 'Four Cheese',
  'Crust\nType': 'Thin',
  'Sauce': 'Tomato',
  'Cheese': 'Mozzarella,\nParmesan,\nGorgonzola,\nRicotta',
  'Toppings': '—'}]