### Landing AI Parser

In [52]:
# Relative path to the PDF that the Landing AI parser cells below operate on.
pdf_path = 'data/pizza mizza.pdf'
from pdf_parser import pdf_parser_func
from llm_engine import generate_content

In [None]:
# Parse the PDF with the project parser; the next cell prints the result,
# which is text/markup containing the ingredient table.
parsing_response = pdf_parser_func(pdf_path)

# Keep the parsed document on its own lines. Previously it was interpolated
# mid-sentence and the follow-up instruction was glued to its last character
# ("...{parsing_response}.Justify"), blurring where the document ends and the
# instruction begins.
prompt = (
    "how much protein does this pizza contain in total:\n\n"
    f"{parsing_response}\n\n"
    "Justify your answers based only on the text"
)

# generate_content returns the model's answer; it is displayed as the cell output.
result = generate_content(prompt)
result

E0000 00:00:1760711105.331985 1555448 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


"Based on the provided table, here's the total protein content of the pizza:\n\n*   Whole wheat flour: 6g\n*   Tomato sauce: 2g\n*   Motal cheese: 12g\n*   Mozzarella cheese: 10g\n*   Cooked red beans: 5g\n*   Chicken breast: 20g\n*   Spinach: 1g\n*   Red onion: 0.5g\n*   Water, olive oil, salt, and oregano & basil: 0g\n\nAdding those values together: 6 + 2 + 12 + 10 + 5 + 20 + 1 + 0.5 = **56.5g**\n\nTherefore, the pizza contains a total of **56.5 grams of protein**.\n"

In [49]:
# Show the raw parser output. print() is used so that newlines and markup
# render as-is instead of as an escaped string repr.
print(parsing_response)

<a id='e093d7a4-e4b5-4161-b840-bef670140ce2'></a>

Pizza "Mizza"

<a id='5d0a889b-1b1f-47a1-8108-1272a253e06b'></a>

O Cooking Instructions

1.  **Prepare the dough**: Mix whole wheat flour, water, olive oil, and salt. Knead until smooth and let it rest for 1 hour.
2.  **Preheat oven** to 220°C (428°F).
3.  **Roll out the dough** into a thin base and place on a baking tray.
4.  **Spread tomato sauce** evenly over the base.
5.  **Layer toppings**: Start with spinach, beans, chicken, red onion, and sprinkle both Motal and mozzarella cheese.
6.  **Season** with oregano and basil.
7.  **Bake** for 12-15 minutes until crust is golden and cheese is bubbling.

<a id='57d1da43-953c-4c39-aa42-c0aa86772bc1'></a>

🍕 Pizza Ingredient Table
<table id="0-1">
<tr><td id="0-2">Ingredient</td><td id="0-3">Grams</td><td id="0-4">Protein (g)</td></tr>
<tr><td id="0-5">Whole wheat flour</td><td id="0-6">150</td><td id="0-7">6</td></tr>
<tr><td id="0-8">Water</td><td id="0-9">100</td><td id="0-a">0</td></t

### Open-source parsers (pdfminer, Camelot, Tabula, pdfplumber, OCR)

In [None]:
import os
import warnings

import camelot
import pandas as pd
import pytesseract
import tabula
from pdf2image import convert_from_path
from pdfminer.high_level import extract_text

def extract_tables_from_pdf(pdf_path, output_dir="output_tables"):
    """Extract tables from a PDF as a list of pandas DataFrames.

    A PDF is treated as *digital* when ``extract_text`` returns any
    non-whitespace text, and as *scanned* otherwise.

    * Digital: Camelot is tried first ("lattice" flavor, then "stream" if
      lattice finds nothing). If Camelot raises, or finds no tables at all,
      Tabula is used instead.
    * Scanned: every page is converted to an image and OCR'd with Tesseract
      (``--psm 6``); each page becomes a one-column DataFrame (``raw_text``)
      holding its non-empty, stripped lines.

    Args:
        pdf_path: Path to the PDF file.
        output_dir: Directory that is created if it does not exist. This
            function does not write anything into it.

    Returns:
        list: The DataFrames from Camelot (``table.df``), the objects returned
        by ``tabula.read_pdf``, or the per-page OCR DataFrames.

    Raises:
        Exception: Whatever Tabula raises when it is used as the fallback
            after Camelot itself raised (both extractors failed).
    """
    os.makedirs(output_dir, exist_ok=True)

    # Step 1: no extractable text layer at all -> assume the pages are images.
    is_scanned = not extract_text(pdf_path).strip()

    if is_scanned:
        # Step 2b: OCR for scanned PDFs.
        ocr_tables = []
        for image in convert_from_path(pdf_path):
            ocr_text = pytesseract.image_to_string(image, config="--psm 6")
            # Basic text-based parsing (can be LLM-refined later).
            lines = [line.strip() for line in ocr_text.split("\n") if line.strip()]
            ocr_tables.append(pd.DataFrame(lines, columns=["raw_text"]))
        return ocr_tables

    # Step 2a: try Camelot first.
    all_tables = []
    camelot_failed = False
    try:
        tables = camelot.read_pdf(pdf_path, pages="all", flavor="lattice")
        if len(tables) == 0:
            tables = camelot.read_pdf(pdf_path, pages="all", flavor="stream")
        all_tables.extend(table.df for table in tables)
    except Exception as exc:
        # Best-effort: report why Camelot failed instead of hiding it, and
        # drop any partially collected tables so they are not mixed with
        # Tabula's results for the same document.
        camelot_failed = True
        all_tables = []
        warnings.warn(
            f"Camelot failed on {pdf_path!r} ({exc!r}); falling back to Tabula.",
            stacklevel=2,
        )

    if not all_tables:
        # Fallback to Tabula, both when Camelot raised and when it simply
        # found no tables.
        try:
            all_tables.extend(
                tabula.read_pdf(pdf_path, pages="all", multiple_tables=True)
            )
        except Exception as exc:
            if camelot_failed:
                # Both extractors failed: surface the Tabula error, as before.
                raise
            # Camelot ran cleanly but found nothing; keep returning an empty
            # list rather than turning that case into a crash.
            warnings.warn(
                f"Tabula fallback failed on {pdf_path!r} ({exc!r}); no tables extracted.",
                stacklevel=2,
            )

    return all_tables

def _render_table(table):
    """Return the full text of one table, bypassing pandas' repr truncation."""
    to_string = getattr(table, "to_string", None)
    return to_string() if callable(to_string) else str(table)


def convert_tables_into_json(tables):
    """Ask the LLM to rewrite extracted tables as a clean JSON table.

    Args:
        tables: A list/tuple of tables (e.g. the DataFrames returned by
            ``extract_tables_from_pdf``) or a single table/string. Objects
            exposing ``to_string()`` are rendered in full; anything else is
            rendered with ``str()``.

    Returns:
        The value returned by ``generate_content`` for the prompt (the first
        section of this notebook shows it returning the answer as a string).
    """
    # Formatting DataFrames with f"{tables}" uses their repr, which elides
    # rows/columns of larger tables with "..."; render every table in full so
    # the model sees all of the data.
    if isinstance(tables, (list, tuple)):
        tables_text = "\n\n".join(_render_table(table) for table in tables)
    else:
        tables_text = _render_table(tables)

    prompt = f"Convert the following semi-structured text into a clean JSON table with proper headers: {tables_text}"

    # `llm` is not defined by any cell in this notebook, so the call only
    # worked with leftover kernel state. Use the `generate_content` helper
    # imported from llm_engine at the top of the notebook instead.
    return generate_content(prompt)

In [26]:
# Point pdf_path at a different sample PDF; this rebinds the name that the
# Landing AI section set earlier in the notebook.
pdf_path = "pizza meat-meat.pdf"
# Extract the tables, then ask the LLM to turn them into a JSON table.
tables = extract_tables_from_pdf(pdf_path)
# The result is stored but not displayed by this cell.
table_json = convert_tables_into_json(tables)

E0000 00:00:1760623902.883975 1419050 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


In [6]:
import pdfplumber

def extract_text_only(pdf_path, output_txt="output_text.txt"):
    """Return the PDF's text as newline-joined, stripped, non-empty lines.

    If ``extract_text`` finds any non-whitespace text, the PDF is read page
    by page with pdfplumber (``layout=True``); otherwise every page is
    converted to an image and OCR'd with Tesseract (``lang="eng"``).

    Args:
        pdf_path: Path to the PDF file.
        output_txt: Accepted for interface compatibility; the body does not
            reference it, so no file is written.

    Returns:
        str: All page texts, one stripped line per row, blank lines removed.
    """
    has_text_layer = bool(extract_text(pdf_path).strip())

    if has_text_layer:
        # Digital PDF: pdfplumber text extraction; pages without text yield "".
        with pdfplumber.open(pdf_path) as pdf:
            page_texts = [(page.extract_text(layout=True) or "") for page in pdf.pages]
    else:
        # Scanned PDF: OCR each rendered page image.
        page_texts = [
            pytesseract.image_to_string(image, lang="eng")
            for image in convert_from_path(pdf_path)
        ]

    # Terminate every page with a newline so pages never share a line.
    combined = "".join(text + "\n" for text in page_texts)

    # Trim each line and keep only those that still have content.
    stripped_lines = (line.strip() for line in combined.splitlines())
    return "\n".join(line for line in stripped_lines if line)


In [None]:
# pdf_path = "pizza meat-meat.pdf"
# extracted_text = extract_text_only(pdf_path)
# prompt= f"how much onion is used in this pizza: {extracted_text}"
# response = llm.generate_content(prompt)
# print(response.text)

### RAG part

In [4]:
# Flatten every row of df into a single " | "-separated string.
# NOTE(review): df is not defined in any visible cell of this notebook, so this
# relies on leftover kernel state — confirm which extracted table it should be.
texts = df.apply(lambda row: " | ".join(row.astype(str)), axis=1).tolist()

In [9]:
# One dict per row of df, keyed by column name.
# NOTE(review): the name `json` shadows the standard-library module of the same
# name for the rest of the kernel session; later cells read this variable, so
# renaming it means updating those cells together.
json = df.to_dict(orient="records")

  json = df.to_dict(orient="records")


In [10]:
from embed_store import build_faiss_index, get_embeddings
import faiss

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
# Inspect the first record to check the column names and values.
json[0]

{'Pizza Name': 'Margherita',
 'Crust\nType': 'Thin',
 'Sauce': 'Tomato',
 'Cheese': 'Mozzarella',
 'Toppings': 'Basil'}

In [20]:
# Embed the records; the displayed result is a 2-D float32 array
# (presumably one row per record — confirm against embed_store).
get_embeddings(json)

array([[-0.0038396 ,  0.07903948, -0.11241125, ..., -0.01579538,
         0.0650669 ,  0.05390403],
       [-0.07182054,  0.02036707, -0.05242678, ...,  0.0403441 ,
         0.07116739,  0.0442771 ],
       [-0.01778399,  0.00353835, -0.07590371, ...,  0.01286497,
        -0.03862418, -0.00466059],
       ...,
       [-0.07004339, -0.06089054, -0.05413391, ...,  0.0301772 ,
         0.03380238,  0.06286311],
       [ 0.03541187,  0.04458203, -0.02035068, ...,  0.03373704,
         0.02461265, -0.04009247],
       [ 0.04753882, -0.0023236 , -0.0302921 , ...,  0.0099019 ,
         0.10350329, -0.00841073]], dtype=float32)

In [22]:
from rag_pipeline import retrieve_context, generate_answer

In [26]:
# Build the FAISS index over the records with the embed_store helper.
index = build_faiss_index(json)

In [31]:
# Query the index for the 3 nearest entries; the output is a
# (distances, indices) pair with a single row.
# NOTE(review): [json] wraps the whole records list as ONE item to embed —
# confirm whether a single record or a text query was intended here.
index.search(get_embeddings([json]), 3)

(array([[0.47110212, 0.34942874, 0.34083316]], dtype=float32),
 array([[1, 0, 6]], dtype=int64))

In [28]:
# Retrieve context for a natural-language question with k=5; the output below
# is a list of five record dicts taken from `json`.
retrieve_context("What are the main ingredients in a Margherita pizza?", index, json, k=5)

[{'Pizza Name': 'Margherita',
  'Crust\nType': 'Thin',
  'Sauce': 'Tomato',
  'Cheese': 'Mozzarella',
  'Toppings': 'Basil'},
 {'Pizza Name': 'Pepperoni',
  'Crust\nType': 'Classic',
  'Sauce': 'Tomato',
  'Cheese': 'Mozzarella',
  'Toppings': 'Pepperoni'},
 {'Pizza Name': 'Mushroom\nTruffle',
  'Crust\nType': 'Thin',
  'Sauce': 'White\nSauce',
  'Cheese': 'Mozzarella',
  'Toppings': 'Mushrooms, Truffle Oil'},
 {'Pizza Name': 'Veggie\nSupreme',
  'Crust\nType': 'Whole\nWheat',
  'Sauce': 'Tomato',
  'Cheese': 'Mozzarella',
  'Toppings': 'Bell Peppers, Olives, Mushrooms, Onions'},
 {'Pizza Name': 'Four Cheese',
  'Crust\nType': 'Thin',
  'Sauce': 'Tomato',
  'Cheese': 'Mozzarella,\nParmesan,\nGorgonzola,\nRicotta',
  'Toppings': '—'}]