# Demonstration of the Method

In [None]:
import pymupdf
import re
from rapidfuzz import fuzz, process
import os
import json
import ollama
import chromadb

### Extracting Text from the PDF

In [None]:
# Open the scanned book; the path is relative to the notebook's working directory.
doc = pymupdf.open("a guide to modern cookery.pdf")

# Chapter one is book pages 1 to 14 and book page 1 is PDF page 27, i.e. the
# offset between book and PDF page numbers is 26.  PyMuPDF indexes pages from
# 0, so the start needs -1; the end does not, because range() excludes it.
# The resulting half-open index range [26, 40) covers all 14 pages.
# NOTE(review): assumes "27" is a 1-based PDF page number, as the original
# "-1 for 0 indexing" remark implies -- confirm against the PDF.
chapter_one_pages = [1 + 26 - 1, 14 + 26]

# read the pages of chapter one
pages = []
for i in range(chapter_one_pages[0], chapter_one_pages[1]):
    pages.append(doc[i])

# clean the pages by removing unnecessary text
cleaned_pages = []
# running titles that appear on the pages (the double spaces are part of the
# second phrase as written here)
target_phrases = ["GUIDE TO MODERN COOKERY", "FONDS  DE  CUISINE"]
for page in pages:
    text = page.get_text()
    # Keep only the lines that do not fuzzily match any of the title phrases.
    # NOTE(review): partial_ratio scores the best-matching substring, so a very
    # short line (e.g. a single letter contained in a title) may also exceed
    # the threshold and be dropped -- verify on the extracted text.
    filtered_lines = [
        line for line in text.splitlines()
        if not any(fuzz.partial_ratio(line.strip(), phrase) > 80 for phrase in target_phrases)
    ]
    cleaned_pages.append("\n".join(filtered_lines))

# one string for the whole chapter, pages separated by a newline
chapter_text = "\n".join(cleaned_pages)

### Extracting Recipes from the Chapter

In [None]:
# extracting recipes
recipes = []

# A recipe block starts on a line that opens with a 1 to 4 character
# identifier, an em dash and whitespace.  It then runs lazily (DOTALL lets '.'
# cross newlines) until the next line that opens with an identifier and em
# dash, or until the end of the text.  The leading '^' (with MULTILINE) keeps a
# mid-sentence "word—" from being mistaken for the start of a recipe.
recipe_pattern = re.compile(
    r'^(?P<header>[A-Za-z0-9]{1,4}—\s+.+?)(?=\n[A-Za-z0-9]{1,4}—|\Z)',
    re.DOTALL | re.MULTILINE)

# Splits the first line of a block into the identifier and the title.
# Compiled once here instead of on every loop iteration.
header_patter = re.compile(
    r'^(?P<id>[A-Za-z0-9]{1,4})—(?P<title>.*)$',
    re.MULTILINE
)


def _clean_recipe_text(raw):
    """Return *raw* with '^' swapped for 'e' and whitespace collapsed.

    Every run of whitespace, including line breaks, becomes a single space, so
    the last word of one line is not fused with the first word of the next.
    The '^' characters are presumably an extraction artifact standing in for 'e'.
    """
    return re.sub(r'\s+', ' ', raw.replace('^', 'e')).strip()


for block in recipe_pattern.finditer(chapter_text):
    block_text = block.group(0)  # extract whole match
    current_recipe = {}

    # extract title and id
    header_match = header_patter.search(block_text)
    if header_match:
        current_recipe["recipe_id"] = header_match.group("id")
        current_recipe['title'] = _clean_recipe_text(header_match.group("title"))
        # everything after the header line is the body of the recipe
        current_recipe['instructions'] = _clean_recipe_text(block_text[header_match.end():])
    else:
        # if no header is found, just store everything as instructions
        current_recipe["instructions"] = _clean_recipe_text(block_text)

    # add to list of recipes for this chapter
    recipes.append(current_recipe)
    

### Computing and Storing Embeddings for Each Recipe

In [None]:
# create lists of documents and ids for each recipe
documents = []
ids = []
for recipe in recipes:
    recipe_id = recipe.get('recipe_id')
    if recipe_id is None:
        # a recipe stored without a header has no ID (and no title) to index
        # it under, so it is left out instead of raising a KeyError
        continue
    text = f"Title: {recipe.get('title', '')} \n Instructions: {recipe['instructions']}"
    documents.append(text)
    ids.append(recipe_id)

# get or create the collection for the recipes
client = chromadb.PersistentClient(path="./chroma_db")
collection = client.get_or_create_collection(name="recipes")

# for each recipe, generate the embedding and store it
# enumerate() supplies the counter for the progress message, so the loop does
# not depend on an unrelated `i` left behind by earlier code.
for i, (recipe_id, document) in enumerate(zip(ids, documents)):
    if i % 100 == 0:
        print(f"Embedding recipe: {i+1}.")
    response = ollama.embed(model="mxbai-embed-large:latest", input=document)
    embeddings = response["embeddings"]
    collection.add(
        ids=[str(recipe_id)],  # use the recipe ID from the book
        # one entry per id/document: wrap the single vector in a list, the
        # same shape used when querying the collection
        embeddings=[embeddings[0]],
        documents=[document],
        # can add recipe metadata here such as ingredients and the chapter title
    )



### Testing Recipe Retrieval

In [None]:
# an example input
# (named `user_query` so the builtin input() is not shadowed for the rest of
# the session)
user_query = "What should I cook with thyme and basil?"

# generate an embedding for the input and retrieve the most relevant doc
embedded_input = ollama.embed(
    model="mxbai-embed-large",
    input=user_query,
)

# use the embedded input to query for the most similar recipes
results = collection.query(
    query_embeddings=[embedded_input["embeddings"][0]],
    n_results=3,
)

# format the extracted similar text
# results['documents'] holds one list per query embedding; only one was sent
data = results['documents'][0]
formatted_data = "\n\n".join([f"Document {i+1}: {doc}" for i, doc in enumerate(data)])

# generate a response combining the prompt and data we retrieved
output = ollama.generate(
    model="llama3",
    prompt=f"Using these recipes: {formatted_data}. Respond to this prompt using one or some of the techniques contained: {user_query}",
)

print(output['response'])