## Import libraries

In [4]:
import pdfplumber
import pandas as pd
import re

## Define the PDF path and the regular expressions


In [5]:
# Location of the recipe-card PDF to parse.
pdf_path = '/Users/elhamali/Documents/Data Projects/clf-climate-label-study/sample-3-recipe-cards.pdf'

# Header line "Monday <m>/<d>/<yyyy> Dinner"; group 1 is the line that follows it.
# NOTE(review): the weekday and meal are hard-coded, so only pages whose header
# reads "Monday ... Dinner" will match - confirm that is intended.
recipe_name_re = r'Monday \d{1,2}/\d{1,2}/\d{4} Dinner\n(.+)\n'
# Group 1 is the rest of the "Yield: " line.
yield_re = r'Yield: (.+?)\n'
# Group 1 is the portion count, group 2 the per-portion size in ounces.
portions_re = r'Portions: (\d+) +(\d+ oz)'
# "- <ingredient> <quantity> <unit>". The mixed-fraction alternative must be
# tried first: regex alternation takes the first branch that lets the overall
# match succeed, so with the plain-number branch in front "1 1/2 cup" was
# split into quantity "1" and unit "1/2 cup".
ingredients_re = r'-\s(.+?)\s(\d+ \d+/\d+|\d+\.?\d*)\s(.+)'

## Open the PDF with pdfplumber

In [8]:
# Accumulator for the extracted rows: a plain list of dicts, one per
# ingredient line, filled in while the PDF pages are parsed.
data = list()

In [None]:
def _group_or_default(pattern, text, default, group=1):
    """Return ``group`` of the first match of ``pattern`` in ``text``.

    Falls back to ``default`` when the pattern does not match, and runs the
    search only once per call.
    """
    match = re.search(pattern, text)
    return match.group(group) if match else default


with pdfplumber.open(pdf_path) as pdf:
    # Loop through each page in the PDF
    for page in pdf.pages:
        # A page without a text layer may yield no text at all; fall back to
        # an empty string so the regex searches below cannot raise TypeError.
        text = page.extract_text() or ''

        # Use regular expressions to match and parse the text
        recipe_name_match = re.search(recipe_name_re, text)
        yield_match = re.search(yield_re, text)
        portions_match = re.search(portions_re, text)

        # Only proceed if all required parts were found
        if not (recipe_name_match and yield_match and portions_match):
            continue

        recipe_name = recipe_name_match.group(1)
        yield_amount = yield_match.group(1)
        portions = portions_match.group(1)
        serving_portion_size = portions_match.group(2)

        # Optional page-level metadata, each with a placeholder default.
        # NOTE(review): site_re, date_re, recipe_note_re and meal_type_re are
        # not defined in the cells shown here; they must be defined before
        # this cell runs or these lookups raise NameError.
        site = _group_or_default(site_re, text, 'Unknown Site', group=0)
        date = _group_or_default(date_re, text, 'Unknown Date')
        recipe_note = _group_or_default(recipe_note_re, text, 'No Note')
        meal_type = _group_or_default(meal_type_re, text, 'Unknown Meal Type')

        # One output row per ingredient line found on the page
        for ingredient, portion_used, unit in re.findall(ingredients_re, text):
            data.append({
                'Ingredient': ingredient,
                'Portion Used': portion_used,
                # ingredients_re captures only three groups, so there is no
                # separate portion-size value; the column is kept (empty) so
                # the sheet layout does not change. The original referenced
                # an undefined name here and raised NameError.
                'Portion Size': None,
                'Unit': unit,
                'Recipe Name': recipe_name,
                'Yield': yield_amount,
                'Portions': portions,
                'Serving Portion Size': serving_portion_size,
                'Site': site,
                'Date': date,
                'Recipe Note': recipe_note,
                'Meal Type': meal_type,
            })

## Convert the data into a pandas DataFrame

In [None]:
# Build the table from the collected rows (one row per ingredient dict)
df = pd.DataFrame(data=data)

## Save the dataframe to an Excel file

In [None]:
# Destination workbook for the extracted recipe rows
output_path = '/Users/elhamali/Documents/Data Projects/clf-climate-label-study/recipe-extraction-sheet.xlsx'
# Write the rows without the DataFrame's index column
df.to_excel(excel_writer=output_path, index=False)