## Import libraries

In [2]:
import pdfplumber
import pandas as pd
import re

## Define a function to extract quantities and units from an ingredient line

In [7]:
# Patterns used by extract_quantities_units_corrected, compiled once and tried in this order.
# A bare count/pack size followed by a trailing quantity + unit (e.g., "15 4 OZ" or "100 3 oz").
_COUNT_THEN_QUANTITY_RE = re.compile(r'(.+?)\s+(\d+)\s+(\d+\.?\d*\s*[a-zA-Z]+)$')
# PATTERN 2: decimal amount, then "<integer> <word>", then one more word.
_DECIMAL_QTY_UNIT_WORD_RE = re.compile(r'(\d+\.\d+)\s(\d+\s[a-zA-Z]+)\s([a-zA-Z]+)')
# PATTERN 3: decimal amount, then "<integer> <word>".
_DECIMAL_QTY_UNIT_RE = re.compile(r'(\d+\.\d+)\s(\d+\s[a-zA-Z]+)')
# Fallback: any "<number> <word>" pair, where the number is a mixed fraction
# ("2 2/3"), a plain fraction ("1/4"), or an integer/decimal ("4.38").
_NUMBER_UNIT_RE = re.compile(r'(\d+\s\d+/\d+|\d+/\d+|\d+\.?\d*)\s([a-zA-Z]+)')


def extract_quantities_units_corrected(line):
    """Extract up to two quantity/unit strings from one ingredient line.

    The patterns are tried in order and the first one that matches wins:

    1. count + trailing quantity/unit -> (count, "qty unit")
    2. decimal + "int word" + word    -> (decimal, "int word word")
    3. decimal + "int word"           -> (decimal, "int word")
    4. every "<number> <word>" pair   -> (all pairs but the last joined by
       spaces, last pair); with a single pair the second element is ''.

    Args:
        line: Ingredient text, e.g. "Light Brown Sugar 2 Cup 3 Tablespoon".
            Empty or None input yields ('', '').

    Returns:
        A (quantity_and_unit_1, quantity_and_unit_2) tuple of strings;
        ('', '') when nothing resembling a quantity is found.
    """
    if not line:
        return '', ''

    count_match = _COUNT_THEN_QUANTITY_RE.search(line)
    if count_match:
        # group(1) is the leading ingredient text; callers only need the
        # count (group 2) and the quantity + unit (group 3).
        return count_match.group(2), count_match.group(3)

    three_part = _DECIMAL_QTY_UNIT_WORD_RE.search(line)
    if three_part:
        return three_part.group(1), three_part.group(2) + ' ' + three_part.group(3)

    two_part = _DECIMAL_QTY_UNIT_RE.search(line)
    if two_part:
        return two_part.group(1), two_part.group(2)

    pairs = _NUMBER_UNIT_RE.findall(line)
    if not pairs:
        return '', ''
    if len(pairs) == 1:
        return ' '.join(pairs[0]), ''

    # Several pairs: everything before the last pair is reported together as
    # the first value, the last pair as the second.
    leading = ' '.join(part for pair in pairs[:-1] for part in pair)
    return leading, ' '.join(pairs[-1])

## Define a function to extract the recipe name and yield

In [8]:
def extract_recipe_name_yield_adjusted(text):
    """Extract the recipe name and yield from one page of extracted text.

    The recipe name is the line immediately preceding the first
    "Cooking Time:" line. The yield is the text following the first
    "Yield:" label, up to the end of that line.

    Args:
        text: Text of a page. Empty or None input yields ('', '').

    Returns:
        A (recipe_name, yield_value) tuple of stripped strings; each element
        is '' when its marker is not present in the text.
    """
    if not text:
        return '', ''

    # Capture the line immediately preceding "Cooking Time:"
    name_match = re.search(r'([^\n]*)\nCooking Time:', text)
    recipe_name = name_match.group(1).strip() if name_match else ''

    # Accept end-of-text as well as a newline as the terminator, so a
    # "Yield:" on the last line of the page (no trailing newline) is kept.
    yield_match = re.search(r'Yield:\s*(.*?)(?:\n|$)', text)
    yield_value = yield_match.group(1).strip() if yield_match else ''

    return recipe_name, yield_value

## Set the PDF path, define an ingredient pattern and initialize the data list


In [9]:
# Absolute, machine-specific path to the recipe PDF that the extraction loop opens.
pdf_path = '/Users/elhamali/Documents/Data Projects/clf-climate-label-study/meal_plans_recipes/week-1-menus/Hopkins Cafe_Breakfast_Full Recipe.pdf'
# Stricter ingredient pattern: a "* " or "- " bullet whose remainder does not
# begin with or contain "do not" / "Note:", and is not a numbered step ("1. ")
# or a CCP note.
# NOTE(review): the extraction loop visible in this file uses its own
# `ingredients_re` and never references this name - confirm whether it should.
# The lookahead also contains a capturing group, so re.findall with this
# pattern would return 2-tuples rather than plain strings.
ingredients_re_inclusive = r'(?:\*|\-)\s(?!do not|Note:|.*\b(do not|Note:)\b|\d+\.\s|CCP:|CCP\s:)([^\n]+)'
# Accumulates one dict per extracted ingredient row.
data = []

## Extract the recipe name, yield and ingredients from each PDF page

In [11]:
# Bullet lines ("* ..." or "- ...") are treated as ingredient lines; the
# lookahead skips numbered steps ("1.") and CCP notes. Defined once, outside
# the page loop, because it does not change between pages.
ingredients_re = r'(?:\*|\-)\s(?!\d+\.|CCP:|CCP\s:)([^\n]+)'

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        # A page without a text layer can give an empty result (None in some
        # pdfplumber versions); there is nothing to parse on such a page.
        text = page.extract_text() or ''
        if not text:
            continue

        # Extract Recipe Name and Yield
        recipe_name, yield_value = extract_recipe_name_yield_adjusted(text)

        # Extract Ingredients
        ingredients = re.findall(ingredients_re, text)

        # Append extracted ingredients to the data list
        for ingredient in ingredients:
            # Split the line into its quantity/unit parts
            quantity_and_unit_1, quantity_and_unit_2 = extract_quantities_units_corrected(ingredient)

            # The remaining text is considered the ingredient name: remove the
            # extracted quantity strings from the line. They are escaped so
            # that e.g. the "." in "4.38" is matched literally, and empty
            # strings are skipped so they add no alternatives to the pattern.
            extracted = [q for q in (quantity_and_unit_1, quantity_and_unit_2) if q]
            if extracted:
                removal_re = '|'.join(rf'\b{re.escape(q)}\b' for q in extracted)
                ingredient_name = re.sub(removal_re, '', ingredient).strip()
            else:
                ingredient_name = ingredient.strip()

            data_entry = {
                'Recipe Name': recipe_name,
                'Yield': yield_value,
                'Ingredient': ingredient_name,
                'Quantity and Unit 1': quantity_and_unit_1,
                'Quantity and Unit 2': quantity_and_unit_2
            }
            data.append(data_entry)
            print("Data Entry Added:", data_entry)

        # Optional: Break after first page for initial testing
        # break

Data Entry Added: {'Recipe Name': 'Cheese Cream Cinnamon Brown Sugar Whipped', 'Yield': '4.38 Pound', 'Ingredient': 'Plain Cream Cheese', 'Quantity and Unit 1': '4.38 Pound', 'Quantity and Unit 2': ''}
Data Entry Added: {'Recipe Name': 'Cheese Cream Cinnamon Brown Sugar Whipped', 'Yield': '4.38 Pound', 'Ingredient': 'Milk Whole Gallon', 'Quantity and Unit 1': '1/4 Cup', 'Quantity and Unit 2': '1/3 Tablespoon'}
Data Entry Added: {'Recipe Name': 'Cheese Cream Cinnamon Brown Sugar Whipped', 'Yield': '4.38 Pound', 'Ingredient': 'Light Brown Sugar', 'Quantity and Unit 1': '2 Cup', 'Quantity and Unit 2': '3 Tablespoon'}
Data Entry Added: {'Recipe Name': 'Cheese Cream Cinnamon Brown Sugar Whipped', 'Yield': '4.38 Pound', 'Ingredient': 'Ground Cinnamon', 'Quantity and Unit 1': '2 2/3 Tablespoon', 'Quantity and Unit 2': ''}
Data Entry Added: {'Recipe Name': 'Cheese Cream Cinnamon Brown Sugar Whipped', 'Yield': '4.38 Pound', 'Ingredient': 'Light Amber Honey', 'Quantity and Unit 1': '1 Cup', 'Qua

## Convert the data into a pandas dataframe

In [12]:
# Build a DataFrame from the accumulated row dicts; the dict keys
# ('Recipe Name', 'Yield', 'Ingredient', ...) become the columns.
df = pd.DataFrame(data)

## Save the dataframe to Excel file

In [13]:
# Destination workbook (absolute, machine-specific path).
output_path = '/Users/elhamali/Documents/Data Projects/clf-climate-label-study/recipe-extraction-sheet.xlsx'
# index=False keeps the DataFrame's row index out of the written sheet.
df.to_excel(output_path, index=False)