## Import libraries

In [56]:
import pdfplumber
import pandas as pd
import re
import glob

## Functions to extract ingredients, portions and units

In [57]:
def extract_quantities_units_corrected(line):
    # This pattern is for a quantity that may be followed by a unit (e.g., "15 4 OZ" or "100 3 oz")
    pattern_with_count_or_pack_size = r'(.+?)\s+(\d+)\s+(\d+\.?\d*\s*[a-zA-Z]+)$'
    
    match = re.search(pattern_with_count_or_pack_size, line)
    if match:
        ingredient_name = match.group(1).strip()
        quantity_and_unit_1 = match.group(2).strip()  # The count or pack size
        quantity_and_unit_2 = match.group(3).strip()  # The actual quantity and unit
        return match.group(2), match.group(3)
    
    # If the pattern is not matched, try to match other patterns like PATTERN 2 and PATTERN 3
    # Define regex patterns for PATTERN 2 and PATTERN 3 with correct capturing groups
    pattern_2 = r'(\d+\.\d+)\s(\d+\s[a-zA-Z]+)\s([a-zA-Z]+)'
    pattern_3 = r'(\d+\.\d+)\s(\d+\s[a-zA-Z]+)'

    # Check for PATTERN 2
    matches = re.search(pattern_2, line)
    if matches:
        return matches.group(1), matches.group(2) + ' ' + matches.group(3)
    
    # Check for PATTERN 3
    matches = re.search(pattern_3, line)
    if matches:
        return matches.group(1), matches.group(2)
    
    # If no specific pattern is matched, try to extract any number and unit pairs
    matches = re.findall(r'(\d+\s\d+/\d+|\d+/\d+|\d+\.?\d*)\s([a-zA-Z]+)', line)
    if matches:
        if len(matches) > 1:
            quantity_and_unit_1 = " ".join(sum(matches[:-1], ()))
            quantity_and_unit_2 = " ".join(matches[-1])
        else:
            quantity_and_unit_1 = " ".join(matches[0])
            quantity_and_unit_2 = ''
        return quantity_and_unit_1, quantity_and_unit_2
    
    return '', ''

## Functions to extract recipe name, yield and portions

In [58]:
def extract_recipe_name_yield_adjusted(text):
    # Adjust regex to capture the line immediately preceding "Cooking Time"
    recipe_name_pattern = r'([^\n]*)\nCooking Time:'
    recipe_name_match = re.search(recipe_name_pattern, text)
    recipe_name = recipe_name_match.group(1).strip() if recipe_name_match else ''

    # Yield extraction remains the same
    yield_pattern = r'Yield:\s*(.*?)\n'
    yield_match = re.search(yield_pattern, text)
    yield_value = yield_match.group(1).strip() if yield_match else ''

    return recipe_name, yield_value

In [59]:
def extract_portions(text):
    # Use Case 1: Number followed by a fraction with a unit (e.g., "500 1/2 cup")
    pattern_1 = r'Portions:\s*(\d+)\s+(\d+/\d+\s[a-zA-Z]+)'
    # Use Case 2: Number followed by text (e.g., "500 Slice", "50 Each")
    pattern_2 = r'Portions:\s*(\d+\s[a-zA-Z]+)'
    # Use Case 3: Number followed by a number and multiple texts (e.g., "50 4 oz ladle")
    pattern_3 = r'Portions:\s*(\d+)\s+(\d+\s[a-zA-Z]+\s[a-zA-Z]+)'
    # Use Case 4: Decimal number followed by text (e.g., "4.5 Pound")
    pattern_4 = r'Portions:\s*(\d+\.\d+\s[a-zA-Z]+)'
    # Use Case 5: Number followed by decimal and text (e.g., "400 5.5 oz Portion")
    pattern_5 = r'Portions:\s*(\d+)\s+(\d+\.\d+\s[a-zA-Z]+\s[a-zA-Z]+)'
    # Use Case 6: Text only (e.g., "(see below)")
    pattern_6 = r'Portions:\s*([a-zA-Z\(\)\s]+)'

    # Check for each pattern in order
    match = re.search(pattern_1, text)
    if match:
        return match.group(1), match.group(2)
    
    match = re.search(pattern_2, text)
    if match:
        return match.group(1), ''

    match = re.search(pattern_3, text)
    if match:
        return match.group(1), match.group(2)

    match = re.search(pattern_4, text)
    if match:
        return match.group(1), ''

    match = re.search(pattern_5, text)
    if match:
        return match.group(1), match.group(2)

    match = re.search(pattern_6, text)
    if match:
        return match.group(1), ''

    # Default case if none of the patterns match
    return '', ''

## Functions to extract site name, date, station and meal type

In [60]:
def extract_site_date_station_meal_type(text):
    site, date, station, meal_type = '', '', '', ''

    # Define patterns for Site, Date, and Meal Type
    site_pattern = r'(JHU Hopkins Cafe|JHU Nolans on 33rd)'
    date_pattern = r'(\w+day \d{1,2}/\d{1,2}/\d{4})'  # Pattern for "Day MM/DD/YYYY"
    meal_type_pattern = r'(Breakfast|Lunch|Dinner|All Meals|Late Night)'

    # Extract Site
    site_match = re.search(site_pattern, text)
    if site_match:
        site = site_match.group(1)
        site_index = site_match.end()  

    # Extract Date
    date_match = re.search(date_pattern, text)
    if date_match:
        date = date_match.group(1)

    # Extract Meal Type
    meal_type_match = re.search(meal_type_pattern, text)
    if meal_type_match:
        meal_type = meal_type_match.group(1)
        meal_type_index = meal_type_match.start()  

    # Extract Station - located to the right of Site and above Meal Type
    if site_match and meal_type_match and site_index < meal_type_index:
        station_line = text[site_index:meal_type_index].split('\n')[0]  
        station = station_line.strip()

    return site, date, station, meal_type

## Open PDF files, initialize dataframe and store data


In [61]:
folder_paths = [
    'meal_plans_recipes/week-1-menus',
    'meal_plans_recipes/week-2-menus',
    'meal_plans_recipes/week-3-menus',
    'meal_plans_recipes/week-4-menus',
]

ingredients_re_inclusive = r'(?:\*|\-)\s(?!do not|Note:|.*\b(do not|Note:)\b|\d+\.\s|CCP:|CCP\s:)([^\n]+)'
data = []

## Data extraction process

In [62]:
# Populate pdf_paths with every PDF file in each folder
pdf_paths = []
for folder_path in folder_paths:
    pdf_paths.extend(glob.glob(f"{folder_path}/*.pdf"))

# Loop through each PDF file
for pdf_path in pdf_paths:
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()

            # Extract Site, Date, Station, and Meal Type
            site, date, station, meal_type = extract_site_date_station_meal_type(text)

            # Extract Recipe Name, Yield and Portions
            recipe_name, yield_value = extract_recipe_name_yield_adjusted(text)
            portion_1, portion_2 = extract_portions(text)

            # Extract Ingredients 
            ingredients_re = r'(?:\*|\-)\s(?!\d+\.|CCP:|CCP\s:)([^\n]+)'
            ingredients = re.findall(ingredients_re, text)

            # Append extracted ingredients to the data list
            for ingredient in ingredients:
                quantity_and_unit_1, quantity_and_unit_2 = extract_quantities_units_corrected(ingredient)
                ingredient_name = re.sub(rf'\b{quantity_and_unit_1}\b|\b{quantity_and_unit_2}\b', '', ingredient).strip()

                data_entry = {
                    'Site': site,
                    'Date': date,
                    'Station': station,
                    'Meal Type': meal_type,
                    'Recipe Name': recipe_name,
                    'Yield': yield_value,
                    'Portion 1': portion_1,
                    'Portion 2': portion_2,
                    'Ingredient': ingredient_name,
                    'Quantity and Unit 1': quantity_and_unit_1,
                    'Quantity and Unit 2': quantity_and_unit_2
                }
                data.append(data_entry)
                print("Data Entry Added:", data_entry)

            # Optional: Break after first page for initial testing
            # break

Data Entry Added: {'Site': 'JHU Hopkins Cafe', 'Date': 'Monday 11/6/2023', 'Station': '[None]', 'Meal Type': 'Late Night', 'Recipe Name': 'Grill Plant Based Perfect Burger', 'Yield': '5 4 oz', 'Portion 1': '5', 'Portion 2': '4 oz\nInternal', 'Ingredient': 'Plant Based Perfect Burger', 'Quantity and Unit 1': '5', 'Quantity and Unit 2': '4 OZ'}
Data Entry Added: {'Site': 'JHU Hopkins Cafe', 'Date': 'Monday 11/6/2023', 'Station': '[None]', 'Meal Type': 'Late Night', 'Recipe Name': 'Grill Vegan Cheddar Cheese', 'Yield': '2 slice', 'Portion 1': '2 slice', 'Portion 2': '', 'Ingredient': 'Vegan Cheddar Cheese', 'Quantity and Unit 1': '2 Slice', 'Quantity and Unit 2': ''}
Data Entry Added: {'Site': 'JHU Hopkins Cafe', 'Date': 'Monday 11/6/2023', 'Station': '[None]', 'Meal Type': 'Late Night', 'Recipe Name': 'Lemonade Blue Jay', 'Yield': '2 Cup', 'Portion 1': '1', 'Portion 2': '8 oz\nInternal', 'Ingredient': 'Drink Lemonade Powder', 'Quantity and Unit 1': '0.07', 'Quantity and Unit 2': '14 Oz P

## Dataframe creation

In [53]:
df = pd.DataFrame(data)

## Dataframe export to Excel file

In [54]:
output_path = '/Users/elhamali/Documents/Data Projects/clf-climate-label-study/recipe-extraction-sheet.xlsx'
df.to_excel(output_path, index=False)