## Preprocessing Merged Dataset

### Import Dependencies

In [574]:
import pandas as pd
from IPython.display import Image
import re
import ast
import fractions
from fractions import Fraction

### Import dataset

In [575]:
# Load Dataset Allrecipes
food_df = pd.read_csv('Food_Dataset.csv', index_col=None)
food_df.head()

Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
0,15 Best Air Fryer Thanksgiving Recipes,,,,,,,,https://www.allrecipes.com/thmb/zNe_lQRZgjj1rS...,https://www.allrecipes.com/gallery/best-air-fr...
1,Air Fryer Turkey Breast,"{'quantity': '1', 'unit': 'tablespoon', 'name'...",,263.0,0g,40g,10g,6.0,https://www.allrecipes.com/thmb/PaF8nNOY0bLCvo...,https://www.allrecipes.com/recipe/275372/air-f...
2,16 Quick-and-Easy Side Dish Recipes for the Ai...,,,,,,,,https://www.allrecipes.com/thmb/91y3R4leqrUtBV...,https://www.allrecipes.com/gallery/air-fryer-s...
3,Best Holiday Party Appetizers to Make in the A...,,,,,,,,https://cdn.jwplayer.com/v2/media/rggkwMPu/pos...,https://www.allrecipes.com/article/best-holida...
4,Air Fryer Lemon Garlic Parmesan Chicken,"{'quantity': '1 1/2', 'unit': 'pounds', 'name'...",Gather all ingredients.. Preheat an air fryer ...,365.0,8g,46g,17g,4.0,https://www.allrecipes.com/thmb/5nJvgXENSeFx82...,https://www.allrecipes.com/air-fryer-lemon-gar...


In [576]:
food_df.describe()

Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
count,24162,23091,20595,22841,22818,22774,22619,22834,24018,24162
unique,19282,18327,17042,2870,902,746,721,201,19091,19317
top,Gnocchi,"{'quantity': '4', 'unit': '', 'name': 'apples ...","Gather the ingredients.. Place whitefish, sour...",'0',1g,3g,0g,4,https://cdn.jwplayer.com/v2/media/Ug0PzrYB/thu...,https://www.allrecipes.com/recipe/17445/grille...
freq,9,9,8,88,415,1490,813,4794,10,9


### Remove NaN

In [577]:
# Count of NaN values in each column
nan_count = food_df.isna().sum()

# Print the count of NaN values
print("Count of NaN values in each column:")
print(nan_count)

Count of NaN values in each column:
name              0
ingredients    1071
steps          3567
calories       1321
carbs          1344
protein        1388
fat            1543
servings       1328
image_url       144
link              0
dtype: int64


In [578]:
# Drop rows with any NaN values
food_df = food_df.dropna()

# Display the cleaned DataFrame
print("DataFrame after dropping rows with NaN values:")
food_df.describe()


DataFrame after dropping rows with NaN values:


Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
count,20060,20060,20060,20060,20060,20060,20060,20060,20060,20060
unique,16586,16599,16604,2821,893,736,708,196,16592,16605
top,Smoked Fish Dip,"{'quantity': '2', 'unit': 'cups', 'name': 'fla...","Gather the ingredients.. Place whitefish, sour...",'0',1g,3g,0g,4,https://cdn.jwplayer.com/v2/media/Ug0PzrYB/thu...,https://www.allrecipes.com/recipe/45291/smoked...
freq,8,8,8,88,336,1295,719,4355,10,8


### Remove duplicate

In [579]:
# Print the sum of duplicate rows
duplicate_count = food_df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

Number of duplicate rows: 3455


In [580]:
# Remove the duplicate rows
food_df= food_df.drop_duplicates()

# Display the cleaned DataFrame
print("\nDataFrame after removing duplicate rows:")
food_df.describe()


DataFrame after removing duplicate rows:


Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
count,16605,16605,16605,16605,16605,16605,16605,16605,16605,16605
unique,16586,16599,16604,2821,893,736,708,196,16592,16605
top,Cajun Chicken and Sausage Gumbo,[],Pour the 1/2 cup of water or milk into a small...,'0',1g,3g,0g,4,https://cdn.jwplayer.com/v2/media/Ug0PzrYB/thu...,https://www.taste.com.au/recipes/grilled-haris...
freq,2,4,2,88,256,1030,540,3712,4,1


### Clean '[]' ingridients

In [581]:
# Remove rows where 'ingredients' is '[]'
food_df = food_df[food_df['ingredients'] != '[]']

# Display the cleaned DataFrame
print("DataFrame after removing rows with '[]' in the 'ingredients' column:")
food_df.describe()

DataFrame after removing rows with '[]' in the 'ingredients' column:


Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
count,16601,16601,16601,16601,16601,16601,16601,16601,16601,16601
unique,16582,16598,16600,2820,892,735,706,196,16588,16601
top,Cajun Chicken and Sausage Gumbo,"{'quantity': '6', 'unit': '', 'name': 'eggs'}",Pour the 1/2 cup of water or milk into a small...,'0',1g,3g,0g,4,https://cdn.jwplayer.com/v2/media/Ug0PzrYB/thu...,https://www.taste.com.au/recipes/grilled-haris...
freq,2,2,2,88,256,1030,540,3711,4,1


### Lowercase all name

In [582]:
# Convert all food names to lowercase
food_df['name'] = food_df['name'].str.lower()

# Display the updated DataFrame
print("DataFrame after converting food names to lowercase:")
food_df.head()

DataFrame after converting food names to lowercase:


Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
4,air fryer lemon garlic parmesan chicken,"{'quantity': '1 1/2', 'unit': 'pounds', 'name'...",Gather all ingredients.. Preheat an air fryer ...,365,8g,46g,17g,4,https://www.allrecipes.com/thmb/5nJvgXENSeFx82...,https://www.allrecipes.com/air-fryer-lemon-gar...
6,air fryer s’mores,"{'quantity': '1', 'unit': 'sleeve', 'name': 'g...",Preheat an air fryer to 380 degrees F (193 deg...,143,20g,2g,6g,10,https://www.allrecipes.com/thmb/_EDaiFt0gIGQOL...,https://www.allrecipes.com/air-fryer-s-mores-r...
7,air fryer baked yams,"{'quantity': '1', 'unit': None, 'name': 'yam'}...",Preheat an air fryer to 400 degrees F (200 deg...,283,62g,3g,3g,1,https://www.allrecipes.com/thmb/156WNgRfzvGn-s...,https://www.allrecipes.com/air-fryer-baked-yam...
8,lemon garlic butter chicken spiedini,"{'quantity': '1/2', 'unit': 'cup', 'name': 'ex...","Whisk together olive oil, wine, 2 tablespoons ...",636,21g,43g,41g,6,https://cdn.jwplayer.com/v2/media/ahbYMLcr/thu...,https://www.allrecipes.com/lemon-garlic-butter...
9,air fryer grilled pimento cheese,"{'quantity': '4', 'unit': 'slices', 'name': 'F...",Preheat the air fryer to 370 degrees F (188 de...,902,108g,29g,40g,2,https://www.allrecipes.com/thmb/cdL3DKZH3beUk5...,https://www.allrecipes.com/air-fryer-grilled-p...


### Remove duplicated recipe name

In [583]:
# Remove duplicate food names, keeping the first occurrence
food_df = food_df.drop_duplicates(subset='name', keep='first')

# Display the cleaned DataFrame
print("DataFrame after removing duplicate food names:")
food_df.describe()

DataFrame after removing duplicate food names:


Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
count,16569,16569,16569,16569,16569,16569,16569,16569,16569,16569
unique,16569,16566,16568,2805,885,729,703,195,16557,16569
top,grilled harissa zucchini on tabbouleh,"{'quantity': '12', 'unit': '', 'name': 'eggs'}",Pour the 1/2 cup of water or milk into a small...,'0',1g,3g,0g,4,https://cdn.jwplayer.com/v2/media/Ug0PzrYB/thu...,https://www.taste.com.au/recipes/grilled-haris...
freq,1,2,2,88,255,1030,540,3703,4,1


### Check top frequency image_url

In [584]:
# Getting the mode
mode_image_url = food_df['image_url'].mode()

# Get the frequency of the most common image_url
mode_count = food_df['image_url'].value_counts().iloc[0]

# Print the full mode and the frequency (mode number)
print("Full mode of image_url column:")
print(mode_image_url.iloc[0])

print("\nMode count (number of occurrences):")
print(mode_count)

Full mode of image_url column:
https://cdn.jwplayer.com/v2/media/Ug0PzrYB/thumbnails/RflzUhLv.jpg

Mode count (number of occurrences):
4


In [585]:
# URL of the image
image_url = "https://cdn.jwplayer.com/v2/media/Ug0PzrYB/thumbnails/RflzUhLv.jpg"

# Display the image
Image(url=image_url)

No need to remove duplicate image

### Splitting into 3 df for to transform each website dataset into a same format

In [586]:
# Ensure there are no leading or trailing spaces in the link column
food_df['link'] = food_df['link'].str.strip()

# Split the DataFrame based on the 'link' column
Allrecipe_df = food_df[food_df['link'].str.contains('https://www.allrecipes.com', na=False)]
BBC_df = food_df[food_df['link'].str.contains('https://www.bbcgoodfood.com', na=False)]
Taste_df = food_df[food_df['link'].str.contains('https://www.taste.com.au', na=False)]

# Reset the index
Allrecipe_df.reset_index(drop=True, inplace=True)
BBC_df.reset_index(drop=True, inplace=True)
Taste_df.reset_index(drop=True, inplace=True)

### Transform ALLrecipe dataset into a same format (We use ALLrecipe_df 's format as the model for ther 2 other website)

In [587]:
Allrecipe_df.head()

Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
0,air fryer lemon garlic parmesan chicken,"{'quantity': '1 1/2', 'unit': 'pounds', 'name'...",Gather all ingredients.. Preheat an air fryer ...,365,8g,46g,17g,4,https://www.allrecipes.com/thmb/5nJvgXENSeFx82...,https://www.allrecipes.com/air-fryer-lemon-gar...
1,air fryer s’mores,"{'quantity': '1', 'unit': 'sleeve', 'name': 'g...",Preheat an air fryer to 380 degrees F (193 deg...,143,20g,2g,6g,10,https://www.allrecipes.com/thmb/_EDaiFt0gIGQOL...,https://www.allrecipes.com/air-fryer-s-mores-r...
2,air fryer baked yams,"{'quantity': '1', 'unit': None, 'name': 'yam'}...",Preheat an air fryer to 400 degrees F (200 deg...,283,62g,3g,3g,1,https://www.allrecipes.com/thmb/156WNgRfzvGn-s...,https://www.allrecipes.com/air-fryer-baked-yam...
3,lemon garlic butter chicken spiedini,"{'quantity': '1/2', 'unit': 'cup', 'name': 'ex...","Whisk together olive oil, wine, 2 tablespoons ...",636,21g,43g,41g,6,https://cdn.jwplayer.com/v2/media/ahbYMLcr/thu...,https://www.allrecipes.com/lemon-garlic-butter...
4,air fryer grilled pimento cheese,"{'quantity': '4', 'unit': 'slices', 'name': 'F...",Preheat the air fryer to 370 degrees F (188 de...,902,108g,29g,40g,2,https://www.allrecipes.com/thmb/cdL3DKZH3beUk5...,https://www.allrecipes.com/air-fryer-grilled-p...


In [588]:
Allrecipe_df.describe()

Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
count,12685,12685,12685,12685,12685,12685,12685,12685,12685,12685
unique,12685,12682,12684,1058,174,105,112,76,12673,12685
top,zucchini plum bread,"{'quantity': '2 ¼', 'unit': 'cups', 'name': 'I...",Pour the 1/2 cup of water or milk into a small...,145,1g,3g,0g,8,https://cdn.jwplayer.com/v2/media/Ug0PzrYB/thu...,https://www.allrecipes.com/recipe/234327/zucch...
freq,1,2,2,38,255,1030,540,2369,4,1


### Transform BBCGoodFood dataset into a same format

In [589]:
#Check what format we're dealing with
BBC_df.head()

Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
0,"('salsa verde baked eggs',)","([{'ingredient': 'olive oil', 'quantity': '5',...",(['Step 1: Drizzle 1 tbsp of the olive oil in ...,'268low','7g','12g','21g',4,https://images.immediate.co.uk/production/vola...,https://www.bbcgoodfood.com/recipes/salsa-verd...
1,"('sausage & fennel orecchiette',)","([{'ingredient': '', 'quantity': '1', 'unit': ...","(['Step 1: Boil a large pan of salted water, t...",'527','62g','21g','19g',4,https://images.immediate.co.uk/production/vola...,https://www.bbcgoodfood.com/recipes/sausage-fe...
2,"('roasted cod with zingy beetroot salad',)","([{'ingredient': 'baby new potatoes', 'quantit...",(['Step 1: Heat the oven to 200C/180C fan/gas ...,'262low','21g','36g','3glow',4,https://images.immediate.co.uk/production/vola...,https://www.bbcgoodfood.com/recipes/roasted-co...
3,"('tomato soup & hummus crispbreads',)","([{'ingredient': 'rapeseed oil', 'quantity': '...",(['Step 1: Heat the oil in a large non-stick p...,'403','51g','15g','12g',4,https://images.immediate.co.uk/production/vola...,https://www.bbcgoodfood.com/recipes/tomato-sou...
4,"('puy lentils with seared salmon',)","([{'ingredient': 'dried puy lentils', 'quantit...",(['Step 1: Put the lentils in a pan of water w...,'519','29g','38g','25g',2,https://images.immediate.co.uk/production/vola...,https://www.bbcgoodfood.com/recipes/puy-lentil...


In [590]:
BBC_df.describe()

Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
count,3065,3065,3065,3065,3065,3065,3065,3065,3065,3065
unique,3065,3065,3065,1003,178,179,151,168,3065,3065
top,"('cucumber & elderflower spritzer',)","([{'ingredient': '', 'quantity': '1', 'unit': ...",(['Step 1: Chop the cucumber into a few chunks...,'0','0g','2g','0g',4,https://images.immediate.co.uk/production/vola...,https://www.bbcgoodfood.com/recipes/cucumber-e...
freq,1,1,1,88,110,195,182,877,1,1


In [591]:
# Function to clean up the columns
def clean_column(value):
    return value.strip("(['),]").replace("'", "").replace("low", "").strip()

# Apply the function to the relevant columns
BBC_df['name'] = BBC_df['name'].apply(clean_column)
BBC_df['ingredients'] = BBC_df['ingredients'].apply(lambda x: x.strip("(['),])"))
BBC_df['steps'] = BBC_df['steps'].apply(lambda x: x.strip("(['),])"))
BBC_df['calories'] = BBC_df['calories'].apply(clean_column)
BBC_df['carbs'] = BBC_df['carbs'].apply(clean_column)
BBC_df['protein'] = BBC_df['protein'].apply(clean_column)
BBC_df['fat'] = BBC_df['fat'].apply(clean_column)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  BBC_df['name'] = BBC_df['name'].apply(clean_column)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  BBC_df['ingredients'] = BBC_df['ingredients'].apply(lambda x: x.strip("(['),])"))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  BBC_df['steps'] = BBC_df['steps'].apply(lambda x: x.strip("(['),])"))
A

In [592]:
#Check result
BBC_df.head()

Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
0,salsa verde baked eggs,"{'ingredient': 'olive oil', 'quantity': '5', '...",Step 1: Drizzle 1 tbsp of the olive oil in afr...,268,7g,12g,21g,4,https://images.immediate.co.uk/production/vola...,https://www.bbcgoodfood.com/recipes/salsa-verd...
1,sausage & fennel orecchiette,"{'ingredient': '', 'quantity': '1', 'unit': 'l...","Step 1: Boil a large pan of salted water, turn...",527,62g,21g,19g,4,https://images.immediate.co.uk/production/vola...,https://www.bbcgoodfood.com/recipes/sausage-fe...
2,roasted cod with zingy beetroot salad,"{'ingredient': 'baby new potatoes', 'quantity'...",Step 1: Heat the oven to 200C/180C fan/gas 6. ...,262,21g,36g,3g,4,https://images.immediate.co.uk/production/vola...,https://www.bbcgoodfood.com/recipes/roasted-co...
3,tomato soup & hummus crispbreads,"{'ingredient': 'rapeseed oil', 'quantity': '2'...",Step 1: Heat the oil in a large non-stick pan ...,403,51g,15g,12g,4,https://images.immediate.co.uk/production/vola...,https://www.bbcgoodfood.com/recipes/tomato-sou...
4,puy lentils with seared salmon,"{'ingredient': 'dried puy lentils', 'quantity'...",Step 1: Put the lentils in a pan of water with...,519,29g,38g,25g,2,https://images.immediate.co.uk/production/vola...,https://www.bbcgoodfood.com/recipes/puy-lentil...


### Transform Taste dataset into a same format

In [593]:
def convert_to_decimal(quantity_str):
    """Convert a string representation of a number (including mixed numbers) to decimal."""
    if not quantity_str:
        return None
    
    # Clean up the string
    quantity_str = quantity_str.strip()
    
    try:
        # Check if it's a mixed number (e.g., "1 1/2")
        mixed_match = re.match(r'(\d+)\s+(\d+)/(\d+)', quantity_str)
        if mixed_match:
            whole = int(mixed_match.group(1))
            num = int(mixed_match.group(2))
            denom = int(mixed_match.group(3))
            return str(whole + (num / denom))
        
        # Check if it's a simple fraction (e.g., "1/2")
        fraction_match = re.match(r'(\d+)/(\d+)', quantity_str)
        if fraction_match:
            num = int(fraction_match.group(1))
            denom = int(fraction_match.group(2))
            return str(num / denom)
        
        # If it's a simple number, convert it directly
        return str(float(quantity_str))
    
    except (ValueError, ZeroDivisionError):
        return None

def parse_ingredient(ingredient):
    # Regex pattern to match quantity, unit, and name
    pattern = r'(?P<quantity>[0-9/.\s]+)?\s*(?P<unit>[a-zA-Z()]+)?\s*(?P<name>.+)'
    match = re.match(pattern, ingredient.strip())
    
    if match:
        # Get the initial groups
        quantity = match.group('quantity')
        unit = match.group('unit')
        name = match.group('name')
        
        # Clean up quantity and convert to decimal
        quantity = convert_to_decimal(quantity) if quantity else None
        
        # Clean up unit
        if unit:
            unit = standardize_unit(unit)
            # If unit is not valid, add it to the name
            if not unit and name:
                name = f"{match.group('unit')} {name}"
        
        # Clean up name:
        # 1. Remove content in parentheses
        name = re.sub(r'\([^)]*\)', '', name)
        
        # 2. Remove descriptive phrases after commas
        name = re.sub(r',.*$', '', name)

        # Remove all special symbols
        name = re.sub(r'[^a-zA-Z0-9\s]', '', name)
        
        # 3. Remove specific descriptors
        descriptors = [
            r'finely chopped',
            r'thinly sliced',
            r'peeled',
            r'seeded',
            r'\b\w+ly\b',  # words ending in 'ly',
            r'\b\w+ed\b',  # words ending in 'ed',
            # words about,
            r'about',
        ]
        for descriptor in descriptors:
            name = re.sub(descriptor, '', name, flags=re.IGNORECASE)
        
        # 4. Final cleanup
        name = name.strip(' ,')

        
        return {
            'quantity': quantity,
            'unit': unit,
            'name': name if name else ''
        }
    return None

In [594]:
# Fix unit to be valid
def standardize_unit(unit):
    """Standardize cooking units and validate them."""
    if not unit:
        return None
        
    # Dictionary of valid cooking units and their standard forms
    valid_units = {
        # Volume
        'tbsp': ['tbsp', 'tablespoon', 'tablespoons'],
        'tsp': ['tsp', 'teaspoon', 'teaspoons'],
        'cup': ['cup', 'cups', 'c'],
        'ml': ['ml', 'milliliter', 'milliliters', 'millilitre', 'millilitres'],
        'l': ['l', 'liter', 'liters', 'litre', 'litres'],
        
        # Weight
        'g': ['g', 'gram', 'grams'],
        'kg': ['kg', 'kilogram', 'kilograms'],
        'oz': ['oz', 'ounce', 'ounces'],
        'lb': ['lb', 'pound', 'pounds'],
        
        # Count
        'bunch': ['bunch', 'bunches'],
        'clove': ['clove', 'cloves'],
        'piece': ['piece', 'pieces']
    }
    
    # Convert to lowercase for comparison
    unit = unit.lower().strip()
    
    # Check if the unit is valid and return standardized form
    for standard, variants in valid_units.items():
        if unit in variants:
            return standard
            
    return None

In [595]:
# Function to extract the calorie value from the string
def extract_calories(energy_str):
    match = re.search(r'(\d+)\s*cal', energy_str)
    if match:
        return match.group(1)  # Extracts the calorie value
    # if energy_str is already a number return the number instead
    elif energy_str.isdigit():
        return energy_str
    return None

In [596]:
Taste_df = food_df[food_df['link'].str.contains('https://www.taste.com.au', na=False)]

In [597]:
# Safely handle the 'steps' column
if 'steps' in Taste_df.columns:
    Taste_df['steps'] = Taste_df['steps'].apply(
        lambda step_list: ', '.join(ast.literal_eval(step_list)) 
        if isinstance(step_list, str) and step_list.startswith('[') else step_list
    )

# Safely handle the 'ingredients' column
if 'ingredients' in Taste_df.columns:
    Taste_df['ingredients'] = Taste_df['ingredients'].apply(
        lambda ing_list: ', '.join(
            map(
                lambda item: str(parse_ingredient(item)) if parse_ingredient(item) else '',
                ast.literal_eval(ing_list)
            )
        ).strip('[]') 
        if isinstance(ing_list, str) and ing_list.startswith('[') else ing_list
    )

# Safely extract the calorie value from the 'calories'
def extract_calories(calorie_value):
    try:
        # Example: "300 kcal" -> "300"
        return int(re.search(r'\d+', calorie_value).group()) if isinstance(calorie_value, str) else None
    except (AttributeError, ValueError):
        return None

if 'calories' in Taste_df.columns:
    Taste_df['calories'] = Taste_df['calories'].apply(extract_calories)

Taste_df['ingredients']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Taste_df['steps'] = Taste_df['steps'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Taste_df['ingredients'] = Taste_df['ingredients'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Taste_df['calories'] = Taste_df['calories'].apply(extract_calories)


23316    {'quantity': '1.0', 'unit': 'cup', 'name': 'br...
23317    {'quantity': '1.0', 'unit': None, 'name': 'egg...
23318    {'quantity': '50.0', 'unit': 'g', 'name': 'Cha...
23319    {'quantity': '1.0', 'unit': 'tbsp', 'name': 'o...
23320    {'quantity': '1.0', 'unit': None, 'name': 'she...
                               ...                        
24157    {'quantity': '4.0', 'unit': None, 'name': 'sco...
24158    {'quantity': '3.0', 'unit': None, 'name': 'egg...
24159    {'quantity': '0.25', 'unit': 'cup', 'name': 'b...
24160    {'quantity': '100.0', 'unit': 'g', 'name': 'bu...
24161    {'quantity': '5.0', 'unit': None, 'name': 'zuc...
Name: ingredients, Length: 819, dtype: object

In [598]:
Taste_df.head()

Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
23316,easiest-ever fried rice,"{'quantity': '1.0', 'unit': 'cup', 'name': 'br...",Boil brown and white rice separately following...,2272,87.4g,26.8g,8.7g,4,https://content.api.news/v3/images/bin/640b903...,https://www.taste.com.au/recipes/unique-fried-...
23317,cheesy french onion meatballs and mushrooms,"{'quantity': '1.0', 'unit': None, 'name': 'egg...","Combine the egg, beef, breadcrumbs and 1&1/2 t...",3138,39.0g,35.9g,49.4g,4,https://img.taste.com.au/gJ9jKwKI/w720-h480-cf...,https://www.taste.com.au/recipes/cheesy-french...
23318,thai chicken lettuce cups with avocado and lime,"{'quantity': '50.0', 'unit': 'g', 'name': 'Cha...",Bring a large saucepan of salted water to a bo...,4775,80.9g,83.5g,56.8g,4,https://img.taste.com.au/fqti94HE/w720-h480-cf...,https://www.taste.com.au/recipes/thai-chicken-...
23319,oven-baked mushroom risotto with pesto,"{'quantity': '1.0', 'unit': 'tbsp', 'name': 'o...",Preheat oven to 150°C. Heat half the oil in a ...,2685,69.8g,15.2g,31.4g,6,https://img.taste.com.au/-botctrO/w720-h480-cf...,https://www.taste.com.au/recipes/oven-baked-mu...
23320,tomato-tofu salad with creamy ginger dressing,"{'quantity': '1.0', 'unit': None, 'name': 'she...",Preheat the oven to 220°C (200°C fan-forced). ...,1367,16.0g,8.1g,27.2g,4,https://img.taste.com.au/AWH3XZcf/w720-h480-cf...,https://www.taste.com.au/recipes/tomato-tofu-s...


### Merge back formated datasets

In [599]:
#Merge back
new_df = pd.concat([Allrecipe_df, BBC_df, Taste_df], ignore_index=True)

new_df.describe()

Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
count,16569,16569,16569,16569,16569,16569,16569,16569,16569,16569
unique,16515,16566,16568,1828,730,614,565,195,16557,16569
top,easy egg fried rice,"{'quantity': '12', 'unit': '', 'name': 'eggs'}",Pour the 1/2 cup of water or milk into a small...,0,3g,3g,0g,4,https://cdn.jwplayer.com/v2/media/Ug0PzrYB/thu...,https://www.taste.com.au/recipes/grilled-haris...
freq,2,2,2,88,330,1202,754,3703,4,1


In [600]:
print(new_df.iloc[0]['ingredients'])

{'quantity': '1 1/2', 'unit': 'pounds', 'name': 'skinless boneless chicken thighs'}, {'quantity': '3', 'unit': 'cloves', 'name': 'garlic, minced'}, {'quantity': '1', 'unit': 'teaspoon', 'name': 'lemon zest'}, {'quantity': '1', 'unit': 'teaspoon', 'name': 'paprika'}, {'quantity': '1', 'unit': 'teaspoon', 'name': 'dried oregano'}, {'quantity': '1/2', 'unit': 'teaspoon', 'name': 'salt'}, {'quantity': '1/4', 'unit': 'teaspoon', 'name': 'crushed red pepper'}, {'quantity': '1/2', 'unit': 'cup', 'name': 'grated parmesan cheese'}, {'quantity': '1/4', 'unit': 'cup', 'name': 'panko bread crumbs'}, {'quantity': None, 'unit': None, 'name': 'cooking spray'}


### Add a new header for standardizing measurements in grams/ ml

In [601]:

# Conversion dictionary
CONVERSION_FACTORS = {
    'pounds': 453.592,
    'pound': 453.592,
    'ounces': 28.3495,
    'ounce': 28.3495,
    'grams': 1,
    'gram': 1,
    'g': 1,
    'ml': 1,
    'milliliters': 1,
    'l': 1000,
    'liters': 1000,
    'kilograms': 1000,
    'kilogram': 1000,
    'kg': 1000,
    'cups': 240,  # Assuming 1 cup = 240 ml
    'cup': 240,
    'tbsp': 15,
    'tablespoons': 15,
    'tablespoon': 15,
    'tsp': 5,
    'teaspoons': 5,
    'teaspoon': 5,
    'cloves': 5,  # Assuming 1 clove of garlic = 5 grams
    'large': 56.8
}

FRACTIONS = {
    "½": "1/2",
    "⅓": "1/3",
    "⅔": "2/3",
    "¼": "1/4",
    "¾": "3/4",
    "⅕": "1/5",
    "⅖": "2/5",
    "⅗": "3/5",
    "⅘": "4/5",
    "⅙": "1/6",
    "⅚": "5/6",
    "⅐": "1/7",
    "⅛": "1/8",
    "⅜": "3/8",
    "⅝": "5/8",
    "⅞": "7/8",
    "11⁄16": "11/16",
    "1⁄16": "1/16",
    "1⁄8": "1/8"
}

GREEN = '\033[32m'
RESET = '\033[0m'

In [602]:
import re

# Helper function to handle fractions
def handle_fraction(quantity_str):
    """Handles fractions like '1/4' or '1/2'."""
    if '/' in quantity_str:
        try:
            # Check if it's a mixed fraction or simple fraction
            if ' ' in quantity_str:
                # For mixed fractions like '1 1/2'
                return handle_mixed_fraction(quantity_str)
            else:
                # Simple fractions like '1/2'
                numerator, denominator = map(float, quantity_str.split('/'))
                return numerator / denominator
        except ValueError:
            return None
    return None

# Helper function to handle mixed fractions like '1 1/2'
def handle_mixed_fraction(quantity_str):
    """Handles mixed fractions like '1 1/2'."""
    if ' ' in quantity_str and '/' in quantity_str:
        parts = quantity_str.split(' ', 1)
        whole = float(parts[0])  # Whole number part
        fraction = handle_fraction(parts[1])  # Fractional part
        if fraction is not None:
            return whole + fraction
        else:
            return None
    return None

# Helper function to handle ranges like '1 1/2 to 2' or '1/2 to 1/4'
def handle_range(quantity_str):
    """Handles ranges like '1 1/2 to 2' or '1/2 to 1/4'."""
    if 'to' in quantity_str:
        parts = quantity_str.split('to')
        lower = convert_to_number(parts[0].strip())
        upper = convert_to_number(parts[1].strip())
        
        if lower is not None and upper is not None:
            return (lower + upper) / 2  # Return the average of the two numbers
        else:
            return None
    return None

# Helper function to handle parentheses multiplication like '1 (12)'
def handle_parentheses_multiplication(quantity_str):
    """Handles cases like '1 (12)' and multiplies the numbers."""
    if '(' in quantity_str and not ')' in quantity_str:
        parts = quantity_str.split('(')
        multiplier = float(parts[0].strip())
        factor = float(parts[1].strip())
        return multiplier * factor
    return None

# Helper function to remove unit from the quantity string
def remove_unit(quantity_str):
    """Removes known units from the quantity string."""
    units = ['cup', 'tbsp', 'tsp', 'g', 'ml', 'ounce', 'gram', 'pound', 'cups', 'large', 'small']
    for unit in units:
        if unit in quantity_str.lower():
            quantity_str = quantity_str.replace(unit, '').strip()
            break
    return quantity_str

# Main function to convert the quantity
def convert_to_number(quantity_str, name=""):
    try:
        # Remove units like 'cup', 'tbsp', etc.
        quantity_str = remove_unit(quantity_str)

        # Handle open parentheses like '1 (12)'
        result = handle_parentheses_multiplication(quantity_str)
        if result is not None:
            return result

        # Normalize fraction symbols like '½' to '1/2'
        # This assumes you have a FRACTIONS dictionary that maps symbols like '½' to '1/2'
        for symbol, fraction in FRACTIONS.items():
            quantity_str = quantity_str.replace(symbol, fraction)

        # Remove unnecessary characters like "( ... )"
        quantity_str = re.sub(r'[\(\)-]', '', quantity_str)
        quantity_str = re.sub(r'\s+', ' ', quantity_str).strip()  # Replace multiple spaces with one

        # Handle fractions like '1/4' and mixed fractions like '1 1/2'
        result = handle_fraction(quantity_str)
        if result is not None:
            return result

        # Handle ranges like '1 1/2 to 2' or '1/2 to 1/4'
        result = handle_range(quantity_str)
        if result is not None:
            return result

        # Check for empty or descriptive strings
        if not quantity_str or 'for serving' in quantity_str.lower() or 'to taste' in quantity_str.lower():
            return None

        # Try to convert the remaining value to a float (simple numeric value)
        try:
            return float(quantity_str)
        except ValueError:
            return None

    except Exception as e:
        print(f"Error converting quantity: {quantity_str} - {e} for {GREEN}{name}{RESET}")
        return None


In [603]:
quantity_error_string = {
    "1 1/2",
    "2 (8",
    "1/4 cup",
    "1/2 to 1/4",
    "1 (12",
    "2 (8",
    "1⁄16",
    "1 1/2 to 2",
    "1/4 to 1/2",
    "1 to 2",
    "1/2 to 2",
    "1 1/2 to 3 3/4"
}

count = 0
print(f"{'Original Quantity':<20} {'Converted Quantity':<20}")
print("-" * 40)  # Adding a separator line

for string in quantity_error_string:
    count += 1
    ori = string
    # Convert the numeric part of the quantity
    string = convert_to_number(string, f"test{count}")
    
    # Format and print the output in a neatly aligned way
    print(f"{ori:<20} {string if string is not None else 'None':<20}")



Original Quantity    Converted Quantity  
----------------------------------------
1 (12                12.0                
2 (8                 16.0                
1/4 cup              0.25                
1/2 to 1/4           0.375               
1/4 to 1/2           0.375               
1 1/2                1.5                 
1 1/2 to 3 3/4       2.625               
1⁄16                 0.0625              
1 1/2 to 2           1.75                
1/2 to 2             1.25                
1 to 2               1.5                 


In [604]:
# Function to process an individual ingredient
def process_ingredient(ingredient):
    try:
        quantity = ingredient.get("quantity")
        unit = ingredient.get("unit")
        name = ingredient.get("name")
        
        # 
        if isinstance(quantity, str):
            quantity = convert_to_number(quantity, name)
            
        # If quantity is None or invalid, return ingredient as is
        if quantity is None:
            return {
                "quantity": ingredient.get("quantity"),
                "unit": unit,
                "name": name,
                "quantity_in_grams": 0
            }
        
        # Convert to grams or ml using the conversion factor
        if unit in CONVERSION_FACTORS and quantity is not None:
            conversion_factor = CONVERSION_FACTORS[unit]
            quantity_in_grams_or_ml = round(quantity * conversion_factor, 2)
        else:
            # If no unit or quantity, return as is
            quantity_in_grams_or_ml = None

        # Return updated ingredient
        return {
            "quantity": ingredient.get("quantity"),
            "unit": unit,
            "name": name,
            "quantity_in_grams": quantity_in_grams_or_ml
        }
    except Exception as e:
        return [] 


# Function to process a list of ingredients
def process_ingredients(ingredients_str):
    try:
        ingredients_list = ast.literal_eval(ingredients_str)  # Safely evaluate the list
        processed_ingredients = [process_ingredient(ing) for ing in ingredients_list]
        return processed_ingredients
    except Exception as e:
        print(f"Error processing ingredients: {ingredients_str}, Error: {e}")
        return ingredients_str

In [605]:

# Apply the conversion function to the DataFrame
new_df["ingredients"] = new_df["ingredients"].apply(process_ingredients)

# Display updated ingredients
print("Updated Ingredients:")
ingredients_full = new_df.loc[0, 'ingredients']  # Replace 0 with the desired row index

# Print the full content of the 'ingredients' column
print(ingredients_full)


Error converting quantity: 2 (10- - could not convert string to float: '10-' for [32mspinach, thawed and squeezed dry[0m
Error converting quantity: 4 (4- - could not convert string to float: '4-' for [32mtrout fillets, thawed if frozen, skinned if desired[0m
Error converting quantity: 1 (8- - could not convert string to float: '8-' for [32mcremini mushrooms, halved or quartered if large[0m
Error converting quantity: 1 (16- - could not convert string to float: '16-' for [32mdill pickle chips, drained[0m
Error converting quantity: 1 lare (8 - could not convert string to float: '1 lare' for [32msweet potato[0m
Error converting quantity: 1 ( - could not convert string to float: '' for [32martichoke hearts in water[0m
Error converting quantity: 1 ( - could not convert string to float: '' for [32msourdough loaf or crusty Italian bread, unsliced[0m
Error converting quantity: 3 (8- - could not convert string to float: '8-' for [32mcream cheese, at room temperature[0m
Error conv

In [618]:
def print_recipes_with_none_quantity(df):
    count = 0  # Initialize counter for recipes with None quantity_in_grams
    
    # Iterate through the DataFrame
    for idx, row in df.iterrows():
        # Extract the ingredients list for the current row
        ingredients = row['ingredients']
        
        # Check if any ingredient is a dictionary and has quantity_in_grams as None
        invalid_ingredients = [ingredient for ingredient in ingredients if isinstance(ingredient, dict) and ingredient.get('quantity_in_grams') is None]
        
        # If there are invalid ingredients, print the recipe name and those ingredients
        if invalid_ingredients:
            count += 1  # Increment the counter
            print(f"Recipe: {row['name']}")
            print("Ingredients with None quantity_in_grams:")
            for ingredient in invalid_ingredients:
                print(f"  - {ingredient['name']} | Quantity: {ingredient['quantity']} {ingredient.get('unit', '')} | Quantity in grams: {ingredient['quantity_in_grams']}")
            print("-" * 50)  # Separator for readability
    
    # Print the total count of recipes with None quantity_in_grams
    print(f"Total number of recipes with ingredients missing quantity_in_grams: {count}")


In [619]:
# Print recipes with ingredients having quantity_in_grams as None
print_recipes_with_none_quantity(new_df)


Recipe: air fryer s’mores
Ingredients with None quantity_in_grams:
  - graham crackers | Quantity: 1 sleeve | Quantity in grams: None
  - chocolate candy bars | Quantity: 5 (1.5 ounce) | Quantity in grams: None
  - s’more-sized marshmallows | Quantity: 10 None | Quantity in grams: None
--------------------------------------------------
Recipe: air fryer baked yams
Ingredients with None quantity_in_grams:
  - yam | Quantity: 1 None | Quantity in grams: None
--------------------------------------------------
Recipe: air fryer grilled pimento cheese
Ingredients with None quantity_in_grams:
  - French bread, | Quantity: 4 slices | Quantity in grams: None
  - tomato | Quantity: 1 None | Quantity in grams: None
--------------------------------------------------
Recipe: air fryer chicken parmesan
Ingredients with None quantity_in_grams:
  - boneless skinless chicken breasts, patted dry | Quantity: 2 2 (8-ounce) | Quantity in grams: None
--------------------------------------------------
Recip

### Like One hot Encoding

### Export to cleaned.csv

In [607]:
# # # Save the DataFrame to a CSV file
# new_df.to_csv('cleaned.csv', index=False)

# # # Confirm the saving
# print("DataFrame has been saved to 'cleaned.csv'")

DataFrame has been saved to 'cleaned.csv'


## Merge from the check.ipynb

In [None]:
ingredients_df = new_df[['ingredients', 'link']]
ingredients_df.head()

In [621]:
print(ingredients_df.iloc[0]['ingredients'])

[{'quantity': '1 1/2', 'unit': 'pounds', 'name': 'skinless boneless chicken thighs', 'quantity_in_grams': 680.39}, {'quantity': '3', 'unit': 'cloves', 'name': 'garlic, minced', 'quantity_in_grams': 15.0}, {'quantity': '1', 'unit': 'teaspoon', 'name': 'lemon zest', 'quantity_in_grams': 5.0}, {'quantity': '1', 'unit': 'teaspoon', 'name': 'paprika', 'quantity_in_grams': 5.0}, {'quantity': '1', 'unit': 'teaspoon', 'name': 'dried oregano', 'quantity_in_grams': 5.0}, {'quantity': '1/2', 'unit': 'teaspoon', 'name': 'salt', 'quantity_in_grams': 2.5}, {'quantity': '1/4', 'unit': 'teaspoon', 'name': 'crushed red pepper', 'quantity_in_grams': 1.25}, {'quantity': '1/2', 'unit': 'cup', 'name': 'grated parmesan cheese', 'quantity_in_grams': 120.0}, {'quantity': '1/4', 'unit': 'cup', 'name': 'panko bread crumbs', 'quantity_in_grams': 60.0}, {'quantity': None, 'unit': None, 'name': 'cooking spray', 'quantity_in_grams': 0}]


In [625]:
# Function to convert ingredients string to list of dictionaries
def parse_ingredients(ingredients_str):
    return ast.literal_eval(ingredients_str)

# Expanding ingredients into individual rows
expanded_rows = []
for index, row in ingredients_df.iterrows():
    ingredients = parse_ingredients(row['ingredients'])
    print(f"Ingredients at index {index}: {ingredients}")  # Debug print to check the structure
    for ingredient in ingredients:
        # Debug print the type of 'ingredient' to check if it's a list or dict
        # print(f"Type of ingredient: {type(ingredient)}")
        # print(f"Ingredient: {ingredient}")  # Debug print to check the structure of each ingredient
        if isinstance(ingredient, dict):  # Ensure the ingredient is a dictionary
            expanded_rows.append({
                'name': ingredient.get('name', ''),
                'quantity': ingredient.get('quantity', ''),
                'unit': ingredient.get('unit', ''),
                'quantity_in_grams': ingredients.get('quantity_in_grams',''),
                'link': row['link']
            })
        # else:
        #     print(f"Skipping invalid ingredient structure: {ingredient}")
# Creating a new DataFrame
all_ing_df = pd.DataFrame(expanded_rows)

indices = [10, 11, 33, 39, 60]
# Iterate through each row of filtered_df and print the entry
for index in indices:
    row = all_ing_df.iloc[index]
    print(f"Index: {index}")
    print(f"Name: {row['name']}")
    print(f"Quantity: {row['quantity']}")
    print(f"Unit: {row['unit']}")
    print(f"Link: {row['link']}")
    print('-' * 40)  # Add a separator for readability


print(f"Total number of recipes: {len(all_ing_df)}")

ValueError: malformed node or string: [{'quantity': '1 1/2', 'unit': 'pounds', 'name': 'skinless boneless chicken thighs', 'quantity_in_grams': 680.39}, {'quantity': '3', 'unit': 'cloves', 'name': 'garlic, minced', 'quantity_in_grams': 15.0}, {'quantity': '1', 'unit': 'teaspoon', 'name': 'lemon zest', 'quantity_in_grams': 5.0}, {'quantity': '1', 'unit': 'teaspoon', 'name': 'paprika', 'quantity_in_grams': 5.0}, {'quantity': '1', 'unit': 'teaspoon', 'name': 'dried oregano', 'quantity_in_grams': 5.0}, {'quantity': '1/2', 'unit': 'teaspoon', 'name': 'salt', 'quantity_in_grams': 2.5}, {'quantity': '1/4', 'unit': 'teaspoon', 'name': 'crushed red pepper', 'quantity_in_grams': 1.25}, {'quantity': '1/2', 'unit': 'cup', 'name': 'grated parmesan cheese', 'quantity_in_grams': 120.0}, {'quantity': '1/4', 'unit': 'cup', 'name': 'panko bread crumbs', 'quantity_in_grams': 60.0}, {'quantity': None, 'unit': None, 'name': 'cooking spray', 'quantity_in_grams': 0}]