## Preprocessing Merged Dataset

### Import Dependencies

In [1421]:
import pandas as pd
from IPython.display import Image
import re
import ast
import fractions
from fractions import Fraction

### Import dataset

In [1422]:
# Load Dataset Allrecipes
food_df = pd.read_csv('Food_Dataset.csv', index_col=None)
food_df.head()

Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
0,15 Best Air Fryer Thanksgiving Recipes,,,,,,,,https://www.allrecipes.com/thmb/zNe_lQRZgjj1rS...,https://www.allrecipes.com/gallery/best-air-fr...
1,Air Fryer Turkey Breast,"{'quantity': '1', 'unit': 'tablespoon', 'name'...",,263.0,0g,40g,10g,6.0,https://www.allrecipes.com/thmb/PaF8nNOY0bLCvo...,https://www.allrecipes.com/recipe/275372/air-f...
2,16 Quick-and-Easy Side Dish Recipes for the Ai...,,,,,,,,https://www.allrecipes.com/thmb/91y3R4leqrUtBV...,https://www.allrecipes.com/gallery/air-fryer-s...
3,Best Holiday Party Appetizers to Make in the A...,,,,,,,,https://cdn.jwplayer.com/v2/media/rggkwMPu/pos...,https://www.allrecipes.com/article/best-holida...
4,Air Fryer Lemon Garlic Parmesan Chicken,"{'quantity': '1 1/2', 'unit': 'pounds', 'name'...",Gather all ingredients.. Preheat an air fryer ...,365.0,8g,46g,17g,4.0,https://www.allrecipes.com/thmb/5nJvgXENSeFx82...,https://www.allrecipes.com/air-fryer-lemon-gar...


In [1423]:
food_df.describe()

Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
count,24191,23120,20624,22870,22847,22803,22648,22863,24047,24191
unique,19311,18356,17071,2874,902,750,721,201,19120,19346
top,Grilled Asparagus,"{'quantity': '1', 'unit': 'pound', 'name': 'fr...","Gather the ingredients.. Place whitefish, sour...",'0',1g,3g,0g,4,https://cdn.jwplayer.com/v2/media/Ug0PzrYB/thu...,https://www.allrecipes.com/recipe/17445/grille...
freq,9,9,8,88,415,1490,813,4792,10,9


### Remove NaN

In [1424]:
# Count of NaN values in each column
nan_count = food_df.isna().sum()

# Print the count of NaN values
print("Count of NaN values in each column:")
print(nan_count)

Count of NaN values in each column:
name              0
ingredients    1071
steps          3567
calories       1321
carbs          1344
protein        1388
fat            1543
servings       1328
image_url       144
link              0
dtype: int64


In [1425]:
# Drop rows with any NaN values
food_df = food_df.dropna()

# Display the cleaned DataFrame
print("DataFrame after dropping rows with NaN values:")
food_df.describe()


DataFrame after dropping rows with NaN values:


Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
count,20089,20089,20089,20089,20089,20089,20089,20089,20089,20089
unique,16615,16628,16633,2825,893,740,708,196,16621,16634
top,Smoked Fish Dip,"{'quantity': '2', 'unit': 'cups', 'name': 'fla...","Gather the ingredients.. Place whitefish, sour...",'0',1g,3g,0g,4,https://cdn.jwplayer.com/v2/media/Ug0PzrYB/thu...,https://www.allrecipes.com/recipe/45291/smoked...
freq,8,8,8,88,336,1295,719,4353,10,8


### Remove duplicate

In [1426]:
# Print the sum of duplicate rows
duplicate_count = food_df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

Number of duplicate rows: 3455


In [1427]:
# Remove the duplicate rows
food_df= food_df.drop_duplicates()

# Display the cleaned DataFrame
print("\nDataFrame after removing duplicate rows:")
food_df.describe()


DataFrame after removing duplicate rows:


Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
count,16634,16634,16634,16634,16634,16634,16634,16634,16634,16634
unique,16615,16628,16633,2825,893,740,708,196,16621,16634
top,Sriracha Deviled Eggs,[],Pour the 1/2 cup of water or milk into a small...,'0',1g,3g,0g,4,https://cdn.jwplayer.com/v2/media/Ug0PzrYB/thu...,https://www.allrecipes.com/air-fryer-lemon-gar...
freq,2,4,2,88,256,1030,540,3710,4,1


### Clean '[]' ingridients

In [1428]:
# Remove rows where 'ingredients' is '[]'
food_df = food_df[food_df['ingredients'] != '[]']

# Display the cleaned DataFrame
print("DataFrame after removing rows with '[]' in the 'ingredients' column:")
food_df.describe()

DataFrame after removing rows with '[]' in the 'ingredients' column:


Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
count,16630,16630,16630,16630,16630,16630,16630,16630,16630,16630
unique,16611,16627,16629,2824,892,739,706,196,16617,16630
top,Pecan Shortbread Cookies,"{'quantity': '2 ¼', 'unit': 'cups', 'name': 'I...",Pour the 1/2 cup of water or milk into a small...,'0',1g,3g,0g,4,https://cdn.jwplayer.com/v2/media/Ug0PzrYB/thu...,https://www.allrecipes.com/air-fryer-lemon-gar...
freq,2,2,2,88,256,1030,540,3709,4,1


### Lowercase all name

In [1429]:
# Convert all food names to lowercase
food_df['name'] = food_df['name'].str.lower()

# Display the updated DataFrame
print("DataFrame after converting food names to lowercase:")
food_df.head()

DataFrame after converting food names to lowercase:


Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
4,air fryer lemon garlic parmesan chicken,"{'quantity': '1 1/2', 'unit': 'pounds', 'name'...",Gather all ingredients.. Preheat an air fryer ...,365,8g,46g,17g,4,https://www.allrecipes.com/thmb/5nJvgXENSeFx82...,https://www.allrecipes.com/air-fryer-lemon-gar...
6,air fryer s’mores,"{'quantity': '1', 'unit': 'sleeve', 'name': 'g...",Preheat an air fryer to 380 degrees F (193 deg...,143,20g,2g,6g,10,https://www.allrecipes.com/thmb/_EDaiFt0gIGQOL...,https://www.allrecipes.com/air-fryer-s-mores-r...
7,air fryer baked yams,"{'quantity': '1', 'unit': None, 'name': 'yam'}...",Preheat an air fryer to 400 degrees F (200 deg...,283,62g,3g,3g,1,https://www.allrecipes.com/thmb/156WNgRfzvGn-s...,https://www.allrecipes.com/air-fryer-baked-yam...
8,lemon garlic butter chicken spiedini,"{'quantity': '1/2', 'unit': 'cup', 'name': 'ex...","Whisk together olive oil, wine, 2 tablespoons ...",636,21g,43g,41g,6,https://cdn.jwplayer.com/v2/media/ahbYMLcr/thu...,https://www.allrecipes.com/lemon-garlic-butter...
9,air fryer grilled pimento cheese,"{'quantity': '4', 'unit': 'slices', 'name': 'F...",Preheat the air fryer to 370 degrees F (188 de...,902,108g,29g,40g,2,https://www.allrecipes.com/thmb/cdL3DKZH3beUk5...,https://www.allrecipes.com/air-fryer-grilled-p...


### Remove duplicated recipe name

In [1430]:
# Remove duplicate food names, keeping the first occurrence
food_df = food_df.drop_duplicates(subset='name', keep='first')

# Display the cleaned DataFrame
print("DataFrame after removing duplicate food names:")
food_df.describe()

DataFrame after removing duplicate food names:


Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
count,16598,16598,16598,16598,16598,16598,16598,16598,16598,16598
unique,16598,16595,16597,2809,885,733,703,195,16586,16598
top,air fryer lemon garlic parmesan chicken,"{'quantity': '12', 'unit': '', 'name': 'eggs'}",Pour the 1/2 cup of water or milk into a small...,'0',1g,3g,0g,4,https://cdn.jwplayer.com/v2/media/Ug0PzrYB/thu...,https://www.allrecipes.com/air-fryer-lemon-gar...
freq,1,2,2,88,255,1030,540,3701,4,1


### Check top frequency image_url

In [1431]:
# Getting the mode
mode_image_url = food_df['image_url'].mode()

# Get the frequency of the most common image_url
mode_count = food_df['image_url'].value_counts().iloc[0]

# Print the full mode and the frequency (mode number)
print("Full mode of image_url column:")
print(mode_image_url.iloc[0])

print("\nMode count (number of occurrences):")
print(mode_count)

Full mode of image_url column:
https://cdn.jwplayer.com/v2/media/Ug0PzrYB/thumbnails/RflzUhLv.jpg

Mode count (number of occurrences):
4


In [1432]:
# URL of the image
image_url = "https://cdn.jwplayer.com/v2/media/Ug0PzrYB/thumbnails/RflzUhLv.jpg"

# Display the image
Image(url=image_url)

No need to remove duplicate image

### Splitting into 3 df for to transform each website dataset into a same format

In [1433]:
# Ensure there are no leading or trailing spaces in the link column
food_df['link'] = food_df['link'].str.strip()

# Split the DataFrame based on the 'link' column
Allrecipe_df = food_df[food_df['link'].str.contains('https://www.allrecipes.com', na=False)]
BBC_df = food_df[food_df['link'].str.contains('https://www.bbcgoodfood.com', na=False)]
Taste_df = food_df[food_df['link'].str.contains('https://www.taste.com.au', na=False)]

# Reset the index
Allrecipe_df.reset_index(drop=True, inplace=True)
BBC_df.reset_index(drop=True, inplace=True)
Taste_df.reset_index(drop=True, inplace=True)

### Transform ALLrecipe dataset into a same format (We use ALLrecipe_df 's format as the model for ther 2 other website)

In [1434]:
Allrecipe_df.head()

Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
0,air fryer lemon garlic parmesan chicken,"{'quantity': '1 1/2', 'unit': 'pounds', 'name'...",Gather all ingredients.. Preheat an air fryer ...,365,8g,46g,17g,4,https://www.allrecipes.com/thmb/5nJvgXENSeFx82...,https://www.allrecipes.com/air-fryer-lemon-gar...
1,air fryer s’mores,"{'quantity': '1', 'unit': 'sleeve', 'name': 'g...",Preheat an air fryer to 380 degrees F (193 deg...,143,20g,2g,6g,10,https://www.allrecipes.com/thmb/_EDaiFt0gIGQOL...,https://www.allrecipes.com/air-fryer-s-mores-r...
2,air fryer baked yams,"{'quantity': '1', 'unit': None, 'name': 'yam'}...",Preheat an air fryer to 400 degrees F (200 deg...,283,62g,3g,3g,1,https://www.allrecipes.com/thmb/156WNgRfzvGn-s...,https://www.allrecipes.com/air-fryer-baked-yam...
3,lemon garlic butter chicken spiedini,"{'quantity': '1/2', 'unit': 'cup', 'name': 'ex...","Whisk together olive oil, wine, 2 tablespoons ...",636,21g,43g,41g,6,https://cdn.jwplayer.com/v2/media/ahbYMLcr/thu...,https://www.allrecipes.com/lemon-garlic-butter...
4,air fryer grilled pimento cheese,"{'quantity': '4', 'unit': 'slices', 'name': 'F...",Preheat the air fryer to 370 degrees F (188 de...,902,108g,29g,40g,2,https://www.allrecipes.com/thmb/cdL3DKZH3beUk5...,https://www.allrecipes.com/air-fryer-grilled-p...


In [1435]:
Allrecipe_df.describe()

Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
count,12685,12685,12685,12685,12685,12685,12685,12685,12685,12685
unique,12685,12682,12684,1058,174,105,112,76,12673,12685
top,air fryer lemon garlic parmesan chicken,"{'quantity': '2 ¼', 'unit': 'cups', 'name': 'I...",Pour the 1/2 cup of water or milk into a small...,215,1g,3g,0g,8,https://cdn.jwplayer.com/v2/media/Ug0PzrYB/thu...,https://www.allrecipes.com/air-fryer-lemon-gar...
freq,1,2,2,38,255,1030,540,2369,4,1


In [1436]:
# Check Missing Ingredients
def has_missing_attributes(ingredient_str):
    # Safely convert string representation of dict to actual dict
    ingredient = ast.literal_eval(ingredient_str)
        
        # Ensure it's a dictionary
    if isinstance(ingredient, dict):
        return any(ingredient.get(key) in [None, ''] for key in ['quantity'])
    else:
        return True

# Filter rows where ingredients have missing attributes
rows_with_missing = Allrecipe_df[Allrecipe_df['ingredients'].apply(has_missing_attributes)]

# Display rows with missing values
print("Rows with missing attributes in ingredients:")
rows_with_missing[['name', 'ingredients']]

Rows with missing attributes in ingredients:


Unnamed: 0,name,ingredients
0,air fryer lemon garlic parmesan chicken,"{'quantity': '1 1/2', 'unit': 'pounds', 'name'..."
1,air fryer s’mores,"{'quantity': '1', 'unit': 'sleeve', 'name': 'g..."
2,air fryer baked yams,"{'quantity': '1', 'unit': None, 'name': 'yam'}..."
3,lemon garlic butter chicken spiedini,"{'quantity': '1/2', 'unit': 'cup', 'name': 'ex..."
4,air fryer grilled pimento cheese,"{'quantity': '4', 'unit': 'slices', 'name': 'F..."
...,...,...
12680,"zucchini bread, pumpkin style","{'quantity': '3', 'unit': 'medium', 'name': 'z..."
12681,gluten-free zucchini bread (or muffins),"{'quantity': '3', 'unit': 'cups', 'name': 'glu..."
12682,savory zucchini muffins,"{'quantity': '1', 'unit': 'tablespoon', 'name'..."
12683,andy's jalapeno zucchini bread,"{'quantity': '3', 'unit': 'cups', 'name': 'all..."


In [1437]:
ingredients_full = Allrecipe_df.loc[0, 'ingredients']  # Replace 0 with the desired row index

# Print the full content of the 'ingredients' column
print(ingredients_full)

{'quantity': '1 1/2', 'unit': 'pounds', 'name': 'skinless boneless chicken thighs'}, {'quantity': '3', 'unit': 'cloves', 'name': 'garlic, minced'}, {'quantity': '1', 'unit': 'teaspoon', 'name': 'lemon zest'}, {'quantity': '1', 'unit': 'teaspoon', 'name': 'paprika'}, {'quantity': '1', 'unit': 'teaspoon', 'name': 'dried oregano'}, {'quantity': '1/2', 'unit': 'teaspoon', 'name': 'salt'}, {'quantity': '1/4', 'unit': 'teaspoon', 'name': 'crushed red pepper'}, {'quantity': '1/2', 'unit': 'cup', 'name': 'grated parmesan cheese'}, {'quantity': '1/4', 'unit': 'cup', 'name': 'panko bread crumbs'}, {'quantity': None, 'unit': None, 'name': 'cooking spray'}


In [1438]:
import ast

# Conversion dictionary
CONVERSION_FACTORS = {
    'pounds': 453.592,
    'ounces': 28.3495,
    'grams': 1,
    'gram': 1,
    'g': 1,
    'ml': 1,
    'l': 1,
    'kilograms': 1000,
    'cups': 240,  # Assuming 1 cup = 240 ml
    'cup': 240,
    'tablespoons': 15,
    'tablespoon': 15,
    'teaspoons': 5,
    'teaspoon': 5,
    'cloves': 5,  # Assuming 1 clove of garlic = 5 grams
    'milliliters': 1,
    'liters': 1000,
    'large': 56.8
}

FRACTIONS = {
    '¼': '1/4',
    '½': '1/2',
    '⅓': '1/3',
    '¾': '3/4',
    '⅔': '2/3',
    '⅕': '1/5',
    '⅖': '2/5',
    '⅗': '3/5',
    '⅘': '4/5',
    '⅛': '1/8',
    '⅜': '3/8',
    '⅝': '5/8',
    '⅞': '7/8'
}

# Helper function to handle fractional quantities, including mixed fractions
import re

# Helper function to handle fractional quantities, including mixed fractions
def convert_to_number(quantity_str):
    try:
        quantity_str = re.sub(r'[\(\)-]', '', quantity_str)
        # Remove the unit (e.g., 'cup', 'tbsp', etc.)
        quantity_str = re.sub(r'\b(ppings of choice|cup|tsp|g|ml|ounce|gram|pound|cups|large|small|stick|tablespoon|tablespoons|Jamaican|lb|fluid|cans|whole|oven-ready)\b', '', quantity_str, flags=re.IGNORECASE).strip()
        
        match = re.match(r"^\s*(\d+(\.\d+)?)", quantity_str)
        if match:
            return float(match.group(1))
        
        # Check for empty or descriptive strings (like 'for serving' or 'to taste')
        if not quantity_str or 'for serving' in quantity_str.lower() or 'to taste' in quantity_str.lower():
            return None  # Return None to indicate this is not a valid quantity
        
        # Check if the quantity is zest
        if 'zest' in quantity_str.lower():
            return 5
        
        if 'chopped peanuts' in quantity_str.lower():
            return 132
        
        if 'baking spray' in quantity_str.lower():
            return None
        
        # Handle "plus" scenarios like "1/4 cup plus 1"
        if 'plus' in quantity_str:
            parts = quantity_str.split('plus')
            quantity_1 = convert_to_number(parts[0].strip())
            quantity_2 = convert_to_number(parts[1].strip())
            if quantity_1 is not None and quantity_2 is not None:
                return quantity_1 + quantity_2
            else:
                return None
        
        # Replace fractional symbols with standard fractions
        for symbol, fraction in FRACTIONS.items():
            if symbol in quantity_str:
                quantity_str = quantity_str.replace(symbol, fraction)
                
        # Handle ranges like "1 1/2 to 2" by taking the first value or averaging
        if 'to' in quantity_str:
            parts = quantity_str.split('to')
            lower = convert_to_number(parts[0].strip())
            upper = convert_to_number(parts[1].strip())
            if lower is not None and upper is not None:
                return (lower + upper) / 2  # or return lower, depending on your needs
            else:
                return None
        
        # Handle mixed fractions like "1 1/2"
        if ' ' in quantity_str:
            whole, fraction = quantity_str.split(' ')
            quantity = float(whole) + convert_to_number(fraction)
            return quantity
        
        # Handle standard fractions like "1/2"
        if '/' in quantity_str:
            numerator, denominator = quantity_str.split('/')
            quantity = float(numerator) / float(denominator)
            return quantity
        

        # Attempt to convert to a float
        try:
            quantity = float(quantity_str)
            return quantity
        except ValueError:
            return None  # Return None if the value cannot be converted
    except Exception as e:
        print(f"Error converting quantity: {quantity_str}")
        return None  # Return None in case of any other error

# Function to process an individual ingredient
def process_ingredient(ingredient):
    try:
        quantity = ingredient.get("quantity")
        unit = ingredient.get("unit")
        name = ingredient.get("name")
        
        # Convert quantity using the helper function
        if isinstance(quantity, str):
            quantity = convert_to_number(quantity)
        
        # If quantity is None or invalid, return ingredient as is
        if quantity is None:
            return ingredient
        
        # Convert to grams or ml using the conversion factor
        if unit in CONVERSION_FACTORS and quantity is not None:
            conversion_factor = CONVERSION_FACTORS[unit]
            quantity_in_grams_or_ml = round(quantity * conversion_factor, 2)
            # Normalize unit to 'grams' or 'ml'
            normalized_unit = "ml" if unit in ["tbsp", "tsp", "cups", "ml"] else "grams"
        else:
            # If no unit or quantity, return as is
            quantity_in_grams_or_ml = quantity
            normalized_unit = unit

        # Return updated ingredient
        return {
            "quantity": quantity_in_grams_or_ml,
            "unit": normalized_unit,
            "name": name
        }
    except Exception as e:
        return [] 


# Function to process a list of ingredients
def process_ingredients(ingredients_str):
    try:
        ingredients_list = ast.literal_eval(ingredients_str)  # Safely evaluate the list
        processed_ingredients = [process_ingredient(ing) for ing in ingredients_list]
        return processed_ingredients
    except Exception as e:
        print(f"Error processing ingredients: {ingredients_str}, Error: {e}")
        return ingredients_str

# Apply the conversion function to the DataFrame
Allrecipe_df["ingredients"] = Allrecipe_df["ingredients"].apply(process_ingredients)

# Display updated ingredients
print("Updated Ingredients:")
ingredients_full = Allrecipe_df.loc[0, 'ingredients']  # Replace 0 with the desired row index

# Print the full content of the 'ingredients' column
print(ingredients_full)


Updated Ingredients:
[{'quantity': 453.59, 'unit': 'grams', 'name': 'skinless boneless chicken thighs'}, {'quantity': 15.0, 'unit': 'grams', 'name': 'garlic, minced'}, {'quantity': 5.0, 'unit': 'grams', 'name': 'lemon zest'}, {'quantity': 5.0, 'unit': 'grams', 'name': 'paprika'}, {'quantity': 5.0, 'unit': 'grams', 'name': 'dried oregano'}, {'quantity': 5.0, 'unit': 'grams', 'name': 'salt'}, {'quantity': 5.0, 'unit': 'grams', 'name': 'crushed red pepper'}, {'quantity': 240.0, 'unit': 'grams', 'name': 'grated parmesan cheese'}, {'quantity': 240.0, 'unit': 'grams', 'name': 'panko bread crumbs'}, {'quantity': None, 'unit': None, 'name': 'cooking spray'}]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Allrecipe_df["ingredients"] = Allrecipe_df["ingredients"].apply(process_ingredients)


In [1439]:
# Display updated ingredients
print("Updated Ingredients:")
ingredients_full = Allrecipe_df.loc[3, 'ingredients']  # Replace 0 with the desired row index

# Print the full content of the 'ingredients' column
print(ingredients_full)


Updated Ingredients:
[{'quantity': 240.0, 'unit': 'grams', 'name': 'extra-virgin olive oil'}, {'quantity': 240.0, 'unit': 'grams', 'name': 'white wine, such as Pinot Grigio'}, {'quantity': 45.0, 'unit': 'grams', 'name': 'lemon juice'}, {'quantity': 5.0, 'unit': 'grams', 'name': 'lemon zest'}, {'quantity': 20.0, 'unit': 'grams', 'name': 'garlic'}, {'quantity': 5.0, 'unit': 'grams', 'name': 'crushed red pepper flakes'}, {'quantity': 5.0, 'unit': 'grams', 'name': 'oregano'}, {'quantity': 10.0, 'unit': 'grams', 'name': 'kosher salt'}, {'quantity': 5.0, 'unit': 'grams', 'name': 'pepper'}, {'quantity': 907.18, 'unit': 'grams', 'name': 'skinless boneless chicken thighs'}, {'quantity': 480.0, 'unit': 'grams', 'name': 'Italian breadcrumbs'}, {'quantity': 480.0, 'unit': 'grams', 'name': 'panko'}, {'quantity': 240.0, 'unit': 'grams', 'name': 'grated Parmesan cheese'}, {'quantity': 5.0, 'unit': 'grams', 'name': 'garlic powder'}, {'quantity': None, 'unit': None, 'name': 'skewers'}, {'quantity': Non

### Transform BBCGoodFood dataset into a same format

In [1440]:
#Check what format we're dealing with
BBC_df.head()

Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
0,"('salsa verde baked eggs',)","([{'ingredient': 'tbspolive oil', 'quantity': ...",(['Step 1: Drizzle 1 tbsp of the olive oil in ...,'268low','7g','12g','21g',4,https://images.immediate.co.uk/production/vola...,https://www.bbcgoodfood.com/recipes/salsa-verd...
1,"('sausage & fennel orecchiette',)","([{'ingredient': 'and finely sliced', 'quantit...","(['Step 1: Boil a large pan of salted water, t...",'527','62g','21g','19g',4,https://images.immediate.co.uk/production/vola...,https://www.bbcgoodfood.com/recipes/sausage-fe...
2,"('roasted cod with zingy beetroot salad',)","([{'ingredient': 'new potatoes', 'quantity': '...",(['Step 1: Heat the oven to 200C/180C fan/gas ...,'262low','21g','36g','3glow',4,https://images.immediate.co.uk/production/vola...,https://www.bbcgoodfood.com/recipes/roasted-co...
3,"('tomato soup & hummus crispbreads',)","([{'ingredient': 'tbsprapeseed oil', 'quantity...",(['Step 1: Heat the oil in a large non-stick p...,'403','51g','15g','12g',4,https://images.immediate.co.uk/production/vola...,https://www.bbcgoodfood.com/recipes/tomato-sou...
4,"('puy lentils with seared salmon',)","([{'ingredient': 'puy lentils', 'quantity': '1...",(['Step 1: Put the lentils in a pan of water w...,'519','29g','38g','25g',2,https://images.immediate.co.uk/production/vola...,https://www.bbcgoodfood.com/recipes/puy-lentil...


In [1441]:
BBC_df.describe()

Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
count,3094,3094,3094,3094,3094,3094,3094,3094,3094,3094
unique,3094,3094,3094,1007,178,183,151,168,3094,3094
top,"('salsa verde baked eggs',)","([{'ingredient': 'tbspolive oil', 'quantity': ...",(['Step 1: Drizzle 1 tbsp of the olive oil in ...,'0','0g','2g','0g',4,https://images.immediate.co.uk/production/vola...,https://www.bbcgoodfood.com/recipes/salsa-verd...
freq,1,1,1,88,111,196,182,875,1,1


In [1442]:
# Function to clean up the columns
def clean_column(value):
    return value.strip("(['),]").replace("'", "").replace("low", "").strip()

# Apply the function to the relevant columns
BBC_df['name'] = BBC_df['name'].apply(clean_column)
BBC_df['ingredients'] = BBC_df['ingredients'].apply(lambda x: x.strip("(['),])"))
BBC_df['steps'] = BBC_df['steps'].apply(lambda x: x.strip("(['),])"))
BBC_df['calories'] = BBC_df['calories'].apply(clean_column)
BBC_df['carbs'] = BBC_df['carbs'].apply(clean_column)
BBC_df['protein'] = BBC_df['protein'].apply(clean_column)
BBC_df['fat'] = BBC_df['fat'].apply(clean_column)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  BBC_df['name'] = BBC_df['name'].apply(clean_column)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  BBC_df['ingredients'] = BBC_df['ingredients'].apply(lambda x: x.strip("(['),])"))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  BBC_df['steps'] = BBC_df['steps'].apply(lambda x: x.strip("(['),])"))
A

In [1443]:
#Check result
BBC_df.head()

Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
0,salsa verde baked eggs,"{'ingredient': 'tbspolive oil', 'quantity': '5...",Step 1: Drizzle 1 tbsp of the olive oil in afr...,268,7g,12g,21g,4,https://images.immediate.co.uk/production/vola...,https://www.bbcgoodfood.com/recipes/salsa-verd...
1,sausage & fennel orecchiette,"{'ingredient': 'and finely sliced', 'quantity'...","Step 1: Boil a large pan of salted water, turn...",527,62g,21g,19g,4,https://images.immediate.co.uk/production/vola...,https://www.bbcgoodfood.com/recipes/sausage-fe...
2,roasted cod with zingy beetroot salad,"{'ingredient': 'new potatoes', 'quantity': '20...",Step 1: Heat the oven to 200C/180C fan/gas 6. ...,262,21g,36g,3g,4,https://images.immediate.co.uk/production/vola...,https://www.bbcgoodfood.com/recipes/roasted-co...
3,tomato soup & hummus crispbreads,"{'ingredient': 'tbsprapeseed oil', 'quantity':...",Step 1: Heat the oil in a large non-stick pan ...,403,51g,15g,12g,4,https://images.immediate.co.uk/production/vola...,https://www.bbcgoodfood.com/recipes/tomato-sou...
4,puy lentils with seared salmon,"{'ingredient': 'puy lentils', 'quantity': '160...",Step 1: Put the lentils in a pan of water with...,519,29g,38g,25g,2,https://images.immediate.co.uk/production/vola...,https://www.bbcgoodfood.com/recipes/puy-lentil...


### Transform Taste dataset into a same format

In [1444]:
# Function to parse a single ingredient
def parse_ingredient(ingredient):
    # Regex pattern to match quantity, unit, and name
    pattern = r'(?P<quantity>[0-9/.\s]+)?\s*(?P<unit>[a-zA-Z()]+)?\s*(?P<name>.+)'
    match = re.match(pattern, ingredient.strip())
    if match:
        return {
            'quantity': match.group('quantity').strip() if match.group('quantity') else None,
            'unit': match.group('unit').strip() if match.group('unit') else None,
            'name': match.group('name').strip() if match.group('name') else None,
        }
    return None

In [1445]:
# Function to extract the calorie value from the string
def extract_calories(energy_str):
    match = re.search(r'(\d+)\s*cal', energy_str)
    if match:
        return match.group(1)  # Extracts the calorie value
    # if energy_str is already a number return the number instead
    elif energy_str.isdigit():
        return energy_str
    return None

In [1446]:
# Handle the 'steps' column
if 'steps' in Taste_df.columns:
    Taste_df.loc[:, 'steps'] = Taste_df['steps'].apply(
        lambda step_list: ', '.join(eval(step_list)) if isinstance(step_list, str) and step_list.startswith('[') else step_list
    )

# Convert the 'ingredients' column to lists, parse, and format without square brackets
if 'ingredients' in Taste_df.columns:
    Taste_df.loc[:, 'ingredients'] = Taste_df['ingredients'].apply(
        lambda ing_list: ', '.join(
            map(str, [parse_ingredient(item) for item in ast.literal_eval(ing_list)])
        ).strip('[]') if isinstance(ing_list, str) and ing_list.startswith('[') else ing_list
    )


# Extract the calorie value from the 'calories'
if 'calories' in Taste_df.columns:
    Taste_df.loc[:, 'calories'] = Taste_df['calories'].apply(extract_calories)


In [1447]:
Taste_df.head()

Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
0,easiest-ever fried rice,"{'quantity': '1', 'unit': 'cup', 'name': '(200...",Boil brown and white rice separately following...,543,87.4g,26.8g,8.7g,4,https://content.api.news/v3/images/bin/640b903...,https://www.taste.com.au/recipes/unique-fried-...
1,cheesy french onion meatballs and mushrooms,"{'quantity': '1', 'unit': 'egg', 'name': ', li...","Combine the egg, beef, breadcrumbs and 1&1/2 t...",750,39.0g,35.9g,49.4g,4,https://img.taste.com.au/gJ9jKwKI/w720-h480-cf...,https://www.taste.com.au/recipes/cheesy-french...
2,thai chicken lettuce cups with avocado and lime,"{'quantity': '50', 'unit': 'g', 'name': ""Chang...",Bring a large saucepan of salted water to a bo...,1141,80.9g,83.5g,56.8g,4,https://img.taste.com.au/fqti94HE/w720-h480-cf...,https://www.taste.com.au/recipes/thai-chicken-...
3,oven-baked mushroom risotto with pesto,"{'quantity': '1', 'unit': 'tbsp', 'name': 'oli...",Preheat oven to 150°C. Heat half the oil in a ...,642,69.8g,15.2g,31.4g,6,https://img.taste.com.au/-botctrO/w720-h480-cf...,https://www.taste.com.au/recipes/oven-baked-mu...
4,tomato-tofu salad with creamy ginger dressing,"{'quantity': '1', 'unit': 'sheet', 'name': 'no...",Preheat the oven to 220°C (200°C fan-forced). ...,327,16.0g,8.1g,27.2g,4,https://img.taste.com.au/AWH3XZcf/w720-h480-cf...,https://www.taste.com.au/recipes/tomato-tofu-s...


In [1448]:
Taste_df.describe()

Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
count,819,819,819,819,819,819,819,819,819,819
unique,819,819,819,563,574,483,476,14,819,819
top,easiest-ever fried rice,"{'quantity': '1', 'unit': 'cup', 'name': '(200...",Boil brown and white rice separately following...,548,35.3g,6.7g,0.2g,4,https://content.api.news/v3/images/bin/640b903...,https://www.taste.com.au/recipes/unique-fried-...
freq,1,1,1,6,6,9,6,490,1,1


### Merge back formated datasets

In [1449]:
#Merge back
new_df = pd.concat([Allrecipe_df, BBC_df, Taste_df], ignore_index=True)

new_df.describe()

Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
count,16598,16598,16598,16598,16598,16598,16598,16598,16598,16598
unique,16543,16576,16597,1155,730,616,565,195,16586,16598
top,red lentil soup,"[[], [], []]",Pour the 1/2 cup of water or milk into a small...,0,3g,3g,0g,4,https://cdn.jwplayer.com/v2/media/Ug0PzrYB/thu...,https://www.allrecipes.com/air-fryer-lemon-gar...
freq,2,22,2,88,329,1204,756,3701,4,1


### Standardize the metrics to gram and mililiters

In [1450]:
# # Conversion dictionary
# conversion_factors = {
#     'pounds': 453.592,
#     'ounces': 28.3495,
#     'grams': 1,
#     'gram': 1,
#     'g': 1,
#     'ml': 1,
#     'l': 1,
#     'kilograms': 1000,
#     'cups': 240,  # Assuming 1 cup = 240 ml
#     'cup':240,
#     'tablespoons': 15,
#     'tablespoon': 15,
#     'teaspoons': 5,
#     'teaspoon': 5,
#     'cloves': 5,  # Assuming 1 clove of garlic = 5 grams
#     'milliliters': 1,
#     'liters': 1000,
# }

# # Mapping for special fraction characters
# fraction_map = {
#     '¼': '1/4',
#     '½': '1/2',
#     '⅓': '1/3',
#     '¾': '3/4',
#     '⅔': '2/3',
#     '⅕': '1/5',
#     '⅖': '2/5',
#     '⅗': '3/5',
#     '⅘': '4/5',
# }

# # Function to convert mixed fractions to float
# def mixed_fraction_to_float(mixed_fraction):
#     try:
#         # Replace special fraction symbols with their string equivalents
#         for frac, text in fraction_map.items():
#             mixed_fraction = mixed_fraction.replace(frac, text)
        
#         # Convert the updated fraction string to float using fractions.Fraction
#         return float(sum(Fraction(s) for s in mixed_fraction.split()))
#     except ValueError:
#         return None

# # Function to convert quantity string to float
# def convert_quantity(quantity_str):
#     # First, replace any '//' with '/' to handle incorrect fraction format
#     quantity_str = quantity_str.replace('//', '/')
    
#     # Handle mixed fractions like '1 1/2'
#     if ' ' in quantity_str:
#         whole_number, fraction_part = quantity_str.split(' ', 1)
#         try:
#             quantity = float(whole_number) + float(Fraction(fraction_part))
#             return quantity
#         except ValueError:
#             return None  # Return None if the fraction part is invalid
    
#     # Handle simple fractions like '1/4'
#     if '/' in quantity_str:
#         try:
#             return float(Fraction(quantity_str))  # Use Fraction directly
#         except ValueError:
#             return None  # Return None if the fraction is invalid
    
#     # Handle special fraction symbols and replace them
#     if quantity_str in fraction_map:
#         return float(Fraction(fraction_map[quantity_str]))

#     # Handle other quantities (like numbers)
#     try:
#         return float(quantity_str)
#     except ValueError:
#         return None  # Return None if the quantity is invalid

# # Function to standardize a single ingredient
# def standardize_ingredient(ingredient):
#     # Check if ingredient is a dictionary
#     if isinstance(ingredient, dict):
#         name = ingredient.get('name', '')
#         quantity = ingredient.get('quantity', None)
#         unit = ingredient.get('unit', None)  # Keep original unit, don't default to 'gram'
#     else:
#         # If not a dictionary, assume it's a string or unstructured input
#         name = str(ingredient)
#         quantity = None
#         unit = None  # No unit assumed
    
#     # Handle cases where quantity or unit is None
#     if quantity is None or unit is None:
#         return {'name': name, 'quantity': None, 'unit': unit}

#     # Process the quantity (handle fractions, etc.)
#     quantity = convert_quantity(quantity)
    
#     # If the quantity is non-numeric, skip it and add to the non-numeric list
#     if quantity is None:
#         non_numeric_quantities.append((name, quantity, unit))
#         return {'name': name, 'quantity': None, 'unit': unit}  # Return with original unit if invalid quantity

#     # Apply conversion factor if the unit exists in the conversion dictionary
#     if unit in conversion_factors:
#         conversion_factor = conversion_factors[unit]
#         quantity = quantity * conversion_factor  # Convert quantity to grams
    
#     return {'name': name, 'quantity': quantity, 'unit': unit}  # Keep original unit


# # Function to standardize a list of ingredients
# def standardize_ingredients(ingredients_list):
#     return [standardize_ingredient(ingredient) for ingredient in ingredients_list]

# # Ensure ingredients are lists of dictionaries
# def parse_ingredients(ingredients_str):
#     try:
#         # Try to convert string to a list of dictionaries
#         return ast.literal_eval(ingredients_str)
#     except (ValueError, SyntaxError):
#         return []

# # List to collect non-numeric quantities for debugging
# non_numeric_quantities = []

# # Apply parsing if the ingredients column is not already a list of dictionaries
# new_df['ingredients'] = new_df['ingredients'].apply(parse_ingredients)

# # Debugging: Check the structure of the 'ingredients' column after parsing
# print(new_df['ingredients'].head())

# # Now apply standardizing function to the parsed ingredients
# new_df['ingredients'] = new_df['ingredients'].apply(standardize_ingredients)

# # Debugging: Check the structure of the 'ingredients' column after standardizing
# print(new_df['ingredients'].head())

# # Function to find remaining units that were not converted to gram
# def find_remaining_units(ingredients_list):
#     # Create an empty set to hold unique units that weren't converted
#     units_set = set()

#     # Ensure the list is properly structured (list of dictionaries)
#     if isinstance(ingredients_list, list):
#         for ingredient in ingredients_list:
#             if isinstance(ingredient, dict):  # Ensure ingredient is a dictionary
#                 unit = ingredient.get('unit')
                
#                 # Clean up unit by stripping extra spaces
#                 if unit:
#                     unit = unit.strip().lower()  # Normalize the unit (to lower case, no leading/trailing spaces)
                    
#                     # If unit is not part of the conversion factors, add it to the set
#                     if unit not in conversion_factors:
#                         units_set.add(unit)
#     else:
#         print(f"Unexpected data format: {ingredients_list}")
    
#     # Return the unique units that were not converted (i.e., those not in the dictionary)
#     return units_set

# # Apply to DataFrame and print the result
# remaining_units = find_remaining_units(new_df['ingredients'].iloc[0])  # Test with the first row
# print("Remaining unique units that were not converted:", remaining_units)
# # Print out non-numeric quantities
# if non_numeric_quantities:
#     print("Non-numeric quantities encountered:")
#     for item in non_numeric_quantities:
#         print(f"Name: {item[0]}, Quantity: {item[1]}, Unit: {item[2]}")
# else:
#     print("No non-numeric quantities found.")


In [1451]:
# def is_invalid_recipe(ingredients):
#     return any(item['quantity'] is None or item['unit'] is None for item in ingredients)

# # Filter out invalid recipes
# new_df = new_df[~new_df['ingredients'].apply(is_invalid_recipe)]

In [1452]:
# bbc = new_df[new_df['link'].str.contains('https://www.bbcgoodfood.com', na=False)]
# bbc.head()

### Like One hot Encoding

### Export to cleaned.csv

In [1453]:
# # Save the DataFrame to a CSV file
# new_df.to_csv('cleaned.csv', index=False)

# # Confirm the saving
# print("DataFrame has been saved to 'cleaned.csv'")