## Preprocessing Merged Dataset

### Import Dependencies

In [1]:
import pandas as pd
from IPython.display import Image

### Import dataset

In [2]:
# Read the Allrecipes recipe dataset from its CSV file into a DataFrame
food_df = pd.read_csv(filepath_or_buffer='Food_Dataset.csv')

# Preview the first five rows
food_df.head(n=5)

Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
0,15 Best Air Fryer Thanksgiving Recipes,,,,,,,,https://www.allrecipes.com/thmb/zNe_lQRZgjj1rS...,https://www.allrecipes.com/gallery/best-air-fr...
1,Air Fryer Turkey Breast,"{'quantity': '1', 'unit': 'tablespoon', 'name'...",,263.0,0g,40g,10g,6.0,https://www.allrecipes.com/thmb/PaF8nNOY0bLCvo...,https://www.allrecipes.com/recipe/275372/air-f...
2,16 Quick-and-Easy Side Dish Recipes for the Ai...,,,,,,,,https://www.allrecipes.com/thmb/91y3R4leqrUtBV...,https://www.allrecipes.com/gallery/air-fryer-s...
3,Best Holiday Party Appetizers to Make in the A...,,,,,,,,https://cdn.jwplayer.com/v2/media/rggkwMPu/pos...,https://www.allrecipes.com/article/best-holida...
4,Air Fryer Lemon Garlic Parmesan Chicken,"{'quantity': '1 1/2', 'unit': 'pounds', 'name'...",Gather all ingredients.. Preheat an air fryer ...,365.0,8g,46g,17g,4.0,https://www.allrecipes.com/thmb/5nJvgXENSeFx82...,https://www.allrecipes.com/air-fryer-lemon-gar...


In [3]:
# Per-column summary of the freshly loaded frame, shown before any cleaning is applied
food_df.describe()

Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
count,24191,23120,20624,22870,22847,22803,22648,22863,24047,24191
unique,19311,18356,17071,2874,902,750,721,201,19120,19346
top,Grilled Asparagus,"{'quantity': '1', 'unit': 'pound', 'name': 'fr...","Gather the ingredients.. Place whitefish, sour...",'0',1g,3g,0g,4,https://cdn.jwplayer.com/v2/media/Ug0PzrYB/thu...,https://www.allrecipes.com/recipe/17445/grille...
freq,9,9,8,88,415,1490,813,4792,10,9


### Remove NaN

In [4]:
# Tally the missing entries in every column
nan_count = food_df.isnull().sum()

# Report the per-column totals under a heading line
print("Count of NaN values in each column:", nan_count, sep="\n")

Count of NaN values in each column:
name              0
ingredients    1071
steps          3567
calories       1321
carbs          1344
protein        1388
fat            1543
servings       1328
image_url       144
link              0
dtype: int64


In [5]:
# Keep only fully populated recipes: discard every row with a missing value in any column
food_df = food_df.dropna(axis=0, how='any')

# Summarise what is left
print("DataFrame after dropping rows with NaN values:")
food_df.describe()


DataFrame after dropping rows with NaN values:


Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
count,20089,20089,20089,20089,20089,20089,20089,20089,20089,20089
unique,16615,16628,16633,2825,893,740,708,196,16621,16634
top,Smoked Fish Dip,"{'quantity': '2', 'unit': 'cups', 'name': 'fla...","Gather the ingredients.. Place whitefish, sour...",'0',1g,3g,0g,4,https://cdn.jwplayer.com/v2/media/Ug0PzrYB/thu...,https://www.allrecipes.com/recipe/45291/smoked...
freq,8,8,8,88,336,1295,719,4353,10,8


### Remove duplicate rows

In [6]:
# Count rows that exactly repeat an earlier row (the first occurrence itself is not counted)
duplicate_count = food_df.duplicated(keep='first').sum()
print(f"Number of duplicate rows: {duplicate_count}")

Number of duplicate rows: 3455


In [7]:
# Keep the first occurrence of every fully identical row and discard the repeats
food_df = food_df[~food_df.duplicated(keep='first')]

# Summarise what is left
print("\nDataFrame after removing duplicate rows:")
food_df.describe()


DataFrame after removing duplicate rows:


Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
count,16634,16634,16634,16634,16634,16634,16634,16634,16634,16634
unique,16615,16628,16633,2825,893,740,708,196,16621,16634
top,Sriracha Deviled Eggs,[],Pour the 1/2 cup of water or milk into a small...,'0',1g,3g,0g,4,https://cdn.jwplayer.com/v2/media/Ug0PzrYB/thu...,https://www.allrecipes.com/air-fryer-lemon-gar...
freq,2,4,2,88,256,1030,540,3710,4,1


### Remove rows with empty '[]' ingredients

In [8]:
# Keep only recipes whose 'ingredients' value is not the literal string '[]'
food_df = food_df.loc[food_df['ingredients'].ne('[]')]

# Summarise what is left
print("DataFrame after removing rows with '[]' in the 'ingredients' column:")
food_df.describe()

DataFrame after removing rows with '[]' in the 'ingredients' column:


Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
count,16630,16630,16630,16630,16630,16630,16630,16630,16630,16630
unique,16611,16627,16629,2824,892,739,706,196,16617,16630
top,Pecan Shortbread Cookies,"{'quantity': '2 ¼', 'unit': 'cups', 'name': 'I...",Pour the 1/2 cup of water or milk into a small...,'0',1g,3g,0g,4,https://cdn.jwplayer.com/v2/media/Ug0PzrYB/thu...,https://www.allrecipes.com/air-fryer-lemon-gar...
freq,2,2,2,88,256,1030,540,3709,4,1


### Lowercase all names

In [9]:
# Convert all food names to lowercase.
# `.assign` returns a new DataFrame instead of writing a column into the frame
# produced by the earlier boolean filtering; this avoids a possible
# SettingWithCopyWarning on pandas versions without copy-on-write and keeps the
# cell safe to re-run.
food_df = food_df.assign(name=food_df['name'].str.lower())

# Display the updated DataFrame
print("DataFrame after converting food names to lowercase:")
food_df.head()

DataFrame after converting food names to lowercase:


Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
4,air fryer lemon garlic parmesan chicken,"{'quantity': '1 1/2', 'unit': 'pounds', 'name'...",Gather all ingredients.. Preheat an air fryer ...,365,8g,46g,17g,4,https://www.allrecipes.com/thmb/5nJvgXENSeFx82...,https://www.allrecipes.com/air-fryer-lemon-gar...
6,air fryer s’mores,"{'quantity': '1', 'unit': 'sleeve', 'name': 'g...",Preheat an air fryer to 380 degrees F (193 deg...,143,20g,2g,6g,10,https://www.allrecipes.com/thmb/_EDaiFt0gIGQOL...,https://www.allrecipes.com/air-fryer-s-mores-r...
7,air fryer baked yams,"{'quantity': '1', 'unit': None, 'name': 'yam'}...",Preheat an air fryer to 400 degrees F (200 deg...,283,62g,3g,3g,1,https://www.allrecipes.com/thmb/156WNgRfzvGn-s...,https://www.allrecipes.com/air-fryer-baked-yam...
8,lemon garlic butter chicken spiedini,"{'quantity': '1/2', 'unit': 'cup', 'name': 'ex...","Whisk together olive oil, wine, 2 tablespoons ...",636,21g,43g,41g,6,https://cdn.jwplayer.com/v2/media/ahbYMLcr/thu...,https://www.allrecipes.com/lemon-garlic-butter...
9,air fryer grilled pimento cheese,"{'quantity': '4', 'unit': 'slices', 'name': 'F...",Preheat the air fryer to 370 degrees F (188 de...,902,108g,29g,40g,2,https://www.allrecipes.com/thmb/cdL3DKZH3beUk5...,https://www.allrecipes.com/air-fryer-grilled-p...


### Remove duplicated recipe names

In [10]:
# For each recipe name keep the first row that carries it and drop later rows with the same name
food_df = food_df[~food_df.duplicated(subset=['name'], keep='first')]

# Summarise what is left
print("DataFrame after removing duplicate food names:")
food_df.describe()

DataFrame after removing duplicate food names:


Unnamed: 0,name,ingredients,steps,calories,carbs,protein,fat,servings,image_url,link
count,16598,16598,16598,16598,16598,16598,16598,16598,16598,16598
unique,16598,16595,16597,2809,885,733,703,195,16586,16598
top,air fryer lemon garlic parmesan chicken,"{'quantity': '12', 'unit': '', 'name': 'eggs'}",Pour the 1/2 cup of water or milk into a small...,'0',1g,3g,0g,4,https://cdn.jwplayer.com/v2/media/Ug0PzrYB/thu...,https://www.allrecipes.com/air-fryer-lemon-gar...
freq,1,2,2,88,255,1030,540,3701,4,1


### Check the most frequent image_url

In [11]:
# Most frequent image_url value(s), returned as a Series
mode_image_url = food_df['image_url'].mode()

# value_counts() sorts by descending frequency, so its first entry is the highest count
mode_count = food_df['image_url'].value_counts().iloc[0]

# Show the untruncated URL, then how many rows share it
print("Full mode of image_url column:", mode_image_url.iloc[0], sep="\n")
print("\nMode count (number of occurrences):", mode_count, sep="\n")

Full mode of image_url column:
https://cdn.jwplayer.com/v2/media/Ug0PzrYB/thumbnails/RflzUhLv.jpg

Mode count (number of occurrences):
4


In [12]:
# URL of the image: taken from the mode computed in the previous cell rather
# than pasted in by hand, so the preview stays in sync if the dataset changes
image_url = mode_image_url.iloc[0]

# Display the image
Image(url=image_url)

No need to remove duplicate images: the most frequent image URL appears only 4 times.

### Export to cleaned.csv

In [13]:
# Write the cleaned recipes to disk, leaving the row index out of the file
food_df.to_csv(path_or_buf='cleaned.csv', index=False)

# Let the reader know the export finished
print("DataFrame has been saved to 'cleaned.csv'")

DataFrame has been saved to 'cleaned.csv'
