# Data Cleaning

### read dataset & rename columns

In [10]:
import pandas as pd

In [11]:
# Load the raw recipes and give the positional columns 0-8 readable names.
df = pd.read_json('../data.json').rename(
    columns=dict(enumerate([
        'title', 'category', 'cuisine', 'total_time', 'nbr_of_ser',
        'steps', 'rating', 'ingredients', 'image_link',
    ]))
)
df.head()


Unnamed: 0,title,category,cuisine,total_time,nbr_of_ser,steps,rating,ingredients,image_link
0,Two Huge Chocolate Chip Cookies,Dessert,American,15 minutes,2,"[Preheat the oven to 350 degrees., Mix butter,...",5.0,"[3 tablespoons butter , very soft or partially...",https://pinchofyum.com/wp-content/uploads/Two-...
1,Marry Me Chicken,Dinner,American,40 minutes,4,[Sauce Base:Heat a large skillet over medium l...,4.6,"[2 tablespoons butter, 3 cloves garlic , thinl...",https://pinchofyum.com/wp-content/uploads/Marr...
2,Ridiculously Good Air Fryer Tofu,Dinner,American,20 minutes,3,"[Mix The Batter:In a medium size mixing bowl, ...",4.9,[1 block extra firm tofu (look for one that is...,https://pinchofyum.com/wp-content/uploads/Cris...
3,Baked Tortellini with Sausage,Dinner,Italian,40 minutes,6,[Brown the sausage:Heat a very large oven-safe...,5.0,"[1 tablespoon avocado or olive oil, 1 lb . gro...",https://pinchofyum.com/wp-content/uploads/Bake...
4,Butter Cauliflower and Chickpeas with Mint Cil...,Dinner,Indian-Inspired,40 minutes,6,[Roast the cauliflower:Toss the cauliflower wi...,4.4,"[1 – 2 tablespoons avocado oil, 1 head caulifl...",https://pinchofyum.com/wp-content/uploads/Butt...


### cast number of servings column to int

In [12]:
# A missing serving count is recorded as 0 so it passes the numeric check below.
df['nbr_of_ser'] = df['nbr_of_ser'].fillna(0)

# Parse the serving counts; anything that is not a number becomes NaN.
numeric_column = pd.to_numeric(df['nbr_of_ser'], errors='coerce')

# Rows whose serving count could not be parsed (kept for inspection).
non_numeric_values = df[numeric_column.isna()]

# filter out rows with non-numeric values
df_filtered = df.drop(index=non_numeric_values.index)

# Cast from the values that were already parsed above instead of re-parsing
# the raw column: a raw entry such as "4.0" is numeric for to_numeric but
# would make astype(int) on the original strings raise a ValueError.
df_filtered['nbr_of_ser'] = numeric_column.loc[df_filtered.index].astype(int)
df_filtered['nbr_of_ser'].dtype

dtype('int32')

### normalize cuisine names to upper case to remove duplicates

In [13]:
# Upper-case the cuisine names so spellings that differ only in letter case
# collapse into a single value.
# Read from df_filtered itself rather than from the unfiltered df: assigning
# from df only works through index alignment and puts cuisines on the wrong
# rows if this cell is re-run after df_filtered's index has been reset.
df_filtered['cuisine'] = df_filtered['cuisine'].str.upper()
unique_values = df_filtered['cuisine'].unique()
len(unique_values)

51

### cast total_time column to integer

In [14]:
def change_dur_to_int(duration):
    """Convert a recipe duration into a whole number of minutes.

    Parameters
    ----------
    duration : int, str, or missing value
        Either a value that is already a minute count, or text made of
        "<amount> <unit>" pairs such as ``"1 hour 30 minutes"`` or
        ``"45 mins"``. Recognised units are ``hour``, ``hours``, ``minute``,
        ``minutes`` and ``mins``.

    Returns
    -------
    int
        ``hours * 60 + minutes``. Text without a recognised unit and missing
        values (``None`` / ``NaN``) yield ``0``.

    Raises
    ------
    ValueError
        If the token in front of a recognised unit is not an integer literal
        (for example ``"1.5 hours"``).
    """
    # already a plain integer: return it untouched so re-applying is harmless
    if isinstance(duration, int):
        return duration

    if not isinstance(duration, str):
        # missing values cannot be parsed; report them as 0 minutes
        if pd.isna(duration):
            return 0
        # numpy integers and floats are not instances of int
        return int(duration)

    hour_units = ('hours', 'hour')
    minute_units = ('minutes', 'minute', 'mins')

    hours = 0
    minutes = 0

    # split the string into components based on whitespace
    tokens = duration.split()

    # Pair every token with the one before it. The first token has no amount
    # in front of it, so it is never treated as a unit; indexing with i - 1
    # at i == 0 would wrap around and read the last token instead.
    for amount, unit in zip(tokens, tokens[1:]):
        if unit in hour_units:
            hours = int(amount)
        elif unit in minute_units:
            minutes = int(amount)

    return hours * 60 + minutes

# Convert every total_time entry to a minute count, then store the result
# back in the same column.
total_time_minutes = df_filtered['total_time'].apply(change_dur_to_int)
df_filtered['total_time'] = total_time_minutes

### replace rating null values with average rating

In [15]:
# Calculate the average rating excluding NaN values
# Mean of the ratings that are present (Series.mean skips NaN by default).
average_rating = df_filtered['rating'].mean()

# Missing ratings take the mean; the index is then renumbered from 0.
df_filtered = (
    df_filtered
    .assign(rating=df_filtered['rating'].fillna(average_rating))
    .reset_index(drop=True)
)

### drop all categories that have fewer than 5 recipes

In [16]:
# Number of non-null titles per category.
title_counts = df_filtered.groupby('category')['title'].count()

# Categories with fewer than 5 recipes are removed; the index is renumbered.
categories_to_drop = title_counts.loc[title_counts.lt(5)].index
df_filtered = (
    df_filtered
    .loc[~df_filtered['category'].isin(categories_to_drop)]
    .reset_index(drop=True)
)

### drop all cuisines that have fewer than 5 recipes

In [17]:
# Number of non-null titles per cuisine.
title_counts = df_filtered.groupby('cuisine')['title'].count()

# Cuisines with fewer than 5 recipes are removed; the index is renumbered.
cuisines_to_drop = title_counts.loc[title_counts.lt(5)].index
df_filtered = (
    df_filtered
    .loc[~df_filtered['cuisine'].isin(cuisines_to_drop)]
    .reset_index(drop=True)
)

# NOTE(review): the row labelled 101 is removed with no stated reason. The
# label depends on the filtering above, so this drops a different recipe (or
# raises KeyError) if the input data changes -- confirm which recipe is meant.
df_filtered = df_filtered.drop(index=101)
df_filtered.head()

Unnamed: 0,title,category,cuisine,total_time,nbr_of_ser,steps,rating,ingredients,image_link
0,Two Huge Chocolate Chip Cookies,Dessert,AMERICAN,15,2,"[Preheat the oven to 350 degrees., Mix butter,...",5.0,"[3 tablespoons butter , very soft or partially...",https://pinchofyum.com/wp-content/uploads/Two-...
1,Marry Me Chicken,Dinner,AMERICAN,40,4,[Sauce Base:Heat a large skillet over medium l...,4.6,"[2 tablespoons butter, 3 cloves garlic , thinl...",https://pinchofyum.com/wp-content/uploads/Marr...
2,Ridiculously Good Air Fryer Tofu,Dinner,AMERICAN,20,3,"[Mix The Batter:In a medium size mixing bowl, ...",4.9,[1 block extra firm tofu (look for one that is...,https://pinchofyum.com/wp-content/uploads/Cris...
3,Baked Tortellini with Sausage,Dinner,ITALIAN,40,6,[Brown the sausage:Heat a very large oven-safe...,5.0,"[1 tablespoon avocado or olive oil, 1 lb . gro...",https://pinchofyum.com/wp-content/uploads/Bake...
4,Butter Cauliflower and Chickpeas with Mint Cil...,Dinner,INDIAN-INSPIRED,40,6,[Roast the cauliflower:Toss the cauliflower wi...,4.4,"[1 – 2 tablespoons avocado oil, 1 head caulifl...",https://pinchofyum.com/wp-content/uploads/Butt...


In [19]:
# Keep only recipes with a positive total time; change_dur_to_int yields 0
# when no hour/minute unit was found in the duration text.
df_filtered = df_filtered.loc[df_filtered['total_time'].gt(0)]

In [20]:
# Write the cleaned recipes as a JSON array with one object per recipe.
df_filtered.to_json(path_or_buf='cleaned_data.json', orient='records')