# <u>Data Cleaning and Initial Process</u>


#### Imports

In [1]:
import pandas as pd
import numpy as np
import ast

import re
import nltk
from nltk.corpus import wordnet as wn
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
df = pd.read_csv("RecipeData.csv")

Initial cleaning, dropping duplicates rows, and rows with NaN values.

In [3]:
# Dropping additional index column
df = df.drop(df.columns[0], axis = 1)
# Dropping duplicates
df = df.drop_duplicates()

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13839 entries, 0 to 19560
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Recipe            13746 non-null  object 
 1   Date              13838 non-null  object 
 2   Rating            12834 non-null  float64
 3   Number of Raters  12834 non-null  float64
 4   Time              13746 non-null  float64
 5   Categories        12201 non-null  object 
 6   Servings          13738 non-null  float64
 7   Ingredients       13839 non-null  object 
 8   Instructions      13745 non-null  object 
 9   Calories          13344 non-null  float64
 10  Fat               13344 non-null  float64
 11  Carbs             13344 non-null  float64
 12  Proteins          13344 non-null  float64
dtypes: float64(8), object(5)
memory usage: 1.5+ MB


In [4]:
# Drop rows with NaN
df = df.dropna()

# Reseting the index after droping rows
df = df.reset_index(drop = True)

# Handling the list columns
df["Instructions"] = df["Instructions"].apply(ast.literal_eval)
df["Ingredients"] = df["Ingredients"].apply(ast.literal_eval)
df["Categories"] = df["Categories"].apply(ast.literal_eval)
df["Date"] = df["Date"].apply(ast.literal_eval)

# Drop rows with NaN
df = df.dropna()

# Reseting the index after droping rows
df = df.reset_index(drop = True)


#### <u>Cleaning and processing the categories:</u>

Our project is about predicting the category of a recipe, therefore drop uncategorized recipes.

In [5]:
# Python convert empty lists to False, using astype(bool) we keep only non-empty lists
df = df[df["Categories"].astype(bool)]
df.shape

(11125, 13)

There were no uncategorized recipes.</br>
Check how many different categories. 

In [6]:
df["Categories"].explode().value_counts()

Dinner         5062
Dessert        2832
Appetizer      1233
Lunch          1066
Breakfast      1043
Snack           979
Side Dish       929
Drink           288
Condiment        39
Salad            36
Entree           34
Bread            32
Beverage         30
Sauce            29
Cake             28
Ingredient       28
Soup             25
Brunch           23
Cocktail         20
Pie              11
Pasta            10
Sandwich          5
Spice Mix         4
Jam / Jelly       3
Candy             3
Coffee            1
Name: Categories, dtype: int64

There are many categories that are actually sub categories, or categories with similar meaning.</br>
Also many categories don't have much recepies in them.

Function recieves category list, and returns a new category list keeping only main categories.

In [7]:
def replace_category(cat_list):
    main_cat = ["Dinner", "Dessert", "Appetizer", "Lunch", "Breakfast", "Side Dish", "Drink"]
    new_list = []
    for cat in cat_list:
        if cat in main_cat:
            new_list.append(cat)
        elif cat == "Beverage" or cat == "Cocktail" or cat == "Coffee":
            new_list.append("Drink")
        elif cat == "Cake" or cat == "Candy" or cat == "jam / jelly":
            new_list.append("Dessert")
        elif cat == "Snack":
            new_list.append("Appetizer")
    # Converting list to set and back to list, prevent with double enteries.
    return[*set(new_list)]

In [8]:
df["Categories"] = df["Categories"].apply(replace_category)

Remove recipes with no categories.

In [9]:
# Python convert empty lists to False, using astype(bool) we keep only non-empty lists
df = df[df["Categories"].astype(bool)]
df.shape

(11032, 13)

In [10]:
df["Categories"].value_counts()

[Dinner]                                                     3902
[Dessert]                                                    2609
[Appetizer]                                                   976
[Breakfast]                                                   839
[Lunch]                                                       615
[Dinner, Side Dish]                                           539
[Dinner, Appetizer]                                           268
[Dinner, Lunch]                                               247
[Side Dish]                                                   225
[Drink]                                                       215
[Dessert, Appetizer]                                          105
[Appetizer, Lunch]                                             63
[Side Dish, Breakfast]                                         41
[Drink, Breakfast]                                             36
[Appetizer, Breakfast]                                         34
[Breakfast

Many recipes fall into multiple categories, making classification challenging.</br>
Upon further analysis, we have identified three main categories: Dinner, Lunch, and Breakfast, along with four subcategories: Dessert, Appetizer, Side Dish, and Drink.</br>
Interestingly, our dataset contains numerous recipes that exhibit combinations of these categories. For instance, a recipe labeled as "Lunch" and "Side Dish" is likely a Side Dish intended for consumption during lunchtime. In light of this observation, we propose iterating through the recipes and retaining only the subcategories if they are present.

In [11]:
def keep_only_sub_cat(cat_list):
    main_cat = ["Dinner", "Lunch", "Breakfast"]
    #sub_cat = ["Dessert", "Appetizer", "Side Dish", "Drink"]
    recipe_main_cat = []
    recipe_sub_cat = []
    for cat in cat_list:
        if cat in main_cat:
            recipe_main_cat.append(cat)
        else:
            recipe_sub_cat.append(cat)
    if len(recipe_sub_cat) == 0:
        return[*set(recipe_main_cat)]
    else:
        return[*set(recipe_sub_cat)]

In [12]:
df["Categories"] = df["Categories"].apply(keep_only_sub_cat)

In [13]:
df["Categories"].value_counts()

[Dinner]                           3902
[Dessert]                          2669
[Appetizer]                        1369
[Side Dish]                         861
[Breakfast]                         839
[Lunch]                             615
[Drink]                             263
[Dinner, Lunch]                     247
[Appetizer, Dessert]                115
[Side Dish, Appetizer]               45
[Breakfast, Lunch]                   31
[Drink, Dessert]                     25
[Side Dish, Dessert]                 16
[Dinner, Breakfast]                   8
[Drink, Appetizer]                    7
[Dessert, Appetizer]                  6
[Side Dish, Drink]                    5
[Appetizer, Drink]                    5
[Appetizer, Side Dish, Dessert]       2
[Dessert, Drink]                      1
[Dinner, Breakfast, Lunch]            1
Name: Categories, dtype: int64

Dropping all recipes with multiple categories since they can't be categorized properly.

In [14]:
df = df[df["Categories"].apply(lambda x: len(x) == 1)]
# Reseting the index after droping rows
df = df.reset_index(drop = True)
df["Categories"] = df["Categories"].apply(lambda x: x[0])

In [15]:
df.head()

Unnamed: 0,Recipe,Date,Rating,Number of Raters,Time,Categories,Servings,Ingredients,Instructions,Calories,Fat,Carbs,Proteins
0,Air Fryer Waffle Egg in a Hole,"[2023, 4]",5.0,1.0,10.0,Breakfast,1.0,"[frozen waffle, large , egg, salt and pepper t...",[Preheat the air fryer to 350 degrees F (175 d...,444.0,24.0,39.0,18.0
1,Air Fryer Chicken Bites with Parmesan Cheese,"[2022, 9]",5.0,3.0,28.0,Appetizer,4.0,"[olive oil, Worcestershire sauce, dried Italia...","[Whisk olive oil, Worcestershire, Italian seas...",187.0,5.0,4.0,29.0
2,Air Fryer Cherry Cream Cheese Croissants,"[2022, 8]",4.6,5.0,15.0,Breakfast,8.0,"[flour for dusting, refrigerated crescent roll...",[Lightly dust a work surface with flour. Unrol...,227.0,16.0,16.0,5.0
3,Air Fryer Smoked Salmon Wontons,"[2022, 11]",5.0,2.0,50.0,Appetizer,48.0,"[cream cheese, softened, cold-smoked salmon, f...","[Combine cream cheese, salmon, capers, and red...",38.0,1.0,5.0,1.0
4,Air-Fried Raspberry Brie Bites,"[2022, 10]",4.0,1.0,35.0,Appetizer,16.0,"[Brie cheese, phyllo dough, thawed, raspberry ...",[Slice the brie into squares about ½ inch thic...,252.0,16.0,23.0,4.0


#### <u>Cleaning and processing the ingredients:</u>

The Ingredients column consists of a list of ingredient strings. Upon examination, we noticed that certain ingredients contain additional information or unnecessary descriptions, such as "toasted chopped pecans" or "minced fresh rosemary (Optional)." After conducting experiments, we have implemented two filters to process our ingredients:


<li>NLTK Tokenization tag: We utilize this filter to identify whether a word is a noun or not.</li>

<li>WordNet corpus synsets: This filter helps us determine if a word is a noun and if it has 'food' mentioned in its description.</li>

By applying these filters, we can effectively identify and extract the relevant food-related nouns from the ingredient strings. 

In [16]:
# Deleting what comes after "or"
def del_or(ingredient):
    new_ing = ingredient.lower()
    ing_split = new_ing.split()
    if "or" in ing_split:
        index = ing_split.index("or")
        new_ing = ' '.join(ing_split[:index])
    return new_ing

In [17]:
def process_ingredients(list_ingredients):
    lemmatizer = WordNetLemmatizer()
    new_list_ingredients = []
    for ingredient in list_ingredients:
        ingredient = del_or(ingredient)
        tokens = word_tokenize(ingredient.lower())
        tags = pos_tag(tokens)
        for word, tag in tags:
            # Check if the word is tagged as a noun
            if tag.startswith('NN'):
                # Second check if the word is a noun, and if it is a 'food'
                syns = wn.synsets(word, pos=wn.NOUN) 
                for syn in syns:
                    if 'food' in syn.lexname():
                        new_list_ingredients.append(lemmatizer.lemmatize(word))
    return [*set(new_list_ingredients)]

In [18]:
df["Ingredients"] = df["Ingredients"].apply(process_ingredients)

In [19]:
df.head()

Unnamed: 0,Recipe,Date,Rating,Number of Raters,Time,Categories,Servings,Ingredients,Instructions,Calories,Fat,Carbs,Proteins
0,Air Fryer Waffle Egg in a Hole,"[2023, 4]",5.0,1.0,10.0,Breakfast,1.0,"[pepper, cheese, salt, egg, waffle, syrup]",[Preheat the air fryer to 350 degrees F (175 d...,444.0,24.0,39.0,18.0
1,Air Fryer Chicken Bites with Parmesan Cheese,"[2022, 9]",5.0,3.0,28.0,Appetizer,4.0,"[parsley, worcestershire, pepper, salt, season...","[Whisk olive oil, Worcestershire, Italian seas...",187.0,5.0,4.0,29.0
2,Air Fryer Cherry Cream Cheese Croissants,"[2022, 8]",4.6,5.0,15.0,Breakfast,8.0,"[cheese, dough, cinnamon, cherry, roll, cream,...",[Lightly dust a work surface with flour. Unrol...,227.0,16.0,16.0,5.0
3,Air Fryer Smoked Salmon Wontons,"[2022, 11]",5.0,2.0,50.0,Appetizer,48.0,"[salmon, wonton, cheese, onion, water, caper, ...","[Combine cream cheese, salmon, capers, and red...",38.0,1.0,5.0,1.0
4,Air-Fried Raspberry Brie Bites,"[2022, 10]",4.0,1.0,35.0,Appetizer,16.0,"[cheese, raspberry, phyllo, dough, honey, salt...",[Slice the brie into squares about ½ inch thic...,252.0,16.0,23.0,4.0


#### <u>Processing the date column:</u>

We will split the date column into two separate columns: year and month. This is done to analyze any potential patterns or correlations between recipes and specific time periods. It is possible that certain recipes were uploaded specifically for holidays or during the COVID-19 pandemic when more people were cooking at home.


In [20]:
df["Year"] = df["Date"].apply(lambda x: x[0])
df["Month"] = df["Date"].apply(lambda x: x[1])
df.drop("Date", axis = 1, inplace = True)

In [21]:
df.head()

Unnamed: 0,Recipe,Rating,Number of Raters,Time,Categories,Servings,Ingredients,Instructions,Calories,Fat,Carbs,Proteins,Year,Month
0,Air Fryer Waffle Egg in a Hole,5.0,1.0,10.0,Breakfast,1.0,"[pepper, cheese, salt, egg, waffle, syrup]",[Preheat the air fryer to 350 degrees F (175 d...,444.0,24.0,39.0,18.0,2023,4
1,Air Fryer Chicken Bites with Parmesan Cheese,5.0,3.0,28.0,Appetizer,4.0,"[parsley, worcestershire, pepper, salt, season...","[Whisk olive oil, Worcestershire, Italian seas...",187.0,5.0,4.0,29.0,2022,9
2,Air Fryer Cherry Cream Cheese Croissants,4.6,5.0,15.0,Breakfast,8.0,"[cheese, dough, cinnamon, cherry, roll, cream,...",[Lightly dust a work surface with flour. Unrol...,227.0,16.0,16.0,5.0,2022,8
3,Air Fryer Smoked Salmon Wontons,5.0,2.0,50.0,Appetizer,48.0,"[salmon, wonton, cheese, onion, water, caper, ...","[Combine cream cheese, salmon, capers, and red...",38.0,1.0,5.0,1.0,2022,11
4,Air-Fried Raspberry Brie Bites,4.0,1.0,35.0,Appetizer,16.0,"[cheese, raspberry, phyllo, dough, honey, salt...",[Slice the brie into squares about ½ inch thic...,252.0,16.0,23.0,4.0,2022,10


#### <u>Processing the Instructions column:</u>

We will replace the Instructions column with the number of words in the instructions. This modification allows us to quantify the length of each recipe's instructions. The length of a recipe can provide insights into various factors, including the recipe's complexity and the likelihood of people attempting it. 

In [22]:
def proccess_instructions(list_instructions):
    len_instructions = 0
    for instruction in list_instructions:
        split_instruction = instruction.split()
        len_instructions += len(split_instruction)
    return len_instructions

In [23]:
df["Instructions"] = df["Instructions"].apply(proccess_instructions)

In [24]:
df

Unnamed: 0,Recipe,Rating,Number of Raters,Time,Categories,Servings,Ingredients,Instructions,Calories,Fat,Carbs,Proteins,Year,Month
0,Air Fryer Waffle Egg in a Hole,5.0,1.0,10.0,Breakfast,1.0,"[pepper, cheese, salt, egg, waffle, syrup]",142,444.0,24.0,39.0,18.0,2023,4
1,Air Fryer Chicken Bites with Parmesan Cheese,5.0,3.0,28.0,Appetizer,4.0,"[parsley, worcestershire, pepper, salt, season...",103,187.0,5.0,4.0,29.0,2022,9
2,Air Fryer Cherry Cream Cheese Croissants,4.6,5.0,15.0,Breakfast,8.0,"[cheese, dough, cinnamon, cherry, roll, cream,...",149,227.0,16.0,16.0,5.0,2022,8
3,Air Fryer Smoked Salmon Wontons,5.0,2.0,50.0,Appetizer,48.0,"[salmon, wonton, cheese, onion, water, caper, ...",206,38.0,1.0,5.0,1.0,2022,11
4,Air-Fried Raspberry Brie Bites,4.0,1.0,35.0,Appetizer,16.0,"[cheese, raspberry, phyllo, dough, honey, salt...",167,252.0,16.0,23.0,4.0,2022,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10513,Southern Summer Squash Pudding,4.4,17.0,110.0,Dessert,24.0,"[milk, flour, butter, lemon, egg, squash, sugar]",128,85.0,1.0,17.0,1.0,2020,6
10514,Zucchini Carrot Bread,3.8,6.0,75.0,Breakfast,16.0,"[applesauce, carrot, salt, cinnamon, nutmeg, a...",83,184.0,1.0,40.0,4.0,2020,6
10515,Honey Butter Zucchini Bread,4.1,19.0,210.0,Breakfast,24.0,"[salt, honey, vanilla, cinnamon, butter, walnu...",91,266.0,14.0,33.0,4.0,2020,6
10516,Spiced Zucchini Carrot Muffins,4.8,8.0,45.0,Lunch,21.0,"[flour, vanilla, salt, raisin, cinnamon, butte...",94,227.0,12.0,28.0,4.0,2020,6


In [25]:
df.to_csv('RecipeDataCleaned.csv')