In [107]:
import pandas as pd
import numpy as np
from ast import literal_eval
import re

In [108]:
df = pd.read_csv("./data/recipedata-grouped.csv", index_col=0)

## Remove columns that we are not going to use

We are going to drop columns we are not interested in, and columns with too many missing values.

In [109]:
df = df.drop(['id', 
              'gaps',
              'lowFodmap', 
              'aggregateLikes', 
              'spoonacularScore',
              'weightWatcherSmartPoints',
              'creditsText', 
              'sourceName',
              'sourceUrl', 
              'image', 
              'imageType',
              'occasions',
              'author', 
              'nutrition', 
              'winePairing',
              'originalId', 
              'spoonacularSourceUrl', 
              'license', 
              'preparationMinutes', 
              'cookingMinutes', 
              'cuisines',
              # 'diets', 
              'dishTypes', 'analyzedInstructions'], axis=1)
# df.head()

## Drop data points with missing data (if needed)

In [110]:
# check how many missing values we have for each column
df = df.replace('[]', np.nan)
for column in df.columns:
    col = df[column]
    n_empty = sum(col.isna()==True)
    print(column, n_empty)

vegetarian 0
vegan 0
glutenFree 0
dairyFree 0
veryHealthy 0
cheap 0
veryPopular 0
sustainable 0
healthScore 0
pricePerServing 0
extendedIngredients 0
title 0
readyInMinutes 0
servings 0
summary 4
diets 818
instructions 196


We only have missing values in the "summary", "diets" and "instructions" columns. I am going to keep all the data points, but you can drop some of them if you want. 

In [111]:
print("Before dropping rows: ", df.shape)
# df = df.dropna()
print("After dropping rows:", df.shape)

Before dropping rows:  (3859, 17)
After dropping rows: (3859, 17)


In [112]:
df.head()

Unnamed: 0,vegetarian,vegan,glutenFree,dairyFree,veryHealthy,cheap,veryPopular,sustainable,healthScore,pricePerServing,extendedIngredients,title,readyInMinutes,servings,summary,diets,instructions
0,True,False,True,False,False,False,False,False,26.0,510.34,"[{'id': 1034053, 'aisle': 'Oil, Vinegar, Salad...",Roasted Butternut Squash Bisque,45,6,Roasted Butternut Squash Bisque could be just ...,"['gluten free', 'lacto ovo vegetarian', 'primal']",For garlic broth:\nAdd all ingredients to the ...
1,False,False,False,False,False,False,False,False,13.0,204.73,"[{'id': 10211821, 'aisle': 'Produce', 'image':...",Turkey Goulash By Mommie Cooks,45,6,Turkey Goulash By Mommie Cooks might be just t...,,"<ol><li>If you're using ground turkey, start o..."
2,False,False,False,False,False,False,False,False,20.0,145.24,"[{'id': 10120420, 'aisle': 'Pasta and Rice', '...",What to make for dinner tonight?? Bruschetta S...,35,5,You can never have too many main course recipe...,,wash and rinse pork chops and place into the s...
3,True,False,True,False,False,False,False,False,22.0,106.64,"[{'id': 9037, 'aisle': 'Produce', 'image': 'av...",Not Your Normal Seven Layer Dip,15,12,Not Your Normal Seven Layer Dip might be just ...,"['gluten free', 'lacto ovo vegetarian']",Layer your bowl with refried beans.Add a layer...
4,False,False,True,False,False,False,False,False,35.0,554.27,"[{'id': 11011, 'aisle': 'Produce', 'image': 'a...",Pan-seared salmon with brown butter lime sauce...,45,4,Pan-seared salmon with brown butter lime sauce...,"['gluten free', 'primal', 'pescatarian']","<ol><li>In a frying pan, melt the butter over ..."


## Convert the `extendedIngredients` column to a simple list of ingredient names

In [113]:
evaluated_ingredient = pd.DataFrame(map(literal_eval, df["extendedIngredients"]))

In [114]:
row, col = evaluated_ingredient.shape
ingredient_names = []

for r in range(row):
    tmp = []
    for c in range(col):
        cell = evaluated_ingredient.iloc[r,c]
        if cell: 
            tmp.append(cell["name"])
    tmp = "; ".join(tmp)
    ingredient_names.append(tmp)

In [115]:
df["Ingredients"] = pd.Series(ingredient_names)

If you want to keep the `extendIngredients` column, do not run the following cell.

In [116]:
df = df.drop(["extendedIngredients"], axis=1)

In [117]:
# rearrange the columns
columns = df.columns.tolist()
columns = ['title','summary','instructions','Ingredients',
             'vegetarian',
             'vegan',
             'glutenFree',
             'dairyFree',
             'veryHealthy',
             'cheap',
             'veryPopular',
             'sustainable',
             'healthScore',
             'pricePerServing',
             'readyInMinutes',
             'servings',
             'diets',]

In [118]:
df = df[columns]

In [119]:
df.head()

Unnamed: 0,title,summary,instructions,Ingredients,vegetarian,vegan,glutenFree,dairyFree,veryHealthy,cheap,veryPopular,sustainable,healthScore,pricePerServing,readyInMinutes,servings,diets
0,Roasted Butternut Squash Bisque,Roasted Butternut Squash Bisque could be just ...,For garlic broth:\nAdd all ingredients to the ...,extra virgin olive oil; butter; sweet onions; ...,True,False,True,False,False,False,False,False,26.0,510.34,45,6,"['gluten free', 'lacto ovo vegetarian', 'primal']"
1,Turkey Goulash By Mommie Cooks,Turkey Goulash By Mommie Cooks might be just t...,"<ol><li>If you're using ground turkey, start o...",bell pepper; canned diced tomatoes; cooked egg...,False,False,False,False,False,False,False,False,13.0,204.73,45,6,
2,What to make for dinner tonight?? Bruschetta S...,You can never have too many main course recipe...,wash and rinse pork chops and place into the s...,bow tie pasta; parmigiano reggiano; pasta; por...,False,False,False,False,False,False,False,False,20.0,145.24,35,5,
3,Not Your Normal Seven Layer Dip,Not Your Normal Seven Layer Dip might be just ...,Layer your bowl with refried beans.Add a layer...,avocado; canned tomatoes; cilantro; green chil...,True,False,True,False,False,False,False,False,22.0,106.64,15,12,"['gluten free', 'lacto ovo vegetarian']"
4,Pan-seared salmon with brown butter lime sauce...,Pan-seared salmon with brown butter lime sauce...,"<ol><li>In a frying pan, melt the butter over ...",asparagus spears; butter; butter; garlic; lime...,False,False,True,False,False,False,False,False,35.0,554.27,45,4,"['gluten free', 'primal', 'pescatarian']"


## Remove the html tags, urls and line breaks from the text data

We will use the following four columns as text data:
1. title
2. summary
3. instructions
4. ingredients

In [130]:
for r in range(row):
    for c in range(4): # only the 4 first columns are text data that we are going to use
        text = df.iloc[r, c]
        if text is not np.nan and text is not None:
            # remove line breaks and html tags
            text = re.sub(r"<[^>]*>'|[\n\r]+", " ", text).lower()
            # remove urls 
            text = re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)", " ", text)
            # remove extra spaces
            text = re.sub(r"\s\s+", " ", text)
            df.iloc[r, c] = text

In [131]:
df

Unnamed: 0,title,summary,instructions,Ingredients,vegetarian,vegan,glutenFree,dairyFree,veryHealthy,cheap,veryPopular,sustainable,healthScore,pricePerServing,readyInMinutes,servings,diets
0,roasted butternut squash bisque,roasted butternut squash bisque could be just ...,for garlic broth: add all ingredients to the s...,extra virgin olive oil; butter; sweet onions; ...,True,False,True,False,False,False,False,False,26.0,510.34,45,6,"['gluten free', 'lacto ovo vegetarian', 'primal']"
1,turkey goulash by mommie cooks,turkey goulash by mommie cooks might be just t...,"if you're using ground turkey, start out by b...",bell pepper; canned diced tomatoes; cooked egg...,False,False,False,False,False,False,False,False,13.0,204.73,45,6,
2,what to make for dinner tonight?? bruschetta s...,you can never have too many main course recipe...,wash and rinse pork chops and place into the s...,bow tie pasta; parmigiano reggiano; pasta; por...,False,False,False,False,False,False,False,False,20.0,145.24,35,5,
3,not your normal seven layer dip,not your normal seven layer dip might be just ...,layer your bowl with refried beans.add a layer...,avocado; canned tomatoes; cilantro; green chil...,True,False,True,False,False,False,False,False,22.0,106.64,15,12,"['gluten free', 'lacto ovo vegetarian']"
4,pan-seared salmon with brown butter lime sauce...,pan-seared salmon with brown butter lime sauce...,"in a frying pan, melt the butter over medium-...",asparagus spears; butter; butter; garlic; lime...,False,False,True,False,False,False,False,False,35.0,554.27,45,4,"['gluten free', 'primal', 'pescatarian']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28,scalloped potato casserole,scalloped potato casserole is a gluten free an...,,celery; chives; egg; fresh tarragon; juice of ...,True,False,True,False,False,False,True,False,11.0,66.37,45,4,"['gluten free', 'lacto ovo vegetarian']"
29,no-bake cereal bars,no-bake cereal bars is a dairy free morn meal....,"directions in a large saucepan, cook and stir ...",whipping cream; semi sweet chocolate chips; st...,False,False,False,True,False,False,True,False,2.0,10.25,20,120,['dairy free']
30,nutty fudge torte,"you can never have too many dessert recipes, s...","directions in a microwave, melt chocolate chip...",bicarbonate of soda; butter; dark chocolate; e...,False,False,False,False,False,False,False,False,2.0,68.42,70,14,
31,lemony carrot salad with dill,lemony carrot salad with dill might be just th...,,cream; mascarpone cheese; savoiardi; sugar; ma...,True,True,True,True,True,False,True,False,61.0,66.95,15,4,"['gluten free', 'dairy free', 'paleolithic', '..."


## Export the resulting pd as csv

In [132]:
df.to_csv(f"data/cleaneddata.csv")