## Cleaning

In [1]:
import json
import os
import pandas as pd
import numpy as np
import re

---
### Load in all the recipes

In [2]:
# verify the folder: list only the recipe JSON files, so the checkpoints folder
# and any other stray entry are excluded no matter where os.listdir places them
sorted(name for name in os.listdir('../data/api_data/') if name.endswith('.json'))

['002.json',
 '004.json',
 '005.json',
 '006.json',
 '007.json',
 '008.json',
 '009.json',
 '010.json',
 '011.json',
 '012.json',
 '013.json']

In [3]:
# load every recipe JSON file as its own dataframe
# (files are selected by extension and sorted: os.listdir returns entries in
# arbitrary order, so slicing off the first entry does not reliably skip the
# checkpoints folder)
list_of_dfs = []
for file_name in sorted(os.listdir('../data/api_data/')):
    if not file_name.endswith('.json'):
        continue
    # explicit encoding so the read does not depend on the platform default
    with open(f'../data/api_data/{file_name}', encoding='utf-8') as json_file:
        list_of_dfs.append(pd.DataFrame(json.load(json_file)['recipes']))

In [4]:
# one (rows, columns) tuple per loaded file
for frame_shape in (frame.shape for frame in list_of_dfs):
    print(frame_shape)

(50, 37)
(100, 37)
(100, 37)
(100, 37)
(100, 37)
(100, 38)
(100, 37)
(100, 38)
(100, 37)
(100, 37)
(100, 37)


---
### Merge all the dataframes into one, dropping duplicates

In [5]:
# Stack every per-file dataframe into one with a single concat call.
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies the accumulated rows on every iteration.
# Each frame's own row labels are kept, so labels can repeat across files.
master_df = pd.concat(list_of_dfs)

In [6]:
# Recipes were pulled at random from the API, so the same recipe can show up
# in more than one file; keep only the first row seen for each recipe id.
master_df = master_df.drop_duplicates(subset='id', keep='first')

In [7]:
# rows and columns remaining after de-duplication
master_df.shape

(665, 38)

---
### Clean up json format columns

In [8]:
# columns whose cells still hold nested JSON structures
json_columns = ['extendedIngredients', 'analyzedInstructions']

Simplified list of ingredients

In [9]:
# Keep only the cleaned name of every ingredient in each recipe.
# Reading the records directly avoids building a throwaway DataFrame per recipe
# and lets a recipe with an empty ingredient list produce an empty list instead
# of raising KeyError.
# NOTE(review): assumes each cell is a list of dicts - confirm against the API payload.
master_df['simplifiedIngredients'] = [
    [ingredient.get('nameClean') for ingredient in ingredients]
    for ingredients in master_df['extendedIngredients']
]

Simplified list of instructions

In [10]:
def make_list(df_row):
    """Flatten one recipe's analyzed instructions into an ordered list of steps.

    A recipe can be made of several components, each with its own steps, so
    the steps have to be collected from every component in turn.

    Parameters
    ----------
    df_row : iterable of dict
        One cell of the 'analyzedInstructions' column: the recipe's
        components, each a dict with a 'steps' entry whose items hold the
        instruction under the 'step' key.

    Returns
    -------
    list
        Every 'step' value, in component order and then step order. Empty when
        there are no components or none of them has any steps.

    Raises
    ------
    KeyError
        If a component has no 'steps' key or a step has no 'step' key.
    """
    # iterate the components directly so any iterable works, not only sequences
    return [
        step['step']
        for component in df_row
        for step in component['steps']
    ]

In [11]:
# flatten each recipe's analyzed instructions into a plain list of steps
master_df['simplifiedInstructions'] = list(map(make_list, master_df['analyzedInstructions']))

In [12]:
# # example of needing to go multiple layers down, get the steps, then switch higher up to a separate component of the meal to get its steps
# for a in range(len(master_df['analyzedInstructions'].iloc[0])):
#     for b in range(len(master_df['analyzedInstructions'].iloc[0][a]['steps'])):
#         print(master_df['analyzedInstructions'].iloc[0][a]['steps'][b]['step'])

---
### Light feature selection to make the dataframe smaller

In [13]:
# list every column currently in the dataframe
master_df.columns

Index(['vegetarian', 'vegan', 'glutenFree', 'dairyFree', 'veryHealthy',
       'cheap', 'veryPopular', 'sustainable', 'weightWatcherSmartPoints',
       'gaps', 'lowFodmap', 'aggregateLikes', 'spoonacularScore',
       'healthScore', 'creditsText', 'license', 'sourceName',
       'pricePerServing', 'extendedIngredients', 'id', 'title',
       'readyInMinutes', 'servings', 'sourceUrl', 'image', 'imageType',
       'summary', 'cuisines', 'dishTypes', 'diets', 'occasions',
       'instructions', 'analyzedInstructions', 'originalId',
       'spoonacularSourceUrl', 'preparationMinutes', 'cookingMinutes',
       'author', 'simplifiedIngredients', 'simplifiedInstructions'],
      dtype='object')

In [14]:
# Columns that will not be used by any model; individual models may still
# drop further features on top of these.
features_to_drop = [
    'weightWatcherSmartPoints', 'gaps', 'veryPopular', 'aggregateLikes',
    'spoonacularScore', 'healthScore', 'creditsText', 'license',
    'sourceName', 'pricePerServing', 'id', 'servings',
    'sourceUrl', 'image', 'imageType', 'diets',
    'originalId', 'spoonacularSourceUrl', 'cookingMinutes', 'preparationMinutes',
    'author', 'extendedIngredients', 'analyzedInstructions',
]

In [15]:
# remove every column named in features_to_drop
master_df = master_df.drop(labels=features_to_drop, axis='columns')

In [16]:
# Null check after the column drop, which also happened to remove every
# missing value: show the three largest per-column null counts.
# Note that some columns hold an empty list rather than nan; those cells are
# not counted as null and were not removed.
null_counts = master_df.isna().sum()
null_counts.sort_values(ascending=False).head(3)

vegetarian               0
readyInMinutes           0
simplifiedIngredients    0
dtype: int64

---
### Convert booleans to numerics

In [17]:
# flag columns that get recoded from booleans to 0/1
boolean_columns = [
    'vegetarian', 'vegan', 'glutenFree', 'dairyFree',
    'veryHealthy', 'cheap', 'sustainable', 'lowFodmap',
]

In [18]:
# Recode each flag column as 1 (true) / 0 (anything else).
# Equality is used instead of an `is True` identity test so the cell is safe to
# re-run: once the column already holds 0/1, `1 is True` is False and an
# identity test would silently reset every flag to 0.
for column in boolean_columns:
    master_df[column] = master_df[column].eq(True).astype('int64')

---
### Save as a csv

In [19]:
# write the cleaned recipes to disk; the dataframe index is written as the first column
output_path = '../data/recipes.csv'
master_df.to_csv(output_path)