## Cleaning

In [38]:
import json
import os
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import CountVectorizer

---
### Load in all the recipes

In [37]:
# verifies the folder
os.listdir('../data/api_data/')

['001.json',
 '002.json',
 '004.json',
 '005.json',
 '006.json',
 '007.json',
 '008.json',
 '009.json',
 '010.json',
 '011.json',
 '012.json',
 '013.json',
 '014.json',
 '015.json',
 '016.json',
 '017.json']

In [3]:
# load in all json as dataframes
list_of_dfs = []
for file in os.listdir('../data/api_data/')[1:]:
    with open(f'../data/api_data/{file}') as f:
        list_of_dfs.append(pd.DataFrame(json.load(f)['recipes']))

---
### Merge all the dataframes into one, dropping duplicates

In [5]:
master_df = list_of_dfs[0]

for dframe in list_of_dfs[1:]:
    master_df = master_df.append(dframe)

In [6]:
# random recipes were acquired using the api
# drop duplicates
master_df = master_df.drop_duplicates(subset=['id'])

In [7]:
master_df.shape

(761, 38)

---
### Clean up json format columns

In [8]:
json_columns = [
    'extendedIngredients',
    'analyzedInstructions'
]

Simplified list of ingredients

In [9]:
master_df['simplifiedIngredients'] = [[ing for ing in pd.DataFrame(cell)['nameClean']] for cell in master_df['extendedIngredients']]

Simplified list of instructions

In [10]:
# needed a function to pull all the steps from each component of the recipe
def make_list(df_row):
    list_of_steps = []
    for a in range(len(df_row)):
        for b in range(len(df_row[a]['steps'])):
            list_of_steps.append(df_row[a]['steps'][b]['step'])
    return list_of_steps

In [11]:
master_df['simplifiedInstructions'] = [make_list(row) for row in master_df['analyzedInstructions']]

In [12]:
# # example of needing to go multiple layers down, get the steps, then switch higher up to a separate component of the meal to get its steps
# for a in range(len(master_df['analyzedInstructions'].iloc[0])):
#     for b in range(len(master_df['analyzedInstructions'].iloc[0][a]['steps'])):
#         print(master_df['analyzedInstructions'].iloc[0][a]['steps'][b]['step'])

---
### A bit of feature selection to make the dataframe a bit smaller

In [13]:
master_df.columns

Index(['vegetarian', 'vegan', 'glutenFree', 'dairyFree', 'veryHealthy',
       'cheap', 'veryPopular', 'sustainable', 'weightWatcherSmartPoints',
       'gaps', 'lowFodmap', 'aggregateLikes', 'spoonacularScore',
       'healthScore', 'creditsText', 'license', 'sourceName',
       'pricePerServing', 'extendedIngredients', 'id', 'title',
       'readyInMinutes', 'servings', 'sourceUrl', 'image', 'imageType',
       'summary', 'cuisines', 'dishTypes', 'diets', 'occasions',
       'instructions', 'analyzedInstructions', 'originalId',
       'spoonacularSourceUrl', 'preparationMinutes', 'cookingMinutes',
       'author', 'simplifiedIngredients', 'simplifiedInstructions'],
      dtype='object')

In [14]:
# While some models may drop further features, these are the ones I don't plan on using at all
features_to_drop = [
    'weightWatcherSmartPoints',
    'gaps',
    'veryPopular',
    'aggregateLikes',
    'spoonacularScore',
    'healthScore',
    'creditsText',
    'license',
    'sourceName',
    'pricePerServing',
    'id',
    'servings',
    'sourceUrl',
    'image',
    'imageType',
    'diets',
    'originalId',
    'spoonacularSourceUrl',
    'cookingMinutes',
    'preparationMinutes',
    'author',
    'extendedIngredients',
    'analyzedInstructions'
]

In [15]:
# drop the features
master_df = master_df.drop(columns=features_to_drop)

In [16]:
# dropping the columns had the added benefit of clearing all of the missing values
# one thing to note however is that some of the columns have an empty list instead of nan
# those were not removed
master_df.isnull().sum().sort_values(ascending=False)[:3]

vegetarian               0
readyInMinutes           0
simplifiedIngredients    0
dtype: int64

---
### Convert booleans to numerics

In [17]:
boolean_columns = [
    'vegetarian',
    'vegan',
    'glutenFree',
    'dairyFree',
    'veryHealthy',
    'cheap',
    'sustainable',
    'lowFodmap'
]

In [18]:
for column in boolean_columns:
    master_df[column] = [1 if val is True else 0 for val in master_df[column]]

---
## TODO: Dummify dishType
so it doesn't break when exported as a csv and reimported elsewhere

In [24]:
master_df['dishTypes'].iloc[400][0]

'lunch'

In [63]:
# example of the original
print(type(master_df['dishTypes'].iloc[400]))
master_df['dishTypes'].iloc[400]

<class 'list'>


['lunch', 'main course', 'main dish', 'dinner']

---
## Simplify the cuisines column

In [51]:
simplifiedCuisine = {
    '[]': '',
    "['Mexican']": 'Mexican',
    "['Mediterranean', 'Italian', 'European']": 'Mediterranean',
    "['American']": 'American',
    "['German', 'European']": 'German',
    "['Mediterranean', 'French', 'European']": 'French',
    "['Creole', 'Cajun']": 'Cajun',
    "['Southern']": 'American',
    "['Chinese', 'Asian']": 'Chinese',
    "['Asian']": 'Asian',
    "['English', 'Scottish', 'British', 'European']": 'British',
    "['Mediterranean', 'European', 'Greek']": '',
    "['Indian', 'Asian']": 'Indian',
    "['African']": 'African',
    "['Korean', 'Asian']": 'Korean',
    "['Vietnamese', 'Asian']": 'Vietnamese',
    "['Spanish', 'European']": 'Spanish',
    "['Middle Eastern']": 'Middle Eastern',
    "['Eastern European', 'European']": 'Eastern European',
    "['European', 'Irish']": 'Irish',
    "['Cajun', 'Creole']": 'Cajun',
    "['South American', 'Latin American']": 'South American',
    "['Thai', 'Asian']": 'Thai',
    "['Jewish']": 'Middle Eastern',
    "['Japanese', 'Asian']": 'Japanese'
}

In [52]:
master_df['cuisines'] = [str(val) for val in master_df['cuisines']]
master_df['cuisines'] = master_df['cuisines'].map(simplifiedCuisine)

---
## Convert the ingredients from list to string form

In [71]:
master_df['simplifiedIngredients']

0     [wheat flour, sugar, butter, milk, low sodium ...
1     [butter, sugar, golden brown sugar, egg, vanil...
2     [wheat flour, unsalted butter, water, egg yolk...
3     [wheat flour, almond meal, turbinado sugar, ba...
4     [butter, powdered sugar, vanilla extract, whea...
                            ...                        
84    [semisweet chocolate, butter, cream of tartar,...
89    [95 percent lean ground beef, yellow onion, ce...
90    [boneless skinless chicken breast, ham, chedda...
95    [dried chickpeas, garlic, lemon juice, olive o...
96             [cream, dulce de leche, sugar, egg yolk]
Name: simplifiedIngredients, Length: 761, dtype: object

In [None]:
[str]

---
### Save as a csv

In [None]:
master_df.to_csv('../data/recipes.csv')