## Cleaning

In [1]:
import json
import os
import pandas as pd
import numpy as np

---
### Load in all the recipes

In [2]:
# verify the folder contents, skipping the checkpoints entry
# os.listdir returns entries in arbitrary order, so slicing off the first item ([1:])
# is not guaranteed to remove the checkpoints entry; keep only the *.json files
# explicitly and sort them so the listing is the same on every run
sorted(name for name in os.listdir('../data/api_data/') if name.endswith('.json'))

['.json',
 '001.json',
 '002.json',
 '004.json',
 '005.json',
 '006.json',
 '007.json',
 '008.json',
 '009.json',
 '010.json',
 '011.json',
 '012.json',
 '013.json']

In [3]:
# load every recipe json file into its own dataframe
# os.listdir returns entries in arbitrary order, so slicing off the first item ([1:])
# cannot be relied on to skip the checkpoints entry (it could drop a data file instead);
# select the *.json files explicitly and sort them so the load order, and therefore
# which copy of a duplicated recipe is seen first, is reproducible
data_dir = '../data/api_data/'
list_of_dfs = []
for file in sorted(name for name in os.listdir(data_dir) if name.endswith('.json')):
    # read as utf-8 explicitly instead of depending on the platform default encoding
    with open(os.path.join(data_dir, file), encoding='utf-8') as f:
        # each file holds an object whose 'recipes' key is the list of recipe records
        list_of_dfs.append(pd.DataFrame(json.load(f)['recipes']))

In [4]:
# report the (rows, columns) shape of every loaded dataframe, one per line
for shape in (frame.shape for frame in list_of_dfs):
    print(shape)

(100, 38)
(54, 37)
(50, 37)
(100, 37)
(100, 37)
(100, 37)
(100, 37)
(100, 38)
(100, 37)
(100, 38)
(100, 37)
(100, 37)
(100, 37)


---
### Merge all the dataframes into one, dropping duplicates

In [5]:
# stack all of the per-file dataframes into one dataframe
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and calling it
# in a loop re-copies the growing frame on every iteration; a single pd.concat builds the
# same result: rows in list order, original row labels kept, and the union of the columns
# in order of first appearance (sort=False), with nan where a file lacks a column
master_df = pd.concat(list_of_dfs, sort=False)

In [6]:
# the recipes were fetched at random from the api, so the same recipe id can show up
# more than once; flag every repeat of an id (the first occurrence is not flagged)
# and keep only the rows that are not flagged
is_repeat = master_df.duplicated(subset=['id'])
master_df = master_df[~is_repeat]

In [7]:
# (rows, columns) of the merged dataframe once duplicate recipe ids have been dropped
master_df.shape

(692, 38)

---
### Feature selection to make the dataframe smaller

In [8]:
# list every available column before deciding which ones to drop
master_df.columns

Index(['vegetarian', 'vegan', 'glutenFree', 'dairyFree', 'veryHealthy',
       'cheap', 'veryPopular', 'sustainable', 'weightWatcherSmartPoints',
       'gaps', 'lowFodmap', 'aggregateLikes', 'spoonacularScore',
       'healthScore', 'creditsText', 'license', 'sourceName',
       'pricePerServing', 'extendedIngredients', 'id', 'title',
       'readyInMinutes', 'servings', 'sourceUrl', 'image', 'imageType',
       'summary', 'cuisines', 'dishTypes', 'diets', 'occasions',
       'instructions', 'analyzedInstructions', 'originalId',
       'spoonacularSourceUrl', 'preparationMinutes', 'cookingMinutes',
       'author'],
      dtype='object')

In [9]:
# Columns that will not be used by any model; individual models may still
# drop further features on top of these
features_to_drop = [
    'weightWatcherSmartPoints', 'gaps', 'veryPopular',
    'aggregateLikes', 'spoonacularScore', 'healthScore',
    'creditsText', 'license', 'sourceName',
    'pricePerServing', 'id', 'servings',
    'sourceUrl', 'image', 'imageType',
    'diets', 'originalId', 'spoonacularSourceUrl',
    'cookingMinutes', 'preparationMinutes', 'author',
]

In [10]:
# remove the unused columns from the dataframe
master_df = master_df.drop(features_to_drop, axis='columns')

In [11]:
# count the missing values per column and show the three largest counts
# all of them are zero: dropping the columns had the side benefit of removing every nan
# caveat: some columns hold an empty list instead of nan; those are not counted
# as missing here and were not removed
(
    master_df
    .isna()
    .sum()
    .sort_values(ascending=False)
    .head(3)
)

analyzedInstructions    0
lowFodmap               0
vegan                   0
dtype: int64

In [12]:
# write the cleaned recipes out as a csv file
# NOTE(review): the row index is written as well (to_csv default); it was never reset
# after the merge, so it repeats across source files - confirm downstream readers expect it
output_path = '../data/recipes.csv'
master_df.to_csv(output_path)