In [1]:
# Import-a-ton
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline  

# Epicurious

Having a look at the recipes dataset.

In [None]:
# Load the dataset
epicurious = './datasets/epicurious/epi_r.csv'
df = pd.read_csv(epicurious)

In [None]:
df.info()

In [None]:
df.columns

In [None]:
df['rating'].describe()

In [None]:
# Columns with null values
df.columns[df.isnull().any()].tolist()

In [None]:
# These columns are not needed; collect every column that has a null and drop them all
columns_with_nulls = df.columns[df.isnull().any()].tolist()
df = df.drop(columns=columns_with_nulls)

In [None]:
df.head()

In [None]:
# Check for duplicated rows
df[df.duplicated('title', keep=False)].head()

In [None]:
# How many duplicated rows are there?
df.duplicated('title', keep=False).sum()

In [None]:
# Checking one:
df[df['title']=='Dried Pear Crisps ']

In [None]:
# Drop the duplicates.
# Strip the titles first so that rows whose titles differ only by surrounding
# whitespace count as the same recipe (same order as the JSON cleaning later
# in this notebook: strip, then drop duplicates).
df['title'] = df['title'].str.strip()
# Rebind instead of using inplace=True so the cell reads as "new frame from old".
df = df.drop_duplicates(subset='title', keep='first')

In [None]:
# Rows were removed above, so rebuild a clean 0..n-1 index (old labels are discarded)
df = df.reset_index(drop=True)

In [None]:
df.info()

In [None]:
# Checking the ratings column, most important!
df['rating'].value_counts()

In [None]:
# Drop the whitespace at the end of the title cells
df['title'] = df['title'].str.strip()

# Recipes

Extracting just the recipes, their titles, ratings, and preparation details.

In [None]:
# Take the first 8 columns as the recipe-level frame.
# `.copy()` makes `recipes` independent of `df`, so later column assignments on
# it are unambiguous; this matches how `ingredients` is sliced from `df`.
recipes = df.iloc[:, :8].copy()

In [None]:
# Don't need #cakeweek and #wasteless
hashtag_columns = ['#cakeweek', '#wasteless']
recipes = recipes.drop(hashtag_columns, axis=1)

In [None]:
recipes['22-minute meals'].value_counts()

In [None]:
recipes['3-ingredient recipes'].value_counts()

In [None]:
recipes['30 days of groceries'].value_counts()

In [None]:
recipes['advance prep required'].value_counts()

In [None]:
# Cast the binary flag columns to int64
binary_columns = [
    '22-minute meals',
    '3-ingredient recipes',
    '30 days of groceries',
    'advance prep required',
]

# One dtype per listed column; every other column keeps its current dtype
recipes = recipes.astype({column: 'int64' for column in binary_columns})

In [None]:
recipes.info()

In [None]:
recipes.head()

# Ingredients

Taking just the binary ingredients columns.

In [None]:
ingredients = df.iloc[:,8:].copy()

In [None]:
# All binary columns; change them all to integers
ingredients = ingredients.astype('int64')

In [None]:
ingredients.info()

In [None]:
list_of_ingredients = ingredients.columns.tolist()

In [None]:
# Pickle!
# Use a context manager so the file is flushed and closed as soon as the dump
# finishes, instead of leaving the handle open for the rest of the session.
with open('epi_list_ingredients.p', 'wb') as pickle_file:
    pickle.dump(list_of_ingredients, pickle_file)

In [None]:
# Average number of ingredients required
ingredients.sum(axis=1, numeric_only=True).mean()

# Cleaning Further Across both Datasets

In [None]:
# Bring in the pickled list of ingredients.
# NOTE: unpickling can execute arbitrary code; only load a file this notebook wrote itself.
# The context manager closes the file handle once the list has been loaded.
with open('epi_list_ingredients.p', 'rb') as pickle_file:
    list_of_ingredients = pickle.load(pickle_file)

In [None]:
list_of_ingredients[:5]

In [None]:
# Getting rid of the state names
#
# Using dataset from the internet
state_names = pd.read_csv('./us_cities_states_counties.csv', sep='|')

In [None]:
state_names.head()

In [None]:
# One entry per distinct value of 'State full' (keeping the last occurrence), lower-cased
unique_state_names = state_names['State full'].drop_duplicates(keep='last')
state_names_list = unique_state_names.str.lower().tolist()

In [None]:
state_names_list[:5]

In [None]:
type(state_names_list)

In [None]:
# Keep only the ingredient columns whose label is not in state_names_list
is_state_name = ingredients.columns.isin(state_names_list)
epi_ingredients = ingredients.columns[~is_state_name]

In [None]:
epi_ingredients = epi_ingredients.tolist()

In [None]:
# Remove names containing non-ASCII (Unicode) characters.
#
# The names must stay `str` and keep matching the column labels of
# `ingredients`, because they are compared against a list of strings and then
# used to select columns. On Python 3, `x.encode('ascii', 'ignore')` returns
# `bytes` (which never equals a `str`) and never raises, so the previous
# try/except could not filter anything.
# NOTE(review): this assumes the intent is to leave such columns out rather
# than to rename them -- confirm.
epi_ingredients_clean = []
for x in epi_ingredients:
    try:
        # Strict encode: raises if the name has any non-ASCII character.
        x.encode('ascii')
    except UnicodeEncodeError:
        continue
    epi_ingredients_clean.append(x)

In [None]:
epi_ingredients_clean[:10]

In [None]:
# Remove these labels by hand: there are only a few, so listing them explicitly is manageable at this point
remove_manually = ['epi + ushg','epi loves the microwave','anthony bourdain']

In [None]:
# Keep every name that is not listed in remove_manually, preserving order
epi_ingredients_clean_ = list(
    filter(lambda name: name not in remove_manually, epi_ingredients_clean)
)

In [None]:
len(epi_ingredients_clean_)

In [None]:
# Select all rows, but only the columns that survived the cleaning
epi_ingredients_clean_df = ingredients.loc[:, epi_ingredients_clean_]

In [None]:
epi_ingredients_clean_df.head()

In [None]:
# Average number of ingredients required
epi_ingredients_clean_df.sum(axis=1, numeric_only=True).mean()

The average is almost unchanged, so dropping those columns has not lost much information.

In [None]:
# Row-wise total of the ingredient columns, wrapped in a one-column frame
ingredient_counts = epi_ingredients_clean_df.sum(axis=1)
number_of_ingredients_per_recipe = ingredient_counts.to_frame(name='sum_of_ingredients')

In [None]:
number_of_ingredients_per_recipe.head()

In [None]:
len(number_of_ingredients_per_recipe)

In [None]:
# What does the distribution look like? Box plot of the per-recipe totals
number_of_ingredients_per_recipe.plot.box(
    y='sum_of_ingredients',
    use_index=True,
    figsize=(8, 8),
)
plt.show()

In [None]:
# Any recipes whose ingredient total is exactly zero?
has_no_ingredients = number_of_ingredients_per_recipe['sum_of_ingredients'] == 0
zero_ingredients = number_of_ingredients_per_recipe[has_no_ingredients]
zero_ingredients

In [None]:
zero_ingredients_list = zero_ingredients.index.tolist()

Hmmmm, I suppose I can remove them from my dataset.

In [None]:
epi_clean = pd.concat([recipes,epi_ingredients_clean_df], axis=1)

In [None]:
epi_clean.head()

In [None]:
# Let's check what are these zero ingredients recipes...
epi_clean.loc[epi_clean.index.isin(zero_ingredients_list)]

Very drop-able. Pity about 5117 and 9829, but it's ok...

In [None]:
# Remove the rows whose index labels are in zero_ingredients_list
epi_clean = epi_clean.drop(index=zero_ingredients_list)

In [None]:
epi_clean.info()

In [None]:
epi_recipes_clean = epi_clean.iloc[:,:6].copy()

In [None]:
# Write to CSV
epi_recipes_clean.to_csv('./datasets/epicurious/epi_recipes_clean.csv', index=False)

In [None]:
epi_ingredients_clean = epi_clean.iloc[:,6:].copy()

In [None]:
# Write to CSV
epi_ingredients_clean.to_csv('./datasets/epicurious/epi_ingredients_clean.csv', index=False)

## Trying an alternative, the json file

In [19]:
# Load the full-format recipes JSON.
# The resulting frame has named columns (title, rating, ingredients, ...), i.e.
# the file is consumed as a list of records, so declare orient='records'
# rather than 'values'; this also matches the orient used when the cleaned
# frame is written back out with to_json.
test = pd.read_json('../data/raw/full_format_recipes.json', orient='records', encoding='utf-8')

In [20]:
test.head()

Unnamed: 0,calories,categories,date,desc,directions,fat,ingredients,protein,rating,sodium,title
0,426.0,"[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...",2006-09-01 04:00:00,,"[1. Place the stock, lentils, celery, carrot, ...",7.0,"[4 cups low-sodium vegetable or chicken stock,...",30.0,2.5,559.0,"Lentil, Apple, and Turkey Wrap"
1,403.0,"[Food Processor, Onion, Pork, Bake, Bastille D...",2004-08-20 04:00:00,This uses the same ingredients found in boudin...,[Combine first 9 ingredients in heavy medium s...,23.0,"[1 1/2 cups whipping cream, 2 medium onions, c...",18.0,4.375,1439.0,Boudin Blanc Terrine with Red Onion Confit
2,165.0,"[Soup/Stew, Dairy, Potato, Vegetable, Fennel, ...",2004-08-20 04:00:00,,[In a large heavy saucepan cook diced fennel a...,7.0,"[1 fennel bulb (sometimes called anise), stalk...",6.0,3.75,165.0,Potato and Fennel Soup Hodge
3,,"[Fish, Olive, Tomato, Sauté, Low Fat, Low Cal,...",2009-03-27 04:00:00,The Sicilian-style tomato sauce has tons of Me...,[Heat oil in heavy large skillet over medium-h...,,"[2 tablespoons extra-virgin olive oil, 1 cup c...",,5.0,,Mahi-Mahi in Tomato Olive Sauce
4,547.0,"[Cheese, Dairy, Pasta, Vegetable, Side, Bake, ...",2004-08-20 04:00:00,,[Preheat oven to 350°F. Lightly grease 8x8x2-i...,32.0,"[1 12-ounce package frozen spinach soufflé, th...",20.0,3.125,452.0,Spinach Noodle Casserole


In [21]:
test['categories'][0]

['Sandwich',
 'Bean',
 'Fruit',
 'Tomato',
 'turkey',
 'Vegetable',
 'Kid-Friendly',
 'Apple',
 'Lentil',
 'Lettuce',
 'Cookie']

In [22]:
test['directions'][0]

['1. Place the stock, lentils, celery, carrot, thyme, and salt in a medium saucepan and bring to a boil. Reduce heat to low and simmer until the lentils are tender, about 30 minutes, depending on the lentils. (If they begin to dry out, add water as needed.) Remove and discard the thyme. Drain and transfer the mixture to a bowl; let cool.',
 '2. Fold in the tomato, apple, lemon juice, and olive oil. Season with the pepper.',
 '3. To assemble a wrap, place 1 lavash sheet on a clean work surface. Spread some of the lentil mixture on the end nearest you, leaving a 1-inch border. Top with several slices of turkey, then some of the lettuce. Roll up the lavash, slice crosswise, and serve. If using tortillas, spread the lentils in the center, top with the turkey and lettuce, and fold up the bottom, left side, and right side before rolling away from you.']

In [23]:
test['ingredients'][0]

['4 cups low-sodium vegetable or chicken stock',
 '1 cup dried brown lentils',
 '1/2 cup dried French green lentils',
 '2 stalks celery, chopped',
 '1 large carrot, peeled and chopped',
 '1 sprig fresh thyme',
 '1 teaspoon kosher salt',
 '1 medium tomato, cored, seeded, and diced',
 '1 small Fuji apple, cored and diced',
 '1 tablespoon freshly squeezed lemon juice',
 '2 teaspoons extra-virgin olive oil',
 'Freshly ground black pepper to taste',
 '3 sheets whole-wheat lavash, cut in half crosswise, or 6 (12-inch) flour tortillas',
 '3/4 pound turkey breast, thinly sliced',
 '1/2 head Bibb lettuce']

In [24]:
test['title'][0]

'Lentil, Apple, and Turkey Wrap '

In [25]:
# Columns that are not used in the rest of this section
unused_columns = ['fat','calories','date','protein','sodium','desc']
test = test.drop(columns=unused_columns)

In [26]:
test.head()

Unnamed: 0,categories,directions,ingredients,rating,title
0,"[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...","[1. Place the stock, lentils, celery, carrot, ...","[4 cups low-sodium vegetable or chicken stock,...",2.5,"Lentil, Apple, and Turkey Wrap"
1,"[Food Processor, Onion, Pork, Bake, Bastille D...",[Combine first 9 ingredients in heavy medium s...,"[1 1/2 cups whipping cream, 2 medium onions, c...",4.375,Boudin Blanc Terrine with Red Onion Confit
2,"[Soup/Stew, Dairy, Potato, Vegetable, Fennel, ...",[In a large heavy saucepan cook diced fennel a...,"[1 fennel bulb (sometimes called anise), stalk...",3.75,Potato and Fennel Soup Hodge
3,"[Fish, Olive, Tomato, Sauté, Low Fat, Low Cal,...",[Heat oil in heavy large skillet over medium-h...,"[2 tablespoons extra-virgin olive oil, 1 cup c...",5.0,Mahi-Mahi in Tomato Olive Sauce
4,"[Cheese, Dairy, Pasta, Vegetable, Side, Bake, ...",[Preheat oven to 350°F. Lightly grease 8x8x2-i...,"[1 12-ounce package frozen spinach soufflé, th...",3.125,Spinach Noodle Casserole


In [27]:
test['title'] = test['title'].str.strip()

In [28]:
test.shape

(20130, 5)

In [29]:
# Keep only the first row for each title
test = test.drop_duplicates(subset='title', keep='first')

In [30]:
test.shape

(17776, 5)

In [31]:
test['title'][0]

'Lentil, Apple, and Turkey Wrap'

In [32]:
test.to_json('../data/processed/epi_recipe_json_cleaned.json', force_ascii=False, orient='records')

### Highly Rated Ones

Keeping a separate copy of the recipes rated above 4.0, just in case it is useful later.

In [None]:
# Independent copy of the rows with a rating strictly above 4.0
is_highly_rated = test['rating'] > 4.0
rated_highly = test[is_highly_rated].copy()

In [None]:
rated_highly.head()

In [None]:
# Renumber the filtered rows from 0, discarding the old index labels
rated_highly = rated_highly.reset_index(drop=True)

In [None]:
rated_highly.to_csv('./datasets/epicurious/epi_recipe_rated_highly.csv', index=False)