In [4]:
import numpy as np
import pandas as pd
import re

In [5]:
# vecorization of strings is not supported in numpy so we are stuck with loops
data = ["peter", "Paul", "MARY", "gUIDO"]
[s.capitalize() for s in data]

['Peter', 'Paul', 'Mary', 'Guido']

In [6]:
# this will break if there are any none values
data = ["peter", "Paul", None, "MARY", "gUIDO"]
[s if s is None else s.capitalize() for s in data]

['Peter', 'Paul', None, 'Mary', 'Guido']

In [7]:
# this can be error prone and verbose' pandas has features to address this such as str.capitalize()
names = pd.Series(data)
names.str.capitalize()

0    Peter
1     Paul
2     None
3     Mary
4    Guido
dtype: object

In [8]:
# tables of pandas string methods
monte = pd.Series(
    [
        "Graham Chapman",
        "John Cleese",
        "Terry Gilliam",
        "Eric Idle",
        "Terry Jones",
        "Michael Palin",
    ]
)
# nearly all python's built in string methods are built into pands
monte.str.lower()

0    graham chapman
1       john cleese
2     terry gilliam
3         eric idle
4       terry jones
5     michael palin
dtype: object

In [9]:
# others return nuumbers
monte.str.len()

0    14
1    11
2    13
3     9
4    11
5    13
dtype: int64

In [10]:
# or boolean values
monte.str.startswith("T")

0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool

In [11]:
monte.str.split()

0    [Graham, Chapman]
1       [John, Cleese]
2     [Terry, Gilliam]
3         [Eric, Idle]
4       [Terry, Jones]
5     [Michael, Palin]
dtype: object

In [12]:
# methods using regular expressions
# there are several methods that accept regular expressions to examine the content of each string element and follow the conventions of python's re module (regular expressions)
# match, extract, findall, replace, contains, count, split, rsplit
# with these we can do a wide range of operations such as asking for a contiguous group of characters, at the begining of each element
monte.str.extract("([A-Za-z]+)", expand=False)

0     Graham
1       John
2      Terry
3       Eric
4      Terry
5    Michael
dtype: object

In [13]:
# we can do something more complicated like finding all names that start and end with a consonant making use of the start of string (^) and end of string ($) regular expression characters
monte.str.findall(r"^[^AEIOU].*[^aeiou]$")

0    [Graham Chapman]
1                  []
2     [Terry Gilliam]
3                  []
4       [Terry Jones]
5     [Michael Palin]
dtype: object

In [17]:
# there are some miscellaneous methods that enable other convienient operations
# these include: get, slice, slice_replace, cat, repeat, normalize, pad, wrap, join, get_dummies
# the get and slice operations enable vecorized element access from each array
# we can get a slic of the first three characters of each array using str.slice(0,3)
monte.str[0:3]  # this is shorthand for monte.str.slice(0,3)

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
5   NaN
dtype: float64

In [18]:
# these indexing methods also let you access elements of arrays returned by split
# to extract the last name of eac entry we can combine split and str indexing
monte.str.split().str[-1]

0    Chapman
1     Cleese
2    Gilliam
3       Idle
4      Jones
5      Palin
dtype: object

In [19]:
# indicator variables
# the get dummies method is useful when your data has a column containing some sort of coded indicator we might have a dataset that contains information in the form of codes such as A=born in America, B=born in the United Kingdom, C=Canada
full_monte = pd.DataFrame(
    {"name": monte, "info": ["B|C|D", "B|D", "A|C", "B|D", "B|C", "B|C|D"]}
)
full_monte

Unnamed: 0,name,info
0,Graham Chapman,B|C|D
1,John Cleese,B|D
2,Terry Gilliam,A|C
3,Eric Idle,B|D
4,Terry Jones,B|C
5,Michael Palin,B|C|D


In [20]:
# the get dummies routine lets us split out these indicator variables into a DataFrame
full_monte["info"].str.get_dummies("|")
# with these operations as building blocks you can construct an endless range of string processing procedures to suit your needs

Unnamed: 0,A,B,C,D
0,0,1,1,1
1,0,1,0,1
2,1,0,1,0
3,0,1,0,1
4,0,1,1,0
5,0,1,1,1


RECIPE DATABASE

In [23]:
# these vecorized string operations become most usefull in the process of cleaning up messy, real world data
# our goal is to parse the recipe data into ingredient lists, so we can quickly find a recipe based on some ingredients we have on hand
recipes = pd.read_json("Data/recipeitems.json", lines=True)
recipes.shape

(173278, 17)

In [24]:
recipes.iloc[0]

_id                                {'$oid': '5160756b96cc62079cc2db15'}
name                                    Drop Biscuits and Sausage Gravy
ingredients           Biscuits\n3 cups All-purpose Flour\n2 Tablespo...
url                   http://thepioneerwoman.com/cooking/2013/03/dro...
image                 http://static.thepioneerwoman.com/cooking/file...
ts                                             {'$date': 1365276011104}
cookTime                                                          PT30M
source                                                  thepioneerwoman
recipeYield                                                          12
datePublished                                                2013-03-11
prepTime                                                          PT10M
description           Late Saturday afternoon, after Marlboro Man ha...
totalTime                                                           NaN
creator                                                         

In [25]:
# there is a lof of information much of is it very mess and requires some processing to become useful
# the ingredient list is in string format
recipes.ingredients.str.len().describe()

count    173278.000000
mean        244.617926
std         146.705285
min           0.000000
25%         147.000000
50%         221.000000
75%         314.000000
max        9067.000000
Name: ingredients, dtype: float64

In [26]:
# the ingredient list average length is 250 characters with a minimum of 0 and a maximum of nearly 10,000
# we want to see which recipe has the longest ingredient list
recipes.name[np.argmax(recipes.ingredients.str.len())]

'Carrot Pineapple Spice &amp; Brownie Layer Cake with Whipped Cream &amp; Cream Cheese Frosting and Marzipan Carrots'

In [28]:
# we can do other aggregate explorations; for example, we can see how many of the recipes are for breakfast foods
recipes.description.str.contains("[Bb]reakfast").sum()

3524

In [29]:
# or how many of them involve cinnamon
recipes.ingredients.str.contains("[Cc]innamon").sum()

10526

In [32]:
# we could even look to see whether any recipes misspell the ingredient as "cinamon"
recipes.ingredients.str.contains("[Cc]inamon").sum()

11

Recipe Recommender

In [33]:
# given a list of ingredients, we want to find all recipes that use all those ingredients while conceptually simple, the implementation can be a bit tricky due to the heterogeneous nature of the data.
# we'll start with a simple list of common ingredients and simply search to see whether they are in each recipe's ingredient list we'll start with the herbs and spices for the time being
spice_list = [
    "salt",
    "pepper",
    "oregano",
    "sage",
    "parsley",
    "rosemary",
    "tarragon",
    "thyme",
    "paprika",
    "cumin",
]

In [34]:
# we then build a boolean dataframe consisting of True and False values indicating whether this ingredient appears in the list
spice_df = pd.DataFrame(
    {
        spice: recipes.ingredients.str.contains(spice, re.IGNORECASE)
        for spice in spice_list
    }
)
spice_df.head()

Unnamed: 0,salt,pepper,oregano,sage,parsley,rosemary,tarragon,thyme,paprika,cumin
0,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,True,True,False,False,False,False,False,False,False,True
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False


In [36]:
# now as an example let's say we'd like to find a recipe that uses parsley, poprika, and tarragon we can compute this very quickly using the query() method of DataFrame
selection = spice_df.query("parsley & paprika & tarragon")
len(selection)

10

In [37]:
# we only have 10 recipes with this combindation we can use the index returned by this selection to discover the names of those recipes
recipes.name[selection.index]
# now that we have narrowed down our recipe selection from 175,000 to 10 we are in position to make a more informed decision about what we want to cook for dinner

2069      All cremat with a Little Gem, dandelion and wa...
74964                         Lobster with Thermidor butter
93768      Burton's Southern Fried Chicken with White Gravy
113926                     Mijo's Slow Cooker Shredded Beef
137686                     Asparagus Soup with Poached Eggs
140530                                 Fried Oyster Po’boys
158475                Lamb shank tagine with herb tabbouleh
158486                 Southern fried chicken in buttermilk
163175            Fried Chicken Sliders with Pickles + Slaw
165243                        Bar Tartine Cauliflower Salad
Name: name, dtype: object