In [4]:
# https://github.com/alexnguyen9/recipe-matcher

In [34]:
# Prints a fixed marker string; presumably a smoke test that the kernel is reachable — TODO confirm.
print("connected")

connected


In [35]:
import pandas as pd
import itertools
import numpy as np
import pickle
import nltk
import re

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

In [37]:
from nltk.stem.wordnet import WordNetLemmatizer

In [38]:
# Bucket name that is substituted into every s3:// URI built for the recipe files.
bucket='halifaxfoodie-ml-data'

# Object key of the allrecipes file and the full s3://<bucket>/<key> URI built from it.
allrecipes_recipes = 'allrecipes-recipes.json'
allrecipes_recipesJson = 's3://{}/{}'.format(bucket, allrecipes_recipes)

In [39]:
# Object key of the bbc.co.uk file and the full s3://<bucket>/<key> URI built from it
# (relies on `bucket` having been assigned already).
bbccouk_recipes = 'bbccouk-recipes.json'
bbccouk_recipesJson = 's3://{}/{}'.format(bucket, bbccouk_recipes)

In [40]:
# Object key of the epicurious file and the full s3://<bucket>/<key> URI built from it
# (relies on `bucket` having been assigned already).
epicurious_recipes = 'epicurious-recipes.json'
epicurious_recipesJson = 's3://{}/{}'.format(bucket, epicurious_recipes)

In [41]:
# Single shared lemmatizer instance; singular() calls wnl.lemmatize() on every word it is given.
wnl = WordNetLemmatizer()
def singular(x):
    """Lemmatize every word of ``x`` with the module-level ``wnl`` lemmatizer.

    Args:
        x: iterable of word strings.

    Returns:
        A new list holding ``wnl.lemmatize(word)`` for each word, in input order.
    """
    lemmas = []
    for word in x:
        lemmas.append(wnl.lemmatize(word))
    return lemmas

# get only nouns and adjectives (for some reason it kept removing 'pepper' and 'chicken')
def get_nouns_and_adj(l):
    """Keep only the noun/adjective tokens of a list of ingredient strings.

    Each string is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``. A token is kept when its tag is one of NN, JJ, NNS, NNP,
    NNPS, or when the token itself is 'pepper' or 'chicken' (kept regardless
    of the tag they receive).

    Args:
        l: iterable of ingredient strings.

    Returns:
        One flat list with the kept tokens of all strings, in encounter order.
    """
    wanted_tags = ['NN', 'JJ', 'NNS', 'NNP', 'NNPS']
    always_keep = ['pepper', 'chicken']

    kept = []
    for phrase in l:
        tagged = nltk.pos_tag(nltk.word_tokenize(phrase))
        for pair in tagged:
            if pair[1] in wanted_tags or pair[0] in always_keep:
                kept.append(pair[0])
    return kept

# change certain compound words to singular words

# remove measure words, preparation adjectives
# Built once at import time (instead of on every call) and stored as a frozenset so
# membership tests are O(1). The misspelled 'refridgerator' / 'refridgerated' entries
# are kept for compatibility; the correctly spelled forms are added next to them so
# that properly spelled tokens are filtered as well.
_REMOVE_WORDS = frozenset([
    'quart','liter','ml','teaspoon','pt','tablespoon','cup','ounce','fluid','gallon','pint',
    'pound','slice','sheet','gram','stick','bulb','inch','pinch','large',
    'small','light','sprig','quarter','half','whole','handful','good','best','fresh',
    'package','can','packed','stem','medium','piece','stalk','finishing','bottle','container',
    'clove','ear','fine','quality','coarse','bunch','wedge','flat','ground','lb','c','tbs',
    'thin','wide','refridgerator','refrigerator','equipment','standard','b','unprocessed','en','round',
    'optional','tsp','warm','cold','chopped','boiling','kitchen','length','lengthwise','smallish',
    'quick','dry','wet','new','few','many','splash','drop','topping','pure','regular','oz',
    'jar','envelope','extra','generous','hard','old','little','different','low','fat','gluten','free',
    'raw','square','foil','special','store','soft','frozen','bag','recipe','decadent','spiral',
    'mini','simple','cooked','dark','packet','pre','box','unsalted','firm','other','tb','thread',
    'strand','strip','thick','restaurant','accompaniment','kg','lbs','ripe','boneless','range','zesty',
    'sodium','lowfat','original','tbsp','fl','peel','available','dash','nonstick','adjustable',
    'natural','zest','preheat','head','refridgerated','refrigerated','such','uncooked','canned','size','skinless',
    'frying','baby','artisan','organic','sliced','cooled','chilled','part',
    'peeled','bottled','unpeeled','crunchy','litre','additional','addition','wrapped','sweetened',
])


def remove_words(x):
    """Drop measurement words and preparation adjectives from a token list.

    Matching is exact and case-sensitive against ``_REMOVE_WORDS``.

    Args:
        x: iterable of word strings.

    Returns:
        A new list with the words of ``x`` that are not in ``_REMOVE_WORDS``;
        order and duplicates of the remaining words are preserved.
    """
    return [word for word in x if word not in _REMOVE_WORDS]




def clean_ingredient(text):
    """Normalize one raw ingredient string before part-of-speech tagging.

    The steps are applied in this order:
      1. drop everything from the first comma onwards, then lower-case
      2. replace the accented letters 'é' and 'î' with 'e' and 'i'
      3. replace every remaining non-ASCII character with a space
      4. drop everything from the first ' with' (leading space included) onwards
      5. drop parenthesised groups that contain no nested parentheses
      6. collapse each run of non-word characters into one space

    Args:
        text: a single ingredient string.

    Returns:
        The cleaned string.
    """
    text = re.sub(r",.*$", "", text).lower()
    text = text.replace('é', 'e').replace('î', 'i')
    text = re.sub(r'[^\x00-\x7f]', ' ', text)
    text = re.sub(r" with.*$", "", text)
    # raw string: '\(' in a normal string literal is an invalid escape sequence
    text = re.sub(r'\([^()]*\)', "", text)
    return re.sub(r'\W+', " ", text)


if __name__ == '__main__':

    print("Reading JSON Files...")
    data_bbc = pd.read_json(bbccouk_recipesJson,lines=True)
    #data_cookstr = pd.read_json("data/cookstr-recipes.json",lines=True)
    data_epi = pd.read_json(epicurious_recipesJson,lines=True)
    data_ar = pd.read_json(allrecipes_recipesJson,lines=True)

    # fix epicurious food dataframe: align its column names with the other sources
    # and prefix its url values with the site host
    data_epi = data_epi.rename(columns={'hed':'title','prepSteps':'instructions'})
    data_epi['url'] = data_epi['url'].apply(lambda x: 'www.epicurious.com' + x)

    # combine all the food recipes; join='inner' keeps only the columns shared by all
    # three frames (expected: title, ingredients, instructions, url)
    combined = pd.concat([data_bbc,data_epi,data_ar],join='inner',ignore_index=True)

    # remove this food item since I think there is a bug in the scraper
    combined = combined[combined.title != 'Johnsonville® Three Cheese Italian Style Chicken Sausage Skillet Pizza']

    # Plain reassignment instead of inplace=True / attribute assignment: `combined` is a
    # filtered slice at this point and mutating it in place can trigger
    # SettingWithCopyWarning.
    combined = combined.dropna()

    # get recipes with at least 3 or more ingredients
    combined = combined[combined.ingredients.apply(len) > 2]

    # turn the list of instruction steps into a single string
    combined = combined.assign(instructions=combined.instructions.apply(' '.join))

    # reset index
    combined = combined.reset_index(drop=True)

    print("Cleaning ingredients...")
    # clean every ingredient string of every recipe
    recipe = combined.ingredients.apply(lambda lines: [clean_ingredient(line) for line in lines])

    recipe = recipe.apply(get_nouns_and_adj) # get only nouns and adjectives
    recipe = recipe.apply(singular) # convert to singular words
    recipe = recipe.apply(remove_words) # remove irrelevant words

    print("Vectorizing ingredients...")
    # define the count vectorizer
    vc = CountVectorizer(stop_words='english',min_df=60,binary=True)

    # `recipe` holds one list of tokens per recipe, but CountVectorizer's default
    # analyzer lower-cases and tokenizes each document as a string, so a list would
    # fail with AttributeError. Join the tokens into one space-separated string per
    # recipe; `recipe` itself is left as lists.
    documents = recipe.apply(' '.join)

    # this is the document term matrix from our recipes
    X = vc.fit_transform(documents.values)

    print("Pickling...")

Reading JSON Files...
Cleaning ingredients...
Vectorizing ingredients...
Pickling...


In [30]:
# Persist the three artifacts produced by the processing cell (X, combined, vc must
# already exist). Each file is opened in a `with` block so the handle is flushed and
# closed even if pickling raises.
with open('food_matrix.pkl','wb') as matrix_file:
    pickle.dump(X.toarray().astype(bool), matrix_file) # save as boolean array to save space
with open('main_data.pkl','wb') as data_file:
    pickle.dump(combined, data_file)
with open('transformer.pkl','wb') as transformer_file:
    pickle.dump(vc, transformer_file)