### Data Gathering and Pre-Processing

In [2]:
from recipe_scrapers import scrape_me

In [3]:
# Test out recipe scraper
scrape = scrape_me('https://www.allrecipes.com/recipe/257938/spicy-thai-basil-chicken-pad-krapow-gai')
print(scrape.title())
print(scrape.ingredients())

Spicy Thai Basil Chicken (Pad Krapow Gai)
['1/3 cup chicken broth', '1 tablespoon oyster sauce', '1 tablespoon soy sauce, or as needed', '2 teaspoons fish sauce', '1 teaspoon white sugar', '1 teaspoon brown sugar', '2 tablespoons vegetable oil', '1 pound skinless, boneless chicken thighs, coarsely chopped', '1/4 cup sliced shallots', '4 cloves garlic, minced', '2 tablespoons minced Thai chilies, Serrano, or other hot pepper', '1 cup very thinly sliced fresh basil leaves', '2 cups hot cooked rice']


In [4]:
# Remove common filler words that aren't ingredients; I actually ended up keeping some words that could be latent features
import pandas
data = pandas.read_csv("words_remove.csv")
words_remove = data['Words'].tolist()
print(words_remove)

['1', '2', '3', '4', '5', '6', '7', '8', '9', "'", ',', '/', 'baking', 'brown', 'cans', 'chopped', 'cloves', 'coarsely', 'crumbled', 'crumbs', 'crushed', 'cup', 'cups', 'cut', 'dark', 'divided', 'minced', 'mix', 'needed', 'optional', 'other', 'ounces', 'ounce', 'package', 'pan', 'parts', 'pound', 'sliced', 'tablespoons', 'tablespoon', 'tbs', 'tbsp', 'teaspoons', 'teaspoon', 'tsp', 'vegetable', 'white', 'large', 'purpose', 'peeled', 'discarded', 'finely', 'finely', 'pinches', 'pinch', 'shears', 'grey', 'serving', 'slices', 'slivered']


In [5]:
def clean_ingredients():
    for i in range(len(words_remove)):
        global ingredients
        ingredients = [x.replace(words_remove[i],"") for x in ingredients]
        ingredients = [x.replace("  "," ") for x in ingredients]
        ingredients = [x.strip() for x in ingredients]

In [6]:
# Test out writing the cleaned-up ingredients to a csv
import csv
filename = "recipetest.csv"
f = open(filename, "w")
headers = "ingredients"
f.write(headers)
ingredients = scrape.ingredients()
#ingredients = ''.join(scrape.ingredients())
#f.write(ingredients.replace("'", ""))

clean_ingredients()

with open(filename, "w") as output:
    writer = csv.writer(output, lineterminator='\n')
    for val in ingredients:
        writer.writerow([val])
f.close()
print(ingredients)

['chicken broth', 'oyster sauce', 'soy sauce or as', 'fish sauce', 'sugar', 'sugar', 'oil', 'skinless boneless chicken thighs', 'shallots', 'garlic', 'Thai chilies Serrano or hot pepper', 'very thinly fresh basil leaves', 's hot cooked rice']


This gives us a list of cleaned-up ingredients that we could perhaps put into a dictionary and then create a matrix form. But I'm going to try a different method. I will concatenate the items in the list into a string which will allow me to use TfidfVectorizer from Sci-Kit Learn.

In [7]:
df_recipes = pandas.read_csv("recipe_links.csv")
recipe_links = df_recipes['Link'].tolist()

In [8]:
ingredients_combined = []
titles_list = []
for j in range(len(recipe_links)):
    scrape = scrape_me(recipe_links[j])
    ingredients = scrape.ingredients()
    clean_ingredients()
    ingredients_combined.append(' '.join(ingredients))
    titles_list.append(scrape.title())

In [9]:
ingredients_matrix = df_recipes
ingredients_matrix['Title'] = titles_list
ingredients_matrix['Ingredients'] = ingredients_combined
ingredients_matrix.head()

Unnamed: 0,Link,Title,Ingredients
0,https://www.allrecipes.com/recipe/257938/spicy...,Spicy Thai Basil Chicken (Pad Krapow Gai),chicken broth oyster sauce soy sauce or as fis...
1,https://www.allrecipes.com/recipe/238840/quick...,Quick Crispy Parmesan Chicken Breasts,cooking spray ko bread Parmesan cheese paprika...
2,https://www.allrecipes.com/recipe/23847/pasta-...,Pasta Pomodoro,( ) angel hair pasta olive oil onion garlic s ...
3,https://www.allrecipes.com/recipe/50435/fry-br...,Fry Bread Tacos II,Toppings: (. ) can pinto beans with liquid pic...
4,https://www.allrecipes.com/recipe/142488/amazi...,Amazing Spicy Grilled Shrimp,olive oil sesame oil fresh parsley hot sauce g...


In [33]:
# Test out adding input words from user
test_words = 'chicken broth oyster sauce soy sauce'
df_input = pandas.DataFrame({'Link':['N/A'],'Title':['User input'],'Ingredients':[test_words]})
df_input = df_input[['Link','Title','Ingredients']]
df_ingredients_merge = pandas.concat([df_input, ingredients_matrix], ignore_index=True)
df_ingredients_merge.head()

Unnamed: 0,Link,Title,Ingredients
0,,User input,chicken broth oyster sauce soy sauce
1,https://www.allrecipes.com/recipe/257938/spicy...,Spicy Thai Basil Chicken (Pad Krapow Gai),chicken broth oyster sauce soy sauce or as fis...
2,https://www.allrecipes.com/recipe/238840/quick...,Quick Crispy Parmesan Chicken Breasts,cooking spray ko bread Parmesan cheese paprika...
3,https://www.allrecipes.com/recipe/23847/pasta-...,Pasta Pomodoro,( ) angel hair pasta olive oil onion garlic s ...
4,https://www.allrecipes.com/recipe/50435/fry-br...,Fry Bread Tacos II,Toppings: (. ) can pinto beans with liquid pic...


In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel

cv = CountVectorizer(analyzer='word', stop_words='english', binary=True)
cv_matrix = cv.fit_transform(ingredients_matrix['Ingredients'])

print(cv_matrix.toarray())
print(cv_matrix)
#print(cv.get_feature_names())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
  (0, 338)	1
  (0, 103)	1
  (0, 231)	1
  (0, 22)	1
  (0, 164)	1
  (0, 441)	1
  (0, 299)	1
  (0, 201)	1
  (0, 376)	1
  (0, 82)	1
  (0, 437)	1
  (0, 169)	1
  (0, 378)	1
  (0, 440)	1
  (0, 39)	1
  (0, 391)	1
  (0, 278)	1
  (0, 427)	1
  (0, 152)	1
  (0, 401)	1
  (0, 361)	1
  (0, 287)	1
  (0, 50)	1
  (0, 78)	1
  (1, 439)	1
  :	:
  (101, 91)	1
  (101, 210)	1
  (101, 326)	1
  (101, 54)	1
  (101, 100)	1
  (101, 370)	1
  (101, 344)	1
  (101, 172)	1
  (101, 282)	1
  (101, 183)	1
  (101, 472)	1
  (101, 180)	1
  (101, 469)	1
  (101, 338)	1
  (101, 164)	1
  (101, 169)	1
  (101, 440)	1
  (101, 39)	1
  (101, 391)	1
  (101, 278)	1
  (101, 427)	1
  (101, 152)	1
  (101, 401)	1
  (101, 361)	1
  (101, 78)	1


Here we can use CountVectorizer to get a visual matrix/array represenation of what words were in each ingredient list and also see what feature names are being used from the scraped ingredients list.

In [35]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), stop_words='english', binary=True)
tfidf_matrix = tf.fit_transform(ingredients_matrix['Ingredients'])
tfidf_matrix

<102x3774 sparse matrix of type '<class 'numpy.float64'>'
	with 6478 stored elements in Compressed Sparse Row format>

TFidfVectorizer produces normalized vectors so we can use linear_kernal for cosine similarity. 
ngram_range allows us to pick up single words, two words, and three words in a sequence as they may be important.
I put binary=True because I don't care how many times an ingredient is mentioned in a recipe, I just care if it is listed or not.

In [36]:
#cosine similarity
recipe_comparitor = 0 # Using 0 returns the first item in the dataframe which is the user's input
cosine_similarities = linear_kernel(tfidf_matrix[recipe_comparitor], tfidf_matrix).flatten()
print('Comparing recipes to: ' + str(df_ingredients_merge['Title'][recipe_comparitor]))
print(cosine_similarities)

Comparing recipes to: User input
[1.         0.05219031 0.0512888  0.00602763 0.0214909  0.09542026
 0.05479357 0.00903535 0.01834866 0.00617547 0.05854258 0.0136611
 0.         0.01081819 0.02523115 0.00455815 0.00568003 0.06157857
 0.01003905 0.00879941 0.01303009 0.         0.02714384 0.01027347
 0.00569621 0.00816928 0.01152151 0.02001481 0.01025616 0.00358949
 0.02771912 0.00826375 0.00482704 0.00455242 0.00523385 0.
 0.0386541  0.03395485 0.00782916 0.00446972 0.02630707 0.00325493
 0.00235448 0.00698    0.01134075 0.02279882 0.00393201 0.00420737
 0.         0.004476   0.02681076 0.01087875 0.02375911 0.00850321
 0.00349959 0.00428596 0.         0.01299156 0.01841425 0.00882165
 0.01714502 0.01231448 0.00892337 0.03179183 0.0031044  0.01091204
 0.00631969 0.00516548 0.00764755 0.00460149 0.04769441 0.02247445
 0.02849799 0.00876959 0.00522117 0.06984408 0.01480549 0.00418577
 0.00577189 0.         0.01481508 0.00418905 0.09875691 0.02720382
 0.00956754 0.00259161 0.01016684 0.03

In [59]:
# This will compare every recipe with every recipe
#cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
#for idx, row in ingredients_matrix.iterrows():
#    similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
#    similar_items = [(cosine_similarities[idx][i], ingredients_matrix['Title'][i]) for i in similar_indices]
#similar_items

In [60]:
cosine_index = cosine_similarities.argsort()[:-12:-1] # Return the 10 best matches for recipes not including recipe used for comparison
cosine_index

array([  0, 101,  82,   5,  75, 100,  17,  10,   6,   1,   2], dtype=int64)

In [61]:
similar_items = []
for i in range(len(cosine_index)):
    similar_items.append([(ingredients_matrix['Title'][cosine_index[i]]), cosine_similarities[cosine_index[i]]])
del similar_items[0] # Delete first item from list as that will be the recipe being used for comparison
print('Showing 10 best recipe matches and the cosine similarity')
similar_items

Showing 10 best recipe matches and the cosine similarity


[["Chef John's Caramel Chicken", 0.10018577466009912],
 ['Curry Stand Chicken Tikka Masala Sauce', 0.0987569124864771],
 ['Pan-Roasted 5-Spice Pork Loin', 0.09542026271252844],
 ['Barbeque Bacon Chicken Bake', 0.06984407923866139],
 ['Chicken Parmesan', 0.06958179273161899],
 ['Easy Bacon Fried Rice', 0.06157856505707622],
 ['Chicken Cacciatore in a Slow Cooker ', 0.05854257932047985],
 ['Easy Bulgogi (Korean BBQ Beef)', 0.05479356510043372],
 ['Quick Crispy Parmesan Chicken Breasts', 0.05219031485300253],
 ['Pasta Pomodoro', 0.05128880158770881]]

### Train/Test Split

In [37]:
# Train on the first 80% of recipes (recipes were added at random) using user input
tfidf_matrix_train = tf.fit_transform(ingredients_matrix['Ingredients'][0:int(0.8*len(ingredients_matrix['Ingredients']))])
cosine_similarities_train = linear_kernel(tfidf_matrix_train[recipe_comparitor], tfidf_matrix_train).flatten()
cosine_index_train = cosine_similarities_train.argsort()[:-12:-1]
similar_items_train = []
for i in range(len(cosine_index_train)):
    similar_items_train.append([(ingredients_matrix['Title'][cosine_index_train[i]]), cosine_similarities_train[cosine_index_train[i]]])
del similar_items_train[0] # Delete first item from list as that will be the recipe being used for comparison
print('Showing 10 best recipe matches and the cosine similarity')
similar_items_train

Showing 10 best recipe matches and the cosine similarity


[['Pan-Roasted 5-Spice Pork Loin', 0.09944632574750817],
 ['Barbeque Bacon Chicken Bake', 0.07664730486207577],
 ['Chicken Cacciatore in a Slow Cooker ', 0.06401317427595107],
 ['Easy Bacon Fried Rice', 0.06281590443174247],
 ['Quick Crispy Parmesan Chicken Breasts', 0.05766120977920045],
 ['Pasta Pomodoro', 0.0571480178030727],
 ['Easy Bulgogi (Korean BBQ Beef)', 0.05651129849731967],
 ['Chicken Souvlaki with Tzatziki Sauce', 0.05208050748597118],
 ['Spinach and Banana Power Smoothie', 0.03981292447049276],
 ['Pho Ga Soup', 0.036058842625919116]]

In [39]:
# Test on the last 20% of recipes (recipes were added at random) using same user input
tfidf_matrix_test = tf.fit_transform(ingredients_matrix['Ingredients'][int(0.8*len(ingredients_matrix['Ingredients'])):int(len(ingredients_matrix['Ingredients']))])
cosine_similarities_test = linear_kernel(tfidf_matrix_test[recipe_comparitor], tfidf_matrix_test).flatten()
cosine_index_test = cosine_similarities_test.argsort()[:-12:-1]
similar_items_test = []
for i in range(len(cosine_index_train)):
    similar_items_test.append([(ingredients_matrix['Title'][cosine_index_test[i]]), cosine_similarities_test[cosine_index_test[i]]])
del similar_items_test[0] # Delete first item from list as that will be the recipe being used for comparison
print('Showing 10 best recipe matches and the cosine similarity')
similar_items_test

Showing 10 best recipe matches and the cosine similarity


[['Loaded Crack Potatoes', 0.08981210394714638],
 ['Healthy Cauliflower and Edamame Salad', 0.06505351189374405],
 ['Easy Bulgogi (Korean BBQ Beef)', 0.039702112319354374],
 ['Fry Bread Tacos II', 0.0360768579812699],
 ['Quick Crispy Parmesan Chicken Breasts', 0.03237746140553183],
 ['Easy Bacon Fried Rice', 0.030656769037803018],
 ['Pan-Roasted 5-Spice Pork Loin', 0.028610136363353942],
 ['The Best Kale Salad', 0.026078800833765885],
 ['Creamsicle® Pancakes', 0.02529956276858359],
 ['Buffalo Chicken Mac and Cheese', 0.02019822665523814]]

Comparing the cosine similarities between the train and test output recommendations, they look to match as the best match is around 0.09 for both cases.

### SQL Database

In [19]:
# Export ingredients_matrix dataframe to a SQL database
from sqlalchemy import create_engine
engine = create_engine('postgresql://postgres:recipe@localhost:5432/recipe_recommender')
ingredients_matrix.to_sql('Ingredients Matrix', engine)

In [29]:
# Test pulling from the SQL database
#import psycopg2
#try:
#    conn = psycopg2.connect("dbname='recipe_recommender' user='postgres' host='localhost' password='recipe'")
#except:
#    print('Unable to connect to database')   
#cur = conn.cursor()
#cur.fetchall()

df = pandas.read_sql_table('Ingredients Matrix', engine)

In [31]:
# Alternate solution since I cannot connect to Postgres on PythonAnywhere with a free account
ingredients_matrix.to_csv('ingredients_matrix.csv')