In [22]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

import requests

## Clean up recipes

In [2]:
# Load the raw Food.com recipes table; the path is relative to the notebook's working directory.
recipes = pd.read_csv('Food.com Recipes and Interactions/RAW_recipes.csv')

In [3]:
# Preview the first rows to see the available columns and how list-like fields are stored.
recipes.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [10]:
# (rows, columns) of the raw recipes table, used as the baseline for the cleaning steps below.
recipes.shape

(231637, 12)

In [12]:
# Count missing values per column.
# Open question: should rows with a NaN description be dropped?
# (A later cell calls dropna(how='any'), which does drop them.)
recipes.isna().sum()

name                 1
id                   0
minutes              0
contributor_id       0
submitted            0
tags                 0
nutrition            0
n_steps              0
steps                0
description       4979
ingredients          0
n_ingredients        0
dtype: int64

In [14]:
# Show the recipe row(s) that have no name.
recipes.loc[recipes['name'].isna()]

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
721,,368257,10,779451,2009-04-27,"['15-minutes-or-less', 'time-to-make', 'course...","[1596.2, 249.0, 155.0, 0.0, 2.0, 112.0, 14.0]",6,"['in a bowl , combine ingredients except for o...",-------------,"['lemon', 'honey', 'horseradish mustard', 'gar...",10


In [16]:
# Remove recipes that have no name.
# Filtering by condition (rather than dropping the hard-coded index label 721)
# keeps this cell safe to re-run: drop([721]) raises KeyError once the row is
# gone, and would target the wrong row if the raw file were ever reordered.
recipes = recipes.dropna(subset=['name'])

In [17]:
# Sanity check: exactly one row (the nameless recipe) should be gone compared with the raw shape.
recipes.shape

(231636, 12)

In [20]:
# Number of distinct recipe names; comparing it with the row count shows how
# many duplicate names exist. Open question: how should duplicates be handled?
recipes['name'].nunique(dropna=False)

230185

In [26]:
# Discard every row that has a missing value in any column
# (dropna defaults to axis=0, how='any').
recipes = recipes.dropna()

In [29]:
# Keep only the first occurrence of each recipe name; later rows with a repeated name are removed.
recipes = recipes[~recipes.duplicated(subset='name', keep='first')]

In [30]:
# Final size of the cleaned recipes table (after removing NaN rows and duplicate names).
recipes.shape

(225291, 12)

In [42]:
# Persist the cleaned recipes; index=False keeps the DataFrame index out of the CSV.
recipes.to_csv('Food.com Recipes and Interactions/recipes.csv',index=False)

## Clean up reviews

In [31]:
# Load the raw user/recipe interactions (ratings and review text).
reviews = pd.read_csv('Food.com Recipes and Interactions/RAW_interactions.csv')

In [32]:
# Preview the first rows of the raw interactions table.
reviews.head()

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."


In [33]:
# (rows, columns) of the raw interactions table, the baseline before filtering to cleaned recipes.
reviews.shape

(1132367, 5)

In [34]:
# Count missing values per column; only the free-text review column is expected to have gaps.
reviews.isna().sum()

user_id        0
recipe_id      0
date           0
rating         0
review       169
dtype: int64

In [36]:
# Inner-join reviews to the cleaned recipes so that only reviews of recipes
# that survived the cleaning above are kept.
new_reviews = pd.merge(
    reviews,
    recipes,
    how='inner',
    left_on='recipe_id',
    right_on='id',
)

In [37]:
# Size after the inner merge: rows are the reviews that matched a cleaned recipe,
# columns are the review and recipe columns combined.
new_reviews.shape

(1099904, 17)

In [39]:
# Drop the recipe columns that came along with the merge; keep only the review fields needed downstream.
new_reviews = new_reviews.loc[:, ['recipe_id', 'rating', 'review']]

In [40]:
# Number of distinct recipes that have at least one review after filtering.
new_reviews['recipe_id'].nunique(dropna=False)

225291

In [None]:
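# Good: the number of distinct recipe_ids in the merged reviews (225291) equals the number of
# cleaned recipes, so every recipe has at least one review. Some review texts may still be
# empty (the raw reviews contained NaN entries in the `review` column).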

In [43]:
# Persist the cleaned reviews.
# Write `new_reviews` (restricted to cleaned recipes and to the
# recipe_id/rating/review columns), not the raw `reviews` frame: saving
# `reviews` would just copy the raw interactions and discard the filtering above.
new_reviews.to_csv('Food.com Recipes and Interactions/reviews.csv', index=False)