In [21]:
import pandas as pd

recipes_csv = pd.read_csv("../data/raw/RAW_recipes.csv", on_bad_lines='skip')
ratings_csv = pd.read_csv('../data/raw/RAW_interactions.csv', on_bad_lines='skip')

recipes_df = recipes_csv.drop_duplicates()
recipes_df.dropna(inplace=True)

ratings_df = ratings_csv.drop_duplicates()
ratings_df.dropna(inplace=True)

In [22]:
# Extract recent recipes
ratings_df = ratings_df[ratings_df['date'] >= '2010-01-01']

# Extract ratings of users who have at least rated 5 recipes
user_rating_counts = ratings_df['user_id'].value_counts()
users_to_keep = user_rating_counts[user_rating_counts >= 10].index
ratings_df = ratings_df[ratings_df['user_id'].isin(users_to_keep)]

In [23]:
recipes_df.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [24]:
ratings_df.head()

Unnamed: 0,user_id,recipe_id,date,rating,review
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."
6,124416,120345,2011-08-06,0,"Just an observation, so I will not rate. I fo..."
32,5060,310237,2010-05-07,5,"wow red and white sweetness! DH loved them , ..."


In [25]:
n_users = len(ratings_df.user_id.unique())
n_recipes = len(ratings_df.recipe_id.unique())
n_ratings = len(ratings_df)
print("Number of users:", n_users)
print("Number of recipes:", n_recipes)
print("Number of ratings:", n_ratings)

Number of users: 3879
Number of recipes: 94013
Number of ratings: 209691


In [26]:
user_item_matrix = ratings_df.pivot_table(index='user_id', columns='recipe_id', values='rating', fill_value=0)
user_item_matrix.fillna(0, inplace=True)
user_item_matrix.head()

recipe_id,38,40,49,55,59,62,66,92,93,94,...,536990,537039,537071,537073,537175,537241,537319,537458,537459,537485
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1535,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3288,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4470,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Sparse User-Item Matrix

from scipy.sparse import csr_matrix

user_item_csr_matrix = csr_matrix(user_item_matrix.values)

In [28]:
# Recommender

from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(user_item_csr_matrix)

In [29]:
# Recommend recipes
import random

user_id = random.choice(user_item_matrix.index)
user_id = 552052

n_neighbors = int(user_item_csr_matrix.shape[0] ** (1/2))
n_neighbors = 200

user_index = user_item_matrix.index.get_loc(user_id)

distances, indices = model_knn.kneighbors(user_item_csr_matrix[user_index], n_neighbors=n_neighbors)

recommended_recipes = recipes_df.iloc[indices[0]]

recipe_names = recommended_recipes['name'].tolist()
recipe_ids = recommended_recipes['id'].tolist()

max_distance = distances.max()
print(max_distance)
recommendation_percentages = [(1 - dist) * 100 for dist in distances[0]]

recommendations_df = pd.DataFrame({
    'recipe_name': recipe_names,
    'id': recipe_ids,
    'reliability': recommendation_percentages,
    'distance': distances[0]
}, index=range(1, len(recipe_names) + 1))

# get the top 10
print("Recommendations for user:", user_id)
recommendations_df.head(10)


0.9441854427814053
Recommendation for user: 552052


Unnamed: 0,recipe_name,id,reliability,distance
1,a different black forest dump cake,50775,100.0,1.110223e-16
2,all purpose cole slaw,412091,23.925507,0.7607449
3,algerian haroset,361279,21.351637,0.7864836
4,30 minute fresh mozzarella cheese homemade,290371,19.455197,0.805448
5,a secret ingredient to southern style sweet tea,346160,18.189577,0.8181042
6,3 pepper chili,298433,18.032633,0.8196737
7,2 minute microwave lime cheesecake,388669,17.250118,0.8274988
8,a yummy twist on sausage rolls,144920,17.089648,0.8291035
9,2 ingredient punch mock champagne punch,289160,16.429732,0.8357027
10,3 hour old fashioned oven pot roast,317798,14.574101,0.854259
