In [3]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

## Loading Data

In [4]:
df_filtered= pd.read_csv("filtered_df.csv")
df_recipe= pd.read_csv("df_recipe_features.csv")
df_vegan= pd.read_csv("df_vegan_recipe.csv")
df_filtered['rating']= df_filtered['rating'].transform(lambda x: x - x.mean())
df_recipe.rename(columns={'id': 'item'}, inplace=True)
df_filtered.rename(columns={'recipe_id': 'item'}, inplace=True)
df_filered_check= df_recipe[df_recipe['item'].isin(df_filtered['item'])]
print(len(df_filered_check))


103662


In [5]:
len(df_recipe)

231637

In [6]:

df_recipe

Unnamed: 0.1,Unnamed: 0,name,item,minutes,n_steps,steps,description,ingredients,n_ingredients,calories,total fat (PDV),sugar (PDV),sodium (PDV),protein (PDV),saturated fat (PDV),carbohydrates (PDV),dessert
0,0,arriba baked winter squash mexican style,137739,55,11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,51.5,0.0,13.0,0.0,2.0,0.0,4.0,False
1,1,a bit different breakfast pizza,31490,30,9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,173.4,18.0,0.0,17.0,22.0,35.0,1.0,False
2,2,all in the kitchen chili,112140,130,6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13,269.8,22.0,32.0,48.0,39.0,27.0,5.0,False
3,3,alouette potatoes,59389,45,11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11,368.1,17.0,10.0,2.0,14.0,8.0,20.0,False
4,4,amish tomato ketchup for canning,44061,190,5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8,352.9,1.0,337.0,23.0,3.0,0.0,28.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231632,231632,zydeco soup,486161,60,7,"['heat oil in a 4-quart dutch oven', 'add cele...",this is a delicious soup that i originally fou...,"['celery', 'onion', 'green sweet pepper', 'gar...",22,415.2,26.0,34.0,26.0,44.0,21.0,15.0,False
231633,231633,zydeco spice mix,493372,5,1,['mix all ingredients together thoroughly'],this spice mix will make your taste buds dance!,"['paprika', 'salt', 'garlic powder', 'onion po...",13,14.8,0.0,2.0,58.0,1.0,0.0,1.0,False
231634,231634,zydeco ya ya deviled eggs,308080,40,7,"['in a bowl , combine the mashed yolks and may...","deviled eggs, cajun-style","['hard-cooked eggs', 'mayonnaise', 'dijon must...",8,59.2,6.0,2.0,3.0,6.0,5.0,0.0,False
231635,231635,cookies by design cookies on a stick,298512,29,9,['place melted butter in a large mixing bowl a...,"i've heard of the 'cookies by design' company,...","['butter', 'eagle brand condensed milk', 'ligh...",10,188.0,11.0,57.0,11.0,7.0,21.0,9.0,False


 ## Hybrid Recommender System (Pipeline)

## Content Based Filtering

In [16]:
class ContentBasedFiltering:
    def __init__(self, groupRecommendation):
        self.groupRecommendation=groupRecommendation
        
    def retrieve_info (self,user_id):
        selected_user_ratings = df_filtered.loc[df_filtered['user_id'] == user_id]
        selected_user_ratings = selected_user_ratings.sort_values(by='item', ascending=True)
        return(selected_user_ratings)
    
    def combine_info( self,selected_users):
        group_contentbased_info=[]
        for user_id in selected_users:
            group_contentbased_info.append(self.retrieve_info(user_id))
        
        return (group_contentbased_info)

    def recipe_info(self, user_rating):
        
        rated_recipe_df = df_recipe.loc[df_recipe['item'].isin(list(user_rating['item']))]
        rated_recipe_df = rated_recipe_df.merge(user_rating, on='item')
        rated_recipe_df['rating']= rated_recipe_df['rating'].transform(lambda x: x - x.mean())
        return (rated_recipe_df)
    
    def combine_recipe_info(self, selected_user):
        group_recipe_df=pd.DataFrame()
        for user_id in selected_user:
            user_rating= self.retrieve_info(user_id)
            user_recipe_info= self.recipe_info(user_rating)
            group_recipe_df= group_recipe_df.append(user_recipe_info)
        
        return (group_recipe_df)
    
    
    def get_unrated_df(self, group_rating):
        unrated_recipe_df=[]
        count=0
        df_Recipe= df_recipe.copy()
        df_Recipe.set_index('item',inplace=True,drop=True)
        for group_Rating in group_rating:
            
            #unrated_recipe_df=df_recipe.loc[df_recipe['item'].isin(list(group_Rating['item']))]
            #diff = group_Rating[~group_Rating.index.isin(df_recipe['item']) ]
            #set(df_recipe['item']) - set(group_Rating['item'])
            

            diff= np.setdiff1d(df_recipe['item'], group_Rating['item'])
            
            unrated_recipe_df = df_Recipe.loc[diff]
            count= count+1
            unrated_recipe_df = unrated_recipe_df.sample(n=500)
            unrated_recipe_df = unrated_recipe_df.merge(df_filtered, on='item')
            #unrated_recipe_df['rating']= unrated_recipe_df['rating'].transform(lambda x: x - x.mean())
        
        return(unrated_recipe_df)

    def train_predict(self,selected_group):
        group_rating= self.combine_info(selected_group)
        group_recipe_df= self.combine_recipe_info(selected_group)
        
        group_unrated_df= self.get_unrated_df(group_rating)


        features_columns = ['dessert'] # the features we have selected from the dataset
        X_train = group_recipe_df[features_columns] # x contains the columns of the selected features
        y_train = group_recipe_df['rating'] # y contains the classification feature.
        X_test = group_unrated_df[features_columns] # x contains the columns of the selected features
        y_test = group_unrated_df['rating'] # y contains the classification feature.

        test_df['predicted_rating'] = recsys.predict(test_df)
        test_df['relevant'] = test_df['rating'].apply(lambda x: 1 if x>0 else 0)
        test_df['predicted_relevant'] = test_df['predicted_rating'].apply(lambda x: 1 if x>0 else 0)
        test_df=test_df.dropna()
        y_test = list(test_df['relevant'])
        y_pred = list(test_df['predicted_relevant'])
        precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average="binary")

        print("Precision:\t" + str(precision) + 
      "\nRecall:\t" + str(recall) + 
      "\nFscore:\t" + str(fscore))   

        neigh = KNeighborsRegressor(n_neighbors=3)

        neigh.fit(X_train, y_train) 
        y_unrated = neigh.predict(X_test)
        group_unrated_df['predicted_ratings_kNN'] = y_unrated
        groupRecommendation = group_unrated_df.sort_values(by='predicted_ratings_kNN', ascending=False)
        return (groupRecommendation)
           


            
            

In [128]:
groupRecommendation=[]
ContentBased= ContentBasedFiltering( groupRecommendation=[])

#users_ratings = df_filtered.groupby(['user_id']).count()
#selected = users_ratings['rating'] > 200
#selected_users = users_ratings.loc[selected]
##random_selected = selected_users.sample(10) # sample() returns a random row from the dataframe. The returned object is a dataframe with one row. If you pass a number as argument you specify to select more than one row.
#select_column_df = random_selected.reset_index()['user_id'] # reset_index() create a new index, and the user became a column. Then, we can filter using the column name
#select_column_df
selected_users= [1533,
 6651,
 38218,
 39733,
 60260,
 79877,
 230557,
 284922,
 340130,
 341344,
 373018,
 377039,
 422609]
groupRecommendation=ContentBased.train_predict(selected_users)


In [77]:
type(groupRecommendation['item'].iloc[0])

numpy.int64

In [23]:
len(groupRecommendation['user_id'].unique())

434

In [75]:
groupRecommendation['item']

Unnamed: 0.1,Unnamed: 0,user_id,item,date,rating,review
121659,577942,522099,1522,2008-04-16,0.445042,Funny story - I made this for Freezer Tag 2008...
121464,576958,522099,6932,2010-05-21,-0.554958,I followed your note and made the cloves optio...
158316,755796,522099,7798,2008-03-17,0.445042,This is an outstanding marinade. It made me l...
207227,987847,522099,9531,2009-08-25,0.445042,Truth in advertising. It has the same mouth-f...
214924,1024188,522099,13126,2010-09-08,0.445042,I made these and promptly forgot about them. ...
...,...,...,...,...,...,...
7440,35072,522099,428771,2010-06-14,0.445042,A million stars! I love chicken in yogurt bas...
71548,339450,522099,433896,2012-07-29,0.445042,I'm not sure these are legal! Sliced my potato...
207024,986992,522099,454232,2012-07-24,0.445042,Where's the million star button?!?! I LOVED th...
199690,952840,522099,460186,2012-07-29,0.445042,I used all green peppers as that's what the ga...


## Collaborative Filtering

In [25]:
import itertools
from lenskit.algorithms import Recommender
from lenskit.algorithms.user_knn import UserUser

In [122]:
result= df_filtered.sort_values(by= 'user_id')
result= result.drop(result.columns[[0,3,5]],axis=1)
result.rename(columns = {'user_id':'user', 'recipe_id':'item'}, inplace = True)
   

In [27]:
type(groupRecommendation.index)

pandas.core.indexes.numeric.Int64Index

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

train_df, test_df = train_test_split(result, test_size=0.2)

user_user = UserUser(15, min_nbrs=3)  # Minimum (3) and maximum (15) number of neighbors to consider
recsys = Recommender.adapt(user_user)
recsys.fit(train_df)

test_df['predicted_rating'] = recsys.predict(test_df)
test_df['relevant'] = test_df['rating'].apply(lambda x: 1 if x>0 else 0)
test_df['predicted_relevant'] = test_df['predicted_rating'].apply(lambda x: 1 if x>0 else 0)
test_df=test_df.dropna()
y_test = list(test_df['relevant'])
y_pred = list(test_df['predicted_relevant'])
precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average="binary")

print("Precision:\t" + str(precision) + 
      "\nRecall:\t" + str(recall) + 
      "\nFscore:\t" + str(fscore))
      
group_unseen_df = pd.DataFrame(list(itertools.product(selected_users, groupRecommendation['item'])), columns=['user', 'item'])

group_unseen_df = group_unseen_df.drop_duplicates(keep= 'first')
group_unseen_df['predicted_rating'] = recsys.predict(group_unseen_df)
group_unseen_df.dropna(inplace=True)


user 40 has no ratings and none provided
user 635 has no ratings and none provided
user 1367 has no ratings and none provided
user 1442 has no ratings and none provided
user 1817 has no ratings and none provided


Precision:	0.8385977091287747
Recall:	0.7120542292956086
Fscore:	0.7701625757092764


In [None]:
test_df= test_df.dropna()
mean_squared_error(test_df['rating'], test_df['predicted_rating'])

0.8808732548764928

In [None]:
group_unseen_df.sort_values(by='predicted_rating',ascending=False)

Unnamed: 0,user,item,predicted_rating
3021,126435,74300,1.782524
2616,351578,325457,1.654152
2845,351578,69382,1.078070
3573,64625,172160,1.031089
4524,177753,325457,1.026006
...,...,...,...
2950,126435,39165,-0.870455
4412,177753,860,-0.884900
3458,64625,860,-1.017721
2473,351578,39165,-1.518866


In [34]:
df_filtered['item'].loc[178151]

14872

In [50]:
[groupRecommendation.loc[groupRecommendation['item']== 178151].name][0]

287    chipotle chicken salad
290    chipotle chicken salad
289    chipotle chicken salad
288    chipotle chicken salad
Name: name, dtype: object

In [51]:
groupRecommendation['item'].iloc[0]

178151

In [82]:
dict_recipe_name={}
for i in range(len(groupRecommendation)):
    name= groupRecommendation['item'].iloc[i]
    key= groupRecommendation['name'].iloc[i]
    dict_recipe_name[name]=key

In [87]:
dict_recipe_name.get(325457)

'kicked up banana smoothie'

In [127]:
# least misery

least_misery_df = group_unseen_df.groupby('item').min()
least_misery_df = least_misery_df.join(df_recipe['name'], on='item')
least_misery_df = least_misery_df.sort_values(by="predicted_rating", ascending=False).reset_index()[['item', 'name', 'predicted_rating']]
print(least_misery_df.head(10))

     item                                        name  predicted_rating
0    9054                            asiago baco buns          0.855641
1  143126                 neoclassic chocolate mousse          0.779382
2   99152         grilled jerk chicken ala bobby flay          0.679645
3  412904                                         NaN          0.551613
4  189263                 slow cooked italian chicken          0.404392
5  134501                         mexican polenta pie          0.158392
6    8845  artichoke leaves with parmesan basil aioli          0.061644
7  169871       rachael ray s buffalo chicken pot pie          0.022616
8   84098                      finnish mushroom salad          0.005478
9  109423                                 hot tamales         -0.051466


In [93]:
dict_recipe_name.get(325457)

'kicked up banana smoothie'

In [129]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

train_df, test_df = train_test_split(df_filtered, test_size=0.2)

item_item = ItemItem(10, min_nbrs=1)  # Minimum (3) and maximum (15) number of neighbors to consider
recsys = Recommender.adapt(item_item)
recsys.fit(train_df)

test_df['predicted_rating'] = recsys.predict(test_df)

test_df['relevant'] = test_df['rating'].apply(lambda x: 1 if x>3 else 0)
test_df['predicted_relevant'] = test_df['predicted_rating'].apply(lambda x: 1 if x>3 else 0)

y_test = list(test_df['relevant'])
y_pred = list(test_df['predicted_relevant'])
precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average="binary")

print("Precision:\t" + str(precision) + 
      "\nRecall:\t" + str(recall) + 
      "\nFscore:\t" + str(fscore))

#precision and recall increase after min_nbrs is reduced