<h1> Hybrid Recommender Model using LightFM </h1>

<h3> 1. Import Libraries </h3>

In [1]:
from lightfm import LightFM
from lightfm.data import Dataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import csr_matrix
import pickle

<h3> 2. Import dataset </h3>

In [2]:
interactions_train  = pd.read_csv('interactions_train_new.csv',index_col=1)# read csv into ratings_df dataframe
interactions_train.drop(columns=['date','u','i','rating'],inplace=True)
interactions_train.index.names = ['id']
interactions_train.head()

Unnamed: 0_level_0,user_id,without_0_rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1
4684,2046,5.0
517,2046,5.0
7435,1773,5.0
278,1773,4.0
3431,2046,5.0


In [227]:
interactions_test  = pd.read_csv('interactions_test_new.csv')# read csv into ratings_df dataframe
interactions_test.drop(columns=['date','u','i','rating'],inplace=True)
interactions_test.head(5)

Unnamed: 0,user_id,recipe_id,without_0_rating
0,8937,44551,4.0
1,56680,126118,4.0
2,349752,219596,5.0
3,628951,82783,2.0
4,92816,435013,3.0


In [4]:
raw_recipes  = pd.read_csv('RAW_recipes.csv',index_col=1)# read csv into ratings_df dataframe
mask1=raw_recipes.index.isin(interactions_train.index)
#mask2=raw_recipes.index.isin(interactions_test.recipe_id)
raw_recipes=raw_recipes[mask1]
raw_recipes.shape

(160901, 11)

In [5]:
raw_recipes.head()

Unnamed: 0_level_0,name,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
137739,arriba baked winter squash mexican style,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
31490,a bit different breakfast pizza,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
59389,alouette potatoes,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
5289,apple a day milk shake,0,1533,1999-12-06,"['15-minutes-or-less', 'time-to-make', 'course...","[160.2, 10.0, 55.0, 3.0, 9.0, 20.0, 7.0]",4,"['combine ingredients in blender', 'cover and ...",,"['milk', 'vanilla ice cream', 'frozen apple ju...",4
70971,bananas 4 ice cream pie,180,102353,2003-09-10,"['weeknight', 'time-to-make', 'course', 'main-...","[4270.8, 254.0, 1306.0, 111.0, 127.0, 431.0, 2...",8,"['crumble cookies into a 9-inch pie plate , or...",,"['chocolate sandwich style cookies', 'chocolat...",6


<h3> 3. Data Pre-processing </h3>
<h5> a. Nutrition data</h5>
Convert nutrition column which has data on each colorie component such as 'calories','total fat','sugar','sodium','protein','saturated fat','carbohydrates' into seperate column of its own

In [6]:
raw_recipes[['calories','total fat','sugar','sodium','protein','saturated fat','carbohydrates']] = raw_recipes.nutrition.str.split(",",expand=True) 
raw_recipes['calories'] = raw_recipes['calories'].apply(lambda x: x.replace("[" ,""))
raw_recipes['carbohydrates'] = raw_recipes['carbohydrates'].apply(lambda x: x.replace("]" ,""))
raw_recipes[['calories','total fat','sugar','sodium','protein','saturated fat','carbohydrates']] =  raw_recipes[['calories','total fat','sugar','sodium','protein','saturated fat','carbohydrates']].astype(float)

In [7]:
#drop unwanted columns
raw_recipes = raw_recipes.drop(columns=['contributor_id','nutrition', 'description','submitted'])
raw_recipes_copy  = raw_recipes
raw_recipes.head()

Unnamed: 0_level_0,name,minutes,tags,n_steps,steps,ingredients,n_ingredients,calories,total fat,sugar,sodium,protein,saturated fat,carbohydrates
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
137739,arriba baked winter squash mexican style,55,"['60-minutes-or-less', 'time-to-make', 'course...",11,"['make a choice and proceed with recipe', 'dep...","['winter squash', 'mexican seasoning', 'mixed ...",7,51.5,0.0,13.0,0.0,2.0,0.0,4.0
31490,a bit different breakfast pizza,30,"['30-minutes-or-less', 'time-to-make', 'course...",9,"['preheat oven to 425 degrees f', 'press dough...","['prepared pizza crust', 'sausage patty', 'egg...",6,173.4,18.0,0.0,17.0,22.0,35.0,1.0
59389,alouette potatoes,45,"['60-minutes-or-less', 'time-to-make', 'course...",11,['place potatoes in a large pot of lightly sal...,"['spreadable cheese with garlic and herbs', 'n...",11,368.1,17.0,10.0,2.0,14.0,8.0,20.0
5289,apple a day milk shake,0,"['15-minutes-or-less', 'time-to-make', 'course...",4,"['combine ingredients in blender', 'cover and ...","['milk', 'vanilla ice cream', 'frozen apple ju...",4,160.2,10.0,55.0,3.0,9.0,20.0,7.0
70971,bananas 4 ice cream pie,180,"['weeknight', 'time-to-make', 'course', 'main-...",8,"['crumble cookies into a 9-inch pie plate , or...","['chocolate sandwich style cookies', 'chocolat...",6,4270.8,254.0,1306.0,111.0,127.0,431.0,220.0


<h5>b. One-hot encoding Tags list </h5>
The tags column are in the format '60-minutes-or-less',  'time-to-make', etc. We will create a seperate column for unique tags using MultiLabelBinarizer

In [9]:
raw_recipes.tags.apply(lambda x: x.split(','))

id
137739    [['60-minutes-or-less',  'time-to-make',  'cou...
31490     [['30-minutes-or-less',  'time-to-make',  'cou...
59389     [['60-minutes-or-less',  'time-to-make',  'cou...
5289      [['15-minutes-or-less',  'time-to-make',  'cou...
70971     [['weeknight',  'time-to-make',  'course',  'm...
                                ...                        
185979    [['30-minutes-or-less',  'time-to-make',  'cou...
367912    [['15-minutes-or-less',  'time-to-make',  'pre...
357451    [['15-minutes-or-less',  'time-to-make',  'cou...
188810    [['60-minutes-or-less',  'time-to-make',  'cou...
308080    [['60-minutes-or-less',  'time-to-make',  'cou...
Name: tags, Length: 160901, dtype: object

In [10]:
raw_recipes['tags']=raw_recipes['tags'].apply(eval)

In [11]:
mlb = MultiLabelBinarizer(sparse_output=True)
raw_recipes = raw_recipes.join(
            pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(raw_recipes.pop('tags')),
                index=raw_recipes.index,
                columns=mlb.classes_))

In [12]:
#drop unwanted columns
raw_recipes = raw_recipes.drop(columns=['steps','ingredients','','name'])

raw_recipes.head()

Unnamed: 0_level_0,minutes,n_steps,n_ingredients,calories,total fat,sugar,sodium,protein,saturated fat,carbohydrates,...,whitefish,whole-chicken,whole-duck,whole-turkey,wild-game,wings,winter,yams-sweet-potatoes,yeast,zucchini
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
137739,55,11,7,51.5,0.0,13.0,0.0,2.0,0.0,4.0,...,0,0,0,0,0,0,1,0,0,0
31490,30,9,6,173.4,18.0,0.0,17.0,22.0,35.0,1.0,...,0,0,0,0,0,0,0,0,0,0
59389,45,11,11,368.1,17.0,10.0,2.0,14.0,8.0,20.0,...,0,0,0,0,0,0,0,0,0,0
5289,0,4,4,160.2,10.0,55.0,3.0,9.0,20.0,7.0,...,0,0,0,0,0,0,0,0,0,0
70971,180,8,6,4270.8,254.0,1306.0,111.0,127.0,431.0,220.0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
raw_recipes.shape

(160901, 535)

<h3> 4. Data Preparation  </h3>
LightFM requires the item features to have a specific format. Each recipe must be in the format (137739,['minutes:55.0','n_steps:11.0','n_ingredients:7.0','calories:51.5','total fat:0.0',]).

In [14]:
#helper function to identify unqiue features in the dataset
def generate_recipe_features(df):
    rf = []
    #col = ['minutes']*len(df.minutes.unique())
    col=[]
    for column in raw_recipes.columns:
        col= col+ [column]*len(df[column].unique())
    
    unique_f1=[]
    for column in df.columns:
        unique_f1=unique_f1+list(df[column].unique())
    
    unique_f1=[float(i) for i in unique_f1]
    for x,y in zip(col, unique_f1):
        res = str(x)+ ":" +str(round(y,1))
        rf.append(res)
    return rf

In [15]:
rf=generate_recipe_features(raw_recipes)

In [16]:
rf[:5]

['minutes:55.0',
 'minutes:30.0',
 'minutes:45.0',
 'minutes:0.0',
 'minutes:180.0']

In [17]:
recipe_columns=list(raw_recipes.columns)
recipe_columns= [col+':' for col in recipe_columns]
def feature_colon_value(my_list):
   
    
    aa = my_list
    result = []
    for x,y in zip(recipe_columns,aa):
        res = str(x) +""+ str((y))
        result.append(res)
    return result
            

In [18]:
#ad_subset = raw_recipes[raw_recipes.columns]
#ad_list = [list(x) for x in ad_subset.values]

In [19]:
def built_item_tuples(df):
    ad_subset = df[df.columns]
    ad_list = [list(x) for x in ad_subset.values]
    feature_list = []
    for item in ad_list:
        feature_list.append(feature_colon_value(item))
    item_tuple=list(zip(df.index, feature_list))
    return item_tuple

In [20]:
#generate item tuples in the required format
item_tuples=built_item_tuples(raw_recipes)

In [21]:
item_tuples[:1]

[(137739,
  ['minutes:55.0',
   'n_steps:11.0',
   'n_ingredients:7.0',
   'calories:51.5',
   'total fat:0.0',
   'sugar:13.0',
   'sodium:0.0',
   'protein:2.0',
   'saturated fat:0.0',
   'carbohydrates:4.0',
   '1-day-or-more:0.0',
   '15-minutes-or-less:0.0',
   '3-steps-or-less:0.0',
   '30-minutes-or-less:0.0',
   '4-hours-or-less:0.0',
   '5-ingredients-or-less:0.0',
   '60-minutes-or-less:1.0',
   'Throw the ultimate fiesta with this sopaipillas recipe from Food.com.:0.0',
   'a1-sauce:0.0',
   'african:0.0',
   'american:0.0',
   'amish-mennonite:0.0',
   'angolan:0.0',
   'appetizers:0.0',
   'apples:0.0',
   'april-fools-day:0.0',
   'argentine:0.0',
   'artichoke:0.0',
   'asian:0.0',
   'asparagus:0.0',
   'australian:0.0',
   'austrian:0.0',
   'avocado:0.0',
   'bacon:0.0',
   'baja:0.0',
   'baked-beans:0.0',
   'baking:0.0',
   'bananas:0.0',
   'bar-cookies:0.0',
   'barbecue:0.0',
   'bass:0.0',
   'bean-soup:0.0',
   'beans:0.0',
   'beans-side-dishes:0.0',
   'bea

In [22]:
#pass the unique user ID's, recipe ID's and unique item features to build the dataset
dataset1 = Dataset()
dataset1.fit(
        interactions_train['user_id'].unique(), # all the users
        interactions_train.index.unique(), # all the items
        item_features = rf # additional user features
)

In [23]:
item_features = dataset1.build_item_features(item_tuples, normalize= True)

In [24]:
item_features

<160901x186039 sparse matrix of type '<class 'numpy.float32'>'
	with 86242936 stored elements in Compressed Sparse Row format>

<h3> 5. Training the model </h3>

In [25]:
#lightfm expects the user-recipe interactions to be passed as as a CSR spare matrix
interactions_train['user_recipe_id_tuple'] = list(zip(
    interactions_train.user_id, interactions_train.index, interactions_train.without_0_rating))

In [26]:
interactions, weights = dataset1.build_interactions(
    interactions_train['user_recipe_id_tuple'])

In [32]:
model = LightFM(loss='warp',random_state=2022,
                learning_rate=0.90,
                no_components=100,
                user_alpha=0.000005)

In [33]:
#DO NOT RUN THIS, IT TAKES 3 HRS for COMPLETION. load the saved model from below (code below)
model.fit(interactions,item_features=item_features,epochs=5,num_threads=4,verbose=True,sample_weight=weights)

Epoch: 100%|███████████████████████████████████████████████████████| 5/5 [1:32:29<00:00, 1109.85s/it]


<lightfm.lightfm.LightFM at 0x7f2d25386580>

In [28]:
with open('lightfm_model.pickle', 'rb') as fin:
    model= pickle.load(fin)

In [35]:
# with open('lightfm_model.pickle', 'wb') as fle:
#     pickle.dump(model, fle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
from lightfm.evaluation import auc_score
train_auc = auc_score(model,interactions,item_features=item_features).mean()
print('Hybrid training set AUC:', train_auc)

In [100]:
recipe_dict=pd.DataFrame(list(interactions_train.index.unique()), columns=['recipe_id'])
user_dict=pd.DataFrame(list(interactions_train['user_id'].unique()), columns=['user_id'])
user_dict['index_val'] = user_dict.index
#scores = pd.Series(model.predict(user_id,list(recipe_dict.index.values)))


<h3> 6. Generate Recommendations </h3>

LightFM generates implicit ratings and does not provide exact ratings in the scale of (1,5). Therefore, we cannot calculate RMSE in this case. But LightFM provides a ultility to calculate AUC and it was found to be 0.8388783 (Code in notebook LightFM_SUC.ipyb). This is a very good score.

We will pass in all the available recipe ID's and the user_id as input to the LightFM model.
LightFM ranks the recipes on its own local scale and returns the scores. We will filter out the recipes that the user has already rated from these, sort them and provide the top recommendations. The below function accomplishes all these tasks

In [191]:
user_dict

Unnamed: 0,user_id,index_val
0,2046,0
1,1773,1
2,2312,2
3,2625,3
4,2999,4
...,...,...
25071,2001355273,25071
25072,2002025577,25072
25073,2001773389,25073
25074,2001688000,25074


In [221]:
def user_recommendations(user_id,nrec_items=5):
    
    # LightFM requires the index of the user_id that was passed to the sparse matrix. 
    user=user_dict[user_dict['user_id']==user_id].iloc[0].index_val
    print(user_dict[user_dict['user_id']==user_id].iloc[0])
    
    user=user.item()
    
    
    #print(int(user_dict[user_dict['user_id']==user_id].iloc[0].index_val))
    #act_user_id=user_dict[user_dict['user_id']==user_id].iloc[0].index_val
    #print(act_user_id)
    
    #using the lightfm model to generate rank scores for all recipes
    scores = pd.Series(model.predict(user,list(recipe_dict.index.values)))
    
    #getting the recipes that the user has already rated
    known_items=interactions_train.loc[interactions_train.user_id == user_id]
    known_items=known_items.sort_values(by=['without_0_rating'], ascending=False)
    known_items=known_items[0:nrec_items]
    known_items = pd.merge(known_items,raw_recipes_copy,how='inner',left_on=[known_items.index],right_on=[raw_recipes_copy.index])
    
    #filtering out the recipes that the user had already rated
    scores_df=pd.DataFrame({'recipe_id':recipe_dict['recipe_id'].tolist(),
                       'score':scores
                       })
    scoremask=scores_df.recipe_id.isin(known_items.index)
    scores_df=scores_df[~(scoremask)]
    
    #sorting the recipes by highest rank
    scores_df=scores_df.sort_values(by=['score'],ascending=False)
    return_score_list=scores_df[0:nrec_items]
    return_score_list = pd.merge(return_score_list,raw_recipes_copy,how='inner',left_on=['recipe_id'],right_on=[raw_recipes_copy.index])
    
    #printing known likes
    print ("User: " + str(user_id))
    print("Known Likes:")
    counter = 1
    for index, row in known_items.iterrows():
        print(str(counter) + '- ' + row['name'])
        counter+=1
    
    #printing top recommendations
    print("\n Recommended Recipes:")
    counter = 1
    for index, row in return_score_list.iterrows():
        print(str(counter) + '- ' + row['name'])
        counter+=1

    
    

In [228]:
#generate the top 10 recommendation for 8937
# (6357,8937,628951)
user_id=628951
N=7
user_recommendations(user_id,N)

user_id      628951
index_val     13319
Name: 13319, dtype: int64
User: 628951
Known Likes:
1- low carb chili
2- chicken vegetable soup low carb low fat
3- herbed zucchini noodles  low fat

 Recommended Recipes:
1- roasted cauliflower   16 roasted cloves of garlic
2- crock pot chicken with black beans   cream cheese
3- roasted tomato soup
4- japanese mum s chicken
5- tzatziki
6- basil pesto
7- kittencal s famous caesar salad


In [None]:
#references
#https://towardsdatascience.com/how-i-would-explain-building-lightfm-hybrid-recommenders-to-a-5-year-old-b6ee18571309
#https://towardsdatascience.com/recommendation-system-in-python-lightfm-61c85010ce17