In [1]:
# Import Block
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from surprise import Dataset, Reader
from surprise.prediction_algorithms import BaselineOnly, KNNBasic, SVD
from surprise.model_selection import train_test_split, GridSearchCV
from surprise.accuracy import rmse, mse, mae, fcp

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

## Load Data

In [2]:
# Load data
df_interactions_train = pd.read_csv("data/interactions_train_processed.csv")

# Load data into Surprise format
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df_interactions_train[['user_id', 'recipe_id', 'rating']], reader)

# Train-test split
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Collaborative Filtering

Before building a collaborative filtering model, use the baseline model provided by the `surprise` package to compare the performance of the eventual final model.

In [3]:
# Baseline options (using ALS)
bsl_options = {
    "method": "als",
    "n_epochs": 50,
    "reg_u": 10,
    "reg_i": 15
}

# Initialize BaselineOnly algorithm using bsl_options
algo = BaselineOnly(bsl_options=bsl_options)

# Train and test
algo.fit(trainset)
preds = algo.test(testset)

# Evaluate
rmse(preds), mse(preds), mae(preds), fcp(preds)

Estimating biases using als...
RMSE: 0.9098
MSE: 0.8277
MAE:  0.5416
FCP:  0.5678


(0.9097559411244999,
 0.8276558724113244,
 0.5415949652953759,
 0.5678283162984348)

Investigate the performance of using a KNN-inspired algorithm this task. The expectation is that a KNN-inspired model would not perform very well since the dataset is highly sparsed (99.98% sparsity).

In [4]:
# Sim options for KNNBasic
sim_options = {
    'name': 'pearson',   # Use pearson similarity (other options: 'cosine', 'pearson_baseline')
    'user_based': True,  # Set to True for user-user similarity, False for item-item similarity
    'min_support': 5     # Set similarity=0 for users that have less than 5 common items with user i
}

# Initialize KNNBasic using sim_options
algo = KNNBasic(k=100, min_k=20, sim_options=sim_options)

# Train and test
algo.fit(trainset)
preds = algo.test(testset)

# Evaluate
rmse(preds), mse(preds), mae(preds), fcp(preds)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.9592
MSE: 0.9200
MAE:  0.6318
FCP:  0.3202


(0.9591632692947833,
 0.9199941771642571,
 0.6318371869843465,
 0.32019505081645055)

Use GridSearchCV to find the best hyperparameters for the SVD model.

In [5]:
# Set param grid for hyperparameter considerations
param_grid = {
    "n_factors": [5, 10, 20],
    "n_epochs": [100],
    "biased": [True],
    "lr_all": [0.001, 0.002, 0.005],
    "reg_all": [0.02, 0.05, 0.10]
}

# Initialize and fit
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mse", "mae", "fcp"], cv=3)
gs.fit(data)

# Evaluate
print("Best rmse is : ", gs.best_score["rmse"], ", with the parameters ", gs.best_params['rmse'])
print("Best mse is : ", gs.best_score["mse"], ", with the parameters ", gs.best_params['mse'])
print("Best mae is : ", gs.best_score["mae"], ", with the parameters ", gs.best_params['mae'])
print("Best fcp is : ", gs.best_score["fcp"], ", with the parameters ", gs.best_params['fcp'])

Best rmse is :  0.9138127294626375 , with the parameters  {'n_factors': 5, 'n_epochs': 100, 'biased': True, 'lr_all': 0.001, 'reg_all': 0.1}
Best mse is :  0.835070426909705 , with the parameters  {'n_factors': 5, 'n_epochs': 100, 'biased': True, 'lr_all': 0.001, 'reg_all': 0.1}
Best mae is :  0.5398281961017222 , with the parameters  {'n_factors': 5, 'n_epochs': 100, 'biased': True, 'lr_all': 0.001, 'reg_all': 0.02}
Best fcp is :  0.5622347479691534 , with the parameters  {'n_factors': 5, 'n_epochs': 100, 'biased': True, 'lr_all': 0.005, 'reg_all': 0.1}


Since the primary metric with matrix factorization based recommendation systems is rmse, we will configure the model using the best parameters that yielded the best rmse score. These parameters are:
- n_factors = 100
- n_epochs = 
- biased = True
- lr_all = 
- reg_all = 

In [8]:
# Train SVD model
svd = SVD(n_factors=5, n_epochs=100, lr_all=0.001, reg_all=0.1, random_state=42)  # rmse=0.9116
svd.fit(trainset)

# Evaluate the model
preds = svd.test(testset)

# Evaluate
rmse(preds), mse(preds), mae(preds), fcp(preds)

RMSE: 0.9116
MSE: 0.8310
MAE:  0.5417
FCP:  0.5703


(0.9115845735184523,
 0.8309864346768187,
 0.5416966065745548,
 0.5703248220919518)

In [10]:
# Get recommendations for a specific user
def recommend_recipes(model, user_id, df, num_recommendations=10):
    """
    Generate recipe recommendations for a given user.

    Args:
    - model: The trained SVD model.
    - user_id: The ID of the user for whom to recommend recipes.
    - df: The original DataFrame with user-recipe interactions.
    - num_recommendations: Number of recommendations to return.

    Returns:
    - DataFrame with top recommended recipes.
    """
    # Get all recipe IDs
    all_recipes = df['recipe_id'].unique()

    # Get recipes the user has already rated
    rated_recipes = df[df['user_id'] == user_id]['recipe_id'].values

    # Get recipes the user has NOT rated
    unrated_recipes = [recipe for recipe in all_recipes if recipe not in rated_recipes]

    # Predict ratings for unrated recipes
    predictions = []
    for recipe_id in unrated_recipes:
        pred = model.predict(user_id, recipe_id)
        predictions.append((recipe_id, pred.est))

    # Sort predictions by estimated rating
    top_recommendations = sorted(predictions, key=lambda x: x[1], reverse=True)[:num_recommendations]

    # Convert to DataFrame for better visualization
    recommended_df = pd.DataFrame(top_recommendations, columns=['recipe_id', 'predicted_rating'])

    # Get recipe_name and ingredient_names
    recommended_df = pd.merge(recommended_df, df, how='left', left_on='recipe_id', right_on='recipe_id')
    recommended_df = recommended_df.drop_duplicates(subset=['recipe_id', 'recipe_name', 'ingredient_names','predicted_rating'],keep='first')
    recommended_df = recommended_df[['recipe_id','recipe_name','ingredient_names','predicted_rating']]

    return recommended_df

# Example usage:
user_id = 354714
recommend_recipes(svd, user_id, df_interactions_train)


Unnamed: 0,recipe_id,recipe_name,ingredient_names,predicted_rating
0,497261,quick and easy crabby caesar salads 5fix,"['complete caesar salad in a bag', 'mashed pot...",4.982922
23,43332,paige s buttercream frosting,"['powdered sugar', 'unsalted butter', 'vanilla...",4.981827
60,99909,haluski pan fried cabbage and noodles,"['wide egg noodle', 'butter', 'sweet onion', '...",4.96825
100,163311,sweet and sour stir fry shrimp with broccoli a...,"['vegetable oil', 'carrot', 'onion', 'red bell...",4.954477
113,21171,my best zucchini bread,"['sugar', 'vegetable oil', 'egg', 'vanilla', '...",4.95337
153,124192,dutch stamppot with rookworst,"['potato', 'butternut squash', 'sweet potato',...",4.946682
160,89007,kittencal s caramelized onions,"['onion', 'butter', 'oil', 'brown sugar', 'sal...",4.946416
219,24768,berry cream cheese coffee cake,"['flmy', 'sugar', 'margarine', 'baking powder'...",4.945885
251,107059,substitution for pumpkin pie spice,"['ground cinnamon', 'ground nutmeg', 'ground c...",4.945224
290,16037,coconut creme fudge,"['white chocolate chip', 'flaked coconut', 'bu...",4.940735


# Content Filtering

## Cosine Similarity


In [12]:
# TF-IDF on ingredients
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_interactions_train['ingredient_names'])

In [13]:
# Function to get recommendations
def content_based_recommendations(recipe_name, top_n=10):
    """
    Generate recipe recommendations using content-based filtering (cosine similarity).

    Args:
    - recipe_name: The name of the recipe that that users want the recommendations to be similar to.
    - top_n: Number of recommendations to return.

    Returns:
    - DataFrame with top recommended recipes.
    """
    # Get recipe_id based on recipe_name
    idx = df_interactions_train[df_interactions_train['recipe_name'] == recipe_name].index[0]

    # Find cosine similarity scores
    cosine_sim = cosine_similarity(tfidf_matrix[idx], tfidf_matrix)
    sim_scores = list(enumerate(cosine_sim[0]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get recipe indices based on the top sim scores
    recipe_indices = [i[0] for i in sim_scores]

    # Create output in form of dataframe
    output = df_interactions_train.iloc[recipe_indices][['recipe_id', 'recipe_name', 'ingredient_names']]
    output['similarity_score'] = [item[1] for item in sim_scores]

    # Drop duplicate recipes (since recipes show up as many times as they are rated)
    output.drop_duplicates(subset=['recipe_name', 'ingredient_names'],inplace=True)

    # Keep only the top n recipes to recommend (excludes the given recipe itself which has sim_score=1)
    output = output.iloc[1:top_n+1]

    return output

# Test the function
content_based_recommendations('vanilla pudding')

Unnamed: 0,recipe_id,recipe_name,ingredient_names,similarity_score
446775,118421,creamy vanilla pudding,"['milk', 'salt', 'sugar', 'cornstarch', 'egg y...",1.0
577426,406956,microwave vanilla pudding opt banana pudding,"['sugar', 'cornstarch', 'salt', 'milk', 'egg y...",1.0
51411,21755,french pastry cream,"['milk', 'egg yolk', 'sugar', 'cornstarch', 'f...",0.922746
508932,152456,creamy vanilla pudding for two,"['sugar', 'cornstarch', 'salt', 'milk', 'egg y...",0.919951
633072,483829,a very proper english custard,"['milk', 'cream', 'egg yolk', 'sugar', 'cornst...",0.915505
358947,234051,gebrande suiker pudding caramel pudding,"['brown sugar', 'salt', 'milk', 'cornstarch', ...",0.880394
735,229,chocolate pudding ii,"['sugar', 'cornstarch', 'salt', 'milk', 'egg y...",0.879413
585907,228620,custard cream,"['milk', 'sugar', 'egg yolk', 'cornstarch', 'v...",0.874457
971,4308,basic cream fillings with variations,"['flmy', 'sugar', 'salt', 'milk', 'butter', 'e...",0.841678
41437,79063,easy pudding,"['sugar', 'flmy', 'salt', 'milk', 'egg yolk', ...",0.841678


## Hybrid Model

In [None]:
def hybrid(model, user_id, df, num_of_recommendations):
    """
    Returns recipe recommendations based on both collaborative filtering and content-based filtering algorithms. The process is as follows:
    Step 1) Get recommendations from a collaborative filtering based algorithm.
    Step 2) Get the similarity scores between the recipes that user_id has rated and the recipes from the recommendations in Step 1.
    Step 3) Find the average similarity scores of each recommended recipe against the recipes that user_id has rated.
    Step 4) Remove recipes that the user has rated from the created dataframe in Step 3.
    Step 5) Sort recommended recipes by (predicted_rating, avg_sim_score)
    Step 6) Return num_of_recommendations amount of recommded recipe

    Args:
    - model: The trained SVD model.
    - user_id: The ID of the user for whom to recommend recipes.
    - df: The original DataFrame with user-recipe interactions.
    - num_recommendations: Number of recommendations to return.

    Returns:
    - DataFrame with top recommended recipes.
    """
    # Get number of unique recipes
    len_unique = df_interactions_train['recipe_id'].nunique()

    # Get recommended recipes from a collaborative filtering based model
    svd_recs = recommend_recipes(model, user_id, df, num_recommendations=len_unique)

    # Get recipes the user has already rated
    rated_recipes = df[df['user_id'] == user_id]['recipe_id'].values

    # Create a dictionary that stores the similarity scores for each rated recipe to the recommended svd recipes
    rated_recipe_to_score = {}
    for rated_recipe in rated_recipes:
        recipe_name = df[df['recipe_id']==rated_recipe]['recipe_name'].unique()[0]
        content_recs = content_based_recommendations(recipe_name, top_n=len_unique-1)
        content_recs = content_recs[content_recs['recipe_id'].isin(svd_recs['recipe_id'].values)]
        rated_recipe_to_score[rated_recipe] = content_recs

    # Combine all dataframes in the dictionary
    combined_df = pd.concat(rated_recipe_to_score.values())

    # Group by recipe_name and calculate the average score
    result_df = combined_df.groupby(['recipe_id', 'recipe_name'], as_index=False)['similarity_score'].mean()

    # Rename the column to avg_score
    result_df.rename(columns={'similarity_score': 'avg_sim_score'}, inplace=True)

    # Combine with svd recs
    result_df = pd.merge(result_df, svd_recs, how='left', left_on='recipe_id', right_on='recipe_id')

    # Remove any recipes that the user has rated
    result_df = result_df[~result_df['recipe_id'].isin(rated_recipes)]

    # Formatting
    result_df.sort_values(by=['predicted_rating','avg_sim_score'], ascending=False, inplace=True)
    result_df = result_df[['recipe_id', 'recipe_name_x', 'ingredient_names', 'predicted_rating', 'avg_sim_score']]
    result_df.rename(columns={"recipe_name_x": "recipe_name"}, inplace=True)

    return result_df.head(num_of_recommendations)

In [17]:
# Test (user is someone who only gives out 5 stars ratings)
hybrid(svd, user_id=2046, df=df_interactions_train, num_of_recommendations=10)

Unnamed: 0,recipe_id,recipe_name,ingredient_names,predicted_rating,avg_sim_score
96031,241166,yummy oatmeal chocolate chip cookies,"['butter', 'sugar', 'brown sugar', 'egg', 'van...",5.0,0.207702
59042,138631,mexican black bean soup with sausage,"['olive oil', 'onion', 'garlic clove', 'chile'...",5.0,0.165054
28155,66294,black bean and corn salad dip,"['olive oil', 'fresh lime juice', 'red wine vi...",5.0,0.161527
52889,123406,amy s pumpkin chocolate chip cookies,"['sugar', 'pumpkin', 'oil', 'egg', 'vanilla', ...",5.0,0.156713
22916,54915,spaghetti with tomatoes and feta,"['olive oil', 'dried oregano', 'garlic', 'toma...",5.0,0.142205
27586,64895,the best brownies ever,"['all-purpose flmy', 'baking soda', 'salt', 's...",5.0,0.141151
130636,359292,kittencal s creamy greek feta salad dressing,"['mayonnaise', 'garlic clove', 'fresh lemon ju...",5.0,0.133897
7543,20763,double chocolate chip cookies,"['all-purpose flmy', 'baking soda', 'kosher sa...",5.0,0.128492
37809,89007,kittencal s caramelized onions,"['onion', 'butter', 'oil', 'brown sugar', 'sal...",5.0,0.126776
54290,126982,kona inn banana bread,"['sugar', 'butter', 'banana', 'egg', 'flmy', '...",5.0,0.126639


In [18]:
# Test (user is someone who has diverse ratings)
hybrid(svd, user_id=82663, df=df_interactions_train, num_of_recommendations=10)

Unnamed: 0,recipe_id,recipe_name,ingredient_names,predicted_rating,avg_sim_score
17868,43332,paige s buttercream frosting,"['powdered sugar', 'unsalted butter', 'vanilla...",4.764849,0.083368
48918,113608,hawaiian banana nut bread,"['flmy', 'salt', 'baking soda', 'sugar', 'cinn...",4.748388,0.074236
136387,380988,extremely soft white bread bread machine,"['hot water', 'yeast', 'sugar', 'vegetable oil...",4.748135,0.070218
7752,21171,my best zucchini bread,"['sugar', 'vegetable oil', 'egg', 'vanilla', '...",4.738353,0.091688
158450,497261,quick and easy crabby caesar salads 5fix,"['complete caesar salad in a bag', 'mashed pot...",4.732016,0.011674
96108,241455,ginger glazed mahi,"['mahi mahi fillet', 'honey', 'soy sauce', 'ba...",4.731345,0.0322
42829,99909,haluski pan fried cabbage and noodles,"['wide egg noodle', 'butter', 'sweet onion', '...",4.73039,0.054911
23091,55309,caprese salad tomatoes italian marinated toma...,"['roma tomato', 'garlic clove', 'olive oil', '...",4.717016,0.089079
21755,52417,penne sausages a la vodka,"['sausage', 'butter', 'olive oil', 'onion', 'g...",4.713607,0.122849
66813,159273,foolproof one bowl banana cake,"['margarine', 'egg', 'vanilla', 'baking powder...",4.711549,0.115369
