# FAISS

In [65]:
import faiss
import pandas as pd
import numpy as np
import ast

# Load data
pp_recipes = pd.read_csv("data/PP_recipes_updated.csv")
raw_recipes = pd.read_csv("data/RAW_recipes.csv")

# Merge
recipes = pd.merge(pp_recipes, raw_recipes, how='left', left_on='id', right_on='id')

# Drop unncessary columns
recipes.drop(['name_tokens','ingredient_tokens','steps_tokens',
              'techniques','ingredient_ids','contributor_id',
              'submitted','tags','steps','description', 'ingredients'],
              axis=1,
              inplace=True)

# Format
recipes.insert(0, 'name', recipes.pop('name'))
recipes.insert(4, 'n_ingredients', recipes.pop('n_ingredients'))
recipes['ingredient_names'] = recipes['ingredient_names'].apply(ast.literal_eval)

# Step 1: Encode Ingredients
unique_ingredients = sorted(set(ingredient for ingredients in recipes['ingredient_names'] for ingredient in ingredients))
ingredient_to_idx = {ingredient: idx for idx, ingredient in enumerate(unique_ingredients)}

def encode_ingredients(ingredients):
    vector = np.zeros(len(unique_ingredients), dtype='float32')
    for ingredient in ingredients:
        if ingredient in ingredient_to_idx:
            vector[ingredient_to_idx[ingredient]] = 1.0
    return vector

# Step 2: Build FAISS Index
vector_size = len(unique_ingredients)
index = faiss.IndexFlatL2(vector_size)  # L2 (Euclidean) distance metric
#index = faiss.IndexIVFPQ(quantizer, vector_size, 100, 7993, 8)

vectors = np.vstack([encode_ingredients(ing) for ing in recipes['ingredient_names']])
index.add(vectors)


In [4]:
# Step 3: Recommendation Function
def recommend_recipes(user_ingredients, top_n=5):
    user_vector = encode_ingredients(user_ingredients).reshape(1, -1)
    _, indices = index.search(user_vector, top_n)
    return recipes.iloc[indices[0]]['name'].tolist()

In [27]:
# Extract the top 5 recipes into a dictionary
user_inputs = dict(zip(
    recipes['name'].head(500),
    recipes['ingredient_names'].head(500)
))

# Generate recommendations and store in DataFrame
output = pd.DataFrame({
    recipe_name: recommend_recipes(ingredients, top_n=11)
    for recipe_name, ingredients in user_inputs.items()
})

## Evaluation

### Mean Average Ingrement Matching (MAIM)

In [40]:
def ingredient_similarity_score(user_ingredients, recommended_recipe_ingredients):
    common_items = set(user_ingredients) & set(recommended_recipe_ingredients)
    total_unique_items = set(user_ingredients) | set(recommended_recipe_ingredients)
    ratio = len(common_items) / len(total_unique_items)
    return ratio

def average_ingredient_similarity(recipe_example):
    # Get list of unique recommended_recipes
    the_list = list(set(output[recipe_example]))

    # Filter recipes based on the_list
    filtered_recipes = recipes[recipes['name'].isin(the_list)].sort_values(
        by='name',
        key=lambda x: pd.Categorical(x, categories=the_list, ordered=True)
    ).reset_index(drop=True)[['name','ingredient_names']]

    # Get average sim score
    average_sim_score = 0
    for index, row in filtered_recipes.iterrows():
        if index!= 0:
            average_sim_score += ingredient_similarity_score(recipes.iloc[0]['ingredient_names'], row['ingredient_names'])

    return average_sim_score / (len(filtered_recipes)-1)

# Calculation
mean_average_sim_score = 0
for col in output.columns:
    mean_average_sim_score += average_ingredient_similarity(col)
print("Mean average similarity score is:")
mean_average_sim_score / len(output.columns)


Mean average similiary score is:


0.05738603059308753

In [None]:
def mean_reciprocal_rank(y_true, y_pred):
    """
    Calculate Mean Reciprocal Rank (MRR)

    Parameters:
    y_true (list of lists): Ground truth relevant items per user
    y_pred (list of lists): Predicted ranked items per user

    Returns:
    float: MRR score
    """
    reciprocal_ranks = []
    for true_items, pred_items in zip(y_true, y_pred):
        rank = 0
        for idx, item in enumerate(pred_items, start=1):
            if item in true_items:
                rank = idx
                break
        reciprocal_ranks.append(1 / rank if rank > 0 else 0)
    return np.mean(reciprocal_ranks)


def mean_average_precision(y_true, y_pred, k=10):
    """
    Calculate Mean Average Precision (MAP) at K

    Parameters:
    y_true (list of lists): Ground truth relevant items per user
    y_pred (list of lists): Predicted ranked items per user
    k (int): Cutoff for precision calculation

    Returns:
    float: MAP score
    """
    average_precisions = []
    for true_items, pred_items in zip(y_true, y_pred):
        hits = 0
        sum_precisions = 0
        for i, item in enumerate(pred_items[:k], start=1):
            if item in true_items:
                hits += 1
                sum_precisions += hits / i
        if hits > 0:
            average_precisions.append(sum_precisions / hits)
        else:
            average_precisions.append(0)
    return np.mean(average_precisions)


# Example usage
y_true = [[1, 3], [2], [4, 5]]  # Ground truth relevant items
y_pred = [[3, 2, 1], [1, 2, 3], [5, 4, 6]]  # Predicted ranked items

mrr_score = mean_reciprocal_rank(y_true, y_pred)
map_score = mean_average_precision(y_true, y_pred, k=3)

print(f"Mean Reciprocal Rank (MRR): {mrr_score:.4f}")
print(f"Mean Average Precision (MAP)@3: {map_score:.4f}")
