In [None]:
# Suppress all warnings for cleaner output.
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd

# Read the two source CSV files: recipe metadata first, then the user/recipe interactions.
recipes, ratings = (
    pd.read_csv(csv_path) for csv_path in ("recipes.csv", "interactions.csv")
)

In [3]:
# Show the column labels of each raw frame, recipes first and ratings second.
print(recipes.columns, ratings.columns, sep="\n")

Index(['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
       'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients'],
      dtype='object')
Index(['user_id', 'recipe_id', 'date', 'rating', 'review'], dtype='object')


In [4]:
# Display the ratings DataFrame for a visual check; the rendering of a long frame
# is truncated to its first and last rows.
ratings

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."
...,...,...,...,...,...
1132362,116593,72730,2003-12-09,0,Another approach is to start making sauce with...
1132363,583662,386618,2009-09-29,5,These were so delicious! My husband and I tru...
1132364,157126,78003,2008-06-23,5,WOW! Sometimes I don't take the time to rate ...
1132365,53932,78003,2009-01-11,4,Very good! I used regular port as well. The ...


In [5]:
# Tally the rows whose rating is 0; the per-column non-null counts are shown as the cell output.
zero_rating_mask = ratings['rating'] == 0
count_zero_ratings = ratings.loc[zero_rating_mask].count()
count_zero_ratings

user_id      60847
recipe_id    60847
date         60847
rating       60847
review       60847
dtype: int64

In [6]:
# Zero ratings might not be useful for recommendation, so delete those rows from `ratings` in place.
ratings.drop(index=ratings.index[ratings['rating'] == 0], inplace=True)

In [7]:
# Re-run the zero-rating tally; every count should now be 0.
count_zero_ratings = ratings.loc[ratings.rating == 0].count()
count_zero_ratings

user_id      0
recipe_id    0
date         0
rating       0
review       0
dtype: int64

In [8]:
# Report the number of missing values per column, for recipes and then for ratings.
print(recipes.isnull().sum(), ratings.isnull().sum(), sep="\n")

name                 1
id                   0
minutes              0
contributor_id       0
submitted            0
tags                 0
nutrition            0
n_steps              0
steps                0
description       4979
ingredients          0
n_ingredients        0
dtype: int64
user_id        0
recipe_id      0
date           0
rating         0
review       169
dtype: int64


In [9]:
# Drop ratings rows that have a missing value in any column, and recipes that have no name.
# NOTE(review): the only ratings column with nulls in the check above is `review`, which is
# not selected into ratings_df later, so this discards otherwise usable ratings. Confirm
# that this is intended.
ratings.dropna(inplace=True)
recipes.dropna(subset=['name'], inplace=True)

# A missing description is not a reason to drop a recipe, so fill in placeholder text instead.
# The filled column is assigned back explicitly: calling fillna(inplace=True) on the
# `recipes['description']` selection is chained assignment, which is deprecated and does not
# update `recipes` when pandas Copy-on-Write is active.
recipe_no_description = "Description not available"
recipes['description'] = recipes['description'].fillna(recipe_no_description)

In [10]:
# Create new DataFrames with only the columns needed for further analysis.
# `.copy()` makes each result an independent frame, so a later in-place edit (such as the
# in-place column rename on recipes_df) does not act on a selection of the original frame
# and trigger SettingWithCopyWarning.
recipes_df = recipes[['name', 'id', 'ingredients']].copy()
ratings_df = ratings[['user_id', 'recipe_id', 'rating']].copy()

In [11]:
# Use the same key name as ratings_df: the recipes' 'id' column becomes 'recipe_id'.
recipes_df.rename({'id': 'recipe_id'}, axis='columns', inplace=True)

In [12]:
# Write both trimmed frames to CSV without the index column.
for csv_name, frame in (('recipes_df.csv', recipes_df), ('ratings_df.csv', ratings_df)):
    frame.to_csv(csv_name, index=False)

In [13]:
# Confirm the column sets of both trimmed frames (recipes_df, then ratings_df).
print(recipes_df.columns, ratings_df.columns, sep="\n")

Index(['name', 'recipe_id', 'ingredients'], dtype='object')
Index(['user_id', 'recipe_id', 'rating'], dtype='object')


In [14]:
# Collect fully duplicated rows of each frame, then print both selections
# (an empty frame means there are no duplicates).
duplicates = recipes_df.loc[recipes_df.duplicated()]
duplicates_ratings = ratings_df.loc[ratings_df.duplicated()]
print(duplicates)
print(duplicates_ratings)

Empty DataFrame
Columns: [name, recipe_id, ingredients]
Index: []
Empty DataFrame
Columns: [user_id, recipe_id, rating]
Index: []


In [15]:
# Calculate the average rating for each recipe.
average_ratings = ratings_df.groupby('recipe_id')['rating'].mean().reset_index()
average_ratings.columns = ['recipe_id', 'average_rating']

# Merge the averages onto the recipes. The left join keeps recipes that have no rating;
# their average_rating is NaN after the merge.
# Any 'average_rating' column left over from a previous run is dropped first so the cell is
# idempotent: without that, re-running it would yield 'average_rating_x'/'average_rating_y'
# instead of a single 'average_rating' column.
recipes_df = pd.merge(
    recipes_df.drop(columns='average_rating', errors='ignore'),
    average_ratings,
    on='recipe_id',
    how='left',
)

In [17]:
# Recipes without any rating have a NaN average; fill them with the mean of the per-recipe averages.
# The result is assigned back rather than using fillna(inplace=True) on the column selection:
# that form is chained assignment, which is deprecated and leaves recipes_df unchanged when
# pandas Copy-on-Write is active.
recipes_df['average_rating'] = recipes_df['average_rating'].fillna(recipes_df['average_rating'].mean())

In [18]:
import re

# Precompiled pattern matching runs of characters other than ASCII letters, digits and spaces.
pattern = re.compile("[^a-zA-Z0-9 ]+")

def clean_name(name):
    """Normalise a recipe name for text matching.

    Strings are lowercased, stripped of every character matched by `pattern`,
    and trimmed of leading/trailing whitespace. Non-string values are returned
    unchanged.
    """
    if not isinstance(name, str):
        return name
    lowered = name.lower()
    without_specials = pattern.sub("", lowered)
    return without_specials.strip()

# Add a normalised copy of every recipe name, then re-check the per-column null counts.
recipes_df["clean_name"] = [clean_name(raw_name) for raw_name in recipes_df["name"]]
recipes_df.isnull().sum()

name              0
recipe_id         0
ingredients       0
average_rating    0
clean_name        0
dtype: int64

In [19]:
# Import necessary libraries for text processing and similarity calculations
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Initialize the TfidfVectorizer object with ngram_range set to (1,2).
# This setup makes the vectorizer consider single words and pairs of words.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Generate the TF-IDF matrix for the cleaned names of recipes.
# This matrix helps in quantifying the importance of words
# based on their occurrence frequency across the data.
# NOTE(review): none of the later cells shown in this notebook read `vectorizer` or `tfidf`;
# find_similar_recipes_by_text fits its own vectorizer. Confirm whether this fit is still needed.
tfidf = vectorizer.fit_transform(recipes_df["clean_name"])

In [20]:
# Fitted TF-IDF artefacts per comparison column, so the corpus is vectorised once per column
# instead of on every call (the search box calls the function below on each text change).
_TFIDF_CACHE = {}


def _get_fitted_tfidf(column):
    """Return ``(vectorizer, matrix)`` fitted on ``recipes_df[column]``, reusing a cached fit.

    The cached fit is discarded when ``recipes_df`` has been rebound to another frame or its
    row count has changed. In-place edits that keep the same frame and length are not
    detected; re-run this cell to reset the cache in that case.
    """
    corpus = recipes_df[column]
    cached = _TFIDF_CACHE.get(column)
    stale = (
        cached is None
        or cached['frame'] is not recipes_df
        or cached['matrix'].shape[0] != len(corpus)
    )
    if stale:
        fitted = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", ngram_range=(1, 2))
        cached = {'frame': recipes_df, 'vectorizer': fitted, 'matrix': fitted.fit_transform(corpus)}
        _TFIDF_CACHE[column] = cached
    return cached['vectorizer'], cached['matrix']


# Function to find recipes similar to the user's search query based on text content
def find_similar_recipes_by_text(search_text, based_on='ingredients', top_results=5):
    """Rank recipes by text similarity to ``search_text``, boosted by average rating.

    Parameters
    ----------
    search_text : str
        The user's query.
    based_on : str
        ``'name'`` compares against ``recipes_df['clean_name']``; any other value compares
        against ``recipes_df['ingredients']``.
    top_results : int
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``recipe_id``, ``name``, ``ingredients``, ``average_rating``, ``similarity``
        and ``score``, sorted by ``score`` descending. Only recipes that share at least one
        term with the query (similarity > 0) are included, so the frame is empty when
        nothing matches.
    """
    column = 'clean_name' if based_on == 'name' else 'ingredients'
    tfidf_vectorizer, tfidf_matrix = _get_fitted_tfidf(column)

    # Project the query into the fitted vocabulary and compare it with every recipe vector.
    search_tfidf = tfidf_vectorizer.transform([search_text])
    cosine_similarities = cosine_similarity(search_tfidf, tfidf_matrix).flatten()

    results_df = pd.DataFrame({
        'recipe_id': recipes_df['recipe_id'],
        'name': recipes_df['name'],
        'ingredients': recipes_df['ingredients'],
        'average_rating': recipes_df['average_rating'],
        'similarity': cosine_similarities
    })

    # Score = similarity plus up to 0.5 for the rating, normalised by the best average rating
    # across all recipes.
    rating_boost = results_df['average_rating'] / results_df['average_rating'].max() * 0.5
    results_df['score'] = results_df['similarity'] + rating_boost

    # Without this filter a query that matches nothing would still return the top-rated
    # recipes, because the rating boost alone gives every row a positive score.
    matches = results_df[results_df['similarity'] > 0]
    return matches.sort_values('score', ascending=False).head(top_results)


In [21]:
import ipywidgets as widgets
from IPython.display import display

# Build the controls for the interactive search UI.

# Free-text box that receives the search query.
recipe_name_input = widgets.Text(
    description='Recipe:',
    placeholder='Type a recipe name',
    value='',
    disabled=False,
)

# Selector for the recipe field the query is compared against.
similarity_basis_input = widgets.Dropdown(
    description='Based on:',
    options=['name', 'ingredients'],
    value='ingredients',
    disabled=False,
)

# Output area that holds the rendered recommendations.
recommendation_list = widgets.Output()

In [22]:
# Minimum number of typed characters before a search is triggered.
MIN_QUERY_LENGTH = 4


def _render_recommendations(query):
    """Clear the output area and, if ``query`` is long enough, show the matching recipes."""
    with recommendation_list:
        # Clear previous results
        recommendation_list.clear_output()
        if len(query) < MIN_QUERY_LENGTH:
            return
        # Search using whichever basis is currently selected in the dropdown.
        similar_recipes = find_similar_recipes_by_text(query, based_on=similarity_basis_input.value)
        if not similar_recipes.empty:
            display(similar_recipes)
        else:
            print("No similar recipes found.")


# Function to handle changes in the text input widget
# It triggers when user types text
def on_type(change):
    """Refresh the recommendations for the text now in the box (``change["new"]``)."""
    _render_recommendations(change["new"])


def on_basis_change(change):
    """Re-run the current query when the 'Based on' dropdown is switched.

    Without this handler the displayed results keep reflecting the previous basis until
    the text is edited again.
    """
    _render_recommendations(recipe_name_input.value)


# Watch the text box and the basis dropdown for value changes.
recipe_name_input.observe(on_type, names='value')
similarity_basis_input.observe(on_basis_change, names='value')

# Display widgets and the output container
display(recipe_name_input, similarity_basis_input, recommendation_list)


Text(value='', description='Recipe:', placeholder='Type a recipe name')

Dropdown(description='Based on:', index=1, options=('name', 'ingredients'), value='ingredients')

Output()