<a href="https://colab.research.google.com/github/Aarushi900/Recipe_Recommendation-_system/blob/main/modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load data
# The full recipe table is read from a pickled DataFrame at a fixed absolute path.
FOOD_PICKLE_PATH = "/content/food.pkl"
data = pd.read_pickle(FOOD_PICKLE_PATH)
data.shape

(216811, 28)

In [None]:
# Draw a reproducible 25% random sample of the recipes (fixed seed)
SAMPLE_FRACTION = 0.25
SAMPLE_SEED = 42
sampled_data = data.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)

# Preview the first rows of the sample
display(sampled_data.head())

# Report the dimensions of the sample
print("Shape of Sampled Dataset:", sampled_data.shape)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,n_steps,steps,description,ingredients,...,submitted_month,submitted_year,dairy-free,gluten-free,low-carb,vegan,vegetarian,recipe_id,mean,count
175675,single serve chocolate cake,447588,6,1808722,2011-01-28,"['15-minutes-or-less', 'time-to-make', 'course...",6,"['1) mix flour , sugar , and cocoa powder in t...","this cake is really simple to make, and it's r...","['flour', 'sugar', 'cocoa powder', 'egg', 'mil...",...,Jan,2011,0,0,0,0,0,447588,2.666667,6
90062,greek grilled jumbo shrimp w lemon oregano ...,175258,50,227750,2006-06-28,"['60-minutes-or-less', 'time-to-make', 'course...",9,['clean and devein shrimp leaving shells in pl...,aahhh another fav from gourmet magazine. perfe...,"['large shrimp', 'garlic cloves', 'salt', 'fre...",...,Jun,2006,0,0,1,0,0,175258,5.0,3
97995,herb roasted pork tenderloin,218182,45,164890,2007-03-22,"['60-minutes-or-less', 'time-to-make', 'main-i...",4,['mix all ingredients but pork in zip-lock bag...,i hate grilling so am always looking for nice ...,"['soy sauce', 'worcestershire sauce', 'thyme',...",...,Mar,2007,0,0,0,0,0,218182,5.0,1
131434,murray s fried spaghetti,226114,20,452940,2007-05-03,"['30-minutes-or-less', 'time-to-make', 'course...",6,['in large skillet heat oil and butter over me...,"a great use of leftovers, and one of bird's da...","['olive oil', 'butter', 'cooked ham', 'onion',...",...,May,2007,0,0,0,0,0,226114,4.181818,11
54435,cornflake energy bites,417135,25,422893,2010-03-18,"['30-minutes-or-less', 'time-to-make', 'course...",4,"['place prunes , apricots and water in medium ...","loved the sound of these as a lunch box treat,...","['pitted prunes', 'dried apricot', 'water', 'r...",...,Mar,2010,0,0,0,0,0,417135,5.0,1


Shape of Sampled Dataset: (54203, 28)


In [None]:
# import libraries
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Tokenizer shared by the Word2Vec and TF-IDF steps

stemmer = nltk.stem.PorterStemmer()
ENGLISH_STOP_WORDS = stopwords.words('english')

# Built once so the per-document work stays cheap:
# - a translation table that deletes every ASCII punctuation character
# - a frozenset of the stop words for constant-time membership checks
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_STOP_WORD_SET = frozenset(ENGLISH_STOP_WORDS)


def recipe_tokenizer(sentence):
    """Turn a piece of text into a list of stemmed, stop-word-free tokens.

    The text has all characters in ``string.punctuation`` removed, is
    lower-cased, split on any run of whitespace (spaces, tabs, newlines),
    filtered against ``ENGLISH_STOP_WORDS`` and finally stemmed with the
    module-level Porter stemmer.

    Parameters
    ----------
    sentence : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Stemmed tokens in their original order; empty if nothing survives.

    Notes
    -----
    NOTE(review): punctuation is stripped before the stop-word check, so any
    stop-word entries that contain an apostrophe can no longer match their
    stripped form in the text - confirm this is acceptable.
    """
    # One pass removes all punctuation; lower-casing is done once afterwards
    cleaned = sentence.translate(_PUNCTUATION_TABLE).lower()

    # split() with no argument never yields empty strings and also breaks on
    # tabs/newlines, so tokens are not left glued together across lines
    return [
        stemmer.stem(word)
        for word in cleaned.split()
        if word not in _STOP_WORD_SET
    ]

In [None]:
# Import libraries
import gensim
from gensim.models import Word2Vec
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Function for word embedding using Word2Vec
def word_embedding(sampled_data, column):
    """Train Word2Vec on one text column and return its word vectors.

    Each entry of ``sampled_data[column]`` is tokenized with
    ``recipe_tokenizer``; a Word2Vec model (100-dimensional vectors, window
    of 5, every token kept because ``min_count=1``, 4 workers) is trained on
    the resulting token lists.

    Returns a dict mapping every vocabulary token to its vector.
    """
    # One token list per row of the chosen column
    token_lists = sampled_data[column].apply(recipe_tokenizer)

    # Fit the embedding model on the tokenized rows
    w2v_model = Word2Vec(token_lists, vector_size=100, window=5, min_count=1, workers=4)

    # Flatten the learned vocabulary into a plain token -> vector lookup
    keyed_vectors = w2v_model.wv
    return {token: keyed_vectors[token] for token in keyed_vectors.index_to_key}

In [None]:
# Helper and entry point for pre-computing and storing the combined embeddings
def _mean_token_vector(tokens, embeddings, embedding_dim):
    """Average the embedding vectors of the tokens that have one.

    Returns a zero vector of length ``embedding_dim`` when none of the tokens
    is present in ``embeddings``.
    """
    known_vectors = [embeddings[token] for token in tokens if token in embeddings]
    if not known_vectors:
        return np.zeros(embedding_dim)
    return np.mean(known_vectors, axis=0)


def precompute_embeddings(sampled_data):
    """Build the per-recipe feature matrix and persist it with the vectorizer.

    For every row of ``sampled_data`` the feature vector is the TF-IDF vector
    of the concatenated ``name``, ``tags`` and ``description`` text followed by
    the mean Word2Vec vector of the row's ``ingredients`` tokens.

    Side effects
    ------------
    * Adds (or overwrites) a ``text_data`` column on ``sampled_data``.
    * Writes ``combined_embeddings.pkl`` (dense array, one row per recipe) and
      ``tfidf_vectorizer.pkl`` (the fitted vectorizer) to the working directory.
    * Prints a confirmation message.

    Parameters
    ----------
    sampled_data : pandas.DataFrame
        Must provide the columns ``name``, ``tags``, ``description`` and
        ``ingredients``.

    Returns
    -------
    None
    """
    # Step 1: Process 'ingredients' using word2vec and create word embeddings
    embeddings = word_embedding(sampled_data, 'ingredients')

    # Take the vector length from the trained embeddings instead of repeating
    # the Word2Vec size as a literal; 100 is only the fallback for an empty dict
    embedding_dim = len(next(iter(embeddings.values()))) if embeddings else 100

    # Steps 2-3: Concatenate the text columns (excluding 'ingredients') and
    # lowercase the result. Missing values are blanked first so they do not
    # end up in the text as the literal token "nan".
    text_columns = sampled_data[['name', 'tags', 'description']]
    sampled_data['text_data'] = (
        text_columns.astype(object).fillna('').astype(str)
        .agg(' '.join, axis=1)
        .str.lower()
    )

    # Step 4: Vectorize the text data (excluding 'ingredients') using TF-IDF
    vectorizer = TfidfVectorizer(min_df=5,
                                 tokenizer=recipe_tokenizer)
    vectorized_data = vectorizer.fit_transform(sampled_data['text_data'])

    # Step 5: Average the word embeddings of each recipe's ingredient tokens
    ingredient_embeddings = [
        _mean_token_vector(recipe_tokenizer(ingredients), embeddings, embedding_dim)
        for ingredients in sampled_data['ingredients']
    ]

    # Step 6: Combine the vectorized data and ingredient embeddings.
    # toarray() densifies the whole TF-IDF matrix, so memory use grows with
    # rows x vocabulary size.
    combined_embeddings = np.concatenate(
        [vectorized_data.toarray(), np.array(ingredient_embeddings)], axis=1
    )

    # Step 7: Store combined embeddings in pkl file
    with open('combined_embeddings.pkl', 'wb') as f:
        pickle.dump(combined_embeddings, f)

    # Step 8: Store the trained TF-IDF vectorizer model in a separate pkl file
    with open('tfidf_vectorizer.pkl', 'wb') as f:
        pickle.dump(vectorizer, f)

    # Step 9: Done!
    print("Text data and TF-IDF vectorizer model stored in pkl files!")



In [None]:
# Build the combined embeddings and fit the TF-IDF vectorizer on the sampled data;
# both are written to pickle files in the current working directory
precompute_embeddings(sampled_data)

Text data and TF-IDF vectorizer model stored in pkl files!


In [None]:
# Function to load the combined embeddings and TF-IDF vectorizer model
def load_embeddings_and_vectorizer():
    """Read the cached embeddings and vectorizer back from disk.

    Loads ``combined_embeddings.pkl`` first and ``tfidf_vectorizer.pkl``
    second, both relative to the current working directory, and returns them
    as the tuple ``(combined_embeddings, vectorizer)``. A missing file
    surfaces as ``FileNotFoundError``.
    """
    def _unpickle(filename):
        # Only meant for files this notebook wrote itself; unpickling
        # untrusted data can execute arbitrary code.
        with open(filename, 'rb') as handle:
            return pickle.load(handle)

    cached_embeddings = _unpickle('combined_embeddings.pkl')
    cached_vectorizer = _unpickle('tfidf_vectorizer.pkl')
    return cached_embeddings, cached_vectorizer

# Function for finding recipes
def find_similar_recipes(sampled_data, user_input, num_similar=5):
    """Return the names of the recipes most similar to a free-text query.

    The cached embeddings and TF-IDF vectorizer are loaded from disk. They are
    (re)generated with ``precompute_embeddings`` when the cache files are
    missing or when the cached matrix does not have one row per row of
    ``sampled_data``. The query is lower-cased, vectorized with the cached
    TF-IDF vectorizer, zero-padded to the width of the cached matrix and
    compared with every recipe by cosine similarity.

    Parameters
    ----------
    sampled_data : pandas.DataFrame
        Recipes the cache was (or will be) built from; must have a ``name``
        column, plus the columns ``precompute_embeddings`` needs if the cache
        has to be rebuilt.
    user_input : str
        Free-text query.
    num_similar : int, default 5
        Maximum number of names to return; values below 1 give an empty list.

    Returns
    -------
    list of str
        Recipe names ordered from most to least similar. Recipes with zero
        similarity are left out, so the list can be shorter than
        ``num_similar`` and is empty when no query term is in the vocabulary.

    Notes
    -----
    NOTE(review): the query has no ingredient-embedding component - those
    columns are zero-padded - so the ingredient vectors never contribute to
    the dot product and only enter the score through each recipe's norm.
    Confirm this is the intended behaviour.
    """
    try:
        combined_embeddings, vectorizer = load_embeddings_and_vectorizer()
    except FileNotFoundError:
        combined_embeddings, vectorizer = None, None

    # Rebuild when the cache is absent or was built from a different frame:
    # row positions from a mismatched matrix would point at the wrong recipes
    if combined_embeddings is None or combined_embeddings.shape[0] != len(sampled_data):
        precompute_embeddings(sampled_data)
        combined_embeddings, vectorizer = load_embeddings_and_vectorizer()

    # Vectorize the lower-cased query with the cached TF-IDF vectorizer
    user_vectorized_data = vectorizer.transform([user_input.lower()])

    # Ensure the number of features in the query matches the cached matrix
    num_missing_features = combined_embeddings.shape[1] - user_vectorized_data.shape[1]
    if num_missing_features > 0:
        # Append zero columns in place of the ingredient-embedding features
        user_vectorized_data = np.pad(
            user_vectorized_data.toarray(), ((0, 0), (0, num_missing_features))
        )

    # Similarity of the single query row against every recipe
    similarity_scores = cosine_similarity(user_vectorized_data, combined_embeddings)[0]

    # Highest scores first, capped at the requested count (never negative)
    top_positions = similarity_scores.argsort()[::-1][:max(num_similar, 0)]

    # Drop zero-score rows: they share no term with the query, and their
    # relative order would be arbitrary
    top_positions = [pos for pos in top_positions if similarity_scores[pos] > 0]

    # Look up the matching recipe names by row position in sampled_data
    return sampled_data.iloc[top_positions]['name'].tolist()

In [None]:
# Smoke test: look up the closest recipes (default of five) for a free-text query
find_similar_recipes(sampled_data, "japanese dishes vegetarian")

['onigiri  japanese rice balls',
 'miso shiru soup',
 'japanese chicken with egg on rice',
 'japanese noodle and mushroom salad',
 'japanese tsume and tare   nitsume or thick  sweet eel sauce']