# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import requests
import re

# Core Food Recipe Recommendation

In [2]:
# Loading Dataset
core_data_recipe = pd.read_csv('core-data-recipe.csv')

In [3]:
# Showing Dataset
core_data_recipe.head()

Unnamed: 0,recipe_id,recipe_name,image_url,ingredients,cooking_directions,nutritions
0,240488,"Pork Loin, Apples, and Sauerkraut",https://images.media-allrecipes.com/userphotos...,sauerkraut drained^Granny Smith apples sliced^...,{'directions': u'Prep\n15 m\nCook\n2 h 30 m\nR...,"{u'niacin': {u'hasCompleteData': False, u'name..."
1,218939,Foolproof Rosemary Chicken Wings,https://images.media-allrecipes.com/userphotos...,chicken wings^sprigs rosemary^head garlic^oliv...,"{'directions': u""Prep\n20 m\nCook\n40 m\nReady...","{u'niacin': {u'hasCompleteData': True, u'name'..."
2,87211,Chicken Pesto Paninis,https://images.media-allrecipes.com/userphotos...,focaccia bread quartered^prepared basil pesto^...,{'directions': u'Prep\n15 m\nCook\n5 m\nReady ...,"{u'niacin': {u'hasCompleteData': True, u'name'..."
3,245714,Potato Bacon Pizza,https://images.media-allrecipes.com/userphotos...,red potatoes^strips bacon^Sauce:^heavy whippin...,{'directions': u'Prep\n20 m\nCook\n45 m\nReady...,"{u'niacin': {u'hasCompleteData': True, u'name'..."
4,218545,Latin-Inspired Spicy Cream Chicken Stew,https://images.media-allrecipes.com/userphotos...,skinless boneless chicken breast halves^diced ...,{'directions': u'Prep\n10 m\nCook\n8 h 15 m\nR...,"{u'niacin': {u'hasCompleteData': False, u'name..."


In [4]:
# check how data big is or shape of the dataset 
core_data_recipe.shape

(5499, 6)

In [5]:
# Set the desired number of rows
desired_rows = 5000

# Extracting starting 5000 rows from the dataset
core_data_recipe = core_data_recipe.head(n=desired_rows)  

In [6]:
# check how data big is or shape of the dataset
core_data_recipe.shape

(5000, 6)

In [7]:
# check summary or information about the dataset
core_data_recipe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   recipe_id           5000 non-null   int64 
 1   recipe_name         5000 non-null   object
 2   image_url           5000 non-null   object
 3   ingredients         5000 non-null   object
 4   cooking_directions  5000 non-null   object
 5   nutritions          5000 non-null   object
dtypes: int64(1), object(5)
memory usage: 234.5+ KB


In [8]:
# Checking if dataset having null values or not
core_data_recipe.isnull().sum()

recipe_id             0
recipe_name           0
image_url             0
ingredients           0
cooking_directions    0
nutritions            0
dtype: int64

In [9]:
# Checking if dataset having duplicate values or not
core_data_recipe.duplicated().sum()

0

In [10]:
# Dropping 'cooking_directions' column
core_data_recipe = core_data_recipe.drop(columns=['cooking_directions'])

In [11]:
# Showing 2 rows of dataset after dropping 'cooking_directions' column
core_data_recipe.head(2)

Unnamed: 0,recipe_id,recipe_name,image_url,ingredients,nutritions
0,240488,"Pork Loin, Apples, and Sauerkraut",https://images.media-allrecipes.com/userphotos...,sauerkraut drained^Granny Smith apples sliced^...,"{u'niacin': {u'hasCompleteData': False, u'name..."
1,218939,Foolproof Rosemary Chicken Wings,https://images.media-allrecipes.com/userphotos...,chicken wings^sprigs rosemary^head garlic^oliv...,"{u'niacin': {u'hasCompleteData': True, u'name'..."


In [12]:
def preprocess_ingredients(ingredients_str):
    # Split the string into a list of ingredients
    ingredients_list = ingredients_str.split('^')
    # Remove leading and trailing whitespaces from each ingredient
    ingredients_list = [ingredient.strip() for ingredient in ingredients_list]
    return ingredients_list

In [13]:
# Apply preprocessing to the 'ingredients' column
core_data_recipe['ingredients'] = core_data_recipe['ingredients'].apply(preprocess_ingredients)

In [14]:
# Showing 2 rows of dataset after preprocessing 'ingredients' column
core_data_recipe.head(2)

Unnamed: 0,recipe_id,recipe_name,image_url,ingredients,nutritions
0,240488,"Pork Loin, Apples, and Sauerkraut",https://images.media-allrecipes.com/userphotos...,"[sauerkraut drained, Granny Smith apples slice...","{u'niacin': {u'hasCompleteData': False, u'name..."
1,218939,Foolproof Rosemary Chicken Wings,https://images.media-allrecipes.com/userphotos...,"[chicken wings, sprigs rosemary, head garlic, ...","{u'niacin': {u'hasCompleteData': True, u'name'..."


In [15]:
# Function to extract most important nutrients from the 'nutritions' column
def extract_nutrient_names(row):
    nutrients_dict = ast.literal_eval(row)
    key_nutrients = ['calories', 'protein', 'fat', 'carbohydrates', 'fiber', 'saturatedFat', 'cholesterol', 
                     'sodium', 'vitaminA', 'vitaminC', 'vitaminB6', 'niacin', 'calcium', 'iron', 'potassium']
    
    extracted_nutrients = []
    for nutrient in key_nutrients:
        if nutrient in nutrients_dict:
            extracted_nutrients.append(nutrients_dict[nutrient]['name'])
    
    return extracted_nutrients

In [16]:
# Apply the function to extract nutrient names from all rows
extracted_nutrient_names = core_data_recipe['nutritions'].apply(extract_nutrient_names)

In [17]:
core_data_recipe['nutritions'] = extracted_nutrient_names

In [18]:
# Showing 2 rows of dataset after preprocessing 'nutritions' column
core_data_recipe.head(2)

Unnamed: 0,recipe_id,recipe_name,image_url,ingredients,nutritions
0,240488,"Pork Loin, Apples, and Sauerkraut",https://images.media-allrecipes.com/userphotos...,"[sauerkraut drained, Granny Smith apples slice...","[Calories, Protein, Fat, Carbohydrates, Dietar..."
1,218939,Foolproof Rosemary Chicken Wings,https://images.media-allrecipes.com/userphotos...,"[chicken wings, sprigs rosemary, head garlic, ...","[Calories, Protein, Fat, Carbohydrates, Dietar..."


In [19]:
# Removing spaces from the 'ingredients' column
core_data_recipe['ingredients'] = core_data_recipe['ingredients'].apply(lambda x:[i.replace(" ","") for i in x])

In [20]:
# Removing spaces from the 'nutritions' column
core_data_recipe['nutritions'] = core_data_recipe['nutritions'].apply(lambda x: [i.replace(" ", "") if i is not None else None for i in x])

In [21]:
# Showing dataset after removing whitespaces from both the columns 'ingredients' and 'nutritions'
core_data_recipe.head()

Unnamed: 0,recipe_id,recipe_name,image_url,ingredients,nutritions
0,240488,"Pork Loin, Apples, and Sauerkraut",https://images.media-allrecipes.com/userphotos...,"[sauerkrautdrained, GrannySmithapplessliced, l...","[Calories, Protein, Fat, Carbohydrates, Dietar..."
1,218939,Foolproof Rosemary Chicken Wings,https://images.media-allrecipes.com/userphotos...,"[chickenwings, sprigsrosemary, headgarlic, oli...","[Calories, Protein, Fat, Carbohydrates, Dietar..."
2,87211,Chicken Pesto Paninis,https://images.media-allrecipes.com/userphotos...,"[focacciabreadquartered, preparedbasilpesto, d...","[Calories, Protein, Fat, Carbohydrates, Dietar..."
3,245714,Potato Bacon Pizza,https://images.media-allrecipes.com/userphotos...,"[redpotatoes, stripsbacon, Sauce:, heavywhippi...","[Calories, Protein, Fat, Carbohydrates, Dietar..."
4,218545,Latin-Inspired Spicy Cream Chicken Stew,https://images.media-allrecipes.com/userphotos...,"[skinlessbonelesschickenbreasthalves, dicedtom...","[Calories, Protein, Fat, Carbohydrates, Dietar..."


In [22]:
# Making 'tags' name column by adding 'ingredients' and 'nutritions' column
core_data_recipe['tags'] = core_data_recipe['ingredients'] + core_data_recipe['nutritions']

In [23]:
# Showing dataset after making 'tags' column
core_data_recipe.head()

Unnamed: 0,recipe_id,recipe_name,image_url,ingredients,nutritions,tags
0,240488,"Pork Loin, Apples, and Sauerkraut",https://images.media-allrecipes.com/userphotos...,"[sauerkrautdrained, GrannySmithapplessliced, l...","[Calories, Protein, Fat, Carbohydrates, Dietar...","[sauerkrautdrained, GrannySmithapplessliced, l..."
1,218939,Foolproof Rosemary Chicken Wings,https://images.media-allrecipes.com/userphotos...,"[chickenwings, sprigsrosemary, headgarlic, oli...","[Calories, Protein, Fat, Carbohydrates, Dietar...","[chickenwings, sprigsrosemary, headgarlic, oli..."
2,87211,Chicken Pesto Paninis,https://images.media-allrecipes.com/userphotos...,"[focacciabreadquartered, preparedbasilpesto, d...","[Calories, Protein, Fat, Carbohydrates, Dietar...","[focacciabreadquartered, preparedbasilpesto, d..."
3,245714,Potato Bacon Pizza,https://images.media-allrecipes.com/userphotos...,"[redpotatoes, stripsbacon, Sauce:, heavywhippi...","[Calories, Protein, Fat, Carbohydrates, Dietar...","[redpotatoes, stripsbacon, Sauce:, heavywhippi..."
4,218545,Latin-Inspired Spicy Cream Chicken Stew,https://images.media-allrecipes.com/userphotos...,"[skinlessbonelesschickenbreasthalves, dicedtom...","[Calories, Protein, Fat, Carbohydrates, Dietar...","[skinlessbonelesschickenbreasthalves, dicedtom..."


In [24]:
# Making 'new_df1' column to only keep essential columns
new_df1 = core_data_recipe[['recipe_id', 'recipe_name', 'image_url', 'tags']]

In [25]:
# Showing dataset after keeping essential columns
new_df1

Unnamed: 0,recipe_id,recipe_name,image_url,tags
0,240488,"Pork Loin, Apples, and Sauerkraut",https://images.media-allrecipes.com/userphotos...,"[sauerkrautdrained, GrannySmithapplessliced, l..."
1,218939,Foolproof Rosemary Chicken Wings,https://images.media-allrecipes.com/userphotos...,"[chickenwings, sprigsrosemary, headgarlic, oli..."
2,87211,Chicken Pesto Paninis,https://images.media-allrecipes.com/userphotos...,"[focacciabreadquartered, preparedbasilpesto, d..."
3,245714,Potato Bacon Pizza,https://images.media-allrecipes.com/userphotos...,"[redpotatoes, stripsbacon, Sauce:, heavywhippi..."
4,218545,Latin-Inspired Spicy Cream Chicken Stew,https://images.media-allrecipes.com/userphotos...,"[skinlessbonelesschickenbreasthalves, dicedtom..."
...,...,...,...,...
4995,21369,Hash Brown Casserole III,https://images.media-allrecipes.com/userphotos...,"[butter, frozenhashbrowns, shreddedColbycheese..."
4996,218141,Cajun Chicken Pot Pie,https://images.media-allrecipes.com/userphotos...,"[deepdishpastryfordoublecrust, oliveoil, skinl..."
4997,245310,Herbed Grilled Cheese and Pork Sandwiches,https://images.media-allrecipes.com/userphotos...,[Smithfield®Rosemary&OliveOilSeasonedPorkTende...
4998,22162,Uglies,https://images.media-allrecipes.com/userphotos...,"[groundbeefchuck, choppedonion, garlicpowder, ..."


In [26]:
# This code converts lists in the 'tags' column to a single string with items separated by spaces and replaces non-list elements with an empty string
new_df1['tags'] = new_df1['tags'].apply(lambda x: " ".join(str(item) for item in x) if isinstance(x, list) else "")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df1['tags'] = new_df1['tags'].apply(lambda x: " ".join(str(item) for item in x) if isinstance(x, list) else "")


In [27]:
# Showing 2 rows of dataset after preprocessing in 'tags' column
new_df1.head(2)

Unnamed: 0,recipe_id,recipe_name,image_url,tags
0,240488,"Pork Loin, Apples, and Sauerkraut",https://images.media-allrecipes.com/userphotos...,sauerkrautdrained GrannySmithapplessliced larg...
1,218939,Foolproof Rosemary Chicken Wings,https://images.media-allrecipes.com/userphotos...,chickenwings sprigsrosemary headgarlic oliveoi...


In [28]:
# Converting 'tags' column into lowercase
new_df1['tags'] = new_df1['tags'].apply(lambda x: x.lower())

# This line converts all text in the 'recipe_name' column of the new_df3 DataFrame to lowercase
new_df1['recipe_name'] = new_df1['recipe_name'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df1['tags'] = new_df1['tags'].apply(lambda x: x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df1['recipe_name'] = new_df1['recipe_name'].apply(lambda x: x.lower())


In [29]:
# Showing 2 rows of dataset after converting 'tags' columns into lowercase
new_df1.head(2)

Unnamed: 0,recipe_id,recipe_name,image_url,tags
0,240488,"pork loin, apples, and sauerkraut",https://images.media-allrecipes.com/userphotos...,sauerkrautdrained grannysmithapplessliced larg...
1,218939,foolproof rosemary chicken wings,https://images.media-allrecipes.com/userphotos...,chickenwings sprigsrosemary headgarlic oliveoi...


In [30]:
# Showing 1st value of 'tags' column
new_df1['tags'][0]

'sauerkrautdrained grannysmithapplessliced largeonion carawayseeds appleciderdivided brownsugar rub: thaiseasoning salt garlicpowder groundblackpepper bonelessporkloinroast calories protein fat carbohydrates dietaryfiber saturatedfat cholesterol sodium vitamina-iu vitaminc vitaminb6 niacinequivalents calcium iron potassium'

In [31]:
# This code initializes a CountVectorizer object (cv1) with a limit of 5000 features and removes common English stopwords
cv1 = CountVectorizer(max_features=5000, stop_words='english')

In [32]:
# This code transforms the text data in the 'tags' column of DataFrame 'new_df1' into a matrix of token counts using the previously defined CountVectorizer object 'cv1'
vectors1 = cv1.fit_transform(new_df1['tags']).toarray()

In [33]:
# Prints the variable vectors1, which contains the transformed text data in the form of a matrix of token counts
vectors1

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [34]:
# This code retrieves the token count vector representing the first entry in the transformed data stored in the variable vectors1
vectors1[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [35]:
# The code returns an array containing the feature names (i.e., the tokens or words) extracted by the CountVectorizer object cv1
cv1.get_feature_names_out()

array(['10', '100to110degreesf', '10biscuitspercan', ...,
       'zucchinisthinlysliced', 'zucchinithicklysliced',
       'zucchinithinlysliced'], dtype=object)

In [36]:
# This code calculates the total number of unique features extracted by the CountVectorizer object cv1
len(cv1.get_feature_names_out())

5000

In [37]:
# This loop iterates over each feature name extracted by the CountVectorizer object cv1 and prints each feature name individually
for i in cv1.get_feature_names_out():
    print(i)

10
100to110degreesf
10biscuitspercan
10inch
10to12poundseach
11
110degreesf
110degreesfto115degreesf
11x9x23
12
120degreesfto130degreesf
120degreesto130degreesf
125degreesf52degreesc
12x16
12x18
14to16inches
15
16
20
2breadsper250gpackage
2inch
2inchcubes
2inchdiameterhogcasings
2inches
2inchesthick
2inchpieces
2inchrounds
2inchstrips
2inchthick
2inchthickcentercuthamslice
2inchthickfiletmignonsteaks
2inchthickness
2inchthickslices
2inchwidestrips
2pound
31
3lessfatcreamcheesecubed
3ofliquidreserved
40perpound
45degreesc
4cupliquidreserved
4inch
4inchesthick
4inchrounds
4inchslices
4inchslicesagainstthegrain
4inchstrips
4inchthick
4inchthickslices
4x4
5inchdiameter
61
6inch
6x6
7oz
80
85
8inchlong
8inchthickslices
8to10oz
90
93
99
about11
achiote
achiotepaste
acinidipepepasta
acornsquash
acornsquashhalvedandseeded
activedryyeast
addedchickenstock
addeddicedtomatoes
addedtomatosauce
additionalketchup
additionalshreddedcheddarcheese
adoboall
adobosaucefromcannedchilies
adobosaucefromchip

In [38]:
# This line initializes a Porter Stemmer object named ps1, which is commonly used for stemming words in natural language processing tasks
ps1 = PorterStemmer()

In [39]:
# This function stems the input string text using a Porter Stemmer object ps1 and returns the stemmed text as a string
def stem(text):
    y = []
    
    for i in text.split():
        y.append(ps1.stem(i))
        return " ".join(y)

In [40]:
# Applying stem function to each element in tags column
new_df1['tags'] = new_df1['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df1['tags'] = new_df1['tags'].apply(stem)


In [41]:
# This line calculates the cosine similarity between all pairs of rows in the matrix vectors1
similarity1 = cosine_similarity(vectors1)

In [42]:
# This code prints the cosine similarity matrix similarity1, showing the similarity scores between all pairs of rows in the matrix vectors1
similarity1

array([[1.        , 0.64465837, 0.65982888, ..., 0.6172134 , 0.68494952,
        0.73294091],
       [0.64465837, 1.        , 0.74438737, ..., 0.78334945, 0.72727273,
        0.69631062],
       [0.65982888, 0.74438737, 1.        , ..., 0.71269665, 0.74438737,
        0.71269665],
       ...,
       [0.6172134 , 0.78334945, 0.71269665, ..., 1.        , 0.69631062,
        0.66666667],
       [0.68494952, 0.72727273, 0.74438737, ..., 0.69631062, 1.        ,
        0.69631062],
       [0.73294091, 0.69631062, 0.71269665, ..., 0.66666667, 0.69631062,
        1.        ]])

In [43]:
# This code returns the shape of the cosine similarity matrix similarity1, indicating the number of rows and columns in the matrix
similarity1.shape

(5000, 5000)

In [44]:
# This code retrieves the first row of the cosine similarity matrix similarity1
similarity1[0]

array([1.        , 0.64465837, 0.65982888, ..., 0.6172134 , 0.68494952,
       0.73294091])

# Raw Food Recipe Recommendation

In [45]:
# Loading Dataset
raw_data_recipe = pd.read_csv('raw-data-recipe.csv')

In [46]:
# Showing Dataset
raw_data_recipe.head()

Unnamed: 0,recipe_id,recipe_name,aver_rate,image_url,review_nums,ingredients,cooking_directions,nutritions,reviews
0,222388,Homemade Bacon,5.0,https://images.media-allrecipes.com/userphotos...,3,pork belly^smoked paprika^kosher salt^ground b...,{'directions': u'Prep\n5 m\nCook\n2 h 45 m\nRe...,"{u'niacin': {u'hasCompleteData': False, u'name...","{8542392: {'rating': 5, 'followersCount': 11, ..."
1,240488,"Pork Loin, Apples, and Sauerkraut",4.764706,https://images.media-allrecipes.com/userphotos...,29,sauerkraut drained^Granny Smith apples sliced^...,{'directions': u'Prep\n15 m\nCook\n2 h 30 m\nR...,"{u'niacin': {u'hasCompleteData': False, u'name...","{3574785: {'rating': 5, 'followersCount': 0, '..."
2,218939,Foolproof Rosemary Chicken Wings,4.571429,https://images.media-allrecipes.com/userphotos...,12,chicken wings^sprigs rosemary^head garlic^oliv...,"{'directions': u""Prep\n20 m\nCook\n40 m\nReady...","{u'niacin': {u'hasCompleteData': True, u'name'...","{13774946: {'rating': 5, 'followersCount': 0, ..."
3,87211,Chicken Pesto Paninis,4.625,https://images.media-allrecipes.com/userphotos...,163,focaccia bread quartered^prepared basil pesto^...,{'directions': u'Prep\n15 m\nCook\n5 m\nReady ...,"{u'niacin': {u'hasCompleteData': True, u'name'...","{1563136: {'rating': 5, 'followersCount': 0, '..."
4,245714,Potato Bacon Pizza,4.5,https://images.media-allrecipes.com/userphotos...,2,red potatoes^strips bacon^Sauce:^heavy whippin...,{'directions': u'Prep\n20 m\nCook\n45 m\nReady...,"{u'niacin': {u'hasCompleteData': True, u'name'...","{2945555: {'rating': 5, 'followersCount': 6690..."


In [47]:
# check how data big is or shape of the dataset 
raw_data_recipe.shape

(49698, 9)

In [48]:
# Set the desired number of rows
desired_rows = 5000

# Extracting starting 5000 rows from the dataset
raw_data_recipe = raw_data_recipe.head(n=desired_rows)  

In [49]:
# check how data big is or shape of the dataset
raw_data_recipe.shape

(5000, 9)

In [50]:
# check summary or information about the dataset
raw_data_recipe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   recipe_id           5000 non-null   int64  
 1   recipe_name         5000 non-null   object 
 2   aver_rate           5000 non-null   float64
 3   image_url           5000 non-null   object 
 4   review_nums         5000 non-null   int64  
 5   ingredients         5000 non-null   object 
 6   cooking_directions  5000 non-null   object 
 7   nutritions          5000 non-null   object 
 8   reviews             5000 non-null   object 
dtypes: float64(1), int64(2), object(6)
memory usage: 351.7+ KB


In [51]:
# Checking if dataset having null values or not
raw_data_recipe.isnull().sum()

recipe_id             0
recipe_name           0
aver_rate             0
image_url             0
review_nums           0
ingredients           0
cooking_directions    0
nutritions            0
reviews               0
dtype: int64

In [52]:
# Checking if dataset having duplicate values or not
raw_data_recipe.duplicated().sum()

0

In [53]:
# Dropping 'aver_rate', 'review_nums', 'cooking_directions' column
raw_data_recipe = raw_data_recipe.drop(columns=['aver_rate','review_nums','cooking_directions']) 

In [54]:
# Showing 2 rows of dataset after dropping columns
raw_data_recipe.head(2)

Unnamed: 0,recipe_id,recipe_name,image_url,ingredients,nutritions,reviews
0,222388,Homemade Bacon,https://images.media-allrecipes.com/userphotos...,pork belly^smoked paprika^kosher salt^ground b...,"{u'niacin': {u'hasCompleteData': False, u'name...","{8542392: {'rating': 5, 'followersCount': 11, ..."
1,240488,"Pork Loin, Apples, and Sauerkraut",https://images.media-allrecipes.com/userphotos...,sauerkraut drained^Granny Smith apples sliced^...,"{u'niacin': {u'hasCompleteData': False, u'name...","{3574785: {'rating': 5, 'followersCount': 0, '..."


In [55]:
# Apply preprocessing to the 'ingredients' column
raw_data_recipe['ingredients'] = raw_data_recipe['ingredients'].apply(preprocess_ingredients)

In [56]:
# Showing 2 rows of dataset after preprocessing 'ingredients' column
raw_data_recipe.head(2)

Unnamed: 0,recipe_id,recipe_name,image_url,ingredients,nutritions,reviews
0,222388,Homemade Bacon,https://images.media-allrecipes.com/userphotos...,"[pork belly, smoked paprika, kosher salt, grou...","{u'niacin': {u'hasCompleteData': False, u'name...","{8542392: {'rating': 5, 'followersCount': 11, ..."
1,240488,"Pork Loin, Apples, and Sauerkraut",https://images.media-allrecipes.com/userphotos...,"[sauerkraut drained, Granny Smith apples slice...","{u'niacin': {u'hasCompleteData': False, u'name...","{3574785: {'rating': 5, 'followersCount': 0, '..."


In [57]:
# Apply the function to extract nutrient names from all rows
extracted_nutrient_names = raw_data_recipe['nutritions'].apply(extract_nutrient_names)

In [58]:
raw_data_recipe['nutritions'] = extracted_nutrient_names

In [59]:
# Showing 2 rows of dataset after preprocessing 'nutritions' column
raw_data_recipe.head(2)

Unnamed: 0,recipe_id,recipe_name,image_url,ingredients,nutritions,reviews
0,222388,Homemade Bacon,https://images.media-allrecipes.com/userphotos...,"[pork belly, smoked paprika, kosher salt, grou...","[Calories, Protein, Fat, Carbohydrates, Dietar...","{8542392: {'rating': 5, 'followersCount': 11, ..."
1,240488,"Pork Loin, Apples, and Sauerkraut",https://images.media-allrecipes.com/userphotos...,"[sauerkraut drained, Granny Smith apples slice...","[Calories, Protein, Fat, Carbohydrates, Dietar...","{3574785: {'rating': 5, 'followersCount': 0, '..."


In [60]:
# his code retrieves the first review from the 'raw_data_recipe'
raw_data_recipe['reviews'][0]

'{8542392: {\'rating\': 5, \'followersCount\': 11, \'madeRecipesCount\': 18, \'favoritesCount\': 200, \'dateLastModified\': u\'2017-04-22T12:46:43.663\', \'text\': u"Best breakfast ever! I ran out of paprika while seasoning, so I used garlic piercer on the other half of the batch. Very good! Can\'t wait to make it again... and figure out how to use the drippings!", \'followingCount\': 0}, 11174581: {\'rating\': 5, \'followersCount\': 8, \'madeRecipesCount\': 55, \'favoritesCount\': 101, \'dateLastModified\': u\'2013-06-20T15:50:25.96\', \'text\': u"Awesome!\\nIt\'s amazing.", \'followingCount\': 0}, 8262477: {\'rating\': 5, \'followersCount\': 0, \'madeRecipesCount\': 1, \'favoritesCount\': 52, \'dateLastModified\': u\'2015-02-14T07:27:51.307\', \'text\': u\'The flavors came together well and it really was simple to prepare. My husband and I both enjoyed it!\', \'followingCount\': 0}}\n'

In [61]:
# This function extracts text data from a review dictionary
def extract_text(review):
    # Assuming 'review' is a string containing the review data
    try:
        review_dict = ast.literal_eval(review)
        texts = [entry['text'] for entry in review_dict.values()]
        return texts
    except Exception as e:
        print("Error:", e)
        return None

In [62]:
# Apply the function to extract text data from 'reviews' column
raw_data_recipe['reviews'] = raw_data_recipe['reviews'].apply(extract_text)

In [63]:
# Showing dataset after preprocessing in 'reviews' column
raw_data_recipe.head()

Unnamed: 0,recipe_id,recipe_name,image_url,ingredients,nutritions,reviews
0,222388,Homemade Bacon,https://images.media-allrecipes.com/userphotos...,"[pork belly, smoked paprika, kosher salt, grou...","[Calories, Protein, Fat, Carbohydrates, Dietar...",[Best breakfast ever! I ran out of paprika whi...
1,240488,"Pork Loin, Apples, and Sauerkraut",https://images.media-allrecipes.com/userphotos...,"[sauerkraut drained, Granny Smith apples slice...","[Calories, Protein, Fat, Carbohydrates, Dietar...","[Like most, I changed it a bit. Not a fan of ..."
2,218939,Foolproof Rosemary Chicken Wings,https://images.media-allrecipes.com/userphotos...,"[chicken wings, sprigs rosemary, head garlic, ...","[Calories, Protein, Fat, Carbohydrates, Dietar...","[This is so good and very easy to make., i enj..."
3,87211,Chicken Pesto Paninis,https://images.media-allrecipes.com/userphotos...,"[focaccia bread quartered, prepared basil pest...","[Calories, Protein, Fat, Carbohydrates, Dietar...",[We love this recipe. It tastes like somethin...
4,245714,Potato Bacon Pizza,https://images.media-allrecipes.com/userphotos...,"[red potatoes, strips bacon, Sauce:, heavy whi...","[Calories, Protein, Fat, Carbohydrates, Dietar...",[Delicious! I did not use the recipe for the d...


In [64]:
# This line removes any spaces from each ingredient in the 'ingredients' list within the raw_data_recipe
raw_data_recipe['ingredients'] = raw_data_recipe['ingredients'].apply(lambda x:[i.replace(" ","") for i in x])

In [65]:
# This line removes any spaces from each nutrition item in the 'nutritions' list within the raw_data_recipe, handling None values gracefully
raw_data_recipe['nutritions'] = raw_data_recipe['nutritions'].apply(lambda x: [i.replace(" ", "") if i is not None else None for i in x])

In [66]:
# This line creates a new column 'tag' in the raw_data_recipe DataFrame by concatenating the 'ingredients', 'nutritions', and 'reviews' columns
raw_data_recipe['tag'] = raw_data_recipe['ingredients'] + raw_data_recipe['nutritions'] + raw_data_recipe['reviews']

In [67]:
# Showing Dataset 
raw_data_recipe.head()

Unnamed: 0,recipe_id,recipe_name,image_url,ingredients,nutritions,reviews,tag
0,222388,Homemade Bacon,https://images.media-allrecipes.com/userphotos...,"[porkbelly, smokedpaprika, koshersalt, groundb...","[Calories, Protein, Fat, Carbohydrates, Dietar...",[Best breakfast ever! I ran out of paprika whi...,"[porkbelly, smokedpaprika, koshersalt, groundb..."
1,240488,"Pork Loin, Apples, and Sauerkraut",https://images.media-allrecipes.com/userphotos...,"[sauerkrautdrained, GrannySmithapplessliced, l...","[Calories, Protein, Fat, Carbohydrates, Dietar...","[Like most, I changed it a bit. Not a fan of ...","[sauerkrautdrained, GrannySmithapplessliced, l..."
2,218939,Foolproof Rosemary Chicken Wings,https://images.media-allrecipes.com/userphotos...,"[chickenwings, sprigsrosemary, headgarlic, oli...","[Calories, Protein, Fat, Carbohydrates, Dietar...","[This is so good and very easy to make., i enj...","[chickenwings, sprigsrosemary, headgarlic, oli..."
3,87211,Chicken Pesto Paninis,https://images.media-allrecipes.com/userphotos...,"[focacciabreadquartered, preparedbasilpesto, d...","[Calories, Protein, Fat, Carbohydrates, Dietar...",[We love this recipe. It tastes like somethin...,"[focacciabreadquartered, preparedbasilpesto, d..."
4,245714,Potato Bacon Pizza,https://images.media-allrecipes.com/userphotos...,"[redpotatoes, stripsbacon, Sauce:, heavywhippi...","[Calories, Protein, Fat, Carbohydrates, Dietar...",[Delicious! I did not use the recipe for the d...,"[redpotatoes, stripsbacon, Sauce:, heavywhippi..."


In [68]:
# This line creates a new DataFrame new_df2 containing only the columns 'recipe_id', 'recipe_name', 'image_url', and 'tag' from the raw_data_recipe DataFrame
new_df2 = raw_data_recipe[['recipe_id', 'recipe_name', 'image_url', 'tag']]

In [69]:
# This line modifies the 'tag' column in the new_df2 DataFrame to join each list element into a single string, separated by a space, if the element is a list. If the element is not a list, it sets the value to an empty string
new_df2['tag'] = new_df2['tag'].apply(lambda x: " ".join(str(item) for item in x) if isinstance(x, list) else "")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df2['tag'] = new_df2['tag'].apply(lambda x: " ".join(str(item) for item in x) if isinstance(x, list) else "")


In [70]:
# Showing 2 rows from the dataset after doing preprocesing
new_df2.head(2)

Unnamed: 0,recipe_id,recipe_name,image_url,tag
0,222388,Homemade Bacon,https://images.media-allrecipes.com/userphotos...,porkbelly smokedpaprika koshersalt groundblack...
1,240488,"Pork Loin, Apples, and Sauerkraut",https://images.media-allrecipes.com/userphotos...,sauerkrautdrained GrannySmithapplessliced larg...


In [71]:
# This line converts all text in the 'tag' column of the new_df2 DataFrame to lowercase
new_df2['tag'] = new_df2['tag'].apply(lambda x: x.lower())

# This line converts all text in the 'recipe_name' column of the new_df3 DataFrame to lowercase
new_df2['recipe_name'] = new_df2['recipe_name'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df2['tag'] = new_df2['tag'].apply(lambda x: x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df2['recipe_name'] = new_df2['recipe_name'].apply(lambda x: x.lower())


In [72]:
# Showing 2 rows from the dataset after converting 'tag' column into lowercase
new_df2.head(2)

Unnamed: 0,recipe_id,recipe_name,image_url,tag
0,222388,homemade bacon,https://images.media-allrecipes.com/userphotos...,porkbelly smokedpaprika koshersalt groundblack...
1,240488,"pork loin, apples, and sauerkraut",https://images.media-allrecipes.com/userphotos...,sauerkrautdrained grannysmithapplessliced larg...


In [73]:
# This line initializes a CountVectorizer object named cv2 with a maximum of 5000 features and removes English stop words during the tokenization process
cv2 = CountVectorizer(max_features=5000, stop_words='english')

In [74]:
# This line fits and transforms the 'tag' column of the new_df2 DataFrame into a matrix of token counts using the CountVectorizer object cv2, and then converts it to a NumPy array
vectors2 = cv2.fit_transform(new_df2['tag']).toarray()

In [75]:
# This line prints the matrix representation of the transformed text data stored in the variable vectors2
vectors2

array([[ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0, 10,  0, ...,  0,  0,  0]], dtype=int64)

In [76]:
# This line retrieves the first row (vector) of the matrix representation of the transformed text data stored in the variable vectors2
vectors2[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [77]:
# This line retrieves the feature names (tokens) generated by the CountVectorizer object cv2
cv2.get_feature_names_out()

array(['00', '10', '100', ..., 'zuccini', 'zuchini', 'zuchinni'],
      dtype=object)

In [78]:
# This line retrieves the number of feature names (tokens) generated by the CountVectorizer object cv2
len(cv2.get_feature_names_out())

5000

In [79]:
# This loop iterates over each feature name (token) generated by the CountVectorizer object cv2 and prints each one
for i in cv2.get_feature_names_out():
    print(i)

00
10
100
10oz
11
12
120
125
12oz
13
130
135
13x9
14
140
145
14oz
15
150
15min
15oz
16
160
165
16oz
17
170
18
180
19
1c
1cup
1hr
1lb
1st
1t
1tbs
1tbsp
1tsp
20
200
2012
20min
21
22
225
23
24
25
250
26
27
275
28
28oz
29
2c
2cup
2cups
2hrs
2lb
2lbs
2nd
2nds
2t
2tbs
2tbsp
2tsp
2x
30
300
30min
32
325
32oz
35
350
350f
36
375
3c
3lb
3lbs
3rd
3rds
3t
3x
40
400
425
45
450
460
48
4c
4lb
4oz
4t
4th
50
500
55
57
5lb
5lbs
5oz
60
6oz
70
75
80
85
8oz
8x8
90
93
98
99
9x13
9x9
a1
able
absolute
absolutely
absolutley
absolutly
absorb
absorbed
absorbs
abundance
accent
acceptable
access
accident
accidentally
accidently
accommodate
accomodate
accompanied
accompaniment
accompany
according
accordingly
account
accurate
accustomed
achieve
acid
acidic
acidity
acini
acorn
actual
actually
ad
adams
adapt
adaptable
adapted
add
added
addicted
addicting
addictive
adding
addition
additional
additionally
additions
adds
adequate
adjust
adjusted
adjusting
adjustment
adjustments
admit
admittedly
adobe
adobo
adore
adored
ad

In [80]:
# This line applies a stemming function to each element in the 'tag' column of the new_df2 DataFrame
new_df2['tag'] = new_df2['tag'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df2['tag'] = new_df2['tag'].apply(stem)


In [81]:
# This line computes the cosine similarity between the rows of the vectors2 matrix, resulting in a similarity matrix named similarity2
similarity2 = cosine_similarity(vectors2)

In [82]:
# This line prints the cosine similarity matrix similarity2, which represents the pairwise cosine similarity between the rows of the vectors2 matrix
similarity2

array([[1.        , 0.2191303 , 0.22452606, ..., 0.46355253, 0.21901702,
        0.19319454],
       [0.2191303 , 1.        , 0.33844059, ..., 0.15954282, 0.41238356,
        0.59277756],
       [0.22452606, 0.33844059, 1.        , ..., 0.22401626, 0.35591366,
        0.43520561],
       ...,
       [0.46355253, 0.15954282, 0.22401626, ..., 1.        , 0.19546751,
        0.15039239],
       [0.21901702, 0.41238356, 0.35591366, ..., 0.19546751, 1.        ,
        0.41262211],
       [0.19319454, 0.59277756, 0.43520561, ..., 0.15039239, 0.41262211,
        1.        ]])

In [83]:
# This line retrieves the shape of the cosine similarity matrix similarity2
similarity2.shape

(5000, 5000)

In [84]:
# This line retrieves the first row of the cosine similarity matrix similarity2
similarity2[0]

array([1.        , 0.2191303 , 0.22452606, ..., 0.46355253, 0.21901702,
       0.19319454])

# Indian Food Recipe Recommendation

In [85]:
# Loading Dataset
food = pd.read_csv('indian_food.csv')

In [86]:
# Showing Dataset
food.head()

Unnamed: 0,name,ingredients,diet,prep_time,cook_time,flavor_profile,course,state,region,image_url
0,Balu shahi,"Maida flour, yogurt, oil, sugar",vegetarian,45,25,sweet,dessert,West Bengal,East,https://upload.wikimedia.org/wikipedia/commons...
1,Boondi,"Gram flour, ghee, sugar",vegetarian,80,30,sweet,dessert,Rajasthan,West,https://upload.wikimedia.org/wikipedia/commons...
2,Gajar ka halwa,"Carrots, milk, sugar, ghee, cashews, raisins",vegetarian,15,60,sweet,dessert,Punjab,North,https://greedyeats.com/wp-content/uploads/2023...
3,Ghevar,"Flour, ghee, kewra, milk, clarified butter, su...",vegetarian,15,30,sweet,dessert,Rajasthan,West,https://www.cookwithmanali.com/wp-content/uplo...
4,Gulab jamun,"Milk powder, plain flour, baking powder, ghee,...",vegetarian,15,40,sweet,dessert,West Bengal,East,https://recipes.net/wp-content/uploads/2023/05...


In [87]:
# check how data big is or shape of the dataset 
food.shape

(255, 10)

In [88]:
# check summary or information about the dataset
food.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255 entries, 0 to 254
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   name            255 non-null    object
 1   ingredients     255 non-null    object
 2   diet            255 non-null    object
 3   prep_time       255 non-null    int64 
 4   cook_time       255 non-null    int64 
 5   flavor_profile  255 non-null    object
 6   course          255 non-null    object
 7   state           255 non-null    object
 8   region          254 non-null    object
 9   image_url       255 non-null    object
dtypes: int64(2), object(8)
memory usage: 20.0+ KB


In [89]:
# Checking if dataset having null values or not
food.isnull().sum()

name              0
ingredients       0
diet              0
prep_time         0
cook_time         0
flavor_profile    0
course            0
state             0
region            1
image_url         0
dtype: int64

In [90]:
# Dropping null values from the dataset
food.dropna(inplace=True)

In [91]:
# Checking if dataset having null values or not
food.isnull().sum()

name              0
ingredients       0
diet              0
prep_time         0
cook_time         0
flavor_profile    0
course            0
state             0
region            0
image_url         0
dtype: int64

In [92]:
# Checking if dataset having duplicate values or not
food.duplicated().sum()

0

In [93]:
# Renamming the column 'name'
food.rename(columns={'name': 'recipe_name'}, inplace=True)

In [94]:
# Dropping columns
food = food.drop(columns=['prep_time','cook_time','state','region'])

In [95]:
# Showing dataset
food.head()

Unnamed: 0,recipe_name,ingredients,diet,flavor_profile,course,image_url
0,Balu shahi,"Maida flour, yogurt, oil, sugar",vegetarian,sweet,dessert,https://upload.wikimedia.org/wikipedia/commons...
1,Boondi,"Gram flour, ghee, sugar",vegetarian,sweet,dessert,https://upload.wikimedia.org/wikipedia/commons...
2,Gajar ka halwa,"Carrots, milk, sugar, ghee, cashews, raisins",vegetarian,sweet,dessert,https://greedyeats.com/wp-content/uploads/2023...
3,Ghevar,"Flour, ghee, kewra, milk, clarified butter, su...",vegetarian,sweet,dessert,https://www.cookwithmanali.com/wp-content/uplo...
4,Gulab jamun,"Milk powder, plain flour, baking powder, ghee,...",vegetarian,sweet,dessert,https://recipes.net/wp-content/uploads/2023/05...


In [96]:
# This line creates a new column 'tags' in the 'food' DataFrame by concatenating the 'ingredients', 'diet', 'flavor_profile', and 'course' columns
food['tags'] = food['ingredients'] + food['diet'] + food['flavor_profile'] + food['course']

In [97]:
# Showing Dataset
food.head()

Unnamed: 0,recipe_name,ingredients,diet,flavor_profile,course,image_url,tags
0,Balu shahi,"Maida flour, yogurt, oil, sugar",vegetarian,sweet,dessert,https://upload.wikimedia.org/wikipedia/commons...,"Maida flour, yogurt, oil, sugarvegetariansweet..."
1,Boondi,"Gram flour, ghee, sugar",vegetarian,sweet,dessert,https://upload.wikimedia.org/wikipedia/commons...,"Gram flour, ghee, sugarvegetariansweetdessert"
2,Gajar ka halwa,"Carrots, milk, sugar, ghee, cashews, raisins",vegetarian,sweet,dessert,https://greedyeats.com/wp-content/uploads/2023...,"Carrots, milk, sugar, ghee, cashews, raisinsve..."
3,Ghevar,"Flour, ghee, kewra, milk, clarified butter, su...",vegetarian,sweet,dessert,https://www.cookwithmanali.com/wp-content/uplo...,"Flour, ghee, kewra, milk, clarified butter, su..."
4,Gulab jamun,"Milk powder, plain flour, baking powder, ghee,...",vegetarian,sweet,dessert,https://recipes.net/wp-content/uploads/2023/05...,"Milk powder, plain flour, baking powder, ghee,..."


In [98]:
# This line creates a new DataFrame new_df3 containing only the 'food_name' and 'tags' columns from the 'food' DataFrame
new_df3 = food[['recipe_name','image_url','tags']]

In [99]:
# Show dataset
new_df3.head()

Unnamed: 0,recipe_name,image_url,tags
0,Balu shahi,https://upload.wikimedia.org/wikipedia/commons...,"Maida flour, yogurt, oil, sugarvegetariansweet..."
1,Boondi,https://upload.wikimedia.org/wikipedia/commons...,"Gram flour, ghee, sugarvegetariansweetdessert"
2,Gajar ka halwa,https://greedyeats.com/wp-content/uploads/2023...,"Carrots, milk, sugar, ghee, cashews, raisinsve..."
3,Ghevar,https://www.cookwithmanali.com/wp-content/uplo...,"Flour, ghee, kewra, milk, clarified butter, su..."
4,Gulab jamun,https://recipes.net/wp-content/uploads/2023/05...,"Milk powder, plain flour, baking powder, ghee,..."


In [100]:
# This line retrieves the value of the 'tags' column for the first row in the new_df3 DataFrame
new_df3['tags'][0]

'Maida flour, yogurt, oil, sugarvegetariansweetdessert'

In [101]:
# This line removes commas from each entry in the 'tags' column of the new_df3 DataFrame
new_df3['tags'] = new_df3['tags'].str.replace(',', '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df3['tags'] = new_df3['tags'].str.replace(',', '')


In [102]:
# This line converts all text in the 'tags' column of the new_df3 DataFrame to lowercase
new_df3['tags'] = new_df3['tags'].apply(lambda x: x.lower())

# This line converts all text in the 'recipe_name' column of the new_df3 DataFrame to lowercase
new_df3['recipe_name'] = new_df3['recipe_name'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df3['tags'] = new_df3['tags'].apply(lambda x: x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df3['recipe_name'] = new_df3['recipe_name'].apply(lambda x: x.lower())


In [103]:
# This line retrieves the value of the 'tags' column for the first row in the new_df3 DataFrame, after applying the lowercase transformation
new_df3['tags'][0]

'maida flour yogurt oil sugarvegetariansweetdessert'

In [104]:
# This line initializes a CountVectorizer object named cv3 with a maximum of 254 features and removes English stop words during the tokenization process
cv3 = CountVectorizer(max_features=254, stop_words='english')

In [105]:
# This line transforms the text data in the 'tags' column of the new_df3 DataFrame into a matrix of token counts using the CountVectorizer object cv3, and then converts it to a NumPy array
vectors3 = cv3.fit_transform(new_df3['tags']).toarray()

In [106]:
# This line prints the matrix representation of the transformed text data stored in the variable vectors3
vectors3

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [107]:
# This line retrieves the first row (vector) of the matrix representation of the transformed text data stored in the variable vectors3
vectors3[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], dtype=int64)

In [108]:
# This line retrieves the feature names (tokens) generated by the CountVectorizer object cv3
cv3.get_feature_names_out()

array(['1main', '1snack', 'almonds', 'almondsvegetariansweetdessert',
       'aloo', 'alum', 'amaranth', 'amchur', 'anise', 'arbi', 'arhar',
       'atta', 'avocado', 'baby', 'baking', 'banana', 'basmati', 'bay',
       'beans', 'bell', 'bengal', 'besan', 'biryani', 'bitter', 'black',
       'boiled', 'bombay', 'boondi', 'bottle', 'bread', 'breasts',
       'brinjal', 'brinjalvegetarianspicymain', 'brown', 'buffalo',
       'butter', 'butternon', 'buttervegetarianspicymain',
       'buttervegetarianspicysnack', 'buttervegetariansweetdessert',
       'cabbage', 'canned', 'capsicumvegetarianspicymain', 'cardamom',
       'cardamomvegetariansweetdessert', 'carrot', 'cashew', 'cashews',
       'chana', 'cheese', 'chenna', 'chhena', 'chicken', 'chickpea',
       'chickpeas', 'chili', 'chilies', 'chilivegetarianspicymain',
       'chilivegetarianspicysnack', 'chilli', 'chillies', 'chilliesnon',
       'chilliesvegetarianspicymain', 'chillivegetarian',
       'chillivegetarianspicymain', 'chi

In [109]:
# This line retrieves the number of feature names (tokens) generated by the CountVectorizer object cv3
len(cv3.get_feature_names_out())

254

In [110]:
# This loop iterates over each feature name (token) generated by the CountVectorizer object cv3 and prints each one
for i in cv3.get_feature_names_out():
    print(i)

1main
1snack
almonds
almondsvegetariansweetdessert
aloo
alum
amaranth
amchur
anise
arbi
arhar
atta
avocado
baby
baking
banana
basmati
bay
beans
bell
bengal
besan
biryani
bitter
black
boiled
bombay
boondi
bottle
bread
breasts
brinjal
brinjalvegetarianspicymain
brown
buffalo
butter
butternon
buttervegetarianspicymain
buttervegetarianspicysnack
buttervegetariansweetdessert
cabbage
canned
capsicumvegetarianspicymain
cardamom
cardamomvegetariansweetdessert
carrot
cashew
cashews
chana
cheese
chenna
chhena
chicken
chickpea
chickpeas
chili
chilies
chilivegetarianspicymain
chilivegetarianspicysnack
chilli
chillies
chilliesnon
chilliesvegetarianspicymain
chillivegetarian
chillivegetarianspicymain
chillivegetarianspicysnack
chole
cinnamon
cinnamonnon
clarified
coconut
coconutvegetarian
coconutvegetarianspicymain
coconutvegetarianspicysnack
coconutvegetariansweetdessert
condensed
cooked
corn
cottage
course
cream
cucumber
curd
curry
dal
dates
desiccated
dough
dried
drumsticks
dry
fat
fennel
fenugre

In [111]:
# This line applies a stemming function to each element in the 'tags' column of the new_df3 DataFrame
new_df3['tags'] = new_df3['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df3['tags'] = new_df3['tags'].apply(stem)


In [112]:
# Find the index of the row where recipe_name is 'shahi paneer'
shahi_paneer_index = new_df3[new_df3['recipe_name'] == 'shahi paneer'].index

# Check if 'shahi paneer' exists in the DataFrame
if not shahi_paneer_index.empty:
    # Drop the row corresponding to 'shahi paneer' using its index
    new_df3.drop(shahi_paneer_index, inplace=True)
    print("Row for 'shahi paneer' deleted successfully.")
else:
    print("No data found for 'shahi paneer'.")

Row for 'shahi paneer' deleted successfully.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df3.drop(shahi_paneer_index, inplace=True)


In [113]:
# This line computes the cosine similarity between the rows of the vectors3 matrix, resulting in a similarity matrix named similarity3
similarity3 = cosine_similarity(vectors3)

In [114]:
# This line prints the cosine similarity matrix similarity3, which represents the pairwise cosine similarity between the rows of the vectors3 matrix
similarity3

array([[1.        , 0.4472136 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.4472136 , 1.        , 0.2236068 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.2236068 , 1.        , ..., 0.        , 0.1490712 ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.12598816,
        0.        ],
       [0.        , 0.        , 0.1490712 , ..., 0.12598816, 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [115]:
# This line retrieves the shape of the cosine similarity matrix similarity3
similarity3.shape

(254, 254)

In [116]:
# This line retrieves the first row of the cosine similarity matrix similarity3, which represents the cosine similarity scores between the first food item and all other food items in the dataset
similarity3[0]

array([1.        , 0.4472136 , 0.        , 0.13483997, 0.1118034 ,
       0.        , 0.26967994, 0.2236068 , 0.2236068 , 0.        ,
       0.4472136 , 0.4472136 , 0.1490712 , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.2       , 0.18257419,
       0.        , 0.        , 0.        , 0.16903085, 0.        ,
       0.        , 0.2236068 , 0.18257419, 0.2236068 , 0.        ,
       0.        , 0.2236068 , 0.        , 0.        , 0.        ,
       0.        , 0.36514837, 0.2236068 , 0.12909944, 0.        ,
       0.        , 0.        , 0.18257419, 0.51639778, 0.        ,
       0.1490712 , 0.31622777, 0.25819889, 0.        , 0.        ,
       0.2       , 0.        , 0.        , 0.2       , 0.2236068 ,
       0.15811388, 0.        , 0.        , 0.        , 0.31622777,
       0.2236068 , 0.4       , 0.        , 0.25819889, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.15811388,
       0.        , 0.        , 0.        , 0.3380617 , 0.     

In [117]:
def get_food_recommendations_from_api(food):
    app_id = 'YOUR_API_ID'
    app_key = 'YOUR_API_KEY'
    
    url = f'https://api.edamam.com/api/recipes/v2?type=public&app_id={app_id}&app_key={app_key}&q={food}'

    try:
        # Make a GET request to the API
        response = requests.get(url)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Parse the JSON response
            api_data = response.json()

            # Initialize an empty list to store food labels
            labels = []

            # Extracting labels from the response
            hits = api_data.get('hits', [])
            for hit in hits[:5]:  # Loop through the first 6 hits
                label = hit.get('recipe', {}).get('label')
                if label:
                    labels.append(label)

            return labels
        else:
            # Print an error message if the request was not successful
            print(f"Error: {response.status_code} - {response.text}")
            return None
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None
    
def recommend_food(food):
    # Check in dataset 1
    if food in new_df1['recipe_name'].values:
        print("Here are 5 recommendations for similar foods from your dataset:")
        food_index = new_df1[new_df1['recipe_name'] == food].index[0]
        distances = similarity1[food_index]
        food_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
        for i in food_list:
            print("-", new_df1.iloc[i[0]].recipe_name)
        return
    
    # Check in dataset 2
    elif food in new_df2['recipe_name'].values:
        print("Here are 5 recommendations for similar foods from your dataset:")
        food_index = new_df2[new_df2['recipe_name'] == food].index[0]
        distances = similarity2[food_index]
        food_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
        for i in food_list:
            print("-", new_df2.iloc[i[0]].recipe_name)
        return
    
    # Check in dataset 3
    elif food in new_df3['recipe_name'].values:
        print("Here are 5 recommendations for similar foods from your dataset:")
        food_index = new_df3[new_df3['recipe_name'] == food].index[0]
        distances = similarity3[food_index]
        food_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
        for i in food_list:
            print("-", new_df3.iloc[i[0]].recipe_name)
        return
    
    # If not found in any dataset, call the API
    else:
        recommendations = get_food_recommendations_from_api(food)
        if recommendations:
            print("Here are some recommendations from the API:")
            for recommendation in recommendations:
                print("-", recommendation)
        else:
            print("Sorry, no recommendations found for that food item.")

In [118]:
# Example usage
user_input = input("Enter a food item or recipe: ")
recommend_food(user_input.lower())

Enter a food item or recipe: gajar ka halwa
Here are 5 recommendations for similar foods from your dataset:
- soan papdi
- basundi
- gulab jamun
- kaju katli
- rabri
