## K-Means Clustering

In [2]:
import pandas as pd

# Load the recipe dataset; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv('../data/dataset.csv')

In [3]:
# List the column labels of the loaded frame.
df.columns

Index(['id', 'food', 'image', 'instructions', 'food_type', 'Calories',
       'FatContent', 'ProteinContent', 'glutenFree', 'dairyFree', 'sourceUrl',
       'ingredients', 'food_cluster'],
      dtype='object')

In [4]:
# Remove the identifier, image, instructions and source-URL columns.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the columns have already been dropped.
df = df.drop(columns=['id', 'image', 'instructions', 'sourceUrl'], errors='ignore')

In [5]:
# Display the current contents of the frame.
df

Unnamed: 0,food,food_type,Calories,FatContent,ProteinContent,glutenFree,dairyFree,ingredients,food_cluster
0,Fried Anchovies with Sage,Non-Veg,170.9,2.5,3.2,False,True,"['anchovies', 'anchovies', 'baking', 'powder',...",0
1,Anchovies Appetizer With Breadcrumbs & Scallions,Non-Veg,1110.7,58.8,63.4,False,True,"['marinated', 'anchovies', 'marinated', 'ancho...",0
2,"Bread, Butter And Anchovies",Non-Veg,311.1,0.2,0.3,False,False,"['tuscan', 'bread', 'pickles', 'baby', 'capers...",2
3,Fried Anchovies,Non-Veg,536.1,24.0,29.3,False,True,"['marinated', 'anchovies', 'marinated', 'ancho...",3
4,Marinated Fresh Anchovies: Alici Marinate,Non-Veg,103.6,0.4,4.3,True,True,"['anchovies', 'anchovies', 'garlic', 'olive', ...",3
...,...,...,...,...,...,...,...,...,...
1936,Curzan Seasoning (St Croix),Veg,1.7,0.0,0.1,True,True,"['dried', 'thyme', 'dried', 'parsley', 'dried'...",3
1937,Wild Mushroom Stroganoff,Veg,232.8,19.1,5.0,False,False,"['butter', 'butter', 'shallot', 'portabella', ...",3
1938,Slow Cooker Dijon Brussels Sprouts,Veg,120.1,9.4,3.2,False,False,"['Brussels', 'sprout', 'butter', 'Dijon', 'mus...",3
1939,Sweetcorn and Red Pepper Salad,Veg,206.5,11.9,5.2,False,False,"['celery', 'ribs', 'garlic', 'clove', 'tomatoe...",1


In [6]:
# Inspect the ingredients value of the row labelled 0. As the output shows, the
# value is a single string holding the repr of a list, not an actual list.
df.ingredients[0]

"['anchovies', 'anchovies', 'baking', 'powder', 'egg', 'flour', 'sage', 'salt', 'seltzer', 'water', 'vegetable', 'oil']"

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import pickle
import pandas as pd

# Preprocessing: normalise whitespace in every food name. str(x).split() drops
# leading/trailing whitespace and collapses internal runs to a single space;
# non-string values are stringified first.
df['food'] = df['food'].apply(lambda x: ' '.join(str(x).split()))


# Vectorize the 'food' column (TF-IDF over the food names).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['food'])

# Clustering with KMeans. The model is fit exactly once: a second fit on the
# same matrix with the same random_state (and k-means++ init, which is also the
# default) reproduces the same model and only doubles the runtime.
kmeans = KMeans(n_clusters=5, random_state=2, init='k-means++').fit(X)

# The labels_ attribute contains the cluster assignments for each food item.
# This overwrites any 'food_cluster' column that was loaded with the dataset.
df['food_cluster'] = kmeans.labels_


# Reduce dimensionality to 2D for visualization.
# PCA is given a dense array here, hence the .toarray() conversion of the
# sparse TF-IDF matrix.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X.toarray())

# Function to preprocess input criteria
def preprocess_input(food_type, calories, fat_content, protein_content, gluten_free, dairy_free):
    """Build the space-separated query string for a set of criteria.

    The two dietary flags are reduced to the literal text 'True' or 'False'
    according to their truthiness; the remaining values are formatted as-is.
    The fields appear in the same order as the parameters.
    """
    flag_texts = ['True' if flag else 'False' for flag in (gluten_free, dairy_free)]
    fields = [f"{food_type}", f"{calories}", f"{fat_content}", f"{protein_content}", *flag_texts]
    return ' '.join(fields)

# Function to find cluster based on input criteria
def find_cluster(input_string, vectorizer, kmeans):
    """Return the cluster that `kmeans` predicts for `input_string`.

    The string is wrapped in a one-element list and passed through
    `vectorizer.transform`; the first entry of `kmeans.predict` on that result
    is printed and returned.
    """
    features = vectorizer.transform([input_string])
    predictions = kmeans.predict(features)
    cluster = predictions[0]
    # Report which cluster the query landed in.
    print(f"Cluster found for input: {cluster}")
    return cluster

# Function to recommend food items from the same cluster
def recommend_food_by_cluster(df, cluster, num_recommendations=5):
    """Return a random sample of rows whose 'food_cluster' equals `cluster`.

    Parameters:
        df: DataFrame with a 'food_cluster' column.
        cluster: cluster label to select.
        num_recommendations: maximum number of rows to return.

    Returns:
        A DataFrame with at most `num_recommendations` rows drawn without
        replacement from the cluster. When the cluster holds fewer rows than
        requested, all of them are returned (in random order) instead of
        raising ValueError; an unknown cluster yields an empty frame.
    """
    similar_food_items = df[df['food_cluster'] == cluster]
    available = len(similar_food_items)
    if available == 0 or num_recommendations == 0:
        # Nothing to sample from (or nothing requested): empty frame, same columns.
        return similar_food_items.head(0)
    # Cap the sample size so a small cluster no longer makes .sample() raise.
    return similar_food_items.sample(n=min(num_recommendations, available))

def recommend_food(df, vectorizer, kmeans, food_type, calories, fat_content, protein_content, gluten_free, dairy_free, ingredients, num_recommendations=5):
    """Recommend rows from the predicted cluster that contain all given ingredients.

    Parameters:
        df: DataFrame with 'food_cluster' and 'ingredients' columns.
        vectorizer, kmeans: fitted objects forwarded to `find_cluster`.
        food_type, calories, fat_content, protein_content, gluten_free,
        dairy_free: criteria forwarded to `preprocess_input`.
        ingredients: iterable of ingredient names that must all be present;
            None or an empty iterable applies no ingredient filter.
        num_recommendations: number of rows to sample.

    Returns:
        `num_recommendations` randomly sampled matching rows, or every matching
        row (possibly none) when fewer than that many match.

    Ingredient matching is done on whole words, case-insensitively. A plain
    substring test against the stored text would let 'oil' match 'boiled' or
    'egg' match 'eggplant'.
    """
    import re  # local import so the function does not depend on another cell

    def _words(value):
        # Lower-cased set of words in one ingredients value. Accepts the
        # string-repr-of-a-list form as well as a real list; anything else
        # (e.g. NaN for a missing cell) yields no words.
        if isinstance(value, str):
            text = value
        elif isinstance(value, (list, tuple, set)):
            text = ' '.join(map(str, value))
        else:
            return set()
        return set(re.findall(r"\w+", text.lower()))

    # NOTE(review): in the visible notebook the vectorizer is fit on food names,
    # while this query string is made of criteria values; confirm that
    # predicting a cluster from it is intended.
    input_string = preprocess_input(food_type, calories, fat_content, protein_content, gluten_free, dairy_free)
    cluster = find_cluster(input_string, vectorizer, kmeans)
    similar_food_items = df[df['food_cluster'] == cluster]

    # Every word of every requested ingredient must occur in the row.
    required_words = set()
    for ingredient in ingredients or ():
        required_words |= _words(ingredient)

    # astype(bool) keeps the mask boolean even when the cluster is empty.
    has_all = similar_food_items['ingredients'].apply(lambda cell: required_words <= _words(cell)).astype(bool)
    filtered_items = similar_food_items[has_all]

    if len(filtered_items) < num_recommendations:
        # Too few matches to sample from: return whatever matched.
        print("Not enough items in the cluster matching the criteria.")
        return filtered_items
    return filtered_items.sample(n=num_recommendations, replace=False)  # Sampling without replacement


# # Load vectorizer model
# with open('vectorizer.pkl', 'rb') as f:
#     vectorizer = pickle.load(f)

# # Load kmeans model
# with open('kmeans.pkl', 'rb') as f:
#     kmeans = pickle.load(f)


# ingredients = ['fish', 'rice']
# recommendations = recommend_food(df, vectorizer, kmeans, 'Non-Veg', 700, 32, 45, True, False, ingredients)


In [11]:
# Example query: food_type='Non-Veg', calories=500, fat_content=20,
# protein_content=30, gluten_free=True, dairy_free=True, restricted to rows
# whose ingredients mention both 'chicken' and 'salt'.
ingredients = ['chicken', 'salt']
recommendations = recommend_food(df, vectorizer, kmeans, 'Non-Veg', 500, 20, 30, True, True, ingredients)


Cluster found for input: 3


In [12]:
# Keep only the columns to present; this leaves out 'food_cluster'.
res = recommendations[['food', 'food_type', 'Calories', 'FatContent', 'ProteinContent', 'glutenFree', 'dairyFree', 'ingredients']]

res

Unnamed: 0,food,food_type,Calories,FatContent,ProteinContent,glutenFree,dairyFree,ingredients
138,Hunter's Chicken Stew,Non-Veg,1012.3,44.7,9.2,True,True,"['anchovy', 'anchovy', 'bay', 'leaves', 'olive..."
814,Chicken Confit,Non-Veg,440.6,38.9,2.5,True,False,"['anchovy', 'anchovy', 'pepper', 'chicken', 'c..."
962,Tuscan Chicken Liver Crostini,Non-Veg,982.3,60.6,59.7,False,True,"['anchovy', 'anchovy', 'coarse', 'textured', '..."
121,Green Goddess Chicken Wraps,Non-Veg,199.4,10.4,2.2,False,False,"['anchovy', 'anchovy', 'baby', 'spinach', 'bur..."
218,Puglia Lady Rethought For America,Non-Veg,55.0,2.0,1.5,False,False,"['anchovy', 'anchovy', 'rabe', 'parmeggiano', ..."


In [13]:
# Convert the selected rows to a list of dicts, one dict per row.
# NOTE(review): this rebinds `res` from a DataFrame to a list, so running this
# cell twice in a row fails (a list has no to_dict); re-run the cell that
# builds `res` first.
res = res.to_dict(orient='records')

res

[{'food': "Hunter's Chicken Stew",
  'food_type': 'Non-Veg',
  'Calories': 1012.3,
  'FatContent': 44.7,
  'ProteinContent': 9.2,
  'glutenFree': True,
  'dairyFree': True,
  'ingredients': "['anchovy', 'anchovy', 'bay', 'leaves', 'olives', 'canned', 'tomatoes', 'chianti', 'chicken', 'coarse', 'salt', 'and', 'pepper', 'rosemary', 'garlic', 'olive', 'oil']"},
 {'food': 'Chicken Confit',
  'food_type': 'Non-Veg',
  'Calories': 440.6,
  'FatContent': 38.9,
  'ProteinContent': 2.5,
  'glutenFree': True,
  'dairyFree': False,
  'ingredients': "['anchovy', 'anchovy', 'pepper', 'chicken', 'cutlets', 'extra', 'virgin', 'olive', 'oil', 'garlic', 'cloves', 'kosher', 'salt', 'lemon', 'juice', 'parmesan', 'cheese', 'pepper', 'flakes', 'rosemary', 'sprigs', 'shallots', 'thyme']"},
 {'food': 'Tuscan Chicken Liver Crostini',
  'food_type': 'Non-Veg',
  'Calories': 982.3,
  'FatContent': 60.6,
  'ProteinContent': 59.7,
  'glutenFree': False,
  'dairyFree': True,
  'ingredients': "['anchovy', 'anchovy'

## Search Functionality using Cosine Similarity

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset
df = pd.read_csv('../backend/dataset.csv')

# Normalise each food name: stringify it, then collapse every run of
# whitespace to a single space and trim the ends.
df['food'] = df['food'].map(lambda name: ' '.join(str(name).split()))

# TF-IDF representation of the food names
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['food'])

# Pairwise cosine similarity between all food-name vectors;
# row i holds the similarity of item i to every item.
cosine_sim = cosine_similarity(X, X)

def recommend_food(food_name, df, cosine_sim, top_n=5):
    """Return the names of the food items most similar to `food_name`.

    Parameters:
        food_name: name to look up; it must equal an entry of df['food'] exactly.
        df: DataFrame with a 'food' column whose row order matches `cosine_sim`.
        cosine_sim: square similarity matrix indexed by row position.
        top_n: number of similar items to return (default 5).

    Returns:
        A Series of up to `top_n` food names, most similar first. The queried
        row itself is never included.

    Raises:
        IndexError: if `food_name` does not occur in df['food'].
    """
    # Locate the row by position, not by index label, so the lookup into
    # cosine_sim stays correct when df does not carry a default 0..n-1 index.
    positions = (df['food'] == food_name).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"food {food_name!r} not found in df['food']")
    query_pos = int(positions[0])

    # Rank every item by its similarity to the query, highest first.
    ranked = sorted(enumerate(cosine_sim[query_pos]), key=lambda pair: pair[1], reverse=True)

    # Drop the query row explicitly rather than assuming it is ranked first;
    # another row can tie with it at similarity 1.0 and sort ahead of it.
    food_indices = [pos for pos, _ in ranked if pos != query_pos][:top_n]

    return df['food'].iloc[food_indices]

In [2]:
# Look up the items most similar to the entry named exactly 'Pizza'
# (the lookup is an equality test on df['food']) and print their names.
recommendations = recommend_food('Pizza', df, cosine_sim)
print(recommendations)

1835     Tomato and Artichoke Pizza
929                 Christmas Pizza
1179    Portabella and Tomato Pizza
198      Springtime Asparagus Pizza
789      Springtime Asparagus Pizza
Name: food, dtype: object
