In [4]:
# Requirements 
#!pip install sentence-transformers --quiet

import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [5]:
# Loading and preparing data
columns_needed = [
    'code', 'product_name', 'brands', 'ingredients_text', 'carbohydrates_100g',
    'fat_100g', 'fiber_100g', 'proteins_100g', 'salt_100g', 'sugars_100g'
]
# Parse only the columns we actually use instead of loading every column of the
# TSV and discarding the rest afterwards. `code` is a barcode, so it is read as
# text to keep leading zeros. The trailing [columns_needed] restores the column
# order of the list (usecols keeps the file's order).
df = pd.read_csv(
    "/kaggle/input/world-food-facts/en.openfoodfacts.org.products.tsv",
    sep='\t',
    low_memory=False,
    usecols=columns_needed,
    dtype={'code': str},
)[columns_needed]
# A product without a barcode or a name is useless for search results.
df = df.dropna(subset=['code', 'product_name'])
# Keep rows with at most 2 missing fields, cap the sample at 10,000 rows, and
# take an explicit copy so that adding columns later modifies an independent
# frame rather than a slice of the filtered one.
df = df[df.isna().sum(axis=1) <= 2].head(10000).copy()

In [7]:
# Combining metadata into a searchable blob of text
def build_text_blob(row):
    """Combine a product's metadata into a single string for embedding.

    Missing values (NaN/None) are left out of the free-text part and rendered
    as "N/A" for nutrients, so the literal word "nan" never ends up in the
    text handed to the embedding model.

    Args:
        row: Mapping-like product record (e.g. a DataFrame row). Must contain
            'product_name'; may contain 'brands', 'ingredients_text',
            'carbohydrates_100g', 'fat_100g', 'proteins_100g', 'sugars_100g'.

    Returns:
        str: "<name> <brands> <ingredients> Carbs: .., Fat: .., Protein: .., Sugar: ..".

    Raises:
        KeyError: If 'product_name' is absent from ``row``.
    """
    def _present(key):
        # .get() only falls back when the key is absent; a field that exists
        # but holds NaN has to be detected explicitly.
        value = row.get(key)
        return None if pd.isna(value) else value

    def _grams(key):
        value = _present(key)
        return 'N/A' if value is None else f"{value}g"

    text_parts = [str(row['product_name'])]
    for key in ('brands', 'ingredients_text'):
        value = _present(key)
        if value is not None:
            text_parts.append(str(value))

    nutrients = (
        f"Carbs: {_grams('carbohydrates_100g')}, Fat: {_grams('fat_100g')}, "
        f"Protein: {_grams('proteins_100g')}, Sugar: {_grams('sugars_100g')}"
    )
    return " ".join(text_parts) + " " + nutrients

# Build one searchable string per product and store it alongside the raw fields.
print("🔄 Generating text blobs for embedding...")
df['text_blob'] = df.apply(build_text_blob, axis='columns')

🔄 Generating text blobs for embedding...


In [8]:
# Encode every product's text blob with the all-MiniLM-L6-v2 sentence-transformer model.
print("🔄 Embedding product descriptions...")
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(
    list(df['text_blob']),
    show_progress_bar=True,
)

🔄 Embedding product descriptions...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

In [9]:
# Search function for user query
def search_products(query, top_k=5):
    """Return the products whose text blobs are most similar to ``query``.

    The query is embedded with the module-level ``model`` and compared by
    cosine similarity against the precomputed ``embeddings``, whose rows are
    positionally aligned with ``df``.

    Args:
        query: Free-text search string.
        top_k: Maximum number of products to return. Values <= 0 yield an
            empty result; values larger than the catalogue return every
            product.

    Returns:
        pandas.DataFrame: Up to ``top_k`` rows of ``df`` (columns 'code',
        'product_name', 'brands', 'ingredients_text'), best match first.
    """
    result_columns = ['code', 'product_name', 'brands', 'ingredients_text']
    if top_k <= 0:
        # scores[-0:] would select *every* row and negative values arbitrary
        # slices, so short-circuit before doing any encoding work.
        return df.iloc[:0][result_columns]

    query_vec = model.encode([query])[0]
    scores = cosine_similarity([query_vec], embeddings)[0]
    # Reverse first, then truncate: same ordering as taking the last top_k of
    # the ascending sort, but well-defined for every positive top_k.
    top_k_indices = np.argsort(scores)[::-1][:top_k]
    return df.iloc[top_k_indices][result_columns]

In [10]:
# Example: natural-language query describing a product category and a nutrient constraint
query = "low sugar breakfast cereal"
results = search_products(query)
print("\n🟢 Top matches for query:", results.to_string(index=False), sep="\n")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


🟢 Top matches for query:
         code                                     product_name                            brands                                                                                                                                                                                                                                                ingredients_text
0011150302012                 Crunchy Granola, Vanilla, Almond                          Roundy's                                                                                    Whole rolled pats, rice syrup, naturally milled cane sugar, whole almonds, crisp rice (crisp rice muffs, sugar, salt, and barley malt). oat bran, ground flax seed, vanilla.
0011153041949         Crunchy Oat Squares With Cinnamon Cereal         Foodtown,  Foodtown  Inc. Whole grain oat flour, sugar, corn flour, whole grain wheat flour, rice flour, salt, tripotassium phosphate, cinnamon, natural flavors, color (caramel, color annatto extra

In [11]:
# Example: single-word category query
query = "Chocolates"
results = search_products(query)
print("\n🟢 Top matches for query:", results.to_string(index=False), sep="\n")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


🟢 Top matches for query:
         code                                                                product_name                                    brands                                                                                                                                                                                                                                                ingredients_text
0009542011390 Dark Chocolate With A Creamy Chocolate Filling And Dark Cookie Piecescookie                     Lindt & Sprungli Gmbh Sugar, chocolate, milkfat, cocoa butter, low fat cocoa powder processed with alkali, lactose, wheat flour, skim milk, palm oil, soya lecithin (emulsifier), cocoa powder processed with alkali, artificial flavor, salt, leavening (sodium bicarbonate, ammoniu
0011150980579                                                      Dark Chocolate Cashews                                  Roundy's                                                                   

In [12]:
# Example: loosely worded query about dairy products
query = "milk items"
results = search_products(query)
print("\n🟢 Top matches for query:", results.to_string(index=False), sep="\n")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


🟢 Top matches for query:
         code                             product_name         brands                                                                                                                                                                                                                                                                                                                                                ingredients_text
0011110503206                                     Milk Mountain Dairy                                                                                                                                                                                        Skim milk, high fructose corn syrup, sugar, nonfat milk, cocoa processed with alkali, natural and artificial flavor, salt, carrageenan, vitamin a palmitate, vitamin d3.
0011225092343                           Ice Cream Cups      Valu Time    Milkfat and nonfat milk, strawberry sauce (corn syrup, wa

In [13]:
# Example: query for a specific food type
query = "pasta"
results = search_products(query)
print("\n🟢 Top matches for query:", results.to_string(index=False), sep="\n")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


🟢 Top matches for query:
         code                                        product_name         brands                                                                                                                  ingredients_text
0011110873170 Italian Spaghetti Pasta, Enriched Spaghetti Product The Kroger Co.                               Durum wheat semolina, niacin, ferrous lactate (iron), thiamine mononitrate, riboflavin, folic acid.
0011150176095     Thin Spaghetti Pasta, Enriched Macaroni Product       Roundy's Durum wheat semolina [enriched with iron (ferrous sulfate) and b vitamins (niacin, thiamin mononitrate, riboflavin, folic acid)].
0011150176040          Spaghetti Pasta, Enriched Macaroni Product       Roundy's  Durum wheat semolina [enriched with iron (ferrous sulfate) and b vitamins (niacin, thiamin mononitrate riboflavin, folic acid)].
0011110852717        Elbow Ridged Pasta, Italian Macaroni Product The Kroger Co.                                                  

In [14]:
# Example: query for a dietary attribute rather than a product name
query = "gluten-free"
results = search_products(query)
print("\n🟢 Top matches for query:", results.to_string(index=False), sep="\n")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


🟢 Top matches for query:
         code                                         product_name                        brands                                                                                                                                                                                                                                                                                                                                                                                             ingredients_text
0011110873545                   Gluten Free All Purpose Baking Mix          Simple Truth Organic                                                                                                                                                                   Organic white rice flour, organic cornstarch, organic degerminated corn flour, organic tapioca starch, organic cane sugar, cream of tartar, untreated sea salt, organic gum blend (organic guar and locust bean gums), sodium