In [176]:
import pandas as pd
import numpy as np

In [177]:
# Load the English version of the TACO food composition table.
df = pd.read_csv('taco_table_en.csv')
# # If you are using the original taco_table.csv (pt-BR) you may remove these columns
# drop_cols = [
#      'Ashes (g)', 'Calcium (mg)',
#     'Magnesium (mg)', 'Manganese (mg)', 'Phosphorus (mg)', 'Iron (mg)',
#     'Sodium (mg)', 'Potassium (mg)', 'Copper (mg)', 'Zinc (mg)',
#     'Retinol (µg)', 'RE (µg)', 'RAE (µg)', 'Thiamine (mg)',
#     'Riboflavin (mg)', 'Pyridoxine (mg)', 'Niacin (mg)', 'Vitamin C (mg)',
#     'Humidity (%)',  'Energy (kJ)',  'Cholesterol (mg)'
# ]
# df = df.drop(columns=drop_cols)

# Missing values are replaced with 0 so every row yields a complete vector.
df = df.fillna(0)

# Keep ids as strings: this keeps 'id' out of the numeric columns selected
# later and lets foods be looked up with string ids such as "400".
# astype(str) is idempotent, so no dtype guard is needed.
df['id'] = df['id'].astype(str)

# Untouched snapshot of the cleaned table; `df` itself is rescaled later on.
df_original = df.copy()
df.columns

Index(['id', 'Food Description', 'Energy (kcal)', 'Protein (g)', 'Lipids (g)',
       'Carbohydrate (g)', 'Dietary Fiber (g)', 'Saturated (g)',
       'Mono-unsaturated (g)', 'Poly-unsaturated (g)'],
      dtype='object')

In [178]:
import chromadb
from chromadb.config import Settings
from tqdm.notebook import tqdm
from sklearn.preprocessing import MinMaxScaler

# Chroma client created with default settings.
client = chromadb.Client(Settings())
collection_name = "FoodDB"

# Always start from an empty collection so re-running the cell does not fail
# on an already existing name. Depending on the chromadb release,
# list_collections() yields Collection objects or plain name strings, so
# accept both shapes instead of assuming a `.name` attribute.
existing_names = {getattr(c, "name", c) for c in client.list_collections()}
if collection_name in existing_names:
    client.delete_collection(name=collection_name)
collection = client.create_collection(name=collection_name)

# 'id' must not be numeric, otherwise it would be picked up by select_dtypes
# below and end up scaled and embedded like a nutrient. The cast is idempotent.
df['id'] = df['id'].astype(str)

# Rescale every numeric column with MinMaxScaler so all nutrients share a
# common range before being used as embedding dimensions.
scaler = MinMaxScaler()
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
df

Unnamed: 0,id,Food Description,Energy (kcal),Protein (g),Lipids (g),Carbohydrate (g),Dietary Fiber (g),Saturated (g),Mono-unsaturated (g),Poly-unsaturated (g)
0,1,"Rice, brown, cooked",0.140271,0.071233,0.010,0.259036,0.052734,0.005825,0.005298,0.004792
1,10,"Cookie, sweet, strawberry filled",0.532805,0.156164,0.196,0.712851,0.029297,0.118447,0.086093,0.027157
2,100,"Broccoli, cooked",0.028281,0.057534,0.005,0.044177,0.066406,0.001942,0.000000,0.003195
3,101,"Broccoli, raw",0.028281,0.098630,0.003,0.040161,0.056641,0.001942,0.001325,0.003195
4,102,"Yam (Cará), cooked",0.088235,0.041096,0.001,0.189759,0.050781,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
590,96,"Eggplant, raw",0.022624,0.032877,0.001,0.044177,0.056641,0.000000,0.000000,0.000000
591,97,"Beetroot, cooked",0.036199,0.035616,0.001,0.072289,0.037109,0.000000,0.000000,0.000000
592,98,"Beetroot, raw",0.055430,0.052055,0.001,0.111446,0.066406,0.000000,0.000000,0.000000
593,99,"Cookie, sweet cassava starch (polvilho)",0.495475,0.035616,0.122,0.808233,0.023438,0.046602,0.056954,0.009585


In [None]:
# Insert the table into the collection in batches rather than one add() call
# per row. Each record keeps the same content as before:
#   id        -> the DataFrame index label, as a string
#   document  -> the unscaled row from df_original, serialised as JSON
#   metadata  -> the food description
#   embedding -> the scaled numeric columns of the row
BATCH_SIZE = 256

with tqdm(total=len(df), desc="Creating vector database...") as progress:
    for start in range(0, len(df), BATCH_SIZE):
        batch = df.iloc[start:start + BATCH_SIZE]
        collection.add(
            ids=[str(idx) for idx in batch.index],
            documents=[df_original.loc[idx].to_json() for idx in batch.index],
            metadatas=[{"Food Description": desc} for desc in batch["Food Description"]],
            embeddings=batch[numeric_cols].to_numpy(dtype=float).tolist(),
        )
        progress.update(len(batch))

In [None]:
from pprint import pprint
import json

# Food to use as the query; ids are stored as strings in df['id'].
food_id = "400"
idx_list = df.index[df['id'] == food_id].tolist()
if not idx_list:
    raise ValueError(f"Food ID {food_id} not found.")
idx = idx_list[0]

# The query vector is the scaled numeric part of the selected row.
query_embedding = df.loc[idx, numeric_cols].tolist()
query_desc = df.loc[idx]

print("Description of the embedding being searched:")
pprint(query_desc)

# Only the stored documents are read below, so only those are requested.
query_response = collection.query(
    query_embeddings=[query_embedding],
    n_results=5,
    include=["documents"],
)

# Each document is the JSON of an unscaled row; rebuild a table from the
# hits returned for the single query embedding.
results = pd.DataFrame([json.loads(doc) for doc in query_response['documents'][0]])
results

Description of the embedding being searched:
id                                      400
Food Description        Chicken, liver, raw
Energy (kcal)                       0.11991
Protein (g)                        0.482192
Lipids (g)                            0.035
Carbohydrate (g)                        0.0
Dietary Fiber (g)                       0.0
Saturated (g)                      0.025243
Mono-unsaturated (g)               0.009272
Poly-unsaturated (g)               0.009585
Name: 334, dtype: object


Unnamed: 0,id,Food Description,Energy (kcal),Protein (g),Lipids (g),Carbohydrate (g),Dietary Fiber (g),Saturated (g),Mono-unsaturated (g),Poly-unsaturated (g)
0,400,"Chicken, liver, raw",106.0,17.6,3.5,0.0,0.0,1.3,0.7,0.6
1,399,"Chicken, thigh, skinless, raw",120.0,17.8,4.9,0.0,0.0,1.6,2.1,0.8
2,426,"Turkey, frozen, raw",94.0,18.1,1.8,0.0,0.0,0.4,0.4,0.7
3,307,"Whiting, fillet, raw",107.0,16.7,4.0,0.0,0.0,0.9,2.3,0.3
4,548,Sarapatel (pork offal stew),123.0,18.5,4.4,1.1,0.0,1.4,1.1,0.7
