In [None]:
import weaviate
from weaviate.classes.config import Configure, Property, DataType
import numpy as np
import pyarrow.parquet as pq

In [None]:
# Load the Parquet file
parquet_path = "filtered_recipes.parquet"  # Update with your file path
table = pq.read_table(parquet_path)

# Convert to Pandas DataFrame
recipes_df = table.to_pandas()

# Show sample data
#print(recipes_df.head())

   ID                                               Name  \
0   0                     Lentil, Apple, and Turkey Wrap   
1   1         Boudin Blanc Terrine with Red Onion Confit   
2   4                           Spinach Noodle Casserole   
3   5                                      The Best Blts   
4   6  Ham and Spring Vegetable Salad with Shallot Vi...   

                                        Instructions  \
0  1. Place the stock, lentils, celery, carrot, t...   
1  Combine first 9 ingredients in heavy  saucepan...   
2  Preheat oven to 350°F. Lightly grease 8x8x2-in...   
3  Mix basil, mayonnaise and butter in processor ...   
4  Cook potatoes and carrots in  pot of boiling s...   

                                         Ingredients   Category       Label  
0  [vegetable or chicken stock, brown lentils, fr...    garlish  vegetarian  
1  [whipping cream, onions, salt, bay leaves, clo...  main dish        meat  
2  [spinach souffle, extra-wide egg noodles, crea...  main dish  veg

In [3]:
# Load the NPZ file
npz_path = "recipe_data_gpu0.npz"  # Update with your file path
data = np.load(npz_path, allow_pickle=True)

# Extract IDs and embeddings
ids = data["ids"]
embeddings = data["embeddings"]

# Check shape
print(f"IDs shape: {ids.shape}")
print(f"Embeddings shape: {embeddings.shape}")

IDs shape: (535505,)
Embeddings shape: (535505, 4096)


In [41]:
# Connect to the local Weaviate instance
client = weaviate.connect_to_local()

# Define the collection schema correctly
class_name = "Recipe"

In [43]:
from weaviate.classes.config import Configure, VectorDistances
if not client.collections.exists(class_name):
    client.collections.create(
        name=class_name,
        description="A collection of recipes with ingredients and instructions",
        vectorizer_config=Configure.Vectorizer.none(),
        vector_index_config=Configure.VectorIndex.hnsw(
            distance_metric=VectorDistances.L2
        ),
        properties=[
            Property(name="name", data_type=DataType.TEXT, description="Recipe Name"),
            Property(name="ingredients", data_type=DataType.TEXT, description="List of Ingredients"),
            Property(name="instructions", data_type=DataType.TEXT, description="Cooking Instructions"),
            Property(name="label", data_type=DataType.TEXT, description="Recipe Label"),
            Property(name="category", data_type=DataType.TEXT, description="Recipe Category")
        ]
    )

In [44]:
# Process in batches
batch_size = 500  # Adjust based on memory capacity
num_batches = len(ids) // batch_size + 1

collection = client.collections.get("Recipe")

for batch_idx in range(15):
    start_idx = batch_idx * batch_size
    end_idx = min((batch_idx + 1) * batch_size, len(ids))

    # Process only a batch at a time
    batch_ids = ids[start_idx:end_idx]
    batch_embeddings = embeddings[start_idx:end_idx]

    for i, recipe_id in enumerate(batch_ids):
        recipe_data = recipes_df[recipes_df["ID"] == recipe_id]

        if not recipe_data.empty:
            row = recipe_data.iloc[0]  # Get the first match

            # ✅ Convert Ingredients to String if List
            ingredient_list = row["Ingredients"]
            if isinstance(ingredient_list, list):
                ingredient_list = ", ".join(map(str, ingredient_list))  # Ensure all items are strings

            # ✅ Ensure all fields are converted to native Python types
            properties = {
                "name": str(row["Name"]),
                "ingredients": str(ingredient_list),
                "instructions": str(row["Instructions"]),
                "label": str(row["Label"]),
                "category": str(row["Category"])
            }

            # ✅ Convert embeddings explicitly to a list
            vector_embedding = batch_embeddings[i].tolist()

            collection.data.insert(
                properties=properties,
                vector=vector_embedding  # Convert NumPy array to Python list
            )

    print(f"✅ Processed batch {batch_idx + 1} of {num_batches}")

print("🎉 All recipes uploaded successfully!")

✅ Processed batch 1 of 1072
✅ Processed batch 2 of 1072
✅ Processed batch 3 of 1072
✅ Processed batch 4 of 1072
✅ Processed batch 5 of 1072
✅ Processed batch 6 of 1072
✅ Processed batch 7 of 1072
✅ Processed batch 8 of 1072
✅ Processed batch 9 of 1072
✅ Processed batch 10 of 1072
✅ Processed batch 11 of 1072
✅ Processed batch 12 of 1072
✅ Processed batch 13 of 1072
✅ Processed batch 14 of 1072
✅ Processed batch 15 of 1072
🎉 All recipes uploaded successfully!


In [45]:
print("Sample IDs from dataset:", ids[:10])

Sample IDs from dataset: [ 0  4  6  9 11 14 16 18 21 24]


In [46]:
# ✅ Find the index of recipe ID = 1
recipe_index = np.where(ids == 4)[0]
print(f"Index of Recipe ID 4: {recipe_index}")

# ✅ Get the corresponding embedding (convert to list for Weaviate)
if len(recipe_index) > 0:
    query_embedding = embeddings[recipe_index[0]].tolist()
else:
    print("⚠️ Recipe with ID 4 not found.")
    query_embedding = None

Index of Recipe ID 4: [1]


In [47]:
from weaviate.classes.query import MetadataQuery
search_results = collection.query.near_vector(
    near_vector=query_embedding,  # Use the retrieved embedding
    limit=5,  # Return top 5 similar recipes
    return_properties=["name", "ingredients", "instructions", "category", "label"],  # ✅ Corrected metadata format
    return_metadata=MetadataQuery(distance=True)
)

# ✅ Display results
print("\n Top 5 Similar Recipes to Recipe ID 4:")
for i, recipe in enumerate(search_results.objects):
    print(f"\n🔹Match {i+1}")
    print(f"Name: {recipe.properties['name']}")
    print(f"Category: {recipe.properties['category']}")
    print(f"Label: {recipe.properties['label']}")
    print(f"Instructions: {recipe.properties['instructions'][:300]}...")  # Show first 300 chars
    print(f"Ingredients: {recipe.properties['ingredients'][:5]}...")  # Show first 5 ingredients
    print(f"Distance: {recipe.metadata.distance}")



 Top 5 Similar Recipes to Recipe ID 4:

🔹Match 1
Name: Spinach Noodle Casserole
Category: main dish
Label: vegetarian
Instructions: Preheat oven to 350°F. Lightly grease 8x8x2-inch glass baking dish. Blend spinach, noodles, sour cream, pesto sauce and nutmeg in  bowl. Spoon mixture into prepared dish. Sprinkle cheese over. Bake until set, about 45 minutes. Let stand 10 minutes....
Ingredients: ['spi...
Distance: -1.1920928955078125e-07

🔹Match 2
Name: Spinach Lasagna
Category: main dish
Label: meat
Instructions: Heat oven to 375°F. In a  sauté pan over  heat, heat oil. Add onions and garlic; toss to coat. Add 1 cup stock; cover and reduce heat; simmer until onions are soft, 20 minutes. Reserve 1/2 cup liquid. Add scallions, oregano, salt and pepper to onion mixture; increase heat to  and cook, stirring, un...
Ingredients: ['oli...
Distance: 0.18536847829818726

🔹Match 3
Name: Spinach Soufflé with Shallots and Smoked Gouda Cheese
Category: garlish
Label: vegetarian
Instructions: Preh

In [48]:
sample_vector = embeddings[0].tolist()  # Pick the first embedding
search_results = collection.query.near_vector(
    near_vector=sample_vector,
    limit=5,
    return_properties=["name", "ingredients", "instructions", "category", "label"],
    return_metadata=MetadataQuery(distance=True)
)

# ✅ Display results
print("\n Top 5 Similar Recipes to Recipe ID 0:")
for i, recipe in enumerate(search_results.objects):
    print(f"\n🔹Match {i+1}")
    print(f"Name: {recipe.properties['name']}")
    print(f"Category: {recipe.properties['category']}")
    print(f"Label: {recipe.properties['label']}")
    print(f"Instructions: {recipe.properties['instructions'][:300]}...")  # Show first 300 chars
    print(f"Ingredients: {recipe.properties['ingredients'][:5]}...")  # Show first 5 ingredients
    print(f"Distance: {recipe.metadata.distance}")


 Top 5 Similar Recipes to Recipe ID 0:

🔹Match 1
Name: Lentil, Apple, and Turkey Wrap
Category: garlish
Label: vegetarian
Instructions: 1. Place the stock, lentils, celery, carrot, thyme, and salt in a  saucepan and bring to a boil. Reduce heat to low and simmer until the lentils are tender, about 30 minutes, depending on the lentils. (If they begin to dry out, add water as needed.) Remove and discard the thyme. Drain and transfer t...
Ingredients: ['veg...
Distance: 5.960464477539063e-08

🔹Match 2
Name: Red-Lentil Soup
Category: garlish
Label: meat
Instructions: Cook onion in oil with 1/2 teaspoon salt in a  heavy saucepan over  heat, stirring occasionally, until softened, about 8 minutes. Add garlic, cumin, bay leaf, and thyme and cook, stirring, 1 minute more. Add lentils, broth, water, 1/2 teaspoon salt, and 1/2 teaspoon pepper and simmer, partially cove...
Ingredients: ['oni...
Distance: 0.23089313507080078

🔹Match 3
Name: Turkey Wraps with Chipotle Mayonnaise
Category: main dish

In [None]:
#client.collections.delete("Recipe")
#print("🚮 Collection 'Recipe' deleted successfully")

🚮 Collection 'Recipe' deleted successfully


In [49]:
client.close()      