In [54]:
import pandas as pd
import requests

# Load the dataset that calculate_embeddings() iterates over; that function
# reads the "ID", "Title" and "Image" columns. The path is relative to the
# notebook's working directory.
df = pd.read_csv("../api/data.csv")

In [None]:
def _preview(value, limit=50):
    """Return a short, log-friendly rendering of ``value``.

    Strings are used as-is and other objects via ``repr``. The text is cut
    to ``limit`` characters and "..." is appended only when something was
    actually cut off.
    """
    text = value if isinstance(value, str) else repr(value)
    return text[:limit] + "..." if len(text) > limit else text


def get_embeddings(strings_list, api_url="http://localhost:8000/encode", timeout=60):
    """
    Get embeddings for a list of strings (text or base64-encoded images).

    Each item is sent to the API in its own POST request as
    ``{"content": item}``. Failures never abort the batch: every input
    produces exactly one entry in the returned list, in input order.

    Args:
        strings_list (list): List of strings (text or base64-encoded images)
        api_url (str): URL of the embedding API
        timeout (float | tuple | None): Timeout handed to ``requests.post`` so a
            stalled server cannot block the loop forever. Pass ``None`` to
            wait indefinitely.

    Returns:
        list: One dict per input. On success:
            ``{"input": <preview>, "type": <API type>, "embeddings": <API embeddings>}``.
            On failure: ``{"input": <preview>, "type": "error", "error": <message>}``.
            ``input`` is the item truncated to 50 characters, with "..." appended
            only when it was truncated.
    """
    embeddings_list = []

    for string in strings_list:
        # Build the preview once, up front, so every branch reports the input
        # the same way and the error handlers can never fail on slicing.
        preview = _preview(string)

        # Non-string items (e.g. a NaN from an empty DataFrame cell) are
        # recorded as errors instead of being sent to the API.
        if not isinstance(string, str):
            message = f"Expected a string, got {type(string).__name__}"
            print(f"Error processing string: {message}")
            embeddings_list.append(
                {"input": preview, "type": "error", "error": message}
            )
            continue

        try:
            response = requests.post(
                api_url, json={"content": string}, timeout=timeout
            )
            response.raise_for_status()
            result = response.json()

            if result["status"] == "success":
                embeddings_list.append(
                    {
                        "input": preview,
                        "type": result["type"],
                        "embeddings": result["embeddings"],
                    }
                )
            else:
                message = result.get("message", "Unknown error")
                print(f"Error processing string: {message}")
                embeddings_list.append(
                    {"input": preview, "type": "error", "error": message}
                )

        except Exception as e:
            # Best effort by design: network errors, HTTP errors and malformed
            # payloads (missing keys) are logged and recorded, then the loop
            # moves on to the next item.
            print(f"Error in API call: {str(e)}")
            embeddings_list.append(
                {"input": preview, "type": "error", "error": str(e)}
            )

    return embeddings_list

In [1]:
import tqdm
import torch
import json

# Torch device name. calculate_embeddings() below works on plain Python lists
# and does not read it; the assignment is kept so the notebook namespace is
# unchanged for any other cell that refers to `device`.
device = "cpu"


def _embedding_or_raise(content):
    """Embed one item with get_embeddings() and return its vector.

    Raises ValueError when the service reports an error for the item or the
    result entry carries no "embeddings" field.
    """
    results = get_embeddings([content])
    if not results:
        raise ValueError("embedding service returned no result")
    entry = results[0]
    if entry.get("type") == "error" or "embeddings" not in entry:
        raise ValueError(entry.get("error", "no embeddings in response"))
    return entry["embeddings"]


def _average_embeddings(first, second):
    """Element-wise mean of two equally shaped (possibly nested) number lists.

    Raises ValueError on a length mismatch rather than silently truncating.
    """
    if isinstance(first, (list, tuple)):
        if len(first) != len(second):
            raise ValueError(
                f"embedding size mismatch: {len(first)} vs {len(second)}"
            )
        return [_average_embeddings(a, b) for a, b in zip(first, second)]
    return (first + second) / 2


def calculate_embeddings(file_name="./embeddings.json", sample_size=50000):
    """
    Calculate an embedding for every recipe row in ``df`` and save them as JSON.

    For each row the title is embedded through get_embeddings(). When the row
    has an image, the image is embedded too and the stored vector is the
    element-wise mean of the title and image embeddings. If the image step
    fails for any reason, the title embedding alone is stored. Rows whose
    title cannot be embedded are reported and left out of the output.

    :param file_name: Path to the JSON file where embeddings will be saved.
        The file maps ``str(ID)`` to the embedding.
    :param sample_size: Number of leading rows of ``df`` to process.
        Pass ``None`` to process every row.
    """
    # `import tqdm` at notebook level binds the *module*, which is not
    # callable; import the progress-bar callable explicitly.
    from tqdm import tqdm as progress_bar

    rows = df if sample_size is None else df.head(sample_size)
    embeddings = {}

    for _, row in progress_bar(rows.iterrows(), total=len(rows)):
        row_id = row["ID"]

        try:
            title_embedding = _embedding_or_raise(row["Title"])
        except Exception as e:
            # Without a title embedding there is nothing to store for this row.
            print(f"Skipping ID {row_id}: title embedding failed: {e}")
            continue

        final_embedding = title_embedding

        # Handle image embedding if the image is available.
        # NOTE(review): the cell is indexed with ["bytes"], i.e. a dict-like
        # value holding a base64 string is assumed. A plain string cell (what
        # pd.read_csv normally yields) ends up in the except branch below —
        # confirm how the "Image" column is loaded.
        if not pd.isna(row["Image"]):
            try:
                print("Processing image for ID:", row_id)
                image_embedding = _embedding_or_raise(row["Image"]["bytes"])
                final_embedding = _average_embeddings(
                    title_embedding, image_embedding
                )
            except Exception as e:
                print(f"Error processing image for ID {row_id}: {e}")
                final_embedding = title_embedding

        # JSON object keys are strings anyway; converting here also avoids
        # json.dump rejecting NumPy integer keys.
        embeddings[str(row_id)] = final_embedding

    # Save the embeddings to a JSON file
    with open(file_name, "w", encoding="utf-8") as f:
        json.dump(embeddings, f)
    print(f"Embeddings saved successfully to {file_name}.")

In [31]:
import getpass
import os

from elasticsearch import Elasticsearch

# Connection settings come from the environment so that no credentials are
# stored in the notebook:
#   ES_HOST      - defaults to the local development node
#   ES_USER      - defaults to the built-in "elastic" user
#   ES_PASSWORD  - prompted for interactively when unset
ES_HOST = os.environ.get("ES_HOST", "https://localhost:9200")
ES_USER = os.environ.get("ES_USER", "elastic")
ES_PASSWORD = os.environ.get("ES_PASSWORD") or getpass.getpass(
    "Elasticsearch password: "
)

# Create a (synchronous) client instance
es = Elasticsearch(
    ES_HOST,
    basic_auth=(ES_USER, ES_PASSWORD),
    verify_certs=False,  # Only in development! Not recommended for production
)
# For remote Elasticsearch with authentication, point ES_HOST / ES_USER /
# ES_PASSWORD at the remote cluster, e.g. ES_HOST="https://your-elasticsearch-host:9200".

# Verify the connection
if es.ping():
    print("Connected to Elasticsearch")
else:
    print("Could not connect to Elasticsearch")

Connected to Elasticsearch




[REDACTED: Elasticsearch enrollment token. It embeds an API key and must not be stored in the notebook; generate a new one when needed.]


In [45]:
# Elasticsearch mappings for the models

# Index mapping for user documents.
USER_MAPPING = dict(
    mappings=dict(
        properties=dict(
            # keyword: emails are matched exactly, not analyzed
            email=dict(type="keyword"),
            name=dict(type="text"),
            # we shouldn't store plain passwords in ES
            password=dict(type="keyword"),
            # adjust `dims` to the output size of your embedding model
            embedding=dict(type="dense_vector", dims=768),
        )
    )
)

# Index mapping for recipe documents.
RECIPE_MAPPING = dict(
    mappings=dict(
        properties=dict(
            id=dict(type="integer"),
            # text plus a `keyword` sub-field: allows both full-text and exact matching
            title=dict(type="text", fields=dict(keyword=dict(type="keyword"))),
            ingredients=dict(type="text", index=True, analyzer="standard"),
            instructions=dict(type="text"),
            prep_time=dict(type="integer"),
            cook_time=dict(type="integer"),
            cuisine=dict(type="keyword"),
            course=dict(type="keyword"),
            diet=dict(type="keyword"),
            # image / url are text fields with indexing switched off
            image=dict(type="text", index=False),
            url=dict(type="text", index=False),
            # indexed vector compared by cosine similarity;
            # adjust `dims` to the output size of your embedding model
            embedding=dict(
                type="dense_vector",
                dims=768,
                index=True,
                similarity="cosine",
            ),
        )
    )
)

# Index mapping for feedback documents: one rating/comment on one recipe.
FEEDBACK_MAPPING = dict(
    mappings=dict(
        properties=dict(
            email=dict(type="keyword"),
            input_description=dict(type="text"),
            input_image=dict(type="keyword"),
            recipe_id=dict(type="integer"),
            rating=dict(type="integer"),
            comment=dict(type="text"),
        )
    )
)

# Index mapping for submitted recipes: the recipe fields plus an `accepted` flag.
RECIPE_ADD_MAPPING = dict(
    mappings=dict(
        properties=dict(
            id=dict(type="integer"),
            title=dict(type="text", fields=dict(keyword=dict(type="keyword"))),
            ingredients=dict(type="text"),
            instructions=dict(type="text"),
            prep_time=dict(type="integer"),
            cook_time=dict(type="integer"),
            cuisine=dict(type="keyword"),
            course=dict(type="keyword"),
            diet=dict(type="keyword"),
            image=dict(type="keyword"),
            url=dict(type="keyword"),
            # NOTE(review): dims is 1536 here but 768 in USER_MAPPING and
            # RECIPE_MAPPING — confirm which embedding model feeds this index.
            embedding=dict(type="dense_vector", dims=1536),
            # Additional field for RecipeAdd
            accepted=dict(type="boolean"),
        )
    )
)

# Index mapping for per-user review collections.
USER_REVIEW_MAPPING = dict(
    mappings=dict(
        properties=dict(
            email=dict(type="keyword"),
            # Using nested type for the array of reviews; each review has the
            # same fields as a FEEDBACK_MAPPING document.
            reviews=dict(
                type="nested",
                properties=dict(
                    email=dict(type="keyword"),
                    input_description=dict(type="text"),
                    input_image=dict(type="keyword"),
                    recipe_id=dict(type="integer"),
                    rating=dict(type="integer"),
                    comment=dict(type="text"),
                ),
            ),
        )
    )
)


# 3. Modified create_indices function with index settings
async def create_indices(es_client):
    """Create the application's Elasticsearch indices if they do not exist yet.

    Each index is created from its mapping constant plus a common set of index
    settings. The module-level mapping constants are never modified: a fresh
    request body is built for every index. Indices that already exist are
    left untouched.

    Args:
        es_client: Client whose ``indices.exists`` and ``indices.create``
            methods are awaitable (e.g. ``AsyncElasticsearch``).
    """
    indices = {
        "users": USER_MAPPING,
        "recipes": RECIPE_MAPPING,
        "feedback": FEEDBACK_MAPPING,
        "recipe_submissions": RECIPE_ADD_MAPPING,
        "user_reviews": USER_REVIEW_MAPPING,
    }

    # Settings applied to every index created here.
    index_settings = {
        "number_of_shards": 3,
        "number_of_replicas": 1,
        "refresh_interval": "1s",
    }

    for index_name, mapping in indices.items():
        if await es_client.indices.exists(index=index_name):
            continue

        # Merge settings into a copy of the mapping. Writing into `mapping`
        # itself would permanently alter the shared constant and make every
        # mapping reference the same settings dict. As before, the common
        # settings take precedence over any settings the mapping defines.
        body = dict(mapping)
        body["settings"] = {**mapping.get("settings", {}), **index_settings}

        await es_client.indices.create(index=index_name, body=body)
        print(f"Created index: {index_name}")

In [51]:
import getpass
import os

from elasticsearch import AsyncElasticsearch

# Create async client instance. This rebinds `es`, replacing the synchronous
# client created in the earlier cell; create_indices() awaits the client's
# methods, so it needs the async variant.
# Credentials are read from the environment (ES_HOST / ES_USER / ES_PASSWORD)
# instead of being hard-coded; the password is prompted for when unset.
es = AsyncElasticsearch(
    os.environ.get("ES_HOST", "https://localhost:9200"),
    basic_auth=(
        os.environ.get("ES_USER", "elastic"),
        os.environ.get("ES_PASSWORD") or getpass.getpass("Elasticsearch password: "),
    ),
    verify_certs=False,  # Only in development! Not recommended for production
)

  _transport = transport_class(


In [52]:
# Create any missing indices using the client bound to `es`. create_indices()
# is a coroutine, so it is awaited directly at the top level of the cell.
await create_indices(es)

Created index: users
Created index: recipes
Created index: feedback
Created index: recipe_submissions
Created index: user_reviews
