# Semantic Search Methods Exploration Notebook

In [1]:
import os
import sys
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(project_root)

import asyncio
from typing import Dict, List
from models.message import Message
from models.product import Product
from services.weaviate_service import WeaviateService
from services.openai_service import OpenAIService
from prompts.prompt_manager import PromptManager
from services.query_processor import QueryProcessor
from config import Config

In [2]:
# Load configuration
config = Config()

# Initialize services
weaviate_service = WeaviateService()
await weaviate_service.initialize_weaviate(
        config.OPENAI_API_KEY, config.WEAVIATE_URL, config.RESET_WEAVIATE
    )

openai_service = OpenAIService(config.OPENAI_API_KEY, config)
prompt_manager = PromptManager()
query_processor = QueryProcessor(openai_service=openai_service, prompt_manager=prompt_manager)

2024-09-09 14:48:40,169 - INFO - ===:> Initializing Weaviate
2024-09-09 14:48:40,215 - INFO - HTTP Request: GET http://localhost:8080/v1/schema "HTTP/1.1 200 OK"
2024-09-09 14:48:40,219 - INFO - HTTP Request: GET http://localhost:8080/v1/schema "HTTP/1.1 200 OK"
2024-09-09 14:48:40,222 - INFO - HTTP Request: GET http://localhost:8080/v1/schema "HTTP/1.1 200 OK"
2024-09-09 14:48:40,225 - INFO - HTTP Request: GET http://localhost:8080/v1/schema "HTTP/1.1 200 OK"
2024-09-09 14:48:40,233 - INFO - HTTP Request: POST http://localhost:8080/v1/graphql "HTTP/1.1 200 OK"
2024-09-09 14:48:40,238 - INFO - HTTP Request: POST http://localhost:8080/v1/graphql "HTTP/1.1 200 OK"
2024-09-09 14:48:40,239 - INFO -  Weaviate schema is valid: True
2024-09-09 14:48:40,239 - INFO -  Weaviate schema info: Weaviate Schema Information

Class: Product (Entries: 766)
Properties:
  - Name: name, Type: text, Description: The name of the product.
  - Name: ids, Type: text, Description: ids of the products
  - Name: m

In [3]:
# Hand-written stand-in for a query-processor result: attribute filters,
# reworded variants of the user query, and request context.
query_processor_result = dict(
    filters=dict(
        processor="Quad-core ARM Cortex-A53",
        memory="4GB",
    ),
    expanded_queries=[
        "Find all devices with Quad-core ARM Cortex-A53 as processor with 4GB RAM",
        "Devices with Quad-core ARM Cortex-A53 processor and 4GB RAM",
        "Embedded systems featuring Quad-core ARM Cortex-A53 and 4GB memory",
        "Single-board computers with ARM Cortex-A53 Quad-core processor and 4GB RAM",
    ],
    query_context=dict(
        num_products_requested=5,
        sort_preference=None,
    ),
)

In [6]:
import json
from typing import List, Dict, Any, Tuple
from models.product import Product


async def display_search_results(query: str, features: List[str], limit: int = 5):
    """Run a product search for ``query`` and print every hit.

    Args:
        query: Free-text query forwarded to ``weaviate_service.search_products``.
        features: Names of the product fields to print for each hit. When this
            is a non-empty list or tuple, only those fields (if present on the
            hit) are shown; for any other value the full record is printed.
        limit: Maximum number of hits requested from the search service.

    Returns:
        Whatever ``search_products`` returned, unmodified. It is iterated here
        as ``(product_dict, certainty)`` pairs.
    """
    print(f"Searching for: {query}")
    results = await weaviate_service.search_products(query, limit=limit)

    # Only a list/tuple of field names narrows the output; anything else
    # (None, an empty list, a stray number) keeps the full-record behavior.
    selected_fields = list(features) if isinstance(features, (list, tuple)) else []

    print("-" * 80)
    for result, certainty in results:
        if selected_fields:
            shown = {field: result[field] for field in selected_fields if field in result}
        else:
            shown = result
        print(f"Certainty: {certainty:.4f}")
        print(f"Result: {json.dumps(shown, indent=2)}")
        print("-" * 40)
    print("-" * 80)
    return results


def get_unique_products(products: List[Tuple[Dict[str, Any], float]]) -> List[Product]:
    """Convert search hits into ``Product`` objects, one per distinct name.

    Args:
        products: ``(product_dict, certainty)`` pairs; the certainty is ignored.

    Returns:
        One ``Product`` per distinct ``"name"``, in first-seen order. Only the
        first hit for a name is used. Every field other than ``name`` falls
        back to an empty string when the hit does not provide it.
    """
    # Fields copied from the hit with "" as the fallback; "name" is mandatory.
    defaulted_fields = (
        "id",
        "ids",
        "manufacturer",
        "form_factor",
        "processor",
        "core_count",
        "processor_tdp",
        "memory",
        "io",
        "operating_system",
        "environmentals",
        "certifications",
        "short_summary",
        "full_summary",
        "full_product_description",
    )

    first_by_name = {}
    for record, _certainty in products:
        product_name = record["name"]
        if product_name in first_by_name:
            continue  # a hit with this name was already converted
        attributes = {field: record.get(field, "") for field in defaulted_fields}
        first_by_name[product_name] = Product(name=product_name, **attributes)
    return list(first_by_name.values())


def compare_products(extended_products: List[str], original_products: List[str]) -> None:
    """Print two product-name lists side by side, followed by overlap counts.

    Args:
        extended_products: Names found via the expanded queries (right column).
        original_products: Names found via the original query (left column).

    The shorter list is padded with blank cells so both columns have the same
    number of rows. Nothing is returned; all output goes to stdout.
    """
    row_total = max(len(extended_products), len(original_products))
    left_column = list(original_products) + [""] * (row_total - len(original_products))
    right_column = list(extended_products) + [""] * (row_total - len(extended_products))

    print(f"{'Original Products':<80}{'Extended Products':<80}")
    print("-" * 160)
    for left_cell, right_cell in zip(left_column, right_column):
        print(f"{left_cell:<80}{right_cell:<80}")
    print("-" * 160)

    print(f"Total original products: {len(original_products)}")
    print(f"Total extended products: {len(extended_products)}")

    # Set arithmetic ignores duplicates and order within each list
    original_set = set(original_products)
    extended_set = set(extended_products)
    print(f"Common products: {len(original_set & extended_set)}")
    print(f"Unique to original: {len(original_set - extended_set)}")
    print(f"Unique to extended: {len(extended_set - original_set)}")

In [7]:

# 1. Direct Query-based Search
print("1. Direct Query-based Search")
original_query = "Find all devices with Quad-core ARM Cortex-A53 as processor with 4GB RAM"
original_products = await display_search_results(original_query, 10)

1. Direct Query-based Search
Searching for: Find all devices with Quad-core ARM Cortex-A53 as processor with 4GB RAM


2024-09-09 15:55:52,038 - INFO - HTTP Request: POST http://localhost:8080/v1/graphql "HTTP/1.1 200 OK"


--------------------------------------------------------------------------------
Certainty: 0.8973
Result: {
  "name": "RMNMMI Series SMARC CPU Module",
  "ids": "RM-N8MMI-Q208I",
  "manufacturer": "IBASE",
  "form_factor": "82mm x 50mm",
  "processor": "NXP ARM Cortex-A Cortex-M i.MX 8M Mini Quad 1.8GHz",
  "core_count": "Quad-core",
  "processor_tdp": "Not available",
  "memory": "4GB LPDDR4",
  "io": "GPIO, I2C, CAN Bus, LAN, USB, UART, SPI, PCIe, SATA",
  "operating_system": "Yocto, Android",
  "environmentals": "Operating Temperature: -40\u00b0C to 85\u00b0C, Humidity: 90% RH @ 60\u00b0C non-condensing, Shock: 50G 11ms, Vibration: 5Hz to 500Hz 0.5G",
  "certifications": "CE, FCC Class B",
  "short_summary": "RMNMMI Series SMARC CPU Module with NXP ARM Cortex-A Cortex-M i.MX 8M Mini Quad 1.8GHz processor, 4GB LPDDR4, and rich peripheral I/O support.",
  "full_summary": "The RMNMMI Series SMARC CPU Module features an NXP ARM Cortex-A Cortex-M i.MX 8M Mini Quad 1.8GHz processor, 4GB 

In [None]:

# 2. Expanded Queries Search
print("\n2. Expanded Queries Search")
expanded_products = []
for query in query_processor_result["expanded_queries"]:
    products = await display_search_results(query, ["name", "processor", "memory"], 5)
    expanded_products.extend(products)

unique_expanded_products = get_unique_products(expanded_products)

# 3. Filtered Search
print("\n3. Filtered Search")
filtered_products = await weaviate_service.search_products(
    original_query,
    limit=10,
    filter_dict=query_processor_result["filters"]
)
display_search_results(filtered_products, ["name", "processor", "memory"])

# 4. Semantic Search with move_to
print("\n4. Semantic Search with move_to")
move_to_products = await weaviate_service.search_products(
    original_query,
    limit=10,
    move_to={"concepts": ["processor", "memory"], "force": 0.5}
)
display_search_results(move_to_products, ["name", "processor", "memory"])

# 5. Hybrid Search (Combine Direct and Semantic)
print("\n5. Hybrid Search (Combine Direct and Semantic)")
# Each hit is a (product_dict, certainty) pair, so the pair must be unpacked
# before reading the name; indexing the tuple with "name" raises TypeError.
direct_results = {product["name"] for product, _certainty in original_products}
semantic_results = {product["name"] for product, _certainty in move_to_products}
# "Hybrid" here is simply the union of the two name sets
hybrid_results = direct_results | semantic_results
print(f"Hybrid results: {hybrid_results}")

# 6. Embedding Analysis
print("\n6. Embedding Analysis")
async def get_embedding(text):
    return await openai_service.create_embedding(text)

query_embedding = await get_embedding(original_query)

# Sample a few products for embedding analysis
sample_products = original_products[:3] + expanded_products[:3] + filtered_products[:3]
product_embeddings = [await get_embedding(p["name"]) for p in sample_products]

# Calculate cosine similarities
similarities = [cosine_similarity([query_embedding], [pe])[0][0] for pe in product_embeddings]

for product, similarity in zip(sample_products, similarities):
    print(f"Product: {product['name']}, Similarity: {similarity}")

# 7. Analysis and Insights
print("\n7. Analysis and Insights")
print("Comparing result sets:")
# Hits are (product_dict, certainty) pairs, so names are read after unpacking.
# dict.fromkeys de-duplicates the expanded hits by name in first-seen order.
original_names = [product["name"] for product, _certainty in original_products]
expanded_names = list(
    dict.fromkeys(product["name"] for product, _certainty in expanded_products)
)
# Keywords keep each list under its matching column header in compare_products
compare_products(extended_products=expanded_names, original_products=original_names)

print("\nAnalyzing effectiveness of filters:")
active_filters = query_processor_result["filters"]
# NOTE(review): this is an exact string match, so a hit whose memory reads
# "4GB LPDDR4" does not count as matching the "4GB" filter. Confirm intent.
matching_hits = [
    product
    for product, _certainty in filtered_products
    if all(product.get(field) == wanted for field, wanted in active_filters.items())
]
# Guard against an empty result list instead of raising ZeroDivisionError
filter_effectiveness = len(matching_hits) / len(filtered_products) if filtered_products else 0.0
print(f"Filter effectiveness: {filter_effectiveness * 100}%")

print("\nComparing move_to results with direct search:")
move_to_names = {product["name"] for product, _certainty in move_to_products}
shared_names = move_to_names & set(original_names)
move_to_effectiveness = len(shared_names) / len(move_to_products) if move_to_products else 0.0
print(f"move_to effectiveness: {move_to_effectiveness * 100}%")

# 8. Conclusion and Recommendations
print("\n8. Conclusion and Recommendations")
# This section will be filled based on the analysis results