<a href="https://colab.research.google.com/github/AliAbdallah21/ChromaDB-Employee-Records-Search/blob/main/Similarity_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title Step 1: Install Required Libraries

%%capture
!pip install chromadb sentence-transformers

print("Libraries installed successfully. Proceed to the next step for imports.")


In [None]:
# @title Step 2: Import Necessary Modules

# chromadb supplies the client and collection API used by the following cells.
import chromadb
# embedding_functions provides SentenceTransformerEmbeddingFunction (used in Step 3).
from chromadb.utils import embedding_functions
# NOTE(review): `os` is not referenced by any cell visible in this notebook —
# confirm nothing else needs it before removing.
import os

print("Modules imported successfully. Now, initialize the ChromaDB client and embedding function.")


In [None]:
# @title Step 3: Initialize ChromaDB Client and Embedding Function

# In-memory client: this notebook does not configure any persistence path.
client = chromadb.Client()

print("ChromaDB client initialized (in-memory).")

# Keep the model name in a variable so the log line below reports exactly what
# was requested. Reading it back from the embedding function object relies on an
# attribute that the constructor is not documented to expose.
embedding_model_name = "all-MiniLM-L6-v2"

embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=embedding_model_name
)

print(f"Embedding function loaded using model: {embedding_model_name}")
print("ChromaDB client and embedding function are ready. Proceed to create a collection.")


In [None]:
# @title Step 4: Create or Get a ChromaDB Collection

collection_name = "employee_collection"

# Start from None so that a failed setup (or a stale handle left over from an
# earlier run of this cell) is never reported as "ready" at the end.
collection = None

try:
    collection = client.get_or_create_collection(
        name=collection_name,
        embedding_function=embedding_function
    )
    print(f"Collection '{collection_name}' created or retrieved successfully.")
except Exception as e:
    print(f"Error creating/getting collection: {e}")
    # In Colab, sometimes a previous run might leave a corrupted state.
    # Fall back to deleting the collection and creating it again from scratch.
    try:
        print(f"Attempting to delete and re-create collection '{collection_name}'...")
        client.delete_collection(name=collection_name)
        collection = client.get_or_create_collection(
            name=collection_name,
            embedding_function=embedding_function
        )
        print(f"Collection '{collection_name}' re-created successfully.")
    except Exception as e_recreate:
        print(f"Failed to delete and re-create collection: {e_recreate}")
        print("Please consider restarting your Colab runtime (Runtime -> Restart runtime) and running all cells from the beginning.")

# Only claim readiness when one of the attempts above actually produced a collection.
if collection is not None:
    print("Collection is ready for adding documents.")
else:
    print("Collection is NOT ready. Resolve the error above before running the next steps.")



In [None]:
# @title Step 5: Define Employee Data

# Sample employee records used as the notebook's data set.
# Every record carries the same keys:
#   id              - unique string identifier ("employee_<n>")
#   name            - full name
#   experience      - years of experience, stored as an int
#   department      - "Engineering", "Marketing" or "HR"
#   role            - job title
#   skills          - comma-separated free-text skill list
#   location        - city name
#   employment_type - "Full-time" or "Part-time"
# This cell only defines the data; nothing is written to the collection here.
employees = [
    {
        "id": "employee_1",
        "name": "John Doe",
        "experience": 5,
        "department": "Engineering",
        "role": "Software Engineer",
        "skills": "Python, JavaScript, React, Node.js, databases",
        "location": "New York",
        "employment_type": "Full-time"
    },
    {
        "id": "employee_2",
        "name": "Jane Smith",
        "experience": 8,
        "department": "Marketing",
        "role": "Marketing Manager",
        "skills": "Digital marketing, SEO, content strategy, analytics, social media",
        "location": "Los Angeles",
        "employment_type": "Full-time"
    },
    {
        "id": "employee_3",
        "name": "Alice Johnson",
        "experience": 3,
        "department": "HR",
        "role": "HR Coordinator",
        "skills": "Recruitment, employee relations, HR policies, training programs",
        "location": "Chicago",
        "employment_type": "Full-time"
    },
    {
        "id": "employee_4",
        "name": "Michael Brown",
        "experience": 12,
        "department": "Engineering",
        "role": "Senior Software Engineer",
        "skills": "Java, Spring Boot, microservices, cloud architecture, DevOps",
        "location": "San Francisco",
        "employment_type": "Full-time"
    },
    {
        "id": "employee_5",
        "name": "Emily Wilson",
        "experience": 2,
        "department": "Marketing",
        "role": "Marketing Assistant",
        "skills": "Content creation, email marketing, market research, social media management",
        "location": "Austin",
        "employment_type": "Part-time"
    },
    {
        "id": "employee_6",
        "name": "David Lee",
        "experience": 15,
        "department": "Engineering",
        "role": "Engineering Manager",
        "skills": "Team leadership, project management, software architecture, mentoring",
        "location": "Seattle",
        "employment_type": "Full-time"
    },
    {
        "id": "employee_7",
        "name": "Sarah Clark",
        "experience": 8,
        "department": "HR",
        "role": "HR Manager",
        "skills": "Performance management, compensation planning, policy development, conflict resolution",
        "location": "Boston",
        "employment_type": "Full-time"
    },
    {
        "id": "employee_8",
        "name": "Chris Evans",
        "experience": 20,
        "department": "Engineering",
        "role": "Senior Architect",
        "skills": "System design, distributed systems, cloud platforms, technical strategy",
        "location": "New York",
        "employment_type": "Full-time"
    },
    {
        "id": "employee_9",
        "name": "Jessica Taylor",
        "experience": 4,
        "department": "Marketing",
        "role": "Marketing Specialist",
        "skills": "Brand management, advertising campaigns, customer analytics, creative strategy",
        "location": "Miami",
        "employment_type": "Full-time"
    },
    {
        "id": "employee_10",
        "name": "Alex Rodriguez",
        "experience": 18,
        "department": "Engineering",
        "role": "Lead Software Engineer",
        "skills": "Full-stack development, React, Python, machine learning, data science",
        "location": "Denver",
        "employment_type": "Full-time"
    },
    {
        "id": "employee_11",
        "name": "Hannah White",
        "experience": 6,
        "department": "HR",
        "role": "HR Business Partner",
        "skills": "Strategic HR, organizational development, change management, employee engagement",
        "location": "Portland",
        "employment_type": "Full-time"
    },
    {
        "id": "employee_12",
        "name": "Kevin Martinez",
        "experience": 10,
        "department": "Engineering",
        "role": "DevOps Engineer",
        "skills": "Docker, Kubernetes, AWS, CI/CD pipelines, infrastructure automation",
        "location": "Phoenix",
        "employment_type": "Full-time"
    },
    {
        "id": "employee_13",
        "name": "Rachel Brown",
        "experience": 7,
        "department": "Marketing",
        "role": "Marketing Director",
        "skills": "Strategic marketing, team leadership, budget management, campaign optimization",
        "location": "Atlanta",
        "employment_type": "Full-time"
    },
    {
        "id": "employee_14",
        "name": "Matthew Garcia",
        "experience": 3,
        "department": "Engineering",
        "role": "Junior Software Engineer",
        "skills": "JavaScript, HTML/CSS, basic backend development, learning frameworks",
        "location": "Dallas",
        "employment_type": "Full-time"
    },
    {
        "id": "employee_15",
        "name": "Olivia Moore",
        "experience": 12,
        "department": "Engineering",
        "role": "Principal Engineer",
        "skills": "Technical leadership, system architecture, performance optimization, mentoring",
        "location": "San Francisco",
        "employment_type": "Full-time"
    },
]



In [None]:
# @title Step 6: Define Employee Documents
# One free-text description per employee, in the same order as `employees`.
# These strings are what gets embedded, so they spell out role, experience,
# department, skills, location and employment type in plain sentences.
employee_documents = [
    (
        f"{record['role']} with {record['experience']} years of experience in {record['department']}. "
        f"Skills: {record['skills']}. Located in {record['location']}. "
        f"Employment type: {record['employment_type']}."
    )
    for record in employees
]

print(f"Prepared {len(employee_documents)} employee text documents for embedding.")


In [None]:
# @title Step 7: Add Data to Collection

# Write every employee record (ID, text document, metadata) into the collection.
# `upsert` is used instead of `add` so that re-running this cell inserts missing
# records and refreshes existing ones, rather than colliding with IDs that are
# already stored from a previous run.
print("Attempting to add data to the collection using employee_documents...")

# Metadata keys copied from each employee record; these are the fields that can
# later be used in `where` filters. `skills` only appears in the document text.
metadata_fields = ("name", "department", "role", "experience", "location", "employment_type")

try:
    collection.upsert(
        # Employee IDs act as the unique identifier of each record
        ids=[employee["id"] for employee in employees],
        # Text documents prepared in Step 6 (same order as `employees`)
        documents=employee_documents,
        metadatas=[
            {field: employee[field] for field in metadata_fields}
            for employee in employees
        ],
    )
    document_count = collection.count()
except Exception as e:
    print(f"Error adding documents: {e}")
    print("Please check your input data and the collection's status.")
else:
    # Success messages are only printed when the write actually went through.
    print(f"\nSuccessfully added {len(employees)} employee records to the collection.")
    print(f"Current document count in collection: {document_count}")
    print("Employee data added. The collection is now populated.")


In [None]:
# @title Step 8: Query the Collection

# Fetch every stored record with an unfiltered `get` call.
all_items = collection.get()

# Summarise what came back: total count plus a preview of up to three records.
print("\nCollection contents:")
print(f"Number of documents: {len(all_items['documents'])}")
print("Sample retrieved items:")
for position, sample_text in enumerate(all_items['documents'][:3]):
    sample_id = all_items['ids'][position]
    sample_metadata = all_items['metadatas'][position]
    print(f"  ID: {sample_id}, Document: '{sample_text}', Metadata: {sample_metadata}")

print("Collection contents retrieved and summarized.")


In [None]:
# @title Step 9: Define Advanced Search Function (Updated)


def _print_similarity_hits(collection, query_text, n_results=3):
    """Run one similarity query and print each hit's ID, text and distance."""
    print(f"Executing query: '{query_text}'")
    results = collection.query(query_texts=[query_text], n_results=n_results)

    # Results are nested per query text; a single text is sent, so read index 0.
    hit_ids = results["ids"][0][:n_results] if results and results["ids"] else []
    if not hit_ids:
        print(f'No documents found similar to "{query_text}".')
        return

    print(f'Top {len(hit_ids)} similar documents to "{query_text}":')
    hits = zip(hit_ids, results["documents"][0], results["distances"][0])
    for doc_id, text, distance in hits:
        # The value printed under "Score" is taken from results["distances"].
        shown_text = text if text else "Text not available"
        print(f' - ID: {doc_id}, Text: "{shown_text}", Score: {distance:.4f}')


def _role_summary(metadata):
    """Format an employee's metadata as 'name: role (N years)'."""
    return f"{metadata['name']}: {metadata['role']} ({metadata['experience']} years)"


def _location_summary(metadata):
    """Format an employee's metadata as 'name: location'."""
    return f"{metadata['name']}: {metadata['location']}"


def _print_filtered_employees(collection, where, group_label, summarize):
    """Fetch records matching a metadata filter and print one line per match."""
    print(f"Executing filter: {where}")
    results = collection.get(where=where)

    # Guard before measuring so an empty/missing result reports zero matches.
    matched_ids = results["ids"] if results else []
    print(f"Found {len(matched_ids)} {group_label}:")
    if not matched_ids:
        print("  No employees found for this filter.")
        return

    for _, metadata in zip(matched_ids, results["metadatas"]):
        print(f"  - {summarize(metadata)}")


def _print_combined_search(collection, query_text, where, filter_summary, n_results=5):
    """Run a similarity query restricted by a metadata filter and print the hits."""
    print(f"Executing query: '{query_text}' with filters ({filter_summary})")
    results = collection.query(
        query_texts=[query_text],
        n_results=n_results,
        where=where,
    )

    hit_ids = results["ids"][0] if results and results["ids"] else []
    if not hit_ids:
        print(f'No documents found for combined query: "{query_text}".')
        return

    print(f"Query results for '{query_text}' with filters:")
    print(f"Found {len(hit_ids)} matching employees:")
    rows = zip(
        hit_ids,
        results["documents"][0],
        results["distances"][0],
        results["metadatas"][0],
    )
    for rank, (doc_id, document, distance, metadata) in enumerate(rows, start=1):
        print(f"  {rank}. {metadata['name']} ({doc_id}) - Distance: {distance:.4f}")
        print(f"     {metadata['role']} in {metadata['location']} ({metadata['experience']} years)")
        # Same missing-text handling as the plain similarity examples, so a
        # record stored without a document does not abort the whole run.
        snippet = document[:80] if document else "Text not available"
        print(f"     Document snippet: {snippet}...")


def perform_advanced_search(collection, all_items):
    """Print a fixed series of example searches against ``collection``.

    Runs two similarity queries, three metadata-filter lookups and one
    combined (similarity + filter) query, printing the results of each.

    Args:
        collection: Object exposing ``query(query_texts=..., n_results=...,
            where=...)`` and ``get(where=...)`` that return dict-like results
            keyed by ``ids``, ``documents``, ``distances`` and ``metadatas``.
        all_items: Not read by this function; kept so existing callers that
            pass it continue to work.

    Returns:
        None. All output goes to stdout. Any exception raised by a search is
        caught and reported as a printed message instead of propagating.
    """
    print("\n--- Starting Advanced Search Operations ---")
    try:
        print("=== Similarity Search Examples ===")

        print("\n1. Searching for Python developers:")
        _print_similarity_hits(collection, "Python developer with web development experience")

        print("\n2. Searching for leadership and management roles:")
        _print_similarity_hits(collection, "team leader manager with experience")

        print("\nSimilarity search examples completed.")

        print("\n=== Metadata Filtering Examples ===")

        print("\n3. Finding all Engineering employees:")
        _print_filtered_employees(
            collection,
            {"department": "Engineering"},
            "Engineering employees",
            _role_summary,
        )

        print("\n4. Finding employees with 10+ years experience:")
        _print_filtered_employees(
            collection,
            {"experience": {"$gte": 10}},
            "senior employees",
            _role_summary,
        )

        print("\n5. Finding employees in California (San Francisco or Los Angeles):")
        _print_filtered_employees(
            collection,
            {"location": {"$in": ["San Francisco", "Los Angeles"]}},
            "employees in California",
            _location_summary,
        )

        print("\nMetadata filtering examples completed.")

        print("\n=== Combined Search: Similarity + Metadata Filtering ===")

        print("\n6. Finding senior Python developers in major tech cities:")
        _print_combined_search(
            collection,
            "senior Python developer full-stack",
            {
                "$and": [
                    {"experience": {"$gte": 8}},
                    {"location": {"$in": ["San Francisco", "New York", "Seattle"]}},
                ]
            },
            "8+ years, major tech cities",
        )

        print("\nCombined search examples completed.")
        print("\n--- Advanced Search Operations Finished ---")

    except Exception as error:
        # Deliberately broad: this is the demo's top-level boundary, and a
        # failed example should be reported rather than raise in the notebook.
        print(f"Error in advanced search: {error}")
        print("Please ensure 'collection' and 'all_items' are correctly passed and ChromaDB is initialized.")


print("Advanced search function 'perform_advanced_search' updated with all search logic and improved output.")


In [None]:
# @title Step 10: Call Advanced Search Function

# Run every example search defined in Step 9 and print the results.
# `collection` and `all_items` must already exist from earlier cells; `all_items`
# is passed because the function signature requires it, although the function
# body in Step 9 never reads it.
print("Calling perform_advanced_search function...")
perform_advanced_search(collection, all_items)
print("Advanced search function call completed.")
