In [None]:
import sys

import pandas as pd

sys.path.insert(0, "..")

from src.config.paths import CHROMA_DIR, CVS_PATH_PROCESSED, JOBS_PATH_PROCESSED
from src.constants import GUARDRAIL_MAX_RETRIES
from src.db_ingestion.chroma_client import add_to_collection, get_client, get_collection
from src.talent_selection_flow.crews.metadata_extraction_crew.crews import (
    CVMetadataExtractorCrew,
    JobMetadataExtractorCrew,
)
from src.talent_selection_flow.crews.metadata_extraction_crew.enums import (
    EducationLevel,
    EmploymentType,
    ExperienceLevel,
)

In [None]:
# Load the processed CV and job tables; both files are read as semicolon-delimited CSV
cvs_data, jobs_data = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in (CVS_PATH_PROCESSED, JOBS_PATH_PROCESSED)
)

In [None]:
# Init ChromaDB client
client = get_client(CHROMA_DIR)

# Fetch the "cvs" and "jobs" collections from that client (same order as they are used below)
cvs_collection, jobs_collection = (
    get_collection(client, collection_name)
    for collection_name in ("cvs", "jobs")
)

# Init metadata extractors; both crews get the same guardrail retry limit
cv_crew, job_crew = (
    crew_cls(guardrail_max_retries=GUARDRAIL_MAX_RETRIES)
    for crew_cls in (CVMetadataExtractorCrew, JobMetadataExtractorCrew)
)

In [None]:
# Store cvs
# Slash-separated option strings built from the enum members. They are handed to
# add_to_collection as extra keyword arguments (presumably consumed by the CV
# metadata extractor — confirm in add_to_collection).
cv_enum_options = {
    "educationlevel_options": "/".join(EducationLevel),
    "experiencelevel_options": "/".join(ExperienceLevel),
}

add_to_collection(
    metadata_extractor=cv_crew,
    corpus=cvs_data,
    collection=cvs_collection,
    max_rpm=10,
    verbose=False,
    **cv_enum_options,
)

In [None]:
# Store jobs
# Slash-separated option strings built from the enum members. They are handed to
# add_to_collection as extra keyword arguments (presumably consumed by the job
# metadata extractor — confirm in add_to_collection).
job_enum_options = {
    "employmenttype_options": "/".join(EmploymentType),
    "experiencelevel_options": "/".join(ExperienceLevel),
}

add_to_collection(
    metadata_extractor=job_crew,
    corpus=jobs_data,
    collection=jobs_collection,
    max_rpm=10,
    verbose=False,
    **job_enum_options,
)

In [None]:
# Smoke-test semantic search against the CV collection
query_texts = "python machine learning engineer"

# Ask for the 5 nearest matches; `where` narrows the search to entries whose
# "country" metadata equals "US"
results = cvs_collection.query(
    where={"country": "US"},
    n_results=5,
    query_texts=[query_texts],
)
results

In [None]:
# Smoke-test semantic search against the job collection
query_texts = "python machine learning engineer"

# Ask for the 5 nearest matches; `where` narrows the search to entries whose
# "country" metadata equals "US"
results = jobs_collection.query(
    where={"country": "US"},
    n_results=5,
    query_texts=[query_texts],
)
results