In [None]:
import pandas as pd

import sys
sys.path.insert(0, "..")

from src.config.paths import CHROMA_DIR, CVS_PATH_PROCESSED, JOBS_PATH_PROCESSED
from src.db_ingestion.chroma_client import get_client, get_collection, add_to_collection
from src.db_ingestion.metadata_extractor import CVMetadataExtractorCrew, JobMetadataExtractorCrew

In [None]:
# Read both processed corpora; the files are semicolon-delimited CSVs.
cvs_data, jobs_data = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in (CVS_PATH_PROCESSED, JOBS_PATH_PROCESSED)
)

In [None]:
# Open the ChromaDB client for the store located at CHROMA_DIR.
client = get_client(CHROMA_DIR)

# Obtain one collection handle per corpus: "cvs" first, then "jobs".
cvs_collection, jobs_collection = (
    get_collection(client, collection_name)
    for collection_name in ("cvs", "jobs")
)

# Build the metadata extractors, one for CVs and one for job postings.
cv_crew, job_crew = CVMetadataExtractorCrew(), JobMetadataExtractorCrew()

In [None]:
# Ingest the CV corpus into the "cvs" collection.
# Reuse the `cv_crew` extractor created in the setup cell instead of
# constructing a second CVMetadataExtractorCrew that would leave it unused.
# NOTE(review): max_rpm presumably caps extractor requests per minute;
# confirm against add_to_collection.
add_to_collection(
    metadata_extractor=cv_crew,
    corpus=cvs_data,
    collection=cvs_collection,
    max_rpm=10,
)

In [None]:
# Ingest the job corpus into the "jobs" collection.
# Reuse the `job_crew` extractor created in the setup cell instead of
# constructing a second JobMetadataExtractorCrew that would leave it unused.
# NOTE(review): max_rpm presumably caps extractor requests per minute;
# confirm against add_to_collection.
add_to_collection(
    metadata_extractor=job_crew,
    corpus=jobs_data,
    collection=jobs_collection,
    max_rpm=10,
)

In [None]:
# Smoke-test semantic search against the CV collection.
query_texts = "python machine learning engineer"

# Ask for the 5 closest matches, filtered on country == "US".
results = cvs_collection.query(
    where={"country": "US"},
    n_results=5,
    query_texts=[query_texts],
)
results

In [None]:
# Smoke-test semantic search against the job collection.
query_texts = "python machine learning engineer"

# Ask for the 5 closest matches, filtered on country == "US".
results = jobs_collection.query(
    where={"country": "US"},
    n_results=5,
    query_texts=[query_texts],
)
results