In [1]:
import pandas as pd

# Put the parent directory first on the module search path so that the `src.*`
# imports below can be resolved. NOTE(review): ".." is relative to the kernel's
# working directory — presumably this notebook's folder; verify if run elsewhere.
import sys
sys.path.insert(0, "..")

from src.config.paths import CHROMA_DIR, CVS_PATH_PROCESSED, JOBS_PATH_PROCESSED
from src.db_ingestion.chroma_client import get_client, get_collection, add_to_collection
from src.db_ingestion.metadata_extractor import CVMetadataExtractorCrew, JobMetadataExtractorCrew

In [2]:
# Read the CV and job tables from their processed CSV files; both are parsed
# with ";" as the field separator.
cvs_data = pd.read_csv(filepath_or_buffer=CVS_PATH_PROCESSED, delimiter=";")
jobs_data = pd.read_csv(filepath_or_buffer=JOBS_PATH_PROCESSED, delimiter=";")

In [3]:
# Build the ChromaDB client from the configured CHROMA_DIR
client = get_client(CHROMA_DIR)

# Fetch one collection handle per corpus: "cvs" and "jobs"
# NOTE(review): whether get_collection creates a missing collection is not
# visible from this notebook — confirm in src.db_ingestion.chroma_client.
cvs_collection = get_collection(client, "cvs")
jobs_collection = get_collection(client, "jobs")

# Instantiate one metadata extractor crew per corpus (CVs and jobs)
cv_crew = CVMetadataExtractorCrew()
job_crew = JobMetadataExtractorCrew()

In [4]:
# Ingest the CV corpus into the "cvs" collection.
# Reuses the `cv_crew` extractor created in the setup cell instead of building a
# second CVMetadataExtractorCrew instance that would leave `cv_crew` unused.
# NOTE(review): max_rpm presumably caps extractor requests per minute — confirm
# in add_to_collection. The recorded run of this cell took roughly 13 minutes
# for 50 documents.
add_to_collection(
    metadata_extractor=cv_crew,
    corpus=cvs_data,
    collection=cvs_collection,
    max_rpm=5,
)

[32m2026-02-13 14:38:48.554[0m | [1mINFO    [0m | [36msrc.db_ingestion.chroma_client[0m:[36madd_to_collection[0m:[36m61[0m - [1mAdding 50 documents to `cvs` collection.[0m


 26%|██▌       | 13/50 [03:50<10:16, 16.67s/it]
 78%|███████▊  | 39/50 [10:21<02:39, 14.46s/it]
100%|██████████| 50/50 [13:22<00:00, 16.05s/it]


In [5]:
# Ingest the job corpus into the "jobs" collection.
# Reuses the `job_crew` extractor created in the setup cell instead of building a
# second JobMetadataExtractorCrew instance that would leave `job_crew` unused.
# NOTE(review): max_rpm presumably caps extractor requests per minute — confirm
# in add_to_collection. The recorded run of this cell took roughly 8.5 minutes
# for 50 documents.
add_to_collection(
    metadata_extractor=job_crew,
    corpus=jobs_data,
    collection=jobs_collection,
    max_rpm=10,
)

[32m2026-02-13 14:54:10.116[0m | [1mINFO    [0m | [36msrc.db_ingestion.chroma_client[0m:[36madd_to_collection[0m:[36m61[0m - [1mAdding 50 documents to `jobs` collection.[0m


100%|██████████| 50/50 [08:31<00:00, 10.22s/it]


In [None]:
# Smoke-test semantic search on the CV collection: request 5 results for a
# free-text query, restricted by the `where` filter {"country": "US"}.
query_texts = "python machine learning engineer"

results = cvs_collection.query(where={"country": "US"}, n_results=5, query_texts=[query_texts])
results

In [None]:
# Smoke-test semantic search on the job collection: request 5 results for a
# free-text query, restricted by the `where` filter {"country": "US"}.
query_texts = "python machine learning engineer"

results = jobs_collection.query(where={"country": "US"}, n_results=5, query_texts=[query_texts])
results