In [None]:
import pandas as pd
from tqdm import tqdm

import sys
sys.path.insert(0, "..")

from src.config.paths import CHROMA_DIR, CVS_PATH_PROCESSED, JOBS_PATH_PROCESSED
from src.db_ingestion.chroma_client import get_client, get_collection
from src.db_ingestion.metadata_extractor import CVMetadataExtractorCrew, JobMetadataExtractorCrew
from src.db_ingestion.schemas import CVMetadata, JobMetadata

In [2]:
# Read the processed CV and job tables; both files are semicolon-delimited CSVs.
cvs_data, jobs_data = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in (CVS_PATH_PROCESSED, JOBS_PATH_PROCESSED)
)

In [4]:
# Example CV as free-form text (contact details, summary, skills, experience,
# education, languages).
# NOTE(review): cv_text is not referenced by any other cell visible in this
# notebook; the CVMetadata example below is built from hand-written values.
cv_text = """
John Doe
Email: john.doe@example.com
Phone: +34 600 123 456
Location: Madrid, Spain

Professional Summary
Full‑stack software engineer with 5 years of experience building scalable backend services and APIs. 
Strong background in Python, FastAPI, Docker, and machine learning workflows. Passionate about clean code, 
automation, and cloud‑native architectures.

Skills
- Python
- FastAPI
- Docker
- Machine Learning
- SQL & PostgreSQL
- REST API Development
- Git & CI/CD

Professional Experience
Software Engineer — TechNova Solutions (2019–2024)
- Designed and implemented backend microservices using FastAPI and Docker.
- Built ML‑powered features for analytics using scikit‑learn and TensorFlow.
- Collaborated with cross‑functional teams to optimize infrastructure and CI/CD pipelines.
- Improved system performance by 30% through architectural refactoring.

Education
Bachelor’s Degree in Computer Science
Universidad Complutense de Madrid — 2019

Languages
- English (Fluent)
- Spanish (Native)
"""
# Hand-written example of a CVMetadata record: the field values are collected
# in a mapping first and then unpacked into the model constructor.
_example_cv_fields = {
    "cv_id": "CV-005",
    "category": "Software Engineering",
    "years_experience": 5,
    "skills": "Python, FastAPI, Docker, Machine Learning",
    "education_level": "Bachelor's Degree",
    "languages": "English, Spanish",
    "summary": "Full‑stack developer with strong experience in backend systems and ML.",
}
cv_metadata = CVMetadata(**_example_cv_fields)

# Display the record as a plain dict.
cv_metadata.model_dump()

{'cv_id': 'CV-005',
 'category': 'Software Engineering',
 'years_experience': 5,
 'skills': 'Python, FastAPI, Docker, Machine Learning',
 'education_level': "Bachelor's Degree",
 'languages': 'English, Spanish',
 'summary': 'Full‑stack developer with strong experience in backend systems and ML.'}

In [None]:
# ChromaDB client for the store located at CHROMA_DIR.
client = get_client(CHROMA_DIR)

# One collection per document type ("cvs" first, then "jobs"), both requested
# with if_embedding=False.
# NOTE(review): get_collection is a project helper; the original comment
# describes it as get-or-create — confirm in src.db_ingestion.chroma_client.
cv_collection, job_collection = (
    get_collection(client, collection_name, if_embedding=False)
    for collection_name in ("cvs", "jobs")
)

# Metadata extractor crews, one for CVs and one for job postings.
cv_crew, job_crew = CVMetadataExtractorCrew(), JobMetadataExtractorCrew()

In [None]:
# Ingest every CV: run the metadata-extraction crew on the CV text, then store
# the text together with the extracted metadata in the "cvs" collection.
#
# IDs that are already in the collection are skipped, so re-running this cell
# (for example after a failure part-way through) resumes where it stopped
# instead of repeating the extraction and re-adding existing IDs.
ingested_cv_ids = set(cv_collection.get(include=[])["ids"])

for _, row in tqdm(cvs_data.iterrows(), total=len(cvs_data)):
    # Chroma requires string IDs; pandas may have parsed doc_id as an integer.
    doc_id = str(row["doc_id"])
    if doc_id in ingested_cv_ids:
        continue

    # A separate name keeps the example `cv_metadata` instance defined earlier
    # from being overwritten by an object of a different kind.
    cv_output = cv_crew.crew().kickoff(inputs={"content": row["content"]})

    # Fail with the offending document ID rather than an AttributeError on None.
    # NOTE(review): assumes `.pydantic` is None when no structured output was
    # produced — confirm against the crew's task configuration.
    if cv_output.pydantic is None:
        raise ValueError(f"No structured metadata returned for CV {doc_id!r}")

    cv_collection.add(
        ids=[doc_id],
        documents=[row["content"]],
        metadatas=[cv_output.pydantic.model_dump()],
    )
    # Also protects against duplicate doc_id values inside the CSV itself.
    ingested_cv_ids.add(doc_id)

In [None]:
# Ingest every job posting: run the metadata-extraction crew on the posting
# text, then store the text together with the extracted metadata in the "jobs"
# collection.
#
# IDs that are already in the collection are skipped, so re-running this cell
# (for example after a failure part-way through) resumes where it stopped
# instead of repeating the extraction and re-adding existing IDs.
ingested_job_ids = set(job_collection.get(include=[])["ids"])

for _, row in tqdm(jobs_data.iterrows(), total=len(jobs_data)):
    # Chroma requires string IDs; pandas may have parsed doc_id as an integer.
    doc_id = str(row["doc_id"])
    if doc_id in ingested_job_ids:
        continue

    job_output = job_crew.crew().kickoff(inputs={"content": row["content"]})

    # Fail with the offending document ID rather than an AttributeError on None.
    # NOTE(review): assumes `.pydantic` is None when no structured output was
    # produced — confirm against the crew's task configuration.
    if job_output.pydantic is None:
        raise ValueError(f"No structured metadata returned for job {doc_id!r}")

    job_collection.add(
        ids=[doc_id],
        documents=[row["content"]],
        metadatas=[job_output.pydantic.model_dump()],
    )
    # Also protects against duplicate doc_id values inside the CSV itself.
    ingested_job_ids.add(doc_id)

In [None]:
# Test semantic search on the CV collection: top 5 CVs for a free-text query,
# restricted to entry-level candidates.
#
# The CVMetadata example in this notebook has no "experience_level" field
# (its fields are cv_id, category, years_experience, skills, education_level,
# languages, summary), so a filter on that key cannot match metadata written
# by the CV ingestion loop. Filter on the numeric years_experience field
# instead.
# NOTE(review): the 2-year cut-off for "entry level" is an assumption — adjust
# to the project's definition.
ENTRY_LEVEL_MAX_YEARS = 2

query_texts = "python machine learning engineer"

results = cv_collection.query(
    query_texts=[query_texts],
    n_results=5,
    where={"years_experience": {"$lte": ENTRY_LEVEL_MAX_YEARS}},
)
results

In [None]:
# Semantic-search smoke test on the job collection: the 5 closest postings to
# a free-text query, limited to records whose "experience_level" metadata
# equals "entry".
# NOTE(review): assumes JobMetadata defines an experience_level field —
# confirm against src.db_ingestion.schemas.
query_texts = "python machine learning engineer"

results = job_collection.query(
    where={"experience_level": "entry"},
    n_results=5,
    query_texts=[query_texts],
)
results