In [1]:
import pandas as pd

import sys
sys.path.insert(0, "..")

from src.config.paths import CHROMA_DIR, CVS_PATH_PROCESSED, JOBS_PATH_PROCESSED
from src.constants import GUARDRAIL_MAX_RETRIES
from src.db_ingestion.chroma_client import get_client, get_collection, add_to_collection
from src.db_ingestion.metadata_extraction_crew.crew import CVMetadataExtractorCrew, JobMetadataExtractorCrew
from src.db_ingestion.metadata_extraction_crew.enums import ExperienceLevel, EducationLevel, EmploymentType

In [2]:
# Load data
cvs_data = pd.read_csv(CVS_PATH_PROCESSED, sep=";")
jobs_data = pd.read_csv(JOBS_PATH_PROCESSED, sep=";")

In [3]:
# Init ChromaDB client
client = get_client(CHROMA_DIR)

# Get or create the "cvs" collection in ChromaDB
cvs_collection = get_collection(client, "cvs")
jobs_collection = get_collection(client, "jobs")

# Init metadata extractors
cv_crew = CVMetadataExtractorCrew(guardrail_max_retries=GUARDRAIL_MAX_RETRIES)
job_crew = JobMetadataExtractorCrew(guardrail_max_retries=GUARDRAIL_MAX_RETRIES)

In [4]:
# Store cvs
add_to_collection(
    metadata_extractor=CVMetadataExtractorCrew(),
    corpus=cvs_data,
    collection=cvs_collection,
    max_rpm=10,
    verbose=False,
    educationlevel_options="/".join(EmploymentType),
    experiencelevel_options="/".join(ExperienceLevel),
)

[32m2026-02-14 19:08:32.527[0m | [1mINFO    [0m | [36msrc.db_ingestion.chroma_client[0m:[36madd_to_collection[0m:[36m65[0m - [1mAdding 50 documents to `cvs` collection.[0m


  0%|          | 0/50 [00:00<?, ?it/s]

be
[32m2026-02-14 19:08:39.835[0m | [34m[1mDEBUG   [0m | [36msrc.db_ingestion.chroma_client[0m:[36madd_to_collection[0m:[36m84[0m - [34m[1mMetadata: {'skills': 'scheduling appointments, C, Cardiology, customer service, directing, leadership, managing, Office, Patient Care, surgery, answering phones, phones', 'industries': 'Public Relations, Healthcare, Education', 'experience_level': <ExperienceLevel.INTERMEDIATE: 'intermediate'>, 'country': 'US', 'summary': 'Experienced Public Relations Officer with 4+ years of leadership and customer service experience. Skilled in communication, organization, and working with diverse populations. Adept at Microsoft Office and able to work independently or collaboratively.', 'education_level': <EducationLevel.BACHELOR: 'bachelor'>, 'languages': 'unknown'}[0m


  2%|▏         | 1/50 [00:07<06:23,  7.82s/it]

be
[32m2026-02-14 19:08:46.543[0m | [34m[1mDEBUG   [0m | [36msrc.db_ingestion.chroma_client[0m:[36madd_to_collection[0m:[36m84[0m - [34m[1mMetadata: {'skills': 'ATM, auditing, call center, cash handling, cash register, credit, client, customer satisfaction, excellent customer service, customer service, debit, fashion, FSA, inventory, money, negotiating, policies, research, safety, selling, sales, supervising, tax', 'industries': 'Retail, Banking, Customer Service', 'experience_level': <ExperienceLevel.INTERMEDIATE: 'intermediate'>, 'country': 'US', 'summary': 'Customer care professional with over 4 years of experience in customer relations. Skilled in problem-solving, providing compassionate customer service, and exceeding customer satisfaction targets. Experienced in cash handling, sales, and supervising.', 'education_level': <EducationLevel.HIGHSCHOOL: 'highschool'>, 'languages': 'unknown'}[0m


  4%|▍         | 2/50 [00:14<05:41,  7.12s/it]

be
[32m2026-02-14 19:08:55.942[0m | [34m[1mDEBUG   [0m | [36msrc.db_ingestion.chroma_client[0m:[36madd_to_collection[0m:[36m84[0m - [34m[1mMetadata: {'skills': 'Quantitative analysis, metrics, Account reconciliation, balancing, Complex problem solving, Staff leadership, development, Expert in MS Office Suite, Conflict resolution, Strong interpersonal skills, Procedure development, Software Applications, PeopleSoft, Commercial Electronic Office, FileNet, Hogan, Microsoft Outlook, QuickBooks, SEI Trust 3000, Trust Real Estate Management System, Trust Mineral Management System, SharePoint, Kronos, LexisNexis, Lotus Notes, Computer Skills, Proficient in Microsoft Excel, Word, PowerPoint, Publisher, Access, 12,000 KSPM ten-key, Accurate typing 75+WPM, Professional Skills, Strong analytical skills, ability to effectively multi-task, dependable, outstanding customer service skills, leadership skills, excellent problem solving skills', 'industries': 'Banking, Finance, Operations M

  6%|▌         | 3/50 [00:23<06:22,  8.14s/it]

be
[32m2026-02-14 19:09:00.350[0m | [34m[1mDEBUG   [0m | [36msrc.db_ingestion.chroma_client[0m:[36madd_to_collection[0m:[36m84[0m - [34m[1mMetadata: {'skills': 'Database, Faxing, Filing, general office duties, Internet Explorer, Excel, mail, office, Outlook, Power point, Microsoft Word, repairs, spreadsheets, supply inventory, technician, Answering phones, Good Customer Service, Multi-Task Management, Spreadsheets, 10-key, Data entry, Translator', 'industries': 'Education, Administration', 'experience_level': <ExperienceLevel.INTERMEDIATE: 'intermediate'>, 'country': 'US', 'summary': 'Committed and motivated Administrative Assistant with exceptional customer service and decision making skills. Strong work ethic, professional demeanor and great initiative. Skilled in Microsoft Office, time management, and multitasking.', 'education_level': <EducationLevel.OTHER: 'other'>, 'languages': 'unknown'}[0m


  8%|▊         | 4/50 [00:28<05:06,  6.67s/it]

be
[32m2026-02-14 19:09:11.963[0m | [34m[1mDEBUG   [0m | [36msrc.db_ingestion.chroma_client[0m:[36madd_to_collection[0m:[36m84[0m - [34m[1mMetadata: {'skills': 'Online Staff Training, Employee Management, Instructional Strategies, Conflict Resolution, Performance Evaluation, Behavioral Management, Project management, Leadership/communication skills, Human resources, Employee relations, Customer-oriented, Dreamweaver, Illustrator, Photoshop, Acrobat Pro, Photoshop Elements, Final Cut Pro, MS office, Mac OSX, Mac IOS', 'industries': 'Education, Banking, Human Resources', 'experience_level': <ExperienceLevel.SENIOR: 'senior'>, 'country': 'US', 'summary': 'Results-oriented and resourceful education professional with proven ability to effectively develop and implement educational strategies, policy and practices that improve student learning outcomes and benefit all stakeholders. Strengths in strategic planning, policy development, budget management, process evaluation, program

 10%|█         | 5/50 [00:41<06:17,  8.40s/it]


KeyboardInterrupt: 

In [4]:
# Store jobs
add_to_collection(
    metadata_extractor=JobMetadataExtractorCrew(),
    corpus=jobs_data,
    collection=jobs_collection,
    max_rpm=10,
    verbose=False,
    employmenttype_options="/".join(EducationLevel),
    experiencelevel_options="/".join(ExperienceLevel),
)

[32m2026-02-14 19:10:20.990[0m | [1mINFO    [0m | [36msrc.db_ingestion.chroma_client[0m:[36madd_to_collection[0m:[36m65[0m - [1mAdding 50 documents to `jobs` collection.[0m


  0%|          | 0/50 [00:00<?, ?it/s]

be
[32m2026-02-14 19:10:28.225[0m | [34m[1mDEBUG   [0m | [36msrc.db_ingestion.chroma_client[0m:[36madd_to_collection[0m:[36m84[0m - [34m[1mMetadata: {'skills': 'CRM, paid search, SEO, affiliates, online display advertising, customer segmentation, data analysis, Excel, metrics analysis, A|B testing, wireframes, business requirements', 'industries': 'fashion, ecommerce', 'experience_level': <ExperienceLevel.SENIOR: 'senior'>, 'country': 'GB', 'summary': "Lead Stylistpick's in-house marketing team and manage external digital agencies to drive customer acquisition and retention through performance marketing campaigns across UK, France and Spain.", 'title': 'Head of Online Marketing', 'city': 'London', 'employment_type': <EmploymentType.UNKNOWN: 'unknown'>, 'responsibilities': 'Collaborate with business leaders to define and execute performance-marketing strategies, Work closely with Brand Marketing leaders to represent the voice of Stylistpick’s brand in all performance based 

  2%|▏         | 1/50 [00:07<06:15,  7.67s/it]
[32m2026-02-14 19:10:37.865[0m | [31m[1mERROR   [0m | [36msrc.db_ingestion.chroma_client[0m:[36madd_to_collection[0m:[36m86[0m - [31m[1mFailed extraction for `doc_id=14507` due to error: 1 validation error for JobMetadata
employment_type
  Input should be 'full-time', 'part-time', 'contract', 'freelance', 'other' or 'unknown' [type=enum, input_value='bachelor', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/enum[0m
  4%|▍         | 2/50 [00:16<06:51,  8.57s/it]
[32m2026-02-14 19:10:58.671[0m | [31m[1mERROR   [0m | [36msrc.db_ingestion.chroma_client[0m:[36madd_to_collection[0m:[36m86[0m - [31m[1mFailed extraction for `doc_id=15515` due to error: 1 validation error for JobMetadata
employment_type
  Input should be 'full-time', 'part-time', 'contract', 'freelance', 'other' or 'unknown' [type=enum, input_value='bachelor', input_type=str]
    For further information visit https://

be
[32m2026-02-14 19:11:08.631[0m | [34m[1mDEBUG   [0m | [36msrc.db_ingestion.chroma_client[0m:[36madd_to_collection[0m:[36m84[0m - [34m[1mMetadata: {'skills': 'researching, interviewing, writing, voicing, adapting material for radio, television and internet, producing and editing reports, recording and processing interviews, knowledge of target region, culture, institutions and political situation, adapting English material to target language and vice versa, multimedia skills, taking still photographs, recording video, uploading photos and videos, assessing information, validating comprehensiveness, accuracy and balance of news products, demonstrating high standards of journalism and news experience, functioning as news announcer, wearing suitable attire, familiarity with Best Practices Guide', 'industries': 'unknown', 'experience_level': <ExperienceLevel.SENIOR: 'senior'>, 'country': 'US', 'summary': 'Perform a combination of services that will include researching, inter

  8%|▊         | 4/50 [00:52<10:07, 13.21s/it]


KeyboardInterrupt: 

In [None]:
# Store jobs
add_to_collection(
    metadata_extractor=JobMetadataExtractorCrew(),
    corpus=jobs_data,
    collection=jobs_collection,
    max_rpm=10,
    verbose=False,
    employmenttype_options="/".join(EducationLevel),
    experiencelevel_options="/".join(ExperienceLevel),
)

[32m2026-02-14 18:40:39.808[0m | [1mINFO    [0m | [36msrc.db_ingestion.chroma_client[0m:[36madd_to_collection[0m:[36m66[0m - [1mAdding 50 documents to `jobs` collection.[0m


 22%|██▏       | 11/50 [02:01<08:23, 12.92s/it]
[32m2026-02-14 18:42:47.360[0m | [31m[1mERROR   [0m | [36msrc.db_ingestion.chroma_client[0m:[36madd_to_collection[0m:[36m88[0m - [31m[1mFailed extraction for doc_id=10540 due to error: 1 validation error for JobMetadata
employment_type
  Input should be 'full-time', 'part-time', 'contract', 'freelance', 'other' or 'unknown' [type=enum, input_value='temporary', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/enum[0m
 72%|███████▏  | 36/50 [06:43<02:48, 12.06s/it]
[32m2026-02-14 18:47:41.922[0m | [31m[1mERROR   [0m | [36msrc.db_ingestion.chroma_client[0m:[36madd_to_collection[0m:[36m88[0m - [31m[1mFailed extraction for doc_id=9818 due to error: 1 validation error for JobMetadata
employment_type
  Input should be 'full-time', 'part-time', 'contract', 'freelance', 'other' or 'unknown' [type=enum, input_value='temporary', input_type=str]
    For further information visit https://e

In [None]:
# Test semantic search for cv collection
query_texts = "python machine learning engineer"

results = cvs_collection.query(
    query_texts=[query_texts],
    n_results=5,
    where={"country": "US"}
)
results

In [None]:
# Test semantic search for job collection
query_texts = "python machine learning engineer"

results = jobs_collection.query(
    query_texts=[query_texts],
    n_results=5,
    where={"country": "US"}
)
results