In [12]:
import sys
from docstrange import DocumentExtractor

def extract_document_text(document_path: str):
    """
    Extracts and prints the text content from a given document.

    Args:
        document_path (str): The file path to the document (e.g., 'document.pdf').
    """
    print(f"Starting text extraction from: {document_path}\n")

    # Initialize the DocumentExtractor in local CPU mode.
    # This ensures that all processing happens on your machine and no data
    # is sent to a cloud API.
    try:
        extractor = DocumentExtractor()
    except Exception as e:
        print(f"Error initializing DocumentExtractor: {e}")
        print("Please ensure you have installed the necessary dependencies.")
        print("If you are running for the first time, you may need an internet connection to download models.")
        return

    try:
        # The extract() method processes the document.
        # It handles various formats like PDF, DOCX, and images.
        result = extractor.extract(document_path)
        
        # We can extract the content in various formats. Here, we'll get
        # the cleaned, LLM-optimized Markdown text.
        extracted_text = result.extract_markdown()

        if extracted_text:
            print("--- Extracted Text ---")
            print(extracted_text)
            print("----------------------")
        else:
            print("No text could be extracted from the document.")

    except FileNotFoundError:
        print(f"Error: The file '{document_path}' was not found.")
        print("Please check the file path and try again.")
    except Exception as e:
        print(f"An unexpected error occurred during extraction: {e}")
    
    return extracted_text

if __name__ == "__main__":
    # In a real-world scenario, you would replace 'your_document.pdf'
    # with the actual path to your document.
    # For a command-line script, you could also use sys.argv to get the path.
    sample_document_path = './CV_Image.png'

 

    extract_document_text(sample_document_path)


Starting text extraction from: ./CV_Image.png

--- Extracted Text ---
# Aidoo Enoch Kwadwo
## Data Analyst

## Personal Info
**Phone**
0240542834

**Email**
aidooenochkwadwo@gmail.com

**Kumasi, Ghana**

## Qualities
* Curiosity
* Problem Solving
* System Understanding
* Technical Skills
* Analytical Thinking
* Problem Solving Skills
* Teamwork
* Initiative and Self-motivation
* Discipline and Resilient

## Key Skills
**Tools:** Python, R, AWS, Microsoft Excel, Google Sheets, Power BI, SQL
**Packages/Frameworks:** NumPy, Pandas, Scikit-Learn, Matplotlib, Pytorch
**Machine Learning:** Data Analysis, Classification Modeling, Deep Neural Networks, Regression Modelling, MLOPs, Computer Vision, Natural Language Processing, Recommendation Systems

## About Me
A Data Analyst with about two years of professional experience specialized in transforming complex datasets into strategic business solutions. I've consistently delivered actionable insights that have driven key business decisions. Comb

In [13]:
extracted_text = extract_document_text('./CV_Image.png')
with open('extracted_text.txt', 'w', encoding='utf-8') as f:
    f.write(extracted_text)

Starting text extraction from: ./CV_Image.png

--- Extracted Text ---
# Aidoo Enoch Kwadwo
## Data Analyst

## Personal Info
**Phone**
0240542834

**Email**
aidooenochkwadwo@gmail.com

**Kumasi, Ghana**

## Qualities
* Curiosity
* Problem Solving
* System Understanding
* Technical Skills
* Analytical Thinking
* Problem Solving Skills
* Teamwork
* Initiative and Self-motivation
* Discipline and Resilient

## Key Skills
**Tools:** Python, R, AWS, Microsoft Excel, Google Sheets, Power BI, SQL
**Packages/Frameworks:** NumPy, Pandas, Scikit-Learn, Matplotlib, Pytorch
**Machine Learning:** Data Analysis, Classification Modeling, Deep Neural Networks, Regression Modelling, MLOPs, Computer Vision, Natural Language Processing, Recommendation Systems

## About Me
A Data Analyst with about two years of professional experience specialized in transforming complex datasets into strategic business solutions. I've consistently delivered actionable insights that have driven key business decisions. Comb

In [17]:
import re
import json

def structure_cv_schema(text):
    schema = {
        "name": None,
        "email": None,
        "phone": None,
        "summary": None,
        "work_experience": [],
        "education": [],
        "skills": [],
        "soft_skills": [],
        "certifications": [],
        "projects": [],
        "languages": None,
        "hobbies": None,
        "other": None
    }

    # Extract email
    email_match = re.search(r'[\w\.-]+@[\w\.-]+', text)
    if email_match:
        schema['email'] = email_match.group(0)

    # Extract phone (simple pattern, may need adjustment)
    phone_match = re.search(r'(\+?\d[\d\s\-]{7,}\d)', text)
    if phone_match:
        schema['phone'] = phone_match.group(0)

    # Extract name (assume first line is name if it looks like a name)
    lines = text.splitlines()
    if lines:
        first_line = lines[0].strip()
        if len(first_line.split()) >= 2 and not any(x in first_line.lower() for x in ['curriculum', 'resume', 'cv']):
            schema['name'] = first_line

    # Extract sections by keywords
    section_patterns = {
        'education': r'(education|academic background|qualifications)',
        'work_experience': r'(experience|employment|work history|professional experience|work experience)',
        'skills': r'(skills|technical skills|competencies)',
        'summary': r'(summary|profile|about me)',
        'certifications': r'(certifications|certificates)',
        'projects': r'(projects|project experience)',
        'soft_skills': r'(soft skills|personal skills|interpersonal skills)',
        'languages': r'(languages|language proficiency)',
        'hobbies': r'(hobbies|interests)',
        'other': r'(other|additional information)'
    }

    # Find section indices
    section_indices = {}
    for key, pattern in section_patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            section_indices[key] = match.start()

    # Sort sections by appearance
    sorted_sections = sorted(section_indices.items(), key=lambda x: x[1])

    # Extract section contents
    for i, (section, start_idx) in enumerate(sorted_sections):
        end_idx = sorted_sections[i+1][1] if i+1 < len(sorted_sections) else len(text)
        content = text[start_idx:end_idx].strip()
        if section in ['education', 'work_experience', 'certifications', 'projects', 'soft_skills']:
            schema[section] = [line.strip() for line in content.split('\n') if line.strip() and not re.search(section_patterns[section], line, re.IGNORECASE)]
        elif section == 'skills':
            skills_lines = [line.strip() for line in content.split('\n') if line.strip() and not re.search(section_patterns['skills'], line, re.IGNORECASE)]
            if skills_lines:
                schema['skills'] = re.split(r',|;|\n', ' '.join(skills_lines))
                schema['skills'] = [s.strip() for s in schema['skills'] if s.strip()]
        elif section == 'summary':
            summary_lines = [line.strip() for line in content.split('\n') if line.strip() and not re.search(section_patterns['summary'], line, re.IGNORECASE)]
            schema['summary'] = ' '.join(summary_lines)
        elif section in ['languages', 'hobbies', 'other']:
            lines = [line.strip() for line in content.split('\n') if line.strip() and not re.search(section_patterns[section], line, re.IGNORECASE)]
            schema[section] = ', '.join(lines) if lines else None

    return schema

# Example usage: structure the extracted markdown or text
# Replace 'markdown_content' with your extracted text variable
# structured_cv = structure_cv_schema(markdown_content)
# print(json.dumps(structured_cv, indent=2))


In [18]:
structured_cv = structure_cv_schema(extracted_text)

In [19]:
structured_cv

{'name': '# Aidoo Enoch Kwadwo',
 'email': 'aidooenochkwadwo@gmail.com',
 'phone': '0240542834',
 'summary': 'A Data Analyst with about two years of',
 'work_experience': ['##'],
 'education': ['**Bachelor of Science in Computer Science.**',
  'KNUST, Ghana',
  '2023',
  '## Courses and'],
 'skills': ['* Analytical Thinking * Teamwork * Initiative and Self-motivation * Discipline and Resilient **Tools:** Python',
  'R',
  'AWS',
  'Microsoft Excel',
  'Google Sheets',
  'Power BI',
  'SQL **Packages/Frameworks:** NumPy',
  'Pandas',
  'Scikit-Learn',
  'Matplotlib',
  'Pytorch **Machine Learning:** Data Analysis',
  'Classification Modeling',
  'Deep Neural Networks',
  'Regression Modelling',
  'MLOPs',
  'Computer Vision',
  'Natural Language Processing',
  'Recommendation Systems ##'],
 'soft_skills': [],
 'certifications': ['* Coursera Crash Course on Python.',
  'Oct 2021',
  '* AWS Machine Learning Foundation.',
  'Oct 2021',
  '* Introduction to Deep Learning with Pytorch.',
  '

In [None]:
from docstrange import DocumentExtractor
import json

# Initialize the document extractor
extractor = DocumentExtractor()

# Define a sample JSON schema that represents your desired database schema
# The schema is designed to extract a comprehensive set of details from a resume
resume_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "email": {"type": "string"},
        "phone": {"type": "string"},
        "summary": {"type": "string"},
        "work_experience": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "company": {"type": "string"},
                    "title": {"type": "string"},
                    "location": {"type": "string"},
                    "start_date": {"type": "string"},
                    "end_date": {"type": "string"},
                    "responsibilities": {
                        "type": "array",
                        "items": {"type": "string"}
                    }
                },
                "required": ["company", "title"]
            }
        },
        "education": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "degree": {"type": "string"},
                    "field_of_study": {"type": "string"},
                    "institution": {"type": "string"},
                    "location": {"type": "string"},
                    "start_date": {"type": "string"},
                    "end_date": {"type": "string"}
                },
                "required": ["degree", "institution"]
            }
        },
        "skills": {
            "type": "array",
            "items": {"type": "string"}
        },
        "soft_skills": {
            "type": "array",
            "items": {"type": "string"}
        },
        "certifications": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "issuing_organization": {"type": "string"},
                    "date": {"type": "string"}
                },
                "required": ["name"]
            }
        },
        "projects": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "description": {"type": "string"},
                    "technologies": {
                        "type": "array",
                        "items": {"type": "string"}
                    }
                },
                "required": ["name"]
            }
        },
        "languages": {"type": "string"},
        "hobbies": {"type": "string"},
        "other": {"type": "string"}
    }
}

# Assume 'my_document.pdf' is a document like a resume
# You would replace this with the path to your actual file
document_path = './CV_Image.png'

# Extract data from the document using the JSON schema
try:
    result = extractor.extract(document_path)
    structured_data = result.extract_data(json_schema=resume_schema)

    # Print the resulting structured JSON
    print(json.dumps(structured_data, indent=2))

except FileNotFoundError:
    print(f"Error: Document not found at '{document_path}'. Please provide a valid file path.")
except Exception as e:
    print(f"An error occurred during extraction: {e}")


{
  "structured_data": {
    "name": "Aidoo Enoch Kwadwo",
    "email": "aidoenochkwadwo@gmail.com",
    "phone": "0240542834",
    "summary": "A Data Analyst with about two years of professional experience specialized in transforming complex datasets into strategic business solutions. I've consistently delivered actionable insights that have driven key business decisions. Combining strong analytical skills with business acumen, I excel at identifying data patterns and translating them into clear, implementable strategies that solve real-world challenges.",
    "work_experience": [
      {
        "company": "Really Great Tech",
        "title": "Data Analytics/AI/ML Engineer",
        "location": null,
        "start_date": "November 2023",
        "end_date": "October 2024",
        "responsibilities": [
          "Conducted Shapelet Analysis on trained machine learning models to interpret performance patterns and identify opportunities for optimization in an AI project.",
          

In [20]:
with open('extracted_file.txt', 'w', encoding='utf-8') as f:
    f.write(json.dumps(structured_data, indent=2, ensure_ascii=False))

In [1]:
import os
import json
import numpy as np
from rapidfuzz import fuzz
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from dotenv import load_dotenv
from FlagEmbedding import FlagReranker
import hashlib

# ✅ Load environment variables
load_dotenv()
HF_TOKEN = os.getenv("HF_Token")

if not HF_TOKEN:
    raise ValueError("❌ Hugging Face API token missing. Add HF_Token=your_key to .env")

# Initialize FlagReranker
reranker = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=True)

# --- Helpers ---
def normalize(text):
    return str(text).lower().strip() if text else ""

def fuzzy_match(target, candidate_list, threshold=80):
    if not target or not candidate_list:
        return False
    target_norm = normalize(target)
    for candidate in candidate_list:
        if fuzz.token_set_ratio(target_norm, normalize(candidate)) >= threshold:
            return True
    return False

location_synonyms = {
    "us": ["usa", "united states", "america"],
    "uk": ["united kingdom", "england"],
    "remote": ["remote", "anywhere"],
}

def location_match(required, candidate):
    if not required or not candidate:
        return False
    req_norm = normalize(required)
    cand_norm = normalize(candidate)
    for synonym in location_synonyms.get(req_norm, [req_norm]):
        if synonym in cand_norm:
            return True
    return False

def fuzzy_education_match(required_edu, candidate_edu_list):
    if not required_edu or not candidate_edu_list:
        return 100.0
    if isinstance(required_edu, dict):
        degree = required_edu.get("degree", "")
        field = required_edu.get("field", "") or required_edu.get("field_of_study", "")
        required_str = f"{degree} in {field}".strip()
    else:
        required_str = str(required_edu)
    required_norm = normalize(required_str)
    max_ratio = 0
    for edu in candidate_edu_list:
        if isinstance(edu, dict):
            degree = edu.get("degree", "")
            field = edu.get("field_of_study", "")
            candidate_str = f"{degree} in {field}".strip()
        else:
            candidate_str = str(edu)
        ratio = fuzz.token_set_ratio(required_norm, normalize(candidate_str))
        max_ratio = max(max_ratio, ratio)
    return max_ratio

def hash_email(email):
    return hashlib.md5(email.lower().strip().encode()).hexdigest() if email else None

# --- FlagReranker scoring ---
def flag_rerank_score(query, passage, normalize=True):
    try:
        score = reranker.compute_score([query, passage])
        if normalize:
            import math
            score = 1 / (1 + math.exp(-score)) * 100  # sigmoid to 0–100
        return float(score)
    except Exception as e:
        print(f"⚠️ FlagReranker error: {e}")
        return 0.0

# --- Main scoring function ---
def score_cv_against_job(cv_json_path, job_description_path, min_experience_years=2):
    weights = {
        "hard_filters": 0.25,
        "semantic_similarity": 0.45,
        "experience_alignment": 0.15,
        "education_alignment": 0.15,
    }

    # ✅ Ollama embeddings (local)
    embeddings = OllamaEmbeddings(model="mxbai-embed-large")

    # Load job description
    try:
        with open(job_description_path, "r", encoding="utf-8") as f:
            job_text = f.read().strip()
    except FileNotFoundError:
        print(f"❌ Job description file not found: {job_description_path}")
        return 0.0

    job_req = {
        "required_skills": [],
        "required_certifications": [],
        "location_eligibility": "",
        "min_years_experience": min_experience_years,
        "required_education": "",
    }

    required_skills = set(normalize(s) for s in job_req.get("required_skills", []) if s)
    required_certifications = set(normalize(c) for c in job_req.get("required_certifications", []) if c)
    location_req = job_req.get("location_eligibility", "")
    min_years = job_req.get("min_years_experience", min_experience_years) or min_experience_years
    required_education = job_req.get("required_education", "")

    # Load CV JSON
    try:
        with open(cv_json_path, "r", encoding="utf-8") as f:
            cv_data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"❌ Error loading CV JSON {cv_json_path}: {e}")
        return 0.0

    structured_data = cv_data.get("CV_data", {}).get("structured_data", {})
    years_of_experience = float(structured_data.get("years_of_experience", 0.0) or 0.0)
    candidate_skills = [normalize(s) for s in structured_data.get("skills", []) + structured_data.get("soft_skills", []) if s]
    candidate_certifications = [normalize(c.get("name", "")) for c in structured_data.get("certifications", []) if isinstance(c, dict) and c.get("name")]
    candidate_location = normalize(structured_data.get("location", ""))
    candidate_education = structured_data.get("education", [])

    email = normalize(structured_data.get("email", ""))
    cv_id = hash_email(email) or os.path.splitext(os.path.basename(cv_json_path))[0]
    print(f"DEBUG - cv_id: {cv_id}")

    # --- Hard Score ---
    hard_score, total_weight = 0.0, 0.0
    if required_skills:
        skill_matches = sum(1 for s in required_skills if fuzzy_match(s, candidate_skills))
        hard_score += (skill_matches / len(required_skills)) * 0.5
        total_weight += 0.5
    if required_certifications:
        cert_matches = sum(1 for c in required_certifications if fuzzy_match(c, candidate_certifications))
        hard_score += (cert_matches / len(required_certifications)) * 0.3
        total_weight += 0.3
    if location_req:
        hard_score += 0.2 if location_match(location_req, candidate_location) else 0.0
        total_weight += 0.2
    hard_score = (hard_score / total_weight * 100) if total_weight > 0 else 0.0

    # --- Semantic Similarity with FlagReranker ---
    try:
        vectorstore = Chroma(
            persist_directory="./chroma_db",
            embedding_function=embeddings,
            collection_name="cv_sections",
        )
        retrieved = vectorstore.get(where={"cv_id": cv_id})
        documents, metadatas = retrieved.get("documents", []), retrieved.get("metadatas", [])
        if not documents:
            print(f"⚠️ No embeddings found for CV: {cv_json_path}")
            return 0.0
    except Exception as e:
        print(f"⚠️ Error accessing Chroma: {e}")
        return 0.0

    section_scores = []
    for doc_text, metadata in zip(documents, metadatas):
        score = flag_rerank_score(job_text, doc_text, normalize=True)
        weight = 1.5 if metadata.get("section") in ["work_experience", "skills", "summary"] else 1.0
        section_scores.append(score * weight)
    semantic_score = np.mean(section_scores) if section_scores else 0.0

    # --- Experience Alignment ---
    experience_score = min(years_of_experience / min_years, 1.0) * 100 if min_years > 0 else 100.0

    # --- Education Alignment ---
    education_score = fuzzy_education_match(required_education, candidate_education)

    # --- Hybrid Score ---
    hybrid_score = (
        hard_score * weights["hard_filters"]
        + semantic_score * weights["semantic_similarity"]
        + experience_score * weights["experience_alignment"]
        + education_score * weights["education_alignment"]
    )
    return round(hybrid_score, 2)

# --- Example usage ---
if __name__ == "__main__":
    cv_json_path = "./extracted_files/CV_Image.json"
    job_description_path = "./job_description.txt"
    score = score_cv_against_job(cv_json_path, job_description_path, min_experience_years=3)
    print(f"CV Hybrid Score: {score}/100")


DEBUG - cv_id: 9d427c27755752a2c7b94210180e91e9
DEBUG - cv_id: 9d427c27755752a2c7b94210180e91e9


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


⚠️ FlagReranker error: bad operand type for unary -: 'list'
⚠️ FlagReranker error: bad operand type for unary -: 'list'
⚠️ FlagReranker error: bad operand type for unary -: 'list'
⚠️ FlagReranker error: bad operand type for unary -: 'list'
⚠️ FlagReranker error: bad operand type for unary -: 'list'
⚠️ FlagReranker error: bad operand type for unary -: 'list'
⚠️ FlagReranker error: bad operand type for unary -: 'list'
⚠️ FlagReranker error: bad operand type for unary -: 'list'
⚠️ FlagReranker error: bad operand type for unary -: 'list'
⚠️ FlagReranker error: bad operand type for unary -: 'list'
⚠️ FlagReranker error: bad operand type for unary -: 'list'
⚠️ FlagReranker error: bad operand type for unary -: 'list'
⚠️ FlagReranker error: bad operand type for unary -: 'list'
⚠️ FlagReranker error: bad operand type for unary -: 'list'
⚠️ FlagReranker error: bad operand type for unary -: 'list'
⚠️ FlagReranker error: bad operand type for unary -: 'list'
⚠️ FlagReranker error: bad operand type 

**Verifying Stored Embeddings**

In [2]:
from langchain_community.vectorstores import Chroma
import numpy as np

cv_id = "9d427c27755752a2c7b94210180e91e9"
vectorstore = Chroma(persist_directory="./chroma_db", collection_name="cv_sections")
retrieved = vectorstore.get(where={"cv_id": cv_id}, include=['embeddings', 'metadatas', 'documents'])

embeddings = retrieved.get("embeddings", [])
metadatas = retrieved.get("metadatas", [])
documents = retrieved.get("documents", [])

print(f"Retrieved {len(embeddings)} embeddings for cv_id: {cv_id}")
for i, (emb, meta, doc) in enumerate(zip(embeddings, metadatas, documents)):
    emb_array = np.array(emb, dtype=np.float64)
    print(f"Embedding {i} (section: {meta['section']}, chunk: {meta['chunk_id']}):")
    print(f"  Shape: {emb_array.shape}")
    print(f"  Type: {type(emb)}")
    print(f"  First few values: {emb_array[:5]}")
    print(f"  Document content (first 100 chars): {doc[:100]}")

  vectorstore = Chroma(persist_directory="./chroma_db", collection_name="cv_sections")


Retrieved 15 embeddings for cv_id: 9d427c27755752a2c7b94210180e91e9
Embedding 0 (section: email, chunk: 0):
  Shape: (1024,)
  Type: <class 'numpy.ndarray'>
  First few values: [-0.01467171 -0.02097356 -0.04572035  0.01120208  0.00452306]
  Document content (first 100 chars): "aidoenochkwadwo@gmail.com"
Embedding 1 (section: summary, chunk: 0):
  Shape: (1024,)
  Type: <class 'numpy.ndarray'>
  First few values: [-0.00620162  0.03033508  0.00012151 -0.02254748  0.02840543]
  Document content (first 100 chars): "A Data Analyst with about two years of professional experience specialized in transforming complex 
Embedding 2 (section: work_experience, chunk: 0):
  Shape: (1024,)
  Type: <class 'numpy.ndarray'>
  First few values: [-0.00464036 -0.01416096 -0.03717647 -0.04362679 -0.01361977]
  Document content (first 100 chars): [{"company": "Really Great Tech", "title": "Data Analytics/AI/ML Engineer", "location": "", "start_d
Embedding 3 (section: work_experience, chunk: 1):
  Shape: (102

In [3]:
import json
import os
import numpy as np
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_chroma import Chroma
from langchain.prompts import PromptTemplate
from rapidfuzz import fuzz
import hashlib

# --- Helper functions ---
def normalize(text):
    return text.lower().strip()

def fuzzy_match(target, candidate_list, threshold=80):
    target_norm = normalize(target)
    for candidate in candidate_list:
        candidate_norm = normalize(candidate)
        if fuzz.token_set_ratio(target_norm, candidate_norm) >= threshold:
            return True
    return False

location_synonyms = {
    "us": ["usa", "united states", "america"],
    "uk": ["united kingdom", "england"],
    "remote": ["remote", "anywhere"]
}

def location_match(required, candidate):
    req_norm = normalize(required)
    cand_norm = normalize(candidate)
    for synonym in location_synonyms.get(req_norm, [req_norm]):
        if synonym in cand_norm:
            return True
    return False

def fuzzy_education_match(required_edu, candidate_edu_list):
    """
    Compare required education (string or dict) against candidate education list (dicts).
    Returns best fuzzy match score (0-100).
    """
    if not required_edu:
        return 100.0  # no requirement = full score

    # Convert required_edu into string if it's a dict
    if isinstance(required_edu, dict):
        degree = required_edu.get("degree", "")
        field = required_edu.get("field", "") or required_edu.get("field_of_study", "")
        required_str = f"{degree} in {field}".strip()
    else:
        required_str = str(required_edu)

    required_norm = normalize(required_str)
    max_ratio = 0

    for edu in candidate_edu_list:
        if isinstance(edu, dict):
            degree = edu.get("degree", "")
            field = edu.get("field_of_study", "")
            candidate_str = f"{degree} in {field}".strip()
        else:
            candidate_str = str(edu)
        ratio = fuzz.token_set_ratio(required_norm, normalize(candidate_str))
        max_ratio = max(max_ratio, ratio)

    return max_ratio

def hash_email(email):
    return hashlib.md5(email.lower().strip().encode()).hexdigest()

# --- Main scoring function ---
def score_cv_against_job(cv_json_path, job_description_path, min_experience_years=2):
    weights = {
        "hard_filters": 0.25,
        "semantic_similarity": 0.45,
        "experience_alignment": 0.15,
        "education_alignment": 0.15
    }

    embeddings = OllamaEmbeddings(model="mxbai-embed-large")

    # Load job description
    try:
        with open(job_description_path, 'r', encoding='utf-8') as f:
            job_text = f.read().strip()
            if not job_text:
                print(f"❌ Job description is empty: {job_description_path}")
                return 0.0
    except FileNotFoundError:
        print(f"❌ Job description file not found: {job_description_path}")
        return 0.0

    # Parse job requirements using LLM
    llm = ChatOllama(model="llama3.2:latest", format="json")
    prompt_template = PromptTemplate.from_template("""
        Extract the following from the job description as JSON:
        - required_skills: list of required skills
        - required_certifications: list of required certifications
        - location_eligibility: required location or "remote"
        - min_years_experience: minimum years of experience
        - required_education: required degree and field

        Job Description:
        {job_text}

        Output only JSON.
    """)
    chain = prompt_template | llm
    response = chain.invoke({"job_text": job_text})
    try:
        job_req = json.loads(response.content)
    except json.JSONDecodeError:
        print("⚠️ Error parsing LLM response; using defaults.")
        job_req = {
            "required_skills": [],
            "required_certifications": [],
            "location_eligibility": "",
            "min_years_experience": min_experience_years,
            "required_education": ""
        }

    required_skills = set(normalize(s) for s in job_req.get("required_skills", []))
    required_certifications = set(normalize(c) for c in job_req.get("required_certifications", []))
    location_req = job_req.get("location_eligibility", "").lower()
    min_years = job_req.get("min_years_experience", min_experience_years)
    required_education = job_req.get("required_education", "")

    # Load CV
    try:
        with open(cv_json_path, 'r', encoding='utf-8') as f:
            cv_data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"❌ Error loading CV JSON {cv_json_path}: {e}")
        return 0.0

    structured_data = cv_data.get("CV_data", {}).get("structured_data", {})
    years_of_experience = structured_data.get("years_of_experience", 0.0)
    candidate_skills = [normalize(s) for s in structured_data.get("skills", []) + structured_data.get("soft_skills", [])]
    candidate_certifications = [normalize(c.get("name","")) for c in structured_data.get("certifications", []) if isinstance(c, dict)]
    candidate_location = structured_data.get("location", "").lower()
    candidate_education = structured_data.get("education", [])

    # --- Generate cv_id from hashed email ---
    email = structured_data.get("email", "")
    if email:
        cv_id = hash_email(email)
    else:
        cv_id = os.path.splitext(os.path.basename(cv_json_path))[0]

    print(f"DEBUG - Email: {email}")
    print(f"DEBUG - cv_id: {cv_id}")

    # --- Hard Score ---
    hard_score = 0.0
    skill_matches = sum(1 for s in required_skills if fuzzy_match(s, candidate_skills))
    skill_fraction = skill_matches / len(required_skills) if required_skills else 1.0
    hard_score += skill_fraction * 0.5

    cert_matches = sum(1 for c in required_certifications if fuzzy_match(c, candidate_certifications))
    cert_fraction = cert_matches / len(required_certifications) if required_certifications else 1.0
    hard_score += cert_fraction * 0.3

    if location_req:
        hard_score += 0.2 if location_match(location_req, candidate_location) else 0.0

    hard_score = hard_score * 100

    # --- Semantic Similarity ---
    try:
        vectorstore = Chroma(
            persist_directory="./chroma_db",
            embedding_function=embeddings,
            collection_name="cv_sections"
        )
        retrieved = vectorstore.get(where={"cv_id": cv_id}, include=['embeddings', 'metadatas', 'documents'])
        documents = retrieved.get("documents", [])
        metadatas = retrieved.get("metadatas", [])
        embeddings_retrieved = retrieved.get("embeddings", [])
        if not documents or not embeddings_retrieved:
            print(f"⚠️ No embeddings or documents found for CV: {cv_json_path}")
            return 0.0
        print(f"DEBUG - Retrieved {len(embeddings_retrieved)} embeddings for cv_id: {cv_id}")
    except Exception as e:
        print(f"⚠️ Error accessing Chroma: {e}")
        return 0.0

    try:
        job_vector = np.array(embeddings.embed_query(job_text), dtype=np.float64)
        print(f"DEBUG - Job vector shape: {job_vector.shape}")
        if job_vector.ndim != 1 or job_vector.shape[0] != 1024:
            print(f"⚠️ Invalid job vector shape: {job_vector.shape}")
            return 0.0
        if not np.all(np.isfinite(job_vector)):
            print("⚠️ Invalid values in job vector (NaN or inf)")
            return 0.0
    except Exception as e:
        print(f"⚠️ Error embedding job description: {e}")
        return 0.0

    section_scores = []
    for i, (doc_vector, metadata) in enumerate(zip(embeddings_retrieved, metadatas)):
        try:
            # Ensure doc_vector is a 1D NumPy array
            doc_vector = np.array(doc_vector, dtype=np.float64).flatten()
            print(f"DEBUG - Embedding {i} (section: {metadata['section']}, chunk: {metadata['chunk_id']}) shape: {doc_vector.shape}")
            # Verify shapes match
            if doc_vector.shape != job_vector.shape:
                print(f"⚠️ Shape mismatch for CV section {i}: job_vector {job_vector.shape}, doc_vector {doc_vector.shape}")
                continue
            # Check for non-numeric or invalid values
            if not np.all(np.isfinite(doc_vector)):
                print(f"⚠️ Invalid values in doc_vector for CV section {i}")
                continue
            # Compute cosine similarity
            norm_product = np.linalg.norm(job_vector) * np.linalg.norm(doc_vector)
            if norm_product == 0:
                print(f"⚠️ Zero norm detected for CV section {i}")
                continue
            similarity = np.dot(job_vector, doc_vector) / norm_product
            # Ensure similarity is within valid range [-1, 1]
            similarity = np.clip(similarity, -1.0, 1.0)
            weight = 1.5 if metadata.get("section") in ["work_experience", "skills", "summary"] else 1.0
            section_scores.append(similarity * weight)
        except Exception as e:
            print(f"⚠️ Error computing similarity for CV section {i}: {e}")
            continue

    if section_scores:
        mean_score = np.mean(section_scores) * 100
        max_score = max(section_scores) * 100
        print(f"DEBUG - Mean similarity score: {mean_score}, Max similarity score: {max_score}")
        # Hybrid pooling (balanced overall coverage vs standout sections)
        semantic_score = 0.7 * mean_score + 0.3 * max_score
    else:
        print("⚠️ No valid section scores computed")
        semantic_score = 0.0

    # --- Experience Alignment ---
    experience_score = min(years_of_experience / min_years, 1.0) * 100 if min_years > 0 else 100.0

    # --- Education Alignment ---
    education_score = fuzzy_education_match(required_education, candidate_education)

    # --- Hybrid Score ---
    hybrid_score = (
        hard_score * weights["hard_filters"] +
        semantic_score * weights["semantic_similarity"] +
        experience_score * weights["experience_alignment"] +
        education_score * weights["education_alignment"]
    )

    print(f"DEBUG - Hard score: {hard_score}")
    print(f"DEBUG - Semantic score: {semantic_score}")
    print(f"DEBUG - Experience score: {experience_score}")
    print(f"DEBUG - Education score: {education_score}")
    return round(hybrid_score, 2)

# --- Example usage ---
if __name__ == "__main__":
    cv_json_path = "./extracted_files/CV_Image.json"
    job_description_path = "./job_description.txt"
    score = score_cv_against_job(cv_json_path, job_description_path, min_experience_years=3)
    print(f"CV Hybrid Score: {score}/100")

DEBUG - Email: aidoenochkwadwo@gmail.com
DEBUG - cv_id: 9d427c27755752a2c7b94210180e91e9
⚠️ Error accessing Chroma: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
CV Hybrid Score: 0.0/100


**Retrieving Collections**

In [1]:
import chromadb

client = chromadb.PersistentClient(path="./chroma_db")

#collection = client.get_collection(name="cv_sections")



In [6]:
collection

Collection(name=cv_sections)

In [2]:
collections = client.list_collections(limit=100)


In [3]:
collections

[Collection(name=cv_sections)]

In [7]:
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
import chromadb
import numpy as np
import json

print("=" * 80)
print("CV EMBEDDED VECTORS RETRIEVAL - DIRECT ACCESS")
print("=" * 80)

# --- Method 1: Direct ChromaDB client access ---
print("\n[Method 1] Using ChromaDB client directly...")

try:
    # Create direct client
    client = chromadb.PersistentClient(path="./chroma_db")
    
    # List all collections
    collections = client.list_collections()
    print(f"\n✓ Found {len(collections)} collection(s):")
    for coll in collections:
        print(f"  - {coll.name} (count: {coll.count()})")
    
    # Try both possible collection names
    collection_names = ["cv_sections", "cv_sections1"]
    collection = None
    
    for name in collection_names:
        try:
            collection = client.get_collection(name=name)
            print(f"\n✓ Accessing collection: {name}")
            print(f"  Total documents: {collection.count()}")
            break
        except Exception as e:
            print(f"  Collection '{name}' not found or error: {e}")
            continue
    
    if not collection:
        print("\n❌ Could not access any collection. Available collections:")
        for coll in collections:
            print(f"  - {coll.name}")
        exit(1)
    
    # Get data with embeddings using where/limit
    print(f"\n{'─' * 80}")
    print("RETRIEVING EMBEDDINGS...")
    print('─' * 80)
    
    # Get first 5 documents
    result = collection.get(
        limit=5,
        include=['embeddings', 'metadatas', 'documents']
    )
    
    print(f"\n✓ Successfully retrieved {len(result['ids'])} documents with embeddings")
    
    if result['embeddings'] is not None and len(result['embeddings']) > 0:
        print(f"✓ Embedding dimension: {len(result['embeddings'][0])}")
    
    # Display each document with its embedding
    for i, (doc_id, embedding, metadata, content) in enumerate(zip(
        result['ids'],
        result['embeddings'],
        result['metadatas'],
        result['documents']
    ), 1):
        print(f"\n{'═' * 80}")
        print(f"DOCUMENT {i}")
        print(f"{'═' * 80}")
        print(f"ID: {doc_id}")
        print(f"CV ID: {metadata.get('cv_id', 'N/A')}")
        print(f"Section: {metadata.get('section', 'N/A')}")
        print(f"Chunk ID: {metadata.get('chunk_id', 'N/A')}")
        
        print(f"\nContent Preview:")
        try:
            content_json = json.loads(content)
            content_str = json.dumps(content_json, indent=2)
            print(content_str[:400] + "..." if len(content_str) > 400 else content_str)
        except:
            print(content[:400] + "..." if len(content) > 400 else content)
        
        print(f"\n{'─' * 40}")
        print("EMBEDDING VECTOR:")
        print('─' * 40)
        embedding_array = np.array(embedding, dtype=np.float32)
        print(f"Shape: {embedding_array.shape}")
        print(f"Dtype: {embedding_array.dtype}")
        print(f"\nFirst 20 values:\n{embedding_array[:20]}")
        print(f"\nLast 20 values:\n{embedding_array[-20:]}")
        print(f"\nStatistics:")
        print(f"  Mean:     {np.mean(embedding_array):.6f}")
        print(f"  Std Dev:  {np.std(embedding_array):.6f}")
        print(f"  Min:      {np.min(embedding_array):.6f}")
        print(f"  Max:      {np.max(embedding_array):.6f}")
        print(f"  L2 Norm:  {np.linalg.norm(embedding_array):.6f}")
        
        # Uncomment to see full vector
        # print(f"\nFull vector:\n{embedding_array}")
    
    # Get all embeddings for analysis
    print(f"\n{'═' * 80}")
    print("FULL COLLECTION ANALYSIS")
    print(f"{'═' * 80}")
    
    all_result = collection.get(
        include=['embeddings', 'metadatas']
    )
    
    print(f"\nTotal documents: {len(all_result['ids'])}")
    
    # Organize by section
    sections_data = {}
    for embedding, metadata in zip(all_result['embeddings'], all_result['metadatas']):
        section = metadata.get('section', 'unknown')
        if section not in sections_data:
            sections_data[section] = []
        sections_data[section].append(np.array(embedding, dtype=np.float32))
    
    print(f"\n{'─' * 40}")
    print("EMBEDDINGS BY SECTION:")
    print('─' * 40)
    for section, embeddings_list in sorted(sections_data.items()):
        embeddings_matrix = np.array(embeddings_list)
        print(f"\n{section.upper()}:")
        print(f"  Documents: {len(embeddings_list)}")
        print(f"  Avg L2 Norm: {np.mean([np.linalg.norm(e) for e in embeddings_list]):.6f}")
        print(f"  Avg Mean: {np.mean(embeddings_matrix):.6f}")
        print(f"  Avg Std: {np.std(embeddings_matrix):.6f}")
    
    # Export options
    print(f"\n{'═' * 80}")
    print("EXPORTING EMBEDDINGS TO FILES")
    print(f"{'═' * 80}")
    
    # Export embeddings as numpy array
    all_embeddings = np.array(all_result['embeddings'], dtype=np.float32)
    np.save('cv_embeddings.npy', all_embeddings)
    print("✓ Saved embeddings to: cv_embeddings.npy")
    print(f"  Shape: {all_embeddings.shape}")
    
    # Export metadata
    with open('cv_metadata.json', 'w', encoding='utf-8') as f:
        json.dump({
            'ids': all_result['ids'],
            'metadatas': all_result['metadatas']
        }, f, indent=2)
    print("✓ Saved metadata to: cv_metadata.json")
    
    # Export embeddings as human-readable text file
    with open('cv_embeddings.txt', 'w', encoding='utf-8') as f:
        f.write("=" * 80 + "\n")
        f.write("CV EMBEDDINGS - FULL EXPORT\n")
        f.write("=" * 80 + "\n\n")
        
        for i, (doc_id, embedding, metadata) in enumerate(zip(
            all_result['ids'],
            all_result['embeddings'],
            all_result['metadatas']
        ), 1):
            f.write(f"\n{'=' * 80}\n")
            f.write(f"DOCUMENT {i}\n")
            f.write(f"{'=' * 80}\n")
            f.write(f"ID: {doc_id}\n")
            f.write(f"CV ID: {metadata.get('cv_id', 'N/A')}\n")
            f.write(f"Section: {metadata.get('section', 'N/A')}\n")
            f.write(f"Chunk ID: {metadata.get('chunk_id', 'N/A')}\n")
            
            embedding_array = np.array(embedding, dtype=np.float32)
            f.write(f"\nEmbedding Vector (dimension: {len(embedding_array)}):\n")
            f.write(f"{embedding_array}\n")
            
            f.write(f"\nStatistics:\n")
            f.write(f"  Mean:     {np.mean(embedding_array):.6f}\n")
            f.write(f"  Std Dev:  {np.std(embedding_array):.6f}\n")
            f.write(f"  Min:      {np.min(embedding_array):.6f}\n")
            f.write(f"  Max:      {np.max(embedding_array):.6f}\n")
            f.write(f"  L2 Norm:  {np.linalg.norm(embedding_array):.6f}\n")
    
    print("✓ Saved human-readable embeddings to: cv_embeddings.txt")
    
    # Export embeddings as CSV for easy viewing
    import csv
    with open('cv_embeddings.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        
        # Header
        header = ['doc_id', 'cv_id', 'section', 'chunk_id'] + [f'dim_{i}' for i in range(len(all_result['embeddings'][0]))]
        writer.writerow(header)
        
        # Data
        for doc_id, embedding, metadata in zip(
            all_result['ids'],
            all_result['embeddings'],
            all_result['metadatas']
        ):
            row = [
                doc_id,
                metadata.get('cv_id', 'N/A'),
                metadata.get('section', 'N/A'),
                metadata.get('chunk_id', 'N/A')
            ] + list(embedding)
            writer.writerow(row)
    
    print("✓ Saved embeddings as CSV to: cv_embeddings.csv")
    
    print(f"\n{'─' * 80}")
    print("FILES CREATED:")
    print('─' * 80)
    print("1. cv_embeddings.npy    - NumPy binary format (for loading in Python)")
    print("2. cv_metadata.json     - Document metadata in JSON format")
    print("3. cv_embeddings.txt    - Human-readable text file with all vectors")
    print("4. cv_embeddings.csv    - CSV format (can open in Excel)")
    
    print(f"\nTo load embeddings later:")
    print("embeddings = np.load('cv_embeddings.npy')")
    print("with open('cv_metadata.json', 'r') as f:")
    print("    metadata = json.load(f)")
    
except Exception as e:
    print(f"\n❌ Error: {e}")
    import traceback
    traceback.print_exc()
    
    print("\n" + "=" * 80)
    print("TROUBLESHOOTING:")
    print("=" * 80)
    print("1. Check collection name in embedding script (cv_sections vs cv_sections1)")
    print("2. Verify ./chroma_db directory exists and has data")
    print("3. Try deleting chroma_db folder and re-running embedding script")
    print("4. Check if ChromaDB version is compatible")

print("\n" + "=" * 80)

CV EMBEDDED VECTORS RETRIEVAL - DIRECT ACCESS

[Method 1] Using ChromaDB client directly...

✓ Found 1 collection(s):
  - cv_sections (count: 15)

✓ Accessing collection: cv_sections
  Total documents: 15

────────────────────────────────────────────────────────────────────────────────
RETRIEVING EMBEDDINGS...
────────────────────────────────────────────────────────────────────────────────

✓ Successfully retrieved 5 documents with embeddings
✓ Embedding dimension: 1024

════════════════════════════════════════════════════════════════════════════════
DOCUMENT 1
════════════════════════════════════════════════════════════════════════════════
ID: 61de4564-6f20-4a83-a86e-822ee63f73f3
CV ID: 9d427c27755752a2c7b94210180e91e9
Section: email
Chunk ID: 0

Content Preview:
"aidoenochkwadwo@gmail.com"

────────────────────────────────────────
EMBEDDING VECTOR:
────────────────────────────────────────
Shape: (1024,)
Dtype: float32

First 20 values:
[-0.01467171 -0.02097356 -0.04572035  0.01120208

In [None]:
import json
from jd_extractor import JDExtractor  # Import from your job description extraction file

class CVJDScorer:
    def __init__(self):
        self.jd_extractor = JDExtractor()  # Use your custom JD extractor

    def extract_job_requirements(self, jd_document_path):
        """
        Extract job requirements using the custom JDExtractor
        """
        try:
            jd_data = self.jd_extractor.extract(jd_document_path)
            return jd_data
        except Exception as e:
            print(f"Error extracting job description: {e}")
            return None

    def calculate_match_score(self, cv_data, jd_data):
        """
        Calculate match score between CV and job description data
        """
        if not jd_data or not cv_data:
            return 0
        
        scores = {}
        total_weight = 0
        
        # 1. Skills Match (30% weight)
        if jd_data.get('technical_skills') and cv_data.get('skills'):
            skill_match = self._calculate_skills_match(
                cv_data['skills'], 
                jd_data['technical_skills']
            )
            scores['skills'] = {'score': skill_match, 'weight': 0.3}
            total_weight += 0.3
        
        # 2. Experience Match (25% weight)
        if jd_data.get('experience_requirements') and cv_data.get('years_of_experience'):
            exp_match = self._calculate_experience_match(
                cv_data['years_of_experience'],
                jd_data['experience_requirements']
            )
            scores['experience'] = {'score': exp_match, 'weight': 0.25}
            total_weight += 0.25
        
        # 3. Education Match (20% weight)
        if jd_data.get('education_requirements') and cv_data.get('education'):
            edu_match = self._calculate_education_match(
                cv_data['education'],
                jd_data['education_requirements']
            )
            scores['education'] = {'score': edu_match, 'weight': 0.2}
            total_weight += 0.2
        
        # 4. Certifications Match (15% weight)
        if jd_data.get('certifications') and cv_data.get('certifications'):
            cert_match = self._calculate_certifications_match(
                cv_data['certifications'],
                jd_data['certifications']
            )
            scores['certifications'] = {'score': cert_match, 'weight': 0.15}
            total_weight += 0.15
        
        # 5. Soft Skills Match (10% weight)
        if jd_data.get('soft_skills') and cv_data.get('soft_skills'):
            soft_skills_match = self._calculate_skills_match(
                cv_data['soft_skills'],
                jd_data['soft_skills']
            )
            scores['soft_skills'] = {'score': soft_skills_match, 'weight': 0.1}
            total_weight += 0.1
        
        # Calculate weighted average
        if total_weight == 0:
            return 0
            
        weighted_score = sum(
            category['score'] * category['weight'] 
            for category in scores.values()
        ) / total_weight
        
        return {
            'overall_score': round(weighted_score, 2),
            'category_scores': scores,
            'jd_data': jd_data,
            'cv_data': cv_data
        }

    def _calculate_skills_match(self, cv_skills, jd_skills):
        """Calculate percentage match between CV skills and JD required skills"""
        if not jd_skills:
            return 0
        
        cv_skills_lower = [skill.lower() for skill in cv_skills]
        jd_skills_lower = [skill.lower() for skill in jd_skills]
        
        matches = sum(1 for jd_skill in jd_skills_lower 
                     if any(jd_skill in cv_skill or cv_skill in jd_skill 
                           for cv_skill in cv_skills_lower))
        
        return round((matches / len(jd_skills_lower)) * 100, 2)

    def _calculate_experience_match(self, cv_experience, jd_experience_req):
        """Calculate experience match score"""
        try:
            # Extract years from JD requirements (simple parsing)
            jd_years_text = jd_experience_req.get('years_of_experience', '0')
            jd_years = self._extract_years_from_text(jd_years_text)
            
            cv_years = float(cv_experience) if cv_experience else 0
            
            if cv_years >= jd_years:
                return 100
            else:
                return round((cv_years / jd_years) * 100, 2) if jd_years > 0 else 0
        except:
            return 0

    def _calculate_education_match(self, cv_education, jd_education_req):
        """Calculate education requirements match"""
        if not jd_education_req:
            return 100
        
        cv_degrees = [edu.get('degree', '').lower() for edu in cv_education]
        jd_edu_lower = [req.lower() for req in jd_education_req]
        
        # Check if any CV degree matches JD requirements
        for degree in cv_degrees:
            for req in jd_edu_lower:
                if req in degree or degree in req:
                    return 100
        
        return 0

    def _calculate_certifications_match(self, cv_certifications, jd_certifications):
        """Calculate certifications match"""
        if not jd_certifications:
            return 100
            
        cv_cert_names = [cert.get('name', '').lower() for cert in cv_certifications]
        jd_cert_lower = [cert.lower() for cert in jd_certifications]
        
        matches = sum(1 for jd_cert in jd_cert_lower 
                     if any(jd_cert in cv_cert or cv_cert in jd_cert 
                           for cv_cert in cv_cert_names))
        
        return round((matches / len(jd_certifications)) * 100, 2)

    def _extract_years_from_text(self, text):
        """Extract years from text like '5+ years' or '3 years'"""
        import re
        numbers = re.findall(r'\d+', str(text))
        return float(numbers[0]) if numbers else 0

# Example usage:
if __name__ == "__main__":
    scorer = CVJDScorer()
    
    # Extract job requirements from JD document using your custom extractor
    jd_document_path = './job_description.txt'
    jd_data = scorer.extract_job_requirements(jd_document_path)
    
    print("Extracted Job Description Data:")
    print(json.dumps(jd_data, indent=2))
    
    # Your CV data (assuming you have it from your CV extraction process)
    cv_data = {
        # Your CV data here...
    }
    
    if cv_data and jd_data:
        # Calculate match score
        match_result = scorer.calculate_match_score(cv_data, jd_data)
        print("\nMatch Score Results:")
        print(json.dumps(match_result, indent=2))