In [6]:
import sys
from docstrange import DocumentExtractor

def extract_document_text(document_path: str):
    """
    Extracts and prints the text content from a given document.

    Args:
        document_path (str): The file path to the document (e.g., 'document.pdf').
    """
    print(f"Starting text extraction from: {document_path}\n")

    # Initialize the DocumentExtractor in local CPU mode.
    # This ensures that all processing happens on your machine and no data
    # is sent to a cloud API.
    try:
        extractor = DocumentExtractor()
    except Exception as e:
        print(f"Error initializing DocumentExtractor: {e}")
        print("Please ensure you have installed the necessary dependencies.")
        print("If you are running for the first time, you may need an internet connection to download models.")
        return

    try:
        # The extract() method processes the document.
        # It handles various formats like PDF, DOCX, and images.
        result = extractor.extract(document_path)
        
        # We can extract the content in various formats. Here, we'll get
        # the cleaned, LLM-optimized Markdown text.
        extracted_text = result.extract_markdown()

        if extracted_text:
            print("--- Extracted Text ---")
            print(extracted_text)
            print("----------------------")
        else:
            print("No text could be extracted from the document.")

    except FileNotFoundError:
        print(f"Error: The file '{document_path}' was not found.")
        print("Please check the file path and try again.")
    except Exception as e:
        print(f"An unexpected error occurred during extraction: {e}")

if __name__ == "__main__":
    # In a real-world scenario, you would replace 'your_document.pdf'
    # with the actual path to your document.
    # For a command-line script, you could also use sys.argv to get the path.
    sample_document_path = './CV_Image.png'

 

    extract_document_text(sample_document_path)


Starting text extraction from: ./CV_Image.png

--- Extracted Text ---
# Aidoo Enoch Kwadwo
## Data Analyst

## Personal Info
**Phone**
0240542834

**Email**
aidooenochkwadwo@gmail.com

**Kumasi, Ghana**

## Qualities
* Curiosity
* Problem Solving
* System Understanding
* Technical Skills
* Analytical Thinking
* Problem Solving Skills
* Teamwork
* Initiative and Self-motivation
* Discipline and Resilient

## Key Skills
**Tools:** Python, R, AWS, Microsoft Excel, Google Sheets, Power BI, SQL
**Packages/Frameworks:** NumPy, Pandas, Scikit-Learn, Matplotlib, Pytorch
**Machine Learning:** Data Analysis, Classification Modeling, Deep Neural Networks, Regression Modelling, MLOPs, Computer Vision, Natural Language Processing, Recommendation Systems

## About Me
A Data Analyst with about two years of professional experience specialized in transforming complex datasets into strategic business solutions. I've consistently delivered actionable insights that have driven key business decisions. Comb

In [1]:
import os
import json
from docstrange_extractor import CVExtractor
from prof_years_extractor import ProfessionalExperienceCalculator

class CVProcessor:
    """A class to handle CV data extraction, cleaning, and processing."""
    
    @staticmethod
    def clean_cv_data(cv_data):
        """
        Recursively clean extracted CV data:
        - Replace None/null with [] for list-like fields
        - Replace None/null with "" for text fields
        """
        if isinstance(cv_data, dict):
            cleaned = {}
            for key, value in cv_data.items():
                if value is None:
                    if key in ["work_experience", "education", "skills", "soft_skills",
                              "certifications", "projects", "languages", "hobbies"]:
                        cleaned[key] = []
                    else:
                        cleaned[key] = ""
                else:
                    cleaned[key] = CVProcessor.clean_cv_data(value)
            return cleaned
        elif isinstance(cv_data, list):
            return [CVProcessor.clean_cv_data(item) for item in cv_data]
        return cv_data

    def extract_and_save_cv(self, cv_file_path, output_dir):
        """
        Extract CV data, calculate years of experience, and save to JSON.
        
        Args:
            cv_file_path (str): Path to the input CV file (e.g., PDF, DOCX)
            output_dir (str): Directory to save the output JSON file
        
        Returns:
            str: Path to the saved JSON file, or None if extraction fails
        """
        extractor = CVExtractor()
        try:
            content = extractor.extract(cv_file_path)
        except Exception as e:
            print(f"‚ùå Error extracting {cv_file_path}: {e}")
            return None

        if content is None:
            print(f"‚ö†Ô∏è Extraction returned None for {cv_file_path}")
            return None

        # Clean data before processing
        cleaned_content = self.clean_cv_data(content)

        # Normalize structure: always provide CV_data.structured_data
        if isinstance(cleaned_content, dict) and "structured_data" in cleaned_content and isinstance(cleaned_content["structured_data"], dict):
            structured = cleaned_content["structured_data"]
        else:
            structured = cleaned_content if isinstance(cleaned_content, dict) else {"raw_text": cleaned_content}

        # Ensure list keys exist to avoid downstream KeyErrors
        structured.setdefault("work_experience", [])

        output_dict = {"CV_data": {"structured_data": structured}}

        # Calculate years of experience safely
        years_of_experience = 0.0
        try:
            calculator = ProfessionalExperienceCalculator(cv_data_dict=output_dict)
            years_of_experience = calculator.get_total_years()
        except Exception as e:
            print(f"‚ö†Ô∏è Error calculating years of experience for {cv_file_path}: {e}")
        # Guarantee nested path exists before assignment
        output_dict.setdefault("CV_data", {}).setdefault("structured_data", {}).setdefault("years_of_experience", years_of_experience)
        output_dict["CV_data"]["structured_data"]["years_of_experience"] = years_of_experience

        # Save to JSON
        os.makedirs(output_dir, exist_ok=True)
        base_name = os.path.splitext(os.path.basename(cv_file_path))[0]
        output_path = os.path.join(output_dir, f"{base_name}.json")

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(output_dict, f, indent=2, ensure_ascii=False)

        print(f"‚úÖ Extracted and cleaned CV saved to: {output_path}")
        return output_path

    def batch_extract_cvs(self, input_dir, output_dir="extracted_files"):
        """
        Process multiple CV files in a directory and save extracted data as JSON.
        
        Args:
            input_dir (str): Directory containing CV files
            output_dir (str): Directory to save JSON outputs
        """
        if not os.path.isdir(input_dir):
            print(f"‚ùå Input directory not found: {input_dir}")
            return

        files = [f for f in os.listdir(input_dir) if f.lower().endswith(('.pdf', '.docx', '.png', '.jpg', '.jpeg'))]
        if not files:
            print(f"‚ö†Ô∏è No CV files (.pdf or .docx) found in {input_dir}")
            return

        for file_name in files:
            cv_path = os.path.join(input_dir, file_name)
            self.extract_and_save_cv(cv_path, output_dir)

# Example usage
if __name__ == "__main__":
    processor = CVProcessor()
    processor.extract_and_save_cv("./CVs/Data_Analyst3_CV.pdf", "./extracted_files/")
    # processor.batch_extract_cvs("./CVs", "./extracted_files")

Failed to parse JSON content: Expecting value: line 2 column 1 (char 1)


‚úÖ Extracted and cleaned CV saved to: ./extracted_files/Data_Analyst3_CV.json


In [None]:
import logging
from typing import List, Dict, Optional, Any
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pymongo
try:
    from sentence_transformers import CrossEncoder as STCrossEncoder
    _HAS_ST = True
except Exception:
    _HAS_ST = False

from identifiers import build_mongo_names, sanitize_fragment

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

JD_FIELDS = [
    "job_title", "required_skills", "preferred_skills", "required_qualifications",
    "education_requirements", "experience_requirements", "technical_skills", "soft_skills",
    "certifications", "responsibilities", "description", "full_text"
]

CV_FIELDS = [
    "summary", "work_experience", "education", "skills", "projects", "certifications"
]

class CVJDReranker:
    """Reranks CVs against a job description using a cross-encoder model."""
    
    def __init__(
        self,
        mongo_uri: str,
        mongo_db: str = "cv_db",
        cv_collection: str = "cvs",
        jd_collection: str = "job_descriptions",
        model_name: str = "BAAI/bge-reranker-base"
    ):
        """Initialize MongoDB client and cross-encoder model."""
        # Initialize MongoDB client
        try:
            self.mongo_client = pymongo.MongoClient(mongo_uri)
            self.cv_db = self.mongo_client[mongo_db]
            self.cv_collection = self.cv_db[cv_collection]
            self.jd_collection = self.cv_db[jd_collection]
            logger.info("MongoDB client initialized")
        except Exception as e:
            logger.error(f"Failed to initialize MongoDB client: {e}")
            raise ValueError("MongoDB connection failed. Provide a valid mongo_uri.")
        
        # Initialize cross-encoder
        self.model_name = model_name
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.use_st = False
        try:
            if _HAS_ST:
                self.cross_encoder = STCrossEncoder(model_name, device=self.device)
                self.use_st = True
                self.tokenizer = self.cross_encoder.tokenizer
                logger.info(f"Initialized sentence-transformers CrossEncoder {model_name} on {self.device}")
            else:
                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
                self.cross_encoder = AutoModelForSequenceClassification.from_pretrained(model_name)
                self.cross_encoder.to(self.device)
                logger.info(f"Initialized transformers cross-encoder {model_name} on {self.device}")
        except Exception as e:
            logger.error(f"Failed to initialize cross-encoder: {e}")
            raise RuntimeError(f"Failed to load model {model_name}")

    def _build_text_from_doc(self, doc: Dict[str, Any], fields: List[str]) -> str:
        """Build concatenated text from document fields."""
        parts: List[str] = []
        for field in fields:
            val = doc.get(field)
            if isinstance(val, list):
                parts.append(" | ".join(str(x) for x in val))
            elif isinstance(val, dict):
                parts.append(" | ".join(f"{k}: {v}" for k, v in val.items()))
            elif isinstance(val, str) and val.strip():
                parts.append(val.strip())
        return "\n".join(p for p in parts if p)

    def fetch_jd_text(self, jd_id: str) -> str:
        """Fetch JD text from MongoDB."""
        try:
            jd_doc = self.jd_collection.find_one({"jd_id": jd_id})
            if not jd_doc:
                logger.warning(f"No JD found for {jd_id}")
                return ""
            return self._build_text_from_doc(jd_doc, JD_FIELDS)
        except Exception as e:
            logger.error(f"Error fetching JD {jd_id}: {e}")
            return ""

    def fetch_cv_text(self, cv_id: str) -> str:
        """Fetch CV text from MongoDB."""
        try:
            cv_doc = self.cv_collection.find_one({"cv_id": cv_id})
            if cv_doc:
                full_text = cv_doc.get("full_text", "")
                if not full_text:
                    full_text = self._build_text_from_doc(cv_doc, CV_FIELDS)
                return full_text
            return ""
        except Exception as e:
            logger.error(f"Error fetching CV {cv_id}: {e}")
            return ""

    def _score_pairs(self, pairs: List[List[str]], batch_size: int = 8) -> List[float]:
        """Score text pairs using cross-encoder."""
        if not pairs:
            return []
        
        max_length = getattr(self.tokenizer, 'model_max_length', 512)
        scores: List[float] = []
        
        if self.use_st:
            try:
                scores = self.cross_encoder.predict(pairs).tolist()
            except Exception as e:
                logger.error(f"ST inference error: {e}")
                scores = [0.0] * len(pairs)
        else:
            for i in range(0, len(pairs), batch_size):
                batch_pairs = pairs[i:i + batch_size]
                try:
                    features = self.tokenizer(
                        batch_pairs, padding=True, truncation=True, 
                        max_length=max_length, return_tensors="pt"
                    ).to(self.device)
                    with torch.no_grad():
                        logits = self.cross_encoder(**features).logits
                        if logits.shape[1] == 1:
                            batch_scores = logits.squeeze(1)
                        else:
                            batch_scores = logits[:, 1]
                        batch_scores = torch.sigmoid(batch_scores)
                    scores.extend(batch_scores.cpu().tolist())
                except Exception as e:
                    logger.error(f"HF inference error: {e}")
                    scores.extend([0.0] * len(batch_pairs))
        
        return scores

    def rerank_cvs(self, cv_results: List[Dict], jd_id: str, batch_size: int = 8) -> List[Dict]:
        """Rerank CVs by jd_id."""
        jd_text = self.fetch_jd_text(jd_id)
        if not jd_text:
            for result in cv_results:
                result["cross_encoder_score"] = 0.0
            return sorted(cv_results, key=lambda x: x.get("total_score", 0), reverse=True)

        cv_texts, valid_results = [], []
        for result in cv_results:
            cv_id = result.get("cv_id")
            if cv_id:
                cv_text = self.fetch_cv_text(cv_id)
                if cv_text:
                    cv_texts.append(cv_text)
                    valid_results.append(result)
                else:
                    result["cross_encoder_score"] = 0.0

        if not cv_texts:
            return sorted(cv_results, key=lambda x: x.get("total_score", 0), reverse=True)

        pairs = [[jd_text, cv_text] for cv_text in cv_texts]
        scores = self._score_pairs(pairs, batch_size)

        for result, score in zip(valid_results, scores):
            result["cross_encoder_score"] = score

        return sorted(cv_results, key=lambda x: x.get("cross_encoder_score", 0), reverse=True)

    def rerank_cvs_for_job(self, cv_results: List[Dict], company_name: str, job_title: str, batch_size: int = 8) -> List[Dict]:
        """Rerank CVs by company/job."""
        try:
            db_name, cv_coll, jd_coll = build_mongo_names(company_name, job_title)
            dyn_db = self.mongo_client[db_name]
            dyn_jd_coll = dyn_db[jd_coll]

            jd_docs = list(dyn_jd_coll.find({}))
            if not jd_docs:
                jd_docs = list(self.jd_collection.find({
                    "company_name": {"$regex": f"^{company_name}$", "$options": "i"},
                    "job_title": {"$regex": f"^{job_title}$", "$options": "i"}
                }))

            if not jd_docs:
                logger.warning(f"No JD found for {company_name}/{job_title}")
                return cv_results

            jd_text = "\n".join(self._build_text_from_doc(doc, JD_FIELDS) for doc in jd_docs)

            cv_texts, valid_results = [], []
            dyn_cv_coll = dyn_db[cv_coll]
            for result in cv_results:
                cv_id = result.get("cv_id")
                if cv_id:
                    cv_doc = dyn_cv_coll.find_one({"cv_id": cv_id}) or self.cv_collection.find_one({"cv_id": cv_id})
                    if cv_doc:
                        cv_text = cv_doc.get("full_text", "") or self._build_text_from_doc(cv_doc, CV_FIELDS)
                        if cv_text:
                            cv_texts.append(cv_text)
                            valid_results.append(result)
                        else:
                            result["cross_encoder_score"] = 0.0

            if not cv_texts:
                return cv_results

            pairs = [[jd_text, cv_text] for cv_text in cv_texts]
            scores = self._score_pairs(pairs, batch_size)

            for result, score in zip(valid_results, scores):
                result["cross_encoder_score"] = score

            return sorted(cv_results, key=lambda x: x.get("cross_encoder_score", 0), reverse=True)

        except Exception as e:
            logger.error(f"Rerank failed for {company_name}/{job_title}: {e}")
            return cv_results

    def format_results(self, results: List[Dict], show_details: bool = False) -> str:
        """Format results."""
        lines = []
        for i, result in enumerate(results[:10]):
            lines.append(f"\n--- CV #{i+1} ({result['cv_id']}) ---")
            lines.append(f"Email: {result.get('email', 'N/A')}")
            lines.append(f"Vector Score: {result['total_score']:.3f}")
            lines.append(f"Cross-Encoder: {result['cross_encoder_score']:.3f}")
            if show_details and result.get("section_scores"):
                lines.append("Sections: " + " | ".join(f"{k}:{v:.2f}" for k, v in result["section_scores"].items()))
        return "\n".join(lines)

    def close(self):
        if self.mongo_client:
            self.mongo_client.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()


# ===============================================
# REAL-WORLD TEST DATA
# ===============================================

REAL_CV_RESULTS = [
    {
        "cv_id": "cv_alice_001",
        "email": "alice.chen@tech.com",
        "total_score": 0.87,
        "section_scores": {"skills": 0.92, "experience": 0.85}
    },
    {
        "cv_id": "cv_bob_002", 
        "email": "bob.smith@data.com",
        "total_score": 0.82,
        "section_scores": {"skills": 0.88, "experience": 0.78}
    },
    {
        "cv_id": "cv_charlie_003",
        "email": "charlie.lee@ml.com",
        "total_score": 0.79,
        "section_scores": {"skills": 0.85, "experience": 0.75}
    },
    {
        "cv_id": "cv_diana_004",
        "email": "diana.wang@ai.com",
        "total_score": 0.91,
        "section_scores": {"skills": 0.95, "experience": 0.88}
    }
]


# ===============================================
# MAIN TEST SCRIPT
# ===============================================

if __name__ == "__main__":
    print("üöÄ CV-JD RERANKER - REAL WORLD TEST\n")
    
    # Initialize
    reranker = CVJDReranker(
        mongo_uri="mongodb://localhost:27017/",
        mongo_db="cv_db"
    )
    
    print("1. RERANK BY JD_ID")
    print("=" * 50)
    results1 = reranker.rerank_cvs(REAL_CV_RESULTS, jd_id="sample_jd_data_scientist")
    print(reranker.format_results(results1, show_details=True))
    
    print("\n2. RERANK BY COMPANY/JOB")
    print("=" * 50)
    results2 = reranker.rerank_cvs_for_job(
        REAL_CV_RESULTS, 
        company_name="TechCorp", 
        job_title="Senior Data Scientist"
    )
    print(reranker.format_results(results2, show_details=True))
    
    print("\n3. TOP 3 CANDIDATES")
    print("=" * 50)
    top3 = results2[:3]
    for i, r in enumerate(top3, 1):
        print(f"{i}. {r['email']} | CE Score: {r['cross_encoder_score']:.3f}")
    
    reranker.close()
    print("\n‚úÖ TEST COMPLETE!")

2025-10-23 10:37:47,859 - INFO - MongoDB client initialized


üöÄ CV-JD RERANKER - REAL WORLD TEST



2025-10-23 10:37:55,577 - INFO - Initialized sentence-transformers CrossEncoder BAAI/bge-reranker-base on cuda


1. RERANK BY JD_ID

--- CV #1 (cv_diana_004) ---
Email: diana.wang@ai.com
Vector Score: 0.910
Cross-Encoder: 0.000
Sections: skills:0.95 | experience:0.88

--- CV #2 (cv_alice_001) ---
Email: alice.chen@tech.com
Vector Score: 0.870
Cross-Encoder: 0.000
Sections: skills:0.92 | experience:0.85

--- CV #3 (cv_bob_002) ---
Email: bob.smith@data.com
Vector Score: 0.820
Cross-Encoder: 0.000
Sections: skills:0.88 | experience:0.78

--- CV #4 (cv_charlie_003) ---
Email: charlie.lee@ml.com
Vector Score: 0.790
Cross-Encoder: 0.000
Sections: skills:0.85 | experience:0.75

2. RERANK BY COMPANY/JOB

--- CV #1 (cv_alice_001) ---
Email: alice.chen@tech.com
Vector Score: 0.870
Cross-Encoder: 0.000
Sections: skills:0.92 | experience:0.85

--- CV #2 (cv_bob_002) ---
Email: bob.smith@data.com
Vector Score: 0.820
Cross-Encoder: 0.000
Sections: skills:0.88 | experience:0.78

--- CV #3 (cv_charlie_003) ---
Email: charlie.lee@ml.com
Vector Score: 0.790
Cross-Encoder: 0.000
Sections: skills:0.85 | experience:

In [3]:
import logging
from typing import List, Dict, Any
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
try:
    from sentence_transformers import CrossEncoder as STCrossEncoder
    _HAS_ST = True
except Exception:
    _HAS_ST = False

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class CVJDReranker:
    """Reranks CVs against job descriptions using cross-encoder (STANDALONE MODE)."""
    
    def __init__(self, model_name: str = "BAAI/bge-reranker-base"):
        """Initialize cross-encoder only (no MongoDB)."""
        self.model_name = model_name
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.use_st = False
        
        try:
            if _HAS_ST:
                self.cross_encoder = STCrossEncoder(model_name, device=self.device)
                self.use_st = True
                self.tokenizer = self.cross_encoder.tokenizer
                logger.info(f"‚úÖ Initialized sentence-transformers CrossEncoder on {self.device}")
            else:
                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
                self.cross_encoder = AutoModelForSequenceClassification.from_pretrained(model_name)
                self.cross_encoder.to(self.device)
                logger.info(f"‚úÖ Initialized transformers cross-encoder on {self.device}")
        except Exception as e:
            logger.error(f"‚ùå Failed to initialize cross-encoder: {e}")
            raise RuntimeError(f"Failed to load model {model_name}")

    def _score_pairs(self, pairs: List[List[str]], batch_size: int = 8) -> List[float]:
        """Score text pairs using cross-encoder."""
        if not pairs:
            return []
        
        max_length = getattr(self.tokenizer, 'model_max_length', 512)
        scores: List[float] = []
        
        if self.use_st:
            try:
                scores = self.cross_encoder.predict(pairs).tolist()
                logger.info(f"‚úÖ Scored {len(scores)} pairs using sentence-transformers")
            except Exception as e:
                logger.error(f"‚ùå ST inference error: {e}")
                scores = [0.0] * len(pairs)
        else:
            for i in range(0, len(pairs), batch_size):
                batch_pairs = pairs[i:i + batch_size]
                try:
                    features = self.tokenizer(
                        batch_pairs, padding=True, truncation=True, 
                        max_length=max_length, return_tensors="pt"
                    ).to(self.device)
                    with torch.no_grad():
                        logits = self.cross_encoder(**features).logits
                        if logits.shape[1] == 1:
                            batch_scores = logits.squeeze(1)
                        else:
                            batch_scores = logits[:, 1]
                        batch_scores = torch.sigmoid(batch_scores)
                    scores.extend(batch_scores.cpu().tolist())
                except Exception as e:
                    logger.error(f"‚ùå HF inference error: {e}")
                    scores.extend([0.0] * len(batch_pairs))
        
        return scores

    def rerank_cvs_direct(
        self, 
        cv_results: List[Dict], 
        jd_text: str, 
        batch_size: int = 8
    ) -> List[Dict]:
        """
        Rerank CVs directly with typed JD text.
        
        Args:
            cv_results: List of CV dicts with 'cv_id', 'email', 'total_score'
            jd_text: Raw job description string
            batch_size: Batch size for inference
            
        Returns:
            Sorted list with 'cross_encoder_score' added
        """
        if not jd_text.strip():
            logger.warning("‚ùå Empty JD text")
            for result in cv_results:
                result["cross_encoder_score"] = 0.0
            return sorted(cv_results, key=lambda x: x.get("total_score", 0), reverse=True)

        # Extract CV texts from results
        cv_texts = []
        valid_results = []
        for result in cv_results:
            cv_text = result.get("cv_text", "")
            if cv_text:
                cv_texts.append(cv_text)
                valid_results.append(result)
            else:
                result["cross_encoder_score"] = 0.0

        if not cv_texts:
            logger.warning("‚ùå No valid CV texts")
            return sorted(cv_results, key=lambda x: x.get("total_score", 0), reverse=True)

        # Score pairs: [JD, CV1], [JD, CV2], ...
        pairs = [[jd_text, cv_text] for cv_text in cv_texts]
        scores = self._score_pairs(pairs, batch_size)

        # Assign scores
        for result, score in zip(valid_results, scores):
            result["cross_encoder_score"] = float(score)

        # Sort by cross-encoder score
        sorted_results = sorted(
            cv_results,
            key=lambda x: x.get("cross_encoder_score", 0),
            reverse=True
        )
        
        logger.info(f"‚úÖ Reranked {len(sorted_results)} CVs | Top score: {max(scores):.3f}")
        return sorted_results

    def format_results(self, results: List[Dict], show_details: bool = False) -> str:
        """Format results as readable string."""
        lines = [f"{'='*60}", f"üéØ TOP CANDIDATES (Cross-Encoder Scores)", f"{'='*60}"]
        for i, result in enumerate(results[:5], 1):
            lines.append(
                f"{i:2d}. {result['email']:25s} | CE: {result['cross_encoder_score']:6.3f} | "
                f"Vector: {result['total_score']:6.3f} | Œî: {result['cross_encoder_score'] - result['total_score']:+.3f}"
            )
            if show_details:
                lines.append(f"    CV ID: {result['cv_id']}")
        lines.append(f"{'='*60}")
        return "\n".join(lines)


# ===============================================
# REAL-WORLD TYPED CVs & JOB DESCRIPTION
# ===============================================

# üéØ REAL JOB DESCRIPTION
SENIOR_DATA_SCIENTIST_JD = """
Senior Data Scientist - TechCorp

RESPONSIBILITIES:
- Develop machine learning models for customer segmentation and churn prediction
- Design A/B testing frameworks and analyze experiment results
- Build scalable data pipelines using Python, Spark, and AWS
- Create dashboards using Tableau/PowerBI for business stakeholders
- Collaborate with engineering teams to deploy ML models to production

REQUIRED SKILLS:
- Python (pandas, scikit-learn, TensorFlow/PyTorch)
- SQL (advanced queries, window functions)
- Machine Learning (supervised/unsupervised, feature engineering)
- Big Data (Spark, Hadoop)
- Cloud (AWS/GCP/Azure)
- Statistics (hypothesis testing, experimental design)

EXPERIENCE:
- 5+ years in data science/ML engineering
- Production ML model deployment experience
- Experience with customer analytics/churn prediction

EDUCATION:
- MS/PhD in Computer Science, Statistics, or related field
"""

# üë• REAL CVs (4 diverse candidates)
REAL_CVS = [
    {
        "cv_id": "cv_alice_001",
        "email": "alice.chen@tech.com",
        "total_score": 0.87,  # Vector search score
        "cv_text": """
        Alice Chen | Senior Data Scientist | 6 years experience
        
        SUMMARY:
        Experienced Data Scientist specializing in customer analytics and ML model deployment.
        Built churn prediction models reducing customer loss by 18% at previous role.
        
        TECHNICAL SKILLS:
        Python (pandas, scikit-learn, TensorFlow) | SQL | Spark | AWS | Tableau
        
        WORK EXPERIENCE:
        TechCorp (2020-Present) - Senior Data Scientist
        - Developed churn prediction models using XGBoost (accuracy: 92%)
        - Built real-time data pipelines with Apache Spark and AWS Lambda
        - Created Tableau dashboards used by 50+ stakeholders
        
        DataCorp (2018-2020) - Data Scientist
        - Implemented A/B testing framework for product features
        - Reduced customer acquisition cost by 12% through segmentation models
        
        EDUCATION:
        MS Computer Science - Stanford University (2018)
        """,
        "section_scores": {"skills": 0.92, "experience": 0.85}
    },
    {
        "cv_id": "cv_bob_002",
        "email": "bob.smith@data.com",
        "total_score": 0.82,
        "cv_text": """
        Bob Smith | Data Analyst | 4 years experience
        
        SUMMARY:
        Data Analyst with strong SQL and visualization skills. Experience in customer reporting.
        
        TECHNICAL SKILLS:
        SQL | Python (pandas) | Tableau | Excel | GCP
        
        WORK EXPERIENCE:
        DataCorp (2021-Present) - Data Analyst
        - Built customer segmentation reports in Tableau
        - Wrote complex SQL queries for marketing team
        - Created weekly churn dashboards
        
        EDUCATION:
        BS Statistics - University of California (2021)
        """,
        "section_scores": {"skills": 0.88, "experience": 0.78}
    },
    {
        "cv_id": "cv_charlie_003",
        "email": "charlie.lee@ml.com",
        "total_score": 0.79,
        "cv_text": """
        Charlie Lee | ML Engineer | 3 years experience
        
        SUMMARY:
        ML Engineer focused on model deployment and MLOps.
        
        TECHNICAL SKILLS:
        Python | TensorFlow | Docker | Kubernetes | AWS | CI/CD
        
        WORK EXPERIENCE:
        MLStartup (2022-Present) - ML Engineer
        - Deployed 20+ ML models to production using Docker/K8s
        - Built CI/CD pipelines for model retraining
        - Optimized inference latency by 40%
        
        EDUCATION:
        BS Computer Science - MIT (2022)
        """,
        "section_scores": {"skills": 0.85, "experience": 0.75}
    },
    {
        "cv_id": "cv_diana_004",
        "email": "diana.wang@ai.com",
        "total_score": 0.91,
        "cv_text": """
        Diana Wang | Lead Data Scientist | 7 years experience
        
        SUMMARY:
        Seasoned Data Science leader with expertise in customer analytics and production ML.
        Led team that reduced churn by 25% through advanced modeling.
        
        TECHNICAL SKILLS:
        Python (scikit-learn, PyTorch) | SQL | Spark | AWS Sagemaker | Tableau | A/B Testing
        
        WORK EXPERIENCE:
        AIInc (2019-Present) - Lead Data Scientist
        - Led churn prediction initiative saving $2M annually
        - Built end-to-end ML pipeline with Spark and Sagemaker
        - Designed A/B testing framework for 100+ experiments
        - Mentored 5 junior data scientists
        
        Google (2017-2019) - Data Scientist
        - Developed recommendation systems for YouTube
        - Published 3 papers on customer retention modeling
        
        EDUCATION:
        PhD Statistics - UC Berkeley (2017)
        """,
        "section_scores": {"skills": 0.95, "experience": 0.88}
    }
]


# ===============================================
# MAIN TEST SCRIPT - NO MONGO REQUIRED!
# ===============================================

if __name__ == "__main__":
    print("üöÄ CV-JD RERANKER - REAL WORLD TEST (STANDALONE)\n")
    
    # Initialize reranker
    reranker = CVJDReranker()
    
    # TEST 1: Rerank with REAL job description
    print("1. RERANKING 4 CVs FOR 'Senior Data Scientist'\n")
    results = reranker.rerank_cvs_direct(
        cv_results=REAL_CVS,
        jd_text=SENIOR_DATA_SCIENTIST_JD
    )
    
    print(reranker.format_results(results, show_details=True))
    
    # TEST 2: Show ranking changes
    print("\n2. RANKING COMPARISON")
    print("-" * 60)
    print("Original (Vector) ‚Üí New (Cross-Encoder)")
    print("-" * 60)
    
    vector_ranking = sorted(REAL_CVS, key=lambda x: x["total_score"], reverse=True)
    ce_ranking = results
    
    for i, (v, c) in enumerate(zip(vector_ranking, ce_ranking), 1):
        v_rank = vector_ranking.index(c) + 1
        print(f"{i:2d}. {c['email']:25s} | Vector: #{v_rank} ‚Üí CE: #{i} | "
              f"Score: {c['cross_encoder_score']:.3f}")
    
    print(f"\n‚úÖ Cross-encoder re-ranked {len(results)} CVs successfully!")
    print(f"‚è±Ô∏è  Device: {reranker.device}")

üöÄ CV-JD RERANKER - REAL WORLD TEST (STANDALONE)



2025-10-23 10:43:08,008 - INFO - ‚úÖ Initialized sentence-transformers CrossEncoder on cuda


1. RERANKING 4 CVs FOR 'Senior Data Scientist'



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-10-23 10:43:24,686 - INFO - ‚úÖ Scored 4 pairs using sentence-transformers
2025-10-23 10:43:24,687 - INFO - ‚úÖ Reranked 4 CVs | Top score: 0.998


üéØ TOP CANDIDATES (Cross-Encoder Scores)
 1. alice.chen@tech.com       | CE:  0.998 | Vector:  0.870 | Œî: +0.128
    CV ID: cv_alice_001
 2. diana.wang@ai.com         | CE:  0.425 | Vector:  0.910 | Œî: -0.485
    CV ID: cv_diana_004
 3. bob.smith@data.com        | CE:  0.277 | Vector:  0.820 | Œî: -0.543
    CV ID: cv_bob_002
 4. charlie.lee@ml.com        | CE:  0.038 | Vector:  0.790 | Œî: -0.752
    CV ID: cv_charlie_003

2. RANKING COMPARISON
------------------------------------------------------------
Original (Vector) ‚Üí New (Cross-Encoder)
------------------------------------------------------------
 1. alice.chen@tech.com       | Vector: #2 ‚Üí CE: #1 | Score: 0.998
 2. diana.wang@ai.com         | Vector: #1 ‚Üí CE: #2 | Score: 0.425
 3. bob.smith@data.com        | Vector: #3 ‚Üí CE: #3 | Score: 0.277
 4. charlie.lee@ml.com        | Vector: #4 ‚Üí CE: #4 | Score: 0.038

‚úÖ Cross-encoder re-ranked 4 CVs successfully!
‚è±Ô∏è  Device: cuda
