In [None]:
# Step 1: Install Required Libraries

# Multi-format document processing libraries
!pip install langchain openai PyPDF2 python-docx openpyxl pandas matplotlib

In [None]:
# Report the interpreter version after installation.
# Note: a kernel restart may be needed before the new packages can be imported.
import sys

print(f"Python version: {sys.version}")
print("\n✅ Installation complete! Restart kernel if needed.")

## Setup Environment and Import Libraries 🔧

Let's set up our environment and import the necessary libraries for multi-format document processing:

- **langchain & openai**: For working with language models
- **PyPDF2**: For processing PDF files
- **python-docx**: For processing Word documents
- **openpyxl**: For processing Excel files
- **pandas**: For data manipulation and analysis
- **matplotlib**: For visualization and reporting

In [None]:
# Step 2: Import Libraries and Setup Environment

import os
import re
import json
import io
from typing import Dict, List, Optional, Union, Any
from datetime import datetime
from pathlib import Path
from enum import Enum
import base64

# For AI/LLM operations (same as Project 1)
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# For multi-format document processing
import PyPDF2
from docx import Document
import pandas as pd
import openpyxl
import matplotlib.pyplot as plt

# Read API key from config.txt file (same as Project 1)
def load_openai_api_key(config_path) -> Optional[str]:
    """Return the OPENAI_API_KEY value stored in *config_path*, or None.

    The file is expected to contain a line of the form
    ``OPENAI_API_KEY=<value>``. Only the first ``=`` separates the name from
    the value, so a key that itself contains ``=`` is returned intact.
    Surrounding whitespace and quote characters are removed.

    Args:
        config_path: Path (str or Path) of the config file to read

    Returns:
        The key string, or None when the file is missing, has no
        OPENAI_API_KEY line, or the value is empty.
    """
    path = Path(config_path)
    if not path.exists():
        return None

    # utf-8-sig drops a leading BOM, which would otherwise hide the key name
    with open(path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            name, sep, value = line.strip().partition('=')
            if sep and name.strip() == 'OPENAI_API_KEY':
                return value.strip().strip('"\'') or None
    return None


config_file = Path('config.txt')
openai_api_key = load_openai_api_key(config_file)

if openai_api_key:
    print("✅ OpenAI API key loaded successfully from config.txt!")
    print(f"🔑 API key starts with: {openai_api_key[:8]}...")
else:
    print("❌ OpenAI API key not found!")
    print("Please make sure config.txt file exists with your OPENAI_API_KEY")
    print("Example content: OPENAI_API_KEY=sk-your-key-here")

print("\n📚 All libraries imported successfully!")

## Define the Multi-Format Document Processor 📋

Now we'll implement a document processor that can handle various file formats:

In [None]:
# Multi-Format Document Processor

class DocumentFormat(Enum):
    """Supported document formats

    Each member's value is a lowercase label; other code in this file prints
    it and stores it in results via ``.value``.
    """
    TEXT = "text"
    PDF = "pdf"
    WORD = "word"
    EXCEL = "excel"
    # Returned by MultiFormatDocumentProcessor.detect_format for unmapped extensions
    UNKNOWN = "unknown"

class MultiFormatDocumentProcessor:
    """
    Handles document processing for multiple file formats

    Maps file extensions to a DocumentFormat and extracts plain text from
    text, PDF, Word and Excel files. Extraction failures are reported on
    stdout and surface to callers as an empty string.
    """
    
    def __init__(self):
        # NOTE(review): python-docx opens only .docx files, and pandas needs an
        # extra engine for legacy .xls; ".doc"/".xls" inputs may therefore fail
        # at extraction time and come back as "" - confirm they should stay listed.
        self.supported_extensions = {
            ".txt": DocumentFormat.TEXT,
            ".pdf": DocumentFormat.PDF,
            ".docx": DocumentFormat.WORD,
            ".doc": DocumentFormat.WORD,
            ".xlsx": DocumentFormat.EXCEL,
            ".xls": DocumentFormat.EXCEL
        }
        print("🔄 Multi-format document processor initialized!")
        
    def detect_format(self, file_path: str) -> DocumentFormat:
        """
        Detects the document format based on file extension
        
        Args:
            file_path (str): Path to the document file
        
        Returns:
            DocumentFormat: Detected format enum (UNKNOWN for unmapped extensions)
        """
        # Lower-case first so ".PDF" and ".pdf" are treated alike
        _, ext = os.path.splitext(file_path.lower())
        return self.supported_extensions.get(ext, DocumentFormat.UNKNOWN)
    
    def extract_text_from_file(self, file_path: str) -> str:
        """
        Extracts text content from files of various formats
        
        Args:
            file_path (str): Path to the document file
        
        Returns:
            str: Extracted text content, or "" when the format is unsupported
                or extraction raises
        """
        extractors = {
            DocumentFormat.TEXT: self._extract_from_text,
            DocumentFormat.PDF: self._extract_from_pdf,
            DocumentFormat.WORD: self._extract_from_word,
            DocumentFormat.EXCEL: self._extract_from_excel,
        }
        extractor = extractors.get(self.detect_format(file_path))
        
        if extractor is None:
            print(f"❌ Unsupported file format: {os.path.basename(file_path)}")
            return ""
        
        # Deliberately broad: any parser failure is reported and mapped to ""
        try:
            return extractor(file_path)
        except Exception as e:
            print(f"❌ Error extracting text from {os.path.basename(file_path)}: {str(e)}")
            return ""
    
    def _extract_from_text(self, file_path: str) -> str:
        """Extract text from .txt file"""
        # utf-8-sig drops a leading BOM; errors='replace' keeps a CV saved in a
        # legacy encoding readable instead of failing the whole extraction
        with open(file_path, 'r', encoding='utf-8-sig', errors='replace') as file:
            content = file.read()
        print(f"📄 Extracted {len(content)} characters from text file")
        return content
    
    def _extract_from_pdf(self, file_path: str) -> str:
        """Extract text from PDF file (one trailing newline per page)"""
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # extract_text() may yield None for pages without a text layer
            page_texts = [(page.extract_text() or "") for page in pdf_reader.pages]
        
        text = "".join(f"{page_text}\n" for page_text in page_texts)
        print(f"📄 Extracted {len(text)} characters from PDF ({len(page_texts)} pages)")
        return text
    
    def _extract_from_word(self, file_path: str) -> str:
        """Extract text from Word document (body paragraphs, then table rows)"""
        doc = Document(file_path)
        parts = [para.text for para in doc.paragraphs]
        
        # CVs are often laid out in tables, which doc.paragraphs does not include
        for table in doc.tables:
            for row in table.rows:
                cells = [cell.text.strip() for cell in row.cells]
                if any(cells):
                    parts.append(" | ".join(cells))
        
        text = "\n".join(parts)
        print(f"📄 Extracted {len(text)} characters from Word document")
        return text
    
    def _extract_from_excel(self, file_path: str) -> str:
        """
        Extract text from Excel file
        This is a simplified approach - actual CV data in Excel may need 
        specific column mapping or structure understanding
        """
        # sheet_name=None loads every worksheet as {sheet name: DataFrame}
        sheets = pd.read_excel(file_path, sheet_name=None)
        # Convert each non-empty DataFrame to its string representation
        text = "\n\n".join(
            df.to_string(index=False) for df in sheets.values() if not df.empty
        )
        
        print(f"📄 Extracted {len(text)} characters from Excel file")
        return text

# Initialize the multi-format document processor
doc_processor = MultiFormatDocumentProcessor()
print("✅ Multi-format document processor ready for CV analysis!")

In [None]:
# Initialize OpenAI LLM (same as Project 1)
# `llm` is left as None without an API key so later cells can test it before use.
if not openai_api_key:
    print("❌ Cannot initialize LLM without API key")
    llm = None
else:
    llm = OpenAI(
        model_name="gpt-3.5-turbo-instruct",  # Cost-effective model
        openai_api_key=openai_api_key,
        temperature=0,  # Low temperature for consistent, factual responses
    )
    print("🤖 OpenAI LLM initialized successfully!")

# Enhanced Entity Extraction Prompt Template for multi-format documents
# The only placeholder is {cv_text}; the JSON skeleton's braces are doubled,
# presumably so the template engine emits them as literal braces.
# The model is asked to answer with JSON only, using null/empty arrays for
# information that is absent from the CV.
enhanced_extraction_prompt = PromptTemplate(
    input_variables=["cv_text"],
    template="""
    You are an expert HR assistant analyzing a candidate's CV extracted from various document formats. Extract the following information:
    
    CV Text:
    {cv_text}
    
    Extract and return ONLY the following information in JSON format:
    {{
        "name": "Candidate's full name",
        "email": "Email address",
        "phone": "Phone number", 
        "location": "City, Country",
        "skills": ["List of all technical and soft skills"],
        "experience": [
            {{
                "company": "Company name",
                "position": "Job title",
                "duration": "Employment period",
                "responsibilities": "Key responsibilities",
                "achievements": "Notable achievements"
            }}
        ],
        "education": [
            {{
                "degree": "Degree name",
                "institution": "University/School name", 
                "year": "Graduation year",
                "field": "Field of study"
            }}
        ],
        "certifications": ["List of professional certifications"],
        "languages": ["Languages spoken"],
        "total_experience_years": "Number of years of relevant work experience"
    }}
    
    Be thorough but concise. Extract only factual information present in the CV. If certain information is not available, use null or empty arrays as appropriate.
    """
)

# Advanced Role Classification Prompt with weighted scoring
# Placeholders: {candidate_profile}, {role_requirements}, {industry_context}.
# The requested JSON keys (role_fit_score, skill_match_percentage,
# experience_quality_score, matching_skills, ...) are the ones the analyzer
# methods in this file read back from the parsed response.
advanced_classification_prompt = PromptTemplate(
    input_variables=["candidate_profile", "role_requirements", "industry_context"],
    template="""
    You are an HR specialist with expertise in technical recruitment.
    
    Candidate Profile:
    {candidate_profile}
    
    Role Requirements:
    {role_requirements}
    
    Industry Context:
    {industry_context}
    
    Analyze the candidate's fit for this role with weighted scoring and return a JSON response:
    {{
        "role_fit_score": "Score from 0-100 based on skills and experience match",
        "skill_match_percentage": "Percentage of required skills matched",
        "experience_quality_score": "Score from 0-100 on relevance of experience",
        "education_alignment": "Score from 0-100 on education fit",
        "technical_proficiency": "Junior/Mid/Senior/Expert level assessment",
        "matching_skills": ["List of candidate skills that match role requirements"],
        "missing_skills": ["List of required skills the candidate lacks"],
        "strengths": ["Top 3-5 strengths for this role"],
        "growth_areas": ["Top 3 development areas"],
        "recommendation": "Highly Recommended/Recommended/Consider/Not Recommended",
        "interview_focus_areas": ["Suggested areas to explore in interview"],
        "justification": "Brief explanation of the recommendation"
    }}
    
    Be objective and thorough in your assessment. Consider both technical skills and soft skills relevance.
    """
)

print("✅ Advanced prompt templates created!")

In [None]:
# List all CV files for multi-format processing
def list_all_cv_files():
    """
    List all CV files in the data/cvs folder regardless of format

    Creates the folder when it does not exist, prints a numbered listing and a
    per-extension breakdown, and returns the matching file names.

    Returns:
        list: File names (not full paths) with a supported extension,
            sorted alphabetically
    """
    cv_dir = os.path.join('data', 'cvs')
    
    # Ensure the directory exists
    if not os.path.exists(cv_dir):
        print(f"⚠️ Data directory {cv_dir} does not exist!")
        print("Creating the directory now...")
        os.makedirs(cv_dir, exist_ok=True)
    
    supported = doc_processor.supported_extensions
    
    # Collect (file name, extension) pairs. Sorted because os.listdir returns
    # entries in arbitrary order; directories are skipped even if their name
    # ends in a supported suffix.
    matches = []
    for name in sorted(os.listdir(cv_dir)):
        ext = os.path.splitext(name.lower())[1]
        if ext in supported and os.path.isfile(os.path.join(cv_dir, name)):
            matches.append((name, ext))
    cv_files = [name for name, _ in matches]
    
    if not cv_files:
        print(f"⚠️ No CV files found in {cv_dir}/")
        print("Please add sample CV files in supported formats to this folder.")
        return cv_files
    
    print(f"📊 Found {len(cv_files)} CV files in various formats")
    
    print("\nCV files available for processing:")
    format_counts = {}
    for i, (name, ext) in enumerate(matches, 1):
        format_counts[ext] = format_counts.get(ext, 0) + 1
        format_name = supported[ext].value.capitalize()
        print(f"{i}. {name} ({format_name})")
    
    print("\n📝 Format breakdown:")
    for ext, count in format_counts.items():
        format_name = supported[ext].value.capitalize()
        print(f"   - {format_name} ({ext}): {count} files")
    
    return cv_files

# List all available CV files
all_cv_files = list_all_cv_files()

## Project 2: Enhanced CV Analyzer Implementation 🚀

Building on Project 1's foundation, our enhanced analyzer can now:
1. Process multiple document formats (PDF, Word, Excel, Text)
2. Extract more detailed candidate information
3. Perform weighted scoring for role matching
4. Generate comprehensive visualizations and reports

In [None]:
# Project 2: Enhanced Multi-Format CV Analyzer

class Project2_EnhancedAnalyzer:
    """
    Enhanced CV analyzer with multi-format support
    """
    
    def __init__(self, doc_processor):
        self.doc_processor = doc_processor
        self.results = []
        self.processed_count = 0
        self.industry_context = {
            "current_market_trends": "High demand for cloud, AI, and cybersecurity skills",
            "technical_evolution": "Rapid adoption of containerization, microservices, and DevOps practices",
            "soft_skills_importance": "Growing emphasis on communication, collaboration, and adaptability"
        }
        print("🟡 Project 2: Enhanced Multi-Format Analyzer initialized!")
    
    def process_cv(self, file_path: str) -> dict:
        """
        Process a CV file of any supported format

        Extracts the text, asks the LLM for candidate entities, scores the
        candidate against every JobRole and records the resulting profile in
        self.results.
        
        Args:
            file_path (str): Path to the CV file
            
        Returns:
            dict: Complete candidate analysis, or a dict with an "error" key
                when the format is unsupported, no text could be extracted or
                entity extraction failed
        """
        print(f"\n🚀 Processing CV: {os.path.basename(file_path)}")
        print("=" * 60)
        
        # Step 1: Detect format and extract text
        format_type = self.doc_processor.detect_format(file_path)
        print(f"📄 Detected format: {format_type.value}")
        
        if format_type == DocumentFormat.UNKNOWN:
            return {"error": "Unsupported file format"}
        
        # Step 2: Extract text content based on format
        cv_text = self.doc_processor.extract_text_from_file(file_path)
        # Whitespace-only output (e.g. a scanned PDF with no text layer) carries
        # no information; do not send it to the LLM
        if not cv_text or not cv_text.strip():
            return {"error": "Failed to extract text from file"}
        
        # Step 3: Extract enhanced entities
        entities = self.extract_enhanced_entities(cv_text)
        if "error" in entities:
            return entities
        
        # Step 4: Perform advanced role classification for all roles
        from project1_basic_extraction import JobRole  # Import from Project 1
        
        role_analyses = {
            role.value: self.classify_for_role_advanced(entities, role)
            for role in JobRole
        }
        
        # Step 5: Determine best fit role with weighted scoring
        best_role = self.find_best_role_match_weighted(role_analyses)
        
        # Compile complete profile with enhanced data
        complete_profile = {
            "candidate_info": entities,
            "role_analyses": role_analyses,
            "best_role_match": best_role,
            "format_type": format_type.value,
            "processed_at": datetime.now().isoformat(),
            "source_file": os.path.basename(file_path)
        }
        
        self.processed_count += 1
        self.results.append(complete_profile)
        
        # One decimal place, matching the summary printed by the batch runner
        print(f"✅ CV processing completed! Best fit: {best_role['role']} ({best_role['score']:.1f}% match)")
        return complete_profile
    
    def extract_enhanced_entities(self, cv_text: str) -> dict:
        """
        Extract enhanced entities from CV text using AI

        The model reply is parsed as JSON. When the reply as a whole is not a
        JSON object, the span from the first "{" to the last "}" is tried as
        well, because models often wrap the JSON in prose or code fences.
        
        Args:
            cv_text (str): Raw CV content
            
        Returns:
            dict: Extracted detailed entities; {"raw_response": ...} when no
                JSON object could be parsed; {"error": ...} when the LLM is
                unavailable or the call fails
        """
        if not llm:
            return {"error": "LLM not initialized"}
        
        try:
            print("🔄 Extracting enhanced entities from CV...")
            chain = LLMChain(llm=llm, prompt=enhanced_extraction_prompt)
            raw = chain.run(cv_text=cv_text).strip()
        except Exception as e:
            print(f"❌ Error extracting enhanced entities: {str(e)}")
            return {"error": str(e)}
        
        # Candidate payloads: the full reply first, then the outermost {...} span
        candidates = [raw]
        start, end = raw.find("{"), raw.rfind("}")
        if 0 <= start < end:
            candidates.append(raw[start:end + 1])
        
        for payload in candidates:
            try:
                entities = json.loads(payload)
            except json.JSONDecodeError:
                continue
            # Callers use dict methods on the result; a bare list/string is not a profile
            if isinstance(entities, dict):
                print("✅ Enhanced entities extracted successfully")
                return entities
        
        print("⚠️ JSON parsing failed, using raw response")
        return {"raw_response": raw}
    
    def classify_for_role_advanced(self, candidate_profile: dict, target_role: "JobRole") -> dict:
        """
        Perform advanced role classification with weighted scoring

        The model reply is parsed as JSON; when the reply as a whole is not a
        JSON object, the span from the first "{" to the last "}" is tried too.
        
        Args:
            candidate_profile (dict): Extracted candidate information
            target_role (JobRole): Role to evaluate against
            
        Returns:
            dict: Advanced role fit analysis including "target_role". Contains
                "raw_response" when no JSON object could be parsed and "error"
                when the LLM is unavailable, the role has no definition or the
                call fails
        """
        if not llm:
            return {"error": "LLM not initialized"}
        
        # Import role definitions from Project 1
        from project1_basic_extraction import ROLE_DEFINITIONS
        try:
            role_req = ROLE_DEFINITIONS[target_role]
        except KeyError:
            # A role without a definition must not abort the analysis of the other roles
            print(f"❌ Error in advanced role classification: no definition for {target_role.value}")
            return {"target_role": target_role.value, "error": f"No role definition for {target_role.value}"}
        
        try:
            print(f"🎯 Analyzing fit for {target_role.value} with advanced metrics...")
            
            # Prepare role requirements summary
            requirements_summary = {
                "role": target_role.value,
                "required_skills": role_req.required_skills,
                "preferred_skills": role_req.preferred_skills,
                "min_experience": role_req.min_experience,
                "education": role_req.education_requirements,
                "soft_skills": role_req.soft_skills
            }
            
            chain = LLMChain(llm=llm, prompt=advanced_classification_prompt)
            raw = chain.run(
                candidate_profile=json.dumps(candidate_profile, indent=2),
                role_requirements=json.dumps(requirements_summary, indent=2),
                industry_context=json.dumps(self.industry_context, indent=2)
            ).strip()
        except Exception as e:
            print(f"❌ Error in advanced role classification: {str(e)}")
            return {"target_role": target_role.value, "error": str(e)}
        
        # Candidate payloads: the full reply first, then the outermost {...} span
        # (models often surround the JSON with prose or code fences)
        candidates = [raw]
        start, end = raw.find("{"), raw.rfind("}")
        if 0 <= start < end:
            candidates.append(raw[start:end + 1])
        
        for payload in candidates:
            try:
                classification = json.loads(payload)
            except json.JSONDecodeError:
                continue
            if isinstance(classification, dict):
                classification["target_role"] = target_role.value
                print(f"✅ Advanced role analysis completed - {classification.get('recommendation', 'Unknown')}")
                return classification
        
        print("⚠️ JSON parsing failed for advanced classification")
        return {"target_role": target_role.value, "raw_response": raw}
    
    def find_best_role_match_weighted(self, role_analyses: dict) -> dict:
        """
        Find the best role match using weighted scoring algorithm
        
        Args:
            role_analyses (dict): All role analysis results
            
        Returns:
            dict: Best matching role with detailed metrics
        """
        best_role = {
            "role": "Unknown", 
            "score": 0, 
            "recommendation": "No match",
            "skill_match": 0,
            "experience_match": 0
        }
        
        for role_name, analysis in role_analyses.items():
            try:
                # Extract various scores for weighted calculation
                if "role_fit_score" in analysis:
                    overall_score = float(analysis.get("role_fit_score", 0))
                    skill_match = float(analysis.get("skill_match_percentage", 0))
                    exp_quality = float(analysis.get("experience_quality_score", 0))
                    
                    # Apply weighted scoring (can be adjusted based on priorities)
                    weighted_score = (
                        overall_score * 0.5 + 
                        skill_match * 0.3 + 
                        exp_quality * 0.2
                    )
                    
                    if weighted_score > best_role["score"]:
                        best_role = {
                            "role": role_name,
                            "score": weighted_score,
                            "recommendation": analysis.get("recommendation", "Unknown"),
                            "skill_match": skill_match,
                            "experience_match": exp_quality,
                            "technical_level": analysis.get("technical_proficiency", "Unknown")
                        }
            except (ValueError, TypeError):
                continue
        
        return best_role
    
    def generate_candidate_visualization(self, candidate_result: dict) -> None:
        """
        Generate visualization charts for candidate analysis

        Draws a horizontal bar chart of role fit scores and a pie chart of
        matching vs. missing skills for the best role, then saves the figure
        as output/visualizations/<name>_analysis.png. Errors are reported on
        stdout rather than raised.
        
        Args:
            candidate_result (dict): Complete candidate analysis
        """
        fig = None
        try:
            # Create figure with multiple subplots
            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
            
            # Chart 1: Role fit comparison
            scored = []
            for role, analysis in candidate_result["role_analyses"].items():
                if "role_fit_score" not in analysis:
                    continue
                try:
                    # Scores may arrive as numbers or as strings such as "85%"
                    score = float(str(analysis["role_fit_score"]).strip().rstrip('%').strip())
                except ValueError:
                    continue
                scored.append((role, score))
            
            # Sort by score for better visualization
            scored.sort(key=lambda pair: pair[1])
            roles = [role for role, _ in scored]
            scores = [score for _, score in scored]
            
            bars = ax1.barh(roles, scores, color='skyblue')
            ax1.set_xlim(0, 100)
            ax1.set_xlabel('Match Score (%)')
            ax1.set_title('Role Fit Analysis')
            
            # Add value labels
            for bar in bars:
                width = bar.get_width()
                label_x_pos = width + 1
                ax1.text(label_x_pos, bar.get_y() + bar.get_height()/2, f'{width:.0f}%',
                        va='center')
            
            # Chart 2: Skill breakdown for best role
            best_role = candidate_result["best_role_match"]["role"]
            # .get(): the "Unknown" placeholder role has no analysis entry
            best_role_analysis = candidate_result["role_analyses"].get(best_role) or {}
            matching = best_role_analysis.get("matching_skills")
            missing = best_role_analysis.get("missing_skills")
            
            # A pie needs real lists and at least one non-zero wedge
            if isinstance(matching, list) and isinstance(missing, list) and (matching or missing):
                ax2.pie([len(matching), len(missing)], 
                        labels=['Matching Skills', 'Missing Skills'],
                        autopct='%1.1f%%',
                        colors=['#66b3ff', '#ff9999'])
                ax2.set_title(f'Skill Analysis for {best_role}')
            else:
                ax2.text(0.5, 0.5, 'No skill data available', ha='center', va='center')
                ax2.axis('off')
            
            plt.tight_layout()
            
            # Save the visualization; a null/missing name falls back to "unknown"
            raw_name = candidate_result["candidate_info"].get("name")
            candidate_name = str(raw_name or "unknown").replace(" ", "_").lower()
            output_dir = "output/visualizations"
            os.makedirs(output_dir, exist_ok=True)
            
            fig_path = os.path.join(output_dir, f"{candidate_name}_analysis.png")
            fig.savefig(fig_path)
            
            print(f"📊 Candidate visualization saved to {fig_path}")
            
        except Exception as e:
            print(f"❌ Error generating visualization: {str(e)}")
        finally:
            # Close this figure on every path so failed runs do not leak figures
            if fig is not None:
                plt.close(fig)
    
    def generate_enhanced_reports(self) -> None:
        """
        Generate enhanced HTML and JSON reports for all processed candidates
        """
        if not self.results:
            print("❌ No results to generate reports")
            return
        
        output_dir = "output/enhanced_reports"
        os.makedirs(output_dir, exist_ok=True)
        
        # Generate individual HTML reports for each candidate
        for result in self.results:
            candidate_name = result["candidate_info"].get("name", "unknown").replace(" ", "_").lower()
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            
            # Save detailed JSON report
            json_path = os.path.join(output_dir, f"{candidate_name}_{timestamp}.json")
            with open(json_path, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2)
            
            # Generate HTML report with visualizations
            html_path = os.path.join(output_dir, f"{candidate_name}_{timestamp}.html")
            self._create_html_report(result, html_path)
            
            print(f"📄 Enhanced reports generated for {candidate_name}")
    
    def _create_html_report(self, result: dict, output_path: str) -> None:
        """
        Create an HTML report for a candidate
        
        Args:
            result (dict): Candidate analysis result
            output_path (str): Path to save the HTML report
        """
        try:
            # Generate visualization for the report
            self.generate_candidate_visualization(result)
            
            # Get candidate info
            candidate = result["candidate_info"]
            best_role = result["best_role_match"]
            
            # Create HTML content
            html_content = f"""
            <!DOCTYPE html>
            <html>
            <head>
                <title>CV Analysis Report: {candidate.get('name', 'Candidate')}</title>
                <style>
                    body {{ font-family: Arial, sans-serif; margin: 40px; line-height: 1.6; }}
                    h1, h2, h3 {{ color: #2c3e50; }}
                    .container {{ max-width: 1000px; margin: 0 auto; }}
                    .header {{ background-color: #3498db; color: white; padding: 20px; border-radius: 5px; }}
                    .section {{ margin: 20px 0; padding: 20px; border: 1px solid #ddd; border-radius: 5px; }}
                    .score-high {{ color: green; font-weight: bold; }}
                    .score-medium {{ color: orange; font-weight: bold; }}
                    .score-low {{ color: red; font-weight: bold; }}
                    table {{ width: 100%; border-collapse: collapse; margin: 20px 0; }}
                    th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
                    th {{ background-color: #f2f2f2; }}
                    .visualization {{ text-align: center; margin: 30px 0; }}
                </style>
            </head>
            <body>
                <div class="container">
                    <div class="header">
                        <h1>CV Analysis Report</h1>
                        <p>Generated on {datetime.now().strftime('%Y-%m-%d %H:%M')}</p>
                    </div>
                    
                    <div class="section">
                        <h2>Candidate Profile</h2>
                        <table>
                            <tr><td><strong>Name</strong></td><td>{candidate.get('name', 'N/A')}</td></tr>
                            <tr><td><strong>Contact</strong></td><td>Email: {candidate.get('email', 'N/A')}<br>Phone: {candidate.get('phone', 'N/A')}</td></tr>
                            <tr><td><strong>Location</strong></td><td>{candidate.get('location', 'N/A')}</td></tr>
                            <tr><td><strong>Experience</strong></td><td>{candidate.get('total_experience_years', 'N/A')} years</td></tr>
                        </table>
                        
                        <h3>Skills</h3>
                        <p>{', '.join(candidate.get('skills', ['No skills listed']))}</p>
                        
                        <h3>Experience</h3>
                        <table>
                            <tr>
                                <th>Company</th>
                                <th>Position</th>
                                <th>Duration</th>
                                <th>Responsibilities</th>
                            </tr>
            """
            
            # Add experience entries
            for exp in candidate.get('experience', []):
                html_content += f"""
                            <tr>
                                <td>{exp.get('company', 'N/A')}</td>
                                <td>{exp.get('position', 'N/A')}</td>
                                <td>{exp.get('duration', 'N/A')}</td>
                                <td>{exp.get('responsibilities', 'N/A')}</td>
                            </tr>
                """
            
            if not candidate.get('experience'):
                html_content += """
                            <tr>
                                <td colspan="4">No experience data available</td>
                            </tr>
                """
            
            html_content += """
                        </table>
                        
                        <h3>Education</h3>
                        <table>
                            <tr>
                                <th>Degree</th>
                                <th>Institution</th>
                                <th>Year</th>
                                <th>Field</th>
                            </tr>
            """
            
            # Add education entries
            for edu in candidate.get('education', []):
                html_content += f"""
                            <tr>
                                <td>{edu.get('degree', 'N/A')}</td>
                                <td>{edu.get('institution', 'N/A')}</td>
                                <td>{edu.get('year', 'N/A')}</td>
                                <td>{edu.get('field', 'N/A')}</td>
                            </tr>
                """
            
            if not candidate.get('education'):
                html_content += """
                            <tr>
                                <td colspan="4">No education data available</td>
                            </tr>
                """
            
            html_content += """
                        </table>
                    </div>
                    
                    <div class="section">
                        <h2>Role Analysis</h2>
                        <h3>Best Role Match</h3>
                        <p>
                            <strong>Role:</strong> {0}<br>
                            <strong>Match Score:</strong> <span class="{1}">{2}%</span><br>
                            <strong>Recommendation:</strong> {3}<br>
                            <strong>Technical Level:</strong> {4}
                        </p>
                        
                        <h3>Role Comparison</h3>
                        <div class="visualization">
                            <img src="../visualizations/{5}_analysis.png" alt="Candidate Analysis Chart" style="max-width: 100%;">
                        </div>
                    </div>
                    
                    <div class="section">
                        <h2>Recommendation</h2>
            """.format(
                best_role['role'],
                'score-high' if best_role['score'] >= 75 else 'score-medium' if best_role['score'] >= 50 else 'score-low',
                round(best_role['score']),
                best_role['recommendation'],
                best_role.get('technical_level', 'N/A'),
                candidate.get('name', 'unknown').replace(" ", "_").lower()
            )
            
            # Get the best role analysis details
            best_role_name = best_role['role']
            best_role_analysis = result["role_analyses"][best_role_name]
            
            html_content += f"""
                        <h3>Strengths</h3>
                        <ul>
            """
            
            # Add strengths
            for strength in best_role_analysis.get('strengths', []):
                html_content += f"<li>{strength}</li>\n"
            
            if not best_role_analysis.get('strengths'):
                html_content += "<li>No specific strengths identified</li>\n"
            
            html_content += """
                        </ul>
                        
                        <h3>Areas for Development</h3>
                        <ul>
            """
            
            # Add growth areas
            for area in best_role_analysis.get('growth_areas', best_role_analysis.get('concerns', [])):
                html_content += f"<li>{area}</li>\n"
            
            if not best_role_analysis.get('growth_areas') and not best_role_analysis.get('concerns'):
                html_content += "<li>No specific development areas identified</li>\n"
            
            html_content += """
                        </ul>
                        
                        <h3>Interview Focus Areas</h3>
                        <ul>
            """
            
            # Add interview focus areas
            for area in best_role_analysis.get('interview_focus_areas', []):
                html_content += f"<li>{area}</li>\n"
            
            if not best_role_analysis.get('interview_focus_areas'):
                html_content += "<li>No specific interview focus areas suggested</li>\n"
            
            html_content += """
                        </ul>
                        
                        <h3>Justification</h3>
                        <p>
                            {0}
                        </p>
                    </div>
                    
                    <div class="section">
                        <h3>Processing Information</h3>
                        <p>
                            <strong>Source File:</strong> {1}<br>
                            <strong>Format Type:</strong> {2}<br>
                            <strong>Processed At:</strong> {3}
                        </p>
                    </div>
                </div>
            </body>
            </html>
            """.format(
                best_role_analysis.get('justification', 'No justification provided.'),
                result['source_file'],
                result['format_type'].capitalize(),
                result['processed_at']
            )
            
            # Write HTML to file
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(html_content)
            
        except Exception as e:
            print(f"❌ Error creating HTML report: {str(e)}")

# Initialize the enhanced analyzer around the module-level document processor.
# process_all_cv_files() below uses this instance to analyze CVs and write reports.
project2_analyzer = Project2_EnhancedAnalyzer(doc_processor)

## Process All CV Files 🚀

Now let's process all available CV files with our enhanced multi-format analyzer:

In [None]:
# Process all available CV files with enhanced analyzer

def process_all_cv_files():
    """
    Process all CV files in the data/cvs folder with the enhanced analyzer.

    Relies on names defined elsewhere in the notebook: ``openai_api_key``,
    ``doc_processor``, ``DocumentFormat`` and ``project2_analyzer``.

    Steps:
        1. Abort early when no OpenAI API key is configured.
        2. Collect the regular files in data/cvs whose format is recognised
           by ``doc_processor.detect_format``.
        3. Run ``project2_analyzer.process_single_cv`` on each file and print
           either a short summary or the error reported for that file.
        4. Call ``project2_analyzer.generate_enhanced_reports()``.

    Returns:
        None. Everything is reported through printed output and the
        generated report files.
    """
    print("\n🚀 Starting Enhanced Multi-Format CV Analysis")
    print("=" * 60)

    # Check if API key is available
    if not openai_api_key:
        print("❌ OpenAI API key not configured")
        print("Please set up your config.txt file with OPENAI_API_KEY")
        return

    # Get all CV files from data folder
    cv_dir = os.path.join('data', 'cvs')
    cv_files = []

    # isdir (not exists) so a stray file named "cvs" cannot reach os.listdir
    if os.path.isdir(cv_dir):
        # os.listdir returns entries in arbitrary order; sort so runs are
        # reproducible across machines and file systems.
        for entry in sorted(os.listdir(cv_dir)):
            # Sub-directories are never CVs, whatever their name looks like
            if not os.path.isfile(os.path.join(cv_dir, entry)):
                continue
            # Only include supported formats
            if doc_processor.detect_format(entry) != DocumentFormat.UNKNOWN:
                cv_files.append(entry)

    if not cv_files:
        print("❌ No CV files found in data/cvs/")
        print("Please add CV files in supported formats to continue.")
        return

    # Display processing information
    print(f"📄 Processing {len(cv_files)} CV files with enhanced multi-format analyzer")

    # Process each CV, remembering the ones that could not be analyzed
    failed_files = []
    for cv_file in cv_files:
        cv_path = os.path.join(cv_dir, cv_file)
        result = project2_analyzer.process_single_cv(cv_path)

        if "error" in result:
            # Surface the failure instead of silently dropping the file
            print(f"\n❌ Could not analyze {cv_file}: {result['error']}")
            failed_files.append(cv_file)
            continue

        best_match = result['best_role_match']
        print(f"\n📊 Enhanced Results for {cv_file}:")
        print(f"- Name: {result['candidate_info'].get('name', 'Not found')}")
        print(f"- Best Role: {best_match['role']} ({best_match['score']:.1f}% fit)")
        print(f"- Technical Level: {best_match.get('technical_level', 'Unknown')}")
        print(f"- Format: {result['format_type'].capitalize()}")

    # Generate enhanced reports
    project2_analyzer.generate_enhanced_reports()

    print(f"\n✅ Enhanced CV Analysis completed! Processed {project2_analyzer.processed_count} CVs")
    if failed_files:
        print(f"   ⚠️ {len(failed_files)} file(s) could not be analyzed: {', '.join(failed_files)}")
    print("   Enhanced HTML and JSON reports have been generated in output/enhanced_reports")

# Define the method for processing a single CV (needed for the class)
def process_single_cv(self, cv_file_path: str) -> dict:
    """
    Analyze one CV by delegating to this object's ``process_cv`` method.

    This is a thin alias: the path is forwarded unchanged and whatever
    ``process_cv`` returns is handed straight back to the caller.

    Args:
        cv_file_path (str): Path to CV file

    Returns:
        dict: Complete candidate analysis, exactly as produced by
        ``self.process_cv``
    """
    analysis = self.process_cv(cv_file_path)
    return analysis

# Add the method to the Project2_EnhancedAnalyzer class. Assigning on the class
# (not an instance) makes it available on instances that already exist, such as
# the `project2_analyzer` created earlier.
Project2_EnhancedAnalyzer.process_single_cv = process_single_cv

# Run the analysis on all CV files
# NOTE(review): `all_cv_files` is not defined in this cell; presumably an
# earlier cell populates it with the CV files it discovered — confirm.
# process_all_cv_files() performs its own scan of data/cvs and does not
# read `all_cv_files`.
if all_cv_files:  # Only run if CV files were found
    process_all_cv_files()

## 🎉 Project 2 Conclusion

In this advanced project, you've built a sophisticated CV analysis system that:

✅ **Processes multiple document formats** (PDF, Word, Excel, Text)  
✅ **Extracts enriched candidate information** with greater detail  
✅ **Applies advanced weighted scoring** for more accurate role matching  
✅ **Generates visual analytics** to aid decision making  
✅ **Produces professional HTML reports** for hiring managers  

This enhanced system demonstrates how to scale your HR automation from Project 1's basic functionality to an enterprise-grade solution that can handle the diverse document formats and complex analysis needs of modern HR departments.

### Complete CV Analyzer System

You now have a complete CV Analyzer system with:

1. **Project 1**: Basic text-based CV analysis with role matching  
2. **Project 2**: Advanced multi-format document processing with enhanced reporting  

Together, these projects form a comprehensive HR automation solution that significantly reduces the time and effort required for candidate evaluation while increasing consistency and accuracy.

### Next Steps

To expand this system further:

1. **Add a web interface** for easier interaction
2. **Implement a candidate database** for long-term storage
3. **Add collaborative features** for team-based hiring
4. **Incorporate interview scheduling** and feedback collection
5. **Develop talent pool analytics** for strategic HR planning