# Resume Detail Extraction Tool
### This tool extracts the important details from multiple resumes at a time

## Import Libraries

In [None]:
import pdfplumber
import pandas as pd
import os
import re

## Extract Phone numbers

In [None]:
def extract_phone(text):
    """Extract the first phone number found in ``text``.

    Patterns are tried in priority order, so an earlier pattern wins even
    when a later one would match closer to the start of the text:

    1. ``+91`` followed by ten digits (optional ``-``/whitespace in between).
    2. A standalone run of exactly ten digits.
    3. Ten digits grouped 3-3-4 with optional ``-``/whitespace separators.

    Args:
        text: Raw resume text. Any non-string value yields ``"Not Found"``.

    Returns:
        The matched number with everything except digits and ``+`` removed,
        or ``"Not Found"`` when no pattern matches.
    """
    if not isinstance(text, str):
        return "Not Found"

    # The (?<!\d) / (?!\d) guards stop a pattern from carving ten digits out
    # of a longer digit run (IDs, account numbers), which is not a phone.
    patterns = (
        r"\+91[-\s]?\d{10}(?!\d)",                     # +91 followed by 10 digits
        r"\b\d{10}\b",                                 # Plain 10 digits
        r"(?<!\d)\d{3}[-\s]?\d{3}[-\s]?\d{4}(?!\d)",   # XXX-XXX-XXXX format
    )

    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            # Keep only digits and the leading '+' of the country code.
            return re.sub(r"[^\d+]", "", match.group())

    return "Not Found"

## Extract LinkedIn URLs

In [None]:
def extract_linkedin(text):
    """Extract the first LinkedIn profile URL found in ``text``.

    The text is lower-cased before matching, so the returned URL is always
    lower case. The scheme (``http://``/``https://``) and ``www.`` prefix are
    optional and are returned only when present in the text.

    Args:
        text: Raw resume text. Any non-string value yields ``"Not Found"``.

    Returns:
        The matched ``linkedin.com/in/<profile>`` URL (with an optional
        trailing ``/``), or ``"Not Found"`` when there is none.
    """
    if not isinstance(text, str):
        return "Not Found"

    # A match can never span a newline, so one search over the whole text
    # finds the same URL as scanning line by line. The profile id must end in
    # something other than '.', so a sentence-ending full stop right after the
    # URL is not captured as part of it.
    match = re.search(
        r"(?:https?://)?(?:www\.)?linkedin\.com/in/[a-z0-9\-_.]*[a-z0-9\-_]/?",
        text.lower(),
    )
    return match.group() if match else "Not Found"

## Extract Email Addresses

In [None]:
def extract_email(text):
    """Extract the first email address found in ``text``.

    Args:
        text: Raw resume text. Any non-string value yields ``"Not Found"``.

    Returns:
        The first matching address, lower-cased, or ``"Not Found"``.
    """
    if not isinstance(text, str):
        return "Not Found"

    # The top-level-domain class is [A-Za-z]; a '|' inside the brackets would
    # be a literal pipe and would swallow the "email|phone" separators that
    # are common in resume headers.
    match = re.search(
        r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
        text,
    )
    return match.group().lower() if match else "Not Found"

## Extract Names

In [None]:
def extract_name(text):
    """Guess the candidate's name from the first usable line of ``text``.

    Each line is stripped, the words "resume", "cv" and "curriculum vitae"
    are removed (case-insensitively), punctuation is dropped and whitespace
    is collapsed. The first line that still has content afterwards is
    returned.

    Args:
        text: Raw resume text. Any non-string value yields ``"Not Found"``.

    Returns:
        The cleaned line, or ``"Not Found"`` when no line has content left.
    """
    if not isinstance(text, str):
        return "Not Found"

    for raw_line in text.split('\n'):
        candidate = raw_line.strip()
        if not candidate:
            continue

        # Drop document-title words so a "Resume" heading is not taken as a name.
        candidate = re.sub(
            r'\b(resume|cv|curriculum vitae)\b', '', candidate, flags=re.IGNORECASE
        ).strip()
        # Strip punctuation, then split on whitespace to collapse extra spaces.
        words = re.sub(r'[^\w\s]', '', candidate).split()
        if words:
            return ' '.join(words)

    return "Not Found"

## Extract all the necessary details

In [None]:
def _collect_section(lines, start_pattern, stop_pattern, keep_line):
    """Return the stripped lines of one resume section accepted by ``keep_line``.

    A line matching ``start_pattern`` opens the section and a line matching
    ``stop_pattern`` closes it; both kinds of line are skipped themselves.
    Both patterns are matched case-insensitively anywhere in the line, and a
    section may open again after it was closed.
    """
    collected = []
    in_section = False
    for raw_line in lines:
        line = raw_line.strip()
        if re.search(start_pattern, line, re.IGNORECASE):
            in_section = True
            continue
        if not (in_section and line):
            continue
        if re.search(stop_pattern, line, re.IGNORECASE):
            in_section = False
            continue
        if keep_line(line):
            collected.append(line)
    return collected


def extract_resume_details(pdf_path):
    """Extract contact details and key sections from a PDF resume.

    The text of every page is concatenated and then scanned with keyword
    heuristics. Section contents are joined with ``'; '``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict with the keys ``Name``, ``Phone``, ``Email``, ``LinkedIn``,
        ``Education``, ``Work Experience``, ``Skills``, ``Certifications`` and
        ``File Name``. Every field that could not be determined is
        ``"Not Found"``. Errors while reading or parsing are printed and the
        fields gathered so far are returned; the function does not raise for
        them.
    """
    details = {
        "Name": "Not Found",
        "Phone": "Not Found",
        "Email": "Not Found",
        "LinkedIn": "Not Found",
        "Education": "Not Found",
        "Work Experience": "Not Found",
        "Skills": "Not Found",
        "Certifications": "Not Found",
        "File Name": os.path.basename(pdf_path)
    }

    education_keywords = (
        'university', 'college', 'institute', 'school', 'degree',
        'b.tech', 'b.e', 'm.tech', 'diploma',
    )

    def joined(section_lines):
        # Empty sections keep the "Not Found" placeholder.
        return '; '.join(section_lines) if section_lines else "Not Found"

    try:
        with pdfplumber.open(pdf_path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]

        # Pages without extractable text are skipped.
        text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)

        if not text.strip():
            print(f"Warning: No text could be extracted from {pdf_path}")
            return details

        # Extract basic information
        details["Name"] = extract_name(text)
        details["Phone"] = extract_phone(text)
        details["Email"] = extract_email(text)
        details["LinkedIn"] = extract_linkedin(text)

        lines = text.split('\n')

        # NOTE(review): start patterns match any line containing the keyword,
        # not only headings. A line such as "Bachelor's degree, X University"
        # therefore counts as an education heading and is itself never
        # collected; the same holds for "certificate"/"certification" lines
        # in the certifications section.
        details["Education"] = joined(_collect_section(
            lines,
            r'\b(education|qualification|academic|degree)\b',
            r'\b(experience|work|employment|skills|projects)\b',
            lambda line: any(keyword in line.lower() for keyword in education_keywords),
        ))

        # Only lines carrying a year range ("2019 - 2021", "2020 – present") are kept.
        details["Work Experience"] = joined(_collect_section(
            lines,
            r'\b(experience|work history|employment)\b',
            r'\b(education|skills|projects|certificates)\b',
            lambda line: re.search(
                r'\d{4}\s*[-–]\s*\d{4}|\d{4}\s*[-–]\s*present', line.lower()
            ),
        ))

        # Skip lines with only special characters or numbers
        details["Skills"] = joined(_collect_section(
            lines,
            r'\b(technical skills|skills|technical expertise)\b',
            r'\b(experience|education|certificates|projects)\b',
            lambda line: not re.match(r'^[\W\d]+$', line),
        ))

        details["Certifications"] = joined(_collect_section(
            lines,
            r'\b(certifications?|certificates?)\b',
            r'\b(education|experience|skills|projects)\b',
            lambda line: re.search(
                r'\b(certified|certification|certificate|udemy|coursera)\b',
                line,
                re.IGNORECASE,
            ),
        ))

    except Exception as e:
        # Best effort: one unreadable resume must not abort a batch run.
        print(f"Error processing file {pdf_path}: {e}")

    return details

## Clean the extracted details

In [None]:
def sanitize_text(text):
    """Clean a value so it can be written to an Excel cell.

    Pipes become ``;``, whitespace control characters become spaces, the
    remaining control characters are removed, runs of whitespace collapse to
    one space and the result is cut to 32767 characters.

    Args:
        text: Value to clean. Non-string values are returned unchanged.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # Replace problematic characters
    text = text.replace('|', ';')

    # Tabs, newlines, vertical tabs and form feeds separate words, so they
    # become spaces; deleting them would glue neighbouring words together.
    text = re.sub(r'[\t\n\r\x0b\x0c]', ' ', text)

    # Remove the remaining control characters
    text = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', text)

    # Replace multiple spaces with single space
    text = ' '.join(text.split())

    # Limit text length for Excel (slicing is a no-op on shorter strings)
    return text[:32767]

## Process all the PDFs in the input folder and save the results

In [None]:
def process_resumes(folder_path, output_file):
    """Extract details from every PDF in ``folder_path`` and save them.

    One row per resume is written to ``output_file`` as an Excel workbook,
    trying the ``xlsxwriter`` engine first and ``openpyxl`` second. If both
    fail, the rows are written as CSV next to the requested file (same name,
    ``.csv`` extension).

    Args:
        folder_path: Directory that is searched (non-recursively) for files
            ending in ``.pdf``, compared case-insensitively.
        output_file: Path of the Excel file to create. Its parent directory
            is created when missing.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    all_details = []

    # Create output directory if it doesn't exist. os.path.dirname() returns
    # '' for a bare file name, and os.makedirs('') would raise.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Sorted so that the row order does not depend on the file system.
    pdf_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith('.pdf'))

    if not pdf_files:
        print(f"No PDF files found in {folder_path}")
        return

    for file_name in pdf_files:
        pdf_path = os.path.join(folder_path, file_name)
        try:
            print(f"Processing: {file_name}")
            details = extract_resume_details(pdf_path)
            all_details.append({k: sanitize_text(v) for k, v in details.items()})
            print(f"Successfully processed: {file_name}")
        except Exception as e:
            # Best effort: keep going with the remaining resumes.
            print(f"Error processing {file_name}: {e}")

    if not all_details:
        print("No resumes were processed successfully.")
        return

    # Convert to DataFrame
    df = pd.DataFrame(all_details)

    # Try each Excel engine in turn. Failures are broad on purpose (a missing
    # engine, a locked file, ...) but are reported instead of being hidden.
    for engine in ('xlsxwriter', 'openpyxl'):
        try:
            df.to_excel(output_file, index=False, engine=engine)
        except Exception as e:
            print(f"Could not save with Excel engine '{engine}': {e}")
        else:
            print(f"Successfully saved {len(df)} resume details to: {output_file}")
            return

    print("Error saving to Excel. Attempting to save as CSV...")
    # splitext only touches the file name's extension, so dots in directory
    # names are left alone.
    csv_output = os.path.splitext(output_file)[0] + '.csv'
    df.to_csv(csv_output, index=False, encoding='utf-8-sig')
    print(f"Saved as CSV instead at: {csv_output}")

## Run the tool by specifying the input folder and the output file location

In [None]:
if __name__ == "__main__":
    # Folder that holds the PDF resumes, and the Excel workbook to produce.
    resumes_folder, output_excel = (
        "E:/Data/Resume_gpt",
        "E:/Data/xl/resumes1_details.xlsx",
    )

    process_resumes(folder_path=resumes_folder, output_file=output_excel)