In [14]:
# Resume data extraction using Gemini AI model
import google.generativeai as genai
import PyPDF2
import docx
import json
import os
import re
from dotenv import load_dotenv

In [15]:
# Read config/.env into the process environment (the file is expected to define GEMINI_API_KEY)
load_dotenv(dotenv_path="config/.env")

# Hand the key from the environment to the Gemini client; it is never hard-coded in the notebook.
# The lookup yields None when GEMINI_API_KEY is not set.
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))

In [16]:
# Function to extract text from the resume DOCX
def extract_text_from_docx(docx_file_path):
    """Return the text of every paragraph of a DOCX file, one paragraph per line.

    Args:
        docx_file_path: Location of the .docx file; handed to ``docx.Document`` as-is.

    Returns:
        One string in which each paragraph's text is followed by a newline
        (an empty string when the document has no paragraphs).
    """
    document = docx.Document(docx_file_path)
    # Every paragraph is newline-terminated, including the last one
    return "".join(para.text + "\n" for para in document.paragraphs)

# Function to extract text from the resume PDF
def extract_text_from_pdf(pdf_file_path):
    """Return the extracted text of every page of a PDF file, one page per line block.

    Args:
        pdf_file_path: Location of the PDF; opened in binary mode.

    Returns:
        One string in which each page's extracted text is followed by a newline
        (an empty string when the reader reports no pages).
    """
    with open(pdf_file_path, 'rb') as pdf_stream:
        pdf_reader = PyPDF2.PdfReader(pdf_stream)
        # Collect the page texts while the file handle is still open
        page_texts = [page.extract_text() + "\n" for page in pdf_reader.pages]
    return "".join(page_texts)

In [17]:
# Use Gemini AI to analyze and extract structured information
def analyze_resume_text(resume_text, model_name="gemini-1.5-flash"):
    """Ask a Gemini model to extract the key fields from raw resume text.

    Args:
        resume_text: Plain text of the resume; it is embedded verbatim at the
            end of the prompt.
        model_name: Name of the Gemini model to query. Defaults to
            "gemini-1.5-flash" (the model this notebook has always used), so
            existing calls behave as before while callers can switch models
            without editing this function.

    Returns:
        The ``text`` attribute of the model's response, i.e. the raw,
        unparsed answer.
    """
    prompt = f"""
    Extract the following information from the given resume:
    1. Name
    2. Contact Number
    3. Email Address
    4. Education (Highest degree)
    5. Current Company Name
    6. Current Location - must be only the current location - one location like the city
    7. Primary Skills
    8. Secondary Skills
    9. Total Experience (in years)
    10. Relevant Experience in Primary Skills (in years)
    11. Relevant Experience in Secondary Skills (in years)
    
    If any information is not available, set it as 'NA'.
    
    Resume Text: {resume_text}
    """

    # Send the prompt to the selected Gemini model
    model = genai.GenerativeModel(model_name)
    response = model.generate_content(prompt)

    return response.text  # Raw model output; cleaning/parsing happens elsewhere

def clean_gemini_output(response_text):
    """
    Turn the "key: value" lines of a Gemini answer into a formatted JSON object string.

    Each line containing a colon is split at the FIRST colon into key and value.
    Asterisks and double quotes are removed from both parts. A list marker at the
    start of the key ("-" / bullet characters and/or numbering such as "3.") is
    dropped; digits elsewhere in the key (e.g. "Python 3.8 Experience") are kept.
    Trailing commas, and any whitespace left in front of them, are removed from
    the value.

    Lines are skipped when they have no colon, when the cleaned key is empty,
    when the key is one of the known introductory sentences, or when the value
    is exactly "NA". A key whose value is empty is stored with the value "NA".

    Args:
        response_text: Raw text returned by the model.

    Returns:
        A JSON string (indent=4) of the collected key/value pairs; "{}" when
        nothing usable was found. If a key occurs more than once, the last
        occurrence wins.
    """
    # Introductory sentences the model may emit before the actual fields
    preamble_keys = (
        "Here's the extracted information from the resume",
        "Here is the extracted information from the resume",
        "Here is the extracted information from the provided resume",
        "Here's the extracted information from the provided resume",
    )

    structured_info = {}

    for line in response_text.split("\n"):
        if ":" not in line:
            continue

        key, value = line.split(":", 1)

        # Remove Markdown emphasis and quotes first so a leading list marker becomes visible
        key_clean = re.sub(r'[*"]', '', key).strip()
        # Strip list markers only at the START of the key; an unanchored pattern would
        # also eat digits inside the key ("Python 3.8 Experience" -> "Python 8 Experience")
        key_clean = re.sub(r'^(?:[-\u2022]\s*)*(?:\d+\.\s*)?', '', key_clean).strip()

        value_clean = re.sub(r'[*"]', '', value).strip()
        # Drop trailing commas and the whitespace that may precede them ("John ," -> "John")
        value_clean = value_clean.rstrip(",").rstrip()

        # Skip lines without a usable key, preamble sentences, and explicit "NA" answers
        if not key_clean or key_clean in preamble_keys or value_clean == "NA":
            continue

        structured_info[key_clean] = value_clean if value_clean else "NA"

    # Return the cleaned info as a JSON object string
    return json.dumps(structured_info, indent=4)

In [18]:
# Parse and store output as JSON
def parse_response_to_json(response_text):
    """Convert raw model output into a JSON string by delegating to ``clean_gemini_output``.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        Whatever ``clean_gemini_output`` returns for ``response_text``.
    """
    cleaned_json = clean_gemini_output(response_text)
    return cleaned_json

# Main function to run the model on a given resume (PDF or DOCX)
def process_resume(file_path):
    """Extract, analyze and clean a single resume.

    The file type is chosen from the extension, compared case-insensitively so
    that names such as "CV.PDF" or "cv.Docx" are accepted; this matches the
    case-insensitive directory filter in ``process_all_resumes``.

    Args:
        file_path: Path of the resume as ``str`` or ``os.PathLike``.

    Returns:
        The value returned by ``parse_response_to_json`` for the model's answer.

    Raises:
        ValueError: If the extension is neither .pdf nor .docx.
    """
    # os.fspath lets pathlib.Path objects through; lower() makes the check case-insensitive
    normalized_path = os.fspath(file_path).lower()

    if normalized_path.endswith('.pdf'):
        resume_text = extract_text_from_pdf(file_path)
    elif normalized_path.endswith('.docx'):
        resume_text = extract_text_from_docx(file_path)
    else:
        raise ValueError("Unsupported file format. Please provide a PDF or DOCX file.")

    analysis_result = analyze_resume_text(resume_text)
    resume_json = parse_response_to_json(analysis_result)

    return resume_json

In [19]:
# Function to process a batch of resumes
def process_batch(batch_files, results, batch_number):
    """Process one batch of resumes and record each outcome in ``results``.

    Progress is printed for every file. A failure while processing a file is
    caught, printed, and stored as ``{"Error": <message>}`` so the remaining
    files of the batch are still handled.

    Args:
        batch_files: Sequence of resume paths belonging to this batch.
        results: Dict updated in place; keys are the files' base names.
        batch_number: Number of the batch, used only in the progress messages.

    Returns:
        None. The outcomes are written into ``results``.
    """
    print(f"\nProcessing batch {batch_number} with {len(batch_files)} resumes...\n")

    for idx, file_path in enumerate(batch_files, start=1):
        filename = os.path.basename(file_path)
        print(f"Batch {batch_number} - Processing ({idx}/{len(batch_files)}): {filename}")

        try:
            # process_resume returns a JSON string; decode it into a dictionary
            outcome = json.loads(process_resume(file_path))
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            outcome = {"Error": str(e)}

        results[filename] = outcome

In [20]:
#Function to process multiple resumes with a limit of 20 files and in batches
def process_all_resumes(input_path, batch_size=5, max_files=20):
    """Process a single resume file or every resume found in a directory.

    Args:
        input_path: Path (``str`` or ``os.PathLike``) of one .pdf/.docx file or of
            a directory containing such files. Extensions are matched
            case-insensitively in both cases.
        batch_size: Number of resumes handed to ``process_batch`` at a time when
            more than one file has to be processed.
        max_files: Maximum number of resumes accepted from a directory
            (default 20, the limit this notebook has always enforced).

    Returns:
        Dict mapping each file's base name to its extracted fields, or to
        ``{"Error": <message>}`` when processing that file failed. Empty when a
        directory contains no resumes.

    Raises:
        ValueError: If the path is neither a file nor a directory, a single file
            has an unsupported extension, a directory holds more than
            ``max_files`` resumes, or ``batch_size`` is smaller than 1 when
            batching is needed.
    """
    supported_extensions = ('.pdf', '.docx')
    path_str = os.fspath(input_path)

    # Determine if the input path is a file or a directory
    if os.path.isfile(path_str):
        # Single file input: same case-insensitive extension rule as the directory branch
        if not path_str.lower().endswith(supported_extensions):
            raise ValueError("Unsupported file format. Please provide a PDF or DOCX file.")
        files_to_process = [path_str]
    elif os.path.isdir(path_str):
        # os.listdir gives no ordering guarantee; sort case-insensitively (exact name as
        # tie-breaker) so batch contents and result order are the same on every run
        entries = sorted(os.listdir(path_str), key=lambda name: (name.lower(), name))
        files_to_process = [
            os.path.join(path_str, name)
            for name in entries
            if name.lower().endswith(supported_extensions)
        ]
        # A single file can never exceed the limit, so the check is only needed here
        if len(files_to_process) > max_files:
            raise ValueError(f"Too many files in the folder. The maximum allowed is {max_files} resumes.")
    else:
        raise ValueError("Invalid input path. Please provide a valid file or directory path.")

    results = {}

    if len(files_to_process) == 1:
        # Single file processing (also used for a directory holding exactly one resume)
        filename = os.path.basename(files_to_process[0])
        print(f"Processing single file: {filename}")
        try:
            extracted_info_json = process_resume(files_to_process[0])
            results[filename] = json.loads(extracted_info_json)  # Convert JSON string to dictionary
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            results[filename] = {"Error": str(e)}
        return results

    # Multiple files: batch_size is only used on this path, so it is validated here.
    # 0 would otherwise fail with ZeroDivisionError and a negative value would
    # silently process nothing.
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1.")

    # Walk the list in steps of batch_size; the final batch may be shorter
    batch_starts = range(0, len(files_to_process), batch_size)
    for batch_number, start_idx in enumerate(batch_starts, start=1):
        batch_files = files_to_process[start_idx:start_idx + batch_size]
        process_batch(batch_files, results, batch_number)

    return results

In [21]:
# Driver cell: choose the input, run the extraction pipeline, and save the results as JSON.
# Set 'input_path' to either a single file or a directory containing multiple resumes.
# input_path = "Preetha_S_CV.pdf"  # Single file
# input_path = "resumes"           # Directory with multiple files
input_path = "resumes"
batch_size = 5          # Number of resumes passed to each batch
try:
    results = process_all_resumes(input_path, batch_size=batch_size)
    # Output the results to a JSON file for persistence
    output_file = "extracted_resume_data.json"
    with open(output_file, 'w') as f:
        json.dump(results, f, indent=4)
    print(f"\nAll resumes have been processed. Results saved to {output_file}.")
except Exception as e:
    # Any failure in the steps above (including writing the file) is printed, not re-raised
    print(f"An error occurred: {e}")



Processing batch 1 with 5 resumes...

Batch 1 - Processing (1/5): QuodeWorks1.pdf
Batch 1 - Processing (2/5): QuodeWorks11.pdf
Batch 1 - Processing (3/5): QuodeWorks12.pdf
Batch 1 - Processing (4/5): Quodeworks13.pdf
Batch 1 - Processing (5/5): Quodeworks14.docx

Processing batch 2 with 5 resumes...

Batch 2 - Processing (1/5): Quodeworks15.pdf
Batch 2 - Processing (2/5): Quodeworks16.docx
Batch 2 - Processing (3/5): QuodeWorks17.pdf
Batch 2 - Processing (4/5): QuodeWorks18.pdf
Batch 2 - Processing (5/5): QuodeWorks19.pdf

Processing batch 3 with 5 resumes...

Batch 3 - Processing (1/5): Quodeworks2.docx
Batch 3 - Processing (2/5): Quodeworks20.pdf
Batch 3 - Processing (3/5): QuodeWorks21.pdf
Batch 3 - Processing (4/5): Quodeworks3.pdf
Batch 3 - Processing (5/5): Quodeworks4.pdf

Processing batch 4 with 5 resumes...

Batch 4 - Processing (1/5): Quodeworks5.pdf
Batch 4 - Processing (2/5): Quodeworks6.pdf
Batch 4 - Processing (3/5): Quodeworks7.pdf
Batch 4 - Processing (4/5): QuodeWorks