In [2]:
!apt-get update
# Install required dependencies
!apt-get install -y poppler-utils tesseract-ocr
!pip install pdf2image pytesseract pandas google-generativeai openpyxl
!pip install pypdf2

Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:9 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [62.5 kB]
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,560 kB]
Get:13 https://developer.download.nvidia.com/compute/cuda/re

In [13]:
# Import necessary libraries
import os
from pdf2image import convert_from_path
import pytesseract
import pandas as pd
from google.colab import drive
from google.colab import userdata
import google.generativeai as genai
from PyPDF2 import PdfReader
from concurrent.futures import ThreadPoolExecutor

# Configure Tesseract and Poppler
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
os.environ["PATH"] += os.pathsep + "/usr/bin/"

# Mount Google Drive
drive.mount('/content/drive')

# Set up Generative AI API key
# Store your API KEY in Secrets section and get its access
API_KEY = userdata.get('API_KEY')
if API_KEY:
    # Initialize the Gemini API with the retrieved API key
    genai.configure(api_key=API_KEY)
else:
    print("API_KEY not found. Please ensure it is stored correctly in the Secrets tab.")
#genai.configure(api_key=API_KEY)

# Configure the Generative AI model
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 40,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    generation_config=generation_config,
)

# Function to check if a PDF is text-based
def is_text_pdf(pdf_path):
    """Check if the PDF is text-based or image-based."""
    try:
        reader = PdfReader(pdf_path)
        first_page_text = reader.pages[0].extract_text()
        return bool(first_page_text)  # Return True if text is found
    except Exception as e:
        print(f"Error checking PDF text: {e}")
        return False

# Function to convert PDF to images
def pdf_to_images(pdf_path):
    """Convert PDF to images using pdf2image."""
    try:
        images = convert_from_path(pdf_path, dpi=100)  # Reduced DPI for faster conversion
        print(f"Successfully converted {len(images)} pages from {pdf_path}.")
        return images
    except Exception as e:
        print(f"Error converting PDF {pdf_path}: {e}")
        return []

# Function to extract text from images using OCR
custom_oem_psm_config = r'--oem 3 --psm 6'  # Use OEM 3 (both standard and LSTM OCR) and PSM 6 (single block of text)
def extract_text_from_images(images):
    """Extract text from images using Tesseract OCR with optimizations."""
    text = ""
    for image in images:
        text += pytesseract.image_to_string(image, config=custom_oem_psm_config)
    return text

# Function to extract text directly from PDF if it's text-based, else perform OCR
def extract_text_from_pdf(pdf_path):
    """Extract text directly from PDF if it's text-based, else perform OCR."""
    if is_text_pdf(pdf_path):
        # Directly extract text from the PDF (no OCR)
        with open(pdf_path, "rb") as f:
            reader = PdfReader(f)
            return "\n".join([page.extract_text() for page in reader.pages])
    else:
        # Perform OCR if the PDF is image-based
        images = pdf_to_images(pdf_path)
        return extract_text_from_images(images)

# Function to generate insights using Generative AI
def generate_insights(resume_text):
    """Generate insights from resume text using Generative AI."""
    try:
        # Start a chat session
        chat_session = model.start_chat(history=[])
        # Define the prompt
        prompt = f"""
Extract and organize the following details from the given resume text into a structured format. Use the scoring mechanism described below for **Gen AI Experience Score** and **AI/ML Experience Score**:

**Scoring Mechanism**:
- **1 (Exposed):** Candidate has theoretical knowledge but limited or no hands-on experience.
- **2 (Hands-on):** Candidate has practical experience through projects, internships, or related work but has not explored advanced concepts.
- **3 (Advanced):** Candidate has expertise in advanced areas such as Agentic RAG, Evals, or creating innovative AI/ML solutions.

Provide the following 13 fields for the resume in a structured format. If any detail is unavailable, return "Not Mentioned" for that field.

1. **Name**: The candidate's full name.
2. **Contact Details**: Email address and phone number.
3. **University**: Name of the university or educational institution.
4. **Year of Study**: Year or range of years during which the candidate pursued education.
5. **Course**: Name of the course or program studied.
6. **Discipline**: Area or field of study (e.g., Computer Science, Mechanical Engineering).
7. **CGPA/Percentage**: Academic performance score or grade.
8. **Key Skills**: A list of the candidate's skills, separated by commas.
9. **Certifications**: A list of certifications obtained, separated by commas.
10. **Internships**: Details of internships, including company names and roles.
11. **Projects**: Name and a 10-word description of each project.
12. **Gen AI Experience Score**: Assign a score (1-3) based on the above scoring mechanism.
13. **AI/ML Experience Score**: Assign a score (1-3) based on the above scoring mechanism.
Resume Text: {resume_text}

Output the result in a clear and structured format, with each field on a new line, and include the field name followed by a colon and the corresponding value (e.g., Name: John Doe). Ensure scores are logical and aligned with the scoring mechanism.
"""
        # Send the message
        response = chat_session.send_message(prompt)
        return response.text if response else None
    except Exception as e:
        print(f"Error generating insights: {e}")
        return None

# Function to parse AI-generated response into structured fields
def parse_generated_response(response_text):
    """Parse the AI-generated response into structured fields."""
    fields = {}
    try:
        lines = response_text.split("\n")
        for line in lines:
            if ":" in line:
                key, value = line.split(":", 1)
                fields[key.strip()] = value.strip()
    except Exception as e:
        print(f"Error parsing response: {e}")
    return fields

# Function to process a single resume (PDF to structured insights)
def process_resume(pdf_path):
    """Process a single resume: PDF to structured insights."""
    try:
        # Extract text from PDF (OCR if necessary)
        resume_text = extract_text_from_pdf(pdf_path)
        if not resume_text:
            return {}
        # Generate insights using Generative AI
        insights = generate_insights(resume_text)
        # Parse insights into structured fields
        return parse_generated_response(insights) if insights else {}
    except Exception as e:
        print(f"Error processing resume {pdf_path}: {e}")
        return {}

# Function to process multiple resumes in parallel
def process_resumes_from_drive_parallel(folder_path):
    """Process multiple resumes in a folder using parallel execution."""
    pdf_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.pdf')]
    results = []

    with ThreadPoolExecutor(max_workers = 8) as executor:
        futures = [executor.submit(process_resume, pdf_file) for pdf_file in pdf_files]
        for future in futures:
            result = future.result()
            if result:
                results.append(result)

    return pd.DataFrame(results)

# Main execution
folder_path = "/content/drive/MyDrive/Resume/Resume"  # Update with your folder path
processed_data = process_resumes_from_drive_parallel(folder_path)

# Save processed data to an Excel file
output_file = "/content/drive/MyDrive/Resume/2processed_resumes.xlsx"
processed_data.to_excel(output_file, index=False, engine='openpyxl')
print(f"Processed data saved to {output_file}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Successfully converted 1 pages from /content/drive/MyDrive/Resume/Resume/updated resume .pdf.
Successfully converted 1 pages from /content/drive/MyDrive/Resume/Resume/Bhuvin_Resume_1.pdf.




Error generating insights: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error generating insights: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error generating insights: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Processed data saved to /content/drive/MyDrive/Resume/2processed_resumes.xlsx
