In [1]:
import fitz
import re
import os
from pymongo import MongoClient

In [2]:
from pymongo import MongoClient

def save_data_to_mongodb(data, db_name='resume_db', collection_name='resumes'):
    """Insert one document into a collection on the local MongoDB server.

    Args:
        data: The document to store. Must be a ``dict``.
        db_name: Name of the target database.
        collection_name: Name of the target collection.

    Returns:
        bool: ``True`` only when the document was actually inserted.
        ``False`` when ``data`` is not a dictionary or when any error occurs
        while connecting to or writing to the server (the error is printed,
        not raised).
    """
    # Validate first: bad input must not open a connection, and must not be
    # reported to the caller as a successful save.
    if not isinstance(data, dict):
        print("Error: Data must be a dictionary.")
        return False

    client = None
    try:
        # Connect to local MongoDB server with a 5-second timeout
        client = MongoClient('mongodb://localhost:27017/', serverSelectionTimeoutMS=5000)

        # Check server availability; raises if MongoDB is not reachable
        client.server_info()

        # NOTE: insert_one adds an '_id' key to `data` in place when it has none.
        client[db_name][collection_name].insert_one(data)
        print("Data successfully saved to MongoDB.")
        return True

    except Exception as e:
        print(f"An error occurred while saving to MongoDB: {e}")
        print("HINT: Make sure MongoDB server is running on your machine.")
        return False

    finally:
        # Release the connection pool whether or not the insert succeeded.
        if client is not None:
            client.close()


In [3]:
def save_extracted_data(data, output_file):
    """Write a parsed-resume mapping to ``output_file`` as indented plain text.

    Each top-level key becomes a ``key:`` heading followed by its value and a
    blank line. List values are written one item per line (``  - item``), or,
    for items that are dicts, as ``  subkey: subvalue`` lines followed by a
    blank line. Dict values are written as ``  subkey: subvalue`` lines. Any
    other value is written on a single indented line.

    Args:
        data: Mapping of section name to value, or ``None``.
        output_file: Path of the UTF-8 text file to (over)write.

    Returns:
        None. When ``data`` is ``None`` an error is printed and no file is
        written.
    """
    if data is None:
        print("Error: No data to save. Resume data is None.")
        return

    def render(section):
        # Yield the body lines for one section, by value type.
        if isinstance(section, list):
            for entry in section:
                if isinstance(entry, dict):
                    for field, content in entry.items():
                        yield f"  {field}: {content}\n"
                    yield "\n"
                else:
                    yield f"  - {entry}\n"
        elif isinstance(section, dict):
            for field, content in section.items():
                yield f"  {field}: {content}\n"
        else:
            yield f"  {section}\n"

    with open(output_file, 'w', encoding='utf-8') as out:
        for heading, section in data.items():
            out.write(f"{heading}:\n")
            out.writelines(render(section))
            out.write("\n")

In [4]:
def extract_text_from_pdf(file_path, output_txt_path):
    """Extract the text of every page of a PDF and save it to a text file.

    The document is opened with ``fitz.open`` and the result of
    ``get_text()`` for each page is concatenated in page order.

    Args:
        file_path: Path of the PDF to read.
        output_txt_path: Path of the UTF-8 text file to (over)write with the
            extracted text.

    Returns:
        str: The concatenated text of all pages.
    """
    with fitz.open(file_path) as pdf:
        page_texts = [page.get_text() for page in pdf]

    text = "".join(page_texts)

    with open(output_txt_path, 'w', encoding='utf-8') as out:
        out.write(text)
    return text

In [5]:
def _labelled_value(pattern, text):
    """Return the stripped first capture group of ``pattern`` in ``text``, or ''."""
    match = re.search(pattern, text)
    return match.group(1).strip() if match else ''


def extract_resume_info_from_txt(txt_file_path):
    """Parse a plain-text resume into a dictionary of structured fields.

    The parser is regex-based and relies on upper-case section headings
    (EXPERIENCE, PROJECTS, EDUCATION, SKILLS) appearing in the text.

    Args:
        txt_file_path: Path of the UTF-8 text file to parse.

    Returns:
        dict: Always contains 'Name', 'Email', 'Phone' and 'LinkedIn'
        (``None`` when not found). 'Experience', 'Projects', 'Education'
        (lists of dicts) and 'Skills' (dict of strings) are present only when
        the corresponding section heading is found. An empty dict is returned
        if any exception occurs (the error is printed, not raised).
    """
    try:
        with open(txt_file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        data = {}
        all_lines = text.strip().split('\n')

        # --- Basic Info ---
        # str.split always returns at least one element, so check the content
        # of the first line rather than the list's truthiness.
        data['Name'] = all_lines[0].strip() or None
        email_match = re.search(r'[\w\.-]+@[\w\.-]+', text)
        phone_match = re.search(r'(\+91\s?\d{10})|(\d{10})', text)
        linkedin_match = re.search(r'(https?://linkedin\.com/in/\S+)|(linkedin\.com/in/\S+)', text, re.IGNORECASE)

        data['Email'] = email_match.group() if email_match else None
        data['Phone'] = phone_match.group() if phone_match else None
        data['LinkedIn'] = linkedin_match.group() if linkedin_match else None

        # --- Experience ---
        experience_section = re.search(r'EXPERIENCE(.*?)(?:PROJECTS|EDUCATION)', text, re.DOTALL | re.IGNORECASE)
        if experience_section:
            # Each entry is "COMPANY | ROLE", a dates line, then description
            # lines. The description runs until the next "COMPANY |" header or
            # the end of the section (\Z). Without \Z the final entry can never
            # match, because the section capture above excludes the
            # PROJECTS/EDUCATION heading that follows it.
            experiences = re.findall(
                r'([A-Z\s&]+)\s\|\s([A-Z ]+)\n([A-Za-z0-9\- ]+)\n(.*?)(?=(?:[A-Z\s&]+\s\||PROJECTS|EDUCATION|\Z))',
                experience_section.group(1), re.DOTALL)
            data['Experience'] = []
            for company, role, dates, desc in experiences:
                data['Experience'].append({
                    'Company': company.strip().title(),
                    'Role': role.strip().title(),
                    'Duration': dates.strip(),
                    'Description': [d.strip("● ").strip() for d in desc.strip().split('\n') if d.strip()]
                })

        # --- Projects ---
        project_section = re.search(r'PROJECTS(.*?)(?:EDUCATION|SKILLS)', text, re.DOTALL | re.IGNORECASE)
        if project_section:
            # A new project starts at a line made only of capitals, whitespace
            # and hyphens.
            project_blocks = re.split(r'\n(?=[A-Z][A-Z\s\-]+\n)', project_section.group(1).strip())
            data['Projects'] = []
            for block in project_blocks:
                if not block.strip():
                    # An empty section would otherwise yield a project with an empty title.
                    continue
                block_lines = block.strip().split('\n')
                title = block_lines[0].strip()
                details = [l.strip("● ").strip() for l in block_lines[1:] if l.strip()]
                data['Projects'].append({
                    'Title': title,
                    'Details': details
                })

        # --- Education ---
        education_section = re.search(r'EDUCATION(.*?)(?:SKILLS|$)', text, re.DOTALL | re.IGNORECASE)
        if education_section:
            # A new entry starts at a capitalised line that contains a comma.
            education_blocks = re.split(r'\n(?=[A-Z].+?,)', education_section.group(1).strip())
            data['Education'] = []
            for block in education_blocks:
                block_lines = block.strip().split('\n')
                # Entries need institution, duration and degree lines; shorter blocks are skipped.
                if len(block_lines) >= 3:
                    score_match = re.search(r'[\d.]+%', block) or re.search(r'CGPA[:\s]+([\d.]+)', block)
                    data['Education'].append({
                        'Institution': block_lines[0].strip(),
                        'Duration': block_lines[1].strip(),
                        'Degree': block_lines[2].strip(),
                        # Whole match is kept (e.g. "85%" or "CGPA: 8.5").
                        'Score': score_match.group(0) if score_match else None
                    })

        # --- Skills ---
        skills_section = re.search(r'SKILLS(.*)', text, re.DOTALL | re.IGNORECASE)
        if skills_section:
            skills_text = skills_section.group(1)
            data['Skills'] = {
                'Programming Languages': _labelled_value(r'Programming Languages:\s*(.*)', skills_text),
                'Technical Skills': _labelled_value(r'Technical Skills:\s*(.*)', skills_text),
                # The lookbehind stops "Languages:" from matching inside
                # "Programming Languages:", which would return the wrong line.
                'Languages Spoken': _labelled_value(r'(?<!Programming\s)Languages:\s*(.*)', skills_text),
                'Certifications': _labelled_value(r'Certifications:\s*(.*)', skills_text)
            }

        return data
    except Exception as e:
        print(f"Error extracting resume information: {e}")
        return {}

In [6]:
def main(pdf_path=r"C:\Users\Charan\Downloads\Amrutha_Y_Resume (1).pdf",
         raw_text_path="resume_text.txt",
         structured_output_path="resume_extracted_info.txt"):
    """Run the resume pipeline: PDF -> raw text file -> structured text file.

    The paths are parameters (defaulting to the previously hard-coded values)
    so the pipeline can be run on another resume without editing the function.

    Args:
        pdf_path: Path of the resume PDF to read.
        raw_text_path: Where the raw extracted text is written.
        structured_output_path: Where the structured summary is written.

    Returns:
        None. Progress and errors are reported with ``print``; the function
        returns early if the PDF is missing or text extraction fails.
    """
    # Step 1: extract the raw text from the PDF.
    try:
        if not os.path.isfile(pdf_path):
            print(f"Error: PDF file not found at '{pdf_path}'")
            print("Please update the pdf_path variable above.")
            return

        print(f"Found PDF file: {pdf_path}")
        print("Extracting text...")
        extract_text_from_pdf(pdf_path, raw_text_path)
        print(f"Raw text saved to: {raw_text_path}")
    except Exception as e:
        print(f"Error during PDF extraction: {e}")
        return

    # Step 2: parse the raw text and save/preview the structured result.
    try:
        print("Parsing resume data...")
        resume_data = extract_resume_info_from_txt(raw_text_path)

        # The parser returns an empty dict on failure, which is falsy.
        if not resume_data:
            print("Failed to extract resume data.")
            return

        # Save to text file
        save_extracted_data(resume_data, structured_output_path)
        print(f"Structured data saved to: {structured_output_path}")

        # Print a preview
        print("\nExtracted data preview:")
        print(f"Name: {resume_data.get('Name', 'Not found')}")
        print(f"Email: {resume_data.get('Email', 'Not found')}")
        print(f"Phone: {resume_data.get('Phone', 'Not found')}")
        print(f"Number of experiences: {len(resume_data.get('Experience', []))}")
        print(f"Number of projects: {len(resume_data.get('Projects', []))}")
    except Exception as e:
        print(f"Error during parsing: {e}")

# Entry point: run the pipeline with its default paths only when this code is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()


Found PDF file: C:\Users\Charan\Downloads\Amrutha_Y_Resume (1).pdf
Extracting text...
Raw text saved to: resume_text.txt
Parsing resume data...
Structured data saved to: resume_extracted_info.txt

Extracted data preview:
Name: AMRUTHA Y
Email: amruthachatre@gmail.com
Phone: +91 9353803942
Number of experiences: 1
Number of projects: 3


In [7]:
# try:
#     if not os.path.isfile(pdf_path):
#         print(f"Error: PDF file not found at '{pdf_path}'")
#         print("Please update the pdf_path variable above.")
#     else:
#         print(f"Found PDF file: {pdf_path}")
#         print("Extracting text...")
#         extract_text_from_pdf(pdf_path, raw_text_path)
#         print(f"Raw text saved to: {raw_text_path}")
# except Exception as e:
#     print(f"Error during PDF extraction: {e}")

In [8]:
# try:
#     print("Parsing resume data...")
#     resume_data = extract_resume_info_from_txt(raw_text_path)
    
#     if resume_data:
#         # Save to text file
#         save_extracted_data(resume_data, structured_output_path)
#         print(f"Structured data saved to: {structured_output_path}")
        
#         # Print a preview
#         print("\nExtracted data preview:")
#         print(f"Name: {resume_data.get('Name', 'Not found')}")
#         print(f"Email: {resume_data.get('Email', 'Not found')}")
#         print(f"Phone: {resume_data.get('Phone', 'Not found')}")
#         print(f"Number of experiences: {len(resume_data.get('Experience', []))}")
#         print(f"Number of projects: {len(resume_data.get('Projects', []))}")
#     else:
#         print("Failed to extract resume data.")
# except Exception as e:
#     print(f"Error during parsing: {e}")

In [9]:
# if __name__ == "__main__":
#     main()

In [10]:
# def main():
#     # Set this to False if you don't want to use MongoDB or don't have it installed
#     USE_MONGODB = False
    
#     # Define file paths - modify these as needed
#     pdf_path = r"C:\Users\Charan\Downloads\Amrutha_Y_Resume (1).pdf"
    
#     # Use the current directory for output files
#     current_dir = os.path.dirname(os.path.abspath(__file__))
#     raw_text_path = os.path.join(current_dir, "resume_text.txt")
#     structured_output_path = os.path.join(current_dir, "resume_extracted_info.txt")
    
#     try:
#         # Check if the PDF file exists
#         if not os.path.isfile(pdf_path):
#             print(f"Error: PDF file not found at '{pdf_path}'")
#             print("Please update the pdf_path variable in the code.")
#             return
            
#         # Process the resume
#         print(f"Extracting text from PDF: {pdf_path}")
#         extract_text_from_pdf(pdf_path, raw_text_path)
#         print(f"Raw text saved to: {raw_text_path}")
        
#         print("Parsing resume data...")
#         resume_data = extract_resume_info_from_txt(raw_text_path)
        
#         if resume_data:
#             # Try to save to MongoDB only if enabled
#             if USE_MONGODB:
#                 db_success = save_data_to_mongodb(resume_data)
#             else:
#                 db_success = False
#                 print("MongoDB storage skipped (disabled in settings).")
            
#             # Save to text file
#             save_extracted_data(resume_data, structured_output_path)
#             print(f"Structured data saved to: {structured_output_path}")
            
#             if db_success:
#                 print("Process completed successfully with MongoDB storage.")
#             else:
#                 print("Process completed successfully (without MongoDB storage).")
                
#             # Print a sample of what was extracted
#             print("\nExtracted data preview:")
#             print(f"Name: {resume_data.get('Name', 'Not found')}")
#             print(f"Email: {resume_data.get('Email', 'Not found')}")
#             print(f"Phone: {resume_data.get('Phone', 'Not found')}")
#             print(f"Number of experiences: {len(resume_data.get('Experience', []))}")
#             print(f"Number of projects: {len(resume_data.get('Projects', []))}")
            
#         else:
#             print("Failed to extract resume data.")
#     except Exception as e:
#         print(f"An error occurred during processing: {e}")

# if __name__ == "__main__":
#     main()