In [2]:
import docx
import re

# Check for embedded images
def check_images(doc):
    image_count = sum(1 for rel in doc.part.rels.values() if "image" in rel.reltype)
    return image_count

In [4]:
# Extract basic information
def extract_basic_info(text):
    name_pattern = r"^(\w+\s\w+)"
    email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    phone_pattern = r"\b\d{10}\b"
    address_pattern = r"\d+\s\w+\s\w+"

    name = re.findall(name_pattern, text)
    email = re.findall(email_pattern, text)
    phone = re.findall(phone_pattern, text)
    address = re.findall(address_pattern, text)

    return {
        'Name': name[0] if name else 'Not Found',
        'Email': email[0] if email else 'Not Found',
        'Phone': phone[0] if phone else 'Not Found',
        'Address': address[0] if address else 'Not Found'
    }

In [6]:
# Extract the Experience section
def extract_experience(doc):
    experience_text = []
    recording = False
    for para in doc.paragraphs:
        if "experience" in para.text.lower():
            recording = True
        if recording:
            experience_text.append(para.text)
            if para.text.strip() == "":
                break
    return "\n".join(experience_text)

In [8]:
# Count number of pages (Approximation based on character count)
def count_pages(doc):
    total_chars = sum(len(para.text) for para in doc.paragraphs)
    return max(1, total_chars // 1800)  # Rough estimate of 1800 characters per page


In [10]:
# Main pipeline to process the resume
def process_resume():
    try:
        # Load the .docx file
        file_path = input("Enter the full path to your .docx file: ")
        doc = docx.Document(file_path)
        print(f"Loaded {len(doc.paragraphs)} paragraphs from the document!")

        # Combine all paragraphs into a single text
        text = '\n'.join([para.text for para in doc.paragraphs])

        # Get embedded images count
        image_count = check_images(doc)
        print(f"\nImages Count: {image_count}")

        # Extract basic information
        basic_info = extract_basic_info(text)
        print(f"\nBasic Information: {basic_info}")

        # Extract experience section
        experience_section = extract_experience(doc)
        print(f"\nExperience Section:\n{experience_section}")

        # Count the number of pages
        page_count = count_pages(doc)
        print(f"Page Count: {page_count}")
    except Exception as e:
        print(f"Error: {e}")

# Execute the process_resume function
if __name__ == "__main__":
    process_resume()

Enter the full path to your .docx file:  E:\Amrinder Business Analyst.docx


Loaded 146 paragraphs from the document!

Images Count: 0

Basic Information: {'Name': 'Amrinder Pelia', 'Email': 'amirindersingh1234@gmail.com', 'Phone': 'Not Found', 'Address': '10 years of'}

Experience Section:
Around 10 years of experience in Business process analysis, Business modeling and Business requirements gathering.
Extensive experience with Banking and Mortgage clients.
Expert in creating diagrams (Use case diagrams, flow charts, activity diagrams, sequence diagrams), use case document, test plans and test case documents.
Worked closely with project Stakeholders, SMEs, staff to understand requirements and specifications for new applications along with re-engineering the existing application.
Experience in interacting across the hierarchy from architects, to data modelers, underwriters and risk analyst.
Experience in iterative agile project management methodology with Scrum to manage the software development life cycle (SDLC).
Used MS Project to manage schedules, meet deadl