In [1]:
import docx
import re

# Check for embedded images
def check_images(doc):
    """Count the image relationships attached to ``doc.part``.

    Args:
        doc: Object exposing ``part.rels`` as a mapping whose values carry a
            ``reltype`` string (as a loaded ``docx`` document does).

    Returns:
        int: Number of relationships whose ``reltype`` contains "image".
    """
    image_count = 0
    for relationship in doc.part.rels.values():
        # A relationship is treated as an image when its type string mentions it.
        if "image" in relationship.reltype:
            image_count += 1
    return image_count

In [3]:
# Extract basic information
def extract_basic_info(text):
    """Pull name, e-mail, phone number and street address out of resume text.

    Every field is located with a simple regular-expression heuristic and
    the first hit wins:

    * Name    - the first two words of the text, separated by one whitespace
                character; leading blank lines or indentation are skipped.
    * Email   - the first token shaped like ``local@domain.tld``.
    * Phone   - the first stand-alone run of exactly ten digits.
    * Address - the first "<number> <word> <word>" sequence that sits on a
                single line.

    Args:
        text: Resume text as a single string.

    Returns:
        dict: Keys 'Name', 'Email', 'Phone' and 'Address'. A field that
        cannot be located maps to the string 'Not Found'.
    """
    # Without the leading \s* a blank first paragraph (or an indent) made the
    # anchored name pattern fail even though the name came right after it.
    name_pattern = r"^\s*(\w+\s\w+)"
    email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    phone_pattern = r"\b\d{10}\b"
    # [^\S\r\n] is "whitespace except a line break": an address may not start
    # with digits that end one line (e.g. a phone number) and continue on the
    # next line.
    address_pattern = r"\d+[^\S\r\n]\w+[^\S\r\n]\w+"

    name = re.findall(name_pattern, text)
    email = re.findall(email_pattern, text)
    phone = re.findall(phone_pattern, text)
    address = re.findall(address_pattern, text)

    return {
        'Name': name[0] if name else 'Not Found',
        'Email': email[0] if email else 'Not Found',
        'Phone': phone[0] if phone else 'Not Found',
        'Address': address[0] if address else 'Not Found'
    }

In [5]:
# Extract the Experience section
def extract_experience(doc):
    """Return the text of the resume's experience section.

    The section starts at the first paragraph whose text contains the word
    "experience" (case-insensitive substring match, so any paragraph that
    mentions the word can start it) and runs up to, but not including, the
    first blank paragraph that follows some section content. Blank spacer
    paragraphs directly under the heading are skipped instead of ending the
    section.

    Args:
        doc: Object exposing ``paragraphs``, an iterable of items with a
            ``text`` attribute.

    Returns:
        str: The heading and the section's paragraphs joined with newlines,
        or an empty string when no paragraph mentions "experience".
    """
    section_lines = []
    for para in doc.paragraphs:
        line = para.text
        if not section_lines:
            # Still searching for the heading paragraph.
            if "experience" in line.lower():
                section_lines.append(line)
            continue
        if line.strip() == "":
            if len(section_lines) == 1:
                # Only the heading so far: this blank is just spacing.
                continue
            # Blank paragraph after real content closes the section; it is
            # not appended, so the result has no trailing newline.
            break
        section_lines.append(line)
    return "\n".join(section_lines)

In [7]:
# Count number of pages (Approximation based on character count)
def count_pages(doc, chars_per_page=1800):
    """Estimate the page count of a document from its character count.

    Args:
        doc: Object exposing ``paragraphs``, an iterable of items with a
            ``text`` attribute.
        chars_per_page: Rough number of characters assumed to fit on one
            page. Defaults to 1800.

    Returns:
        int: Estimated number of pages, never less than 1.

    Raises:
        ValueError: If ``chars_per_page`` is not positive.
    """
    if chars_per_page <= 0:
        raise ValueError("chars_per_page must be a positive number")
    total_chars = sum(len(para.text) for para in doc.paragraphs)
    # Round up: a partially filled last page is still a page. Plain floor
    # division reported e.g. 3599 characters as a single page.
    return max(1, -(-total_chars // chars_per_page))


In [9]:
# Main pipeline to process the resume
def process_resume():
    """Interactively analyse a .docx resume and print a short report.

    Prompts for a file path, loads it with ``docx.Document`` and prints, in
    order: the paragraph count, the image count, the extracted basic
    information, the experience section and the estimated page count.

    Any exception raised along the way is caught and printed as
    ``Error: <message>``; nothing is returned.
    """
    try:
        # Load the .docx file
        raw_path = input("Enter the full path to your .docx file: ")
        # Pasted paths often arrive wrapped in quotes (e.g. Windows
        # "Copy as path") or padded with spaces; passing them on verbatim
        # makes the load fail, so strip both.
        file_path = raw_path.strip().strip("\"'").strip()
        doc = docx.Document(file_path)
        print(f"Loaded {len(doc.paragraphs)} paragraphs from the document!")

        # Combine all paragraphs into a single text
        text = '\n'.join([para.text for para in doc.paragraphs])

        # Get embedded images count
        image_count = check_images(doc)
        print(f"\nImages Count: {image_count}")

        # Extract basic information
        basic_info = extract_basic_info(text)
        print(f"\nBasic Information: {basic_info}")

        # Extract experience section
        experience_section = extract_experience(doc)
        print(f"\nExperience Section:\n{experience_section}")

        # Count the number of pages
        page_count = count_pages(doc)
        print(f"Page Count: {page_count}")
    except Exception as e:
        # Best-effort interactive tool: report the problem instead of raising.
        print(f"Error: {e}")

# Entry point: run the interactive resume report only when this code is
# executed as the main module, not when it is imported.
if __name__ == "__main__":
    process_resume()

Enter the full path to your .docx file:  C:\Users\Forhad\OneDrive\Documents\John Doe_Resume.docx


Loaded 9 paragraphs from the document!

Images Count: 0

Basic Information: {'Name': 'John Doe', 'Email': 'john.doe@example.com', 'Phone': '1234567890', 'Address': '123 Main St'}

Experience Section:
Experience:
- Data Scientist at ABC Corp (3 years)
Page Count: 1


In [11]:
import docx
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Check for embedded images
def check_images(doc):
    """Return how many relationships on ``doc.part`` look like images.

    Args:
        doc: Object exposing ``part.rels`` as a mapping whose values carry a
            ``reltype`` string.

    Returns:
        int: Count of relationships whose ``reltype`` contains "image".
    """
    relationships = doc.part.rels.values()
    # Each True adds 1 to the total, so the sum is the number of matches.
    return sum("image" in rel.reltype for rel in relationships)

# Extract basic information
def extract_basic_info(text):
    """Extract name, e-mail, phone number and street address from resume text.

    Each field has one regular-expression heuristic; the first match is
    reported:

    * Name    - the first two words of the text (one whitespace character
                between them), after any leading blank lines or indentation.
    * Email   - the first token shaped like ``local@domain.tld``.
    * Phone   - the first stand-alone run of exactly ten digits.
    * Address - the first "<number> <word> <word>" sequence found within a
                single line.

    Args:
        text: Resume text as a single string.

    Returns:
        dict: Keys 'Name', 'Email', 'Phone' and 'Address', in that order.
        Fields without a match map to the string 'Not Found'.
    """
    patterns = {
        # Leading \s* tolerates blank paragraphs / indentation before the
        # name; the anchored pattern used to fail on them.
        'Name': r"^\s*(\w+\s\w+)",
        'Email': r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
        'Phone': r"\b\d{10}\b",
        # [^\S\r\n] = whitespace that is not a line break, so digits ending
        # one line (such as a phone number) cannot fuse with the next line.
        'Address': r"\d+[^\S\r\n]\w+[^\S\r\n]\w+",
    }

    basic_info = {}
    for field, pattern in patterns.items():
        hits = re.findall(pattern, text)
        basic_info[field] = hits[0] if hits else 'Not Found'
    return basic_info

# Extract the Experience section
def extract_experience(doc):
    """Return the text of the resume's experience section.

    The heading is the first paragraph whose text contains "experience"
    (case-insensitive substring match, so any paragraph mentioning the word
    qualifies). The section then collects the following non-blank paragraphs
    and stops at the first blank paragraph that comes after some content.
    Blank paragraphs directly below the heading are treated as spacing.

    Args:
        doc: Object exposing ``paragraphs``, an iterable of items with a
            ``text`` attribute.

    Returns:
        str: Heading plus section paragraphs joined with newlines (no
        trailing newline), or an empty string when no heading is found.
    """
    paragraphs = iter(doc.paragraphs)

    # Advance the shared iterator up to and including the heading paragraph.
    heading = next(
        (para.text for para in paragraphs if "experience" in para.text.lower()),
        None,
    )
    if heading is None:
        return ""

    body = []
    for para in paragraphs:
        if para.text.strip():
            body.append(para.text)
        elif body:
            # A blank paragraph after real content ends the section. It is
            # deliberately not kept, so no trailing newline is produced.
            break
        # else: blank spacer right under the heading - keep looking.
    return "\n".join([heading, *body])

# Count number of pages (Approximation based on character count)
def count_pages(doc, chars_per_page=1800):
    """Approximate the number of pages from the total character count.

    Args:
        doc: Object exposing ``paragraphs``, an iterable of items with a
            ``text`` attribute.
        chars_per_page: Rough number of characters assumed per page.
            Defaults to 1800.

    Returns:
        int: Estimated page count; at least 1 even for an empty document.

    Raises:
        ValueError: If ``chars_per_page`` is not positive.
    """
    if chars_per_page <= 0:
        raise ValueError("chars_per_page must be a positive number")

    total_chars = sum(len(para.text) for para in doc.paragraphs)
    full_pages, leftover = divmod(total_chars, chars_per_page)
    # Text spilling onto another page counts as a page of its own; flooring
    # alone reported e.g. 3599 characters as one page.
    if leftover:
        full_pages += 1
    return max(1, full_pages)

# Train a small classifier used to check for a full address
def train_address_model():
    """Fit a small bag-of-words logistic-regression classifier on addresses.

    Four hard-coded example strings, each labelled 1 or 0, are turned into
    token counts with ``CountVectorizer`` and used to fit a
    ``LogisticRegression``.

    Returns:
        tuple: ``(vectorizer, model)`` - the fitted ``CountVectorizer`` and
        the fitted ``LogisticRegression``.
    """
    # Example string -> label. NOTE(review): 1 presumably marks a complete
    # address and 0 an incomplete one - confirm the labelling intent.
    labelled_examples = {
        "123 Main St, Springfield": 1,
        "456 Elm St, Somecity, CA 98765": 1,
        "Main Street": 0,
        "Apartment 23, 789 North Ave": 0,
    }
    address_strings = list(labelled_examples)
    targets = list(labelled_examples.values())

    # Vectorise the examples, then fit the classifier on the count matrix.
    bag_of_words = CountVectorizer()
    features = bag_of_words.fit_transform(address_strings)
    classifier = LogisticRegression().fit(features, targets)

    return bag_of_words, classifier

def check_full_address(text, vectorizer, model):
    """Classify ``text`` with a fitted vectorizer/model pair.

    Args:
        text: String to classify.
        vectorizer: Object with a ``transform`` method accepting a list of
            strings.
        model: Object with a ``predict`` method accepting the transformed
            features and returning an indexable of predictions.

    Returns:
        bool: Truthiness of the model's prediction for ``text``.
    """
    # The vectorizer works on a batch, so wrap the single text in a list.
    features = vectorizer.transform([text])
    predicted_label = model.predict(features)[0]
    return bool(predicted_label)

# Check for portfolio links
def check_portfolio_links(text):
    """Find the first GitHub and LinkedIn profile links in ``text``.

    Links are recognised as ``github.com/<name>`` and
    ``linkedin.com/in/<name>``, where ``<name>`` consists of letters, digits,
    underscores and hyphens. Matching ignores letter case, so spellings such
    as ``GitHub.com/...`` are found as well; the link is returned exactly as
    it is written in the text.

    Args:
        text: Resume text as a single string.

    Returns:
        dict: Keys 'GitHub' and 'LinkedIn', each holding the matched link or
        the string 'Not Found'.
    """
    github_pattern = r"github\.com/[a-zA-Z0-9_-]+"
    linkedin_pattern = r"linkedin\.com/in/[a-zA-Z0-9_-]+"

    # Host names are case-insensitive and resumes often capitalise them
    # ("GitHub.com", "LinkedIn.com"), so match without regard to case.
    github = re.search(github_pattern, text, re.IGNORECASE)
    linkedin = re.search(linkedin_pattern, text, re.IGNORECASE)

    return {
        'GitHub': github.group(0) if github else 'Not Found',
        'LinkedIn': linkedin.group(0) if linkedin else 'Not Found'
    }

# Main pipeline to process the resume
def process_resume():
    """Interactively analyse a .docx resume and print a report.

    Prompts for a file path, loads it with ``docx.Document`` and prints, in
    order: the paragraph count, the image count, the extracted basic
    information, the experience section, the estimated page count, whether
    the address classifier flags the text as containing a full address, and
    any GitHub / LinkedIn links found.

    Any exception raised along the way is caught and printed as
    ``Error: <message>``; nothing is returned.
    """
    try:
        # Load the .docx file
        raw_path = input("Enter the full path to your .docx file: ")
        # Pasted paths often arrive wrapped in quotes (e.g. Windows
        # "Copy as path") or padded with spaces; passing them on verbatim
        # makes the load fail, so strip both.
        file_path = raw_path.strip().strip("\"'").strip()
        doc = docx.Document(file_path)
        print(f"Loaded {len(doc.paragraphs)} paragraphs from the document!")

        # Combine all paragraphs into a single text
        text = '\n'.join([para.text for para in doc.paragraphs])

        # Get embedded images count
        image_count = check_images(doc)
        print(f"\nImages Count: {image_count}")

        # Extract basic information
        basic_info = extract_basic_info(text)
        print(f"\nBasic Information: {basic_info}")

        # Extract experience section
        experience_section = extract_experience(doc)
        print(f"\nExperience Section:\n{experience_section}")

        # Count the number of pages
        page_count = count_pages(doc)
        print(f"Page Count: {page_count}")

        # Train the address model (refitted on every call)
        vectorizer, address_model = train_address_model()

        # Check for full address
        # NOTE(review): the whole resume text is classified here, while the
        # model is fitted on short address strings only - confirm this is
        # the intended input.
        has_full_address = check_full_address(text, vectorizer, address_model)
        print(f"\nFull Address Found: {'Yes' if has_full_address else 'No'}")

        # Check for portfolio links
        portfolio_links = check_portfolio_links(text)
        print(f"\nPortfolio Links: {portfolio_links}")

    except Exception as e:
        # Best-effort interactive tool: report the problem instead of raising.
        print(f"Error: {e}")

# Entry point: run the interactive resume report only when this code is
# executed as the main module, not when it is imported.
if __name__ == "__main__":
    process_resume()


Enter the full path to your .docx file:  C:\Users\Forhad\OneDrive\Documents\John Doe_Resume.docx


Loaded 9 paragraphs from the document!

Images Count: 0

Basic Information: {'Name': 'John Doe', 'Email': 'john.doe@example.com', 'Phone': '1234567890', 'Address': '123 Main St'}

Experience Section:
Experience:
- Data Scientist at ABC Corp (3 years)
Page Count: 1

Full Address Found: Yes

Portfolio Links: {'GitHub': 'github.com/johndoe', 'LinkedIn': 'linkedin.com/in/johndoe'}
