In [12]:
import sys
from docstrange import DocumentExtractor

def extract_document_text(document_path: str):
    """
    Extracts and prints the text content from a given document.

    Args:
        document_path (str): The file path to the document (e.g., 'document.pdf').
    """
    print(f"Starting text extraction from: {document_path}\n")

    # Initialize the DocumentExtractor in local CPU mode.
    # This ensures that all processing happens on your machine and no data
    # is sent to a cloud API.
    try:
        extractor = DocumentExtractor()
    except Exception as e:
        print(f"Error initializing DocumentExtractor: {e}")
        print("Please ensure you have installed the necessary dependencies.")
        print("If you are running for the first time, you may need an internet connection to download models.")
        return

    try:
        # The extract() method processes the document.
        # It handles various formats like PDF, DOCX, and images.
        result = extractor.extract(document_path)
        
        # We can extract the content in various formats. Here, we'll get
        # the cleaned, LLM-optimized Markdown text.
        extracted_text = result.extract_markdown()

        if extracted_text:
            print("--- Extracted Text ---")
            print(extracted_text)
            print("----------------------")
        else:
            print("No text could be extracted from the document.")

    except FileNotFoundError:
        print(f"Error: The file '{document_path}' was not found.")
        print("Please check the file path and try again.")
    except Exception as e:
        print(f"An unexpected error occurred during extraction: {e}")
    
    return extracted_text

if __name__ == "__main__":
    # In a real-world scenario, you would replace 'your_document.pdf'
    # with the actual path to your document.
    # For a command-line script, you could also use sys.argv to get the path.
    sample_document_path = './CV_Image.png'

 

    extract_document_text(sample_document_path)


Starting text extraction from: ./CV_Image.png

--- Extracted Text ---
# Aidoo Enoch Kwadwo
## Data Analyst

## Personal Info
**Phone**
0240542834

**Email**
aidooenochkwadwo@gmail.com

**Kumasi, Ghana**

## Qualities
* Curiosity
* Problem Solving
* System Understanding
* Technical Skills
* Analytical Thinking
* Problem Solving Skills
* Teamwork
* Initiative and Self-motivation
* Discipline and Resilient

## Key Skills
**Tools:** Python, R, AWS, Microsoft Excel, Google Sheets, Power BI, SQL
**Packages/Frameworks:** NumPy, Pandas, Scikit-Learn, Matplotlib, Pytorch
**Machine Learning:** Data Analysis, Classification Modeling, Deep Neural Networks, Regression Modelling, MLOPs, Computer Vision, Natural Language Processing, Recommendation Systems

## About Me
A Data Analyst with about two years of professional experience specialized in transforming complex datasets into strategic business solutions. I've consistently delivered actionable insights that have driven key business decisions. Comb

In [13]:
extracted_text = extract_document_text('./CV_Image.png')
with open('extracted_text.txt', 'w', encoding='utf-8') as f:
    f.write(extracted_text)

Starting text extraction from: ./CV_Image.png

--- Extracted Text ---
# Aidoo Enoch Kwadwo
## Data Analyst

## Personal Info
**Phone**
0240542834

**Email**
aidooenochkwadwo@gmail.com

**Kumasi, Ghana**

## Qualities
* Curiosity
* Problem Solving
* System Understanding
* Technical Skills
* Analytical Thinking
* Problem Solving Skills
* Teamwork
* Initiative and Self-motivation
* Discipline and Resilient

## Key Skills
**Tools:** Python, R, AWS, Microsoft Excel, Google Sheets, Power BI, SQL
**Packages/Frameworks:** NumPy, Pandas, Scikit-Learn, Matplotlib, Pytorch
**Machine Learning:** Data Analysis, Classification Modeling, Deep Neural Networks, Regression Modelling, MLOPs, Computer Vision, Natural Language Processing, Recommendation Systems

## About Me
A Data Analyst with about two years of professional experience specialized in transforming complex datasets into strategic business solutions. I've consistently delivered actionable insights that have driven key business decisions. Comb

In [17]:
import re
import json

def structure_cv_schema(text):
    schema = {
        "name": None,
        "email": None,
        "phone": None,
        "summary": None,
        "work_experience": [],
        "education": [],
        "skills": [],
        "soft_skills": [],
        "certifications": [],
        "projects": [],
        "languages": None,
        "hobbies": None,
        "other": None
    }

    # Extract email
    email_match = re.search(r'[\w\.-]+@[\w\.-]+', text)
    if email_match:
        schema['email'] = email_match.group(0)

    # Extract phone (simple pattern, may need adjustment)
    phone_match = re.search(r'(\+?\d[\d\s\-]{7,}\d)', text)
    if phone_match:
        schema['phone'] = phone_match.group(0)

    # Extract name (assume first line is name if it looks like a name)
    lines = text.splitlines()
    if lines:
        first_line = lines[0].strip()
        if len(first_line.split()) >= 2 and not any(x in first_line.lower() for x in ['curriculum', 'resume', 'cv']):
            schema['name'] = first_line

    # Extract sections by keywords
    section_patterns = {
        'education': r'(education|academic background|qualifications)',
        'work_experience': r'(experience|employment|work history|professional experience|work experience)',
        'skills': r'(skills|technical skills|competencies)',
        'summary': r'(summary|profile|about me)',
        'certifications': r'(certifications|certificates)',
        'projects': r'(projects|project experience)',
        'soft_skills': r'(soft skills|personal skills|interpersonal skills)',
        'languages': r'(languages|language proficiency)',
        'hobbies': r'(hobbies|interests)',
        'other': r'(other|additional information)'
    }

    # Find section indices
    section_indices = {}
    for key, pattern in section_patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            section_indices[key] = match.start()

    # Sort sections by appearance
    sorted_sections = sorted(section_indices.items(), key=lambda x: x[1])

    # Extract section contents
    for i, (section, start_idx) in enumerate(sorted_sections):
        end_idx = sorted_sections[i+1][1] if i+1 < len(sorted_sections) else len(text)
        content = text[start_idx:end_idx].strip()
        if section in ['education', 'work_experience', 'certifications', 'projects', 'soft_skills']:
            schema[section] = [line.strip() for line in content.split('\n') if line.strip() and not re.search(section_patterns[section], line, re.IGNORECASE)]
        elif section == 'skills':
            skills_lines = [line.strip() for line in content.split('\n') if line.strip() and not re.search(section_patterns['skills'], line, re.IGNORECASE)]
            if skills_lines:
                schema['skills'] = re.split(r',|;|\n', ' '.join(skills_lines))
                schema['skills'] = [s.strip() for s in schema['skills'] if s.strip()]
        elif section == 'summary':
            summary_lines = [line.strip() for line in content.split('\n') if line.strip() and not re.search(section_patterns['summary'], line, re.IGNORECASE)]
            schema['summary'] = ' '.join(summary_lines)
        elif section in ['languages', 'hobbies', 'other']:
            lines = [line.strip() for line in content.split('\n') if line.strip() and not re.search(section_patterns[section], line, re.IGNORECASE)]
            schema[section] = ', '.join(lines) if lines else None

    return schema

# Example usage: structure the extracted markdown or text
# Replace 'markdown_content' with your extracted text variable
# structured_cv = structure_cv_schema(markdown_content)
# print(json.dumps(structured_cv, indent=2))


In [18]:
structured_cv = structure_cv_schema(extracted_text)

In [19]:
structured_cv

{'name': '# Aidoo Enoch Kwadwo',
 'email': 'aidooenochkwadwo@gmail.com',
 'phone': '0240542834',
 'summary': 'A Data Analyst with about two years of',
 'work_experience': ['##'],
 'education': ['**Bachelor of Science in Computer Science.**',
  'KNUST, Ghana',
  '2023',
  '## Courses and'],
 'skills': ['* Analytical Thinking * Teamwork * Initiative and Self-motivation * Discipline and Resilient **Tools:** Python',
  'R',
  'AWS',
  'Microsoft Excel',
  'Google Sheets',
  'Power BI',
  'SQL **Packages/Frameworks:** NumPy',
  'Pandas',
  'Scikit-Learn',
  'Matplotlib',
  'Pytorch **Machine Learning:** Data Analysis',
  'Classification Modeling',
  'Deep Neural Networks',
  'Regression Modelling',
  'MLOPs',
  'Computer Vision',
  'Natural Language Processing',
  'Recommendation Systems ##'],
 'soft_skills': [],
 'certifications': ['* Coursera Crash Course on Python.',
  'Oct 2021',
  '* AWS Machine Learning Foundation.',
  'Oct 2021',
  '* Introduction to Deep Learning with Pytorch.',
  '

In [None]:
from docstrange import DocumentExtractor
import json

# Initialize the document extractor
extractor = DocumentExtractor()

# Define a sample JSON schema that represents your desired database schema
# The schema is designed to extract a comprehensive set of details from a resume
resume_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "email": {"type": "string"},
        "phone": {"type": "string"},
        "summary": {"type": "string"},
        "work_experience": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "company": {"type": "string"},
                    "title": {"type": "string"},
                    "location": {"type": "string"},
                    "start_date": {"type": "string"},
                    "end_date": {"type": "string"},
                    "responsibilities": {
                        "type": "array",
                        "items": {"type": "string"}
                    }
                },
                "required": ["company", "title"]
            }
        },
        "education": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "degree": {"type": "string"},
                    "field_of_study": {"type": "string"},
                    "institution": {"type": "string"},
                    "location": {"type": "string"},
                    "start_date": {"type": "string"},
                    "end_date": {"type": "string"}
                },
                "required": ["degree", "institution"]
            }
        },
        "skills": {
            "type": "array",
            "items": {"type": "string"}
        },
        "soft_skills": {
            "type": "array",
            "items": {"type": "string"}
        },
        "certifications": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "issuing_organization": {"type": "string"},
                    "date": {"type": "string"}
                },
                "required": ["name"]
            }
        },
        "projects": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "description": {"type": "string"},
                    "technologies": {
                        "type": "array",
                        "items": {"type": "string"}
                    }
                },
                "required": ["name"]
            }
        },
        "languages": {"type": "string"},
        "hobbies": {"type": "string"},
        "other": {"type": "string"}
    }
}

# Assume 'my_document.pdf' is a document like a resume
# You would replace this with the path to your actual file
document_path = './CV_Image.png'

# Extract data from the document using the JSON schema
try:
    result = extractor.extract(document_path)
    structured_data = result.extract_data(json_schema=resume_schema)

    # Print the resulting structured JSON
    print(json.dumps(structured_data, indent=2))

except FileNotFoundError:
    print(f"Error: Document not found at '{document_path}'. Please provide a valid file path.")
except Exception as e:
    print(f"An error occurred during extraction: {e}")


{
  "structured_data": {
    "name": "Aidoo Enoch Kwadwo",
    "email": "aidoenochkwadwo@gmail.com",
    "phone": "0240542834",
    "summary": "A Data Analyst with about two years of professional experience specialized in transforming complex datasets into strategic business solutions. I've consistently delivered actionable insights that have driven key business decisions. Combining strong analytical skills with business acumen, I excel at identifying data patterns and translating them into clear, implementable strategies that solve real-world challenges.",
    "work_experience": [
      {
        "company": "Really Great Tech",
        "title": "Data Analytics/AI/ML Engineer",
        "location": null,
        "start_date": "November 2023",
        "end_date": "October 2024",
        "responsibilities": [
          "Conducted Shapelet Analysis on trained machine learning models to interpret performance patterns and identify opportunities for optimization in an AI project.",
          

In [20]:
with open('extracted_file.txt', 'w', encoding='utf-8') as f:
    f.write(json.dumps(structured_data, indent=2, ensure_ascii=False))