In [1]:
from docstrange import DocumentExtractor
import json

# CV Data Schema
resume_schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "email": {"type": "string"},
            "phone": {"type": "string"},
            "summary": {"type": "string"},
            "work_experience": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "company": {"type": "string"},
                        "title": {"type": "string"},
                        "location": {"type": "string"},
                        "start_date": {"type": "string"},
                        "end_date": {"type": "string"},
                        "responsibilities": {
                            "type": "array",
                            "items": {"type": "string"}
                        }
                    },
                    "required": ["company", "title"]
                }
            },
            "education": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "degree": {"type": "string"},
                        "field_of_study": {"type": "string"},
                        "institution": {"type": "string"},
                        "location": {"type": "string"},
                        "start_date": {"type": "string"},
                        "end_date": {"type": "string"}
                    },
                    "required": ["degree", "institution"]
                }
            },
            "skills": {
                "type": "array",
                "items": {"type": "string"}
            },
            "soft_skills": {
                "type": "array",
                "items": {"type": "string"}
            },
            "certifications": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "name": {"type": "string"},
                        "issuing_organization": {"type": "string"},
                        "date": {"type": "string"}
                    },
                    "required": ["name"]
                }
            },
            "projects": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "name": {"type": "string"},
                        "description": {"type": "string"},
                        "technologies": {
                            "type": "array",
                            "items": {"type": "string"}
                        }
                    },
                    "required": ["name"]
                }
            },
            "languages": {"type": "string"},
            "hobbies": {"type": "string"},
            "other": {"type": "string"}
        }
    }

class CVExtractor:
    def __init__(self):
        self.extractor = DocumentExtractor()

    def extract(self, document_path):
        """
        Extract structured resume data from a document using the default resume_schema.
        Returns the structured data as a Python dict, or None if extraction fails.
        """
        try:
            result = self.extractor.extract(document_path)
            structured_data = result.extract_data(json_schema=resume_schema)
            return structured_data
        except FileNotFoundError:
            print(f"Error: Document not found at '{document_path}'. Please provide a valid file path.")
            return None
        except Exception as e:
            print(f"An error occurred during extraction: {e}")
            return None

# Example usage:
if __name__ == "__main__":
    # Define your schema (now inside main)
    
    document_path = './CVs/Power_BI_Developer.pdf'
    extractor = CVExtractor()
    data = extractor.extract(document_path)
    print("Returned structured data:")
    print(json.dumps(data, indent=2))

Failed to parse JSON content: Expecting value: line 2 column 1 (char 1)


Returned structured data:
{
  "document": {
    "raw_content": "\n## Page 1\n\n{\n  \"name\": \"Enoch Kwadwo Aidoo\",\n  \"email\": \"aidooenochkwadwo@gmail.com\",\n  \"phone\": \"0240542834\",\n  \"summary\": \"A Data Analyst with about two years of professional experience specialized in transforming complex datasets into strategic business solutions. I've consistently delivered actionable insights that have driven key business decisions. Combining strong analytical skills with business acumen, I excel at identifying data patterns and translating them into clear, implementable strategies that solve real-world challenges.\",\n  \"work_experience\": [\n    {\n      \"company\": \"@Really Great Tech\",\n      \"title\": \"Data Analytics/AI/ML Engineer\",\n      \"location\": \"\",\n      \"start_date\": \"November 2023\",\n      \"end_date\": \"October 2024\",\n      \"responsibilities\": [\n        \"Conducted Shapelet Analysis on trained machine learning models to interpret performance

In [2]:
if __name__ == "__main__":
    # Define your schema (now inside main)
    
    document_path = './CVs/CV_Image.png'
    extractor = CVExtractor()
    data = extractor.extract(document_path)
    print("Returned structured data:")
    print(json.dumps(data, indent=2))

Returned structured data:
{
  "structured_data": {
    "name": "Aidoo Enoch Kwadwo",
    "email": "aidookenchkwadwo@gmail.com",
    "phone": "0240542834",
    "summary": "A Data Analyst with about two years of professional experience specialized in transforming complex datasets into strategic business solutions. I've consistently delivered actionable insights that have driven key business decisions. Combining strong analytical skills with business acumen, I excel at identifying data patterns and translating them into clear, implementable strategies that solve real-world challenges.",
    "work_experience": [
      {
        "company": "@Really Great Tech",
        "title": "Data Analytics/AI/ML Engineer",
        "location": "",
        "start_date": "November 2023",
        "end_date": "October 2024",
        "responsibilities": [
          "Conducted Shapelet Analysis on trained machine learning models to interpret performance patterns and identify opportunities for optimization in a

### **TESTING DOCSTRANGE LOCALLY**

In [10]:
from docstrange import DocumentExtractor

# Initialize extractor (cloud mode by default)
extractor = DocumentExtractor()


# Convert any document to clean markdown
result = extractor.extract("./CVs/Power_BI_Developer.pdf")

markdown = result.extract_markdown()
print(markdown)

Failed to get markdown from cloud API: HTTPSConnectionPool(host='extraction-api.nanonets.com', port=443): Max retries exceeded with url: /extract (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000263A1FB1610>: Failed to resolve 'extraction-api.nanonets.com' ([Errno 11001] getaddrinfo failed)"))





In [None]:
from docstrange import DocumentExtractor

# Initialize extractor (cloud mode by default)
extractor = DocumentExtractor()

# CV Data Schema
resume_schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "email": {"type": "string"},
            "phone": {"type": "string"},
            "summary": {"type": "string"},
            "work_experience": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "company": {"type": "string"},
                        "title": {"type": "string"},
                        "location": {"type": "string"},
                        "start_date": {"type": "string"},
                        "end_date": {"type": "string"},
                        "responsibilities": {
                            "type": "array",
                            "items": {"type": "string"}
                        }
                    },
                    "required": ["company", "title"]
                }
            },
            "education": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "degree": {"type": "string"},
                        "field_of_study": {"type": "string"},
                        "institution": {"type": "string"},
                        "location": {"type": "string"},
                        "start_date": {"type": "string"},
                        "end_date": {"type": "string"}
                    },
                    "required": ["degree", "institution"]
                }
            },
            "skills": {
                "type": "array",
                "items": {"type": "string"}
            },
            "soft_skills": {
                "type": "array",
                "items": {"type": "string"}
            },
            "certifications": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "name": {"type": "string"},
                        "issuing_organization": {"type": "string"},
                        "date": {"type": "string"}
                    },
                    "required": ["name"]
                }
            },
            "projects": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "name": {"type": "string"},
                        "description": {"type": "string"},
                        "technologies": {
                            "type": "array",
                            "items": {"type": "string"}
                        }
                    },
                    "required": ["name"]
                }
            },
            "languages": {"type": "string"},
            "hobbies": {"type": "string"},
            "other": {"type": "string"}
        }
    }


# Convert any document to clean markdown
result = extractor.extract("./CVs/CV_Image.png")

structured = result.extract_data(json_schema=resume_schema)

structured

{'structured_data': {'name': 'Aidoo Enoch Kwadwo',
  'email': 'aidookenchkwadwo@gmail.com',
  'phone': '0240542834',
  'summary': "A Data Analyst with about two years of professional experience specialized in transforming complex datasets into strategic business solutions. I've consistently delivered actionable insights that have driven key business decisions. Combining strong analytical skills with business acumen, I excel at identifying data patterns and translating them into clear, implementable strategies that solve real-world challenges.",
  'work_experience': [{'company': '@Really Great Tech',
    'title': 'Data Analytics/AI/ML Engineer',
    'location': '',
    'start_date': 'November 2023',
    'end_date': 'October 2024',
    'responsibilities': ['Conducted Shapelet Analysis on trained machine learning models to interpret performance patterns and identify opportunities for optimization in an AI project.',
     'Created a dynamic Google Sheets pivot table to track projects, empl

In [12]:
from docstrange import DocumentExtractor
import json
import re

# Initialize extractor (cloud mode by default)
extractor = DocumentExtractor()

# CV Data Schema
resume_schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "email": {"type": "string"},
            "phone": {"type": "string"},
            "summary": {"type": "string"},
            "work_experience": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "company": {"type": "string"},
                        "title": {"type": "string"},
                        "location": {"type": "string"},
                        "start_date": {"type": "string"},
                        "end_date": {"type": "string"},
                        "responsibilities": {
                            "type": "array",
                            "items": {"type": "string"}
                        }
                    },
                    "required": ["company", "title"]
                }
            },
            "education": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "degree": {"type": "string"},
                        "field_of_study": {"type": "string"},
                        "institution": {"type": "string"},
                        "location": {"type": "string"},
                        "start_date": {"type": "string"},
                        "end_date": {"type": "string"}
                    },
                    "required": ["degree", "institution"]
                }
            },
            "skills": {
                "type": "array",
                "items": {"type": "string"}
            },
            "soft_skills": {
                "type": "array",
                "items": {"type": "string"}
            },
            "certifications": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "name": {"type": "string"},
                        "issuing_organization": {"type": "string"},
                        "date": {"type": "string"}
                    },
                    "required": ["name"]
                }
            },
            "projects": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "name": {"type": "string"},
                        "description": {"type": "string"},
                        "technologies": {
                            "type": "array",
                            "items": {"type": "string"}
                        }
                    },
                    "required": ["name"]
                }
            },
            "languages": {"type": "string"},
            "hobbies": {"type": "string"},
            "other": {"type": "string"}
        }
    }


# Extract from the document
result = extractor.extract("./CVs/Power_BI_Developer.pdf")

# Get the raw text containing the JSON
raw_output = result.extract_data(json_schema=resume_schema)

# The raw_output in case of error is a dictionary, we need the 'raw_content' from it.
# In case of success, it's already a dictionary (the parsed JSON).
if isinstance(raw_output, dict) and 'document' in raw_output and 'raw_content' in raw_output['document']:
    text_to_parse = raw_output['document']['raw_content']
else:
    # It might be already parsed correctly
    text_to_parse = json.dumps(raw_output)


# Find the start of the JSON object
json_start = text_to_parse.find('{')

if json_start != -1:
    # Extract the JSON part of the string
    json_string = text_to_parse[json_start:]
    
    try:
        # Parse the cleaned JSON string
        structured_data = json.loads(json_string)
        
        # Ensure optional fields are present
        optional_fields = ["languages", "hobbies", "other"]
        if isinstance(structured_data, dict):
            for field in optional_fields:
                if field not in structured_data:
                    structured_data[field] = ""
        
        print("Successfully parsed structured data:")
        print(json.dumps(structured_data, indent=2))
    except json.JSONDecodeError as e:
        print(f"Failed to parse JSON after cleaning: {e}")
        print("Problematic JSON string:", json_string)
else:
    print("Could not find a JSON object in the output.")
    print("Raw output:", text_to_parse)


Failed to parse JSON content: Expecting value: line 2 column 1 (char 1)


Successfully parsed structured data:
{
  "name": "Enoch Kwadwo Aidoo",
  "email": "aidooenochkwadwo@gmail.com",
  "phone": "0240542834",
  "summary": "A Data Analyst with about two years of professional experience specialized in transforming complex datasets into strategic business solutions. I've consistently delivered actionable insights that have driven key business decisions. Combining strong analytical skills with business acumen, I excel at identifying data patterns and translating them into clear, implementable strategies that solve real-world challenges.",
  "work_experience": [
    {
      "company": "@Really Great Tech",
      "title": "Data Analytics/AI/ML Engineer",
      "location": "",
      "start_date": "November 2023",
      "end_date": "October 2024",
      "responsibilities": [
        "Conducted Shapelet Analysis on trained machine learning models to interpret performance patterns and identify opportunities for optimization in the Theta Tech AI project.",
        "C