In [1]:
from docstrange import DocumentExtractor
import json

# CV data schema (JSON Schema). Plain-text fields are generated from tuples of
# names; every field gets its own {"type": "string"} node and the key order
# matches the fully hand-written literal this replaces.
resume_schema = {
    "type": "object",
    "properties": {
        # Contact details and headline summary.
        **{key: {"type": "string"} for key in ("name", "email", "phone", "summary")},
        # One record per position held.
        "work_experience": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    **{
                        key: {"type": "string"}
                        for key in ("company", "title", "location", "start_date", "end_date")
                    },
                    "responsibilities": {"type": "array", "items": {"type": "string"}},
                },
                "required": ["company", "title"],
            },
        },
        # One record per degree or programme.
        "education": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    key: {"type": "string"}
                    for key in (
                        "degree", "field_of_study", "institution",
                        "location", "start_date", "end_date",
                    )
                },
                "required": ["degree", "institution"],
            },
        },
        "skills": {"type": "array", "items": {"type": "string"}},
        "soft_skills": {"type": "array", "items": {"type": "string"}},
        "certifications": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    key: {"type": "string"}
                    for key in ("name", "issuing_organization", "date")
                },
                "required": ["name"],
            },
        },
        "projects": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    **{key: {"type": "string"} for key in ("name", "description")},
                    "technologies": {"type": "array", "items": {"type": "string"}},
                },
                "required": ["name"],
            },
        },
        # Free-text catch-alls: single strings, not lists.
        **{key: {"type": "string"} for key in ("languages", "hobbies", "other")},
    },
}

class CVExtractor:
    """Thin wrapper around docstrange's ``DocumentExtractor`` for resume parsing."""

    def __init__(self):
        # One extractor instance is created up front and reused for every call.
        self.extractor = DocumentExtractor()

    def extract(self, document_path, json_schema=None):
        """
        Extract structured resume data from a document.

        Args:
            document_path: Path of the CV file handed to the underlying extractor.
            json_schema: Optional JSON schema describing the fields to extract.
                When omitted, the module-level ``resume_schema`` is used, which
                is what this method did before the parameter existed.

        Returns:
            The value returned by ``extract_data`` for the document, or ``None``
            if the file is missing or any other error occurs. Errors are
            printed rather than raised.
        """
        try:
            result = self.extractor.extract(document_path)
            # The default schema is looked up lazily and inside the try block,
            # so a missing default is reported like any other extraction error.
            schema = resume_schema if json_schema is None else json_schema
            return result.extract_data(json_schema=schema)
        except FileNotFoundError:
            print(f"Error: Document not found at '{document_path}'. Please provide a valid file path.")
            return None
        except Exception as e:
            # Deliberate best-effort: the caller gets None instead of a traceback.
            print(f"An error occurred during extraction: {e}")
            return None

# Demo run: push one PDF resume through CVExtractor and pretty-print the result.
if __name__ == "__main__":
    document_path = './CVs/Power_BI_Developer.pdf'
    extractor = CVExtractor()
    data = extractor.extract(document_path)
    # A single print with a newline separator emits the heading and the
    # indented JSON on separate lines, just as two print calls would.
    print("Returned structured data:", json.dumps(data, indent=2), sep="\n")

Failed to parse JSON content: Expecting value: line 2 column 1 (char 1)


Returned structured data:
{
  "document": {
    "raw_content": "\n## Page 1\n\n{\n  \"name\": \"Enoch Kwadwo Aidoo\",\n  \"email\": \"aidooenochkwadwo@gmail.com\",\n  \"phone\": \"0240542834\",\n  \"summary\": \"A Data Analyst with about two years of professional experience specialized in transforming complex datasets into strategic business solutions. I've consistently delivered actionable insights that have driven key business decisions. Combining strong analytical skills with business acumen, I excel at identifying data patterns and translating them into clear, implementable strategies that solve real-world challenges.\",\n  \"work_experience\": [\n    {\n      \"company\": \"@Really Great Tech\",\n      \"title\": \"Data Analytics/AI/ML Engineer\",\n      \"location\": \"\",\n      \"start_date\": \"November 2023\",\n      \"end_date\": \"October 2024\",\n      \"responsibilities\": [\n        \"Conducted Shapelet Analysis on trained machine learning models to interpret performance

In [2]:
if __name__ == "__main__":
    # Same demo as the PDF run, this time with an image (PNG) of a CV.
    document_path = './CVs/CV_Image.png'
    extractor = CVExtractor()
    data = extractor.extract(document_path)
    # Heading and indented JSON go out through one print, newline-separated.
    print("Returned structured data:", json.dumps(data, indent=2), sep="\n")

Returned structured data:
{
  "structured_data": {
    "name": "Aidoo Enoch Kwadwo",
    "email": "aidookenchkwadwo@gmail.com",
    "phone": "0240542834",
    "summary": "A Data Analyst with about two years of professional experience specialized in transforming complex datasets into strategic business solutions. I've consistently delivered actionable insights that have driven key business decisions. Combining strong analytical skills with business acumen, I excel at identifying data patterns and translating them into clear, implementable strategies that solve real-world challenges.",
    "work_experience": [
      {
        "company": "@Really Great Tech",
        "title": "Data Analytics/AI/ML Engineer",
        "location": "",
        "start_date": "November 2023",
        "end_date": "October 2024",
        "responsibilities": [
          "Conducted Shapelet Analysis on trained machine learning models to interpret performance patterns and identify opportunities for optimization in a

### **TESTING DOCSTRANGE LOCALLY**

In [9]:
from docstrange import DocumentExtractor

# Create a docstrange extractor with its default settings.
# (Original author's note: cloud mode by default - not verifiable from this notebook.)
extractor = DocumentExtractor()


# Run the extractor on the image CV, then render the result as markdown.
result = extractor.extract("./../data/CVs/CV_Image.png")

# No schema is involved here: this cell only checks the markdown rendering.
markdown = result.extract_markdown()
print(markdown)

Error checking GPU availability: Only a single TORCH_LIBRARY can be used to register the namespace prims; please put all of your definitions in a single TORCH_LIBRARY block.  If you were trying to specify implementations, consider using TORCH_LIBRARY_IMPL (which can be duplicated).  If you really intended to define operators for a single namespace in a distributed way, you can use TORCH_LIBRARY_FRAGMENT to explicitly indicate this.  Previous registration of TORCH_LIBRARY was registered at /dev/null:488; latest registration was registered at /dev/null:488


# Aidoo Enoch Kwadwo
## Data Analyst

## Personal Info
Phone: 0240542834
Email: aidookenchkwadwo@gmail.com
Location: Kumasi, Ghana

## Qualities
- Curiosity
- Problem Solving
- System Understanding
- Technical Skills
- Analytical Thinking
- Problem Solving Skills
- Teamwork
- Initiative and Self-motivation
- Discipline and Resilient

## Key Skills
**Tools:** Python, R, AWS, Microsoft Excel, Google Sheets, Power BI, SQL
**Packages/Frameworks:** NumPy, Pandas, Scikit-Learn, Matplotlib, Pytorch
**Machine Learning:** Data Analysis, Classification Modeling, Deep Neural Networks, Regression Modelling, MLOps, Computer Vision, Natural Language Processing, Recommendation Systems

## About Me
A Data Analyst with about two years of professional experience specialized in transforming complex datasets into strategic business solutions. I've consistently delivered actionable insights that have driven key business decisions. Combining strong analytical skills with business acumen, I excel at identify

In [6]:
from docstrange import DocumentExtractor

# Create a docstrange extractor with its default settings.
# (Original author's note: cloud mode by default - not verifiable from this notebook.)
extractor = DocumentExtractor()

# CV data schema (JSON Schema). Plain-text fields are generated from tuples of
# names; every field gets its own {"type": "string"} node and the key order
# matches the fully hand-written literal this replaces.
resume_schema = {
    "type": "object",
    "properties": {
        # Contact details and headline summary.
        **{key: {"type": "string"} for key in ("name", "email", "phone", "summary")},
        # One record per position held.
        "work_experience": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    **{
                        key: {"type": "string"}
                        for key in ("company", "title", "location", "start_date", "end_date")
                    },
                    "responsibilities": {"type": "array", "items": {"type": "string"}},
                },
                "required": ["company", "title"],
            },
        },
        # One record per degree or programme.
        "education": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    key: {"type": "string"}
                    for key in (
                        "degree", "field_of_study", "institution",
                        "location", "start_date", "end_date",
                    )
                },
                "required": ["degree", "institution"],
            },
        },
        "skills": {"type": "array", "items": {"type": "string"}},
        "soft_skills": {"type": "array", "items": {"type": "string"}},
        "certifications": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    key: {"type": "string"}
                    for key in ("name", "issuing_organization", "date")
                },
                "required": ["name"],
            },
        },
        "projects": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    **{key: {"type": "string"} for key in ("name", "description")},
                    "technologies": {"type": "array", "items": {"type": "string"}},
                },
                "required": ["name"],
            },
        },
        # Free-text catch-alls: single strings, not lists.
        **{key: {"type": "string"} for key in ("languages", "hobbies", "other")},
    },
}


# Run the extractor on the Flutter developer CV (PDF).
result = extractor.extract("./../data/CVs/flutter_cv.pdf")

# Request the structured fields described by resume_schema (not markdown).
# In the recorded run this returned a {'document': {'raw_content': ...}}
# wrapper after a "Failed to parse JSON content: Extra data" message.
structured = result.extract_data(json_schema=resume_schema)

# Bare last expression so the notebook displays the returned value.
structured

Failed to parse JSON content: Extra data: line 4 column 1 (char 1707)


{'document': {'raw_content': '{"name": "Ransford Owusu-Ansah", "email": "ransfordowusuansah9@gmail.com", "phone": "(+233)559-529-436", "summary": "Passionate and result-driven application developer specialized in mobile application development with four years of experience. Proficient in Flutter for building scalable, responsive and adaptive applications, with more knowledge building great tools with Kotlin, Express.js for backend systems and Arduino in embedded systems.", "work_experience": [{"company": "Ebits, Adum", "title": "Application Developer Service Personnel", "location": null, "start_date": "October 2024", "end_date": "September 2025", "responsibilities": ["Collaborated with senior developers to create enterprise applications", "Improved and updated company mobile applications codebase to updated versions", "Created responsive and adaptive mobile applications for various platforms and screen orientations"]}, {"company": "Ebits, Aprade", "title": "Application Developer Intern

In [7]:
from docstrange import DocumentExtractor
import json
import re

def _dedupe_preserving_order(items):
    """Return ``items`` without repeats, keeping the first occurrence of each."""
    seen = set()
    unique = []
    for item in items:
        try:
            hash(item)
            key = item
        except TypeError:
            # Unhashable entries (e.g. a dict where a string was expected) are
            # compared by their repr so they cannot crash the merge.
            key = ("__unhashable__", repr(item))
        if key not in seen:
            seen.add(key)
            unique.append(item)
    return unique


def merge_cv_json_objects(json_objects):
    """
    Merge multiple JSON objects from different pages into a single complete CV.

    Strategy:
    - Scalar fields take the first truthy value found, in page order.
    - Record lists (work_experience, education, certifications, projects)
      are concatenated in page order.
    - skills and soft_skills are concatenated, then de-duplicated while
      keeping first-seen order so the result is stable between runs.

    Args:
        json_objects: Iterable of decoded per-page objects. Entries that are
            not dicts are skipped.

    Returns:
        A dict with every CV field present. ``languages``, ``hobbies`` and
        ``other`` default to ``""``; the other scalar fields stay ``None``
        when no page supplied them.
    """
    scalar_fields = ["name", "email", "phone", "summary", "languages", "hobbies", "other"]
    record_fields = ["work_experience", "education", "certifications", "projects"]
    string_list_fields = ["skills", "soft_skills"]

    merged = {
        "name": None,
        "email": None,
        "phone": None,
        "summary": None,
        "work_experience": [],
        "education": [],
        "skills": [],
        "soft_skills": [],
        "certifications": [],
        "projects": [],
        "languages": None,
        "hobbies": None,
        "other": None
    }

    for obj in json_objects:
        if not isinstance(obj, dict):
            # Nothing to read fields from; ignore instead of failing on .get().
            continue

        # Merge scalar fields (take first non-null value)
        for field in scalar_fields:
            if merged[field] is None and obj.get(field):
                merged[field] = obj[field]

        # Merge list fields (concatenate in page order)
        for field in record_fields + string_list_fields:
            value = obj.get(field)
            if not value:
                continue
            if isinstance(value, (list, tuple)):
                merged[field].extend(value)
            else:
                # A bare value where a list was expected is kept as one entry;
                # list.extend would split a string into single characters.
                merged[field].append(value)

    # Deduplicate simple arrays without scrambling their order
    for field in string_list_fields:
        merged[field] = _dedupe_preserving_order(merged[field])

    # Convert None to empty string for optional fields
    for field in ["languages", "hobbies", "other"]:
        if merged[field] is None:
            merged[field] = ""

    return merged


def parse_multi_object_json(raw_text):
    """
    Parse output that may contain multiple JSON objects separated by page breaks.

    The text is split on page-break comments and runs of blank lines. Every
    JSON object found in each piece is decoded, including several objects
    that share a piece because they are separated by less than a page break.

    Args:
        raw_text: Text holding one or more JSON objects, possibly surrounded
            by non-JSON text such as page headings.

    Returns:
        The decoded object itself when exactly one is found; otherwise the
        result of ``merge_cv_json_objects`` over all decoded objects.

    Raises:
        ValueError: If no JSON object could be decoded.
    """
    # Split by page break markers
    page_break_patterns = [
        r'<!-- Page Break.*?-->',
        r'\n\n\n+',  # Multiple newlines
    ]

    # Combine patterns into one regex
    split_pattern = '|'.join(f'(?:{p})' for p in page_break_patterns)
    chunks = re.split(split_pattern, raw_text, flags=re.IGNORECASE)

    decoder = json.JSONDecoder()
    json_objects = []

    for chunk in chunks:
        chunk = chunk.strip()

        # Walk the chunk from one opening brace to the next. raw_decode reports
        # how much text each object consumed, which tells us where to resume.
        position = chunk.find('{')
        while position != -1:
            remaining = chunk[position:]
            try:
                obj, consumed = decoder.raw_decode(remaining)
            except json.JSONDecodeError as e:
                # Give up on the rest of this chunk: scanning inside broken JSON
                # would surface nested fragments as if they were whole CVs.
                print(f"Warning: Could not parse JSON chunk: {e}")
                break
            json_objects.append(obj)
            position = chunk.find('{', position + consumed)

    if not json_objects:
        raise ValueError("No valid JSON objects found in the output")

    # If only one object, return it directly
    if len(json_objects) == 1:
        return json_objects[0]

    # Otherwise, merge multiple objects
    print(f"Found {len(json_objects)} JSON objects across pages. Merging...")
    return merge_cv_json_objects(json_objects)


# Create a docstrange extractor with its default settings; the cell reuses it below.
extractor = DocumentExtractor()

# CV data schema (JSON Schema). Plain-text fields are generated from tuples of
# names; every field gets its own {"type": "string"} node and the key order
# matches the fully hand-written literal this replaces.
resume_schema = {
    "type": "object",
    "properties": {
        # Contact details and headline summary.
        **{key: {"type": "string"} for key in ("name", "email", "phone", "summary")},
        # One record per position held.
        "work_experience": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    **{
                        key: {"type": "string"}
                        for key in ("company", "title", "location", "start_date", "end_date")
                    },
                    "responsibilities": {"type": "array", "items": {"type": "string"}},
                },
                "required": ["company", "title"],
            },
        },
        # One record per degree or programme.
        "education": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    key: {"type": "string"}
                    for key in (
                        "degree", "field_of_study", "institution",
                        "location", "start_date", "end_date",
                    )
                },
                "required": ["degree", "institution"],
            },
        },
        "skills": {"type": "array", "items": {"type": "string"}},
        "soft_skills": {"type": "array", "items": {"type": "string"}},
        "certifications": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    key: {"type": "string"}
                    for key in ("name", "issuing_organization", "date")
                },
                "required": ["name"],
            },
        },
        "projects": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    **{key: {"type": "string"} for key in ("name", "description")},
                    "technologies": {"type": "array", "items": {"type": "string"}},
                },
                "required": ["name"],
            },
        },
        # Free-text catch-alls: single strings, not lists.
        **{key: {"type": "string"} for key in ("languages", "hobbies", "other")},
    },
}

# Extract from the document
result = extractor.extract("../data/CVs/flutter_cv.pdf")

# Ask docstrange for the fields described by resume_schema.
raw_output = result.extract_data(json_schema=resume_schema)

# Normalise the shapes extract_data has returned in this notebook into one
# string that parse_multi_object_json can consume.
if isinstance(raw_output, dict) and 'document' in raw_output and 'raw_content' in raw_output['document']:
    # Fallback shape: the library could not parse the text and hands it back raw.
    text_to_parse = raw_output['document']['raw_content']
elif isinstance(raw_output, dict) and isinstance(raw_output.get('structured_data'), dict):
    # Success shape: the CV fields arrive wrapped under 'structured_data'.
    # Unwrap them so the summary below reads the CV fields, not the wrapper.
    text_to_parse = json.dumps(raw_output['structured_data'])
elif isinstance(raw_output, str):
    text_to_parse = raw_output
else:
    text_to_parse = json.dumps(raw_output)

# Decode every JSON object in the text and merge them into one CV record.
print("Parsing multi-object JSON output...")
try:
    structured_data = parse_multi_object_json(text_to_parse)

    print("\n" + "="*60)
    print("✅ Successfully parsed and merged structured CV data:")
    print("="*60)
    print(json.dumps(structured_data, indent=2))

    print("\n" + "="*60)
    print("📊 Summary:")
    print("="*60)
    print(f"Name: {structured_data.get('name')}")
    print(f"Email: {structured_data.get('email')}")
    print(f"Phone: {structured_data.get('phone')}")
    print(f"Work Experience: {len(structured_data.get('work_experience', []))} entries")
    print(f"Education: {len(structured_data.get('education', []))} entries")
    print(f"Skills: {len(structured_data.get('skills', []))} skills")
    print(f"Projects: {len(structured_data.get('projects', []))} projects")

except Exception as e:
    # Keep the notebook running and show the start of the offending text.
    print(f"❌ Error parsing CV: {e}")
    print("\nRaw text preview:")
    print(text_to_parse[:1000])

Failed to parse JSON content: Extra data: line 4 column 1 (char 1797)


Parsing multi-object JSON output...
Found 3 JSON objects across pages. Merging...

‚úÖ Successfully parsed and merged structured CV data:
{
  "name": "Ransford Owusu-Ansah",
  "email": "ransfordowusuansah9@gmail.com",
  "phone": "(+233)559-529-436",
  "summary": "Passionate and result-driven application developer specialized in mobile application development with four years of experience. Proficient in Flutter for building scalable, responsive and adaptive applications, with more knowledge building great tools with Kotlin, Express.js for backend systems and Arduino in embedded systems.",
  "work_experience": [
    {
      "company": "Ebits",
      "title": "Application Developer Service Personnel",
      "location": "Adum",
      "start_date": "2024-10",
      "end_date": "2025-09",
      "responsibilities": [
        "Collaborated with senior developers to create enterprise applications",
        "Improved and updated company mobile applications codebase to updated versions",
        

In [None]:
from docstrange import DocumentExtractor
import json
import re

def _dedupe_preserving_order(items):
    """Return ``items`` without repeats, keeping the first occurrence of each."""
    seen = set()
    unique = []
    for item in items:
        try:
            hash(item)
            key = item
        except TypeError:
            # Unhashable entries (e.g. a dict where a string was expected) are
            # compared by their repr so they cannot crash the merge.
            key = ("__unhashable__", repr(item))
        if key not in seen:
            seen.add(key)
            unique.append(item)
    return unique


def merge_cv_json_objects(json_objects):
    """
    Merge multiple JSON objects from different pages into a single complete CV.

    Strategy:
    - Take truthy scalar fields from the first object that has them
    - Concatenate record lists (work_experience, education, certifications,
      projects) in page order
    - Concatenate skills and soft_skills, then remove duplicates while
      keeping first-seen order so the result is stable between runs

    Args:
        json_objects: Iterable of decoded per-page objects. Entries that are
            not dicts are skipped.

    Returns:
        A dict with every CV field present. ``languages``, ``hobbies`` and
        ``other`` default to ``""``; the other scalar fields stay ``None``
        when no page supplied them.
    """
    scalar_fields = ["name", "email", "phone", "summary", "languages", "hobbies", "other"]
    record_fields = ["work_experience", "education", "certifications", "projects"]
    string_list_fields = ["skills", "soft_skills"]

    merged = {
        "name": None,
        "email": None,
        "phone": None,
        "summary": None,
        "work_experience": [],
        "education": [],
        "skills": [],
        "soft_skills": [],
        "certifications": [],
        "projects": [],
        "languages": None,
        "hobbies": None,
        "other": None
    }

    for obj in json_objects:
        if not isinstance(obj, dict):
            # Nothing to read fields from; ignore instead of failing on .get().
            continue

        # Merge scalar fields (take first non-null value)
        for field in scalar_fields:
            if merged[field] is None and obj.get(field):
                merged[field] = obj[field]

        # Merge list fields (concatenate in page order)
        for field in record_fields + string_list_fields:
            value = obj.get(field)
            if not value:
                continue
            if isinstance(value, (list, tuple)):
                merged[field].extend(value)
            else:
                # A bare value where a list was expected is kept as one entry;
                # list.extend would split a string into single characters.
                merged[field].append(value)

    # Deduplicate simple arrays without scrambling their order
    for field in string_list_fields:
        merged[field] = _dedupe_preserving_order(merged[field])

    # Convert None to empty string for optional fields
    for field in ["languages", "hobbies", "other"]:
        if merged[field] is None:
            merged[field] = ""

    return merged


def parse_multi_object_json(raw_text):
    """
    Parse output that may contain multiple JSON objects separated by page breaks.

    The text is split on page-break comments and runs of blank lines. Every
    JSON object found in each piece is decoded, including several objects
    that share a piece because they are separated by less than a page break.

    Args:
        raw_text: Text holding one or more JSON objects, possibly surrounded
            by non-JSON text such as page headings.

    Returns:
        The decoded object itself when exactly one is found; otherwise the
        result of ``merge_cv_json_objects`` over all decoded objects.

    Raises:
        ValueError: If no JSON object could be decoded.
    """
    # Split by page break markers
    page_break_patterns = [
        r'<!-- Page Break.*?-->',
        r'\n\n\n+',  # Multiple newlines
    ]

    # Combine patterns into one regex
    split_pattern = '|'.join(f'(?:{p})' for p in page_break_patterns)
    chunks = re.split(split_pattern, raw_text, flags=re.IGNORECASE)

    decoder = json.JSONDecoder()
    json_objects = []

    for chunk in chunks:
        chunk = chunk.strip()

        # Walk the chunk from one opening brace to the next. raw_decode reports
        # how much text each object consumed, which tells us where to resume.
        position = chunk.find('{')
        while position != -1:
            remaining = chunk[position:]
            try:
                obj, consumed = decoder.raw_decode(remaining)
            except json.JSONDecodeError as e:
                # Give up on the rest of this chunk: scanning inside broken JSON
                # would surface nested fragments as if they were whole CVs.
                print(f"Warning: Could not parse JSON chunk: {e}")
                print(f"Chunk preview: {remaining[:200]}...")
                break
            json_objects.append(obj)
            position = chunk.find('{', position + consumed)

    if not json_objects:
        raise ValueError("No valid JSON objects found in the output")

    # If only one object, return it directly
    if len(json_objects) == 1:
        return json_objects[0]

    # Otherwise, merge multiple objects
    print(f"Found {len(json_objects)} JSON objects across pages. Merging...")
    return merge_cv_json_objects(json_objects)


# Create a docstrange extractor with its default settings; the cell reuses it below.
extractor = DocumentExtractor()

# CV data schema (JSON Schema). Plain-text fields are generated from tuples of
# names; every field gets its own {"type": "string"} node and the key order
# matches the fully hand-written literal this replaces.
resume_schema = {
    "type": "object",
    "properties": {
        # Contact details and headline summary.
        **{key: {"type": "string"} for key in ("name", "email", "phone", "summary")},
        # One record per position held.
        "work_experience": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    **{
                        key: {"type": "string"}
                        for key in ("company", "title", "location", "start_date", "end_date")
                    },
                    "responsibilities": {"type": "array", "items": {"type": "string"}},
                },
                "required": ["company", "title"],
            },
        },
        # One record per degree or programme.
        "education": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    key: {"type": "string"}
                    for key in (
                        "degree", "field_of_study", "institution",
                        "location", "start_date", "end_date",
                    )
                },
                "required": ["degree", "institution"],
            },
        },
        "skills": {"type": "array", "items": {"type": "string"}},
        "soft_skills": {"type": "array", "items": {"type": "string"}},
        "certifications": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    key: {"type": "string"}
                    for key in ("name", "issuing_organization", "date")
                },
                "required": ["name"],
            },
        },
        "projects": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    **{key: {"type": "string"} for key in ("name", "description")},
                    "technologies": {"type": "array", "items": {"type": "string"}},
                },
                "required": ["name"],
            },
        },
        # Free-text catch-alls: single strings, not lists.
        **{key: {"type": "string"} for key in ("languages", "hobbies", "other")},
    },
}

# Test with the problematic CV (its recorded extraction came back as several
# JSON objects, which the library itself failed to parse).
document_path = "../data/CVs/flutter_cv.pdf"

print("Extracting CV data...")
result = extractor.extract(document_path)

# Ask docstrange for the fields described by resume_schema.
raw_output = result.extract_data(json_schema=resume_schema)

# Normalise the shapes extract_data has returned in this notebook into one
# string that parse_multi_object_json can consume.
if isinstance(raw_output, dict) and 'document' in raw_output and 'raw_content' in raw_output['document']:
    # Fallback shape: the library could not parse the text and hands it back raw.
    text_to_parse = raw_output['document']['raw_content']
elif isinstance(raw_output, dict) and isinstance(raw_output.get('structured_data'), dict):
    # Success shape: the CV fields arrive wrapped under 'structured_data'.
    # Unwrap them so the summary below reads the CV fields, not the wrapper.
    text_to_parse = json.dumps(raw_output['structured_data'])
elif isinstance(raw_output, str):
    text_to_parse = raw_output
else:
    text_to_parse = json.dumps(raw_output)

# Decode every JSON object in the text and merge them into one CV record.
print("Parsing multi-object JSON output...")
try:
    structured_data = parse_multi_object_json(text_to_parse)

    print("\n" + "="*60)
    print("Successfully parsed and merged structured CV data:")
    print("="*60)
    print(json.dumps(structured_data, indent=2))

    print("\n" + "="*60)
    print("Summary:")
    print("="*60)
    print(f"Name: {structured_data.get('name')}")
    print(f"Email: {structured_data.get('email')}")
    print(f"Phone: {structured_data.get('phone')}")
    print(f"Work Experience: {len(structured_data.get('work_experience', []))} entries")
    print(f"Education: {len(structured_data.get('education', []))} entries")
    print(f"Skills: {len(structured_data.get('skills', []))} skills")
    print(f"Projects: {len(structured_data.get('projects', []))} projects")

except Exception as e:
    # Keep the notebook running and show the start of the offending text.
    print(f"Error parsing CV: {e}")
    print("\nRaw text preview:")
    print(text_to_parse[:1000])

## Robust JSON Parser for Multi-Page CVs

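The preceding code cell handles the case where docstrange returns multiple JSON objects (one per page) separated by page-break markers: `parse_multi_object_json` decodes each object and `merge_cv_json_objects` combines them into a single CV record.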

## Local Ollama Configuration

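Testing docstrange with a local Ollama model (`llama3.2:latest`). **Note:** the code cell that follows currently creates `DocumentExtractor(model="OpenAI")` instead, and its recorded run failed with a `400 Bad Request` from the cloud extraction API.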

In [6]:
from docstrange import DocumentExtractor
import json

# Create the extractor with model="OpenAI".
# NOTE(review): the section heading for this cell mentions a local Ollama model
# (llama3.2:latest), and the recorded run failed with HTTP 400 from the cloud
# extraction API - confirm which model was intended.
extractor = DocumentExtractor(model="OpenAI")

# CV data schema (JSON Schema). Plain-text fields are generated from tuples of
# names; every field gets its own {"type": "string"} node and the key order
# matches the fully hand-written literal this replaces.
resume_schema = {
    "type": "object",
    "properties": {
        # Contact details and headline summary.
        **{key: {"type": "string"} for key in ("name", "email", "phone", "summary")},
        # One record per position held.
        "work_experience": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    **{
                        key: {"type": "string"}
                        for key in ("company", "title", "location", "start_date", "end_date")
                    },
                    "responsibilities": {"type": "array", "items": {"type": "string"}},
                },
                "required": ["company", "title"],
            },
        },
        # One record per degree or programme.
        "education": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    key: {"type": "string"}
                    for key in (
                        "degree", "field_of_study", "institution",
                        "location", "start_date", "end_date",
                    )
                },
                "required": ["degree", "institution"],
            },
        },
        "skills": {"type": "array", "items": {"type": "string"}},
        "soft_skills": {"type": "array", "items": {"type": "string"}},
        "certifications": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    key: {"type": "string"}
                    for key in ("name", "issuing_organization", "date")
                },
                "required": ["name"],
            },
        },
        "projects": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    **{key: {"type": "string"} for key in ("name", "description")},
                    "technologies": {"type": "array", "items": {"type": "string"}},
                },
                "required": ["name"],
            },
        },
        # Free-text catch-alls: single strings, not lists.
        **{key: {"type": "string"} for key in ("languages", "hobbies", "other")},
    },
}

# Try the extractor configured above on the image CV.
document_path = "./../data/CVs/CV_Image.png"

print("Extracting with OpenAI cloud model...")
result = extractor.extract(document_path)

# Request the fields described by resume_schema.
structured_data = result.extract_data(json_schema=resume_schema)

# Heading (preceded by a blank line) and the indented JSON, newline-separated.
print("\nStructured CV Data:", json.dumps(structured_data, indent=2), sep="\n")

Extracting with OpenAI cloud model...


Error checking GPU availability: Only a single TORCH_LIBRARY can be used to register the namespace prims; please put all of your definitions in a single TORCH_LIBRARY block.  If you were trying to specify implementations, consider using TORCH_LIBRARY_IMPL (which can be duplicated).  If you really intended to define operators for a single namespace in a distributed way, you can use TORCH_LIBRARY_FRAGMENT to explicitly indicate this.  Previous registration of TORCH_LIBRARY was registered at /dev/null:488; latest registration was registered at /dev/null:488
Failed to get specified-json from cloud API: 400 Client Error: Bad Request for url: https://extraction-api.nanonets.com/extract
Failed to parse JSON content: Expecting value: line 1 column 1 (char 0)
Failed to get specified-json from cloud API: 400 Client Error: Bad Request for url: https://extraction-api.nanonets.com/extract
Failed to parse JSON content: Expecting value: line 1 column 1 (char 0)



Structured CV Data:
{
  "document": {
    "raw_content": ""
  },
  "format": "json_parse_error",
  "error": "Expecting value: line 1 column 1 (char 0)"
}


In [None]:
# Sanity check: skip schema extraction and only render the image CV to markdown.
# `extractor` is not created in this cell; it must already exist from an earlier one.
result = extractor.extract("./../data/CVs/CV_Image.png")
markdown = result.extract_markdown()

# Heading, then only the first 500 characters as a preview.
print("Extracted Markdown:", markdown[:500], sep="\n")