## **USING LLAMAPARSE**

In [1]:
from dotenv import load_dotenv
import os

In [None]:
from resume_parser import extract_resume, print_resume_data

# Example usage:
data = extract_resume("./CV_Image.png")
print_resume_data(data)

In [2]:
load_dotenv()
mykey = os.getenv("llamaParse")

In [4]:
from llama_cloud_services import LlamaExtract
from pydantic import BaseModel, Field
from typing import List, Optional
from dotenv import load_dotenv
import json


# Initialize client
extractor = LlamaExtract(mykey)

# Define schema
class Experience(BaseModel):
    company: str = Field(description="Company name")
    role: str = Field(description="Job title")
    start_date: Optional[str] = Field(description="Start date", default=None)
    end_date: Optional[str] = Field(description="End date", default=None)
    description: Optional[str] = Field(description="Job description", default=None)

class Education(BaseModel):
    institution: str = Field(description="School or university")
    degree: Optional[str] = Field(description="Degree or qualification", default=None)
    start_date: Optional[str] = Field(description="Start date", default=None)
    end_date: Optional[str] = Field(description="End date", default=None)

class Resume(BaseModel):
    name: Optional[str] = Field(description="Full name", default=None)
    email: Optional[str] = Field(description="Email address", default=None)
    phone: Optional[str] = Field(description="Phone number", default=None)
    summary: Optional[str] = Field(description="Profile summary", default=None)
    work_experience: Optional[List[Experience]] = Field(description="List of work experiences", default=None)
    education: Optional[List[Education]] = Field(description="Educational background", default=None)
    skills: Optional[List[str]] = Field(description="Technical skills", default=None)
    soft_skills: Optional[List[str]] = Field(description="Soft skills like problem solving, teamwork, etc.", default=None)
    certifications: Optional[List[str]] = Field(description="Certifications", default=None)
    projects: Optional[List[str]] = Field(description="Projects", default=None)
    languages: Optional[List[str]] = Field(description="Languages", default=None)
    hobbies: Optional[List[str]] = Field(description="Hobbies or interests", default=None)
    other: Optional[str] = Field(description="Other relevant info", default=None)


In [5]:
# Create extraction agent
agent = extractor.create_agent(name="resume-parser_v2", data_schema=Resume)

In [6]:
# Extract structured data
result = agent.extract("./CV_Image.png")
print(json.dumps(result.data, indent=2))

Uploading files: 100%|██████████| 1/1 [00:02<00:00,  2.39s/it]
Creating extraction jobs:   0%|          | 0/1 [00:00<?, ?it/s]


ApiError: status_code: 429, body: {'detail': "You've exceeded the maximum number of credits for your plan."}

In [None]:
# Extract structured data
result = agent.extract("/content/Data_Analyst3_CV.docx")
print(json.dumps(result.data, indent=2))

Uploading files: 100%|██████████| 1/1 [00:00<00:00,  1.32it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.50s/it]
Extracting files: 100%|██████████| 1/1 [00:27<00:00, 27.73s/it]

{
  "name": "Aidoo Enoch Kwadwo",
  "email": "aidooenochkwadwo@gmail.com",
  "phone": "0240542834",
  "summary": "A Data Analyst with about two years of professional experience specialized in transforming complex datasets into strategic business solutions. I've consistently delivered actionable insights that have driven key business decisions. Combining strong analytical skills with business acumen, I excel at identifying data patterns and translating them into clear, implementable strategies that solve real-world challenges",
  "work_experience": [
    {
      "company": "Really Great Tech",
      "role": "Data Analytics/AI/ML Engineer",
      "start_date": "November 2023",
      "end_date": "October 2024",
      "description": "Conducted Shapelet Analysis on trained machine learning models to interpret performance patterns and identify opportunities for optimization in an AI project. Created a dynamic Google Sheets pivot table to track projects, employee assignments, and mentorship r





 - 'instructions' is not a valid parameter. Please check the documentation or update the package.


Started parsing the file under job_id 0bdc6e62-85fe-43a3-ba50-af17b8e2c331
Error while parsing the file '/content/Data_Analyst3_CV.pdf': 'json'
[extract_and_structure_cv] Failed: list index out of range
{}


In [12]:
# Install: pip install docstrange
from docstrange import DocumentExtractor

# Authenticated access - run 'docstrange login' first
extractor = DocumentExtractor()  # Auto-uses cached credentials

# Extract from your file
result = extractor.extract("./Data_Analyst3_CV.pdf")

# Get markdown output
markdown_content = result.extract_markdown()

In [13]:
text_content = result.extract_text()

In [14]:
text_content

"\n## Page 1\n\n# Aidoo Enoch Kwadwo\n## Data Analyst\n\n## Personal Info\n**Phone**\n0240542834\n\n**Email**\naidooenochkwadwo@gmail.com\n\n**Kumasi, Ghana**\n\n## Qualities\n* Curiosity\n* Problem Solving\n* System Understanding\n* Technical Skills\n* Analytical Thinking\n* Problem Solving Skills\n* Teamwork\n* Initiative and Self-motivation\n* Discipline and Resilient\n\n## Key Skills\n**Tools:** Python, R, AWS, Microsoft Excel, Google Sheets, Power BI, SQL\n**Packages/Frameworks:** NumPy, Pandas, Scikit-Learn, Matplotlib, Pytorch\n**Machine Learning:** Data Analysis, Classification Modeling, Deep Neural Networks, Regression Modelling, MLOPs, Computer Vision, Natural Language Processing, Recommendation Systems\n\n## About Me\nA Data Analyst with about two years of professional experience specialized in transforming complex datasets into strategic business solutions. I've consistently delivered actionable insights that have driven key business decisions. Combining strong analytical s

In [15]:
import re

def structure_cv_text(text):
    """
    Parse and structure CV text into a dictionary with common fields.
    This is a simple example and may need to be adapted for your CV format.
    """
    structured = {
        'name': None,
        'email': None,
        'phone': None,
        'education': [],
        'experience': [],
        'skills': [],
        'summary': None
    }

    # Extract email
    email_match = re.search(r'[\w\.-]+@[\w\.-]+', text)
    if email_match:
        structured['email'] = email_match.group(0)

    # Extract phone (simple pattern, may need adjustment)
    phone_match = re.search(r'(\+?\d[\d\s\-]{7,}\d)', text)
    if phone_match:
        structured['phone'] = phone_match.group(0)

    # Extract name (assume first line is name if it looks like a name)
    lines = text.splitlines()
    if lines:
        first_line = lines[0].strip()
        if len(first_line.split()) >= 2 and not any(x in first_line.lower() for x in ['curriculum', 'resume', 'cv']):
            structured['name'] = first_line

    # Extract sections by keywords
    section_patterns = {
        'education': r'(education|academic background|qualifications)',
        'experience': r'(experience|employment|work history|professional experience)',
        'skills': r'(skills|technical skills|competencies)',
        'summary': r'(summary|profile|about me)'
    }

    # Find section indices
    section_indices = {}
    for key, pattern in section_patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            section_indices[key] = match.start()

    # Sort sections by appearance
    sorted_sections = sorted(section_indices.items(), key=lambda x: x[1])

    # Extract section contents
    for i, (section, start_idx) in enumerate(sorted_sections):
        end_idx = sorted_sections[i+1][1] if i+1 < len(sorted_sections) else len(text)
        content = text[start_idx:end_idx].strip()
        if section == 'education':
            structured['education'] = [line.strip() for line in content.split('\n') if line.strip() and not re.search(section_patterns['education'], line, re.IGNORECASE)]
        elif section == 'experience':
            structured['experience'] = [line.strip() for line in content.split('\n') if line.strip() and not re.search(section_patterns['experience'], line, re.IGNORECASE)]
        elif section == 'skills':
            skills_lines = [line.strip() for line in content.split('\n') if line.strip() and not re.search(section_patterns['skills'], line, re.IGNORECASE)]
            # Try to split comma-separated skills
            if skills_lines:
                structured['skills'] = re.split(r',|;|\n', ' '.join(skills_lines))
                structured['skills'] = [s.strip() for s in structured['skills'] if s.strip()]
        elif section == 'summary':
            summary_lines = [line.strip() for line in content.split('\n') if line.strip() and not re.search(section_patterns['summary'], line, re.IGNORECASE)]
            structured['summary'] = ' '.join(summary_lines)

    return structured

# Example usage: structure the text extracted with docstrange
structured_cv = structure_cv_text(text_content)
import json
print(json.dumps(structured_cv, indent=2))


{
  "name": null,
  "email": "aidooenochkwadwo@gmail.com",
  "phone": "0240542834",
  "education": [
    "**Bachelor of Science in Computer Science.**",
    "Included one year course on Management Studies",
    "KNUST, Ghana",
    "2023",
    "## Courses and Certifications",
    "* Coursera Crash Course on Python.",
    "Oct 2021",
    "* AWS Machine Learning Foundation.",
    "Oct 2021",
    "* Introduction to Deep Learning with Pytorch.",
    "Sep 2022",
    "* Machine Learning for Trading on Udacity.",
    "Offered at Georgia Tech as CS 7646.",
    "May 2023",
    "* Machine Learning.",
    "Offered by Stanford University on Coursera.",
    "Sep 2025",
    "* Google Data Analytics Professional Certificate.",
    "Present",
    "* ALX Data Science",
    "May 2023",
    "* AWS Certified Cloud Practitioner",
    "Aug 2025",
    "## Projects",
    "* Conducted advanced Excel analysis on water access datasets, applying data cleaning, formulas, statistical functions, and pivot tables. Des

In [16]:
import json

def structure_cv_to_schema(text):
    """
    Structure the extracted CV text to match the schema in cv_result.txt.
    """
    # You may want to use advanced NLP or LLMs for production, but here is a simple placeholder
    # that returns a dict with all required keys and empty/default values.
    schema = {
        "name": None,
        "email": None,
        "phone": None,
        "summary": None,
        "work_experience": [],
        "education": [],
        "skills": [],
        "soft_skills": [],
        "certifications": [],
        "projects": [],
        "languages": None,
        "hobbies": None,
        "other": None
    }
    # --- Insert your parsing logic here, or use the previous structure_cv_text as a base ---
    # For now, use the previous function and map fields accordingly
    base = structure_cv_text(text)
    schema["name"] = base.get("name")
    schema["email"] = base.get("email")
    schema["phone"] = base.get("phone")
    schema["summary"] = base.get("summary")
    schema["skills"] = base.get("skills")
    # You may want to further parse experience, education, etc.
    # This is a placeholder for demonstration
    return schema

# Structure and save to file
structured_cv = structure_cv_to_schema(text_content)
with open("extracted_file.txt", "w", encoding="utf-8") as f:
    json.dump(structured_cv, f, indent=2)
print("Structured data saved to extracted_file.txt")


Structured data saved to extracted_file.txt
