In [8]:
from mistralai import Mistral
import os

api_key = "L9Z3JFdu5LaSollNKnMAJDytfS6i3o5X"

client = Mistral(api_key=api_key)

uploaded_pdf = client.files.upload(
    file={
        "file_name": "./resumes/RamaCharan_resume.pdf",
        "content": open("./resumes/RamaCharan_resume.pdf", "rb"),
    },
    purpose="ocr"
)  

In [9]:
client.files.retrieve(file_id=uploaded_pdf.id)
signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id)

ocr_response = client.ocr.process(
    model="mistral-ocr-latest",
    document={
        "type": "document_url",
        "document_url": signed_url.url,
    }
)


In [10]:
# Specify model
model = "mistral-small-latest"

# Initialize the Mistral client
client = Mistral(api_key=api_key)

# Define the messages for the chat
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Structure every single detail into JSON response and output a structured JSON response."
            },
            {
                "type": "document_url",
                "document_url": signed_url.url
            }
        ]
    }
]
chat_response = client.chat.complete(
      model = model,
      messages = messages,
      response_format = {
          "type": "json_object",
      }
)

print(chat_response.choices[0].message.content)


[{"name": "Rama Charan Pisupati", "email": "rama.charan.official@gmail.com", "phone": "+91 9908673209", "location": "Guntur, India", "linkedin": "LinkedIn", "github": "Github", "profile": "I'm a focused and driven computer science student with a passion for applied AI and machine learning. I take a methodical approach to solving problems and finding efficient solutions. I enjoy connecting theory to practical applications and constantly look for ways to improve performance. I'm eager to take on challenges and learn from experienced mentors.", "education": [{"degree": "B.Tech, Artificial Intelligence and Machine Learning", "institution": "Vasireddy Venkatadri Institute of Technology", "cgpa": "8.46/10", "relevant_coursework": ["Deep Learning", "Operating Systems", "Data Structures", "DBMS", "Advanced Python Programming"]}], "projects": [{"title": "Speech Synthesis using Deep Learning, Telugu Text-to-Speech System", "description": "Developed a production-ready Telugu Text-to-Speech system

In [11]:
# For multiple pages, you could do:
for page in ocr_response.pages:
    print(f"--- Page {page.index} ---")
    print(page.markdown)
    print("\n")

--- Page 0 ---
# Rama Charan Pisupati 

Aspiring AI/ML Engineer
rama.charan.official@gmail.com | +91 9908673209 | Guntur, India | LinkedIn | Github

## PROFILE

I'm a focused and driven computer science student with a passion for applied AI and machine learning. I take a methodical approach to solving problems and finding efficient solutions. I enjoy connecting theory to practical applications and constantly look for ways to improve performance. I'm eager to take on challenges and learn from experienced mentors.

## EDUCATION

B.Tech, Artificial Intelligence and Machine Learning,

Vasireddy Venkatadri Institute of Technology

- CGPA: 8.46/10
- Relevant Coursework: Deep Learning, Operating Systems, Data Structures, DBMS, Advaned Python Programming


## PROJECTS

Speech Synthesis using Deep Learning, Telugu Text-to-Speech System $\square$ - Developed a production-ready Telugu Text-to-Speech system processing 1000+ unique characters, achieving $40 \%$ improved speech naturalness (MOS 4.2/

In [None]:
# Mistral OCR AI Implementation Analysis

## Overview
This notebook demonstrates the implementation and usage of Mistral's OCR (Optical Character Recognition) AI capabilities for processing PDF documents, specifically resumes. The implementation follows a multi-step approach to extract structured data from documents.

## Implementation Breakdown

### 1. **Client Initialization**
```python
from mistralai import Mistral
client = Mistral(api_key=api_key)
```
- Uses the new Mistral SDK (v1.0+)
- Requires valid API key for authentication
- Creates a client instance for all subsequent operations

### 2. **File Upload Process**
```python
uploaded_pdf = client.files.upload(
    file={
        "file_name": "./resumes/RamaCharan_resume.pdf",
        "content": open("./resumes/RamaCharan_resume.pdf", "rb"),
    },
    purpose="ocr"
)
```
**Key Points:**
- Uploads PDF to Mistral's servers for processing
- Purpose is set to "ocr" indicating OCR processing intent
- Returns file object with unique ID for subsequent operations
- File content is read in binary mode ("rb")

### 3. **Signed URL Generation**
```python
signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id)
```
**Purpose:**
- Creates a secure, temporary URL for accessing the uploaded file
- Required for OCR processing and chat completion
- Provides secure access without exposing API keys

### 4. **OCR Processing**
```python
ocr_response = client.ocr.process(
    model="mistral-ocr-latest",
    document={
        "type": "document_url",
        "document_url": signed_url.url,
    }
)
```
**Features:**
- Uses latest OCR model for best accuracy
- Processes document via URL reference
- Returns structured OCR response with page-level data
- Extracts text in markdown format for better structure preservation

### 5. **Chat-Based Structured Extraction**
```python
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Structure every single detail into JSON response and output a structured JSON response."
            },
            {
                "type": "document_url",
                "document_url": signed_url.url
            }
        ]
    }
]
chat_response = client.chat.complete(
    model="mistral-small-latest",
    messages=messages,
    response_format={"type": "json_object"}
)
```
**Advanced Features:**
- Multimodal input (text + document)
- Direct document URL processing in chat
- Forced JSON output format
- Combines OCR with LLM understanding for intelligent extraction

## Strengths of This Approach

### ✅ **Advantages:**
1. **High Accuracy**: Mistral's OCR model is specifically trained for document understanding
2. **Structured Output**: Direct JSON formatting reduces post-processing needs
3. **Multimodal Processing**: Combines visual document understanding with text analysis
4. **Page-Level Access**: Can process multi-page documents with page-specific extraction
5. **Secure Processing**: Uses signed URLs for secure document handling
6. **Modern SDK**: Uses latest Mistral SDK with up-to-date features

### ✅ **Use Cases:**
- Resume parsing and ATS integration
- Document digitization
- Form data extraction
- Invoice/receipt processing
- Legal document analysis

## Limitations and Considerations

### ⚠️ **Limitations:**
1. **API Dependency**: Requires internet connection and API availability
2. **Cost**: Each OCR and chat completion call incurs API costs
3. **File Upload**: Documents must be uploaded to Mistral servers
4. **Rate Limits**: Subject to API rate limiting
5. **File Size**: May have limits on document size and page count

### ⚠️ **Security Considerations:**
- Documents are uploaded to external servers
- Ensure compliance with data privacy regulations
- Consider data retention policies
- Use appropriate access controls for sensitive documents

## Performance Optimization Tips

### 🚀 **Best Practices:**
1. **Batch Processing**: Process multiple documents in batches when possible
2. **Caching**: Cache OCR results for repeated processing
3. **Error Handling**: Implement robust error handling for API failures
4. **Retry Logic**: Add exponential backoff for transient failures
5. **Input Validation**: Validate file types and sizes before upload

## Integration with ATS Backend

The implementation in this notebook directly addresses the resume parsing challenges identified in the ATS project:

1. **Replaces Deprecated Methods**: Uses new SDK instead of deprecated file upload methods
2. **Improved Accuracy**: OCR + LLM provides better extraction than text-only parsing
3. **Structured Data**: Direct JSON output fits ATS database schema requirements
4. **Scalable Architecture**: Can be integrated into the Flask backend for production use

## Recommended Next Steps

1. **Error Handling**: Add comprehensive error handling for production use
2. **Data Validation**: Implement validation for extracted JSON data
3. **Backend Integration**: Integrate this approach into the Flask resume parser service
4. **Performance Monitoring**: Add logging and monitoring for OCR operations
5. **Cost Optimization**: Implement caching and rate limiting strategies

In [None]:
# Comparison: OCR vs Current Backend Approach

print("=== COMPARISON: Mistral OCR vs Current Backend ===\n")

# Current backend approach (PyPDF2 + python-docx)
print("🔄 Current Backend Approach:")
print("- Uses PyPDF2 for PDF text extraction")
print("- Uses python-docx for DOCX text extraction")
print("- Sends extracted text to Mistral chat completion")
print("- Limited to text-extractable content only")
print("- May miss formatting, tables, or image-based content\n")

# OCR approach demonstrated here
print("🚀 Mistral OCR Approach:")
print("- Processes visual content of documents")
print("- Handles scanned PDFs and image-based content")
print("- Preserves document structure and formatting")
print("- Direct document understanding without intermediate text extraction")
print("- More accurate for complex layouts\n")

print("💡 Key Differences:")
print("1. OCR can handle scanned documents and images")
print("2. OCR preserves visual layout and structure")
print("3. OCR is more accurate for complex formatting")
print("4. OCR incurs additional API costs")
print("5. OCR requires document upload to Mistral servers\n")

print("🎯 Recommendation:")
print("- Use OCR for scanned documents or complex layouts")
print("- Use text extraction for simple, text-based documents")
print("- Implement hybrid approach based on document characteristics")

In [None]:
# Backend Integration Example

class MistralOCRParser:
    """Enhanced resume parser using Mistral OCR for better document understanding"""
    
    def __init__(self, api_key):
        self.client = Mistral(api_key=api_key)
    
    def parse_resume_with_ocr(self, file_path: str) -> dict:
        """Parse resume using OCR for enhanced accuracy"""
        try:
            # Step 1: Upload document
            with open(file_path, 'rb') as file:
                uploaded_file = self.client.files.upload(
                    file={
                        "file_name": file_path.split('/')[-1],
                        "content": file,
                    },
                    purpose="ocr"
                )
            
            # Step 2: Get signed URL
            signed_url = self.client.files.get_signed_url(file_id=uploaded_file.id)
            
            # Step 3: Extract structured data using chat completion
            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": """Extract and structure all resume information into JSON format:
                            {
                                "personal_info": {"name": "", "email": "", "phone": "", "location": ""},
                                "summary": "",
                                "work_experience": [{"company": "", "title": "", "start_date": "", "end_date": "", "description": []}],
                                "education": [{"institution": "", "degree": "", "start_date": "", "end_date": ""}],
                                "skills": {"technical": [], "tools": [], "soft": []},
                                "projects": [{"name": "", "description": "", "technologies": []}],
                                "certifications": [{"name": "", "issuer": "", "date": ""}],
                                "achievements": []
                            }"""
                        },
                        {
                            "type": "document_url", 
                            "document_url": signed_url.url
                        }
                    ]
                }
            ]
            
            response = self.client.chat.complete(
                model="mistral-small-latest",
                messages=messages,
                response_format={"type": "json_object"}
            )
            
            # Parse JSON response
            import json
            structured_data = json.loads(response.choices[0].message.content)
            
            # Cleanup: Delete uploaded file (optional)
            # self.client.files.delete(file_id=uploaded_file.id)
            
            return structured_data
            
        except Exception as e:
            print(f"OCR parsing failed: {str(e)}")
            return None

# Example usage
print("🔧 Backend Integration Pattern:")
print("1. Create MistralOCRParser instance in Flask app")
print("2. Use for complex documents or when text extraction fails")
print("3. Fallback to text-based extraction for simple documents")
print("4. Cache results to minimize API calls")
print("5. Implement error handling and retry logic")

In [12]:
# OCR Quality Analysis

def analyze_ocr_quality(ocr_response, chat_response):
    """Analyze the quality and completeness of OCR extraction"""
    
    print("=== OCR QUALITY ANALYSIS ===\n")
    
    # Analyze OCR response structure
    print("📄 OCR Response Analysis:")
    print(f"- Number of pages processed: {len(ocr_response.pages)}")
    
    total_chars = 0
    for i, page in enumerate(ocr_response.pages):
        page_chars = len(page.markdown)
        total_chars += page_chars
        print(f"- Page {i+1}: {page_chars:,} characters extracted")
    
    print(f"- Total characters extracted: {total_chars:,}")
    print(f"- Average characters per page: {total_chars // len(ocr_response.pages):,}\n")
    
    # Analyze structured output
    print("🧠 Structured Extraction Analysis:")
    try:
        import json
        structured_data = json.loads(chat_response.choices[0].message.content)
        
        # Count populated fields
        def count_fields(data, prefix=""):
            count = 0
            populated = 0
            
            if isinstance(data, dict):
                for key, value in data.items():
                    if isinstance(value, (dict, list)):
                        sub_count, sub_populated = count_fields(value, f"{prefix}.{key}")
                        count += sub_count
                        populated += sub_populated
                    else:
                        count += 1
                        if value and str(value).strip():
                            populated += 1
            elif isinstance(data, list):
                for item in data:
                    if isinstance(item, (dict, list)):
                        sub_count, sub_populated = count_fields(item, prefix)
                        count += sub_count
                        populated += sub_populated
                    else:
                        count += 1
                        if item and str(item).strip():
                            populated += 1
            
            return count, populated
        
        total_fields, populated_fields = count_fields(structured_data)
        completion_rate = (populated_fields / total_fields * 100) if total_fields > 0 else 0
        
        print(f"- Total extractable fields: {total_fields}")
        print(f"- Populated fields: {populated_fields}")
        print(f"- Completion rate: {completion_rate:.1f}%")
        
        # Analyze specific sections
        sections = ['personal_info', 'work_experience', 'education', 'skills', 'projects']
        print("\n📊 Section Analysis:")
        for section in sections:
            if section in structured_data:
                data = structured_data[section]
                if isinstance(data, list):
                    print(f"- {section}: {len(data)} items")
                elif isinstance(data, dict):
                    filled = sum(1 for v in data.values() if v and str(v).strip())
                    print(f"- {section}: {filled}/{len(data)} fields filled")
                else:
                    status = "✓ Present" if data and str(data).strip() else "✗ Missing"
                    print(f"- {section}: {status}")
        
        print(f"\n✨ Overall Quality Score: {completion_rate:.1f}%")
        
        if completion_rate >= 80:
            print("🎉 Excellent extraction quality!")
        elif completion_rate >= 60:
            print("👍 Good extraction quality")
        elif completion_rate >= 40:
            print("⚠️ Moderate extraction quality - may need manual review")
        else:
            print("❌ Poor extraction quality - manual processing recommended")
            
    except Exception as e:
        print(f"❌ Error analyzing structured data: {str(e)}")

# Run the analysis
analyze_ocr_quality(ocr_response, chat_response)

=== OCR QUALITY ANALYSIS ===

📄 OCR Response Analysis:
- Number of pages processed: 1
- Page 1: 3,902 characters extracted
- Total characters extracted: 3,902
- Average characters per page: 3,902

🧠 Structured Extraction Analysis:
- Total extractable fields: 63
- Populated fields: 63
- Completion rate: 100.0%

📊 Section Analysis:

✨ Overall Quality Score: 100.0%
🎉 Excellent extraction quality!
