In [1]:
!pip install -q google-generativeai easyocr pdf2image pytesseract Pillow
!apt-get install -y tesseract-ocr



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m99.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m81.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# **Driving License**

In [7]:
import requests
import easyocr
import os
import json
from google.colab import files

GEMINI_API_KEY = "YOUR_GEMINI_API_KEY"
GEMINI_ENDPOINT = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={GEMINI_API_KEY}"


reader = easyocr.Reader(['en'])

def extract_text_from_image(image_path):
    result = reader.readtext(image_path, detail=0, paragraph=True)
    return " ".join(result)

def build_prompt(doc_type, text):
    return f"""
Extract structured JSON data from the following OCR'd {doc_type} document text.

Text:
{text}

Required Fields (strict JSON format):

Driving License:
- Name
- Date of Birth
- License Number
- Issuing State
- Expiry Date

Return ONLY valid JSON.
"""

def call_gemini(prompt_text):
    headers = {"Content-Type": "application/json"}
    data = {
        "contents": [
            {
                "parts": [{"text": prompt_text}]
            }
        ]
    }

    response = requests.post(GEMINI_ENDPOINT, headers=headers, json=data)
    if response.status_code == 200:
        try:
            return response.json()['candidates'][0]['content']['parts'][0]['text']
        except:
            raise Exception("Could not parse Gemini response.")
    else:
        raise Exception(f"Gemini API error: {response.status_code} - {response.text}")

def process_document(doc_type, file_path):
    text = extract_text_from_image(file_path)
    prompt = build_prompt(doc_type, text)
    raw_output = call_gemini(prompt)

    try:
        return json.loads(raw_output)
    except json.JSONDecodeError:
        print("Gemini returned malformed JSON. Raw output:")
        print(raw_output)
        return {}

uploaded = files.upload()
file_path = list(uploaded.keys())[0]
doc_type = "Driving License"

result = process_document(doc_type, file_path)
print(json.dumps(result, indent=4))





Saving generated_license_1.png to generated_license_1.png
Gemini returned malformed JSON. Raw output:
```json
{
  "Driving License": {
    "Name": "AJCC Anderson",
    "Date of Birth": "19.02.93",
    "License Number": "619406434",
    "Issuing State": "IRELAND",
    "Expiry Date": "23.02.05"
  }
}
```
{}
Saved structured data to: generated_license_1_output.json


# **Shop Receipt**

In [10]:
import pytesseract
from PIL import Image
import requests
import json
import os
import re


GEMINI_API_KEY = "YOUR_GEMINI_API_KEY"
GEMINI_ENDPOINT = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={GEMINI_API_KEY}"
HEADERS = {"Content-Type": "application/json"}

def extract_text_from_image(image_path):
    image = Image.open(image_path).convert("L")
    text = pytesseract.image_to_string(image)
    return text.strip()


def preprocess_ocr_text(text):

    lines = text.split("\n")
    new_lines = []
    i = 0
    while i < len(lines):
        if i + 1 < len(lines) and re.match(r'^\$?\d+(\.\d{2})?$', lines[i+1].strip()):
            new_lines.append(lines[i] + " " + lines[i+1])
            i += 2
        else:
            new_lines.append(lines[i])
            i += 1
    return "\n".join(new_lines)


def build_gemini_parser_prompt(text):
    return f"""
You are an intelligent document parser designed for HR-tech systems that must operate in noisy, real-world environments.

Your task is to extract structured information from unstructured text obtained from OCR or directly from readable PDFs. The document can be of three types: Driving License, Shop Receipt, or Resume.

---

  **If the document is a Shop Receipt**, return:
{{
  "document_type": "Shop Receipt",
  "merchant_name": ...,
  "total_amount": ...,
  "date_of_purchase": ...,
  "payment_method": ...,
  "items": [
    {{
      "name": ...,
      "quantity": ...,
      "price": ...
    }},
    ...
  ]
}}

---

  **Instructions:**
- Only return a JSON object. Do not explain or comment.
- Be precise, concise, and never guess.
- If document type is unclear, attempt best match using content.
- Strip out noisy data or irrelevant content like headers or legal footnotes.

Ready to begin. Here's the document text:
\"\"\"
{text}
\"\"\"
"""



def call_gemini(prompt):
    data = {
        "contents": [{"parts": [{"text": prompt}]}]
    }
    response = requests.post(GEMINI_ENDPOINT, headers=HEADERS, json=data)

    try:
        raw_text = response.json()['candidates'][0]['content']['parts'][0]['text']


        if raw_text.strip().startswith("```"):
            raw_text = re.sub(r"^```(json)?\s*", "", raw_text.strip())
            raw_text = re.sub(r"\s*```$", "", raw_text.strip())

        return json.loads(raw_text)

    except Exception as e:
        print("❌ Gemini JSON parse failed.")
        print("Raw output:\n", response.text)
        return {}



def process_shop_receipt(image_path):
    print(f"🧾 Processing image: {image_path}")
    ocr_text = extract_text_from_image(image_path)

    prompt = build_gemini_parser_prompt(ocr_text)
    result = call_gemini(prompt)

    # Save result
    output_path = os.path.splitext(image_path)[0] + "_output.json"
    with open(output_path, "w") as f:
        json.dump(result, f, indent=4)

    print("\n✅ Final Extracted JSON:")
    print(json.dumps(result, indent=4))
    print(f"\n📁 Saved to: {output_path}")
    return result



uploaded = files.upload()
img = list(uploaded.keys())[0]
process_shop_receipt(img)


Saving 2.jpg to 2.jpg
🧾 Processing image: 2.jpg

✅ Final Extracted JSON:
{
    "document_type": "Shop Receipt",
    "merchant_name": "Walmart",
    "total_amount": "49.90",
    "date_of_purchase": "10/18/20",
    "payment_method": "DEBIT",
    "items": [
        {
            "name": "GV OATMEAL",
            "quantity": null,
            "price": "1.76"
        },
        {
            "name": "OT 2002 TUM",
            "quantity": null,
            "price": "6.74"
        },
        {
            "name": "M ATHLETICS",
            "quantity": null,
            "price": "24.97"
        },
        {
            "name": "DEXAS 15X20",
            "quantity": null,
            "price": "12.97"
        }
    ]
}

📁 Saved to: 2_output.json


{'document_type': 'Shop Receipt',
 'merchant_name': 'Walmart',
 'total_amount': '49.90',
 'date_of_purchase': '10/18/20',
 'payment_method': 'DEBIT',
 'items': [{'name': 'GV OATMEAL', 'quantity': None, 'price': '1.76'},
  {'name': 'OT 2002 TUM', 'quantity': None, 'price': '6.74'},
  {'name': 'M ATHLETICS', 'quantity': None, 'price': '24.97'},
  {'name': 'DEXAS 15X20', 'quantity': None, 'price': '12.97'}]}

# **Resume Extract**

In [2]:
import easyocr
import requests
import json
from google.colab import files
import pprint


uploaded = files.upload()
image_path = list(uploaded.keys())[0]


reader = easyocr.Reader(['en'], gpu=False)
ocr_results = reader.readtext(image_path, detail=0, paragraph=True)
ocr_text = "\n".join(ocr_results)


prompt = f"""
You are an intelligent document parser designed for HR-tech systems that must operate in noisy, real-world environments.

Your task is to extract structured information from unstructured text obtained from OCR or readable PDFs.

Return this format for resumes:
{{
  "document_type": "Resume",
  "full_name": "...",
  "email": "...",
  "phone_number": "...",
  "skills": [...],
  "work_experience": [
    {{
      "company": "...",
      "role": "...",
      "dates": "..."
    }},
    ...
  ],
  "education": [
    {{
      "institution": "...",
      "degree": "...",
      "graduation_year": "..."
    }},
    ...
  ]
}}

If any field is missing, return null or empty list. Only return valid JSON. No explanation.

Document content:
\"\"\"
{ocr_text}
\"\"\"
"""

GEMINI_API_KEY = "YOUR_GEMINI_API_KEY"
url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={GEMINI_API_KEY}"
headers = {"Content-Type": "application/json"}
data = {"contents": [{"parts": [{"text": prompt}]}]}

response = requests.post(url, headers=headers, data=json.dumps(data))
gemini_output = response.json()


try:
    text_output = gemini_output["candidates"][0]["content"]["parts"][0]["text"]
    json_text = text_output.split("```json")[1].split("```")[0].strip() if "```json" in text_output else text_output.strip()
    parsed_json = json.loads(json_text)
except Exception as e:
    parsed_json = {"error": str(e), "raw_output": gemini_output}

pprint.pprint(parsed_json)




Saving 60037207_60037209.jpg to 60037207_60037209.jpg
Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |██████████████████████████████████████████████████| 100.0% Complete{'document_type': 'Resume',
 'education': [{'degree': 'B.S_ (Ma thematics )',
                'graduation_year': '1971',
                'institution': 'Michigan State University'},
               {'degree': 'M.s. (Ma thematics)',
                'graduation_year': '1972',
                'institution': 'Michigan State University'},
               {'degree': 'Ph.D. (Biophys ics)',
                'graduation_year': '1976',
                'institution': 'Michigan State University'},
               {'degree': 'M.D.',
                'graduation_year': '1983',
                'institution': 'University 0f Miami'}],
 'email': None,
 'full_name': 'Gary A Clawson',
 'phone_number': None,
 'skills': [],
 'work_experience': [{'company': 'Michigan State University',
                      'dates': '10/76 3/77',
                      'role': 'Research Associate'},
                     {'company': 'University of Califor