feat: support ocr (base on [PaddleOCR-API](https://github.com/cgcel…

…/PaddleOCRFastAPI))
Deeptrain-Community · Apr 24, 2024 · 5779cbf · 5779cbf
1 parent 8e10891
commit 5779cbf
Show file tree

Hide file tree

Showing 4 changed files with 60 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -63,14 +63,25 @@ Response
 ```
 
 ## Environment Variables
-- `MAX_FILE_SIZE`: Max File Size MiB (Default: No Limit)
-  - Tips: Size limit is also depend on the server configuration (e.g. Nginx/Apache Config, Vercel Free Plan Limit **5MB** Body Size) 
+### General Config
+- `MAX_FILE_SIZE`: Max Uploaded File Size MiB (Default: No Limit)
+  - *Tip: The size limit also depends on the server configuration (e.g. Nginx/Apache config, or the Vercel Free Plan's **5MB** body size limit)*
 - `CORS_ALLOW_ORIGINS`: CORS Allow Origins (Default: `*`)
   - e.g.: *http://localhost:3000,https://example.com*
 - `AZURE_SPEECH_KEY`: Azure Speech to Text Service Key (Required for Audio Support)
 - `AZURE_SPEECH_REGION`: Azure Speech to Text Service Region (Required for Audio Support)
 
-## Image Storage Config
+### OCR Config
+
+OCR support is based on [PaddleOCR API](https://github.com/cgcel/PaddleOCRFastAPI); deploy that API to use the OCR feature.
+
+When OCR is enabled, the service automatically extracts the text from each image and **skips the image storage solutions** described below.
+
+- `OCR_ENABLED`: Enable image OCR (`1` for **Enabled**, `0` for **Disabled**; Default: **Disabled**)
+- `OCR_ENDPOINT`: PaddleOCR API Endpoint ([Deploy PaddleOCR API](https://github.com/cgcel/PaddleOCRFastAPI))
+    - e.g.: *http://example.com:8000*
+
+### Image Storage Config
 1. ✨ No Storage (Default)
    - [x] **No Storage Required & No External Dependencies**
    - [x] Base64 Encoding/Decoding

diff --git a/config.py b/config.py
@@ -4,6 +4,8 @@
 if environ.get("CORS_ALLOW_ORIGINS") and len(environ.get("CORS_ALLOW_ORIGINS")) > 0:
     CORS_ALLOW_ORIGINS = environ.get("CORS_ALLOW_ORIGINS").split(",")
 
+PDF_MAX_IMAGES = int(environ.get("PDF_MAX_IMAGES", 0))  # The maximum number of images to extract from a PDF file
+
 AZURE_SPEECH_KEY = environ.get("AZURE_SPEECH_KEY")  # Azure Speech Key
 AZURE_SPEECH_REGION = environ.get("AZURE_SPEECH_REGION")  # e.g. "eastus"
 ENABLE_AZURE_SPEECH = AZURE_SPEECH_KEY and AZURE_SPEECH_REGION
@@ -28,3 +30,6 @@
 TG_PASSWORD = environ.get("TG_PASSWORD", "")  # Telegram Password
 
 TG_API = TG_ENDPOINT + "/api" + (f"?pass={TG_PASSWORD}" if TG_PASSWORD and len(TG_PASSWORD) > 0 else "")
+
+OCR_ENDPOINT = environ.get("OCR_ENDPOINT", "").rstrip("/")  # OCR API base URL; trailing "/" stripped so request paths can be appended
+OCR_ENABLED = int(environ.get("OCR_ENABLED", 0)) == 1  # True only when the value parses to integer 1 (unset -> False; non-integer text raises ValueError)
diff --git a/handlers/image.py b/handlers/image.py
@@ -1,4 +1,7 @@
 from fastapi import UploadFile
+
+from config import OCR_ENABLED
+from handlers.ocr import ocr_image
 from store.store import process_image
 
 COMMON_IMAGE_EXTENSIONS = {
@@ -18,5 +21,7 @@ def is_image(filename: str) -> bool:
 
 async def process(file: UploadFile) -> str:
     """Process image."""
+    if OCR_ENABLED:  # OCR takes over: return the extracted text and skip image storage
+        return ocr_image(file)  # NOTE(review): ocr_image is a synchronous HTTP call made from an async function
 
     return await process_image(file)
diff --git a/handlers/ocr.py b/handlers/ocr.py
@@ -0,0 +1,36 @@
+from fastapi import UploadFile, File
+import requests
+from config import OCR_ENDPOINT
+import time
+
+
+def get_ocr_source(data: object) -> list:
+    """Collect every string in an arbitrarily nested list, in order; other values are dropped."""
+    if isinstance(data, str):
+        return [data]
+    if isinstance(data, list):
+        # Flatten recursively in one pass instead of the quadratic sum(lists, []).
+        return [text for item in data for text in get_ocr_source(item)]
+    return []
+
+
+def ocr_image(file: UploadFile = File(...)) -> str:
+    """POST the upload to the OCR API; return the strings in its `data` field joined by spaces."""
+    start = time.time()
+    response = requests.post(
+        OCR_ENDPOINT + "/ocr/predict-by-file",
+        files={"file": (file.filename, file.file, file.content_type)},
+    )
+    response.raise_for_status()  # HTTP-level failure -> requests.HTTPError
+    data = response.json()
+
+    code = data.get("resultcode", -1)
+    message = data.get("message", "")
+    result = data.get("data", [])
+
+    # The API reports its own status in the payload, separately from the HTTP status.
+    if code != 200:
+        raise ValueError(f"OCR API error: {message} (code: {code})")
+
+    print(f"[ocr] time taken: {time.time() - start:.2f}s (file: {file.filename})")
+    return " ".join(get_ocr_source(result))