102 changes: 81 additions & 21 deletions api/endpoints/upload.py
@@ -2,56 +2,116 @@
Upload endpoint for document ingestion.
"""

from fastapi import APIRouter, UploadFile, File, HTTPException
import logging
from fastapi import APIRouter, UploadFile, File, HTTPException, Request, status
import os
import pathlib
from concurrent.futures import ThreadPoolExecutor
from typing import Optional

from services.file_service import save_upload_file
from services.parsing_service import extract_text_from_pdf
from models.schemas import UploadResponse
from services.exceptions import DocumentSaveError, DocumentParseError, DocumentChunkError, DocumentEmbeddingError

from services.logging_config import get_logger, set_log_context, clear_log_context

logger = logging.getLogger(__name__)
logger = get_logger(__name__)
router = APIRouter()

# Configuration: allowed types and size (bytes)
ALLOWED_CONTENT_TYPES = {"application/pdf", "text/plain", "text/markdown"}
MAX_UPLOAD_BYTES = int(os.getenv("MAX_UPLOAD_BYTES", str(25 * 1024 * 1024))) # default 25 MB


def _secure_filename(name: str) -> str:
    # Simple sanitization: take only base name and strip suspicious characters
    base = pathlib.Path(name).name
    # remove path separators and control chars
    return "".join(c for c in base if c.isprintable())


@router.post("/upload", response_model=UploadResponse)
def upload_document(file: UploadFile = File(...)): # noqa: B008
async def upload_document(request: Request, file: UploadFile = File(...)): # noqa: B008
    """Upload and do a light parse to provide a preview.
    This endpoint is async but offloads blocking file IO to a threadpool.
    """
    # Per-request logging context
    request_id = request.headers.get("X-Request-Id") or None
    if request_id:
        set_log_context(request_id=request_id)
    try:
        saved_path = save_upload_file(file.file, file.filename)
        # Basic content-type and size checks
        content_type = (file.content_type or "").lower()
        if content_type not in ALLOWED_CONTENT_TYPES:
            logger.warning("Rejected upload due to content-type", extra={"content_type": content_type})
            raise HTTPException(status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE, detail="Unsupported file type")

        # If client provided Content-Length header, check early
        content_length = request.headers.get("content-length")
        if content_length:
            try:
                if int(content_length) > MAX_UPLOAD_BYTES:
                    logger.warning("Rejected upload due to size header too large", extra={"size": content_length})
                    raise HTTPException(status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE, detail="File too large")
            except ValueError:
                # ignore invalid header and continue with streaming checks
                pass

Comment on lines +50 to +60
⚠️ Potential issue | 🔴 Critical

Block oversized uploads during streaming; avoid saving then rejecting

Header-based checks aren’t enough: a client can omit or misstate Content-Length and force arbitrarily large files to be written to disk. Enforce the size limit during the write, and delete the file if a post-save check fails. Also, avoid creating a per-request ThreadPoolExecutor; use asyncio.to_thread instead.

Apply this diff:

@@
-        # Offload blocking save to threadpool
-        loop = __import__("asyncio").get_running_loop()
-        with ThreadPoolExecutor(max_workers=1) as ex:
-            saved_path = await loop.run_in_executor(ex, save_upload_file, file.file, filename)
+        # Offload blocking save, enforcing size during save
+        saved_path = await asyncio.to_thread(save_upload_file, file.file, filename, MAX_UPLOAD_BYTES)
@@
-        # Quick size check after save
+        # Quick size check after save (defense-in-depth)
         try:
             size = os.path.getsize(saved_path)
             if size > MAX_UPLOAD_BYTES:
-                logger.warning("Saved file exceeds max size", extra={"size": size, "path": saved_path})
-                raise HTTPException(status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE, detail="File too large")
-        except OSError:
-            logger.exception("Failed to stat saved file", extra={"path": saved_path})
+                logger.warning("Saved file exceeds max size", extra={"filename": os.path.basename(saved_path), "size": size})
+                with contextlib.suppress(Exception):
+                    os.remove(saved_path)
+                raise HTTPException(status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE, detail="File too large")
+        except OSError as e:
+            logger.exception("Failed to stat saved file", extra={"filename": os.path.basename(saved_path)})
+            with contextlib.suppress(Exception):
+                os.remove(saved_path)
+            raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Failed to validate saved file") from e

And update the saver to enforce the limit while writing (outside this file):

diff --git a/services/file_service.py b/services/file_service.py
@@
-def save_upload_file(upload_file: IO, filename: str) -> str:
+def save_upload_file(upload_file: IO, filename: str, max_bytes: int | None = None) -> str:
@@
-        with tempfile.NamedTemporaryFile(dir=UPLOAD_DIR, delete=False) as tmp:
-            for chunk in iter(lambda: upload_file.read(8192), b""):
-                tmp.write(chunk)
-            temp_path = tmp.name
+        with tempfile.NamedTemporaryFile(dir=UPLOAD_DIR, delete=False) as tmp:
+            total = 0
+            for chunk in iter(lambda: upload_file.read(8192), b""):
+                tmp.write(chunk)
+                total += len(chunk)
+                if max_bytes is not None and total > max_bytes:
+                    temp_path = tmp.name
+                    raise DocumentSaveError("File exceeds allowed size.")
+            temp_path = tmp.name
@@
-    except OSError as e:
+    except OSError as e:
         logger.exception("OSError during file save: %s", filename)
         with contextlib.suppress(Exception):
             if 'temp_path' in locals() and os.path.exists(temp_path):
                 os.remove(temp_path)
         raise DocumentSaveError("Failed to save file securely.") from e
+    except DocumentSaveError as e:
+        logger.warning("Upload aborted due to size limit")
+        with contextlib.suppress(Exception):
+            if 'temp_path' in locals() and os.path.exists(temp_path):
+                os.remove(temp_path)
+        raise

Also applies to: 64-68, 69-77

🤖 Prompt for AI Agents
In api/endpoints/upload.py around lines 50-60 (and similarly 64-68 and 69-77), the
current logic only checks the Content-Length header and may allow oversized uploads
to be written to disk if the header is missing or incorrect. Update the upload
flow to enforce MAX_UPLOAD_BYTES during the streaming/save operation (move the
size-check logic into the saver so it raises once the cumulative bytes exceed
the limit), ensure the partially written file is deleted if any post-save size
check fails, and replace any per-request ThreadPoolExecutor usage with
asyncio.to_thread for disk I/O offloading. Adjust the calling code here to use
the updated saver that raises on overflow, and handle that exception by
returning HTTP 413 and cleaning up the file.
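
For context, a minimal self-contained sketch of what the suggested saver and caller could look like together; save_upload_file, DocumentSaveError, and UPLOAD_DIR echo names from this PR, but their exact signatures and locations here are assumptions, and handle_upload is a hypothetical caller:

# Sketch of the reviewer's suggestion: enforce the byte limit while streaming
# to disk, remove the partial file on overflow, and offload the blocking write
# with asyncio.to_thread instead of a per-request ThreadPoolExecutor.
# Assumptions: UPLOAD_DIR location, save_upload_file signature, and the
# DocumentSaveError class are stand-ins for this PR's services.
import asyncio
import contextlib
import os
import tempfile
from typing import IO

UPLOAD_DIR = "/tmp/uploads"          # assumed location
MAX_UPLOAD_BYTES = 25 * 1024 * 1024  # 25 MB, matching the endpoint default


class DocumentSaveError(Exception):
    """Raised when a file cannot be saved (stand-in for the PR's exception)."""


def save_upload_file(upload_file: IO[bytes], filename: str, max_bytes: int | None = None) -> str:
    """Stream the upload to a temp file, aborting once max_bytes is exceeded.

    Moving the temp file to its final name derived from `filename` is omitted here.
    """
    os.makedirs(UPLOAD_DIR, exist_ok=True)
    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(dir=UPLOAD_DIR, delete=False) as tmp:
            temp_path = tmp.name
            total = 0
            for chunk in iter(lambda: upload_file.read(8192), b""):
                total += len(chunk)
                if max_bytes is not None and total > max_bytes:
                    raise DocumentSaveError("File exceeds allowed size.")
                tmp.write(chunk)
        return temp_path
    except (OSError, DocumentSaveError):
        # Remove the partial file so oversized or failed uploads never linger on disk.
        with contextlib.suppress(OSError):
            if temp_path and os.path.exists(temp_path):
                os.remove(temp_path)
        raise


async def handle_upload(upload_file: IO[bytes], filename: str) -> str:
    # asyncio.to_thread reuses the default executor rather than creating one per request.
    return await asyncio.to_thread(save_upload_file, upload_file, filename, MAX_UPLOAD_BYTES)

Note that the suggested endpoint diff above also relies on asyncio and contextlib being imported at module level, which the sketch makes explicit.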

        # Sanitize filename
        filename = _secure_filename(file.filename or "upload")

        # Offload blocking save to threadpool
        loop = __import__("asyncio").get_running_loop()
        with ThreadPoolExecutor(max_workers=1) as ex:
            saved_path = await loop.run_in_executor(ex, save_upload_file, file.file, filename)

        # Quick size check after save
        try:
            size = os.path.getsize(saved_path)
            if size > MAX_UPLOAD_BYTES:
                logger.warning("Saved file exceeds max size", extra={"size": size, "path": saved_path})
                raise HTTPException(status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE, detail="File too large")
        except OSError:
            logger.exception("Failed to stat saved file", extra={"path": saved_path})

        parsing_status = "success"
        text_preview = None
        if file.content_type == "application/pdf":
        text_preview: Optional[str] = None

        if content_type == "application/pdf":
            try:
                text = extract_text_from_pdf(saved_path)
                text_preview = text[:500] if text else None
            except DocumentParseError:
                logger.error("Document parse error")
                logger.error("Document parse error", extra={"path": saved_path})

⚠️ Potential issue | 🟠 Major

Use logger.exception in except blocks and chain exceptions

Align with Ruff TRY400 and B904; also avoid logging full paths elsewhere.

Apply this diff:

-                logger.error("Document parse error", extra={"path": saved_path})
+                logger.exception("Document parse error", extra={"filename": os.path.basename(saved_path)})
@@
-                logger.error("Unicode decode error while reading file for preview", extra={"path": saved_path})
+                logger.exception("Unicode decode error while reading file for preview", extra={"filename": os.path.basename(saved_path)})
@@
-                logger.error("OS error while reading file for preview", extra={"path": saved_path})
+                logger.exception("OS error while reading file for preview", extra={"filename": os.path.basename(saved_path)})
@@
-        logger.error("Document save error", extra={"error": str(dse)})
+        logger.exception("Document save error", extra={"error": str(dse)})
         raise HTTPException(status_code=400, detail="Failed to save uploaded document") from dse
@@
-        logger.error("Document processing error", extra={"error": str(de)})
-        raise HTTPException(status_code=422, detail="Error processing document")
+        logger.exception("Document processing error", extra={"error": str(de)})
+        raise HTTPException(status_code=422, detail="Error processing document") from de

Also applies to: 98, 101, 110, 113-114

🧰 Tools
🪛 Ruff (0.13.1)

86-86: Use logging.exception instead of logging.error

Replace with exception

(TRY400)

🤖 Prompt for AI Agents
In api/endpoints/upload.py around lines 86, 98, 101, 110 and 113-114, replace
logger.error(...) calls inside except blocks with logger.exception(...) so the
stack trace is included, avoid logging full filesystem paths by logging only the
filename or a redacted path (e.g., os.path.basename(saved_path) or
"<redacted>"), and when re-raising exceptions ensure you use exception chaining
(raise NewError(...) from e) rather than raising without the original exception;
do this for each listed location.
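
For illustration, a small self-contained sketch of the pattern being requested (parse_document and ProcessingError are hypothetical names, not code from this PR):

# Illustration of the requested pattern: logger.exception inside except blocks,
# filename-only logging, and explicit exception chaining.
import logging
import os

logger = logging.getLogger(__name__)


class ProcessingError(Exception):
    pass


def parse_document(path: str) -> str:
    try:
        with open(path, "rb") as f:
            return f.read(500).decode("utf-8", errors="replace")
    except OSError as e:
        # logger.exception records the traceback (unlike logger.error), and
        # logging only the basename avoids leaking full filesystem paths.
        logger.exception("Failed to read document", extra={"filename": os.path.basename(path)})
        # "raise ... from e" preserves the original cause for debugging (B904).
        raise ProcessingError("Could not read document") from e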

                parsing_status = "failed"
            except Exception:
                logger.exception("Error parsing PDF file for preview: %s", saved_path)
                logger.exception("Error parsing PDF file for preview", extra={"path": saved_path})
                parsing_status = "failed"
        elif file.content_type in {"text/plain", "text/markdown"}:
        elif content_type in {"text/plain", "text/markdown"}:
            try:
                with open(saved_path, "r", encoding="utf-8", errors="replace") as f:
                    text = f.read(500)
                text_preview = text if text else None
            except UnicodeDecodeError:
                parsing_status = "failed"
                logger.error("Unicode decode error while reading file for preview: %s", saved_path)
                logger.error("Unicode decode error while reading file for preview", extra={"path": saved_path})
            except OSError:
                parsing_status = "failed"
                logger.error("OS error while reading file for preview: %s", saved_path)
                logger.error("OS error while reading file for preview", extra={"path": saved_path})

        return UploadResponse(
            filename=file.filename,
            filename=filename,
            message="File uploaded and parsed.",
            parsing_status=parsing_status,
            text_preview=text_preview
            text_preview=text_preview,
        )
    except DocumentSaveError as dse:
        logger.error("Document save error: %s", dse)
        raise HTTPException(status_code=400, detail=str(dse)) from dse
        logger.error("Document save error", extra={"error": str(dse)})
        raise HTTPException(status_code=400, detail="Failed to save uploaded document") from dse
    except (DocumentParseError, DocumentChunkError, DocumentEmbeddingError) as de:
        logger.error("Document processing error: %s", de)
        raise HTTPException(status_code=422, detail=str(de))
    except Exception:
        logger.exception("Unhandled error in upload_document")
        raise HTTPException(status_code=500, detail="Internal server error")
        logger.error("Document processing error", extra={"error": str(de)})
        raise HTTPException(status_code=422, detail="Error processing document")
    finally:
        # Clear per-request logging context
        clear_log_context()
Comment on lines +115 to +117

⚠️ Potential issue | 🟠 Major

Close the uploaded file in finally to avoid resource leaks

Starlette’s UploadFile exposes an async close(). Ensure cleanup.

Apply this diff:

     finally:
-        # Clear per-request logging context
-        clear_log_context()
+        with contextlib.suppress(Exception):
+            await file.close()
+        # Clear per-request logging context
+        clear_log_context()
🤖 Prompt for AI Agents
In api/endpoints/upload.py around lines 115 to 117, the finally block only
clears the per-request logging context and does not close the Starlette UploadFile,
risking a resource leak. Modify the finally block to also await the uploaded
file's async close() (e.g., await uploaded_file.close()) so the file is properly
cleaned up before exiting the request handler, wrapping the call in a safe
try/except if needed so cleanup does not mask the original exception.
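
As a standalone illustration of the cleanup pattern (a minimal hypothetical endpoint using standard FastAPI/Starlette APIs, not this PR's handler):

# Sketch: always close the UploadFile in finally, even when an exception is
# raised, without letting the cleanup mask the original error.
import contextlib

from fastapi import FastAPI, File, UploadFile

app = FastAPI()


@app.post("/upload")
async def upload(file: UploadFile = File(...)) -> dict:
    try:
        contents = await file.read(1024)  # do some work with the upload
        return {"filename": file.filename, "preview_bytes": len(contents)}
    finally:
        # UploadFile.close() is async in Starlette; suppressing errors here
        # keeps cleanup from replacing any exception raised above.
        with contextlib.suppress(Exception):
            await file.close()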