Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
558 changes: 558 additions & 0 deletions Process Logic.md

Large diffs are not rendered by default.

90 changes: 86 additions & 4 deletions contextifier/core/document_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,8 +263,8 @@ class DocumentProcessor:
"""

# === Supported File Type Classifications ===
DOCUMENT_TYPES = frozenset(['pdf', 'docx', 'doc', 'pptx', 'ppt', 'hwp', 'hwpx'])
TEXT_TYPES = frozenset(['txt', 'md', 'markdown', 'rtf'])
DOCUMENT_TYPES = frozenset(['pdf', 'docx', 'doc', 'rtf', 'pptx', 'ppt', 'hwp', 'hwpx'])
TEXT_TYPES = frozenset(['txt', 'md', 'markdown'])
CODE_TYPES = frozenset([
'py', 'js', 'ts', 'java', 'cpp', 'c', 'h', 'cs', 'go', 'rs',
'php', 'rb', 'swift', 'kt', 'scala', 'dart', 'r', 'sql',
Expand All @@ -291,6 +291,8 @@ def __init__(
slide_tag_suffix: Optional[str] = None,
chart_tag_prefix: Optional[str] = None,
chart_tag_suffix: Optional[str] = None,
metadata_tag_prefix: Optional[str] = None,
metadata_tag_suffix: Optional[str] = None,
**kwargs
):
"""
Expand Down Expand Up @@ -328,6 +330,12 @@ def __init__(
chart_tag_suffix: Suffix for chart tags in extracted text
- Default: "[/chart]"
- Example: "</chart>" for XML format
metadata_tag_prefix: Opening tag for metadata section
- Default: "<Document-Metadata>"
- Example: "<metadata>" for custom format
metadata_tag_suffix: Closing tag for metadata section
- Default: "</Document-Metadata>"
- Example: "</metadata>" for custom format
**kwargs: Additional configuration options

Example:
Expand All @@ -342,7 +350,9 @@ def __init__(
... page_tag_prefix="<page>",
... page_tag_suffix="</page>",
... chart_tag_prefix="<chart>",
... chart_tag_suffix="</chart>"
... chart_tag_suffix="</chart>",
... metadata_tag_prefix="<meta>",
... metadata_tag_suffix="</meta>"
... )

>>> # Markdown format
Expand All @@ -359,6 +369,10 @@ def __init__(
self._ocr_engine = ocr_engine
self._kwargs = kwargs
self._supported_extensions: Optional[List[str]] = None

# Store metadata tag settings
self._metadata_tag_prefix = metadata_tag_prefix
self._metadata_tag_suffix = metadata_tag_suffix

# Logger setup
self._logger = logging.getLogger("contextify.processor")
Expand Down Expand Up @@ -389,12 +403,19 @@ def __init__(
chart_tag_prefix=chart_tag_prefix,
chart_tag_suffix=chart_tag_suffix
)

# Create instance-specific MetadataFormatter
self._metadata_formatter = self._create_metadata_formatter(
metadata_tag_prefix=metadata_tag_prefix,
metadata_tag_suffix=metadata_tag_suffix
)

# Add processors to config for handlers to access
if isinstance(self._config, dict):
self._config["image_processor"] = self._image_processor
self._config["page_tag_processor"] = self._page_tag_processor
self._config["chart_processor"] = self._chart_processor
self._config["metadata_formatter"] = self._metadata_formatter

# =========================================================================
# Public Properties
Expand Down Expand Up @@ -484,6 +505,26 @@ def chart_processor(self) -> Any:
"""Current ChartProcessor instance for this DocumentProcessor."""
return self._chart_processor

@property
def metadata_tag_config(self) -> Dict[str, Any]:
    """
    Report the tags currently used by this instance's metadata formatter.

    Both values are read from the MetadataFormatter stored on this
    DocumentProcessor at the time the property is accessed.

    Returns:
        Dictionary containing:
            - metadata_tag_prefix: Opening tag for metadata section
            - metadata_tag_suffix: Closing tag for metadata section
    """
    formatter = self._metadata_formatter
    opening_tag = formatter.metadata_tag_prefix
    closing_tag = formatter.metadata_tag_suffix
    return {
        "metadata_tag_prefix": opening_tag,
        "metadata_tag_suffix": closing_tag,
    }

@property
def metadata_formatter(self) -> Any:
    """Return the MetadataFormatter instance held by this DocumentProcessor."""
    formatter = self._metadata_formatter
    return formatter

@property
def ocr_engine(self) -> Optional[Any]:
"""Current OCR engine instance."""
Expand Down Expand Up @@ -875,6 +916,34 @@ def _create_chart_processor(
tag_suffix=chart_tag_suffix
)

def _create_metadata_formatter(
    self,
    metadata_tag_prefix: Optional[str] = None,
    metadata_tag_suffix: Optional[str] = None
) -> Any:
    """
    Build the MetadataFormatter owned by this DocumentProcessor.

    The resulting formatter is instance-specific and is intended to be
    handed to handlers through the processor config.

    Args:
        metadata_tag_prefix: Opening tag (default: "<Document-Metadata>")
        metadata_tag_suffix: Closing tag (default: "</Document-Metadata>")

    Returns:
        MetadataFormatter instance
    """
    # Imported at call time, inside the method, rather than at module level.
    from contextifier.core.functions.metadata_extractor import MetadataFormatter

    requested_tags = {
        "metadata_tag_prefix": metadata_tag_prefix,
        "metadata_tag_suffix": metadata_tag_suffix,
    }
    # Forward only the tags the caller actually supplied, so that an
    # omitted (None) tag falls back to MetadataFormatter's own default.
    overrides = {
        option: tag
        for option, tag in requested_tags.items()
        if tag is not None
    }
    return MetadataFormatter(**overrides)

def _build_supported_extensions(self) -> List[str]:
"""Build list of supported extensions."""
extensions = list(
Expand Down Expand Up @@ -940,6 +1009,19 @@ def _get_handler_registry(self) -> Dict[str, Callable]:
except ImportError as e:
self._logger.warning(f"DOC handler not available: {e}")

# RTF handler
try:
from contextifier.core.processor.rtf_handler import RTFHandler
rtf_handler = RTFHandler(
config=self._config,
image_processor=self._image_processor,
page_tag_processor=self._page_tag_processor,
chart_processor=self._chart_processor
)
self._handler_registry['rtf'] = rtf_handler.extract_text
except ImportError as e:
self._logger.warning(f"RTF handler not available: {e}")

# PPT/PPTX handler
try:
from contextifier.core.processor.ppt_handler import PPTHandler
Expand Down Expand Up @@ -997,7 +1079,7 @@ def _get_handler_registry(self) -> Dict[str, Callable]:

# HWPX handler
try:
from contextifier.core.processor.hwps_handler import HWPXHandler
from contextifier.core.processor.hwpx_handler import HWPXHandler
hwpx_handler = HWPXHandler(
config=self._config,
image_processor=self._image_processor,
Expand Down
58 changes: 48 additions & 10 deletions contextifier/core/functions/__init__.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
# contextifier/core/functions/__init__.py
"""
Functions - 공통 유틸리티 함수 모듈
Functions - Common Utility Functions Module

문서 처리에 사용되는 공통 유틸리티 함수들을 제공합니다.
Provides common utility functions used in document processing.

모듈 구성:
- utils: 텍스트 정리, 코드 정리, JSON 정리 등 유틸리티 함수
- img_processor: 이미지 처리 및 저장 (ImageProcessor 클래스)
- ppt2pdf: PPT를 PDF로 변환하는 함수
Module Components:
- utils: Text cleaning, code cleaning, JSON sanitization utilities
- img_processor: Image processing and storage (ImageProcessor class)
- storage_backend: Storage backend implementations (Local, MinIO, S3)
- metadata_extractor: Document metadata extraction interface

사용 예시:
Usage Example:
from contextifier.core.functions import clean_text, clean_code_text
from contextifier.core.functions import ImageProcessor, save_image_to_file
from contextifier.core.functions.storage_backend import LocalStorageBackend
from contextifier.core.functions.utils import sanitize_text_for_json
"""

Expand All @@ -21,26 +23,62 @@
sanitize_text_for_json,
)

# 이미지 처리 모듈
# Storage backend module
from contextifier.core.functions.storage_backend import (
StorageType,
BaseStorageBackend,
LocalStorageBackend,
MinIOStorageBackend,
S3StorageBackend,
create_storage_backend,
get_default_backend,
)

# Image processor module
from contextifier.core.functions.img_processor import (
ImageProcessor,
ImageProcessorConfig,
ImageFormat,
NamingStrategy,
save_image_to_file,
create_image_processor,
DEFAULT_IMAGE_CONFIG,
)

# Metadata extraction module
from contextifier.core.functions.metadata_extractor import (
MetadataField,
DocumentMetadata,
MetadataFormatter,
BaseMetadataExtractor,
format_metadata,
)

# Public API of the functions package: the names exported by a star-import.
__all__ = [
# Text utilities
"clean_text",
"clean_code_text",
"sanitize_text_for_json",
# Storage backends
"StorageType",
"BaseStorageBackend",
"LocalStorageBackend",
"MinIOStorageBackend",
"S3StorageBackend",
"create_storage_backend",
"get_default_backend",
# Image processor (base class for all format-specific processors)
"ImageProcessor",
"ImageProcessorConfig",
"ImageFormat",
"NamingStrategy",
"save_image_to_file",
"create_image_processor",
"DEFAULT_IMAGE_CONFIG",
# Metadata extraction
"MetadataField",
"DocumentMetadata",
"MetadataFormatter",
"BaseMetadataExtractor",
"format_metadata",
]
Loading