diff --git a/Process Logic.md b/Process Logic.md new file mode 100644 index 0000000..0c54aa3 --- /dev/null +++ b/Process Logic.md @@ -0,0 +1,558 @@ +# Contextify Processing Flow + +--- + +## Main Flow + +``` +User calls: processor.extract_chunks(file_path) + │ + ▼ + DocumentProcessor.extract_chunks() + │ + ├─► extract_text() + │ │ + │ ├─► _create_current_file(file_path) + │ ├─► _get_handler(extension) + │ ├─► handler.extract_text(current_file) + │ └─► OCR processing (optional) + │ + └─► chunk_text() + │ + └─► create_chunks() +``` + +--- + +## PDF Handler Flow + +``` +PDFHandler.extract_text(current_file) + │ + ├─► file_converter.convert(file_data) [INTERFACE: PDFFileConverter] + │ └─► Binary → fitz.Document + │ + ├─► preprocessor.preprocess(doc) [INTERFACE: PDFPreprocessor] + │ └─► Pass-through (returns PreprocessedData with doc unchanged) + │ + ├─► metadata_extractor.extract() [INTERFACE: PDFMetadataExtractor] + │ + ├─► _extract_all_tables(doc, file_path) [INTERNAL] + │ + └─► For each page: + │ + ├─► ComplexityAnalyzer.analyze() [CLASS: pdf_complexity_analyzer] + │ └─► Returns PageComplexity with recommended_strategy + │ + ├─► Branch by strategy: + │ │ + │ ├─► FULL_PAGE_OCR: + │ │ └─► _process_page_full_ocr() + │ │ + │ ├─► BLOCK_IMAGE_OCR: + │ │ └─► _process_page_block_ocr() + │ │ + │ ├─► HYBRID: + │ │ └─► _process_page_hybrid() + │ │ + │ └─► TEXT_EXTRACTION (default): + │ └─► _process_page_text_extraction() + │ │ + │ ├─► VectorTextOCREngine.detect_and_extract() + │ ├─► extract_text_blocks() [FUNCTION] + │ ├─► format_image_processor methods [INTERFACE: PDFImageProcessor] + │ └─► merge_page_elements() [FUNCTION] + │ + └─► page_tag_processor.create_page_tag() [INTERFACE: PageTagProcessor] +``` + +--- + +## DOCX Handler Flow + +``` +DOCXHandler.extract_text(current_file) + │ + ├─► file_converter.validate(file_data) [INTERFACE: DOCXFileConverter] + │ └─► Check if valid ZIP with [Content_Types].xml + │ + ├─► If not valid DOCX: + │ └─► 
_extract_with_doc_handler_fallback() [INTERNAL] + │ └─► DOCHandler.extract_text() [DELEGATION] + │ + ├─► file_converter.convert(file_data) [INTERFACE: DOCXFileConverter] + │ └─► Binary → docx.Document + │ + ├─► preprocessor.preprocess(doc) [INTERFACE: DOCXPreprocessor] + │ └─► Returns PreprocessedData (doc in extracted_resources) + │ + ├─► chart_extractor.extract_all_from_file() [INTERFACE: DOCXChartExtractor] + │ └─► Pre-extract all charts (callback pattern) + │ + ├─► metadata_extractor.extract() [INTERFACE: DOCXMetadataExtractor] + │ + └─► For each element in doc.element.body: + │ + ├─► If paragraph ('p'): + │ └─► process_paragraph_element() [FUNCTION: docx_helper] + │ ├─► format_image_processor.process_drawing_element() + │ ├─► format_image_processor.extract_from_pict() + │ └─► get_next_chart() callback for charts + │ + └─► If table ('tbl'): + └─► process_table_element() [FUNCTION: docx_helper] +``` + +--- + +## DOC Handler Flow + +``` +DOCHandler.extract_text(current_file) + │ + ├─► file_converter.convert() [INTERFACE: DOCFileConverter] + │ │ + │ ├─► _detect_format() → DocFormat (RTF/OLE/HTML/DOCX) + │ │ + │ ├─► RTF: file_data (bytes) 반환 [Pass-through] + │ ├─► OLE: _convert_ole() → olefile.OleFileIO + │ ├─► HTML: _convert_html() → BeautifulSoup + │ └─► DOCX: _convert_docx() → docx.Document + │ + ├─► preprocessor.preprocess(converted_obj) [INTERFACE: DOCPreprocessor] + │ └─► Returns PreprocessedData (converted_obj in extracted_resources) + │ + ├─► RTF format detected: + │ └─► _delegate_to_rtf_handler() [DELEGATION] + │ └─► RTFHandler.extract_text(current_file) + │ + ├─► OLE format detected: + │ └─► _extract_from_ole_obj() [INTERNAL] + │ ├─► _extract_ole_metadata() + │ ├─► _extract_ole_text() + │ └─► _extract_ole_images() + │ + ├─► HTML format detected: + │ └─► _extract_from_html_obj() [INTERNAL] + │ ├─► _extract_html_metadata() + │ └─► BeautifulSoup parsing + │ + └─► DOCX format detected: + └─► _extract_from_docx_obj() [INTERNAL] + └─► docx.Document 
paragraph/table extraction +``` + +--- + +## RTF Handler Flow + +**구조**: Converter는 pass-through, Preprocessor에서 binary 처리, Handler에서 순차적 처리. + +``` +RTFHandler.extract_text(current_file) + │ + ├─► file_converter.convert() [INTERFACE: RTFFileConverter] + │ └─► Pass-through (returns raw bytes) + │ + ├─► preprocessor.preprocess() [INTERFACE: RTFPreprocessor] + │ │ + │ ├─► \binN tag processing (skip binary data) + │ ├─► \pict group image extraction + │ └─► Returns PreprocessedData (clean_content, image_tags, encoding) + │ + ├─► decode_content() [FUNCTION: rtf_decoder] + │ └─► bytes → string with detected encoding + │ + ├─► Build RTFConvertedData [DATACLASS] + │ + └─► _extract_from_converted() [INTERNAL] + │ + ├─► metadata_extractor.extract() [INTERFACE: RTFMetadataExtractor] + ├─► metadata_extractor.format() + │ + ├─► extract_tables_with_positions() [FUNCTION: rtf_table_extractor] + │ + ├─► extract_inline_content() [FUNCTION: rtf_content_extractor] + │ + └─► Build result string +``` + +--- + +## Excel Handler Flow (XLSX) + +``` +ExcelHandler.extract_text(current_file) [XLSX] + │ + ├─► file_converter.convert(file_data, extension='xlsx') [INTERFACE: ExcelFileConverter] + │ └─► Binary → openpyxl.Workbook + │ + ├─► preprocessor.preprocess(wb) [INTERFACE: ExcelPreprocessor] + │ └─► Returns PreprocessedData (wb in extracted_resources) + │ + ├─► _preload_xlsx_data() [INTERNAL] + │ ├─► metadata_extractor.extract() [INTERFACE: XLSXMetadataExtractor] + │ ├─► chart_extractor.extract_all_from_file() [INTERFACE: ExcelChartExtractor] + │ └─► format_image_processor.extract_images() [INTERFACE: ExcelImageProcessor] + │ + └─► For each sheet: + │ + ├─► _process_xlsx_sheet() [INTERNAL] + │ ├─► page_tag_processor.create_sheet_tag() [INTERFACE: PageTagProcessor] + │ ├─► extract_textboxes_from_xlsx() [FUNCTION] + │ ├─► convert_xlsx_sheet_to_table() [FUNCTION] + │ └─► convert_xlsx_objects_to_tables()[FUNCTION] + │ + └─► format_image_processor.get_sheet_images() [INTERFACE: 
ExcelImageProcessor] +``` + +--- + +## Excel Handler Flow (XLS) + +``` +ExcelHandler.extract_text(current_file) [XLS] + │ + ├─► file_converter.convert(file_data, extension='xls') [INTERFACE: ExcelFileConverter] + │ └─► Binary → xlrd.Book + │ + ├─► preprocessor.preprocess(wb) [INTERFACE: ExcelPreprocessor] + │ └─► Returns PreprocessedData (wb in extracted_resources) + │ + ├─► _get_xls_metadata_extractor().extract_and_format() [INTERFACE: XLSMetadataExtractor] + │ + └─► For each sheet: + │ + ├─► page_tag_processor.create_sheet_tag() [INTERFACE: PageTagProcessor] + │ + ├─► convert_xls_sheet_to_table() [FUNCTION] + │ + └─► convert_xls_objects_to_tables() [FUNCTION] +``` + +--- + +## PPT Handler Flow + +``` +PPTHandler.extract_text(current_file) + │ + ├─► file_converter.convert(file_data, file_stream) [INTERFACE: PPTFileConverter] + │ └─► Binary → pptx.Presentation + │ + ├─► preprocessor.preprocess(prs) [INTERFACE: PPTPreprocessor] + │ └─► Returns PreprocessedData (prs in extracted_resources) + │ + ├─► chart_extractor.extract_all_from_file() [INTERFACE: PPTChartExtractor] + │ └─► Pre-extract all charts (callback pattern) + │ + ├─► metadata_extractor.extract() [INTERFACE: PPTMetadataExtractor] + ├─► metadata_extractor.format() [INTERFACE: PPTMetadataExtractor] + │ + └─► For each slide: + │ + ├─► page_tag_processor.create_slide_tag() [INTERFACE: PageTagProcessor] + │ + └─► For each shape: + │ + ├─► If table: convert_table_to_html() [FUNCTION] + ├─► If chart: get_next_chart() callback [Pre-extracted] + ├─► If picture: process_image_shape() [FUNCTION] + ├─► If group: process_group_shape() [FUNCTION] + └─► If text: extract_text_with_bullets() [FUNCTION] +``` + +--- + +## HWP Handler Flow + +``` +HWPHandler.extract_text(current_file) + │ + ├─► file_converter.validate(file_data) [INTERFACE: HWPFileConverter] + │ └─► Check if OLE file (magic number check) + │ + ├─► If not OLE file: + │ └─► _handle_non_ole_file() [INTERNAL] + │ ├─► ZIP detected → HWPXHandler delegation + │ └─► 
HWP 3.0 → Not supported + │ + ├─► chart_extractor.extract_all_from_file() [INTERFACE: HWPChartExtractor] + │ + ├─► file_converter.convert() [INTERFACE: HWPFileConverter] + │ └─► Binary → olefile.OleFileIO + │ + ├─► preprocessor.preprocess(ole) [INTERFACE: HWPPreprocessor] + │ └─► Returns PreprocessedData (ole in extracted_resources) + │ + ├─► metadata_extractor.extract() [INTERFACE: HWPMetadataExtractor] + ├─► metadata_extractor.format() [INTERFACE: HWPMetadataExtractor] + │ + ├─► _parse_docinfo(ole) [INTERNAL] + │ └─► parse_doc_info() [FUNCTION] + │ + ├─► _extract_body_text(ole) [INTERNAL] + │ │ + │ └─► For each section: + │ ├─► decompress_section() [FUNCTION] + │ └─► _parse_section() [INTERNAL] + │ └─► _process_picture() [INTERNAL - format_image_processor 사용] + │ + ├─► format_image_processor.process_images_from_bindata() [INTERFACE: HWPImageProcessor] + │ + └─► file_converter.close(ole) [INTERFACE: HWPFileConverter] +``` + +--- + +## HWPX Handler Flow + +``` +HWPXHandler.extract_text(current_file) + │ + ├─► get_file_stream(current_file) [INHERITED: BaseHandler] + │ └─► BytesIO(file_data) + │ + ├─► _is_valid_zip(file_stream) [INTERNAL] + │ + ├─► chart_extractor.extract_all_from_file() [INTERFACE: HWPXChartExtractor] + │ + ├─► zipfile.ZipFile(file_stream) [EXTERNAL LIBRARY] + │ + ├─► preprocessor.preprocess(zf) [INTERFACE: HWPXPreprocessor] + │ └─► Returns PreprocessedData (extracted_resources available) + │ + ├─► metadata_extractor.extract() [INTERFACE: HWPXMetadataExtractor] + ├─► metadata_extractor.format() [INTERFACE: HWPXMetadataExtractor] + │ + ├─► parse_bin_item_map(zf) [FUNCTION] + │ + ├─► For each section: + │ │ + │ └─► parse_hwpx_section() [FUNCTION] + │ │ + │ ├─► format_image_processor.process_images() [INTERFACE: HWPXImageProcessor] + │ │ + │ └─► parse_hwpx_table() [FUNCTION] + │ + └─► format_image_processor.get_remaining_images() [INTERFACE: HWPXImageProcessor] + format_image_processor.process_images() [INTERFACE: HWPXImageProcessor] +``` + +--- + +## 
CSV Handler Flow + +``` +CSVHandler.extract_text(current_file) + │ + ├─► file_converter.convert(file_data, encoding) [INTERFACE: CSVFileConverter] + │ └─► Binary → Text (with encoding detection) + │ + ├─► preprocessor.preprocess(content) [INTERFACE: CSVPreprocessor] + │ └─► Returns PreprocessedData (content in clean_content) + │ + ├─► detect_delimiter(content) [FUNCTION] + │ + ├─► parse_csv_content(content, delimiter) [FUNCTION] + │ + ├─► detect_header(rows) [FUNCTION] + │ + ├─► metadata_extractor.extract(source_info) [INTERFACE: CSVMetadataExtractor] + │ └─► CSVSourceInfo contains: file_path, encoding, delimiter, rows, has_header + │ + └─► convert_rows_to_table(rows, has_header) [FUNCTION] + └─► Returns HTML table +``` + +--- + +## Text Handler Flow + +``` +TextHandler.extract_text(current_file) + │ + ├─► preprocessor.preprocess(file_data) [INTERFACE: TextPreprocessor] + │ └─► Returns PreprocessedData (file_data in clean_content) + │ + ├─► file_data.decode(encoding) [DIRECT: No FileConverter used] + │ └─► Try encodings: utf-8, utf-8-sig, cp949, euc-kr, latin-1, ascii + │ + └─► clean_text() / clean_code_text() [FUNCTION: utils.py] +``` + +Note: TextHandler는 file_converter를 사용하지 않고 직접 decode합니다. + +--- + +## HTML Handler Flow + +``` +HTMLReprocessor (Utility - NOT a BaseHandler subclass) + │ + ├─► clean_html_file(html_content) [FUNCTION] + │ │ + │ ├─► BeautifulSoup parsing + │ ├─► Remove unwanted tags (script, style, etc.) + │ ├─► Remove style attributes + │ ├─► _process_table_merged_cells() + │ └─► Return cleaned HTML string + │ + └─► Used by DOCHandler when HTML format detected +``` + +Note: HTML은 별도의 BaseHandler 서브클래스가 없습니다. + DOCHandler가 HTML 형식을 감지하면 내부적으로 BeautifulSoup으로 처리합니다. 
+ +--- + +## Image File Handler Flow + +``` +ImageFileHandler.extract_text(current_file) + │ + ├─► preprocessor.preprocess(file_data) [INTERFACE: ImageFilePreprocessor] + │ └─► Returns PreprocessedData (file_data in clean_content) + │ + ├─► Validate file extension [INTERNAL] + │ └─► SUPPORTED_IMAGE_EXTENSIONS: jpg, jpeg, png, gif, bmp, webp + │ + ├─► If OCR engine is None: + │ └─► _build_image_tag(file_path) [INTERNAL] + │ └─► Return [image:path] tag + │ + └─► If OCR engine available: + └─► _ocr_engine.extract_text() [INTERFACE: BaseOCR] + └─► Image → Text via OCR +``` + +Note: ImageFileHandler는 OCR 엔진이 설정된 경우에만 실제 텍스트 추출이 가능합니다. + +--- + +## Chunking Flow + +``` +chunk_text(text, chunk_size, chunk_overlap) + │ + └─► create_chunks() [FUNCTION] + │ + ├─► _extract_document_metadata() [FUNCTION] + │ + ├─► Detect file type: + │ │ + │ ├─► Table-based (xlsx, xls, csv): + │ │ └─► chunk_multi_sheet_content() [FUNCTION] + │ │ + │ ├─► Text with page markers: + │ │ └─► chunk_by_pages() [FUNCTION] + │ │ + │ └─► Plain text: + │ └─► chunk_plain_text() [FUNCTION] + │ + └─► _prepend_metadata_to_chunks() [FUNCTION] +``` + +--- + +## Interface Integration Summary + +``` +┌─────────────┬─────────────────────┬─────────────────────┬─────────────────────┬─────────────────────┬─────────────────────┐ +│ Handler │ FileConverter │ Preprocessor │ MetadataExtractor │ ChartExtractor │ FormatImageProcessor│ +├─────────────┼─────────────────────┼─────────────────────┼─────────────────────┼─────────────────────┼─────────────────────┤ +│ PDF │ ✅ PDFFileConverter │ ✅ PDFPreprocessor │ ✅ PDFMetadata │ ❌ NullChart │ ✅ PDFImage │ +│ DOCX │ ✅ DOCXFileConverter │ ✅ DOCXPreprocessor │ ✅ DOCXMetadata │ ✅ DOCXChart │ ✅ DOCXImage │ +│ DOC │ ✅ DOCFileConverter │ ✅ DOCPreprocessor │ ❌ NullMetadata │ ❌ NullChart │ ✅ DOCImage │ +│ RTF │ ✅ RTFFileConverter │ ✅ RTFPreprocessor* │ ✅ RTFMetadata │ ❌ NullChart │ ❌ Uses base │ +│ XLSX │ ✅ ExcelFileConverter│ ✅ ExcelPreprocessor │ ✅ XLSXMetadata │ ✅ ExcelChart │ ✅ 
ExcelImage │ +│ XLS │ ✅ ExcelFileConverter│ ✅ ExcelPreprocessor │ ✅ XLSMetadata │ ✅ ExcelChart │ ✅ ExcelImage │ +│ PPT/PPTX │ ✅ PPTFileConverter │ ✅ PPTPreprocessor │ ✅ PPTMetadata │ ✅ PPTChart │ ✅ PPTImage │ +│ HWP │ ✅ HWPFileConverter │ ✅ HWPPreprocessor │ ✅ HWPMetadata │ ✅ HWPChart │ ✅ HWPImage │ +│ HWPX │ ❌ None (직접 ZIP) │ ✅ HWPXPreprocessor │ ✅ HWPXMetadata │ ✅ HWPXChart │ ✅ HWPXImage │ +│ CSV │ ✅ CSVFileConverter │ ✅ CSVPreprocessor │ ✅ CSVMetadata │ ❌ NullChart │ ✅ CSVImage │ +│ TXT/MD/JSON │ ❌ None (직접 decode)│ ✅ TextPreprocessor │ ❌ NullMetadata │ ❌ NullChart │ ✅ TextImage │ +│ HTML │ ❌ N/A (유틸리티) │ ❌ N/A │ ❌ N/A │ ❌ N/A │ ❌ N/A │ +│ Image Files │ ✅ ImageFileConverter│ ✅ ImagePreprocessor │ ❌ NullMetadata │ ❌ NullChart │ ✅ ImageFileImage │ +└─────────────┴─────────────────────┴─────────────────────┴─────────────────────┴─────────────────────┴─────────────────────┘ + +✅ = Interface implemented +❌ = Not applicable / NullExtractor / Not used +* = RTFPreprocessor has actual processing logic (image extraction, binary cleanup) +``` + +--- + +## Handler Processing Pipeline + +모든 핸들러는 동일한 처리 파이프라인을 따릅니다: + +``` +┌──────────────────────────────────────────────────────────────────────────────────┐ +│ Handler Processing Pipeline │ +├──────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ 1. FileConverter.convert() Binary → Format-specific object │ +│ │ (fitz.Document, docx.Document, olefile, etc.) │ +│ ▼ │ +│ 2. Preprocessor.preprocess() Process/clean the converted data │ +│ │ (image extraction, binary cleanup, encoding) │ +│ ▼ │ +│ 3. MetadataExtractor.extract() Extract document metadata │ +│ │ (title, author, created date, etc.) │ +│ ▼ │ +│ 4. Content Extraction Format-specific content extraction │ +│ │ (text, tables, images, charts) │ +│ ▼ │ +│ 5. 
Result Assembly Build final result string │ +│ │ +└──────────────────────────────────────────────────────────────────────────────────┘ + +Note: 대부분의 핸들러에서 Preprocessor는 pass-through (NullPreprocessor). + RTF는 예외로, RTFPreprocessor에서 실제 바이너리 처리가 이루어짐. +``` + +--- + +## Remaining Function-Based Components + +``` +┌─────────────┬────────────────────────────────────────────────────────────┐ +│ Handler │ Function-Based Components │ +├─────────────┼────────────────────────────────────────────────────────────┤ +│ PDF │ extract_text_blocks(), merge_page_elements(), │ +│ │ ComplexityAnalyzer, VectorTextOCREngine, │ +│ │ BlockImageEngine │ +├─────────────┼────────────────────────────────────────────────────────────┤ +│ DOCX │ process_paragraph_element(), process_table_element() │ +├─────────────┼────────────────────────────────────────────────────────────┤ +│ DOC │ Format detection, OLE/HTML/DOCX internal processing │ +├─────────────┼────────────────────────────────────────────────────────────┤ +│ RTF │ decode_content() (rtf_decoder.py) │ +│ │ extract_tables_with_positions() (rtf_table_extractor.py) │ +│ │ extract_inline_content() (rtf_content_extractor.py) │ +├─────────────┼────────────────────────────────────────────────────────────┤ +│ Excel │ extract_textboxes_from_xlsx(), convert_xlsx_sheet_to_table│ +│ │ convert_xls_sheet_to_table(), convert_*_objects_to_tables │ +├─────────────┼────────────────────────────────────────────────────────────┤ +│ PPT │ extract_text_with_bullets(), convert_table_to_html(), │ +│ │ process_image_shape(), process_group_shape() │ +├─────────────┼────────────────────────────────────────────────────────────┤ +│ HWP │ parse_doc_info(), decompress_section() │ +├─────────────┼────────────────────────────────────────────────────────────┤ +│ HWPX │ parse_bin_item_map(), parse_hwpx_section() │ +├─────────────┼────────────────────────────────────────────────────────────┤ +│ CSV │ detect_delimiter(), parse_csv_content(), detect_header(), │ +│ │ 
convert_rows_to_table() │ +├─────────────┼────────────────────────────────────────────────────────────┤ +│ Text │ clean_text(), clean_code_text() (utils.py) │ +├─────────────┼────────────────────────────────────────────────────────────┤ +│ HTML │ clean_html_file(), _process_table_merged_cells() │ +│ │ (html_reprocessor.py - utility, not handler) │ +├─────────────┼────────────────────────────────────────────────────────────┤ +│ Image │ OCR engine integration (BaseOCR subclass) │ +├─────────────┼────────────────────────────────────────────────────────────┤ +│ Chunking │ create_chunks(), chunk_by_pages(), chunk_plain_text(), │ +│ │ chunk_multi_sheet_content(), chunk_large_table() │ +└─────────────┴────────────────────────────────────────────────────────────┘ +``` diff --git a/contextifier/core/document_processor.py b/contextifier/core/document_processor.py index c262971..809997f 100644 --- a/contextifier/core/document_processor.py +++ b/contextifier/core/document_processor.py @@ -263,8 +263,8 @@ class DocumentProcessor: """ # === Supported File Type Classifications === - DOCUMENT_TYPES = frozenset(['pdf', 'docx', 'doc', 'pptx', 'ppt', 'hwp', 'hwpx']) - TEXT_TYPES = frozenset(['txt', 'md', 'markdown', 'rtf']) + DOCUMENT_TYPES = frozenset(['pdf', 'docx', 'doc', 'rtf', 'pptx', 'ppt', 'hwp', 'hwpx']) + TEXT_TYPES = frozenset(['txt', 'md', 'markdown']) CODE_TYPES = frozenset([ 'py', 'js', 'ts', 'java', 'cpp', 'c', 'h', 'cs', 'go', 'rs', 'php', 'rb', 'swift', 'kt', 'scala', 'dart', 'r', 'sql', @@ -291,6 +291,8 @@ def __init__( slide_tag_suffix: Optional[str] = None, chart_tag_prefix: Optional[str] = None, chart_tag_suffix: Optional[str] = None, + metadata_tag_prefix: Optional[str] = None, + metadata_tag_suffix: Optional[str] = None, **kwargs ): """ @@ -328,6 +330,12 @@ def __init__( chart_tag_suffix: Suffix for chart tags in extracted text - Default: "[/chart]" - Example: "" for XML format + metadata_tag_prefix: Opening tag for metadata section + - Default: "" + - Example: 
"" for custom format + metadata_tag_suffix: Closing tag for metadata section + - Default: "" + - Example: "" for custom format **kwargs: Additional configuration options Example: @@ -342,7 +350,9 @@ def __init__( ... page_tag_prefix="", ... page_tag_suffix="", ... chart_tag_prefix="", - ... chart_tag_suffix="" + ... chart_tag_suffix="", + ... metadata_tag_prefix="", + ... metadata_tag_suffix="" ... ) >>> # Markdown format @@ -359,6 +369,10 @@ def __init__( self._ocr_engine = ocr_engine self._kwargs = kwargs self._supported_extensions: Optional[List[str]] = None + + # Store metadata tag settings + self._metadata_tag_prefix = metadata_tag_prefix + self._metadata_tag_suffix = metadata_tag_suffix # Logger setup self._logger = logging.getLogger("contextify.processor") @@ -389,12 +403,19 @@ def __init__( chart_tag_prefix=chart_tag_prefix, chart_tag_suffix=chart_tag_suffix ) + + # Create instance-specific MetadataFormatter + self._metadata_formatter = self._create_metadata_formatter( + metadata_tag_prefix=metadata_tag_prefix, + metadata_tag_suffix=metadata_tag_suffix + ) # Add processors to config for handlers to access if isinstance(self._config, dict): self._config["image_processor"] = self._image_processor self._config["page_tag_processor"] = self._page_tag_processor self._config["chart_processor"] = self._chart_processor + self._config["metadata_formatter"] = self._metadata_formatter # ========================================================================= # Public Properties @@ -484,6 +505,26 @@ def chart_processor(self) -> Any: """Current ChartProcessor instance for this DocumentProcessor.""" return self._chart_processor + @property + def metadata_tag_config(self) -> Dict[str, Any]: + """ + Current metadata formatter configuration. 
+ + Returns: + Dictionary containing: + - metadata_tag_prefix: Opening tag for metadata section + - metadata_tag_suffix: Closing tag for metadata section + """ + return { + "metadata_tag_prefix": self._metadata_formatter.metadata_tag_prefix, + "metadata_tag_suffix": self._metadata_formatter.metadata_tag_suffix, + } + + @property + def metadata_formatter(self) -> Any: + """Current MetadataFormatter instance for this DocumentProcessor.""" + return self._metadata_formatter + @property def ocr_engine(self) -> Optional[Any]: """Current OCR engine instance.""" @@ -875,6 +916,34 @@ def _create_chart_processor( tag_suffix=chart_tag_suffix ) + def _create_metadata_formatter( + self, + metadata_tag_prefix: Optional[str] = None, + metadata_tag_suffix: Optional[str] = None + ) -> Any: + """ + Create a MetadataFormatter instance for this DocumentProcessor. + + This creates an instance-specific MetadataFormatter that will be + passed to handlers via config. + + Args: + metadata_tag_prefix: Opening tag (default: "") + metadata_tag_suffix: Closing tag (default: "") + + Returns: + MetadataFormatter instance + """ + from contextifier.core.functions.metadata_extractor import MetadataFormatter + + kwargs = {} + if metadata_tag_prefix is not None: + kwargs["metadata_tag_prefix"] = metadata_tag_prefix + if metadata_tag_suffix is not None: + kwargs["metadata_tag_suffix"] = metadata_tag_suffix + + return MetadataFormatter(**kwargs) + def _build_supported_extensions(self) -> List[str]: """Build list of supported extensions.""" extensions = list( @@ -940,6 +1009,19 @@ def _get_handler_registry(self) -> Dict[str, Callable]: except ImportError as e: self._logger.warning(f"DOC handler not available: {e}") + # RTF handler + try: + from contextifier.core.processor.rtf_handler import RTFHandler + rtf_handler = RTFHandler( + config=self._config, + image_processor=self._image_processor, + page_tag_processor=self._page_tag_processor, + chart_processor=self._chart_processor + ) + 
self._handler_registry['rtf'] = rtf_handler.extract_text + except ImportError as e: + self._logger.warning(f"RTF handler not available: {e}") + # PPT/PPTX handler try: from contextifier.core.processor.ppt_handler import PPTHandler @@ -997,7 +1079,7 @@ def _get_handler_registry(self) -> Dict[str, Callable]: # HWPX handler try: - from contextifier.core.processor.hwps_handler import HWPXHandler + from contextifier.core.processor.hwpx_handler import HWPXHandler hwpx_handler = HWPXHandler( config=self._config, image_processor=self._image_processor, diff --git a/contextifier/core/functions/__init__.py b/contextifier/core/functions/__init__.py index 905bb4e..07d125c 100644 --- a/contextifier/core/functions/__init__.py +++ b/contextifier/core/functions/__init__.py @@ -1,17 +1,19 @@ # libs/core/functions/__init__.py """ -Functions - 공통 유틸리티 함수 모듈 +Functions - Common Utility Functions Module -문서 처리에 사용되는 공통 유틸리티 함수들을 제공합니다. +Provides common utility functions used in document processing. -모듈 구성: -- utils: 텍스트 정리, 코드 정리, JSON 정리 등 유틸리티 함수 -- img_processor: 이미지 처리 및 저장 (ImageProcessor 클래스) -- ppt2pdf: PPT를 PDF로 변환하는 함수 +Module Components: +- utils: Text cleaning, code cleaning, JSON sanitization utilities +- img_processor: Image processing and storage (ImageProcessor class) +- storage_backend: Storage backend implementations (Local, MinIO, S3) +- metadata_extractor: Document metadata extraction interface -사용 예시: +Usage Example: from contextifier.core.functions import clean_text, clean_code_text from contextifier.core.functions import ImageProcessor, save_image_to_file + from contextifier.core.functions.storage_backend import LocalStorageBackend from contextifier.core.functions.utils import sanitize_text_for_json """ @@ -21,7 +23,18 @@ sanitize_text_for_json, ) -# 이미지 처리 모듈 +# Storage backend module +from contextifier.core.functions.storage_backend import ( + StorageType, + BaseStorageBackend, + LocalStorageBackend, + MinIOStorageBackend, + S3StorageBackend, + 
create_storage_backend, + get_default_backend, +) + +# Image processor module from contextifier.core.functions.img_processor import ( ImageProcessor, ImageProcessorConfig, @@ -29,18 +42,43 @@ NamingStrategy, save_image_to_file, create_image_processor, + DEFAULT_IMAGE_CONFIG, +) + +# Metadata extraction module +from contextifier.core.functions.metadata_extractor import ( + MetadataField, + DocumentMetadata, + MetadataFormatter, + BaseMetadataExtractor, + format_metadata, ) __all__ = [ - # 텍스트 유틸리티 + # Text utilities "clean_text", "clean_code_text", "sanitize_text_for_json", - # 이미지 처리 + # Storage backends + "StorageType", + "BaseStorageBackend", + "LocalStorageBackend", + "MinIOStorageBackend", + "S3StorageBackend", + "create_storage_backend", + "get_default_backend", + # Image processor (base class for all format-specific processors) "ImageProcessor", "ImageProcessorConfig", "ImageFormat", "NamingStrategy", "save_image_to_file", "create_image_processor", + "DEFAULT_IMAGE_CONFIG", + # Metadata extraction + "MetadataField", + "DocumentMetadata", + "MetadataFormatter", + "BaseMetadataExtractor", + "format_metadata", ] diff --git a/contextifier/core/functions/file_converter.py b/contextifier/core/functions/file_converter.py new file mode 100644 index 0000000..74c8d55 --- /dev/null +++ b/contextifier/core/functions/file_converter.py @@ -0,0 +1,219 @@ +# libs/core/functions/file_converter.py +""" +BaseFileConverter - Abstract base class for file format conversion + +Defines the interface for converting binary file data to a workable format. +Each handler can optionally implement a format-specific converter. + +The converter's job is to transform raw binary data into a format-specific +object that the handler can work with (e.g., Document, Workbook, OLE file). 
+ +This is the FIRST step in the processing pipeline: + Binary Data → FileConverter → Workable Object → Handler Processing + +Usage: + class PDFFileConverter(BaseFileConverter): + def convert(self, file_data: bytes, file_stream: BinaryIO) -> Any: + import fitz + return fitz.open(stream=file_data, filetype="pdf") + + def get_format_name(self) -> str: + return "PDF Document" +""" +from abc import ABC, abstractmethod +from io import BytesIO +from typing import Any, Optional, Union, BinaryIO + + +class BaseFileConverter(ABC): + """ + Abstract base class for file format converters. + + Converts raw binary file data into a format-specific workable object. + This is the first processing step before text extraction. + + Subclasses must implement: + - convert(): Convert binary data to workable format + - get_format_name(): Return human-readable format name + """ + + @abstractmethod + def convert( + self, + file_data: bytes, + file_stream: Optional[BinaryIO] = None, + **kwargs + ) -> Any: + """ + Convert binary file data to a workable format. + + Args: + file_data: Raw binary file data + file_stream: Optional file stream (BytesIO) for libraries that prefer streams + **kwargs: Additional format-specific options + + Returns: + Format-specific object (Document, Workbook, OLE file, etc.) + + Raises: + ConversionError: If conversion fails + """ + pass + + @abstractmethod + def get_format_name(self) -> str: + """ + Return human-readable format name. + + Returns: + Format name string (e.g., "PDF Document", "DOCX Document") + """ + pass + + def validate(self, file_data: bytes) -> bool: + """ + Validate if the file data can be converted by this converter. + + Override this method to add format-specific validation. + Default implementation returns True. + + Args: + file_data: Raw binary file data + + Returns: + True if file can be converted, False otherwise + """ + return True + + def close(self, converted_object: Any) -> None: + """ + Close/cleanup the converted object if needed. 
+ + Override this method if the converted object needs explicit cleanup. + Default implementation does nothing. + + Args: + converted_object: The object returned by convert() + """ + pass + + +class NullFileConverter(BaseFileConverter): + """ + Null implementation of file converter. + + Used as default when no conversion is needed. + Returns the original file data unchanged. + """ + + def convert( + self, + file_data: bytes, + file_stream: Optional[BinaryIO] = None, + **kwargs + ) -> bytes: + """Return file data unchanged.""" + return file_data + + def get_format_name(self) -> str: + """Return generic format name.""" + return "Raw Binary" + + +class PassThroughConverter(BaseFileConverter): + """ + Pass-through converter that returns file stream. + + Used for handlers that work directly with BytesIO streams. + """ + + def convert( + self, + file_data: bytes, + file_stream: Optional[BinaryIO] = None, + **kwargs + ) -> BinaryIO: + """Return BytesIO stream of file data.""" + if file_stream is not None: + file_stream.seek(0) + return file_stream + return BytesIO(file_data) + + def get_format_name(self) -> str: + """Return format name.""" + return "Binary Stream" + + +class TextFileConverter(BaseFileConverter): + """ + Converter for text-based files. + + Decodes binary data to text string using encoding detection. + """ + + DEFAULT_ENCODINGS = ['utf-8', 'utf-8-sig', 'cp949', 'euc-kr', 'latin-1', 'ascii'] + + def __init__(self, encodings: Optional[list] = None): + """ + Initialize TextFileConverter. + + Args: + encodings: List of encodings to try (default: common encodings) + """ + self._encodings = encodings or self.DEFAULT_ENCODINGS + self._detected_encoding: Optional[str] = None + + def convert( + self, + file_data: bytes, + file_stream: Optional[BinaryIO] = None, + encoding: Optional[str] = None, + **kwargs + ) -> str: + """ + Convert binary data to text string. 
+ + Args: + file_data: Raw binary file data + file_stream: Ignored for text conversion + encoding: Specific encoding to use (None for auto-detect) + **kwargs: Additional options + + Returns: + Decoded text string + + Raises: + UnicodeDecodeError: If decoding fails with all encodings + """ + # Try specified encoding first + if encoding: + try: + result = file_data.decode(encoding) + self._detected_encoding = encoding + return result + except UnicodeDecodeError: + pass + + # Try each encoding in order + for enc in self._encodings: + try: + result = file_data.decode(enc) + self._detected_encoding = enc + return result + except UnicodeDecodeError: + continue + + # Fallback: decode with errors='replace' + self._detected_encoding = 'utf-8' + return file_data.decode('utf-8', errors='replace') + + def get_format_name(self) -> str: + """Return format name with detected encoding.""" + if self._detected_encoding: + return f"Text ({self._detected_encoding})" + return "Text" + + @property + def detected_encoding(self) -> Optional[str]: + """Return the encoding detected during last conversion.""" + return self._detected_encoding diff --git a/contextifier/core/functions/img_processor.py b/contextifier/core/functions/img_processor.py index 3b3d755..a594ff1 100644 --- a/contextifier/core/functions/img_processor.py +++ b/contextifier/core/functions/img_processor.py @@ -1,25 +1,39 @@ -# libs/core/functions/img_processor.py +# contextifier/core/functions/img_processor.py """ Image Processing Module -Provides functionality to save image data to the local file system and convert to tag format. -A general-purpose image processing module that replaces the existing image upload functions. +Provides functionality to save image data to various storage backends +and convert to tag format. Uses Strategy pattern for storage backends. + +This is the BASE class for all image processors. +Format-specific processors (PDFImageProcessor, DOCXImageProcessor, etc.) 
+should inherit from ImageProcessor and override process_image() method. Main Features: -- Save image data to a specified directory +- Base ImageProcessor class with pluggable storage backend +- Save image data to specified storage (Local, MinIO, S3, etc.) - Return saved path in custom tag format - Duplicate image detection and handling - Support for various image formats +- Extensible for format-specific processing Usage Example: from contextifier.core.functions.img_processor import ImageProcessor + from contextifier.core.functions.storage_backend import ( + LocalStorageBackend, + MinIOStorageBackend, + ) - # Use with default settings + # Use with default settings (local storage) processor = ImageProcessor() tag = processor.save_image(image_bytes) - # Result: "[Image:temp/abc123.png]" + # Result: "[Image:temp/images/abc123.png]" - # Custom settings + # Use with MinIO storage (when implemented) + minio_backend = MinIOStorageBackend(endpoint="localhost:9000", bucket="images") + processor = ImageProcessor(storage_backend=minio_backend) + + # Custom tag format processor = ImageProcessor( directory_path="output/images", tag_prefix="" + + # Inherit for format-specific processing + class PDFImageProcessor(ImageProcessor): + def process_image(self, image_data: bytes, **kwargs) -> Optional[str]: + xref = kwargs.get('xref') + custom_name = f"pdf_xref_{xref}" if xref else None + return self.save_image(image_data, custom_name=custom_name) """ import hashlib import io import logging import os -import tempfile import uuid from dataclasses import dataclass, field from enum import Enum from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union +from typing import Any, Dict, List, Optional, Set, Union -logger = logging.getLogger("document-processor") +from contextifier.core.functions.storage_backend import ( + BaseStorageBackend, + LocalStorageBackend, + StorageType, + get_default_backend, +) + +logger = 
logging.getLogger("contextify.image_processor") class ImageFormat(Enum): - """Supported image formats""" + """Supported image formats.""" PNG = "png" JPEG = "jpeg" JPG = "jpg" @@ -55,7 +82,7 @@ class ImageFormat(Enum): class NamingStrategy(Enum): - """Image file naming strategies""" + """Image file naming strategies.""" HASH = "hash" # Content-based hash (prevents duplicates) UUID = "uuid" # Unique UUID SEQUENTIAL = "sequential" # Sequential numbering @@ -65,10 +92,10 @@ class NamingStrategy(Enum): @dataclass class ImageProcessorConfig: """ - ImageProcessor Configuration + ImageProcessor Configuration. Attributes: - directory_path: Directory path to save images + directory_path: Directory path or bucket prefix for saving images tag_prefix: Tag prefix (e.g., "[Image:") tag_suffix: Tag suffix (e.g., "]") naming_strategy: File naming strategy @@ -78,7 +105,7 @@ class ImageProcessorConfig: hash_algorithm: Hash algorithm (for hash strategy) max_filename_length: Maximum filename length """ - directory_path: str = "temp" + directory_path: str = "temp/images" tag_prefix: str = "[Image:" tag_suffix: str = "]" naming_strategy: NamingStrategy = NamingStrategy.HASH @@ -91,23 +118,29 @@ class ImageProcessorConfig: class ImageProcessor: """ - Image Processing Class - - Saves image data to the local file system and returns + Base Image Processing Class. + + Saves image data using a pluggable storage backend and returns the saved path in the specified tag format. - + + This is the BASE CLASS for all format-specific image processors. + Subclasses should override process_image() for format-specific handling. 
+ Args: - directory_path: Image save directory (default: "temp") + directory_path: Image save directory (default: "temp/images") tag_prefix: Tag prefix (default: "[Image:") tag_suffix: Tag suffix (default: "]") naming_strategy: File naming strategy (default: HASH) - config: ImageProcessorConfig object (takes precedence over individual parameters) - + storage_backend: Storage backend instance (default: LocalStorageBackend) + config: ImageProcessorConfig object (takes precedence) + Examples: + >>> # Default usage (local storage) >>> processor = ImageProcessor() >>> tag = processor.save_image(image_bytes) - "[Image:temp/a1b2c3d4.png]" - + "[Image:temp/images/a1b2c3d4.png]" + + >>> # Custom directory and tags >>> processor = ImageProcessor( ... directory_path="images", ... tag_prefix="![image](", @@ -115,56 +148,90 @@ class ImageProcessor: ... ) >>> tag = processor.save_image(image_bytes) "![image](images/a1b2c3d4.png)" + + >>> # Subclass for format-specific processing + >>> class PDFImageProcessor(ImageProcessor): + ... def process_image(self, image_data, **kwargs): + ... xref = kwargs.get('xref') + ... 
return self.save_image(image_data, custom_name=f"pdf_{xref}") """ - + def __init__( self, - directory_path: str = "temp", + directory_path: str = "temp/images", tag_prefix: str = "[Image:", tag_suffix: str = "]", naming_strategy: Union[NamingStrategy, str] = NamingStrategy.HASH, + storage_backend: Optional[BaseStorageBackend] = None, config: Optional[ImageProcessorConfig] = None, ): + # Set config if config: self.config = config else: - # Convert string to Enum if needed if isinstance(naming_strategy, str): naming_strategy = NamingStrategy(naming_strategy.lower()) - + self.config = ImageProcessorConfig( directory_path=directory_path, tag_prefix=tag_prefix, tag_suffix=tag_suffix, naming_strategy=naming_strategy, ) - + + # Set storage backend (default: local) + self._storage_backend = storage_backend or get_default_backend() + # Track processed image hashes (for duplicate prevention) self._processed_hashes: Dict[str, str] = {} - + # Sequential counter (for sequential strategy) self._sequential_counter: int = 0 - - # Create directory + + # Logger + self._logger = logging.getLogger("contextify.image_processor.ImageProcessor") + + # Create directory if using local storage if self.config.create_directory: - self._ensure_directory_exists() - - def _ensure_directory_exists(self) -> None: - """Check if directory exists and create if not""" - path = Path(self.config.directory_path) - if not path.exists(): - path.mkdir(parents=True, exist_ok=True) - logger.debug(f"Created directory: {path}") - + self._ensure_storage_ready() + + @property + def storage_backend(self) -> BaseStorageBackend: + """Get the current storage backend.""" + return self._storage_backend + + @storage_backend.setter + def storage_backend(self, backend: BaseStorageBackend) -> None: + """ + Set storage backend. 
+ + Args: + backend: New storage backend instance + """ + self._storage_backend = backend + if self.config.create_directory: + self._ensure_storage_ready() + + @property + def storage_type(self) -> StorageType: + """Get the current storage type.""" + return self._storage_backend.storage_type + + def _ensure_storage_ready(self) -> None: + """Ensure storage is ready.""" + self._storage_backend.ensure_ready(self.config.directory_path) + def _compute_hash(self, data: bytes) -> str: - """Compute hash of image data""" + """Compute hash of image data.""" hasher = hashlib.new(self.config.hash_algorithm) hasher.update(data) - return hasher.hexdigest()[:32] # Use first 32 characters - + return hasher.hexdigest()[:32] + def _detect_format(self, data: bytes) -> ImageFormat: - """Detect format from image data""" - # Detect format using magic bytes + """Detect format from image data using magic bytes.""" + if len(data) < 12: + return ImageFormat.UNKNOWN + if data[:8] == b'\x89PNG\r\n\x1a\n': return ImageFormat.PNG elif data[:2] == b'\xff\xd8': @@ -179,25 +246,27 @@ def _detect_format(self, data: bytes) -> ImageFormat: return ImageFormat.TIFF else: return ImageFormat.UNKNOWN - + def _generate_filename( self, data: bytes, image_format: ImageFormat, custom_name: Optional[str] = None ) -> str: - """Generate filename""" + """Generate filename based on naming strategy.""" if custom_name: - # Add extension - if not any(custom_name.lower().endswith(f".{fmt.value}") for fmt in ImageFormat if fmt != ImageFormat.UNKNOWN): - ext = image_format.value if image_format != ImageFormat.UNKNOWN else self.config.default_format.value + if not any(custom_name.lower().endswith(f".{fmt.value}") + for fmt in ImageFormat if fmt != ImageFormat.UNKNOWN): + ext = (image_format.value if image_format != ImageFormat.UNKNOWN + else self.config.default_format.value) return f"{custom_name}.{ext}" return custom_name - - ext = image_format.value if image_format != ImageFormat.UNKNOWN else 
self.config.default_format.value - + + ext = (image_format.value if image_format != ImageFormat.UNKNOWN + else self.config.default_format.value) + strategy = self.config.naming_strategy - + if strategy == NamingStrategy.HASH: base = self._compute_hash(data) elif strategy == NamingStrategy.UUID: @@ -210,28 +279,29 @@ def _generate_filename( base = f"img_{int(time.time() * 1000)}" else: base = self._compute_hash(data) - + filename = f"{base}.{ext}" - - # Limit filename length + if len(filename) > self.config.max_filename_length: max_base_len = self.config.max_filename_length - len(ext) - 1 filename = f"{base[:max_base_len]}.{ext}" - + return filename - + + def _build_file_path(self, filename: str) -> str: + """Build full file path from filename.""" + return os.path.join(self.config.directory_path, filename) + def _build_tag(self, file_path: str) -> str: - """Build tag from saved file path""" + """Build tag from file path.""" if self.config.use_absolute_path: path_str = str(Path(file_path).absolute()) else: - path_str = file_path - - # Normalize path separators (Windows -> Unix style) + path_str = self._storage_backend.build_url(file_path) + path_str = path_str.replace("\\", "/") - return f"{self.config.tag_prefix}{path_str}{self.config.tag_suffix}" - + def save_image( self, image_data: bytes, @@ -240,72 +310,148 @@ def save_image( skip_duplicate: bool = True, ) -> Optional[str]: """ - Save image data to file and return tag. - + Save image data and return tag. 
+ Args: image_data: Image binary data custom_name: Custom filename (extension optional) processed_images: Set of processed image paths (for external duplicate tracking) - skip_duplicate: If True, skip saving duplicate images (return existing path) - + skip_duplicate: If True, skip saving duplicate images + Returns: Image tag string, or None on failure - + Examples: >>> processor = ImageProcessor() >>> tag = processor.save_image(png_bytes) - "[Image:temp/abc123.png]" + "[Image:temp/images/abc123.png]" """ if not image_data: - logger.warning("Empty image data provided") + self._logger.warning("Empty image data provided") return None - + try: # Detect image format image_format = self._detect_format(image_data) - - # Compute hash (for duplicate check) + + # Compute hash image_hash = self._compute_hash(image_data) - + # Check for duplicates if skip_duplicate and image_hash in self._processed_hashes: existing_path = self._processed_hashes[image_hash] - logger.debug(f"Duplicate image detected, returning existing: {existing_path}") + self._logger.debug(f"Duplicate image detected: {existing_path}") return self._build_tag(existing_path) - + # Generate filename filename = self._generate_filename(image_data, image_format, custom_name) - - # Full path - file_path = os.path.join(self.config.directory_path, filename) - + file_path = self._build_file_path(filename) + # Check external duplicate tracking if processed_images is not None and file_path in processed_images: - logger.debug(f"Image already processed externally: {file_path}") + self._logger.debug(f"Image already processed: {file_path}") return self._build_tag(file_path) - - # Ensure directory exists - self._ensure_directory_exists() - - # Save file - with open(file_path, 'wb') as f: - f.write(image_data) - - logger.debug(f"Image saved: {file_path}") - - # Update internal duplicate tracking + + # Ensure storage is ready + self._ensure_storage_ready() + + # Save using storage backend + if not 
self._storage_backend.save(image_data, file_path): + return None + + self._logger.debug(f"Image saved: {file_path}") + + # Update tracking self._processed_hashes[image_hash] = file_path - - # Update external duplicate tracking if processed_images is not None: processed_images.add(file_path) - + return self._build_tag(file_path) - + except Exception as e: - logger.error(f"Failed to save image: {e}") + self._logger.error(f"Failed to save image: {e}") return None - + + def process_image( + self, + image_data: bytes, + **kwargs + ) -> Optional[str]: + """ + Process and save image data. + + This is the main method for format-specific image processing. + Subclasses should override this method to provide format-specific + processing logic before saving. + + Default implementation simply saves the image. + + Args: + image_data: Raw image binary data + **kwargs: Format-specific options (e.g., xref, page_num, sheet_name) + + Returns: + Image tag string, or None on failure + + Examples: + >>> processor = ImageProcessor() + >>> tag = processor.process_image(png_bytes) + "[Image:temp/images/abc123.png]" + + >>> # Subclass example + >>> class PDFImageProcessor(ImageProcessor): + ... def process_image(self, image_data, **kwargs): + ... xref = kwargs.get('xref') + ... custom_name = f"pdf_xref_{xref}" if xref else None + ... return self.save_image(image_data, custom_name=custom_name) + """ + custom_name = kwargs.get('custom_name') + return self.save_image(image_data, custom_name=custom_name) + + def process_embedded_image( + self, + image_data: bytes, + image_name: Optional[str] = None, + **kwargs + ) -> Optional[str]: + """ + Process embedded image from document. + + Override in subclasses for format-specific embedded image handling. + Default implementation just saves the image. 
+ + Args: + image_data: Image binary data + image_name: Original image name in document + **kwargs: Additional options + + Returns: + Image tag string, or None on failure + """ + return self.save_image(image_data, custom_name=image_name) + + def process_chart_image( + self, + chart_data: bytes, + chart_name: Optional[str] = None, + **kwargs + ) -> Optional[str]: + """ + Process chart as image. + + Override in subclasses for format-specific chart image handling. + Default implementation just saves the image. + + Args: + chart_data: Chart image binary data + chart_name: Chart name + **kwargs: Additional options + + Returns: + Image tag string, or None on failure + """ + return self.save_image(chart_data, custom_name=chart_name) + def save_image_from_pil( self, pil_image, @@ -315,128 +461,104 @@ def save_image_from_pil( quality: int = 95, ) -> Optional[str]: """ - Save PIL Image object to file and return tag. - + Save PIL Image object and return tag. + Args: pil_image: PIL Image object - image_format: Image format to save (None keeps original or uses default) + image_format: Image format to save custom_name: Custom filename processed_images: Set of processed image paths quality: JPEG quality (1-100) - + Returns: Image tag string, or None on failure """ try: from PIL import Image - + if not isinstance(pil_image, Image.Image): - logger.error("Invalid PIL Image object") + self._logger.error("Invalid PIL Image object") return None - - # Determine format + fmt = image_format or ImageFormat.PNG if fmt == ImageFormat.UNKNOWN: fmt = self.config.default_format - - # Convert to bytes + buffer = io.BytesIO() save_format = fmt.value.upper() if save_format == "JPG": save_format = "JPEG" - + save_kwargs = {} if save_format == "JPEG": save_kwargs["quality"] = quality elif save_format == "PNG": save_kwargs["compress_level"] = 6 - + pil_image.save(buffer, format=save_format, **save_kwargs) image_data = buffer.getvalue() - + return self.save_image(image_data, custom_name, 
processed_images) - + except Exception as e: - logger.error(f"Failed to save PIL image: {e}") + self._logger.error(f"Failed to save PIL image: {e}") return None - + def get_processed_count(self) -> int: - """Return number of processed images""" + """Return number of processed images.""" return len(self._processed_hashes) - + def get_processed_paths(self) -> List[str]: - """Return all processed image paths""" + """Return all processed image paths.""" return list(self._processed_hashes.values()) - + def clear_cache(self) -> None: - """Clear internal duplicate tracking cache""" + """Clear internal duplicate tracking cache.""" self._processed_hashes.clear() self._sequential_counter = 0 - + def cleanup(self, delete_files: bool = False) -> int: """ Clean up resources. - + Args: - delete_files: If True, also delete saved files - + delete_files: If True, delete saved files + Returns: Number of deleted files """ deleted = 0 - if delete_files: for path in self._processed_hashes.values(): - try: - if os.path.exists(path): - os.remove(path) - deleted += 1 - except Exception as e: - logger.warning(f"Failed to delete file {path}: {e}") - + if self._storage_backend.delete(path): + deleted += 1 self.clear_cache() return deleted - + def get_pattern_string(self) -> str: """ Get regex pattern string for matching image tags. - - Returns a regex pattern that matches the image tag format used by this processor. - The pattern captures the image path as group 1. - + Returns: - Regex pattern string for matching image tags - - Examples: - >>> processor = ImageProcessor() # default: [Image:...] 
- >>> processor.get_pattern_string() - '\\[Image:([^\\]]+)\\]' - - >>> processor = ImageProcessor(tag_prefix="") - >>> processor.get_pattern_string() - "" + Regex pattern string """ import re prefix = re.escape(self.config.tag_prefix) suffix = re.escape(self.config.tag_suffix) - - # Determine the capture group pattern based on suffix - # If suffix is empty, capture everything until whitespace or end + if not self.config.tag_suffix: capture = r'(\S+)' else: - # Use negated character class based on first char of suffix first_char = self.config.tag_suffix[0] capture = f'([^{re.escape(first_char)}]+)' - + return f'{prefix}{capture}{suffix}' # ============================================================================ -# Config-based ImageProcessor Access +# Default Configuration # ============================================================================ -# Default configuration values DEFAULT_IMAGE_CONFIG = { "directory_path": "temp/images", "tag_prefix": "[Image:", @@ -445,39 +567,39 @@ def get_pattern_string(self) -> str: } +# ============================================================================ +# Factory Function +# ============================================================================ + def create_image_processor( directory_path: Optional[str] = None, tag_prefix: Optional[str] = None, tag_suffix: Optional[str] = None, naming_strategy: Optional[Union[NamingStrategy, str]] = None, + storage_backend: Optional[BaseStorageBackend] = None, ) -> ImageProcessor: """ Create a new ImageProcessor instance. 
- + Args: - directory_path: Image save directory (default: "temp/images") - tag_prefix: Tag prefix (default: "[Image:") - tag_suffix: Tag suffix (default: "]") - naming_strategy: File naming strategy (default: HASH) - + directory_path: Image save directory + tag_prefix: Tag prefix + tag_suffix: Tag suffix + naming_strategy: File naming strategy + storage_backend: Storage backend instance + Returns: - New ImageProcessor instance - - Examples: - >>> processor = create_image_processor( - ... directory_path="output/images", - ... tag_prefix="" - ... ) + ImageProcessor instance """ if naming_strategy is not None and isinstance(naming_strategy, str): naming_strategy = NamingStrategy(naming_strategy.lower()) - + return ImageProcessor( directory_path=directory_path or DEFAULT_IMAGE_CONFIG["directory_path"], tag_prefix=tag_prefix or DEFAULT_IMAGE_CONFIG["tag_prefix"], tag_suffix=tag_suffix or DEFAULT_IMAGE_CONFIG["tag_suffix"], naming_strategy=naming_strategy or DEFAULT_IMAGE_CONFIG["naming_strategy"], + storage_backend=storage_backend, ) @@ -490,44 +612,33 @@ def save_image_to_file( ) -> Optional[str]: """ Save image to file and return tag. - - A simple function that replaces the existing image upload functions. - + + Convenience function for quick image saving using local storage. + Args: image_data: Image binary data directory_path: Save directory tag_prefix: Tag prefix tag_suffix: Tag suffix processed_images: Set for duplicate tracking - + Returns: Image tag string, or None on failure - - Examples: - >>> tag = save_image_to_file(image_bytes) - "[Image:temp/abc123.png]" - - >>> tag = save_image_to_file( - ... image_bytes, - ... directory_path="output", - ... tag_prefix="" - ... 
) - "" """ processor = ImageProcessor( directory_path=directory_path, tag_prefix=tag_prefix, tag_suffix=tag_suffix, ) - return processor.save_image(image_data, processed_images=processed_images) __all__ = [ - # Classes + # Main class "ImageProcessor", + # Config "ImageProcessorConfig", + # Enums "ImageFormat", "NamingStrategy", # Factory function diff --git a/contextifier/core/functions/metadata_extractor.py b/contextifier/core/functions/metadata_extractor.py new file mode 100644 index 0000000..d47a84b --- /dev/null +++ b/contextifier/core/functions/metadata_extractor.py @@ -0,0 +1,542 @@ +# contextifier/core/functions/metadata_extractor.py +""" +Metadata Extractor Interface + +Provides abstract base class and common utilities for document metadata extraction. +Each handler's helper module should implement a concrete extractor inheriting from +BaseMetadataExtractor. + +This module defines: +- DocumentMetadata: Standardized metadata container dataclass +- MetadataField: Enum for standard metadata field names +- BaseMetadataExtractor: Abstract base class for metadata extractors +- MetadataFormatter: Shared formatter for consistent metadata output + +Usage Example: + from contextifier.core.functions.metadata_extractor import ( + BaseMetadataExtractor, + DocumentMetadata, + MetadataFormatter, + ) + + class PDFMetadataExtractor(BaseMetadataExtractor): + def extract(self, source: Any) -> DocumentMetadata: + # PDF-specific extraction logic + ... +""" +import logging +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from datetime import datetime +from enum import Enum +from typing import Any, Dict, Optional + +logger = logging.getLogger("contextify.metadata") + + +class MetadataField(str, Enum): + """ + Standard metadata field names. + + These field names are used consistently across all document formats + to ensure uniform metadata handling. 
+ """ + TITLE = "title" + SUBJECT = "subject" + AUTHOR = "author" + KEYWORDS = "keywords" + COMMENTS = "comments" + LAST_SAVED_BY = "last_saved_by" + CREATE_TIME = "create_time" + LAST_SAVED_TIME = "last_saved_time" + + # Additional fields for specific formats + VERSION = "version" + CATEGORY = "category" + COMPANY = "company" + MANAGER = "manager" + + # File-level metadata (for CSV, etc.) + FILE_NAME = "file_name" + FILE_SIZE = "file_size" + ENCODING = "encoding" + ROW_COUNT = "row_count" + COL_COUNT = "col_count" + + +@dataclass +class DocumentMetadata: + """ + Standardized metadata container for all document types. + + This dataclass provides a unified structure for storing document metadata + across all supported file formats. It includes common fields and allows + for format-specific custom fields. + + Attributes: + title: Document title + subject: Document subject + author: Document author/creator + keywords: Document keywords + comments: Document comments/description + last_saved_by: Last person who saved the document + create_time: Document creation timestamp + last_saved_time: Last modification timestamp + custom: Dictionary for format-specific additional fields + + Example: + >>> metadata = DocumentMetadata( + ... title="Annual Report", + ... author="John Doe", + ... create_time=datetime.now() + ... ) + >>> metadata.to_dict() + {'title': 'Annual Report', 'author': 'John Doe', ...} + """ + title: Optional[str] = None + subject: Optional[str] = None + author: Optional[str] = None + keywords: Optional[str] = None + comments: Optional[str] = None + last_saved_by: Optional[str] = None + create_time: Optional[datetime] = None + last_saved_time: Optional[datetime] = None + custom: Dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> Dict[str, Any]: + """ + Convert metadata to dictionary. + + Returns: + Dictionary containing all non-None metadata fields. 
+ """ + result = {} + + if self.title: + result[MetadataField.TITLE.value] = self.title + if self.subject: + result[MetadataField.SUBJECT.value] = self.subject + if self.author: + result[MetadataField.AUTHOR.value] = self.author + if self.keywords: + result[MetadataField.KEYWORDS.value] = self.keywords + if self.comments: + result[MetadataField.COMMENTS.value] = self.comments + if self.last_saved_by: + result[MetadataField.LAST_SAVED_BY.value] = self.last_saved_by + if self.create_time: + result[MetadataField.CREATE_TIME.value] = self.create_time + if self.last_saved_time: + result[MetadataField.LAST_SAVED_TIME.value] = self.last_saved_time + + # Add custom fields + result.update(self.custom) + + return result + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "DocumentMetadata": + """ + Create DocumentMetadata from dictionary. + + Standard fields are extracted into their respective attributes, + while non-standard fields go into the custom dictionary. + + Args: + data: Dictionary containing metadata fields. + + Returns: + DocumentMetadata instance. + """ + standard_fields = { + MetadataField.TITLE.value, + MetadataField.SUBJECT.value, + MetadataField.AUTHOR.value, + MetadataField.KEYWORDS.value, + MetadataField.COMMENTS.value, + MetadataField.LAST_SAVED_BY.value, + MetadataField.CREATE_TIME.value, + MetadataField.LAST_SAVED_TIME.value, + } + + custom = {k: v for k, v in data.items() if k not in standard_fields} + + return cls( + title=data.get(MetadataField.TITLE.value), + subject=data.get(MetadataField.SUBJECT.value), + author=data.get(MetadataField.AUTHOR.value), + keywords=data.get(MetadataField.KEYWORDS.value), + comments=data.get(MetadataField.COMMENTS.value), + last_saved_by=data.get(MetadataField.LAST_SAVED_BY.value), + create_time=data.get(MetadataField.CREATE_TIME.value), + last_saved_time=data.get(MetadataField.LAST_SAVED_TIME.value), + custom=custom, + ) + + def is_empty(self) -> bool: + """ + Check if metadata is empty (no fields set). 
+ + Returns: + True if no metadata fields are set. + """ + return not self.to_dict() + + def __bool__(self) -> bool: + """Return True if metadata has any fields set.""" + return not self.is_empty() + + +class MetadataFormatter: + """ + Shared formatter for consistent metadata output. + + This class provides a unified way to format DocumentMetadata objects + as strings for inclusion in extracted text output. + + Attributes: + metadata_tag_prefix: Opening tag for metadata section (default: "") + metadata_tag_suffix: Closing tag for metadata section (default: "") + field_labels: Dictionary mapping field names to display labels + date_format: Date/time format string + language: Output language ('ko' for Korean, 'en' for English) + + Example: + >>> formatter = MetadataFormatter(language='en') + >>> text = formatter.format(metadata) + >>> print(text) + + Title: Annual Report + Author: John Doe + + """ + + # Field labels in Korean + LABELS_KO = { + MetadataField.TITLE.value: "제목", + MetadataField.SUBJECT.value: "주제", + MetadataField.AUTHOR.value: "작성자", + MetadataField.KEYWORDS.value: "키워드", + MetadataField.COMMENTS.value: "설명", + MetadataField.LAST_SAVED_BY.value: "마지막 저장자", + MetadataField.CREATE_TIME.value: "작성일", + MetadataField.LAST_SAVED_TIME.value: "수정일", + # Additional fields + MetadataField.VERSION.value: "버전", + MetadataField.CATEGORY.value: "범주", + MetadataField.COMPANY.value: "회사", + MetadataField.MANAGER.value: "관리자", + MetadataField.FILE_NAME.value: "파일명", + MetadataField.FILE_SIZE.value: "파일 크기", + MetadataField.ENCODING.value: "인코딩", + MetadataField.ROW_COUNT.value: "행 수", + MetadataField.COL_COUNT.value: "열 수", + } + + # Field labels in English + LABELS_EN = { + MetadataField.TITLE.value: "Title", + MetadataField.SUBJECT.value: "Subject", + MetadataField.AUTHOR.value: "Author", + MetadataField.KEYWORDS.value: "Keywords", + MetadataField.COMMENTS.value: "Comments", + MetadataField.LAST_SAVED_BY.value: "Last Saved By", + MetadataField.CREATE_TIME.value: 
"Created", + MetadataField.LAST_SAVED_TIME.value: "Last Modified", + # Additional fields + MetadataField.VERSION.value: "Version", + MetadataField.CATEGORY.value: "Category", + MetadataField.COMPANY.value: "Company", + MetadataField.MANAGER.value: "Manager", + MetadataField.FILE_NAME.value: "File Name", + MetadataField.FILE_SIZE.value: "File Size", + MetadataField.ENCODING.value: "Encoding", + MetadataField.ROW_COUNT.value: "Row Count", + MetadataField.COL_COUNT.value: "Column Count", + } + + # Standard field order for output + FIELD_ORDER = [ + MetadataField.TITLE.value, + MetadataField.SUBJECT.value, + MetadataField.AUTHOR.value, + MetadataField.KEYWORDS.value, + MetadataField.COMMENTS.value, + MetadataField.LAST_SAVED_BY.value, + MetadataField.CREATE_TIME.value, + MetadataField.LAST_SAVED_TIME.value, + ] + + def __init__( + self, + metadata_tag_prefix: str = "", + metadata_tag_suffix: str = "", + date_format: str = "%Y-%m-%d %H:%M:%S", + language: str = "ko", + indent: str = " ", + ): + """ + Initialize MetadataFormatter. + + Args: + metadata_tag_prefix: Opening tag for metadata section + metadata_tag_suffix: Closing tag for metadata section + date_format: strftime format for datetime values + language: Output language ('ko' or 'en') + indent: Indentation string for each field + """ + self.metadata_tag_prefix = metadata_tag_prefix + self.metadata_tag_suffix = metadata_tag_suffix + self.date_format = date_format + self.language = language + self.indent = indent + + # Select labels based on language + self.field_labels = self.LABELS_KO if language == "ko" else self.LABELS_EN + + def format(self, metadata: DocumentMetadata) -> str: + """ + Format DocumentMetadata as a string. + + Args: + metadata: DocumentMetadata instance to format. + + Returns: + Formatted metadata string, or empty string if metadata is empty. 
+ """ + if not metadata: + return "" + + data = metadata.to_dict() + if not data: + return "" + + lines = [self.metadata_tag_prefix] + + # Output standard fields in order + for field_name in self.FIELD_ORDER: + if field_name in data: + value = data.pop(field_name) + formatted_line = self._format_field(field_name, value) + if formatted_line: + lines.append(formatted_line) + + # Output remaining custom fields + for field_name, value in data.items(): + formatted_line = self._format_field(field_name, value) + if formatted_line: + lines.append(formatted_line) + + lines.append(self.metadata_tag_suffix) + + return "\n".join(lines) + + def format_dict(self, metadata_dict: Dict[str, Any]) -> str: + """ + Format metadata dictionary as a string. + + Convenience method for formatting raw dictionaries without + first converting to DocumentMetadata. + + Args: + metadata_dict: Dictionary containing metadata fields. + + Returns: + Formatted metadata string. + """ + if not metadata_dict: + return "" + + return self.format(DocumentMetadata.from_dict(metadata_dict)) + + def _format_field(self, field_name: str, value: Any) -> Optional[str]: + """ + Format a single metadata field. + + Args: + field_name: Field name + value: Field value + + Returns: + Formatted field string, or None if value is empty. + """ + if value is None: + return None + + # Format datetime values + if isinstance(value, datetime): + value = value.strftime(self.date_format) + + # Get label (use field name as fallback) + label = self.field_labels.get(field_name, field_name.replace("_", " ").title()) + + return f"{self.indent}{label}: {value}" + + def get_label(self, field_name: str) -> str: + """ + Get display label for a field name. + + Args: + field_name: Field name + + Returns: + Display label for the field. + """ + return self.field_labels.get(field_name, field_name.replace("_", " ").title()) + + +class BaseMetadataExtractor(ABC): + """ + Abstract base class for metadata extractors. 
+ + Each document format should implement a concrete extractor + that inherits from this class and provides format-specific + extraction logic. + + Subclasses must implement: + - extract(): Extract metadata from format-specific source object + + Subclasses may optionally override: + - format(): Customize metadata formatting + - get_formatter(): Provide custom formatter instance + + Attributes: + formatter: MetadataFormatter instance for output formatting + logger: Logger instance for this extractor + + Example: + class PDFMetadataExtractor(BaseMetadataExtractor): + def extract(self, doc) -> DocumentMetadata: + # Extract from PyMuPDF document object + pdf_meta = doc.metadata + return DocumentMetadata( + title=pdf_meta.get('title'), + author=pdf_meta.get('author'), + ... + ) + """ + + def __init__( + self, + formatter: Optional[MetadataFormatter] = None, + language: str = "ko", + ): + """ + Initialize BaseMetadataExtractor. + + Args: + formatter: Custom MetadataFormatter instance (optional) + language: Default language for formatter if not provided + """ + self._formatter = formatter or MetadataFormatter(language=language) + self._logger = logging.getLogger( + f"contextify.metadata.{self.__class__.__name__}" + ) + + @property + def formatter(self) -> MetadataFormatter: + """Get the metadata formatter instance.""" + return self._formatter + + @property + def logger(self) -> logging.Logger: + """Get the logger instance.""" + return self._logger + + @abstractmethod + def extract(self, source: Any) -> DocumentMetadata: + """ + Extract metadata from source object. + + This method must be implemented by subclasses to provide + format-specific metadata extraction logic. + + Args: + source: Format-specific source object (e.g., PyMuPDF doc, + python-docx Document, openpyxl Workbook, etc.) + + Returns: + DocumentMetadata instance containing extracted metadata. + """ + pass + + def format(self, metadata: DocumentMetadata) -> str: + """ + Format metadata as a string. 
+ + Uses the formatter to convert DocumentMetadata to a string. + Can be overridden by subclasses for custom formatting. + + Args: + metadata: DocumentMetadata instance to format. + + Returns: + Formatted metadata string. + """ + return self._formatter.format(metadata) + + def extract_and_format(self, source: Any) -> str: + """ + Extract metadata and format as string in one step. + + Convenience method that combines extract() and format(). + + Args: + source: Format-specific source object. + + Returns: + Formatted metadata string. + """ + try: + metadata = self.extract(source) + return self.format(metadata) + except Exception as e: + self._logger.warning(f"Failed to extract metadata: {e}") + return "" + + def extract_to_dict(self, source: Any) -> Dict[str, Any]: + """ + Extract metadata and return as dictionary. + + Convenience method that extracts metadata and converts to dict. + + Args: + source: Format-specific source object. + + Returns: + Dictionary containing metadata fields. + """ + try: + metadata = self.extract(source) + return metadata.to_dict() + except Exception as e: + self._logger.warning(f"Failed to extract metadata: {e}") + return {} + + +# Default formatter instance (Korean) +_default_formatter = MetadataFormatter(language="ko") + + +def format_metadata(metadata: Dict[str, Any]) -> str: + """ + Format metadata dictionary as a string. + + Convenience function using default formatter for backward compatibility. + + Args: + metadata: Dictionary containing metadata fields. + + Returns: + Formatted metadata string. 
+ """ + return _default_formatter.format_dict(metadata) + + +__all__ = [ + "MetadataField", + "DocumentMetadata", + "MetadataFormatter", + "BaseMetadataExtractor", + "format_metadata", +] diff --git a/contextifier/core/functions/preprocessor.py b/contextifier/core/functions/preprocessor.py new file mode 100644 index 0000000..cbf471d --- /dev/null +++ b/contextifier/core/functions/preprocessor.py @@ -0,0 +1,161 @@ +# libs/core/functions/preprocessor.py +""" +BasePreprocessor - Abstract base class for data preprocessing + +Defines the interface for preprocessing data after file conversion. +Used when converted data needs special handling before content extraction. + +The preprocessor's job is to: +1. Clean/normalize converted data +2. Extract embedded resources (images, etc.) +3. Detect encoding information +4. Return preprocessed data ready for further processing + +Processing Pipeline Position: + 1. FileConverter.convert() → Format-specific object + 2. Preprocessor.preprocess() → Cleaned/processed data (THIS STEP) + 3. MetadataExtractor.extract() → Metadata + 4. Content extraction + +Usage: + class PDFPreprocessor(BasePreprocessor): + def preprocess(self, converted_data: Any, **kwargs) -> PreprocessedData: + # Process the fitz.Document, normalize pages, etc. + return PreprocessedData( + clean_content=b"", + encoding="utf-8", + extracted_resources={"document": converted_data} + ) + + def get_format_name(self) -> str: + return "PDF Preprocessor" +""" +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from typing import Any, Dict + + +@dataclass +class PreprocessedData: + """ + Result of preprocessing operation. + + Contains cleaned content and any extracted resources. + + Attributes: + raw_content: Original input data (for reference) + clean_content: Processed content ready for use - THIS IS THE TRUE SOURCE + Can be any type: bytes, str, Document, Workbook, OleFileIO, etc. 
+ encoding: Detected or default encoding (for text-based content) + extracted_resources: Dict of extracted resources (images, etc.) + metadata: Any metadata discovered during preprocessing + """ + raw_content: Any = None + clean_content: Any = None # TRUE SOURCE - The processed result + encoding: str = "utf-8" + extracted_resources: Dict[str, Any] = field(default_factory=dict) + metadata: Dict[str, Any] = field(default_factory=dict) + + +class BasePreprocessor(ABC): + """ + Abstract base class for data preprocessors. + + Preprocesses converted data after FileConverter.convert(). + Used when converted data needs normalization or special handling + before content extraction. + + Processing Pipeline: + 1. FileConverter.convert() → Format-specific object + 2. Preprocessor.preprocess() → Cleaned/processed data (THIS STEP) + 3. MetadataExtractor.extract() → Metadata + 4. Content extraction + + Subclasses must implement: + - preprocess(): Process converted data and return PreprocessedData + - get_format_name(): Return human-readable format name + """ + + @abstractmethod + def preprocess( + self, + converted_data: Any, + **kwargs + ) -> PreprocessedData: + """ + Preprocess converted data. + + Args: + converted_data: Data from FileConverter.convert() + (format-specific object, bytes, or other type) + **kwargs: Additional format-specific options + + Returns: + PreprocessedData containing cleaned content and extracted resources + + Raises: + PreprocessingError: If preprocessing fails + """ + pass + + @abstractmethod + def get_format_name(self) -> str: + """ + Return human-readable format name. + + Returns: + Format name string (e.g., "PDF Preprocessor") + """ + pass + + def validate(self, data: Any) -> bool: + """ + Validate if the data can be preprocessed by this preprocessor. + + Override this method to add format-specific validation. + Default implementation returns True. 
+ + Args: + data: Data to validate (converted data or raw bytes) + + Returns: + True if data can be preprocessed, False otherwise + """ + _ = data # Suppress unused argument warning + return True + + +class NullPreprocessor(BasePreprocessor): + """ + Null preprocessor that passes data through unchanged. + + Used as default when no preprocessing is needed. + clean_content always contains the processed result (same as input for pass-through). + """ + + def preprocess( + self, + converted_data: Any, + **kwargs + ) -> PreprocessedData: + """Pass data through unchanged. clean_content = converted_data.""" + encoding = kwargs.get("encoding", "utf-8") + + # clean_content is ALWAYS the True Source - contains the processed result + # For pass-through, it's the same as the input + return PreprocessedData( + raw_content=converted_data, + clean_content=converted_data, # TRUE SOURCE + encoding=encoding, + ) + + def get_format_name(self) -> str: + """Return format name.""" + return "Null Preprocessor (pass-through)" + + +__all__ = [ + 'BasePreprocessor', + 'NullPreprocessor', + 'PreprocessedData', +] diff --git a/contextifier/core/functions/storage_backend.py b/contextifier/core/functions/storage_backend.py new file mode 100644 index 0000000..2118594 --- /dev/null +++ b/contextifier/core/functions/storage_backend.py @@ -0,0 +1,381 @@ +# contextifier/core/functions/storage_backend.py +""" +Storage Backend Module + +Provides abstract base class and implementations for image storage backends. +ImageProcessor uses these backends to save images to different storage systems. 
+ +Storage Backends: +- LocalStorageBackend: Save to local file system +- MinIOStorageBackend: Save to MinIO object storage (stub) +- S3StorageBackend: Save to AWS S3 (stub) + +Usage Example: + from contextifier.core.functions.storage_backend import ( + LocalStorageBackend, + MinIOStorageBackend, + ) + from contextifier.core.functions.img_processor import ImageProcessor + + # Use local storage (default) + processor = ImageProcessor() + + # Use MinIO storage + minio_backend = MinIOStorageBackend( + endpoint="localhost:9000", + bucket="images" + ) + processor = ImageProcessor(storage_backend=minio_backend) +""" +import logging +import os +from abc import ABC, abstractmethod +from enum import Enum +from pathlib import Path +from typing import Any, Dict, Optional + +logger = logging.getLogger("contextify.storage") + + +class StorageType(Enum): + """Storage backend types.""" + LOCAL = "local" + MINIO = "minio" + S3 = "s3" + AZURE_BLOB = "azure_blob" + GCS = "gcs" # Google Cloud Storage + + +class BaseStorageBackend(ABC): + """ + Abstract base class for storage backends. + + Each storage type implements this interface to provide + storage-specific save/delete logic. + + Subclasses must implement: + - save(): Save data to storage + - delete(): Delete file from storage + - exists(): Check if file exists + - ensure_ready(): Prepare storage (create dirs, validate connection) + """ + + def __init__(self, storage_type: StorageType): + self._storage_type = storage_type + self._logger = logging.getLogger( + f"contextify.storage.{self.__class__.__name__}" + ) + + @property + def storage_type(self) -> StorageType: + """Get storage type.""" + return self._storage_type + + @property + def logger(self) -> logging.Logger: + """Get logger.""" + return self._logger + + @abstractmethod + def save(self, data: bytes, file_path: str) -> bool: + """ + Save data to storage. 
+ + Args: + data: Binary data to save + file_path: Target file path or key + + Returns: + True if successful, False otherwise + """ + pass + + @abstractmethod + def delete(self, file_path: str) -> bool: + """ + Delete file from storage. + + Args: + file_path: File path or key to delete + + Returns: + True if successful, False otherwise + """ + pass + + @abstractmethod + def exists(self, file_path: str) -> bool: + """ + Check if file exists in storage. + + Args: + file_path: File path or key to check + + Returns: + True if file exists + """ + pass + + @abstractmethod + def ensure_ready(self, directory_path: str) -> None: + """ + Ensure storage is ready (create directory, validate connection, etc.). + + Args: + directory_path: Base directory or bucket path + """ + pass + + def build_url(self, file_path: str) -> str: + """ + Build URL or path for the saved file. + + Override in subclasses for storage-specific URL formats. + + Args: + file_path: File path or key + + Returns: + URL or path string + """ + return file_path.replace("\\", "/") + + +class LocalStorageBackend(BaseStorageBackend): + """ + Local file system storage backend. + + Saves files to the local file system. 
+ """ + + def __init__(self): + super().__init__(StorageType.LOCAL) + + def save(self, data: bytes, file_path: str) -> bool: + """Save data to local file.""" + try: + with open(file_path, 'wb') as f: + f.write(data) + return True + except Exception as e: + self._logger.error(f"Failed to save file {file_path}: {e}") + return False + + def delete(self, file_path: str) -> bool: + """Delete local file.""" + try: + if os.path.exists(file_path): + os.remove(file_path) + return True + return False + except Exception as e: + self._logger.warning(f"Failed to delete file {file_path}: {e}") + return False + + def exists(self, file_path: str) -> bool: + """Check if local file exists.""" + return os.path.exists(file_path) + + def ensure_ready(self, directory_path: str) -> None: + """Create directory if it doesn't exist.""" + path = Path(directory_path) + if not path.exists(): + path.mkdir(parents=True, exist_ok=True) + self._logger.debug(f"Created directory: {path}") + + +class MinIOStorageBackend(BaseStorageBackend): + """ + MinIO object storage backend (STUB - Not Implemented). + + This is a placeholder for MinIO integration. + Requires minio package to be installed. + + Args: + endpoint: MinIO server endpoint + access_key: MinIO access key + secret_key: MinIO secret key + bucket: Target bucket name + secure: Use HTTPS (default: True) + """ + + def __init__( + self, + endpoint: str = "localhost:9000", + access_key: str = "", + secret_key: str = "", + bucket: str = "images", + secure: bool = True, + ): + super().__init__(StorageType.MINIO) + self._endpoint = endpoint + self._access_key = access_key + self._secret_key = secret_key + self._bucket = bucket + self._secure = secure + self._client = None + + self._logger.warning( + "MinIOStorageBackend is a stub implementation. " + "Full implementation is pending." 
+ ) + + @property + def bucket(self) -> str: + """Get bucket name.""" + return self._bucket + + @property + def endpoint(self) -> str: + """Get endpoint.""" + return self._endpoint + + def save(self, data: bytes, file_path: str) -> bool: + """Upload data to MinIO bucket.""" + raise NotImplementedError( + "MinIOStorageBackend.save() is not yet implemented. " + "Use LocalStorageBackend for now." + ) + + def delete(self, file_path: str) -> bool: + """Delete object from MinIO bucket.""" + raise NotImplementedError( + "MinIOStorageBackend.delete() is not yet implemented." + ) + + def exists(self, file_path: str) -> bool: + """Check if object exists in MinIO bucket.""" + raise NotImplementedError( + "MinIOStorageBackend.exists() is not yet implemented." + ) + + def ensure_ready(self, directory_path: str) -> None: + """Initialize MinIO client and ensure bucket exists.""" + raise NotImplementedError( + "MinIOStorageBackend.ensure_ready() is not yet implemented." + ) + + def build_url(self, file_path: str) -> str: + """Build MinIO URL for the file.""" + # Would return presigned URL or object path + protocol = "https" if self._secure else "http" + return f"{protocol}://{self._endpoint}/{self._bucket}/{file_path}" + + +class S3StorageBackend(BaseStorageBackend): + """ + AWS S3 storage backend (STUB - Not Implemented). + + This is a placeholder for AWS S3 integration. + Requires boto3 package to be installed. + + Args: + bucket: S3 bucket name + region: AWS region (default: "us-east-1") + prefix: Key prefix for uploaded objects + """ + + def __init__( + self, + bucket: str = "", + region: str = "us-east-1", + prefix: str = "", + ): + super().__init__(StorageType.S3) + self._bucket = bucket + self._region = region + self._prefix = prefix + self._client = None + + self._logger.warning( + "S3StorageBackend is a stub implementation. " + "Full implementation is pending." 
+ ) + + @property + def bucket(self) -> str: + """Get bucket name.""" + return self._bucket + + @property + def region(self) -> str: + """Get region.""" + return self._region + + def save(self, data: bytes, file_path: str) -> bool: + """Upload data to S3 bucket.""" + raise NotImplementedError( + "S3StorageBackend.save() is not yet implemented. " + "Use LocalStorageBackend for now." + ) + + def delete(self, file_path: str) -> bool: + """Delete object from S3 bucket.""" + raise NotImplementedError( + "S3StorageBackend.delete() is not yet implemented." + ) + + def exists(self, file_path: str) -> bool: + """Check if object exists in S3 bucket.""" + raise NotImplementedError( + "S3StorageBackend.exists() is not yet implemented." + ) + + def ensure_ready(self, directory_path: str) -> None: + """Initialize S3 client and verify bucket access.""" + raise NotImplementedError( + "S3StorageBackend.ensure_ready() is not yet implemented." + ) + + def build_url(self, file_path: str) -> str: + """Build S3 URL for the file.""" + # Would return S3 URI or presigned URL + return f"s3://{self._bucket}/{file_path}" + + +# Default backend instance +_default_backend = LocalStorageBackend() + + +def get_default_backend() -> BaseStorageBackend: + """Get the default storage backend (local).""" + return _default_backend + + +def create_storage_backend( + storage_type: StorageType = StorageType.LOCAL, + **kwargs +) -> BaseStorageBackend: + """ + Factory function to create a storage backend. 
+ + Args: + storage_type: Type of storage backend + **kwargs: Storage-specific options + + Returns: + BaseStorageBackend instance + """ + if storage_type == StorageType.LOCAL: + return LocalStorageBackend() + elif storage_type == StorageType.MINIO: + return MinIOStorageBackend(**kwargs) + elif storage_type == StorageType.S3: + return S3StorageBackend(**kwargs) + else: + raise ValueError(f"Unsupported storage type: {storage_type}") + + +__all__ = [ + # Enum + "StorageType", + # Base class + "BaseStorageBackend", + # Implementations + "LocalStorageBackend", + "MinIOStorageBackend", + "S3StorageBackend", + # Factory + "create_storage_backend", + "get_default_backend", +] diff --git a/contextifier/core/processor/__init__.py b/contextifier/core/processor/__init__.py index 35f83ed..6543ba0 100644 --- a/contextifier/core/processor/__init__.py +++ b/contextifier/core/processor/__init__.py @@ -7,7 +7,8 @@ Handler List: - pdf_handler: PDF document processing (adaptive complexity-based) - docx_handler: DOCX document processing -- doc_handler: DOC document processing (including RTF) +- doc_handler: DOC document processing (OLE, HTML, misnamed DOCX) +- rtf_handler: RTF document processing - ppt_handler: PPT/PPTX document processing - excel_handler: Excel (XLSX/XLS) document processing - hwp_processor: HWP document processing @@ -19,7 +20,8 @@ Helper Modules (subdirectories): - csv_helper/: CSV processing helper - docx_helper/: DOCX processing helper -- doc_helpers/: DOC/RTF processing helper +- doc_helpers/: DOC processing helper +- rtf_helper/: RTF processing helper - excel_helper/: Excel processing helper - hwp_helper/: HWP processing helper - hwpx_helper/: HWPX processing helper @@ -29,6 +31,7 @@ Usage Example: from contextifier.core.processor import PDFHandler from contextifier.core.processor import DOCXHandler + from contextifier.core.processor import RTFHandler from contextifier.core.processor.pdf_helpers import extract_pdf_metadata """ @@ -38,6 +41,7 @@ # === Document 
Handlers === from contextifier.core.processor.docx_handler import DOCXHandler from contextifier.core.processor.doc_handler import DOCHandler +from contextifier.core.processor.rtf_handler import RTFHandler from contextifier.core.processor.ppt_handler import PPTHandler # === Data Handlers === @@ -47,19 +51,21 @@ # === HWP Handlers === from contextifier.core.processor.hwp_handler import HWPHandler -from contextifier.core.processor.hwps_handler import HWPXHandler +from contextifier.core.processor.hwpx_handler import HWPXHandler # === Other Processors === # from contextifier.core.processor.html_reprocessor import ... # HTML reprocessing # === Helper Modules (subpackages) === from contextifier.core.processor import csv_helper +from contextifier.core.processor import doc_helpers from contextifier.core.processor import docx_helper from contextifier.core.processor import excel_helper from contextifier.core.processor import hwp_helper from contextifier.core.processor import hwpx_helper from contextifier.core.processor import pdf_helpers from contextifier.core.processor import ppt_helper +from contextifier.core.processor import rtf_helper __all__ = [ # PDF Handler @@ -67,6 +73,7 @@ # Document Handlers "DOCXHandler", "DOCHandler", + "RTFHandler", "PPTHandler", # Data Handlers "ExcelHandler", @@ -77,10 +84,12 @@ "HWPXHandler", # Helper subpackages "csv_helper", + "doc_helpers", "docx_helper", "excel_helper", "hwp_helper", "hwpx_helper", "pdf_helpers", "ppt_helper", + "rtf_helper", ] diff --git a/contextifier/core/processor/base_handler.py b/contextifier/core/processor/base_handler.py index 6b8d42f..14f5b7c 100644 --- a/contextifier/core/processor/base_handler.py +++ b/contextifier/core/processor/base_handler.py @@ -3,28 +3,69 @@ BaseHandler - Abstract base class for document processing handlers Defines the base interface for all document handlers. 
-Manages config, ImageProcessor, PageTagProcessor, and ChartProcessor passed from -DocumentProcessor at instance level for reuse by internal methods. +Manages config, ImageProcessor, PageTagProcessor, ChartProcessor, MetadataExtractor, +Preprocessor, and format-specific ImageProcessor passed from DocumentProcessor at +instance level for reuse by internal methods. -Each handler should override _create_chart_extractor() to provide a format-specific -chart extractor implementation. +Each handler should override: +- _create_file_converter(): Provide format-specific file converter +- _create_preprocessor(): Provide format-specific preprocessor +- _create_chart_extractor(): Provide format-specific chart extractor +- _create_metadata_extractor(): Provide format-specific metadata extractor +- _create_format_image_processor(): Provide format-specific image processor + +Processing Pipeline: + 1. file_converter.convert() - Binary → Format-specific object (e.g., bytes → fitz.Document) + 2. preprocessor.preprocess() - Process/clean the converted data + 3. metadata_extractor.extract() - Extract document metadata + 4. 
Format-specific content extraction (text, images, charts, tables) Usage Example: class PDFHandler(BaseHandler): + def _create_file_converter(self): + return PDFFileConverter() + + def _create_preprocessor(self): + return PDFPreprocessor() # Or NullPreprocessor() if no preprocessing needed + + def _create_metadata_extractor(self): + return PDFMetadataExtractor() + + def _create_format_image_processor(self): + return PDFImageProcessor(image_processor=self._image_processor) + def extract_text(self, current_file: CurrentFile, extract_metadata: bool = True) -> str: - # Access self.config, self.image_processor, self.page_tag_processor - # Use self.chart_extractor.process(chart_element) for chart extraction + # Step 1: Convert binary to format-specific object + doc = self.convert_file(current_file) + # Step 2: Preprocess the converted object + preprocessed = self.preprocess(doc) + # Step 3: Extract metadata + metadata = self.extract_metadata(doc) + # Step 4: Process content ... """ import io import logging from abc import ABC, abstractmethod -from typing import Any, Dict, List, Optional, TYPE_CHECKING +from typing import Any, Dict, Optional, TYPE_CHECKING from contextifier.core.functions.img_processor import ImageProcessor from contextifier.core.functions.page_tag_processor import PageTagProcessor from contextifier.core.functions.chart_processor import ChartProcessor from contextifier.core.functions.chart_extractor import BaseChartExtractor, NullChartExtractor +from contextifier.core.functions.metadata_extractor import ( + BaseMetadataExtractor, + DocumentMetadata, +) +from contextifier.core.functions.file_converter import ( + BaseFileConverter, + NullFileConverter, +) +from contextifier.core.functions.preprocessor import ( + BasePreprocessor, + NullPreprocessor, + PreprocessedData, +) if TYPE_CHECKING: from contextifier.core.document_processor import CurrentFile @@ -32,27 +73,56 @@ def extract_text(self, current_file: CurrentFile, extract_metadata: bool = True) logger = 
logging.getLogger("document-processor") +class NullMetadataExtractor(BaseMetadataExtractor): + """ + Null implementation of metadata extractor. + + Used as default when no format-specific extractor is provided. + Always returns empty metadata. + """ + + def extract(self, source: Any) -> DocumentMetadata: + """Return empty metadata.""" + return DocumentMetadata() + + class BaseHandler(ABC): """ Abstract base class for document handlers. - + All handlers inherit from this class. - config, image_processor, page_tag_processor, and chart_processor are passed - at creation and stored as instance variables. - - Each handler should override _create_chart_extractor() to provide a - format-specific chart extractor. The chart_extractor is lazy-initialized - on first access. - + config, image_processor, page_tag_processor, chart_processor, metadata_extractor, + preprocessor, and format_image_processor are passed at creation and stored as + instance variables. + + Each handler should override: + - _create_file_converter(): Provide format-specific file converter + - _create_preprocessor(): Provide format-specific preprocessor + - _create_chart_extractor(): Provide format-specific chart extractor + - _create_metadata_extractor(): Provide format-specific metadata extractor + - _create_format_image_processor(): Provide format-specific image processor + + All are lazy-initialized on first access. + + Processing Pipeline: + 1. file_converter.convert() - Binary → Format-specific object + 2. preprocessor.preprocess() - Process/clean the converted data + 3. metadata_extractor.extract() - Extract document metadata + 4. 
Format-specific content extraction + Attributes: config: Configuration dictionary passed from DocumentProcessor - image_processor: ImageProcessor instance passed from DocumentProcessor + image_processor: Core ImageProcessor instance passed from DocumentProcessor + format_image_processor: Format-specific image processor (lazy-initialized) page_tag_processor: PageTagProcessor instance passed from DocumentProcessor chart_processor: ChartProcessor instance passed from DocumentProcessor chart_extractor: Format-specific chart extractor instance + preprocessor: Format-specific preprocessor instance + metadata_extractor: Format-specific metadata extractor instance + file_converter: Format-specific file converter instance logger: Logging instance """ - + def __init__( self, config: Optional[Dict[str, Any]] = None, @@ -62,7 +132,7 @@ def __init__( ): """ Initialize BaseHandler. - + Args: config: Configuration dictionary (passed from DocumentProcessor) image_processor: ImageProcessor instance (passed from DocumentProcessor) @@ -74,68 +144,194 @@ def __init__( self._page_tag_processor = page_tag_processor or self._get_page_tag_processor_from_config() self._chart_processor = chart_processor or self._get_chart_processor_from_config() self._chart_extractor: Optional[BaseChartExtractor] = None + self._metadata_extractor: Optional[BaseMetadataExtractor] = None + self._file_converter: Optional[BaseFileConverter] = None + self._preprocessor: Optional[BasePreprocessor] = None + self._format_image_processor: Optional[ImageProcessor] = None self._logger = logging.getLogger(f"document-processor.{self.__class__.__name__}") - + def _get_page_tag_processor_from_config(self) -> PageTagProcessor: """Get PageTagProcessor from config or create default.""" if self._config and "page_tag_processor" in self._config: return self._config["page_tag_processor"] return PageTagProcessor() - + def _get_chart_processor_from_config(self) -> ChartProcessor: """Get ChartProcessor from config or create 
default.""" if self._config and "chart_processor" in self._config: return self._config["chart_processor"] return ChartProcessor() - + def _create_chart_extractor(self) -> BaseChartExtractor: """ Create format-specific chart extractor. - + Override this method in subclasses to provide the appropriate chart extractor for the file format. - + Returns: BaseChartExtractor subclass instance """ return NullChartExtractor(self._chart_processor) - + + def _create_metadata_extractor(self) -> BaseMetadataExtractor: + """ + Create format-specific metadata extractor. + + Override this method in subclasses to provide the appropriate + metadata extractor for the file format. + + Returns: + BaseMetadataExtractor subclass instance + """ + return NullMetadataExtractor() + + def _create_format_image_processor(self) -> ImageProcessor: + """ + Create format-specific image processor. + + Override this method in subclasses to provide the appropriate + image processor for the file format. + + Returns: + ImageProcessor subclass instance + """ + return self._image_processor + + def _create_file_converter(self) -> BaseFileConverter: + """ + Create format-specific file converter. + + Override this method in subclasses to provide the appropriate + file converter for the file format. + + The file converter transforms raw binary data into a workable + format-specific object (e.g., Document, Workbook, OLE file). + + Returns: + BaseFileConverter subclass instance + """ + return NullFileConverter() + + def _create_preprocessor(self) -> BasePreprocessor: + """ + Create format-specific preprocessor. + + Override this method in subclasses to provide the appropriate + preprocessor for the file format. + + The preprocessor processes/cleans the converted data before + further extraction. This is the SECOND step in the pipeline, + after file_converter.convert(). + + Pipeline: + 1. file_converter.convert() → Format-specific object + 2. preprocessor.preprocess() → Cleaned/processed data + 3. 
metadata_extractor.extract() → Metadata + 4. Content extraction + + Returns: + BasePreprocessor subclass instance (NullPreprocessor if no preprocessing needed) + """ + return NullPreprocessor() + @property def config(self) -> Dict[str, Any]: """Configuration dictionary.""" return self._config - + @property def image_processor(self) -> ImageProcessor: """ImageProcessor instance.""" return self._image_processor - + @property def page_tag_processor(self) -> PageTagProcessor: """PageTagProcessor instance.""" return self._page_tag_processor - + @property def chart_processor(self) -> ChartProcessor: """ChartProcessor instance.""" return self._chart_processor - + @property def chart_extractor(self) -> BaseChartExtractor: """ Format-specific chart extractor (lazy-initialized). - + Returns the chart extractor for this handler's file format. """ if self._chart_extractor is None: self._chart_extractor = self._create_chart_extractor() return self._chart_extractor - + + @property + def metadata_extractor(self) -> BaseMetadataExtractor: + """ + Format-specific metadata extractor (lazy-initialized). + + Returns the metadata extractor for this handler's file format. + """ + if self._metadata_extractor is None: + extractor = self._create_metadata_extractor() + # If subclass returns None, use NullMetadataExtractor + self._metadata_extractor = extractor if extractor is not None else NullMetadataExtractor() + return self._metadata_extractor + + @property + def format_image_processor(self) -> ImageProcessor: + """ + Format-specific image processor (lazy-initialized). + + Returns the image processor for this handler's file format. + Each handler should override _create_format_image_processor() to provide + format-specific image handling capabilities. 
+ """ + if self._format_image_processor is None: + processor = self._create_format_image_processor() + # If subclass returns None, use default image_processor + self._format_image_processor = processor if processor is not None else self._image_processor + return self._format_image_processor + + @property + def file_converter(self) -> BaseFileConverter: + """ + Format-specific file converter (lazy-initialized). + + Returns the file converter for this handler's file format. + Each handler should override _create_file_converter() to provide + format-specific binary-to-object conversion. + """ + if self._file_converter is None: + converter = self._create_file_converter() + # If subclass returns None, use NullFileConverter + self._file_converter = converter if converter is not None else NullFileConverter() + return self._file_converter + + @property + def preprocessor(self) -> BasePreprocessor: + """ + Format-specific preprocessor (lazy-initialized). + + Returns the preprocessor for this handler's file format. + Each handler should override _create_preprocessor() to provide + format-specific data preprocessing after conversion. + + This is called AFTER file_converter.convert() to process/clean + the converted data before content extraction. + """ + if self._preprocessor is None: + preprocessor = self._create_preprocessor() + # If subclass returns None, use NullPreprocessor + self._preprocessor = preprocessor if preprocessor is not None else NullPreprocessor() + return self._preprocessor + @property def logger(self) -> logging.Logger: """Logger instance.""" return self._logger - + @abstractmethod def extract_text( self, @@ -145,26 +341,115 @@ def extract_text( ) -> str: """ Extract text from file. 
- + Args: current_file: CurrentFile dict containing file info and binary data extract_metadata: Whether to extract metadata **kwargs: Additional options - + Returns: Extracted text """ pass - + + def extract_metadata(self, source: Any) -> DocumentMetadata: + """ + Extract metadata from source using format-specific extractor. + + Convenience method that wraps self.metadata_extractor.extract(). + + Args: + source: Format-specific source object + + Returns: + DocumentMetadata instance + """ + return self.metadata_extractor.extract(source) + + def format_metadata(self, metadata: DocumentMetadata) -> str: + """ + Format metadata as string. + + Convenience method that wraps self.metadata_extractor.format(). + + Args: + metadata: DocumentMetadata instance + + Returns: + Formatted metadata string + """ + return self.metadata_extractor.format(metadata) + + def extract_and_format_metadata(self, source: Any) -> str: + """ + Extract and format metadata in one step. + + Convenience method that combines extract and format. + + Args: + source: Format-specific source object + + Returns: + Formatted metadata string + """ + return self.metadata_extractor.extract_and_format(source) + + def convert_file(self, current_file: "CurrentFile", **kwargs) -> Any: + """ + Convert binary file data to workable format. + + Convenience method that wraps self.file_converter.convert(). + + This is the first step in the processing pipeline: + Binary Data → FileConverter → Workable Object + + Args: + current_file: CurrentFile dict containing file info and binary data + **kwargs: Additional format-specific options + + Returns: + Format-specific workable object (Document, Workbook, OLE file, etc.) + """ + file_data = current_file.get("file_data", b"") + file_stream = self.get_file_stream(current_file) + return self.file_converter.convert(file_data, file_stream, **kwargs) + + def preprocess(self, converted_data: Any, **kwargs) -> PreprocessedData: + """ + Preprocess the converted data. 
+ + Convenience method that wraps self.preprocessor.preprocess(). + + This is the SECOND step in the processing pipeline: + 1. file_converter.convert() → Format-specific object + 2. preprocessor.preprocess() → Cleaned/processed data (THIS STEP) + 3. metadata_extractor.extract() → Metadata + 4. Content extraction + + Args: + converted_data: The data returned from file_converter.convert() + **kwargs: Additional format-specific options + + Returns: + PreprocessedData containing cleaned content and extracted resources + """ + # If converted_data is bytes, pass it directly + if isinstance(converted_data, bytes): + return self.preprocessor.preprocess(converted_data, **kwargs) + + # For other types, the preprocessor should handle it + # (e.g., Document object preprocessing) + return self.preprocessor.preprocess(converted_data, **kwargs) + def get_file_stream(self, current_file: "CurrentFile") -> io.BytesIO: """ Get a fresh BytesIO stream from current_file. - + Resets the stream position to the beginning for reuse. - + Args: current_file: CurrentFile dict - + Returns: BytesIO stream ready for reading """ @@ -174,17 +459,17 @@ def get_file_stream(self, current_file: "CurrentFile") -> io.BytesIO: return stream # Fallback: create new stream from file_data return io.BytesIO(current_file.get("file_data", b"")) - + def save_image(self, image_data: bytes, processed_images: Optional[set] = None) -> Optional[str]: """ Save image and return tag. - + Convenience method that wraps self.image_processor.save_image(). - + Args: image_data: Image binary data processed_images: Set of processed image hashes (for deduplication) - + Returns: Image tag string or None """ @@ -193,12 +478,12 @@ def save_image(self, image_data: bytes, processed_images: Optional[set] = None) def create_page_tag(self, page_number: int) -> str: """ Create a page number tag. - + Convenience method that wraps self.page_tag_processor.create_page_tag(). 
- + Args: page_number: Page number - + Returns: Page tag string (e.g., "[Page Number: 1]") """ @@ -207,12 +492,12 @@ def create_page_tag(self, page_number: int) -> str: def create_slide_tag(self, slide_number: int) -> str: """ Create a slide number tag. - + Convenience method that wraps self.page_tag_processor.create_slide_tag(). - + Args: slide_number: Slide number - + Returns: Slide tag string (e.g., "[Slide Number: 1]") """ @@ -221,12 +506,12 @@ def create_slide_tag(self, slide_number: int) -> str: def create_sheet_tag(self, sheet_name: str) -> str: """ Create a sheet name tag. - + Convenience method that wraps self.page_tag_processor.create_sheet_tag(). - + Args: sheet_name: Sheet name - + Returns: Sheet tag string (e.g., "[Sheet: Sheet1]") """ @@ -235,18 +520,24 @@ def create_sheet_tag(self, sheet_name: str) -> str: def process_chart(self, chart_element: Any) -> str: """ Process chart element using the format-specific chart extractor. - + This is the main method for chart processing. It uses the chart_extractor to extract data from the format-specific chart element and formats it using ChartProcessor. 
- + Args: chart_element: Format-specific chart object/element - + Returns: Formatted chart text with tags """ return self.chart_extractor.process(chart_element) -__all__ = ["BaseHandler"] +__all__ = [ + "BaseHandler", + "NullMetadataExtractor", + "BasePreprocessor", + "NullPreprocessor", + "PreprocessedData", +] diff --git a/contextifier/core/processor/csv_handler.py b/contextifier/core/processor/csv_handler.py index dbd8b2e..f7d52df 100644 --- a/contextifier/core/processor/csv_handler.py +++ b/contextifier/core/processor/csv_handler.py @@ -11,15 +11,15 @@ from contextifier.core.processor.base_handler import BaseHandler from contextifier.core.functions.chart_extractor import BaseChartExtractor, NullChartExtractor from contextifier.core.processor.csv_helper import ( - CSVMetadata, - extract_csv_metadata, - format_metadata, detect_bom, detect_delimiter, parse_csv_content, detect_header, convert_rows_to_table, ) +from contextifier.core.processor.csv_helper.csv_metadata import CSVMetadataExtractor, CSVSourceInfo +from contextifier.core.processor.csv_helper.csv_image_processor import CSVImageProcessor +from contextifier.core.functions.img_processor import ImageProcessor if TYPE_CHECKING: from contextifier.core.document_processor import CurrentFile @@ -32,11 +32,29 @@ class CSVHandler(BaseHandler): """CSV/TSV File Processing Handler Class""" - + + def _create_file_converter(self): + """Create CSV-specific file converter.""" + from contextifier.core.processor.csv_helper.csv_file_converter import CSVFileConverter + return CSVFileConverter() + + def _create_preprocessor(self): + """Create CSV-specific preprocessor.""" + from contextifier.core.processor.csv_helper.csv_preprocessor import CSVPreprocessor + return CSVPreprocessor() + def _create_chart_extractor(self) -> BaseChartExtractor: """CSV files do not contain charts. 
Return NullChartExtractor.""" return NullChartExtractor(self._chart_processor) - + + def _create_metadata_extractor(self): + """Create CSV-specific metadata extractor.""" + return CSVMetadataExtractor() + + def _create_format_image_processor(self) -> ImageProcessor: + """Create CSV-specific image processor.""" + return CSVImageProcessor() + def extract_text( self, current_file: "CurrentFile", @@ -47,113 +65,70 @@ def extract_text( ) -> str: """ Extract text from CSV/TSV file. - + Args: current_file: CurrentFile dict containing file info and binary data extract_metadata: Whether to extract metadata encoding: Encoding (None for auto-detect) delimiter: Delimiter (None for auto-detect) **kwargs: Additional options - + Returns: Extracted text """ file_path = current_file.get("file_path", "unknown") ext = current_file.get("file_extension", os.path.splitext(file_path)[1]).lower() self.logger.info(f"CSV processing: {file_path}, ext: {ext}") - + if ext == '.tsv' and delimiter is None: delimiter = '\t' - + try: result_parts = [] - - # Decode file_data with encoding detection + + # Step 1: Decode file_data using file_converter file_data = current_file.get("file_data", b"") - content, detected_encoding = self._decode_with_encoding(file_data, encoding) - + content, detected_encoding = self.file_converter.convert(file_data, encoding=encoding) + + # Step 2: Preprocess - clean_content is the TRUE SOURCE + preprocessed = self.preprocess(content) + content = preprocessed.clean_content # TRUE SOURCE + if delimiter is None: delimiter = detect_delimiter(content) - + self.logger.info(f"CSV: encoding={detected_encoding}, delimiter={repr(delimiter)}") - + rows = parse_csv_content(content, delimiter) - + if not rows: return "" - + has_header = detect_header(rows) - + if extract_metadata: - metadata = extract_csv_metadata(file_path, detected_encoding, delimiter, rows, has_header) - metadata_str = format_metadata(metadata) + source_info = CSVSourceInfo( + file_path=file_path, + 
encoding=detected_encoding, + delimiter=delimiter, + rows=rows, + has_header=has_header + ) + metadata_str = self.extract_and_format_metadata(source_info) if metadata_str: result_parts.append(metadata_str + "\n\n") - + table = convert_rows_to_table(rows, has_header) if table: result_parts.append(table) - + result = "".join(result_parts) self.logger.info(f"CSV processing completed: {len(rows)} rows") - + return result - + except Exception as e: self.logger.error(f"Error extracting text from CSV {file_path}: {e}") import traceback self.logger.debug(traceback.format_exc()) raise - - def _decode_with_encoding( - self, - file_data: bytes, - preferred_encoding: Optional[str] = None - ) -> Tuple[str, str]: - """ - Decode bytes with encoding detection. - - Args: - file_data: Raw bytes data - preferred_encoding: Preferred encoding (None for auto-detect) - - Returns: - Tuple of (decoded content, detected encoding) - """ - # BOM detection - bom_encoding = detect_bom(file_data) - if bom_encoding: - try: - return file_data.decode(bom_encoding), bom_encoding - except UnicodeDecodeError: - pass - - # Try preferred encoding - if preferred_encoding: - try: - return file_data.decode(preferred_encoding), preferred_encoding - except UnicodeDecodeError: - self.logger.debug(f"Preferred encoding {preferred_encoding} failed") - - # Try chardet if available - try: - import chardet - detected = chardet.detect(file_data) - if detected and detected.get('encoding'): - enc = detected['encoding'] - try: - return file_data.decode(enc), enc - except UnicodeDecodeError: - pass - except ImportError: - pass - - # Try encoding candidates - for enc in ENCODING_CANDIDATES: - try: - return file_data.decode(enc), enc - except UnicodeDecodeError: - continue - - # Fallback to latin-1 (always succeeds) - return file_data.decode('latin-1'), 'latin-1' diff --git a/contextifier/core/processor/csv_helper/__init__.py b/contextifier/core/processor/csv_helper/__init__.py index a9da879..3c4e2e4 100644 --- 
a/contextifier/core/processor/csv_helper/__init__.py +++ b/contextifier/core/processor/csv_helper/__init__.py @@ -24,10 +24,13 @@ # Metadata from contextifier.core.processor.csv_helper.csv_metadata import ( - format_file_size, - get_delimiter_name, - extract_csv_metadata, - format_metadata, + CSVMetadataExtractor, + CSVSourceInfo, +) + +# Image Processor +from contextifier.core.processor.csv_helper.csv_image_processor import ( + CSVImageProcessor, ) # Encoding @@ -63,10 +66,11 @@ "MAX_COLS", "CSVMetadata", # Metadata - "format_file_size", - "get_delimiter_name", - "extract_csv_metadata", - "format_metadata", + "CSVMetadataExtractor", + "CSVSourceInfo", + # Image Processor + "CSVImageProcessor", + # Encoding # Encoding "detect_bom", "read_file_with_encoding", diff --git a/contextifier/core/processor/csv_helper/csv_constants.py b/contextifier/core/processor/csv_helper/csv_constants.py index 649d0a1..aa69a7e 100644 --- a/contextifier/core/processor/csv_helper/csv_constants.py +++ b/contextifier/core/processor/csv_helper/csv_constants.py @@ -27,10 +27,10 @@ # === 구분자 관련 상수 === -# CSV 구분자 후보 +# CSV delimiter candidates DELIMITER_CANDIDATES = [',', '\t', ';', '|'] -# 구분자 이름 매핑 +# Delimiter name mapping (Korean for output display) DELIMITER_NAMES = { ',': '쉼표 (,)', '\t': '탭 (\\t)', @@ -39,12 +39,12 @@ } -# === 처리 제한 상수 === +# === Processing limit constants === -# 최대 처리 행 수 (메모리 보호) +# Maximum rows to process (memory protection) MAX_ROWS = 100000 -# 최대 열 수 +# Maximum columns MAX_COLS = 1000 diff --git a/contextifier/core/processor/csv_helper/csv_file_converter.py b/contextifier/core/processor/csv_helper/csv_file_converter.py new file mode 100644 index 0000000..8c9f56f --- /dev/null +++ b/contextifier/core/processor/csv_helper/csv_file_converter.py @@ -0,0 +1,77 @@ +# libs/core/processor/csv_helper/csv_file_converter.py +""" +CSVFileConverter - CSV file format converter + +Converts binary CSV data to text string with encoding detection. 
+""" +from typing import Any, Optional, BinaryIO, Tuple + +from contextifier.core.functions.file_converter import TextFileConverter + + +class CSVFileConverter(TextFileConverter): + """ + CSV file converter. + + Converts binary CSV data to decoded text string. + Extends TextFileConverter with BOM detection. + """ + + # BOM markers + BOM_UTF8 = b'\xef\xbb\xbf' + BOM_UTF16_LE = b'\xff\xfe' + BOM_UTF16_BE = b'\xfe\xff' + + def __init__(self): + """Initialize CSVFileConverter.""" + super().__init__(encodings=['utf-8', 'utf-8-sig', 'cp949', 'euc-kr', 'iso-8859-1', 'latin-1']) + self._delimiter: Optional[str] = None + + def convert( + self, + file_data: bytes, + file_stream: Optional[BinaryIO] = None, + encoding: Optional[str] = None, + delimiter: Optional[str] = None, + **kwargs + ) -> Tuple[str, str]: + """ + Convert binary CSV data to text string. + + Args: + file_data: Raw binary CSV data + file_stream: Ignored + encoding: Specific encoding to use + delimiter: CSV delimiter (for reference) + **kwargs: Additional options + + Returns: + Tuple of (decoded text, detected encoding) + """ + self._delimiter = delimiter + + # Check for BOM + bom_encoding = self._detect_bom(file_data) + if bom_encoding: + text = file_data.decode(bom_encoding) + self._detected_encoding = bom_encoding + return text, bom_encoding + + # Use parent's convert logic + text = super().convert(file_data, file_stream, encoding, **kwargs) + return text, self._detected_encoding or 'utf-8' + + def _detect_bom(self, file_data: bytes) -> Optional[str]: + """Detect encoding from BOM.""" + if file_data.startswith(self.BOM_UTF8): + return 'utf-8-sig' + elif file_data.startswith(self.BOM_UTF16_LE): + return 'utf-16-le' + elif file_data.startswith(self.BOM_UTF16_BE): + return 'utf-16-be' + return None + + def get_format_name(self) -> str: + """Return format name.""" + enc = self._detected_encoding or 'unknown' + return f"CSV ({enc})" diff --git a/contextifier/core/processor/csv_helper/csv_image_processor.py 
b/contextifier/core/processor/csv_helper/csv_image_processor.py new file mode 100644 index 0000000..87b7032 --- /dev/null +++ b/contextifier/core/processor/csv_helper/csv_image_processor.py @@ -0,0 +1,75 @@ +# contextifier/core/processor/csv_helper/csv_image_processor.py +""" +CSV Image Processor + +Provides CSV-specific image processing that inherits from ImageProcessor. +CSV files do not contain embedded images, so this is a minimal implementation. +""" +import logging +from typing import Any, Optional + +from contextifier.core.functions.img_processor import ImageProcessor +from contextifier.core.functions.storage_backend import BaseStorageBackend + +logger = logging.getLogger("contextify.image_processor.csv") + + +class CSVImageProcessor(ImageProcessor): + """ + CSV-specific image processor. + + Inherits from ImageProcessor and provides CSV-specific processing. + CSV files do not contain embedded images, so this processor + provides a consistent interface without additional functionality. + + This class exists to maintain interface consistency across all handlers. + + Example: + processor = CSVImageProcessor() + + # No images in CSV, but interface is consistent + tag = processor.process_image(image_data) # Falls back to base implementation + """ + + def __init__( + self, + directory_path: str = "temp/images", + tag_prefix: str = "[Image:", + tag_suffix: str = "]", + storage_backend: Optional[BaseStorageBackend] = None, + ): + """ + Initialize CSVImageProcessor. + + Args: + directory_path: Image save directory + tag_prefix: Tag prefix for image references + tag_suffix: Tag suffix for image references + storage_backend: Storage backend for saving images + """ + super().__init__( + directory_path=directory_path, + tag_prefix=tag_prefix, + tag_suffix=tag_suffix, + storage_backend=storage_backend, + ) + + def process_image( + self, + image_data: bytes, + **kwargs + ) -> Optional[str]: + """ + Process and save image data. 
+ + CSV files do not contain embedded images, so this method + delegates to the base implementation. + + Args: + image_data: Raw image binary data + **kwargs: Additional options + + Returns: + Image tag string or None if processing failed + """ + return super().process_image(image_data, **kwargs) diff --git a/contextifier/core/processor/csv_helper/csv_metadata.py b/contextifier/core/processor/csv_helper/csv_metadata.py index 8229f48..ef493dc 100644 --- a/contextifier/core/processor/csv_helper/csv_metadata.py +++ b/contextifier/core/processor/csv_helper/csv_metadata.py @@ -1,14 +1,26 @@ -# csv_helper/csv_metadata.py +# contextifier/core/processor/csv_helper/csv_metadata.py """ -CSV 메타데이터 추출 및 포맷팅 +CSV Metadata Extraction Module -CSV 파일의 메타데이터를 추출하고 읽기 쉬운 형식으로 변환합니다. +Provides CSVMetadataExtractor class for extracting metadata from CSV files. +Implements BaseMetadataExtractor interface. + +CSV differs from regular documents - it provides file structure information as metadata: +- File name, file size, modification time +- Encoding, delimiter +- Row/column count, header information """ import logging import os +from dataclasses import dataclass from datetime import datetime -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional +from contextifier.core.functions.metadata_extractor import ( + BaseMetadataExtractor, + DocumentMetadata, + MetadataFormatter, +) from contextifier.core.processor.csv_helper.csv_constants import DELIMITER_NAMES logger = logging.getLogger("document-processor") @@ -16,13 +28,13 @@ def format_file_size(size_bytes: int) -> str: """ - 파일 크기를 읽기 쉬운 형식으로 변환합니다. + Convert file size to human-readable format. 
Args: - size_bytes: 파일 크기 (바이트) + size_bytes: File size in bytes Returns: - 포맷된 파일 크기 문자열 (예: "1.5 MB") + Formatted file size string (e.g., "1.5 MB") """ if size_bytes < 1024: return f"{size_bytes} B" @@ -36,87 +48,57 @@ def format_file_size(size_bytes: int) -> str: def get_delimiter_name(delimiter: str) -> str: """ - 구분자를 읽기 쉬운 이름으로 변환합니다. + Convert delimiter to human-readable name. Args: - delimiter: 구분자 문자 + delimiter: Delimiter character Returns: - 구분자의 읽기 쉬운 이름 (예: "쉼표 (,)") + Human-readable delimiter name (e.g., "Comma (,)") """ return DELIMITER_NAMES.get(delimiter, repr(delimiter)) -def extract_csv_metadata( - file_path: str, - encoding: str, - delimiter: str, - rows: List[List[str]], - has_header: bool -) -> Dict[str, Any]: +@dataclass +class CSVSourceInfo: """ - CSV 파일에서 메타데이터를 추출합니다. - - Args: - file_path: 파일 경로 - encoding: 감지된 인코딩 - delimiter: 감지된 구분자 - rows: 파싱된 행 데이터 - has_header: 헤더 존재 여부 - - Returns: - 메타데이터 딕셔너리 + Source information for CSV metadata extraction. + + Container for data passed to CSVMetadataExtractor.extract(). 
""" - metadata = {} - - try: - # 파일 정보 - file_stat = os.stat(file_path) - file_name = os.path.basename(file_path) - - metadata['file_name'] = file_name - metadata['file_size'] = format_file_size(file_stat.st_size) - metadata['modified_time'] = datetime.fromtimestamp(file_stat.st_mtime) - - # CSV 구조 정보 - metadata['encoding'] = encoding - metadata['delimiter'] = get_delimiter_name(delimiter) - metadata['row_count'] = len(rows) - metadata['col_count'] = len(rows[0]) if rows else 0 - metadata['has_header'] = '예' if has_header else '아니오' - - # 헤더 정보 (있는 경우) - if has_header and rows: - headers = [h.strip() for h in rows[0] if h.strip()] - if headers: - metadata['columns'] = ', '.join(headers[:10]) # 최대 10개 - if len(rows[0]) > 10: - metadata['columns'] += f' 외 {len(rows[0]) - 10}개' - - logger.debug(f"Extracted CSV metadata: {list(metadata.keys())}") - - except Exception as e: - logger.warning(f"Failed to extract CSV metadata: {e}") - - return metadata + file_path: str + encoding: str + delimiter: str + rows: List[List[str]] + has_header: bool -def format_metadata(metadata: Dict[str, Any]) -> str: +class CSVMetadataExtractor(BaseMetadataExtractor): """ - 메타데이터 딕셔너리를 읽기 쉬운 문자열로 변환합니다. - - Args: - metadata: 메타데이터 딕셔너리 - - Returns: - 포맷된 메타데이터 문자열 ( 태그 형식) + CSV Metadata Extractor. + + CSV 파일의 구조 정보를 메타데이터로 추출합니다. 
+ + 지원 필드 (custom 필드에 저장): + - file_name, file_size, modified_time + - encoding, delimiter + - row_count, col_count, has_header, columns + + 사용법: + extractor = CSVMetadataExtractor() + source = CSVSourceInfo( + file_path="data.csv", + encoding="utf-8", + delimiter=",", + rows=parsed_rows, + has_header=True + ) + metadata = extractor.extract(source) + text = extractor.format(metadata) """ - if not metadata: - return "" - - lines = [""] - - field_names = { + + # CSV 특화 필드 라벨 + CSV_FIELD_LABELS = { 'file_name': '파일명', 'file_size': '파일 크기', 'modified_time': '수정일', @@ -127,17 +109,60 @@ def format_metadata(metadata: Dict[str, Any]) -> str: 'has_header': '헤더 존재', 'columns': '컬럼 목록', } - - for key, label in field_names.items(): - if key in metadata and metadata[key] is not None: - value = metadata[key] - - # datetime 객체 포맷팅 - if isinstance(value, datetime): - value = value.strftime('%Y-%m-%d %H:%M:%S') - - lines.append(f" {label}: {value}") - - lines.append("") - - return "\n".join(lines) + + def __init__(self, **kwargs): + super().__init__(**kwargs) + # CSV용 커스텀 포맷터 설정 + self._formatter.field_labels.update(self.CSV_FIELD_LABELS) + + def extract(self, source: CSVSourceInfo) -> DocumentMetadata: + """ + CSV 파일에서 메타데이터를 추출합니다. 
+ + Args: + source: CSVSourceInfo 객체 (파일 경로, 인코딩, 구분자, 행 데이터, 헤더 여부) + + Returns: + 추출된 메타데이터가 담긴 DocumentMetadata 인스턴스 + """ + custom_fields: Dict[str, Any] = {} + + try: + # 파일 정보 + file_stat = os.stat(source.file_path) + file_name = os.path.basename(source.file_path) + + custom_fields['file_name'] = file_name + custom_fields['file_size'] = format_file_size(file_stat.st_size) + custom_fields['modified_time'] = datetime.fromtimestamp(file_stat.st_mtime) + + # CSV 구조 정보 + custom_fields['encoding'] = source.encoding + custom_fields['delimiter'] = get_delimiter_name(source.delimiter) + custom_fields['row_count'] = len(source.rows) + custom_fields['col_count'] = len(source.rows[0]) if source.rows else 0 + custom_fields['has_header'] = '예' if source.has_header else '아니오' + + # 헤더 정보 (있는 경우) + if source.has_header and source.rows: + headers = [h.strip() for h in source.rows[0] if h.strip()] + if headers: + custom_fields['columns'] = ', '.join(headers[:10]) # 최대 10개 + if len(source.rows[0]) > 10: + custom_fields['columns'] += f' 외 {len(source.rows[0]) - 10}개' + + self.logger.debug(f"Extracted CSV metadata: {list(custom_fields.keys())}") + + except Exception as e: + self.logger.warning(f"Failed to extract CSV metadata: {e}") + + # CSV는 표준 필드가 없고 모두 custom 필드 + return DocumentMetadata(custom=custom_fields) + + +__all__ = [ + 'CSVMetadataExtractor', + 'CSVSourceInfo', + 'format_file_size', + 'get_delimiter_name', +] diff --git a/contextifier/core/processor/csv_helper/csv_preprocessor.py b/contextifier/core/processor/csv_helper/csv_preprocessor.py new file mode 100644 index 0000000..0914754 --- /dev/null +++ b/contextifier/core/processor/csv_helper/csv_preprocessor.py @@ -0,0 +1,86 @@ +# contextifier/core/processor/csv_helper/csv_preprocessor.py +""" +CSV Preprocessor - Process CSV content after conversion. + +Processing Pipeline Position: + 1. CSVFileConverter.convert() → (content: str, encoding: str) + 2. CSVPreprocessor.preprocess() → PreprocessedData (THIS STEP) + 3. 
CSVMetadataExtractor.extract() → DocumentMetadata + 4. Content extraction (rows, columns) + +Current Implementation: + - Pass-through (CSV uses decoded string content directly) +""" +import logging +from typing import Any, Dict + +from contextifier.core.functions.preprocessor import ( + BasePreprocessor, + PreprocessedData, +) + +logger = logging.getLogger("contextify.csv.preprocessor") + + +class CSVPreprocessor(BasePreprocessor): + """ + CSV Content Preprocessor. + + Currently a pass-through implementation as CSV processing + is handled during the content extraction phase. + """ + + def preprocess( + self, + converted_data: Any, + **kwargs + ) -> PreprocessedData: + """ + Preprocess the converted CSV content. + + Args: + converted_data: Tuple of (content: str, encoding: str) from CSVFileConverter + **kwargs: Additional options + + Returns: + PreprocessedData with the content and encoding + """ + metadata: Dict[str, Any] = {} + + content = "" + encoding = "utf-8" + + # Handle tuple return from CSVFileConverter + if isinstance(converted_data, tuple) and len(converted_data) >= 2: + content, encoding = converted_data[0], converted_data[1] + metadata['detected_encoding'] = encoding + if content: + lines = content.split('\n') + metadata['line_count'] = len(lines) + elif isinstance(converted_data, str): + content = converted_data + metadata['line_count'] = len(content.split('\n')) + + logger.debug("CSV preprocessor: pass-through, metadata=%s", metadata) + + # clean_content is the TRUE SOURCE - contains the processed string content + return PreprocessedData( + raw_content=content, + clean_content=content, # TRUE SOURCE - string content for CSV + encoding=encoding, + extracted_resources={}, + metadata=metadata, + ) + + def get_format_name(self) -> str: + """Return format name.""" + return "CSV Preprocessor" + + def validate(self, data: Any) -> bool: + """Validate if data is CSV content.""" + if isinstance(data, tuple) and len(data) >= 2: + return isinstance(data[0], str) 
+ return isinstance(data, str) + + +__all__ = ['CSVPreprocessor'] diff --git a/contextifier/core/processor/doc_handler.py b/contextifier/core/processor/doc_handler.py index f768e74..721f748 100644 --- a/contextifier/core/processor/doc_handler.py +++ b/contextifier/core/processor/doc_handler.py @@ -4,28 +4,25 @@ Class-based handler for DOC files inheriting from BaseHandler. Automatically detects file format (RTF, OLE, HTML, DOCX) and processes accordingly. +RTF processing is delegated to RTFHandler. """ import io import logging import os import re -import shutil -import tempfile import struct import base64 -from datetime import datetime from typing import Any, Dict, List, Optional, Set, TYPE_CHECKING from enum import Enum import zipfile import olefile from bs4 import BeautifulSoup -from striprtf.striprtf import rtf_to_text -from contextifier.core.processor.doc_helpers.rtf_parser import parse_rtf, RTFDocument from contextifier.core.processor.base_handler import BaseHandler from contextifier.core.functions.img_processor import ImageProcessor from contextifier.core.functions.chart_extractor import BaseChartExtractor, NullChartExtractor +from contextifier.core.processor.doc_helpers.doc_image_processor import DOCImageProcessor if TYPE_CHECKING: from contextifier.core.document_processor import CurrentFile @@ -48,25 +45,32 @@ class DocFormat(Enum): 'ZIP': b'PK\x03\x04', } -METADATA_FIELD_NAMES = { - 'title': '제목', - 'subject': '주제', - 'author': '작성자', - 'keywords': '키워드', - 'comments': '설명', - 'last_saved_by': '마지막 저장자', - 'create_time': '작성일', - 'last_saved_time': '수정일', -} - class DOCHandler(BaseHandler): """DOC file processing handler class.""" - + + def _create_file_converter(self): + """Create DOC-specific file converter.""" + from contextifier.core.processor.doc_helpers.doc_file_converter import DOCFileConverter + return DOCFileConverter() + + def _create_preprocessor(self): + """Create DOC-specific preprocessor.""" + from 
contextifier.core.processor.doc_helpers.doc_preprocessor import DOCPreprocessor + return DOCPreprocessor() + def _create_chart_extractor(self) -> BaseChartExtractor: """DOC files chart extraction not yet implemented. Return NullChartExtractor.""" return NullChartExtractor(self._chart_processor) - + + def _create_metadata_extractor(self): + """DOC metadata extraction not yet implemented. Return None to use NullMetadataExtractor.""" + return None + + def _create_format_image_processor(self) -> ImageProcessor: + """Create DOC-specific image processor.""" + return DOCImageProcessor() + def extract_text( self, current_file: "CurrentFile", @@ -76,222 +80,142 @@ def extract_text( """Extract text from DOC file.""" file_path = current_file.get("file_path", "unknown") file_data = current_file.get("file_data", b"") - + self.logger.info(f"DOC processing: {file_path}") - + if not file_data: self.logger.error(f"Empty file data: {file_path}") return f"[DOC file is empty: {file_path}]" - - doc_format = self._detect_format_from_bytes(file_data) - + try: + # Step 1: Use file_converter to detect format and convert + converted_obj, doc_format = self.file_converter.convert(file_data) + + # Step 2: Preprocess - may transform converted_obj in the future + preprocessed = self.preprocess(converted_obj) + converted_obj = preprocessed.clean_content # TRUE SOURCE + if doc_format == DocFormat.RTF: - return self._extract_from_rtf(current_file, extract_metadata) + # Delegate to RTFHandler for RTF processing + return self._delegate_to_rtf_handler(converted_obj, current_file, extract_metadata) elif doc_format == DocFormat.OLE: - return self._extract_from_ole(current_file, extract_metadata) + return self._extract_from_ole_obj(converted_obj, current_file, extract_metadata) elif doc_format == DocFormat.HTML: - return self._extract_from_html(current_file, extract_metadata) + return self._extract_from_html_obj(converted_obj, current_file, extract_metadata) elif doc_format == DocFormat.DOCX: - return 
self._extract_from_docx_misnamed(current_file, extract_metadata) + return self._extract_from_docx_obj(converted_obj, current_file, extract_metadata) else: - self.logger.warning(f"Unknown DOC format, trying OLE: {file_path}") + self.logger.warning(f"Unknown DOC format, trying OLE fallback: {file_path}") return self._extract_from_ole(current_file, extract_metadata) except Exception as e: self.logger.error(f"Error in DOC processing: {e}") return f"[DOC file processing failed: {str(e)}]" - - def _detect_format_from_bytes(self, file_data: bytes) -> DocFormat: - """Detect file format from binary data.""" - try: - header = file_data[:32] if len(file_data) >= 32 else file_data - - if not header: - return DocFormat.UNKNOWN - - if header.startswith(MAGIC_NUMBERS['RTF']): - return DocFormat.RTF - - if header.startswith(MAGIC_NUMBERS['OLE']): - return DocFormat.OLE - - if header.startswith(MAGIC_NUMBERS['ZIP']): - try: - file_stream = io.BytesIO(file_data) - with zipfile.ZipFile(file_stream, 'r') as zf: - if '[Content_Types].xml' in zf.namelist(): - return DocFormat.DOCX - except zipfile.BadZipFile: - pass - - header_lower = header.lower() - if header_lower.startswith(b' str: - """RTF file processing.""" + + def _delegate_to_rtf_handler(self, rtf_doc, current_file: "CurrentFile", extract_metadata: bool) -> str: + """ + Delegate RTF processing to RTFHandler. + + DOC 파일이 실제로는 RTF 형식인 경우, RTFHandler에 위임합니다. + RTFHandler.extract_text()는 raw bytes를 받으므로 current_file을 그대로 전달합니다. 
+ + Args: + rtf_doc: Pre-converted RTFDocument object (unused, for consistency) + current_file: CurrentFile dict containing original file_data + extract_metadata: Whether to extract metadata + + Returns: + Extracted text + """ + from contextifier.core.processor.rtf_handler import RTFHandler + + rtf_handler = RTFHandler( + config=self.config, + image_processor=self._image_processor, + page_tag_processor=self._page_tag_processor, + chart_processor=self._chart_processor + ) + + # RTFHandler.extract_text()는 current_file에서 file_data를 직접 읽어 처리 + return rtf_handler.extract_text(current_file, extract_metadata=extract_metadata) + + def _extract_from_ole_obj(self, ole, current_file: "CurrentFile", extract_metadata: bool) -> str: + """OLE Compound Document processing using pre-converted OLE object.""" file_path = current_file.get("file_path", "unknown") - file_data = current_file.get("file_data", b"") - - self.logger.info(f"Processing RTF: {file_path}") - + + self.logger.info(f"Processing OLE: {file_path}") + + result_parts = [] + processed_images: Set[str] = set() + try: - content = file_data - - processed_images: Set[str] = set() - doc = parse_rtf(content, processed_images=processed_images, image_processor=self.image_processor) - - result_parts = [] - + # Metadata extraction if extract_metadata: - metadata_str = self._format_metadata(doc.metadata) + metadata = self._extract_ole_metadata(ole) + metadata_str = self.extract_and_format_metadata(metadata) if metadata_str: result_parts.append(metadata_str + "\n\n") - + page_tag = self.create_page_tag(1) result_parts.append(f"{page_tag}\n") - - inline_content = doc.get_inline_content() - if inline_content: - result_parts.append(inline_content) - else: - if doc.text_content: - result_parts.append(doc.text_content) - - for table in doc.tables: - if not table.rows: - continue - if table.is_real_table(): - result_parts.append("\n" + table.to_html() + "\n") - else: - result_parts.append("\n" + table.to_text_list() + "\n") - - result = 
"\n".join(result_parts) - result = re.sub(r'\[image:[^\]]*uploads/\.[^\]]*\]', '', result) - - return result - + + # Extract text from WordDocument stream + text = self._extract_ole_text(ole) + if text: + result_parts.append(text) + + # Extract images + images = self._extract_ole_images(ole, processed_images) + for img_tag in images: + result_parts.append(img_tag) + except Exception as e: - self.logger.error(f"RTF processing error: {e}") - return self._extract_rtf_fallback(current_file, extract_metadata) - - def _extract_rtf_fallback(self, current_file: "CurrentFile", extract_metadata: bool) -> str: - """RTF fallback (striprtf).""" - file_data = current_file.get("file_data", b"") - - content = None - for encoding in ['utf-8', 'cp949', 'euc-kr', 'cp1252', 'latin-1']: - try: - content = file_data.decode(encoding) - break - except (UnicodeDecodeError, UnicodeError): - continue - - if content is None: - content = file_data.decode('cp1252', errors='replace') - - result_parts = [] - - if extract_metadata: - metadata = self._extract_rtf_metadata(content) - metadata_str = self._format_metadata(metadata) - if metadata_str: - result_parts.append(metadata_str + "\n\n") - - page_tag = self.create_page_tag(1) - result_parts.append(f"{page_tag}\n") - - try: - text = rtf_to_text(content) - except: - text = re.sub(r'\\[a-z]+\d*\s?', '', content) - text = re.sub(r"\\'[0-9a-fA-F]{2}", '', text) - text = re.sub(r'[{}]', '', text) - - if text: - text = re.sub(r'\n{3,}', '\n\n', text) - result_parts.append(text.strip()) - + self.logger.error(f"OLE processing error: {e}") + return f"[DOC file processing failed: {str(e)}]" + finally: + # Close the OLE object + self.file_converter.close(ole) + return "\n".join(result_parts) - - def _extract_rtf_metadata(self, content: str) -> Dict[str, Any]: - """RTF metadata extraction.""" - metadata = {} - patterns = { - 'title': r'\\title\s*\{([^}]*)\}', - 'subject': r'\\subject\s*\{([^}]*)\}', - 'author': r'\\author\s*\{([^}]*)\}', - 'keywords': 
r'\\keywords\s*\{([^}]*)\}', - 'comments': r'\\doccomm\s*\{([^}]*)\}', - 'last_saved_by': r'\\operator\s*\{([^}]*)\}', - } - - for key, pattern in patterns.items(): - match = re.search(pattern, content, re.IGNORECASE) - if match: - value = match.group(1).strip() - if value: - metadata[key] = value - - return metadata - + def _extract_from_ole(self, current_file: "CurrentFile", extract_metadata: bool) -> str: """OLE Compound Document processing - extract text directly from WordDocument stream.""" file_path = current_file.get("file_path", "unknown") file_data = current_file.get("file_data", b"") - + self.logger.info(f"Processing OLE: {file_path}") - + result_parts = [] processed_images: Set[str] = set() - + try: file_stream = io.BytesIO(file_data) with olefile.OleFileIO(file_stream) as ole: # Metadata extraction if extract_metadata: metadata = self._extract_ole_metadata(ole) - metadata_str = self._format_metadata(metadata) + metadata_str = self.extract_and_format_metadata(metadata) if metadata_str: result_parts.append(metadata_str + "\n\n") - + page_tag = self.create_page_tag(1) result_parts.append(f"{page_tag}\n") - + # Extract text from WordDocument stream text = self._extract_ole_text(ole) if text: result_parts.append(text) - + # Extract images images = self._extract_ole_images(ole, processed_images) for img_tag in images: result_parts.append(img_tag) - + except Exception as e: self.logger.error(f"OLE processing error: {e}") return f"[DOC file processing failed: {str(e)}]" - + return "\n".join(result_parts) - + def _extract_ole_metadata(self, ole: olefile.OleFileIO) -> Dict[str, Any]: """OLE 메타데이터 추출""" metadata = {} @@ -317,7 +241,7 @@ def _extract_ole_metadata(self, ole: olefile.OleFileIO) -> Dict[str, Any]: except Exception as e: self.logger.warning(f"Error extracting OLE metadata: {e}") return metadata - + def _decode_ole_string(self, value) -> str: """OLE 문자열 디코딩""" if value is None: @@ -332,7 +256,7 @@ def _decode_ole_string(self, value) -> str: continue 
return value.decode('utf-8', errors='replace').strip() return str(value).strip() - + def _extract_ole_images(self, ole: olefile.OleFileIO, processed_images: Set[str]) -> List[str]: """OLE에서 이미지 추출""" images = [] @@ -342,10 +266,10 @@ def _extract_ole_images(self, ole: olefile.OleFileIO, processed_images: Set[str] try: stream = ole.openstream(entry) data = stream.read() - + if data[:8] == b'\x89PNG\r\n\x1a\n' or data[:2] == b'\xff\xd8' or \ data[:6] in (b'GIF87a', b'GIF89a') or data[:2] == b'BM': - image_tag = self.image_processor.save_image(data) + image_tag = self.format_image_processor.save_image(data) if image_tag: images.append(f"\n{image_tag}\n") except: @@ -353,14 +277,108 @@ def _extract_ole_images(self, ole: olefile.OleFileIO, processed_images: Set[str] except Exception as e: self.logger.warning(f"Error extracting OLE images: {e}") return images - + + def _extract_from_html_obj(self, soup, current_file: "CurrentFile", extract_metadata: bool) -> str: + """HTML DOC processing using pre-converted BeautifulSoup object.""" + file_path = current_file.get("file_path", "unknown") + + self.logger.info(f"Processing HTML DOC: {file_path}") + + result_parts = [] + + if extract_metadata: + metadata = self._extract_html_metadata(soup) + metadata_str = self.extract_and_format_metadata(metadata) + if metadata_str: + result_parts.append(metadata_str + "\n\n") + + page_tag = self.create_page_tag(1) + result_parts.append(f"{page_tag}\n") + + # Copy soup to avoid modifying the original + soup_copy = BeautifulSoup(str(soup), 'html.parser') + + for tag in soup_copy(['script', 'style', 'meta', 'link', 'head']): + tag.decompose() + + text = soup_copy.get_text(separator='\n', strip=True) + text = re.sub(r'\n{3,}', '\n\n', text) + + if text: + result_parts.append(text) + + for table in soup_copy.find_all('table'): + table_html = str(table) + table_html = re.sub(r'\s+style="[^"]*"', '', table_html) + table_html = re.sub(r'\s+class="[^"]*"', '', table_html) + result_parts.append("\n" 
+ table_html + "\n") + + for img in soup_copy.find_all('img'): + src = img.get('src', '') + if src and src.startswith('data:image'): + try: + match = re.match(r'data:image/(\w+);base64,(.+)', src) + if match: + image_data = base64.b64decode(match.group(2)) + image_tag = self.format_image_processor.save_image(image_data) + if image_tag: + result_parts.append(f"\n{image_tag}\n") + except: + pass + + return "\n".join(result_parts) + + def _extract_from_docx_obj(self, doc, current_file: "CurrentFile", extract_metadata: bool) -> str: + """Extract from misnamed DOCX using pre-converted Document object.""" + file_path = current_file.get("file_path", "unknown") + + self.logger.info(f"Processing misnamed DOCX: {file_path}") + + try: + result_parts = [] + + if extract_metadata: + # Basic metadata from docx Document + if hasattr(doc, 'core_properties'): + metadata = { + 'title': doc.core_properties.title or '', + 'author': doc.core_properties.author or '', + 'subject': doc.core_properties.subject or '', + 'keywords': doc.core_properties.keywords or '', + } + metadata = {k: v for k, v in metadata.items() if v} + metadata_str = self.extract_and_format_metadata(metadata) + if metadata_str: + result_parts.append(metadata_str + "\n\n") + + page_tag = self.create_page_tag(1) + result_parts.append(f"{page_tag}\n") + + for para in doc.paragraphs: + if para.text.strip(): + result_parts.append(para.text) + + for table in doc.tables: + for row in table.rows: + row_texts = [] + for cell in row.cells: + row_texts.append(cell.text.strip()) + if any(t for t in row_texts): + result_parts.append(" | ".join(row_texts)) + + return "\n".join(result_parts) + + except Exception as e: + self.logger.error(f"Error processing misnamed DOCX: {e}") + return f"[DOCX processing failed: {str(e)}]" + def _extract_from_html(self, current_file: "CurrentFile", extract_metadata: bool) -> str: """HTML DOC processing.""" file_path = current_file.get("file_path", "unknown") file_data = 
current_file.get("file_data", b"") - + self.logger.info(f"Processing HTML DOC: {file_path}") - + content = None for encoding in ['utf-8', 'utf-8-sig', 'cp949', 'euc-kr', 'cp1252', 'latin-1']: try: @@ -368,37 +386,37 @@ def _extract_from_html(self, current_file: "CurrentFile", extract_metadata: bool break except (UnicodeDecodeError, UnicodeError): continue - + if content is None: content = file_data.decode('utf-8', errors='replace') - + result_parts = [] soup = BeautifulSoup(content, 'html.parser') - + if extract_metadata: metadata = self._extract_html_metadata(soup) - metadata_str = self._format_metadata(metadata) + metadata_str = self.extract_and_format_metadata(metadata) if metadata_str: result_parts.append(metadata_str + "\n\n") - + page_tag = self.create_page_tag(1) result_parts.append(f"{page_tag}\n") - + for tag in soup(['script', 'style', 'meta', 'link', 'head']): tag.decompose() - + text = soup.get_text(separator='\n', strip=True) text = re.sub(r'\n{3,}', '\n\n', text) - + if text: result_parts.append(text) - + for table in soup.find_all('table'): table_html = str(table) table_html = re.sub(r'\s+style="[^"]*"', '', table_html) table_html = re.sub(r'\s+class="[^"]*"', '', table_html) result_parts.append("\n" + table_html + "\n") - + for img in soup.find_all('img'): src = img.get('src', '') if src and src.startswith('data:image'): @@ -406,50 +424,50 @@ def _extract_from_html(self, current_file: "CurrentFile", extract_metadata: bool match = re.match(r'data:image/(\w+);base64,(.+)', src) if match: image_data = base64.b64decode(match.group(2)) - image_tag = self.image_processor.save_image(image_data) + image_tag = self.format_image_processor.save_image(image_data) if image_tag: result_parts.append(f"\n{image_tag}\n") except: pass - + return "\n".join(result_parts) - + def _extract_html_metadata(self, soup: BeautifulSoup) -> Dict[str, Any]: """HTML metadata extraction.""" metadata = {} title_tag = soup.find('title') if title_tag and title_tag.string: 
metadata['title'] = title_tag.string.strip() - + meta_mappings = { 'author': 'author', 'description': 'comments', 'keywords': 'keywords', 'subject': 'subject', 'creator': 'author', 'producer': 'last_saved_by', } - + for meta in soup.find_all('meta'): name = meta.get('name', '').lower() content = meta.get('content', '') if name in meta_mappings and content: metadata[meta_mappings[name]] = content.strip() - + return metadata - + def _extract_from_docx_misnamed(self, current_file: "CurrentFile", extract_metadata: bool) -> str: """Process misnamed DOCX file.""" file_path = current_file.get("file_path", "unknown") - + self.logger.info(f"Processing misnamed DOCX: {file_path}") - + try: from contextifier.core.processor.docx_handler import DOCXHandler - + # Pass current_file directly - DOCXHandler now accepts CurrentFile - docx_handler = DOCXHandler(config=self.config, image_processor=self.image_processor) + docx_handler = DOCXHandler(config=self.config, image_processor=self.format_image_processor) return docx_handler.extract_text(current_file, extract_metadata=extract_metadata) except Exception as e: self.logger.error(f"Error processing misnamed DOCX: {e}") return f"[DOC file processing failed: {str(e)}]" - + def _extract_ole_text(self, ole: olefile.OleFileIO) -> str: """Extract text from OLE WordDocument stream.""" try: @@ -457,47 +475,47 @@ def _extract_ole_text(self, ole: olefile.OleFileIO) -> str: if not ole.exists('WordDocument'): self.logger.warning("WordDocument stream not found") return "" - + # Read Word Document stream word_stream = ole.openstream('WordDocument') word_data = word_stream.read() - + if len(word_data) < 12: return "" - + # FIB (File Information Block) parsing # Check magic number (0xA5EC or 0xA5DC) magic = struct.unpack(' str: """Word 스트림에서 텍스트 추출 (휴리스틱 방식)""" text_parts = [] - + # 방법 1: UTF-16LE 유니코드 텍스트 추출 try: # 연속된 유니코드 문자열 찾기 @@ -511,7 +529,7 @@ def _extract_text_from_word_stream(self, data: bytes) -> str: while j < len(data) - 1: char = 
data[j] next_byte = data[j+1] - + # ASCII 범위 유니코드 문자 또는 한글 if next_byte == 0x00 and (0x20 <= char <= 0x7E or char in (0x0D, 0x0A, 0x09)): unicode_bytes.extend([char, next_byte]) @@ -524,7 +542,7 @@ def _extract_text_from_word_stream(self, data: bytes) -> str: j += 2 else: break - + if len(unicode_bytes) >= 8: # 최소 4자 이상 try: text = bytes(unicode_bytes).decode('utf-16-le', errors='ignore') @@ -542,7 +560,7 @@ def _extract_text_from_word_stream(self, data: bytes) -> str: i += 1 except Exception as e: self.logger.debug(f"Unicode extraction error: {e}") - + # 결과 정리 if text_parts: # 중복 제거 및 연결 @@ -552,26 +570,10 @@ def _extract_text_from_word_stream(self, data: bytes) -> str: if part not in seen and len(part) > 3: seen.add(part) unique_parts.append(part) - + result = '\n'.join(unique_parts) # 과도한 줄바꿈 정리 result = re.sub(r'\n{3,}', '\n\n', result) return result.strip() - + return "" - - def _format_metadata(self, metadata: Dict[str, Any]) -> str: - """메타데이터 포맷팅""" - if not metadata: - return "" - - lines = [""] - for key, label in METADATA_FIELD_NAMES.items(): - if key in metadata and metadata[key]: - value = metadata[key] - if isinstance(value, datetime): - value = value.strftime('%Y-%m-%d %H:%M:%S') - lines.append(f" {label}: {value}") - lines.append("") - - return "\n".join(lines) diff --git a/contextifier/core/processor/doc_helpers/__init__.py b/contextifier/core/processor/doc_helpers/__init__.py index 2f5b8d5..70b2e7e 100644 --- a/contextifier/core/processor/doc_helpers/__init__.py +++ b/contextifier/core/processor/doc_helpers/__init__.py @@ -1,48 +1,24 @@ # libs/core/processor/doc_helpers/__init__.py """ -DOC/RTF Helper 모듈 +DOC Helper 모듈 -DOC 및 RTF 문서 처리에 필요한 유틸리티를 제공합니다. +DOC 문서 처리에 필요한 유틸리티를 제공합니다. + +RTF 관련 모듈들은 rtf_helper로 이동했습니다. 
+RTF 처리가 필요한 경우 rtf_helper를 사용하세요: + from contextifier.core.processor import rtf_helper + from contextifier.core.processor.rtf_helper import RTFParser 모듈 구성: -- rtf_constants: RTF 관련 상수 정의 -- rtf_models: RTF 데이터 모델 -- rtf_parser: RTF 파싱 -- rtf_decoder: RTF 디코딩 -- rtf_content_extractor: RTF 콘텐츠 추출 -- rtf_table_extractor: RTF 테이블 추출 -- rtf_metadata_extractor: RTF 메타데이터 추출 -- rtf_region_finder: RTF 영역 탐색 -- rtf_text_cleaner: RTF 텍스트 정리 -- rtf_bin_processor: RTF 바이너리 처리 +- doc_file_converter: DOC 파일 변환기 +- doc_image_processor: DOC 이미지 처리기 """ -# Constants -from contextifier.core.processor.doc_helpers.rtf_constants import * - -# Models -from contextifier.core.processor.doc_helpers.rtf_models import * - -# Parser -from contextifier.core.processor.doc_helpers.rtf_parser import * - -# Decoder -from contextifier.core.processor.doc_helpers.rtf_decoder import * - -# Content Extractor -from contextifier.core.processor.doc_helpers.rtf_content_extractor import * - -# Table Extractor -from contextifier.core.processor.doc_helpers.rtf_table_extractor import * - -# Metadata Extractor -from contextifier.core.processor.doc_helpers.rtf_metadata_extractor import * - -# Region Finder -from contextifier.core.processor.doc_helpers.rtf_region_finder import * - -# Text Cleaner -from contextifier.core.processor.doc_helpers.rtf_text_cleaner import * +# DOC-specific components +from contextifier.core.processor.doc_helpers.doc_file_converter import DOCFileConverter +from contextifier.core.processor.doc_helpers.doc_image_processor import DOCImageProcessor -# Binary Processor -from contextifier.core.processor.doc_helpers.rtf_bin_processor import * +__all__ = [ + 'DOCFileConverter', + 'DOCImageProcessor', +] diff --git a/contextifier/core/processor/doc_helpers/doc_file_converter.py b/contextifier/core/processor/doc_helpers/doc_file_converter.py new file mode 100644 index 0000000..2b3b3c9 --- /dev/null +++ b/contextifier/core/processor/doc_helpers/doc_file_converter.py @@ -0,0 +1,159 @@ +# 
libs/core/processor/doc_helpers/doc_file_converter.py +""" +DOCFileConverter - DOC file format converter + +Converts binary DOC data to appropriate format based on detection. +Supports RTF, OLE, HTML, and misnamed DOCX files. +""" +from io import BytesIO +from typing import Any, Optional, BinaryIO, Tuple +from enum import Enum +import zipfile + +from contextifier.core.functions.file_converter import BaseFileConverter + + +class DocFormat(Enum): + """Detected DOC file format.""" + RTF = "rtf" + OLE = "ole" + HTML = "html" + DOCX = "docx" + UNKNOWN = "unknown" + + +class DOCFileConverter(BaseFileConverter): + """ + DOC file converter with format auto-detection. + + Detects actual format (RTF, OLE, HTML, DOCX) and converts accordingly. + """ + + # Magic numbers for format detection + MAGIC_RTF = b'{\\rtf' + MAGIC_OLE = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1' + MAGIC_ZIP = b'PK\x03\x04' + + def __init__(self): + """Initialize DOCFileConverter.""" + self._detected_format: DocFormat = DocFormat.UNKNOWN + + def convert( + self, + file_data: bytes, + file_stream: Optional[BinaryIO] = None, + **kwargs + ) -> Tuple[Any, DocFormat]: + """ + Convert binary DOC data to appropriate format. 
+ + Args: + file_data: Raw binary DOC data + file_stream: Optional file stream + **kwargs: Additional options + + Returns: + Tuple of (converted object, detected format) + - RTF: (bytes, DocFormat.RTF) - 원본 바이너리 반환 (RTFHandler에서 처리) + - OLE: (olefile.OleFileIO, DocFormat.OLE) + - HTML: (BeautifulSoup, DocFormat.HTML) + - DOCX: (docx.Document, DocFormat.DOCX) + + Raises: + Exception: If conversion fails + """ + self._detected_format = self._detect_format(file_data) + + if self._detected_format == DocFormat.RTF: + # RTF는 원본 바이너리 반환 - RTFHandler.extract_text()에서 처리 + return file_data, self._detected_format + elif self._detected_format == DocFormat.OLE: + return self._convert_ole(file_data), self._detected_format + elif self._detected_format == DocFormat.HTML: + return self._convert_html(file_data), self._detected_format + elif self._detected_format == DocFormat.DOCX: + return self._convert_docx(file_data), self._detected_format + else: + # Try OLE as fallback + return self._convert_ole(file_data), DocFormat.OLE + + def _detect_format(self, file_data: bytes) -> DocFormat: + """Detect actual file format from binary data.""" + if not file_data: + return DocFormat.UNKNOWN + + header = file_data[:32] if len(file_data) >= 32 else file_data + + # Check RTF + if header.startswith(self.MAGIC_RTF): + return DocFormat.RTF + + # Check OLE + if header.startswith(self.MAGIC_OLE): + return DocFormat.OLE + + # Check ZIP (possible DOCX) + if header.startswith(self.MAGIC_ZIP): + try: + with zipfile.ZipFile(BytesIO(file_data), 'r') as zf: + if '[Content_Types].xml' in zf.namelist(): + return DocFormat.DOCX + except zipfile.BadZipFile: + pass + + # Check HTML + header_lower = header.lower() + if (header_lower.startswith(b' Any: + """Convert OLE data.""" + import olefile + return olefile.OleFileIO(BytesIO(file_data)) + + def _convert_html(self, file_data: bytes) -> Any: + """Convert HTML data.""" + from bs4 import BeautifulSoup + # Decode with fallback + try: + text = 
file_data.decode('utf-8') + except UnicodeDecodeError: + text = file_data.decode('cp949', errors='replace') + return BeautifulSoup(text, 'html.parser') + + def _convert_docx(self, file_data: bytes) -> Any: + """Convert misnamed DOCX data.""" + from docx import Document + return Document(BytesIO(file_data)) + + def get_format_name(self) -> str: + """Return detected format name.""" + format_names = { + DocFormat.RTF: "RTF Document", + DocFormat.OLE: "OLE Document (DOC)", + DocFormat.HTML: "HTML Document", + DocFormat.DOCX: "DOCX Document (misnamed)", + DocFormat.UNKNOWN: "Unknown DOC Format", + } + return format_names.get(self._detected_format, "Unknown") + + @property + def detected_format(self) -> DocFormat: + """Return detected format after conversion.""" + return self._detected_format + + def close(self, converted_object: Any) -> None: + """Close the converted object if needed.""" + if converted_object is not None: + if hasattr(converted_object, 'close'): + converted_object.close() diff --git a/contextifier/core/processor/doc_helpers/doc_image_processor.py b/contextifier/core/processor/doc_helpers/doc_image_processor.py new file mode 100644 index 0000000..c118c11 --- /dev/null +++ b/contextifier/core/processor/doc_helpers/doc_image_processor.py @@ -0,0 +1,179 @@ +# contextifier/core/processor/doc_helpers/doc_image_processor.py +""" +DOC Image Processor + +Provides DOC-specific image processing that inherits from ImageProcessor. +Handles images from RTF, OLE compound documents, and HTML-formatted DOC files. +""" +import logging +from typing import Any, Dict, Optional, Set + +from contextifier.core.functions.img_processor import ImageProcessor +from contextifier.core.functions.storage_backend import BaseStorageBackend + +logger = logging.getLogger("contextify.image_processor.doc") + + +class DOCImageProcessor(ImageProcessor): + """ + DOC-specific image processor. + + Inherits from ImageProcessor and provides DOC-specific processing. 
+ + Handles: + - RTF embedded images (pict, shppict, blipuid) + - OLE compound document images (Pictures stream, embedded objects) + - HTML-format DOC images (base64 encoded) + - WMF/EMF metafiles + + Example: + processor = DOCImageProcessor() + + # Process RTF picture + tag = processor.process_image(image_data, source="rtf", blipuid="abc123") + + # Process OLE embedded image + tag = processor.process_ole_image(ole_data, stream_name="Pictures/image1.png") + + # Process HTML base64 image + tag = processor.process_html_image(base64_data, src_attr="data:image/png;base64,...") + """ + + def __init__( + self, + directory_path: str = "temp/images", + tag_prefix: str = "[Image:", + tag_suffix: str = "]", + storage_backend: Optional[BaseStorageBackend] = None, + ): + """ + Initialize DOCImageProcessor. + + Args: + directory_path: Image save directory + tag_prefix: Tag prefix for image references + tag_suffix: Tag suffix for image references + storage_backend: Storage backend for saving images + """ + super().__init__( + directory_path=directory_path, + tag_prefix=tag_prefix, + tag_suffix=tag_suffix, + storage_backend=storage_backend, + ) + self._processed_blipuids: Set[str] = set() + + def process_image( + self, + image_data: bytes, + source: Optional[str] = None, + blipuid: Optional[str] = None, + stream_name: Optional[str] = None, + **kwargs + ) -> Optional[str]: + """ + Process and save DOC image data. 
+ + Args: + image_data: Raw image binary data + source: Image source type ("rtf", "ole", "html") + blipuid: RTF BLIP unique ID (for deduplication) + stream_name: OLE stream name + **kwargs: Additional options + + Returns: + Image tag string or None if processing failed + """ + # Custom naming based on source + custom_name = None + + if source == "rtf" and blipuid: + # Use blipuid for RTF images (deduplication key) + if blipuid in self._processed_blipuids: + logger.debug(f"Skipping duplicate RTF image: {blipuid}") + return None + self._processed_blipuids.add(blipuid) + custom_name = f"rtf_{blipuid[:16]}" + elif source == "ole" and stream_name: + # Use stream name for OLE images + import os + custom_name = f"ole_{os.path.basename(stream_name).split('.')[0]}" + elif source == "html": + custom_name = None # Use hash-based naming + + return self.save_image(image_data, custom_name=custom_name) + + def process_ole_image( + self, + image_data: bytes, + stream_name: Optional[str] = None, + **kwargs + ) -> Optional[str]: + """ + Process OLE compound document embedded image. + + Args: + image_data: Raw image binary data from OLE stream + stream_name: Name of the OLE stream + **kwargs: Additional options + + Returns: + Image tag string or None if processing failed + """ + return self.process_image( + image_data, + source="ole", + stream_name=stream_name, + **kwargs + ) + + def process_rtf_image( + self, + image_data: bytes, + blipuid: Optional[str] = None, + **kwargs + ) -> Optional[str]: + """ + Process RTF embedded image. + + Args: + image_data: Raw image binary data from RTF + blipuid: BLIP unique ID for deduplication + **kwargs: Additional options + + Returns: + Image tag string or None if processing failed + """ + return self.process_image( + image_data, + source="rtf", + blipuid=blipuid, + **kwargs + ) + + def process_html_image( + self, + image_data: bytes, + src_attr: Optional[str] = None, + **kwargs + ) -> Optional[str]: + """ + Process HTML-format DOC base64 image. 
+ + Args: + image_data: Decoded image binary data + src_attr: Original src attribute value + **kwargs: Additional options + + Returns: + Image tag string or None if processing failed + """ + return self.process_image( + image_data, + source="html", + **kwargs + ) + + def reset_tracking(self) -> None: + """Reset processed image tracking for new document.""" + self._processed_blipuids.clear() diff --git a/contextifier/core/processor/doc_helpers/doc_preprocessor.py b/contextifier/core/processor/doc_helpers/doc_preprocessor.py new file mode 100644 index 0000000..cd2f5d3 --- /dev/null +++ b/contextifier/core/processor/doc_helpers/doc_preprocessor.py @@ -0,0 +1,83 @@ +# contextifier/core/processor/doc_helpers/doc_preprocessor.py +""" +DOC Preprocessor - Process DOC content after conversion. + +Processing Pipeline Position: + 1. DOCFileConverter.convert() → (converted_obj, DocFormat) + 2. DOCPreprocessor.preprocess() → PreprocessedData (THIS STEP) + 3. Content extraction (depends on format: RTF, OLE, HTML, DOCX) + +Current Implementation: + - Pass-through (DOC delegates to format-specific handlers) +""" +import logging +from typing import Any, Dict + +from contextifier.core.functions.preprocessor import ( + BasePreprocessor, + PreprocessedData, +) + +logger = logging.getLogger("contextify.doc.preprocessor") + + +class DOCPreprocessor(BasePreprocessor): + """ + DOC Document Preprocessor. + + Currently a pass-through implementation as DOC processing + delegates to format-specific handlers (RTF, OLE, HTML, DOCX). + """ + + def preprocess( + self, + converted_data: Any, + **kwargs + ) -> PreprocessedData: + """ + Preprocess the converted DOC content. 
+ + Args: + converted_data: Tuple of (converted_obj, DocFormat) from DOCFileConverter + **kwargs: Additional options + + Returns: + PreprocessedData with the converted object + """ + metadata: Dict[str, Any] = {} + + converted_obj = converted_data + doc_format = None + + # Handle tuple return from DOCFileConverter + if isinstance(converted_data, tuple) and len(converted_data) >= 2: + converted_obj, doc_format = converted_data[0], converted_data[1] + if hasattr(doc_format, 'value'): + metadata['detected_format'] = doc_format.value + else: + metadata['detected_format'] = str(doc_format) + + logger.debug("DOC preprocessor: pass-through, metadata=%s", metadata) + + # clean_content is the TRUE SOURCE - contains the converted object + # For DOC, this is the format-specific object (OLE, BeautifulSoup, etc.) + return PreprocessedData( + raw_content=converted_data, + clean_content=converted_obj, # TRUE SOURCE - the converted object + encoding="utf-8", + extracted_resources={"doc_format": doc_format}, + metadata=metadata, + ) + + def get_format_name(self) -> str: + """Return format name.""" + return "DOC Preprocessor" + + def validate(self, data: Any) -> bool: + """Validate if data is DOC conversion result.""" + if isinstance(data, tuple) and len(data) >= 2: + return True + return data is not None + + +__all__ = ['DOCPreprocessor'] diff --git a/contextifier/core/processor/doc_helpers/rtf_bin_processor.py b/contextifier/core/processor/doc_helpers/rtf_bin_processor.py deleted file mode 100644 index 316d556..0000000 --- a/contextifier/core/processor/doc_helpers/rtf_bin_processor.py +++ /dev/null @@ -1,537 +0,0 @@ -# service/document_processor/processor/doc_helpers/rtf_bin_processor.py -""" -RTF Binary Data Processor - RTF 파일의 바이너리 데이터 처리기 - -RTF 파일 내의 바이너리 이미지 데이터를 처리합니다: -- bin 태그: 직접 바이너리 데이터 (JPEG, PNG, WMF 등) -- pict 그룹: 16진수 인코딩 또는 바이너리 이미지 - -주요 기능: -1. \binN 태그 스킵 (N 바이트의 바이너리 데이터를 건너뜀) -2. \pict 그룹에서 이미지 추출 -3. 
이미지를 로컬에 저장하고 [image:path] 태그로 변환 - -RTF 스펙: -- \binN: N 바이트의 raw 바이너리 데이터가 뒤따름 -- \pict: 이미지 그룹 시작 -- \jpegblip: JPEG 형식 -- \pngblip: PNG 형식 -- \wmetafile: Windows Metafile -- \emfblip: Enhanced Metafile -""" -import logging -import re -import struct -from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, Set, Tuple - -from contextifier.core.functions.img_processor import ImageProcessor - -logger = logging.getLogger("document-processor") - - -# === 이미지 형식 상수 === - -# 매직 넘버로 이미지 형식 판별 -IMAGE_SIGNATURES = { - b'\xff\xd8\xff': 'jpeg', # JPEG - b'\x89PNG\r\n\x1a\n': 'png', # PNG - b'GIF87a': 'gif', # GIF87 - b'GIF89a': 'gif', # GIF89 - b'BM': 'bmp', # BMP - b'\xd7\xcd\xc6\x9a': 'wmf', # WMF (placeable) - b'\x01\x00\x09\x00': 'wmf', # WMF (standard) - b'\x01\x00\x00\x00': 'emf', # EMF -} - -# RTF 이미지 타입 매핑 -RTF_IMAGE_TYPES = { - 'jpegblip': 'jpeg', - 'pngblip': 'png', - 'wmetafile': 'wmf', - 'emfblip': 'emf', - 'dibitmap': 'bmp', - 'wbitmap': 'bmp', -} - - -@dataclass -class RTFBinaryRegion: - """RTF 바이너리 데이터 영역 정보""" - start_pos: int # 원본에서의 시작 위치 (바이트) - end_pos: int # 원본에서의 끝 위치 (바이트) - bin_type: str # "bin" 또는 "pict" - data_size: int # 바이너리 데이터 크기 - image_format: str = "" # 이미지 형식 (jpeg, png, wmf 등) - image_data: bytes = b"" # 추출된 이미지 데이터 - - -@dataclass -class RTFBinaryProcessResult: - """RTF 바이너리 처리 결과""" - clean_content: bytes # 바이너리가 제거/치환된 콘텐츠 - binary_regions: List[RTFBinaryRegion] = field(default_factory=list) - image_tags: Dict[int, str] = field(default_factory=dict) # 위치 -> 이미지 태그 - - -class RTFBinaryProcessor: - """ - RTF 바이너리 데이터 처리기 - - RTF 파일에서 바이너리 이미지 데이터를 추출하고, - 로컬에 저장하여 이미지 태그로 변환합니다. 
- """ - - def __init__( - self, - processed_images: Optional[Set[str]] = None, - image_processor: ImageProcessor = None - ): - """ - Args: - processed_images: 이미 처리된 이미지 해시 집합 (중복 방지) - image_processor: 이미지 처리기 - """ - self.processed_images = processed_images if processed_images is not None else set() - self.image_processor = image_processor - self.binary_regions: List[RTFBinaryRegion] = [] - self.image_tags: Dict[int, str] = {} - - def process(self, content: bytes) -> RTFBinaryProcessResult: - """ - RTF 바이너리 콘텐츠를 처리합니다. - - bin 태그의 바이너리 데이터를 스킵하고, - pict 그룹의 이미지를 추출하여 로컬에 저장합니다. - - Args: - content: RTF 파일 바이너리 콘텐츠 - - Returns: - 처리 결과 (정제된 콘텐츠, 바이너리 영역 정보, 이미지 태그) - """ - self.binary_regions = [] - self.image_tags = {} - - # 1단계: \bin 태그 위치 및 크기 파악 - bin_regions = self._find_bin_regions(content) - - # 2단계: \pict 그룹에서 이미지 추출 (bin 영역 외부) - pict_regions = self._find_pict_regions(content, bin_regions) - - # 3단계: 바이너리 영역 통합 및 정렬 - all_regions = bin_regions + pict_regions - all_regions.sort(key=lambda r: r.start_pos) - self.binary_regions = all_regions - - # 4단계: 이미지 추출 및 로컬 저장 - self._process_images() - - # 5단계: 바이너리 데이터를 제거한 콘텐츠 생성 - clean_content = self._remove_binary_data(content) - - return RTFBinaryProcessResult( - clean_content=clean_content, - binary_regions=self.binary_regions, - image_tags=self.image_tags - ) - - def _find_bin_regions(self, content: bytes) -> List[RTFBinaryRegion]: - """ - \binN 태그를 찾아 바이너리 영역을 식별합니다. - - RTF 스펙에서 binN은 N 바이트의 raw 바이너리 데이터가 뒤따름을 의미합니다. - 이 데이터는 문자열 디코딩 시 깨지므로 건너뛰어야 합니다. - - 중요: bin을 포함하는 상위 shppict 그룹 전체를 제거 영역으로 설정합니다. 
- - Args: - content: RTF 바이너리 콘텐츠 - - Returns: - 바이너리 영역 리스트 - """ - regions = [] - - # \bin 패턴 찾기: \binN (N은 바이트 수) - # RTF에서 \bin 다음의 숫자가 바이트 수를 나타냄 - pattern = rb'\\bin(\d+)' - - for match in re.finditer(pattern, content): - try: - bin_size = int(match.group(1)) - bin_tag_start = match.start() - bin_tag_end = match.end() - - # \bin 태그 다음에 공백이 있을 수 있음 - data_start = bin_tag_end - if data_start < len(content) and content[data_start:data_start+1] == b' ': - data_start += 1 - - data_end = data_start + bin_size - - if data_end <= len(content): - # 바이너리 데이터 추출 - binary_data = content[data_start:data_end] - - # 이미지 형식 감지 - image_format = self._detect_image_format(binary_data) - - # 상위 \shppict 그룹 찾기 - # \bin 위치에서 역방향으로 {\*\shppict 또는 {\\shppict 찾기 - group_start = bin_tag_start - group_end = data_end - - # 역방향으로 \shppict 검색 (최대 500바이트 뒤로) - search_start = max(0, bin_tag_start - 500) - search_area = content[search_start:bin_tag_start] - - # \shppict 찾기 - shppict_pos = search_area.rfind(b'\\shppict') - if shppict_pos != -1: - # 그룹 시작 { 찾기 - abs_pos = search_start + shppict_pos - brace_pos = abs_pos - while brace_pos > 0 and content[brace_pos:brace_pos+1] != b'{': - brace_pos -= 1 - group_start = brace_pos - - # 그룹 끝 } 찾기 (바이너리 데이터 이후) - depth = 1 - j = data_end - while j < len(content) and depth > 0: - if content[j:j+1] == b'{': - depth += 1 - elif content[j:j+1] == b'}': - depth -= 1 - j += 1 - group_end = j - - region = RTFBinaryRegion( - start_pos=group_start, - end_pos=group_end, - bin_type="bin", - data_size=bin_size, - image_format=image_format, - image_data=binary_data - ) - regions.append(region) - - logger.debug( - f"Found \\bin region: group_pos={group_start}-{group_end}, " - f"bin_pos={bin_tag_start}, size={bin_size}, " - f"format={image_format or 'unknown'}" - ) - - except (ValueError, IndexError) as e: - logger.debug(f"Error parsing \\bin tag: {e}") - continue - - logger.info(f"Found {len(regions)} \\bin regions in RTF") - return regions - - def 
_find_pict_regions( - self, - content: bytes, - exclude_regions: List[RTFBinaryRegion] - ) -> List[RTFBinaryRegion]: - """ - pict 그룹에서 16진수 인코딩된 이미지를 찾습니다. - - 주의: pict 그룹이 bin 태그를 포함하는 경우는 이미 _find_bin_regions에서 - 처리되었으므로 여기서는 스킵합니다. - - RTF 이미지 인코딩 방식: - 1. \bin 태그: 직접 바이너리 데이터 (이미 처리됨) - 2. 16진수: \pict ... [hex data] } 형태 - - Args: - content: RTF 바이너리 콘텐츠 - exclude_regions: 제외할 영역 (이미 처리된 \bin 영역) - - Returns: - pict 이미지 영역 리스트 (16진수 인코딩된 것만) - """ - regions = [] - - # \bin 태그 위치 집합 생성 (근처에 \bin이 있는 \pict는 스킵) - bin_tag_positions = set() - for region in exclude_regions: - if region.bin_type == "bin": - bin_tag_positions.add(region.start_pos) - - # 제외 영역을 빠르게 체크하기 위한 집합 생성 - excluded_ranges = [(r.start_pos, r.end_pos) for r in exclude_regions] - - def is_excluded(pos: int) -> bool: - """주어진 위치가 제외 영역에 포함되는지 확인""" - for start, end in excluded_ranges: - if start <= pos < end: - return True - return False - - def has_bin_nearby(pict_pos: int, search_range: int = 200) -> bool: - """ - pict 근처에 bin 태그가 있는지 확인. - pict 그룹이 bin 태그를 포함하면 True 반환. - """ - # \pict 위치부터 search_range 내에 \bin 태그가 있는지 확인 - for bin_pos in bin_tag_positions: - if pict_pos < bin_pos < pict_pos + search_range: - return True - return False - - try: - text_content = content.decode('cp1252', errors='replace') - - # \pict 그룹 찾기 - # 패턴: \pict\jpegblip... 
[hex data]} - pict_start_pattern = r'\\pict\s*((?:\\[a-zA-Z]+\d*\s*)*)' - - for match in re.finditer(pict_start_pattern, text_content): - start_pos = match.start() - - # 제외 영역인지 확인 - if is_excluded(start_pos): - continue - - # 근처에 \bin 태그가 있으면 스킵 (이미 처리됨) - if has_bin_nearby(start_pos): - logger.debug(f"Skipping \\pict at {start_pos} - has \\bin tag nearby") - continue - - attrs = match.group(1) - - # 이미지 타입 확인 - image_format = "" - for rtf_type, fmt in RTF_IMAGE_TYPES.items(): - if rtf_type in attrs: - image_format = fmt - break - - # 16진수 데이터 추출 - # \pict 속성들 다음에 16진수 데이터가 옴 - hex_start = match.end() - hex_data = [] - i = hex_start - - while i < len(text_content): - ch = text_content[i] - if ch in '0123456789abcdefABCDEF': - hex_data.append(ch) - elif ch in ' \t\r\n': - pass # 공백 무시 - elif ch == '}': - break # 그룹 끝 - elif ch == '\\': - # \bin 태그 확인 - if text_content[i:i+4] == '\\bin': - # \bin 태그가 있으면 이 \pict는 스킵 - logger.debug(f"Skipping \\pict at {start_pos} - contains \\bin tag") - hex_data = [] # 데이터 버리기 - break - # 다른 제어 워드까지 스킵 - while i < len(text_content) and text_content[i] not in ' \t\r\n}': - i += 1 - continue - else: - break - i += 1 - - hex_str = ''.join(hex_data) - - # 충분한 16진수 데이터가 있는 경우만 처리 - if len(hex_str) >= 32: # 최소 16바이트 이상 - try: - image_data = bytes.fromhex(hex_str) - - # 이미지 형식이 없으면 데이터에서 감지 - if not image_format: - image_format = self._detect_image_format(image_data) - - # 유효한 이미지인지 확인 - if image_format: - region = RTFBinaryRegion( - start_pos=start_pos, - end_pos=i, - bin_type="pict", - data_size=len(image_data), - image_format=image_format, - image_data=image_data - ) - regions.append(region) - - logger.debug( - f"Found \\pict region (hex): pos={start_pos}, " - f"hex_len={len(hex_str)}, format={image_format}" - ) - except ValueError as e: - logger.debug(f"Failed to decode hex data at {start_pos}: {e}") - - except Exception as e: - logger.warning(f"Error finding \\pict regions: {e}") - - logger.info(f"Found {len(regions)} \\pict regions 
(hex-encoded) in RTF") - return regions - - def _detect_image_format(self, data: bytes) -> str: - """ - 바이너리 데이터의 이미지 형식을 감지합니다. - - Args: - data: 이미지 바이너리 데이터 - - Returns: - 이미지 형식 문자열 (jpeg, png, wmf 등) 또는 빈 문자열 - """ - if not data or len(data) < 4: - return "" - - for signature, format_name in IMAGE_SIGNATURES.items(): - if data.startswith(signature): - return format_name - - # JPEG 확장 체크 (EXIF 헤더 등) - if len(data) >= 3: - if data[0:2] == b'\xff\xd8': - return 'jpeg' - - return "" - - def _process_images(self) -> None: - """ - 추출된 이미지를 로컬에 저장하고 태그를 생성합니다. - """ - for region in self.binary_regions: - if not region.image_data: - continue - - # 지원 가능한 이미지 형식인지 확인 - # WMF, EMF는 PIL에서 지원하지 않을 수 있음 - supported_formats = {'jpeg', 'png', 'gif', 'bmp'} - - if region.image_format in supported_formats: - image_tag = self.image_processor.save_image(region.image_data) - - if image_tag: - self.image_tags[region.start_pos] = f"\n{image_tag}\n" - logger.info( - f"Saved RTF image locally: {image_tag} " - f"(format={region.image_format}, size={region.data_size})" - ) - else: - # 저장 실패 시 빈 태그 (무시됨) - self.image_tags[region.start_pos] = "" - logger.warning(f"Image save failed, removing (pos={region.start_pos})") - else: - # WMF, EMF 등 미지원 형식은 플레이스홀더 - if region.image_format: - logger.debug( - f"Skipping unsupported image format: {region.image_format}" - ) - self.image_tags[region.start_pos] = "" # 빈 태그 (무시) - - def _remove_binary_data(self, content: bytes) -> bytes: - """ - 바이너리 데이터 영역을 제거한 콘텐츠를 생성합니다. - - \bin 태그와 바이너리 데이터를 이미지 태그로 치환하거나 제거합니다. 
- - Args: - content: 원본 RTF 바이너리 콘텐츠 - - Returns: - 정제된 콘텐츠 - """ - if not self.binary_regions: - return content - - # 영역을 역순으로 정렬하여 뒤에서부터 치환 (위치 변경 방지) - sorted_regions = sorted(self.binary_regions, key=lambda r: r.start_pos, reverse=True) - - result = bytearray(content) - - for region in sorted_regions: - # 해당 영역을 빈 바이트로 치환 (완전히 제거) - # 이미지 태그는 나중에 텍스트 레벨에서 삽입 - replacement = b'' - - # 이미지 태그가 있으면 마커 삽입 (나중에 텍스트 처리 시 사용) - if region.start_pos in self.image_tags: - tag = self.image_tags[region.start_pos] - if tag: - # 이미지 태그를 마커로 삽입 (ASCII 안전) - replacement = tag.encode('ascii', errors='replace') - - result[region.start_pos:region.end_pos] = replacement - - return bytes(result) - - def get_image_tag(self, position: int) -> str: - """ - 특정 위치의 이미지 태그를 반환합니다. - - Args: - position: RTF 내 위치 - - Returns: - 이미지 태그 문자열 또는 빈 문자열 - """ - return self.image_tags.get(position, "") - - -def preprocess_rtf_binary( - content: bytes, - processed_images: Optional[Set[str]] = None, - image_processor: ImageProcessor = None -) -> Tuple[bytes, Dict[int, str]]: - """ - RTF 콘텐츠에서 바이너리 데이터를 전처리합니다. - - \bin 태그의 바이너리 데이터를 제거하고, - 이미지는 로컬에 저장하여 태그로 변환합니다. - - 이 함수는 RTF 파서 전에 호출하여 바이너리 데이터로 인한 - 텍스트 깨짐을 방지합니다. - - Args: - content: RTF 파일 바이너리 콘텐츠 - processed_images: 처리된 이미지 해시 집합 (optional) - image_processor: 이미지 처리기 - - Returns: - (정제된 콘텐츠, 위치->이미지태그 딕셔너리) 튜플 - - Example: - >>> with open('file.rtf', 'rb') as f: - ... raw_content = f.read() - >>> clean_content, image_tags = preprocess_rtf_binary(raw_content) - >>> # 이후 RTF 파서에 clean_content 전달 - """ - processor = RTFBinaryProcessor(processed_images, image_processor) - result = processor.process(content) - return result.clean_content, result.image_tags - - -def extract_rtf_images( - content: bytes, - processed_images: Optional[Set[str]] = None, - image_processor: ImageProcessor = None -) -> List[str]: - """ - RTF 콘텐츠에서 모든 이미지를 추출하여 로컬에 저장합니다. 
- - Args: - content: RTF 파일 바이너리 콘텐츠 - processed_images: 처리된 이미지 해시 집합 (optional) - image_processor: 이미지 처리기 - - Returns: - 이미지 태그 리스트 (예: ["[image:bucket/uploads/hash.png]", ...]) - """ - processor = RTFBinaryProcessor(processed_images, image_processor) - result = processor.process(content) - - # 위치순으로 정렬된 이미지 태그 반환 - sorted_tags = sorted(result.image_tags.items(), key=lambda x: x[0]) - return [tag for pos, tag in sorted_tags if tag] diff --git a/contextifier/core/processor/doc_helpers/rtf_constants.py b/contextifier/core/processor/doc_helpers/rtf_constants.py deleted file mode 100644 index c7a7516..0000000 --- a/contextifier/core/processor/doc_helpers/rtf_constants.py +++ /dev/null @@ -1,60 +0,0 @@ -# service/document_processor/processor/doc_helpers/rtf_constants.py -""" -RTF Parser 상수 정의 - -RTF 파싱에 사용되는 상수들을 정의합니다. -""" - -# Shape 속성 이름들 (\sn으로 시작하는 속성들) - 텍스트에서 제거해야 함 -SHAPE_PROPERTY_NAMES = { - 'shapeType', 'fFlipH', 'fFlipV', 'txflTextFlow', 'fFilled', 'fLine', - 'dxTextLeft', 'dxTextRight', 'dyTextTop', 'dyTextBottom', - 'posrelh', 'posrelv', 'fBehindDocument', 'fLayoutInCell', 'fAllowOverlap', - 'fillColor', 'fillBackColor', 'fNoFillHitTest', 'lineColor', 'lineWidth', - 'posh', 'posv', 'fLockAnchor', 'fLockPosition', 'fLockAspectRatio', - 'fLockRotation', 'fLockCropping', 'fLockAgainstGrouping', 'fNoLineDrawDash', - 'wzName', 'wzDescription', 'pWrapPolygonVertices', 'dxWrapDistLeft', - 'dxWrapDistRight', 'dyWrapDistTop', 'dyWrapDistBottom', 'lidRegroup', - 'fEditedWrap', 'fBehindDocument', 'fOnDblClickNotify', 'fIsButton', - 'fOneD', 'fHidden', 'fPrint', 'geoLeft', 'geoTop', 'geoRight', 'geoBottom', - 'shapePath', 'pSegmentInfo', 'pVertices', 'fFillOK', 'fFillShadeShapeOK', - 'fGtextOK', 'fLineOK', 'f3DOK', 'fShadowOK', 'fArrowheadsOK', -} - -# 제외할 destination 키워드들 (본문이 아닌 영역) -EXCLUDE_DESTINATION_KEYWORDS = [ - r'\\header(?:f|l|r)?\b', # 헤더 - r'\\footer(?:f|l|r)?\b', # 푸터 - r'\\footnote\b', # 각주 - r'\\ftnsep\b', r'\\ftnsepc\b', # 각주 구분선 - r'\\aftncn\b', 
r'\\aftnsep\b', r'\\aftnsepc\b', # 미주 - r'\\pntext\b', r'\\pntxta\b', r'\\pntxtb\b', # 번호 매기기 -] - -# 제거할 destination 패턴들 -SKIP_DESTINATIONS = [ - 'themedata', 'colorschememapping', 'latentstyles', 'datastore', - 'xmlnstbl', 'wgrffmtfilter', 'generator', 'mmathPr', 'xmlopen', - 'background', 'pgptbl', 'listpicture', 'pnseclvl', 'revtbl', - 'bkmkstart', 'bkmkend', 'fldinst', 'objdata', 'objclass', - 'objemb', 'result', 'category', 'comment', 'company', 'creatim', - 'doccomm', 'hlinkbase', 'keywords', 'manager', 'operator', - 'revtim', 'subject', 'title', 'userprops', - 'nonshppict', 'blipuid', 'picprop', -] - -# 이미지 관련 destination -IMAGE_DESTINATIONS = ['shppict'] - -# 코드 페이지 -> 인코딩 매핑 -CODEPAGE_ENCODING_MAP = { - 949: 'cp949', - 932: 'cp932', - 936: 'gb2312', - 950: 'big5', - 1252: 'cp1252', - 65001: 'utf-8', -} - -# 기본 인코딩 시도 순서 -DEFAULT_ENCODINGS = ['cp949', 'utf-8', 'cp1252', 'latin-1'] diff --git a/contextifier/core/processor/doc_helpers/rtf_metadata_extractor.py b/contextifier/core/processor/doc_helpers/rtf_metadata_extractor.py deleted file mode 100644 index b456e8f..0000000 --- a/contextifier/core/processor/doc_helpers/rtf_metadata_extractor.py +++ /dev/null @@ -1,78 +0,0 @@ -# service/document_processor/processor/doc_helpers/rtf_metadata_extractor.py -""" -RTF 메타데이터 추출기 - -RTF 문서에서 메타데이터를 추출하는 기능을 제공합니다. -""" -import logging -import re -from datetime import datetime -from typing import Any, Dict - -from contextifier.core.processor.doc_helpers.rtf_decoder import ( - decode_hex_escapes, -) -from contextifier.core.processor.doc_helpers.rtf_text_cleaner import ( - clean_rtf_text, -) - -logger = logging.getLogger("document-processor") - - -def extract_metadata(content: str, encoding: str = "cp949") -> Dict[str, Any]: - """ - RTF 콘텐츠에서 메타데이터를 추출합니다. 
- - Args: - content: RTF 문자열 콘텐츠 - encoding: 사용할 인코딩 - - Returns: - 메타데이터 딕셔너리 - """ - metadata = {} - - # \info 그룹 찾기 - info_match = re.search(r'\\info\s*\{([^}]*(?:\{[^}]*\}[^}]*)*)\}', content) - if info_match: - info_content = info_match.group(1) - - # 각 메타데이터 필드 추출 - field_patterns = { - 'title': r'\\title\s*\{([^}]*)\}', - 'subject': r'\\subject\s*\{([^}]*)\}', - 'author': r'\\author\s*\{([^}]*)\}', - 'keywords': r'\\keywords\s*\{([^}]*)\}', - 'comments': r'\\doccomm\s*\{([^}]*)\}', - 'last_saved_by': r'\\operator\s*\{([^}]*)\}', - } - - for key, pattern in field_patterns.items(): - match = re.search(pattern, info_content) - if match: - value = decode_hex_escapes(match.group(1), encoding) - value = clean_rtf_text(value, encoding) - if value: - metadata[key] = value - - # 날짜 추출 - date_patterns = { - 'create_time': r'\\creatim\\yr(\d+)\\mo(\d+)\\dy(\d+)(?:\\hr(\d+))?(?:\\min(\d+))?', - 'last_saved_time': r'\\revtim\\yr(\d+)\\mo(\d+)\\dy(\d+)(?:\\hr(\d+))?(?:\\min(\d+))?', - } - - for key, pattern in date_patterns.items(): - match = re.search(pattern, content) - if match: - try: - year = int(match.group(1)) - month = int(match.group(2)) - day = int(match.group(3)) - hour = int(match.group(4)) if match.group(4) else 0 - minute = int(match.group(5)) if match.group(5) else 0 - metadata[key] = datetime(year, month, day, hour, minute) - except (ValueError, TypeError): - pass - - logger.debug(f"Extracted metadata: {list(metadata.keys())}") - return metadata diff --git a/contextifier/core/processor/doc_helpers/rtf_models.py b/contextifier/core/processor/doc_helpers/rtf_models.py deleted file mode 100644 index 0a0ee86..0000000 --- a/contextifier/core/processor/doc_helpers/rtf_models.py +++ /dev/null @@ -1,364 +0,0 @@ -# service/document_processor/processor/doc_helpers/rtf_models.py -""" -RTF Parser 데이터 모델 - -RTF 파싱에 사용되는 데이터 클래스들을 정의합니다. 
-""" -import re -from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, NamedTuple, Tuple - - -class RTFCellInfo(NamedTuple): - """RTF 셀 정보 (병합 정보 포함)""" - text: str # 셀 텍스트 - h_merge_first: bool # 수평 병합 첫 번째 셀 (clmgf) - h_merge_cont: bool # 수평 병합 연속 셀 (clmrg) - v_merge_first: bool # 수직 병합 첫 번째 셀 (clvmgf) - v_merge_cont: bool # 수직 병합 연속 셀 (clvmrg) - right_boundary: int # 셀 오른쪽 경계 (twips) - - -@dataclass -class RTFTable: - """RTF 테이블 구조 (병합 셀 지원)""" - rows: List[List[RTFCellInfo]] = field(default_factory=list) - col_count: int = 0 - position: int = 0 # 문서 내 시작 위치 - end_position: int = 0 # 문서 내 종료 위치 - _logical_cells: List[List[Optional[RTFCellInfo]]] = field(default_factory=list, repr=False) - - def get_effective_col_count(self) -> int: - """ - 실제 유효한 열 수를 계산합니다. - 빈 셀만 있는 열은 제외합니다. - - Returns: - 실제 내용이 있는 최대 열 수 - """ - if not self.rows: - return 0 - - effective_counts = [] - for row in self.rows: - # 빈 셀과 병합된 셀을 제외한 유효 셀 수 계산 - non_empty_cells = [] - for i, cell in enumerate(row): - # 병합으로 건너뛰는 셀 제외 - if cell.h_merge_cont: - continue - # 내용이 있거나 수직 병합 시작인 경우 유효 - if cell.text.strip() or cell.v_merge_first: - non_empty_cells.append(i) - - if non_empty_cells: - # 마지막 유효 셀의 인덱스 + 1 - effective_counts.append(max(non_empty_cells) + 1) - - return max(effective_counts) if effective_counts else 0 - - def is_real_table(self) -> bool: - """ - 실제 테이블인지 판단합니다. - - n rows × 1 column 형태는 테이블이 아닌 단순 리스트로 간주합니다. - 빈 셀만 있는 열은 열 수에서 제외합니다. - - Returns: - True if 실제 테이블 (유효 열이 2개 이상), False otherwise - """ - if not self.rows: - return False - - # 유효 열 수로 판단 - effective_cols = self.get_effective_col_count() - return effective_cols >= 2 - - def _calculate_merge_info(self) -> List[List[Tuple[int, int]]]: - """ - 각 셀의 colspan, rowspan을 계산합니다. - - RTF의 병합 처리: - 1. 명시적 병합 플래그 (clmgf/clmrg, clvmgf/clvmrg) 사용 - 2. 
열 경계(cellx) 값을 기반으로 암시적 colspan 계산 - - 테이블 전체의 고유 열 경계를 수집 - - 각 행의 셀이 몇 개의 논리적 열을 차지하는지 계산 - - Returns: - 각 셀별 (colspan, rowspan) 정보 2D 리스트 - (0, 0)은 이 셀이 다른 셀에 병합되어 건너뛰어야 함을 의미 - """ - if not self.rows: - return [] - - num_rows = len(self.rows) - - # 1단계: 전체 테이블의 고유 열 경계 수집 - all_boundaries = set() - for row in self.rows: - for cell in row: - if cell.right_boundary > 0: - all_boundaries.add(cell.right_boundary) - - # 정렬된 열 경계 리스트 - sorted_boundaries = sorted(all_boundaries) - total_logical_cols = len(sorted_boundaries) - - if total_logical_cols == 0: - # 열 경계 정보가 없으면 기본 처리 - max_cols = max(len(row) for row in self.rows) if self.rows else 0 - return [[(1, 1) for _ in range(max_cols)] for _ in range(num_rows)] - - # 경계값 -> 논리적 열 인덱스 매핑 - boundary_to_col = {b: i for i, b in enumerate(sorted_boundaries)} - - # 2단계: 각 행별로 셀의 colspan 계산 - # merge_info[row][logical_col] = (colspan, rowspan) 또는 (0, 0) - merge_info = [[None for _ in range(total_logical_cols)] for _ in range(num_rows)] - - for row_idx, row in enumerate(self.rows): - prev_boundary = 0 - for cell in row: - if cell.right_boundary <= 0: - continue - - # 이 셀이 차지하는 논리적 열 범위 계산 - start_col = 0 - for i, b in enumerate(sorted_boundaries): - if b <= prev_boundary: - start_col = i + 1 - else: - break - - end_col = boundary_to_col[cell.right_boundary] - colspan = end_col - start_col + 1 - - if colspan <= 0: - colspan = 1 - - # 시작 열에 셀 정보 기록 - if start_col < total_logical_cols: - merge_info[row_idx][start_col] = (colspan, 1, cell) - # 병합된 열들은 (0, 0)으로 표시 - for col in range(start_col + 1, start_col + colspan): - if col < total_logical_cols: - merge_info[row_idx][col] = (0, 0, None) - - prev_boundary = cell.right_boundary - - # 3단계: 수직 병합 (rowspan) 처리 - for col_idx in range(total_logical_cols): - row_idx = 0 - while row_idx < num_rows: - info = merge_info[row_idx][col_idx] - if info is None or len(info) < 3 or info[2] is None: - row_idx += 1 - continue - - colspan, _, cell = info - if colspan == 0: - row_idx += 1 - 
continue - - if cell.v_merge_first: - # 수직 병합 시작 - rowspan = 1 - for next_row in range(row_idx + 1, num_rows): - next_info = merge_info[next_row][col_idx] - if next_info is None or len(next_info) < 3 or next_info[2] is None: - break - _, _, next_cell = next_info - if next_cell.v_merge_cont: - rowspan += 1 - merge_info[next_row][col_idx] = (0, 0, None) - else: - break - - merge_info[row_idx][col_idx] = (colspan, rowspan, cell) - row_idx += rowspan - elif cell.v_merge_cont: - merge_info[row_idx][col_idx] = (0, 0, None) - row_idx += 1 - else: - row_idx += 1 - - # 4단계: 최종 결과 (colspan, rowspan)만 반환 - result = [] - for row_idx in range(num_rows): - row_result = [] - for col_idx in range(total_logical_cols): - info = merge_info[row_idx][col_idx] - if info is None: - row_result.append((1, 1)) - elif len(info) >= 2: - row_result.append((info[0], info[1])) - else: - row_result.append((1, 1)) - result.append(row_result) - - # 실제 셀 데이터도 저장 (to_html에서 사용) - self._logical_cells = [] - for row_idx in range(num_rows): - row_cells = [] - for col_idx in range(total_logical_cols): - info = merge_info[row_idx][col_idx] - if info is not None and len(info) >= 3 and info[2] is not None: - row_cells.append(info[2]) - else: - row_cells.append(None) - self._logical_cells.append(row_cells) - - return result - - def to_html(self) -> str: - """테이블을 HTML로 변환 (병합 셀 지원)""" - if not self.rows: - return "" - - merge_info = self._calculate_merge_info() - - # _logical_cells가 없으면 기존 방식 사용 - if not hasattr(self, '_logical_cells') or not self._logical_cells: - return self._to_html_legacy(merge_info) - - html_parts = [''] - - for row_idx, row_merge in enumerate(merge_info): - html_parts.append('') - - for col_idx, (colspan, rowspan) in enumerate(row_merge): - if colspan == 0 or rowspan == 0: - continue - - cell = self._logical_cells[row_idx][col_idx] if col_idx < len(self._logical_cells[row_idx]) else None - cell_text = cell.text if cell and cell.text else '' - - attrs = [] - if colspan > 1: - 
attrs.append(f'colspan="{colspan}"') - if rowspan > 1: - attrs.append(f'rowspan="{rowspan}"') - - attr_str = ' ' + ' '.join(attrs) if attrs else '' - html_parts.append(f'{cell_text}') - - html_parts.append('') - - html_parts.append('
') - return '\n'.join(html_parts) - - def _to_html_legacy(self, merge_info: List[List[Tuple[int, int]]]) -> str: - """기존 HTML 변환 (열 경계 정보 없을 때)""" - html_parts = [''] - - for row_idx, row in enumerate(self.rows): - html_parts.append('') - - for col_idx, cell in enumerate(row): - # 병합 정보 확인 - if col_idx < len(merge_info[row_idx]) and merge_info[row_idx][col_idx]: - colspan, rowspan = merge_info[row_idx][col_idx] - - if colspan == 0 and rowspan == 0: - # 이 셀은 다른 셀에 병합됨 - 건너뜀 - continue - - # 셀 내용 정리 - cell_text = re.sub(r'\s+', ' ', cell.text).strip() - - # 속성 생성 - attrs = [] - if colspan > 1: - attrs.append(f'colspan="{colspan}"') - if rowspan > 1: - attrs.append(f'rowspan="{rowspan}"') - - attr_str = ' ' + ' '.join(attrs) if attrs else '' - html_parts.append(f'{cell_text}') - else: - # 병합 정보 없음 - 일반 셀 - cell_text = re.sub(r'\s+', ' ', cell.text).strip() - html_parts.append(f'') - - html_parts.append('') - - html_parts.append('
{cell_text}
') - return '\n'.join(html_parts) - - def to_text_list(self) -> str: - """ - 1열 테이블을 텍스트 리스트로 변환합니다. - - - 1×1 테이블: 셀 내용만 반환 (컨테이너 테이블) - - n×1 테이블: 각 행을 빈 줄로 구분하여 반환 - - Returns: - 텍스트 형식의 문자열 - """ - if not self.rows: - return "" - - # 1×1 테이블: 셀 내용만 반환 (컨테이너 테이블) - if len(self.rows) == 1 and len(self.rows[0]) == 1: - return self.rows[0][0].text - - lines = [] - for row in self.rows: - if row: - # 첫 번째 셀만 사용 (1열 테이블) - cell_text = row[0].text - if cell_text: - lines.append(cell_text) - - # 빈 줄로 구분 - return '\n\n'.join(lines) - - -@dataclass -class RTFContentPart: - """문서 내 콘텐츠 조각 (텍스트 또는 테이블)""" - content_type: str # "text" 또는 "table" - position: int # 원본 문서 내 위치 - text: str = "" # content_type이 "text"인 경우 - table: Optional['RTFTable'] = None # content_type이 "table"인 경우 - - -@dataclass -class RTFDocument: - """RTF 문서 구조""" - text_content: str = "" - tables: List[RTFTable] = field(default_factory=list) - metadata: Dict[str, Any] = field(default_factory=dict) - images: List[bytes] = field(default_factory=list) - image_tags: List[str] = field(default_factory=list) # v3: 로컬 저장된 이미지 태그 - encoding: str = "cp949" - # v2: 인라인 콘텐츠 - 원래 순서대로 정렬된 콘텐츠 조각들 - content_parts: List[RTFContentPart] = field(default_factory=list) - - def get_inline_content(self) -> str: - """ - 테이블이 원래 위치에 인라인으로 배치된 전체 콘텐츠를 반환합니다. 
- - Returns: - 인라인 배치된 전체 텍스트 - """ - if not self.content_parts: - # 호환성: content_parts가 없으면 기존 방식으로 반환 - return self.text_content - - # 위치순 정렬 - sorted_parts = sorted(self.content_parts, key=lambda p: p.position) - - result_parts = [] - for part in sorted_parts: - if part.content_type == "text" and part.text.strip(): - result_parts.append(part.text) - elif part.content_type == "table" and part.table: - if part.table.is_real_table(): - result_parts.append(part.table.to_html()) - else: - text_list = part.table.to_text_list() - if text_list: - result_parts.append(text_list) - - return '\n\n'.join(result_parts) diff --git a/contextifier/core/processor/doc_helpers/rtf_parser.py b/contextifier/core/processor/doc_helpers/rtf_parser.py deleted file mode 100644 index 56a047f..0000000 --- a/contextifier/core/processor/doc_helpers/rtf_parser.py +++ /dev/null @@ -1,200 +0,0 @@ -# service/document_processor/processor/doc_helpers/rtf_parser.py -""" -RTF Parser - RTF 파일 바이너리 직접 파싱 (리팩터링 버전) - -LibreOffice 없이 RTF 파일을 직접 분석하여: -- 텍스트 추출 (원래 위치 유지) -- 테이블을 HTML로 변환 (인라인 배치) -- 병합 셀 처리 (clmgf/clmrg/clvmgf/clvmrg) -- 메타데이터 추출 -- 이미지 추출 - -RTF 1.5+ 스펙 기반 구현 - -이 파일은 기능별로 분리된 모듈들을 조합하여 사용합니다: -- rtf_constants.py: 상수 정의 -- rtf_models.py: 데이터 모델 (RTFCellInfo, RTFTable, RTFContentPart, RTFDocument) -- rtf_decoder.py: 인코딩/디코딩 유틸리티 -- rtf_text_cleaner.py: 텍스트 정리 유틸리티 -- rtf_metadata_extractor.py: 메타데이터 추출 -- rtf_table_extractor.py: 테이블 추출/파싱 -- rtf_content_extractor.py: 인라인 콘텐츠 추출 -- rtf_region_finder.py: 제외 영역 탐색 -- rtf_bin_processor.py: 바이너리 전처리 -""" -import logging -from typing import Optional, Set - -from contextifier.core.functions.img_processor import ImageProcessor - -# 모델 임포트 (외부에서 사용할 수 있도록) -from contextifier.core.processor.doc_helpers.rtf_models import ( - RTFCellInfo, - RTFTable, - RTFContentPart, - RTFDocument, -) - -# 디코더 임포트 -from contextifier.core.processor.doc_helpers.rtf_decoder import ( - detect_encoding, - decode_content, - decode_hex_escapes, -) - -# 텍스트 클리너 임포트 -from 
contextifier.core.processor.doc_helpers.rtf_text_cleaner import ( - clean_rtf_text, - remove_shprslt_blocks, -) - -# 메타데이터 추출기 임포트 -from contextifier.core.processor.doc_helpers.rtf_metadata_extractor import ( - extract_metadata, -) - -# 테이블 추출기 임포트 -from contextifier.core.processor.doc_helpers.rtf_table_extractor import ( - extract_tables_with_positions, -) - -# 콘텐츠 추출기 임포트 -from contextifier.core.processor.doc_helpers.rtf_content_extractor import ( - extract_inline_content, - extract_text_legacy, -) - -# 바이너리 처리기 임포트 -from contextifier.core.processor.doc_helpers.rtf_bin_processor import ( - preprocess_rtf_binary, -) - -logger = logging.getLogger("document-processor") - - -class RTFParser: - """ - RTF 파일 파서 (리팩터링 버전) - - RTF 바이너리를 직접 파싱하여 텍스트, 테이블, 메타데이터를 추출합니다. - - 기능별로 분리된 모듈들을 조합하여 사용합니다. - """ - - def __init__( - self, - encoding: str = "cp949", - processed_images: Optional[Set[str]] = None, - image_processor: ImageProcessor = None - ): - """ - Args: - encoding: 기본 인코딩 (한글 문서는 보통 cp949) - processed_images: 처리된 이미지 해시 집합 (중복 방지) - image_processor: 이미지 처리기 - """ - self.encoding = encoding - self.processed_images = processed_images if processed_images is not None else set() - self.image_processor = image_processor - self.document = RTFDocument(encoding=encoding) - - # 파싱 상태 - self._content: str = "" - self._raw_content: bytes = b"" # 원본 바이너리 - self._image_tags = {} # 위치 -> 이미지 태그 - - def parse(self, content: bytes) -> RTFDocument: - """ - RTF 바이너리를 파싱합니다. - - Args: - content: RTF 파일 바이트 데이터 - - Returns: - 파싱된 RTFDocument 객체 - """ - self._raw_content = content - - # 바이너리 데이터 전처리 (\bin 태그 처리, 이미지 추출) - clean_content, self._image_tags = preprocess_rtf_binary( - content, - processed_images=self.processed_images, - image_processor=self.image_processor - ) - - # 이미지 태그를 문서에 저장 (유효한 태그만) - self.document.image_tags = [ - tag for tag in self._image_tags.values() - if tag and tag.strip() and '/uploads/.' 
not in tag - ] - - # 인코딩 감지 및 디코딩 - self.encoding = detect_encoding(clean_content, self.encoding) - self._content = decode_content(clean_content, self.encoding) - - # \shprslt 블록 제거 (중복 콘텐츠 방지) - self._content = remove_shprslt_blocks(self._content) - - # 메타데이터 추출 - self.document.metadata = extract_metadata(self._content, self.encoding) - - # 테이블 추출 (위치 정보 포함) - tables, table_regions = extract_tables_with_positions( - self._content, - self.encoding - ) - self.document.tables = tables - - # 인라인 콘텐츠 추출 (테이블 위치 유지) - self.document.content_parts = extract_inline_content( - self._content, - table_regions, - self.encoding - ) - - # 호환성을 위해 기존 text_content도 설정 - self.document.text_content = extract_text_legacy( - self._content, - self.encoding - ) - - return self.document - - -def parse_rtf( - content: bytes, - encoding: str = "cp949", - processed_images: Optional[Set[str]] = None, - image_processor: ImageProcessor = None -) -> RTFDocument: - """ - RTF 파일을 파싱합니다. - - 바이너리 이미지 데이터를 로컬에 저장하고 태그로 변환합니다. - - Args: - content: RTF 파일 바이트 데이터 - encoding: 기본 인코딩 - processed_images: 처리된 이미지 해시 집합 (중복 방지, optional) - image_processor: 이미지 처리기 - - Returns: - 파싱된 RTFDocument 객체 - """ - parser = RTFParser( - encoding=encoding, - processed_images=processed_images, - image_processor=image_processor - ) - return parser.parse(content) - - -# 하위 호환성을 위한 re-export -__all__ = [ - 'RTFParser', - 'RTFDocument', - 'RTFTable', - 'RTFCellInfo', - 'RTFContentPart', - 'parse_rtf', -] diff --git a/contextifier/core/processor/doc_helpers/rtf_region_finder.py b/contextifier/core/processor/doc_helpers/rtf_region_finder.py deleted file mode 100644 index 946ade0..0000000 --- a/contextifier/core/processor/doc_helpers/rtf_region_finder.py +++ /dev/null @@ -1,121 +0,0 @@ -# service/document_processor/processor/doc_helpers/rtf_region_finder.py -""" -RTF 영역 탐색기 - -RTF 문서에서 제외해야 할 영역(헤더, 푸터, 각주 등)을 찾는 기능을 제공합니다. 
-""" -import logging -import re -from typing import List, Tuple - -from contextifier.core.processor.doc_helpers.rtf_constants import ( - EXCLUDE_DESTINATION_KEYWORDS, -) - -logger = logging.getLogger("document-processor") - - -def find_excluded_regions(content: str) -> List[Tuple[int, int]]: - r""" - 문서 본문이 아닌 제외 영역을 찾습니다. - - RTF에서 \header, \footer, \footnote 등의 그룹은 본문이 아니므로 - 테이블 및 텍스트 추출에서 제외해야 합니다. - - 주의: RTF 테이블은 \trowd에서 시작하여 \row로 끝나는데, - footer/header 그룹이 \trowd만 포함하고 셀 내용과 \row는 그룹 밖에 - 있을 수 있습니다. 따라서 footer/header 그룹 안에서 시작하는 테이블의 - 전체 범위(\row까지)를 제외해야 합니다. - - 제외 대상: - - \header, \headerf, \headerl, \headerr (헤더) - - \footer, \footerf, \footerl, \footerr (푸터) - - \footnote, \ftnsep, \ftnsepc, \aftncn, \aftnsep, \aftnsepc (각주) - - \pntext, \pntxta, \pntxtb (번호 매기기 텍스트) - - 위 그룹 안에서 시작하는 테이블의 전체 범위 (\trowd ~ \row) - - Args: - content: RTF 콘텐츠 - - Returns: - 제외 영역 리스트 [(start, end), ...] - """ - excluded_regions = [] - - pattern = '|'.join(EXCLUDE_DESTINATION_KEYWORDS) - - for match in re.finditer(pattern, content): - keyword_start = match.start() - keyword_end = match.end() - - # 이 키워드가 속한 그룹의 시작점('{') 찾기 - group_start = keyword_start - search_back = min(keyword_start, 50) # 최대 50자 뒤로 검색 - for i in range(keyword_start - 1, keyword_start - search_back - 1, -1): - if i < 0: - break - if content[i] == '{': - group_start = i - break - elif content[i] == '}': - # 다른 그룹이 끝났으면 중단 - break - - # 그룹의 끝('}') 찾기 - 중첩 괄호 처리 - depth = 1 - i = keyword_end - while i < len(content) and depth > 0: - if content[i] == '{': - depth += 1 - elif content[i] == '}': - depth -= 1 - i += 1 - group_end = i - - # footer/header 그룹 안에 \trowd가 있으면, \row까지 확장 - group_content = content[group_start:group_end] - if '\\trowd' in group_content: - # 이 그룹 끝 이후에 매칭되는 \row 찾기 - row_match = re.search(r'\\row(?![a-z])', content[group_end:]) - if row_match: - # \row의 끝까지 제외 영역 확장 - extended_end = group_end + row_match.end() - group_end = extended_end - logger.debug(f"Extended excluded region to 
include table row: {group_start}~{group_end}") - - excluded_regions.append((group_start, group_end)) - - # 겹치는 영역 병합 및 정렬 - if not excluded_regions: - return [] - - excluded_regions.sort(key=lambda x: x[0]) - merged = [excluded_regions[0]] - - for start, end in excluded_regions[1:]: - last_start, last_end = merged[-1] - if start <= last_end: - # 겹치면 병합 - merged[-1] = (last_start, max(last_end, end)) - else: - merged.append((start, end)) - - logger.debug(f"Found {len(merged)} excluded regions (header/footer/footnote)") - return merged - - -def is_in_excluded_region(pos: int, excluded_regions: List[Tuple[int, int]]) -> bool: - """ - 주어진 위치가 제외 영역 안에 있는지 확인합니다. - - Args: - pos: 확인할 위치 - excluded_regions: 제외 영역 리스트 - - Returns: - 제외 영역 안에 있으면 True - """ - for start, end in excluded_regions: - if start <= pos < end: - return True - return False diff --git a/contextifier/core/processor/doc_helpers/rtf_table_extractor.py b/contextifier/core/processor/doc_helpers/rtf_table_extractor.py deleted file mode 100644 index 27de72d..0000000 --- a/contextifier/core/processor/doc_helpers/rtf_table_extractor.py +++ /dev/null @@ -1,307 +0,0 @@ -# service/document_processor/processor/doc_helpers/rtf_table_extractor.py -""" -RTF 테이블 추출기 - -RTF 문서에서 테이블을 추출하고 파싱하는 기능을 제공합니다. -""" -import logging -import re -from typing import List, Optional, Tuple - -from contextifier.core.processor.doc_helpers.rtf_models import ( - RTFCellInfo, - RTFTable, -) -from contextifier.core.processor.doc_helpers.rtf_decoder import ( - decode_hex_escapes, -) -from contextifier.core.processor.doc_helpers.rtf_text_cleaner import ( - clean_rtf_text, -) -from contextifier.core.processor.doc_helpers.rtf_region_finder import ( - find_excluded_regions, - is_in_excluded_region, -) - -logger = logging.getLogger("document-processor") - - -def extract_tables_with_positions( - content: str, - encoding: str = "cp949" -) -> Tuple[List[RTFTable], List[Tuple[int, int, RTFTable]]]: - r""" - RTF에서 테이블을 추출합니다 (위치 정보 포함). 
- - RTF 테이블 구조: - - \trowd: 테이블 행 시작 (row definition) - - \cellx: 셀 경계 위치 정의 - - \clmgf: 수평 병합 시작 - - \clmrg: 수평 병합 계속 - - \clvmgf: 수직 병합 시작 - - \clvmrg: 수직 병합 계속 - - \intbl: 셀 내 단락 - - \cell: 셀 끝 - - \row: 행 끝 - - Args: - content: RTF 문자열 콘텐츠 - encoding: 사용할 인코딩 - - Returns: - (테이블 리스트, 테이블 영역 리스트) 튜플 - """ - tables = [] - table_regions = [] - - # 제외 영역 찾기 (header, footer, footnote 등) - excluded_regions = find_excluded_regions(content) - - # 1단계: \row로 끝나는 모든 위치 찾기 - row_positions = [] - for match in re.finditer(r'\\row(?![a-z])', content): - row_positions.append(match.end()) - - if not row_positions: - return tables, table_regions - - # 2단계: 각 \row 전에 있는 \trowd 찾기 (해당 행의 시작) - all_rows = [] - for i, row_end in enumerate(row_positions): - # 이전 \row 위치 또는 시작점 - if i == 0: - search_start = 0 - else: - search_start = row_positions[i - 1] - - # 이 영역에서 첫 번째 \trowd 찾기 - segment = content[search_start:row_end] - trowd_match = re.search(r'\\trowd', segment) - - if trowd_match: - row_start = search_start + trowd_match.start() - - # 제외 영역(header/footer/footnote) 안에 있는 행은 무시 - if is_in_excluded_region(row_start, excluded_regions): - logger.debug(f"Skipping table row at {row_start} (in header/footer/footnote)") - continue - - row_text = content[row_start:row_end] - all_rows.append((row_start, row_end, row_text)) - - if not all_rows: - return tables, table_regions - - # 연속된 행들을 테이블로 그룹화 - table_groups = [] # [(start_pos, end_pos, [row_texts])] - current_table = [] - current_start = -1 - current_end = -1 - prev_end = -1 - - for row_start, row_end, row_text in all_rows: - # 이전 행과 150자 이내면 같은 테이블 - if prev_end == -1 or row_start - prev_end < 150: - if current_start == -1: - current_start = row_start - current_table.append(row_text) - current_end = row_end - else: - if current_table: - table_groups.append((current_start, current_end, current_table)) - current_table = [row_text] - current_start = row_start - current_end = row_end - prev_end = row_end - - if current_table: - 
table_groups.append((current_start, current_end, current_table)) - - logger.info(f"Found {len(table_groups)} table groups") - - # 각 테이블 그룹 파싱 - for start_pos, end_pos, table_rows in table_groups: - table = _parse_table_with_merge(table_rows, encoding) - if table and table.rows: - table.position = start_pos - table.end_position = end_pos - tables.append(table) - table_regions.append((start_pos, end_pos, table)) - - logger.info(f"Extracted {len(tables)} tables") - return tables, table_regions - - -def _parse_table_with_merge(rows: List[str], encoding: str = "cp949") -> Optional[RTFTable]: - """ - 테이블 행들을 파싱하여 RTFTable 객체로 변환 (병합 셀 지원) - - Args: - rows: 테이블 행 텍스트 리스트 - encoding: 사용할 인코딩 - - Returns: - RTFTable 객체 - """ - table = RTFTable() - - for row_text in rows: - cells = _extract_cells_with_merge(row_text, encoding) - if cells: - table.rows.append(cells) - if len(cells) > table.col_count: - table.col_count = len(cells) - - return table if table.rows else None - - -def _extract_cells_with_merge(row_text: str, encoding: str = "cp949") -> List[RTFCellInfo]: - """ - 테이블 행에서 셀 내용과 병합 정보를 추출합니다. 
- - Args: - row_text: 테이블 행 RTF 텍스트 - encoding: 사용할 인코딩 - - Returns: - RTFCellInfo 리스트 - """ - cells = [] - - # 1단계: 셀 정의 파싱 (cellx 전까지의 속성들) - cell_defs = [] - - # \cell 다음에 x가 오지 않는 첫 번째 \cell 찾기 - first_cell_idx = -1 - pos = 0 - while True: - idx = row_text.find('\\cell', pos) - if idx == -1: - first_cell_idx = len(row_text) - break - # \cell 다음이 x인지 확인 (\cellx는 건너뜀) - if idx + 5 < len(row_text) and row_text[idx + 5] == 'x': - pos = idx + 1 - continue - first_cell_idx = idx - break - - def_part = row_text[:first_cell_idx] - - current_def = { - 'h_merge_first': False, - 'h_merge_cont': False, - 'v_merge_first': False, - 'v_merge_cont': False, - 'right_boundary': 0 - } - - cell_def_pattern = r'\\cl(?:mgf|mrg|vmgf|vmrg)|\\cellx(-?\d+)' - - for match in re.finditer(cell_def_pattern, def_part): - token = match.group() - if token == '\\clmgf': - current_def['h_merge_first'] = True - elif token == '\\clmrg': - current_def['h_merge_cont'] = True - elif token == '\\clvmgf': - current_def['v_merge_first'] = True - elif token == '\\clvmrg': - current_def['v_merge_cont'] = True - elif token.startswith('\\cellx'): - if match.group(1): - current_def['right_boundary'] = int(match.group(1)) - cell_defs.append(current_def.copy()) - # 다음 셀을 위해 초기화 - current_def = { - 'h_merge_first': False, - 'h_merge_cont': False, - 'v_merge_first': False, - 'v_merge_cont': False, - 'right_boundary': 0 - } - - # 2단계: 셀 내용 추출 - cell_texts = _extract_cell_texts(row_text, encoding) - - # 3단계: 셀 정의와 내용 매칭 - for i, cell_text in enumerate(cell_texts): - if i < len(cell_defs): - cell_def = cell_defs[i] - else: - cell_def = { - 'h_merge_first': False, - 'h_merge_cont': False, - 'v_merge_first': False, - 'v_merge_cont': False, - 'right_boundary': 0 - } - - cells.append(RTFCellInfo( - text=cell_text, - h_merge_first=cell_def['h_merge_first'], - h_merge_cont=cell_def['h_merge_cont'], - v_merge_first=cell_def['v_merge_first'], - v_merge_cont=cell_def['v_merge_cont'], - 
right_boundary=cell_def['right_boundary'] - )) - - return cells - - -def _extract_cell_texts(row_text: str, encoding: str = "cp949") -> List[str]: - r""" - 행에서 셀 텍스트만 추출합니다. - - Args: - row_text: 테이블 행 RTF 텍스트 - encoding: 사용할 인코딩 - - Returns: - 셀 텍스트 리스트 - """ - cell_texts = [] - - # 1단계: 모든 \cell 위치 찾기 (cellx가 아닌 순수 \cell만) - cell_positions = [] - pos = 0 - while True: - idx = row_text.find('\\cell', pos) - if idx == -1: - break - # \cell 다음이 x인지 확인 - next_pos = idx + 5 - if next_pos < len(row_text) and row_text[next_pos] == 'x': - pos = idx + 1 - continue - cell_positions.append(idx) - pos = idx + 1 - - if not cell_positions: - return cell_texts - - # 2단계: 첫 번째 \cell 위치 이전에서 마지막 \cellx 찾기 - first_cell_pos = cell_positions[0] - def_part = row_text[:first_cell_pos] - - last_cellx_end = 0 - for match in re.finditer(r'\\cellx-?\d+', def_part): - last_cellx_end = match.end() - - if last_cellx_end == 0: - last_cellx_end = 0 - - # 3단계: 각 셀 내용 추출 - prev_end = last_cellx_end - for cell_end in cell_positions: - cell_content = row_text[prev_end:cell_end] - - # RTF 디코딩 및 클리닝 - decoded = decode_hex_escapes(cell_content, encoding) - clean = clean_rtf_text(decoded, encoding) - cell_texts.append(clean) - - # 다음 셀은 \cell 다음부터 - prev_end = cell_end + 5 # len('\\cell') = 5 - - return cell_texts diff --git a/contextifier/core/processor/docx_handler.py b/contextifier/core/processor/docx_handler.py index b83b885..9de1418 100644 --- a/contextifier/core/processor/docx_handler.py +++ b/contextifier/core/processor/docx_handler.py @@ -45,14 +45,13 @@ from contextifier.core.processor.docx_helper import ( # Constants ElementType, - # Metadata - extract_docx_metadata, - format_metadata, # Table process_table_element, # Paragraph process_paragraph_element, ) +from contextifier.core.processor.docx_helper.docx_metadata import DOCXMetadataExtractor +from contextifier.core.processor.docx_helper.docx_image_processor import DOCXImageProcessor logger = logging.getLogger("document-processor") @@ 
-64,24 +63,47 @@ class DOCXHandler(BaseHandler): """ DOCX Document Processing Handler - + Inherits from BaseHandler to manage config and image_processor at instance level. - + Fallback Chain: 1. Enhanced DOCX processing (python-docx with BytesIO stream) 2. DOCHandler fallback (for non-ZIP files: RTF, OLE, HTML, etc.) 3. Simple text extraction 4. Error message - + Usage: handler = DOCXHandler(config=config, image_processor=image_processor) text = handler.extract_text(current_file) """ - + + def _create_file_converter(self): + """Create DOCX-specific file converter.""" + from contextifier.core.processor.docx_helper.docx_file_converter import DOCXFileConverter + return DOCXFileConverter() + + def _create_preprocessor(self): + """Create DOCX-specific preprocessor.""" + from contextifier.core.processor.docx_helper.docx_preprocessor import DOCXPreprocessor + return DOCXPreprocessor() + def _create_chart_extractor(self) -> BaseChartExtractor: """Create DOCX-specific chart extractor.""" return DOCXChartExtractor(self._chart_processor) - + + def _create_metadata_extractor(self): + """Create DOCX-specific metadata extractor.""" + return DOCXMetadataExtractor() + + def _create_format_image_processor(self): + """Create DOCX-specific image processor.""" + return DOCXImageProcessor( + directory_path=self._image_processor.config.directory_path, + tag_prefix=self._image_processor.config.tag_prefix, + tag_suffix=self._image_processor.config.tag_suffix, + storage_backend=self._image_processor.storage_backend, + ) + def extract_text( self, current_file: "CurrentFile", @@ -90,36 +112,27 @@ def extract_text( ) -> str: """ Extract text from DOCX file. 
- + Args: current_file: CurrentFile dict containing file info and binary data extract_metadata: Whether to extract metadata **kwargs: Additional options - + Returns: Extracted text (with inline image tags, table HTML) """ file_path = current_file.get("file_path", "unknown") + file_data = current_file.get("file_data", b"") self.logger.info(f"DOCX processing: {file_path}") - - # Check if file is a valid ZIP (DOCX is a ZIP-based format) - if self._is_valid_zip(current_file): + + # Check if file is a valid DOCX using file_converter validation + if self.file_converter.validate(file_data): return self._extract_docx_enhanced(current_file, extract_metadata) else: - # Not a valid ZIP, try DOCHandler fallback - self.logger.warning(f"File is not a valid ZIP, trying DOCHandler fallback: {file_path}") + # Not a valid DOCX, try DOCHandler fallback + self.logger.warning(f"File is not a valid DOCX, trying DOCHandler fallback: {file_path}") return self._extract_with_doc_handler_fallback(current_file, extract_metadata) - - def _is_valid_zip(self, current_file: "CurrentFile") -> bool: - """Check if file is a valid ZIP archive.""" - try: - file_stream = self.get_file_stream(current_file) - with zipfile.ZipFile(file_stream, 'r') as zf: - # Check for DOCX-specific content - return '[Content_Types].xml' in zf.namelist() - except (zipfile.BadZipFile, Exception): - return False - + def _extract_with_doc_handler_fallback( self, current_file: "CurrentFile", @@ -127,41 +140,41 @@ def _extract_with_doc_handler_fallback( ) -> str: """ Fallback to DOCHandler for non-ZIP files. - + Handles RTF, OLE, HTML, and other formats that might be incorrectly named as .docx files. 
""" file_path = current_file.get("file_path", "unknown") - + try: from contextifier.core.processor.doc_handler import DOCHandler - + doc_handler = DOCHandler( config=self.config, - image_processor=self.image_processor + image_processor=self.format_image_processor ) - + # DOCHandler still uses file_path, so pass it directly result = doc_handler.extract_text(current_file, extract_metadata=extract_metadata) - + if result and not result.startswith("[DOC"): self.logger.info(f"DOCHandler fallback successful for: {file_path}") return result else: # DOCHandler also failed, try simple extraction return self._extract_simple_text_fallback(current_file) - + except Exception as e: self.logger.error(f"DOCHandler fallback failed: {e}") return self._extract_simple_text_fallback(current_file) - + def _extract_simple_text_fallback(self, current_file: "CurrentFile") -> str: """ Last resort: try to extract any readable text from the file. """ file_path = current_file.get("file_path", "unknown") file_data = current_file.get("file_data", b"") - + try: # Try different encodings for encoding in ['utf-8', 'cp949', 'euc-kr', 'latin-1']: @@ -171,20 +184,20 @@ def _extract_simple_text_fallback(self, current_file: "CurrentFile") -> str: import re text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', text) text = text.strip() - + if text and len(text) > 50: # Must have meaningful content self.logger.info(f"Simple text extraction successful with {encoding}: {file_path}") return text except (UnicodeDecodeError, Exception): continue - + raise ValueError("Could not decode file with any known encoding") - + except Exception as e: self.logger.error(f"All extraction methods failed for: {file_path}") raise RuntimeError(f"DOCX file processing failed: {file_path}. " f"File is not a valid DOCX, DOC, RTF, or text file.") - + def _extract_docx_enhanced( self, current_file: "CurrentFile", @@ -192,7 +205,7 @@ def _extract_docx_enhanced( ) -> str: """ Enhanced DOCX processing. 
- + - Document order preservation (body element traversal) - Metadata extraction - Inline image extraction and local saving @@ -201,12 +214,17 @@ def _extract_docx_enhanced( - Page break handling """ file_path = current_file.get("file_path", "unknown") + file_data = current_file.get("file_data", b"") self.logger.info(f"Enhanced DOCX processing: {file_path}") try: - # Use BytesIO stream to avoid path encoding issues - file_stream = self.get_file_stream(current_file) - doc = Document(file_stream) + # Step 1: Use file_converter to convert binary to Document + doc = self.file_converter.convert(file_data) + + # Step 2: Preprocess - may transform doc in the future + preprocessed = self.preprocess(doc) + doc = preprocessed.clean_content # TRUE SOURCE + result_parts = [] processed_images: Set[str] = set() current_page = 1 @@ -215,10 +233,10 @@ def _extract_docx_enhanced( total_charts = 0 # Pre-extract all charts using ChartExtractor - file_stream.seek(0) + file_stream = self.get_file_stream(current_file) chart_data_list = self.chart_extractor.extract_all_from_file(file_stream) chart_idx = [0] # Mutable container for closure - + def get_next_chart() -> str: """Callback to get the next pre-extracted chart content.""" if chart_idx[0] < len(chart_data_list): @@ -229,11 +247,10 @@ def get_next_chart() -> str: # Metadata extraction if extract_metadata: - metadata = extract_docx_metadata(doc) - metadata_str = format_metadata(metadata) + metadata_str = self.extract_and_format_metadata(doc) if metadata_str: result_parts.append(metadata_str + "\n\n") - self.logger.info(f"DOCX metadata extracted: {list(metadata.keys())}") + self.logger.info(f"DOCX metadata extracted") # Start page 1 page_tag = self.create_page_tag(current_page) @@ -247,7 +264,7 @@ def get_next_chart() -> str: # Paragraph processing - pass chart_callback for pre-extracted charts content, has_page_break, img_count, chart_count = process_paragraph_element( body_elem, doc, processed_images, file_path, - 
image_processor=self.image_processor, + image_processor=self.format_image_processor, chart_callback=get_next_chart ) @@ -281,14 +298,14 @@ def get_next_chart() -> str: self.logger.error(f"Error in enhanced DOCX processing: {e}") self.logger.debug(traceback.format_exc()) return self._extract_docx_simple_text(current_file) - + def _format_chart_data(self, chart_data) -> str: """Format ChartData using ChartProcessor.""" from contextifier.core.functions.chart_extractor import ChartData - + if not isinstance(chart_data, ChartData): return "" - + if chart_data.has_data(): return self.chart_processor.format_chart_data( chart_type=chart_data.chart_type, @@ -301,12 +318,12 @@ def _format_chart_data(self, chart_data) -> str: chart_type=chart_data.chart_type, title=chart_data.title ) - + def _extract_docx_simple_text(self, current_file: "CurrentFile") -> str: """Simple text extraction (fallback).""" try: - file_stream = self.get_file_stream(current_file) - doc = Document(file_stream) + file_data = current_file.get("file_data", b"") + doc = self.file_converter.convert(file_data) result_parts = [] for para in doc.paragraphs: diff --git a/contextifier/core/processor/docx_helper/__init__.py b/contextifier/core/processor/docx_helper/__init__.py index e04ad87..e4a572a 100644 --- a/contextifier/core/processor/docx_helper/__init__.py +++ b/contextifier/core/processor/docx_helper/__init__.py @@ -1,17 +1,16 @@ -# service/document_processor/processor/docx_helper/__init__.py +# contextifier/core/processor/docx_helper/__init__.py """ -DOCX Helper 모듈 +DOCX Helper Module -DOCX 문서 처리에 필요한 유틸리티를 기능별로 분리한 모듈입니다. +Utility modules for DOCX document processing. 
-모듈 구성: -- docx_constants: 상수, Enum, 데이터클래스 (ElementType, NAMESPACES 등) -- docx_metadata: 메타데이터 추출 및 포맷팅 -- docx_chart_extractor: 차트 추출 (ChartExtractor) -- docx_image: 이미지 추출 및 업로드 -- docx_table: 테이블 HTML 변환 (rowspan/colspan 지원) -- docx_drawing: Drawing 요소 처리 (이미지/다이어그램) -- docx_paragraph: Paragraph 처리 및 페이지 브레이크 +Module structure: +- docx_constants: Constants, Enum, dataclasses (ElementType, NAMESPACES, etc.) +- docx_metadata: Metadata extraction (DOCXMetadataExtractor) +- docx_chart_extractor: Chart extraction (DOCXChartExtractor) +- docx_image_processor: Image/drawing processing (DOCXImageProcessor) +- docx_table: Table HTML conversion (rowspan/colspan support) +- docx_paragraph: Paragraph processing and page breaks """ # Constants @@ -24,8 +23,7 @@ # Metadata from contextifier.core.processor.docx_helper.docx_metadata import ( - extract_docx_metadata, - format_metadata, + DOCXMetadataExtractor, ) # Chart Extractor @@ -33,10 +31,9 @@ DOCXChartExtractor, ) -# Image -from contextifier.core.processor.docx_helper.docx_image import ( - extract_image_from_drawing, - process_pict_element, +# Image Processor (replaces docx_image.py utility functions) +from contextifier.core.processor.docx_helper.docx_image_processor import ( + DOCXImageProcessor, ) # Table @@ -49,11 +46,6 @@ extract_table_as_text, ) -# Drawing -from contextifier.core.processor.docx_helper.docx_drawing import ( - process_drawing_element, -) - # Paragraph from contextifier.core.processor.docx_helper.docx_paragraph import ( process_paragraph_element, @@ -68,13 +60,11 @@ 'NAMESPACES', 'CHART_TYPE_MAP', # Metadata - 'extract_docx_metadata', - 'format_metadata', + 'DOCXMetadataExtractor', # Chart Extractor 'DOCXChartExtractor', - # Image - 'extract_image_from_drawing', - 'process_pict_element', + # Image Processor + 'DOCXImageProcessor', # Table 'TableCellInfo', 'process_table_element', @@ -82,8 +72,6 @@ 'estimate_column_count', 'extract_cell_text', 'extract_table_as_text', - # Drawing - 
'process_drawing_element', # Paragraph 'process_paragraph_element', 'has_page_break_element', diff --git a/contextifier/core/processor/docx_helper/docx_drawing.py b/contextifier/core/processor/docx_helper/docx_drawing.py deleted file mode 100644 index 3fab728..0000000 --- a/contextifier/core/processor/docx_helper/docx_drawing.py +++ /dev/null @@ -1,121 +0,0 @@ -# service/document_processor/processor/docx_helper/docx_drawing.py -""" -DOCX Drawing Element Processing Utility - -Processes Drawing elements (images, charts, diagrams) in DOCX documents. -- process_drawing_element: Process Drawing element (branch to image/chart/diagram) -- extract_diagram_from_drawing: Extract diagram from Drawing - -Note: Chart extraction is handled separately by DOCXChartExtractor. - This module only detects chart presence for counting/positioning. -""" -import logging -from typing import Optional, Set, Tuple, Callable - -from docx import Document - -from contextifier.core.processor.docx_helper.docx_constants import ElementType, NAMESPACES -from contextifier.core.processor.docx_helper.docx_image import extract_image_from_drawing -from contextifier.core.functions.img_processor import ImageProcessor - -logger = logging.getLogger("document-processor") - - -def process_drawing_element( - drawing_elem, - doc: Document, - processed_images: Set[str], - file_path: str = None, - image_processor: Optional[ImageProcessor] = None, - chart_callback: Optional[Callable[[], str]] = None -) -> Tuple[str, Optional[ElementType]]: - """ - Process Drawing element (image, chart, diagram). - - Args: - drawing_elem: drawing XML element - doc: python-docx Document object - processed_images: Set of processed image paths (deduplication) - file_path: Original file path - image_processor: ImageProcessor instance - chart_callback: Callback function to get next chart content. - Called when chart is detected, should return formatted chart string. 
- - Returns: - (content, element_type) tuple - """ - try: - # Check inline or anchor - inline = drawing_elem.find('.//wp:inline', NAMESPACES) - anchor = drawing_elem.find('.//wp:anchor', NAMESPACES) - - container = inline if inline is not None else anchor - if container is None: - return "", None - - # Check graphic data - graphic = container.find('.//a:graphic', NAMESPACES) - if graphic is None: - return "", None - - graphic_data = graphic.find('a:graphicData', NAMESPACES) - if graphic_data is None: - return "", None - - uri = graphic_data.get('uri', '') - - # Image case - if 'picture' in uri.lower(): - return extract_image_from_drawing(graphic_data, doc, processed_images, image_processor) - - # Chart case - use callback to get pre-extracted chart content - if 'chart' in uri.lower(): - if chart_callback: - chart_content = chart_callback() - return chart_content, ElementType.CHART - return "", ElementType.CHART - - # Diagram case - if 'diagram' in uri.lower(): - return extract_diagram_from_drawing(graphic_data, doc) - - # Other drawing - return "", None - - except Exception as e: - logger.warning(f"Error processing drawing element: {e}") - return "", None - - -def extract_diagram_from_drawing(graphic_data, doc: Document) -> Tuple[str, Optional[ElementType]]: - """ - Extract diagram information from Drawing. 
- - Args: - graphic_data: graphicData XML element - doc: python-docx Document object - - Returns: - (content, element_type) tuple - """ - try: - # Try to extract text from diagram - texts = [] - for t_elem in graphic_data.findall('.//{http://schemas.openxmlformats.org/drawingml/2006/main}t'): - if t_elem.text: - texts.append(t_elem.text.strip()) - - if texts: - return f"[Diagram: {' / '.join(texts)}]", ElementType.DIAGRAM - - return "[Diagram]", ElementType.DIAGRAM - - except Exception as e: - logger.warning(f"Error extracting diagram from drawing: {e}") - return "[Diagram]", ElementType.DIAGRAM - - -__all__ = [ - 'process_drawing_element', - 'extract_diagram_from_drawing', -] diff --git a/contextifier/core/processor/docx_helper/docx_drawing_new.py b/contextifier/core/processor/docx_helper/docx_drawing_new.py deleted file mode 100644 index 3fab728..0000000 --- a/contextifier/core/processor/docx_helper/docx_drawing_new.py +++ /dev/null @@ -1,121 +0,0 @@ -# service/document_processor/processor/docx_helper/docx_drawing.py -""" -DOCX Drawing Element Processing Utility - -Processes Drawing elements (images, charts, diagrams) in DOCX documents. -- process_drawing_element: Process Drawing element (branch to image/chart/diagram) -- extract_diagram_from_drawing: Extract diagram from Drawing - -Note: Chart extraction is handled separately by DOCXChartExtractor. - This module only detects chart presence for counting/positioning. 
-""" -import logging -from typing import Optional, Set, Tuple, Callable - -from docx import Document - -from contextifier.core.processor.docx_helper.docx_constants import ElementType, NAMESPACES -from contextifier.core.processor.docx_helper.docx_image import extract_image_from_drawing -from contextifier.core.functions.img_processor import ImageProcessor - -logger = logging.getLogger("document-processor") - - -def process_drawing_element( - drawing_elem, - doc: Document, - processed_images: Set[str], - file_path: str = None, - image_processor: Optional[ImageProcessor] = None, - chart_callback: Optional[Callable[[], str]] = None -) -> Tuple[str, Optional[ElementType]]: - """ - Process Drawing element (image, chart, diagram). - - Args: - drawing_elem: drawing XML element - doc: python-docx Document object - processed_images: Set of processed image paths (deduplication) - file_path: Original file path - image_processor: ImageProcessor instance - chart_callback: Callback function to get next chart content. - Called when chart is detected, should return formatted chart string. 
- - Returns: - (content, element_type) tuple - """ - try: - # Check inline or anchor - inline = drawing_elem.find('.//wp:inline', NAMESPACES) - anchor = drawing_elem.find('.//wp:anchor', NAMESPACES) - - container = inline if inline is not None else anchor - if container is None: - return "", None - - # Check graphic data - graphic = container.find('.//a:graphic', NAMESPACES) - if graphic is None: - return "", None - - graphic_data = graphic.find('a:graphicData', NAMESPACES) - if graphic_data is None: - return "", None - - uri = graphic_data.get('uri', '') - - # Image case - if 'picture' in uri.lower(): - return extract_image_from_drawing(graphic_data, doc, processed_images, image_processor) - - # Chart case - use callback to get pre-extracted chart content - if 'chart' in uri.lower(): - if chart_callback: - chart_content = chart_callback() - return chart_content, ElementType.CHART - return "", ElementType.CHART - - # Diagram case - if 'diagram' in uri.lower(): - return extract_diagram_from_drawing(graphic_data, doc) - - # Other drawing - return "", None - - except Exception as e: - logger.warning(f"Error processing drawing element: {e}") - return "", None - - -def extract_diagram_from_drawing(graphic_data, doc: Document) -> Tuple[str, Optional[ElementType]]: - """ - Extract diagram information from Drawing. 
- - Args: - graphic_data: graphicData XML element - doc: python-docx Document object - - Returns: - (content, element_type) tuple - """ - try: - # Try to extract text from diagram - texts = [] - for t_elem in graphic_data.findall('.//{http://schemas.openxmlformats.org/drawingml/2006/main}t'): - if t_elem.text: - texts.append(t_elem.text.strip()) - - if texts: - return f"[Diagram: {' / '.join(texts)}]", ElementType.DIAGRAM - - return "[Diagram]", ElementType.DIAGRAM - - except Exception as e: - logger.warning(f"Error extracting diagram from drawing: {e}") - return "[Diagram]", ElementType.DIAGRAM - - -__all__ = [ - 'process_drawing_element', - 'extract_diagram_from_drawing', -] diff --git a/contextifier/core/processor/docx_helper/docx_file_converter.py b/contextifier/core/processor/docx_helper/docx_file_converter.py new file mode 100644 index 0000000..38e278d --- /dev/null +++ b/contextifier/core/processor/docx_helper/docx_file_converter.py @@ -0,0 +1,75 @@ +# libs/core/processor/docx_helper/docx_file_converter.py +""" +DOCXFileConverter - DOCX file format converter + +Converts binary DOCX data to python-docx Document object. +""" +from io import BytesIO +from typing import Any, Optional, BinaryIO +import zipfile + +from contextifier.core.functions.file_converter import BaseFileConverter + + +class DOCXFileConverter(BaseFileConverter): + """ + DOCX file converter using python-docx. + + Converts binary DOCX data to Document object. + """ + + # ZIP magic number (DOCX is a ZIP file) + ZIP_MAGIC = b'PK\x03\x04' + + def convert( + self, + file_data: bytes, + file_stream: Optional[BinaryIO] = None, + **kwargs + ) -> Any: + """ + Convert binary DOCX data to Document object. 
+ + Args: + file_data: Raw binary DOCX data + file_stream: Optional file stream + **kwargs: Additional options + + Returns: + docx.Document object + + Raises: + Exception: If DOCX cannot be opened + """ + from docx import Document + + stream = file_stream if file_stream is not None else BytesIO(file_data) + stream.seek(0) + return Document(stream) + + def get_format_name(self) -> str: + """Return format name.""" + return "DOCX Document" + + def validate(self, file_data: bytes) -> bool: + """ + Validate if data is a valid DOCX (ZIP with specific structure). + + Args: + file_data: Raw binary file data + + Returns: + True if file appears to be a DOCX + """ + if not file_data or len(file_data) < 4: + return False + + if not file_data[:4] == self.ZIP_MAGIC: + return False + + # Check for DOCX-specific content + try: + with zipfile.ZipFile(BytesIO(file_data), 'r') as zf: + return '[Content_Types].xml' in zf.namelist() + except zipfile.BadZipFile: + return False diff --git a/contextifier/core/processor/docx_helper/docx_image.py b/contextifier/core/processor/docx_helper/docx_image.py index e066597..e515c59 100644 --- a/contextifier/core/processor/docx_helper/docx_image.py +++ b/contextifier/core/processor/docx_helper/docx_image.py @@ -5,15 +5,20 @@ DOCX 문서에서 이미지를 추출하고 로컬에 저장합니다. - extract_image_from_drawing: Drawing 요소에서 이미지 추출 - process_pict_element: 레거시 VML pict 요소 처리 + +Note: 이 함수들은 DOCXImageProcessor의 메서드를 호출하는 wrapper입니다. + 실제 로직은 DOCXImageProcessor에 통합되어 있습니다. 
""" import logging -from typing import Optional, Set, Tuple +from typing import Optional, Set, Tuple, TYPE_CHECKING from docx import Document -from docx.oxml.ns import qn -from contextifier.core.functions.img_processor import ImageProcessor -from contextifier.core.processor.docx_helper.docx_constants import ElementType, NAMESPACES +from contextifier.core.processor.docx_helper.docx_constants import ElementType + +if TYPE_CHECKING: + from contextifier.core.processor.docx_helper.docx_image_processor import DOCXImageProcessor + from contextifier.core.functions.img_processor import ImageProcessor logger = logging.getLogger("document-processor") @@ -22,7 +27,7 @@ def extract_image_from_drawing( graphic_data, doc: Document, processed_images: Set[str], - image_processor: ImageProcessor + image_processor: "ImageProcessor" ) -> Tuple[str, Optional[ElementType]]: """ Drawing에서 이미지를 추출합니다. @@ -31,39 +36,42 @@ def extract_image_from_drawing( graphic_data: graphicData XML 요소 doc: python-docx Document 객체 processed_images: 처리된 이미지 경로 집합 (중복 방지) - image_processor: ImageProcessor 인스턴스 + image_processor: ImageProcessor 인스턴스 (DOCXImageProcessor 권장) Returns: (content, element_type) 튜플 """ + # DOCXImageProcessor인 경우 통합된 메서드 사용 + if hasattr(image_processor, 'extract_from_drawing'): + content, is_image = image_processor.extract_from_drawing( + graphic_data, doc, processed_images + ) + return (content, ElementType.IMAGE) if is_image else ("", None) + + # Fallback: 기존 로직 (ImageProcessor 기본 클래스인 경우) + from docx.oxml.ns import qn + from contextifier.core.processor.docx_helper.docx_constants import NAMESPACES try: - # blip 요소 찾기 (이미지 참조) blip = graphic_data.find('.//a:blip', NAMESPACES) if blip is None: return "", None - # Relationship ID r_embed = blip.get(qn('r:embed')) r_link = blip.get(qn('r:link')) - rId = r_embed or r_link + if not rId: return "", None - # Relationship에서 이미지 파트 찾기 try: rel = doc.part.rels.get(rId) if rel is None: return "", None - # 이미지 데이터 추출 if hasattr(rel, 
'target_part') and hasattr(rel.target_part, 'blob'): image_data = rel.target_part.blob - - # 로컬에 저장 image_tag = image_processor.save_image(image_data, processed_images=processed_images) - if image_tag: return f"\n{image_tag}\n", ElementType.IMAGE @@ -82,7 +90,7 @@ def process_pict_element( pict_elem, doc: Document, processed_images: Set[str], - image_processor: ImageProcessor + image_processor: "ImageProcessor" ) -> str: """ 레거시 VML pict 요소를 처리합니다. @@ -91,14 +99,17 @@ def process_pict_element( pict_elem: pict XML 요소 doc: python-docx Document 객체 processed_images: 처리된 이미지 경로 집합 (중복 방지) - image_processor: ImageProcessor 인스턴스 + image_processor: ImageProcessor 인스턴스 (DOCXImageProcessor 권장) Returns: 이미지 마크업 문자열 """ + # DOCXImageProcessor인 경우 통합된 메서드 사용 + if hasattr(image_processor, 'extract_from_pict'): + return image_processor.extract_from_pict(pict_elem, doc, processed_images) + # Fallback: 기존 로직 (ImageProcessor 기본 클래스인 경우) try: - # VML imagedata 찾기 ns_v = 'urn:schemas-microsoft-com:vml' ns_r = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships' diff --git a/contextifier/core/processor/docx_helper/docx_image_processor.py b/contextifier/core/processor/docx_helper/docx_image_processor.py new file mode 100644 index 0000000..a9966c9 --- /dev/null +++ b/contextifier/core/processor/docx_helper/docx_image_processor.py @@ -0,0 +1,410 @@ +# contextifier/core/processor/docx_helper/docx_image_processor.py +""" +DOCX Image Processor + +Provides DOCX-specific image processing that inherits from ImageProcessor. +Handles embedded images, drawing elements (image/diagram), and relationship-based images. 
+ +This class consolidates all DOCX image and drawing extraction logic including: +- Drawing/picture element extraction (blip) +- Diagram text extraction from drawings +- Legacy VML pict element processing +- Relationship-based image loading +""" +import logging +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TYPE_CHECKING + +from docx.oxml.ns import qn + +from contextifier.core.functions.img_processor import ImageProcessor +from contextifier.core.functions.storage_backend import BaseStorageBackend +from contextifier.core.processor.docx_helper.docx_constants import ElementType + +if TYPE_CHECKING: + from docx import Document + from docx.opc.part import Part + +logger = logging.getLogger("contextify.image_processor.docx") + +# DOCX XML namespaces +NAMESPACES = { + 'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main', + 'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing', + 'a': 'http://schemas.openxmlformats.org/drawingml/2006/main', + 'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships', + 'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture', +} + + +class DOCXImageProcessor(ImageProcessor): + """ + DOCX-specific image processor. + + Inherits from ImageProcessor and provides DOCX-specific processing. + + Handles: + - Embedded images via relationships + - Drawing/picture elements + - Inline images in runs + - Shape images + + Example: + processor = DOCXImageProcessor() + + # Process relationship-based image + tag = processor.process_image(image_data, rel_id="rId1") + + # Process from part + tag = processor.process_image_part(image_part) + """ + + def __init__( + self, + directory_path: str = "temp/images", + tag_prefix: str = "[Image:", + tag_suffix: str = "]", + storage_backend: Optional[BaseStorageBackend] = None, + ): + """ + Initialize DOCXImageProcessor. 
+ + Args: + directory_path: Image save directory + tag_prefix: Tag prefix for image references + tag_suffix: Tag suffix for image references + storage_backend: Storage backend for saving images + """ + super().__init__( + directory_path=directory_path, + tag_prefix=tag_prefix, + tag_suffix=tag_suffix, + storage_backend=storage_backend, + ) + + def process_image( + self, + image_data: bytes, + rel_id: Optional[str] = None, + image_name: Optional[str] = None, + **kwargs + ) -> Optional[str]: + """ + Process and save DOCX image data. + + Args: + image_data: Raw image binary data + rel_id: Relationship ID (for naming) + image_name: Original image name + **kwargs: Additional options + + Returns: + Image tag string, or None on failure + """ + custom_name = image_name + if custom_name is None and rel_id is not None: + custom_name = f"docx_{rel_id}" + + return self.save_image(image_data, custom_name=custom_name) + + def process_image_part( + self, + image_part: "Part", + rel_id: Optional[str] = None, + ) -> Optional[str]: + """ + Process image from OOXML part. + + Args: + image_part: OOXML Part containing image data + rel_id: Relationship ID + + Returns: + Image tag string, or None on failure + """ + try: + image_data = image_part.blob + if not image_data: + return None + + # Try to get original filename + image_name = None + if hasattr(image_part, 'partname'): + partname = str(image_part.partname) + if '/' in partname: + image_name = partname.split('/')[-1] + + return self.process_image( + image_data, + rel_id=rel_id, + image_name=image_name + ) + + except Exception as e: + self._logger.warning(f"Failed to process image part: {e}") + return None + + def process_embedded_image( + self, + image_data: bytes, + image_name: Optional[str] = None, + embed_id: Optional[str] = None, + **kwargs + ) -> Optional[str]: + """ + Process embedded DOCX image. 
+ + Args: + image_data: Image binary data + image_name: Original image filename + embed_id: Embed relationship ID + **kwargs: Additional options + + Returns: + Image tag string, or None on failure + """ + custom_name = image_name + if custom_name is None and embed_id is not None: + custom_name = f"docx_embed_{embed_id}" + + return self.save_image(image_data, custom_name=custom_name) + + def process_drawing_image( + self, + image_data: bytes, + drawing_id: Optional[str] = None, + description: Optional[str] = None, + **kwargs + ) -> Optional[str]: + """ + Process DOCX drawing/picture element image. + + Args: + image_data: Image binary data + drawing_id: Drawing element ID + description: Image description/alt text + **kwargs: Additional options + + Returns: + Image tag string, or None on failure + """ + custom_name = None + if drawing_id is not None: + custom_name = f"docx_drawing_{drawing_id}" + + return self.save_image(image_data, custom_name=custom_name) + + def extract_from_drawing( + self, + graphic_data, + doc: "Document", + processed_images: Set[str], + ) -> Tuple[str, bool]: + """ + Extract image from Drawing graphic data element. + + This is the core DOCX image extraction logic that was previously + in docx_image.py extract_image_from_drawing() function. + + Args: + graphic_data: graphicData XML element + doc: python-docx Document object + processed_images: Set of processed image paths (deduplication) + + Returns: + (image_tag, is_image) tuple. image_tag is the tag string or empty, + is_image indicates if an image was found. 
+ """ + try: + # Find blip element (image reference) + blip = graphic_data.find('.//a:blip', NAMESPACES) + if blip is None: + return "", False + + # Get relationship ID + r_embed = blip.get(qn('r:embed')) + r_link = blip.get(qn('r:link')) + rId = r_embed or r_link + + if not rId: + return "", False + + # Find image part from relationship + try: + rel = doc.part.rels.get(rId) + if rel is None: + return "", False + + # Extract image data + if hasattr(rel, 'target_part') and hasattr(rel.target_part, 'blob'): + image_data = rel.target_part.blob + + # Save using process_image with rel_id + image_tag = self.process_image( + image_data, + rel_id=rId, + processed_images=processed_images + ) + + if image_tag: + return f"\n{image_tag}\n", True + + return "[Unknown Image]", True + + except Exception as e: + logger.warning(f"Error extracting image from relationship: {e}") + return "[Unknown Image]", True + + except Exception as e: + logger.warning(f"Error extracting image from drawing: {e}") + return "", False + + def extract_from_pict( + self, + pict_elem, + doc: "Document", + processed_images: Set[str], + ) -> str: + """ + Extract image from legacy VML pict element. + + This is the core DOCX VML image extraction logic that was previously + in docx_image.py process_pict_element() function. 
+ + Args: + pict_elem: pict XML element + doc: python-docx Document object + processed_images: Set of processed image paths (deduplication) + + Returns: + Image tag string or placeholder + """ + try: + # Find VML imagedata + ns_v = 'urn:schemas-microsoft-com:vml' + ns_r = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships' + + imagedata = pict_elem.find('.//{%s}imagedata' % ns_v) + if imagedata is None: + return "[Unknown Image]" + + rId = imagedata.get('{%s}id' % ns_r) + if not rId: + return "[Unknown Image]" + + try: + rel = doc.part.rels.get(rId) + if rel and hasattr(rel, 'target_part') and hasattr(rel.target_part, 'blob'): + image_data = rel.target_part.blob + image_tag = self.process_image( + image_data, + rel_id=rId, + processed_images=processed_images + ) + if image_tag: + return f"\n{image_tag}\n" + except Exception: + pass + + return "[Unknown Image]" + + except Exception as e: + logger.warning(f"Error processing pict element: {e}") + return "" + + def process_drawing_element( + self, + drawing_elem, + doc: "Document", + processed_images: Set[str], + chart_callback: Optional[Callable[[], str]] = None, + ) -> Tuple[str, Optional[ElementType]]: + """ + Process Drawing element (image, chart, diagram). + + Main entry point for handling all drawing elements in DOCX. + Branches to appropriate handler based on content type. 
+ + Args: + drawing_elem: drawing XML element + doc: python-docx Document object + processed_images: Set of processed image paths (deduplication) + chart_callback: Callback function to get next chart content + + Returns: + (content, element_type) tuple + """ + try: + # Check inline or anchor + inline = drawing_elem.find('.//wp:inline', NAMESPACES) + anchor = drawing_elem.find('.//wp:anchor', NAMESPACES) + + container = inline if inline is not None else anchor + if container is None: + return "", None + + # Check graphic data + graphic = container.find('.//a:graphic', NAMESPACES) + if graphic is None: + return "", None + + graphic_data = graphic.find('a:graphicData', NAMESPACES) + if graphic_data is None: + return "", None + + uri = graphic_data.get('uri', '') + + # Image case + if 'picture' in uri.lower(): + content, is_image = self.extract_from_drawing( + graphic_data, doc, processed_images + ) + return (content, ElementType.IMAGE) if is_image else ("", None) + + # Chart case - delegate to callback + if 'chart' in uri.lower(): + if chart_callback: + chart_content = chart_callback() + return chart_content, ElementType.CHART + return "", ElementType.CHART + + # Diagram case + if 'diagram' in uri.lower(): + return self.extract_diagram(graphic_data) + + return "", None + + except Exception as e: + logger.warning(f"Error processing drawing element: {e}") + return "", None + + def extract_diagram( + self, + graphic_data, + ) -> Tuple[str, Optional[ElementType]]: + """ + Extract diagram information from Drawing. 
+ + Args: + graphic_data: graphicData XML element + + Returns: + (content, element_type) tuple + """ + try: + texts = [] + ns_a = 'http://schemas.openxmlformats.org/drawingml/2006/main' + for t_elem in graphic_data.findall('.//{%s}t' % ns_a): + if t_elem.text: + texts.append(t_elem.text.strip()) + + if texts: + return f"[Diagram: {' / '.join(texts)}]", ElementType.DIAGRAM + + return "[Diagram]", ElementType.DIAGRAM + + except Exception as e: + logger.warning(f"Error extracting diagram: {e}") + return "[Diagram]", ElementType.DIAGRAM + + +__all__ = ["DOCXImageProcessor"] diff --git a/contextifier/core/processor/docx_helper/docx_metadata.py b/contextifier/core/processor/docx_helper/docx_metadata.py index 15bb127..a651734 100644 --- a/contextifier/core/processor/docx_helper/docx_metadata.py +++ b/contextifier/core/processor/docx_helper/docx_metadata.py @@ -1,112 +1,71 @@ -# service/document_processor/processor/docx_helper/docx_metadata.py +# contextifier/core/processor/docx_helper/docx_metadata.py """ -DOCX 메타데이터 추출 유틸리티 +DOCX Metadata Extraction Module -DOCX 문서의 core_properties에서 메타데이터를 추출하고 포맷팅합니다. -- extract_docx_metadata: 메타데이터 딕셔너리 추출 -- format_metadata: 메타데이터를 읽기 쉬운 문자열로 변환 +Provides DOCXMetadataExtractor class for extracting metadata from DOCX documents +using python-docx core_properties. Implements BaseMetadataExtractor interface. """ import logging -from datetime import datetime -from typing import Any, Dict +from typing import Any, Optional from docx import Document -logger = logging.getLogger("document-processor") - - -def extract_docx_metadata(doc: Document) -> Dict[str, Any]: - """ - DOCX 문서에서 메타데이터를 추출합니다. 
- - python-docx의 core_properties를 통해 다음 정보를 추출합니다: - - 제목 (title) - - 주제 (subject) - - 작성자 (author) - - 키워드 (keywords) - - 설명 (comments) - - 마지막 수정자 (last_modified_by) - - 작성일 (created) - - 수정일 (modified) - - Args: - doc: python-docx Document 객체 - - Returns: - 메타데이터 딕셔너리 - """ - metadata = {} - - try: - props = doc.core_properties - - if props.title: - metadata['title'] = props.title.strip() - if props.subject: - metadata['subject'] = props.subject.strip() - if props.author: - metadata['author'] = props.author.strip() - if props.keywords: - metadata['keywords'] = props.keywords.strip() - if props.comments: - metadata['comments'] = props.comments.strip() - if props.last_modified_by: - metadata['last_saved_by'] = props.last_modified_by.strip() - if props.created: - metadata['create_time'] = props.created - if props.modified: - metadata['last_saved_time'] = props.modified - - logger.debug(f"Extracted DOCX metadata: {list(metadata.keys())}") +from contextifier.core.functions.metadata_extractor import ( + BaseMetadataExtractor, + DocumentMetadata, +) - except Exception as e: - logger.warning(f"Failed to extract DOCX metadata: {e}") - - return metadata +logger = logging.getLogger("document-processor") -def format_metadata(metadata: Dict[str, Any]) -> str: +class DOCXMetadataExtractor(BaseMetadataExtractor): """ - 메타데이터 딕셔너리를 읽기 쉬운 문자열로 변환합니다. - - Args: - metadata: 메타데이터 딕셔너리 - - Returns: - 포맷된 메타데이터 문자열 + DOCX Metadata Extractor. + + Extracts metadata from python-docx Document objects. 
+ + Supported fields: + - title, subject, author, keywords, comments + - last_saved_by, create_time, last_saved_time + + Usage: + extractor = DOCXMetadataExtractor() + metadata = extractor.extract(docx_document) + text = extractor.format(metadata) """ - if not metadata: - return "" - - lines = [""] - - field_names = { - 'title': '제목', - 'subject': '주제', - 'author': '작성자', - 'keywords': '키워드', - 'comments': '설명', - 'last_saved_by': '마지막 저장자', - 'create_time': '작성일', - 'last_saved_time': '수정일', - } - - for key, label in field_names.items(): - if key in metadata and metadata[key]: - value = metadata[key] - - # datetime 객체 포맷팅 - if isinstance(value, datetime): - value = value.strftime('%Y-%m-%d %H:%M:%S') - - lines.append(f" {label}: {value}") - - lines.append("") - - return "\n".join(lines) + + def extract(self, source: Document) -> DocumentMetadata: + """ + Extract metadata from DOCX document. + + Args: + source: python-docx Document object + + Returns: + DocumentMetadata instance containing extracted metadata. 
+ """ + try: + props = source.core_properties + + return DocumentMetadata( + title=self._get_stripped(props.title), + subject=self._get_stripped(props.subject), + author=self._get_stripped(props.author), + keywords=self._get_stripped(props.keywords), + comments=self._get_stripped(props.comments), + last_saved_by=self._get_stripped(props.last_modified_by), + create_time=props.created, + last_saved_time=props.modified, + ) + except Exception as e: + self.logger.warning(f"Failed to extract DOCX metadata: {e}") + return DocumentMetadata() + + def _get_stripped(self, value: Optional[str]) -> Optional[str]: + """Return stripped string value, or None if empty.""" + return value.strip() if value else None __all__ = [ - 'extract_docx_metadata', - 'format_metadata', + 'DOCXMetadataExtractor', ] diff --git a/contextifier/core/processor/docx_helper/docx_paragraph.py b/contextifier/core/processor/docx_helper/docx_paragraph.py index 0c929fc..e3b842c 100644 --- a/contextifier/core/processor/docx_helper/docx_paragraph.py +++ b/contextifier/core/processor/docx_helper/docx_paragraph.py @@ -1,20 +1,22 @@ -# service/document_processor/processor/docx_helper/docx_paragraph.py +# contextifier/core/processor/docx_helper/docx_paragraph.py """ DOCX Paragraph Processing Utility Processes Paragraph elements in DOCX documents. - process_paragraph_element: Process Paragraph element - has_page_break_element: Check for page break + +Image and drawing extraction is handled by DOCXImageProcessor. 
""" import logging -from typing import Optional, Set, Tuple, Callable +from typing import Optional, Set, Tuple, Callable, TYPE_CHECKING from docx import Document from contextifier.core.processor.docx_helper.docx_constants import ElementType, NAMESPACES -from contextifier.core.processor.docx_helper.docx_drawing import process_drawing_element -from contextifier.core.processor.docx_helper.docx_image import process_pict_element -from contextifier.core.functions.img_processor import ImageProcessor + +if TYPE_CHECKING: + from contextifier.core.processor.docx_helper.docx_image_processor import DOCXImageProcessor logger = logging.getLogger("document-processor") @@ -24,7 +26,7 @@ def process_paragraph_element( doc: Document, processed_images: Set[str], file_path: str = None, - image_processor: Optional[ImageProcessor] = None, + image_processor: Optional["DOCXImageProcessor"] = None, chart_callback: Optional[Callable[[], str]] = None ) -> Tuple[str, bool, int, int]: """ @@ -37,7 +39,7 @@ def process_paragraph_element( doc: python-docx Document object processed_images: Set of processed image paths (deduplication) file_path: Original file path - image_processor: ImageProcessor instance + image_processor: DOCXImageProcessor instance chart_callback: Callback function to get next chart content Returns: @@ -59,13 +61,14 @@ def process_paragraph_element( if t_elem.text: content_parts.append(t_elem.text) - # Process Drawing (image/chart/diagram) + # Process Drawing (image/chart/diagram) via DOCXImageProcessor for drawing_elem in run_elem.findall('w:drawing', NAMESPACES): - drawing_content, drawing_type = process_drawing_element( - drawing_elem, doc, processed_images, file_path, - image_processor, - chart_callback=chart_callback - ) + if image_processor and hasattr(image_processor, 'process_drawing_element'): + drawing_content, drawing_type = image_processor.process_drawing_element( + drawing_elem, doc, processed_images, chart_callback=chart_callback + ) + else: + drawing_content, 
drawing_type = "", None if drawing_content: content_parts.append(drawing_content) if drawing_type == ElementType.IMAGE: @@ -73,9 +76,12 @@ def process_paragraph_element( elif drawing_type == ElementType.CHART: chart_count += 1 - # Process pict element (legacy VML image) + # Process pict element (legacy VML image) - use DOCXImageProcessor for pict_elem in run_elem.findall('w:pict', NAMESPACES): - pict_content = process_pict_element(pict_elem, doc, processed_images, image_processor) + if image_processor and hasattr(image_processor, 'extract_from_pict'): + pict_content = image_processor.extract_from_pict(pict_elem, doc, processed_images) + else: + pict_content = "[Unknown Image]" if pict_content: content_parts.append(pict_content) image_count += 1 diff --git a/contextifier/core/processor/docx_helper/docx_preprocessor.py b/contextifier/core/processor/docx_helper/docx_preprocessor.py new file mode 100644 index 0000000..00a896f --- /dev/null +++ b/contextifier/core/processor/docx_helper/docx_preprocessor.py @@ -0,0 +1,82 @@ +# contextifier/core/processor/docx_helper/docx_preprocessor.py +""" +DOCX Preprocessor - Process DOCX document after conversion. + +Processing Pipeline Position: + 1. DOCXFileConverter.convert() → docx.Document + 2. DOCXPreprocessor.preprocess() → PreprocessedData (THIS STEP) + 3. DOCXMetadataExtractor.extract() → DocumentMetadata + 4. Content extraction (paragraphs, tables, images) + +Current Implementation: + - Pass-through (DOCX uses python-docx Document object directly) +""" +import logging +from typing import Any, Dict + +from contextifier.core.functions.preprocessor import ( + BasePreprocessor, + PreprocessedData, +) + +logger = logging.getLogger("contextify.docx.preprocessor") + + +class DOCXPreprocessor(BasePreprocessor): + """ + DOCX Document Preprocessor. + + Currently a pass-through implementation as DOCX processing + is handled during the content extraction phase using python-docx. 
+ """ + + def preprocess( + self, + converted_data: Any, + **kwargs + ) -> PreprocessedData: + """ + Preprocess the converted DOCX document. + + Args: + converted_data: docx.Document object from DOCXFileConverter + **kwargs: Additional options + + Returns: + PreprocessedData with the document and any extracted resources + """ + metadata: Dict[str, Any] = {} + + # Extract basic document info if available + if hasattr(converted_data, 'core_properties'): + props = converted_data.core_properties + if hasattr(props, 'title') and props.title: + metadata['title'] = props.title + + if hasattr(converted_data, 'paragraphs'): + metadata['paragraph_count'] = len(converted_data.paragraphs) + + if hasattr(converted_data, 'tables'): + metadata['table_count'] = len(converted_data.tables) + + logger.debug("DOCX preprocessor: pass-through, metadata=%s", metadata) + + # clean_content is the TRUE SOURCE - contains the docx.Document + return PreprocessedData( + raw_content=converted_data, + clean_content=converted_data, # TRUE SOURCE - docx.Document + encoding="utf-8", + extracted_resources={}, + metadata=metadata, + ) + + def get_format_name(self) -> str: + """Return format name.""" + return "DOCX Preprocessor" + + def validate(self, data: Any) -> bool: + """Validate if data is a DOCX Document object.""" + return hasattr(data, 'paragraphs') and hasattr(data, 'tables') + + +__all__ = ['DOCXPreprocessor'] diff --git a/contextifier/core/processor/excel_handler.py b/contextifier/core/processor/excel_handler.py index becdb65..8af8e0b 100644 --- a/contextifier/core/processor/excel_handler.py +++ b/contextifier/core/processor/excel_handler.py @@ -7,7 +7,7 @@ - Text extraction (direct parsing via openpyxl/xlrd) - Table extraction (Markdown or HTML conversion based on merged cells) - Inline image extraction and local storage -- Chart processing (1st priority: convert to table, 2nd priority: matplotlib image) +- Chart processing (convert to table) - Multi-sheet support Class-based Handler: @@ 
-31,13 +31,6 @@ from contextifier.core.processor.excel_helper import ( # Textbox extract_textboxes_from_xlsx, - # Metadata - extract_xlsx_metadata, - extract_xls_metadata, - format_metadata, - # Image - extract_images_from_xlsx, - get_sheet_images, # Table convert_xlsx_sheet_to_table, convert_xls_sheet_to_table, @@ -45,9 +38,13 @@ convert_xlsx_objects_to_tables, convert_xls_objects_to_tables, ) - -import xlrd -from openpyxl import load_workbook +from contextifier.core.processor.excel_helper.excel_metadata import ( + XLSXMetadataExtractor, + XLSMetadataExtractor, +) +from contextifier.core.processor.excel_helper.excel_image_processor import ( + ExcelImageProcessor, +) logger = logging.getLogger("document-processor") @@ -59,18 +56,52 @@ class ExcelHandler(BaseHandler): """ Excel Document Handler (XLSX/XLS) - + Inherits from BaseHandler to manage config and image_processor at instance level. - + Usage: handler = ExcelHandler(config=config, image_processor=image_processor) text = handler.extract_text(current_file) """ - + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._xlsx_metadata_extractor = None + self._xls_metadata_extractor = None + + def _create_file_converter(self): + """Create Excel-specific file converter.""" + from contextifier.core.processor.excel_helper.excel_file_converter import ExcelFileConverter + return ExcelFileConverter() + + def _create_preprocessor(self): + """Create Excel-specific preprocessor.""" + from contextifier.core.processor.excel_helper.excel_preprocessor import ExcelPreprocessor + return ExcelPreprocessor() + def _create_chart_extractor(self) -> BaseChartExtractor: """Create Excel-specific chart extractor.""" return ExcelChartExtractor(self._chart_processor) - + + def _create_metadata_extractor(self): + """Create XLSX-specific metadata extractor (default).""" + return XLSXMetadataExtractor() + + def _create_format_image_processor(self): + """Create Excel-specific image processor.""" + return 
ExcelImageProcessor( + directory_path=self._image_processor.config.directory_path, + tag_prefix=self._image_processor.config.tag_prefix, + tag_suffix=self._image_processor.config.tag_suffix, + storage_backend=self._image_processor.storage_backend, + ) + + def _get_xls_metadata_extractor(self): + """Get XLS-specific metadata extractor.""" + if self._xls_metadata_extractor is None: + self._xls_metadata_extractor = XLSMetadataExtractor() + return self._xls_metadata_extractor + def extract_text( self, current_file: "CurrentFile", @@ -79,12 +110,12 @@ def extract_text( ) -> str: """ Extract text from Excel file. - + Args: current_file: CurrentFile dict containing file info and binary data extract_metadata: Whether to extract metadata **kwargs: Additional options - + Returns: Extracted text """ @@ -100,7 +131,7 @@ def extract_text( return self._extract_xls(current_file, extract_metadata) else: raise ValueError(f"Unsupported Excel format: {ext}") - + def _extract_xlsx( self, current_file: "CurrentFile", @@ -111,9 +142,14 @@ def _extract_xlsx( self.logger.info(f"XLSX processing: {file_path}") try: - # Open from stream to avoid path encoding issues - file_stream = self.get_file_stream(current_file) - wb = load_workbook(file_stream, data_only=True) + # Step 1: Convert to Workbook using file_converter + file_data = current_file.get("file_data", b"") + wb = self.file_converter.convert(file_data, extension='xlsx') + + # Step 2: Preprocess - may transform wb in the future + preprocessed = self.preprocess(wb) + wb = preprocessed.clean_content # TRUE SOURCE + preload = self._preload_xlsx_data(current_file, wb, extract_metadata) result_parts = [preload["metadata_str"]] if preload["metadata_str"] else [] @@ -155,14 +191,19 @@ def _extract_xls( self.logger.info(f"XLS processing: {file_path}") try: - # xlrd can open from file_contents (bytes) + # Step 1: Convert to Workbook using file_converter file_data = current_file.get("file_data", b"") - wb = 
xlrd.open_workbook(file_contents=file_data, formatting_info=True) + wb = self.file_converter.convert(file_data, extension='xls') + + # Step 2: Preprocess - may transform wb in the future + preprocessed = self.preprocess(wb) + wb = preprocessed.clean_content # TRUE SOURCE + result_parts = [] if extract_metadata: - metadata = extract_xls_metadata(wb) - metadata_str = format_metadata(metadata) + xls_extractor = self._get_xls_metadata_extractor() + metadata_str = xls_extractor.extract_and_format(wb) if metadata_str: result_parts.append(metadata_str + "\n\n") @@ -195,7 +236,7 @@ def _preload_xlsx_data( """Extract preprocessing data from XLSX file.""" file_path = current_file.get("file_path", "unknown") file_stream = self.get_file_stream(current_file) - + result = { "metadata_str": "", "chart_data_list": [], # ChartData instances from extractor @@ -205,16 +246,19 @@ def _preload_xlsx_data( } if extract_metadata: - metadata = extract_xlsx_metadata(wb) - result["metadata_str"] = format_metadata(metadata) + result["metadata_str"] = self.extract_and_format_metadata(wb) if result["metadata_str"]: result["metadata_str"] += "\n\n" # Use ChartExtractor for chart extraction result["chart_data_list"] = self.chart_extractor.extract_all_from_file(file_stream) - - # NOTE: These helper functions still require file_path for now - result["images_data"] = extract_images_from_xlsx(file_path) + + # Use format_image_processor directly for image extraction + image_processor = self.format_image_processor + if hasattr(image_processor, 'extract_images_from_xlsx'): + result["images_data"] = image_processor.extract_images_from_xlsx(file_path) + else: + result["images_data"] = {} result["textboxes_by_sheet"] = extract_textboxes_from_xlsx(file_path) return result @@ -248,11 +292,15 @@ def _process_xlsx_sheet( stats["charts"] += 1 preload["chart_idx"] += 1 - # Image processing - sheet_images = get_sheet_images(ws, preload["images_data"], "") + # Image processing - use format_image_processor directly 
+ image_processor = self.format_image_processor + if hasattr(image_processor, 'get_sheet_images'): + sheet_images = image_processor.get_sheet_images(ws, preload["images_data"], "") + else: + sheet_images = [] for image_data, anchor in sheet_images: if image_data: - image_tag = self.image_processor.save_image(image_data) + image_tag = self.format_image_processor.save_image(image_data) if image_tag: parts.append(f"\n{image_tag}\n") stats["images"] += 1 @@ -265,14 +313,14 @@ def _process_xlsx_sheet( stats["textboxes"] += 1 return "".join(parts) - + def _format_chart_data(self, chart_data) -> str: """Format ChartData using ChartProcessor.""" from contextifier.core.functions.chart_extractor import ChartData - + if not isinstance(chart_data, ChartData): return "" - + if chart_data.has_data(): return self.chart_processor.format_chart_data( chart_type=chart_data.chart_type, diff --git a/contextifier/core/processor/excel_helper/__init__.py b/contextifier/core/processor/excel_helper/__init__.py index 2ae148c..f925618 100644 --- a/contextifier/core/processor/excel_helper/__init__.py +++ b/contextifier/core/processor/excel_helper/__init__.py @@ -1,19 +1,17 @@ """ -Excel Helper 모듈 +Excel Helper Module -XLSX/XLS 파일의 세부 요소(텍스트박스, 차트, 이미지, 테이블 등) 추출을 담당합니다. +Handles extraction of elements (textboxes, charts, images, tables, etc.) from XLSX/XLS files. 
-모듈 구성: -- excel_chart_constants: 차트 타입 맵핑 상수 -- excel_chart_parser: OOXML 차트 XML 파싱 -- excel_chart_formatter: 차트 데이터 테이블 포맷팅 -- excel_chart_renderer: matplotlib 이미지 렌더링 -- excel_chart_processor: 차트 처리 메인 (테이블/이미지 폴백) -- excel_table_xlsx: XLSX 테이블 변환 -- excel_table_xls: XLS 테이블 변환 -- textbox_extractor: 텍스트박스 추출 -- metadata_extractor: 메타데이터 추출 -- image_extractor: 이미지 추출 +Module Structure: +- excel_chart_constants: Chart type mapping constants +- excel_chart_extractor: Chart extraction (ChartExtractor) +- excel_table_xlsx: XLSX table conversion +- excel_table_xls: XLS table conversion +- excel_textbox: Textbox extraction +- excel_metadata: Metadata extraction +- excel_image: Image extraction +- excel_layout_detector: Layout detection """ # === Textbox === @@ -21,45 +19,20 @@ # === Metadata === from contextifier.core.processor.excel_helper.excel_metadata import ( - extract_xlsx_metadata, - extract_xls_metadata, - format_metadata, + ExcelMetadataExtractor, + XLSXMetadataExtractor, + XLSMetadataExtractor, ) -# === Chart Constants === -from contextifier.core.processor.excel_helper.excel_chart_constants import ( +# === Chart Extractor === +from contextifier.core.processor.excel_helper.excel_chart_extractor import ( + ExcelChartExtractor, CHART_TYPE_MAP, - CHART_NAMESPACES, ) -# === Chart Parser === -from contextifier.core.processor.excel_helper.excel_chart_parser import ( - extract_charts_from_xlsx, - parse_ooxml_chart_xml, - extract_chart_info_basic, -) - -# === Chart Formatter === -from contextifier.core.processor.excel_helper.excel_chart_formatter import ( - format_chart_data_as_table, - format_chart_fallback, -) - -# === Chart Renderer === -from contextifier.core.processor.excel_helper.excel_chart_renderer import ( - render_chart_to_image, -) - -# === Chart Processor === -from contextifier.core.processor.excel_helper.excel_chart_processor import ( - process_chart, -) - -# === Image === -from contextifier.core.processor.excel_helper.excel_image import ( - 
extract_images_from_xlsx, - get_sheet_images, - SUPPORTED_IMAGE_EXTENSIONS, +# === Image Processor (replaces excel_image.py utility functions) === +from contextifier.core.processor.excel_helper.excel_image_processor import ( + ExcelImageProcessor, ) # === Table XLSX === @@ -94,27 +67,15 @@ # Textbox 'extract_textboxes_from_xlsx', # Metadata - 'extract_xlsx_metadata', - 'extract_xls_metadata', - 'format_metadata', + 'ExcelMetadataExtractor', + 'XLSXMetadataExtractor', + 'XLSMetadataExtractor', # Chart Constants 'CHART_TYPE_MAP', - 'CHART_NAMESPACES', - # Chart Parser - 'extract_charts_from_xlsx', - 'parse_ooxml_chart_xml', - 'extract_chart_info_basic', - # Chart Formatter - 'format_chart_data_as_table', - 'format_chart_fallback', - # Chart Renderer - 'render_chart_to_image', - # Chart Processor - 'process_chart', - # Image - 'extract_images_from_xlsx', - 'get_sheet_images', - 'SUPPORTED_IMAGE_EXTENSIONS', + # Chart Extractor + 'ExcelChartExtractor', + # Image Processor + 'ExcelImageProcessor', # Table XLSX 'has_merged_cells_xlsx', 'convert_xlsx_sheet_to_table', diff --git a/contextifier/core/processor/excel_helper/excel_chart_constants.py b/contextifier/core/processor/excel_helper/excel_chart_constants.py deleted file mode 100644 index 233a58c..0000000 --- a/contextifier/core/processor/excel_helper/excel_chart_constants.py +++ /dev/null @@ -1,31 +0,0 @@ -""" -Excel 차트 상수 모듈 - -OOXML 차트 타입 맵핑 및 관련 상수 정의 -""" - -# OOXML 차트 타입 맵핑 -CHART_TYPE_MAP = { - 'barChart': '막대 차트', - 'bar3DChart': '3D 막대 차트', - 'lineChart': '선 차트', - 'line3DChart': '3D 선 차트', - 'pieChart': '파이 차트', - 'pie3DChart': '3D 파이 차트', - 'doughnutChart': '도넛 차트', - 'areaChart': '영역 차트', - 'area3DChart': '3D 영역 차트', - 'scatterChart': '분산형 차트', - 'radarChart': '방사형 차트', - 'bubbleChart': '거품형 차트', - 'stockChart': '주식형 차트', - 'surfaceChart': '표면 차트', - 'surface3DChart': '3D 표면 차트', - 'ofPieChart': '분리형 파이 차트', -} - -# OOXML 네임스페이스 -CHART_NAMESPACES = { - 'c': 
'http://schemas.openxmlformats.org/drawingml/2006/chart', - 'a': 'http://schemas.openxmlformats.org/drawingml/2006/main', -} diff --git a/contextifier/core/processor/excel_helper/excel_chart_formatter.py b/contextifier/core/processor/excel_helper/excel_chart_formatter.py deleted file mode 100644 index 907261f..0000000 --- a/contextifier/core/processor/excel_helper/excel_chart_formatter.py +++ /dev/null @@ -1,106 +0,0 @@ -""" -Excel 차트 데이터 포맷팅 모듈 - -차트 데이터를 Markdown 테이블 형식으로 변환합니다. -""" - -import logging -from typing import Any, Dict, Optional - -logger = logging.getLogger("document-processor") - - -def format_chart_data_as_table(chart_info: Dict[str, Any]) -> Optional[str]: - """ - 차트 데이터를 Markdown 테이블 형식으로 포맷합니다. - - 데이터가 충분하면 테이블 문자열 반환, 없으면 None 반환. - None 반환 시 이미지 폴백이 트리거됩니다. - - Args: - chart_info: 차트 정보 딕셔너리 - - Returns: - Markdown 테이블 형식의 문자열 또는 None - """ - if not chart_info: - return None - - categories = chart_info.get('categories', []) - series_list = chart_info.get('series', []) - - # 데이터가 없으면 None 반환 (이미지 폴백 필요) - if not series_list or all(len(s.get('values', [])) == 0 for s in series_list): - return None - - result_parts = ["[chart]"] - - if chart_info.get('title'): - result_parts.append(f"제목: {chart_info['title']}") - - if chart_info.get('chart_type'): - result_parts.append(f"유형: {chart_info['chart_type']}") - - result_parts.append("") - - # 테이블 헤더 생성 - header = ["카테고리"] + [s.get('name', f'시리즈 {i+1}') for i, s in enumerate(series_list)] - result_parts.append("| " + " | ".join(str(h) for h in header) + " |") - result_parts.append("| " + " | ".join(["---"] * len(header)) + " |") - - # 데이터 행 생성 - max_len = max( - len(categories), - max((len(s.get('values', [])) for s in series_list), default=0) - ) - - for i in range(max_len): - row = [] - - # 카테고리 - if i < len(categories): - row.append(str(categories[i])) - else: - row.append(f"항목 {i+1}") - - # 시리즈 값 - for series in series_list: - values = series.get('values', []) - if i < len(values): - val = 
values[i] - if isinstance(val, float): - row.append(f"{val:,.2f}") - elif val is not None: - row.append(str(val)) - else: - row.append("") - else: - row.append("") - - result_parts.append("| " + " | ".join(row) + " |") - - result_parts.append("[/chart]") - return "\n".join(result_parts) - - -def format_chart_fallback(chart_info: Dict[str, Any]) -> str: - """ - 차트 정보만 출력하는 폴백 포맷터. - - 테이블/이미지 변환 모두 실패 시 사용됩니다. - - Args: - chart_info: 차트 정보 딕셔너리 - - Returns: - [chart]...[/chart] 형태의 기본 문자열 - """ - result_parts = ["[chart]"] - - if chart_info and chart_info.get('title'): - result_parts.append(f"제목: {chart_info['title']}") - if chart_info and chart_info.get('chart_type'): - result_parts.append(f"유형: {chart_info['chart_type']}") - - result_parts.append("[/chart]") - return "\n".join(result_parts) diff --git a/contextifier/core/processor/excel_helper/excel_chart_parser.py b/contextifier/core/processor/excel_helper/excel_chart_parser.py deleted file mode 100644 index 217d819..0000000 --- a/contextifier/core/processor/excel_helper/excel_chart_parser.py +++ /dev/null @@ -1,381 +0,0 @@ -""" -Excel 차트 OOXML 파싱 모듈 - -XLSX 파일의 차트 XML을 파싱하여 데이터를 추출합니다. -""" - -import logging -import re -import zipfile -import xml.etree.ElementTree as ET -from typing import Any, Dict, List, Optional - -from contextifier.core.processor.excel_helper.excel_chart_constants import CHART_TYPE_MAP, CHART_NAMESPACES - -logger = logging.getLogger("document-processor") - - -def extract_charts_from_xlsx(file_path: str) -> List[Dict[str, Any]]: - """ - XLSX 파일에서 차트 데이터를 추출합니다. - - XLSX 차트는 xl/charts/ 폴더에 chart*.xml 파일로 저장됩니다. 
- - Args: - file_path: XLSX 파일 경로 - - Returns: - 차트 정보 딕셔너리 리스트 - """ - charts = [] - - try: - with zipfile.ZipFile(file_path, 'r') as zf: - for name in zf.namelist(): - if name.startswith('xl/charts/chart') and name.endswith('.xml'): - try: - chart_xml = zf.read(name) - chart_info = parse_ooxml_chart_xml(chart_xml) - if chart_info: - charts.append(chart_info) - except Exception as e: - logger.debug(f"Error parsing chart {name}: {e}") - - logger.info(f"Extracted {len(charts)} charts from XLSX") - - except Exception as e: - logger.warning(f"Error extracting charts from XLSX: {e}") - - return charts - - -def parse_ooxml_chart_xml(chart_xml: bytes) -> Optional[Dict[str, Any]]: - """ - OOXML 차트 XML을 파싱하여 차트 데이터를 추출합니다. - - Args: - chart_xml: 차트 XML 바이트 - - Returns: - 차트 데이터 딕셔너리 - """ - try: - ns = CHART_NAMESPACES - - try: - root = ET.fromstring(chart_xml) - except ET.ParseError: - try: - chart_str = chart_xml.decode('utf-8-sig', errors='ignore') - root = ET.fromstring(chart_str) - except: - return None - - chart_info = { - 'type': 'ooxml', - 'chart_type': None, - 'title': None, - 'series': [], - 'categories': [] - } - - # chart 요소 찾기 - chart_elem = root.find('.//c:chart', ns) - if chart_elem is None: - chart_elem = root.find('.//{http://schemas.openxmlformats.org/drawingml/2006/chart}chart') - - if chart_elem is None: - if root.tag.endswith('}chart') or root.tag == 'chart': - chart_elem = root - else: - return None - - # 차트 제목 추출 - title_elem = chart_elem.find('.//c:title//c:tx//c:rich//a:t', ns) - if title_elem is not None and title_elem.text: - chart_info['title'] = title_elem.text.strip() - else: - title_elem = chart_elem.find('.//{http://schemas.openxmlformats.org/drawingml/2006/chart}tx//{http://schemas.openxmlformats.org/drawingml/2006/main}t') - if title_elem is not None and title_elem.text: - chart_info['title'] = title_elem.text.strip() - - # 차트 유형 및 시리즈 데이터 추출 - plot_area = chart_elem.find('.//c:plotArea', ns) - if plot_area is None: - plot_area = 
chart_elem.find('.//{http://schemas.openxmlformats.org/drawingml/2006/chart}plotArea') - - if plot_area is not None: - for chart_tag, chart_name in CHART_TYPE_MAP.items(): - elem = plot_area.find(f'.//c:{chart_tag}', ns) - if elem is None: - elem = plot_area.find(f'.//{{{ns["c"]}}}{chart_tag}') - if elem is not None: - chart_info['chart_type'] = chart_name - _extract_chart_series(elem, chart_info, ns) - break - - return chart_info if chart_info['series'] else None - - except Exception as e: - logger.debug(f"Error parsing OOXML chart: {e}") - return None - - -def _extract_chart_series(chart_type_elem, chart_info: Dict[str, Any], ns: Dict[str, str]): - """ - 차트 요소에서 시리즈 데이터를 추출합니다. - - Args: - chart_type_elem: 차트 타입 XML 요소 - chart_info: 차트 정보 딕셔너리 (수정됨) - ns: XML 네임스페이스 딕셔너리 - """ - ns_c = ns.get('c', 'http://schemas.openxmlformats.org/drawingml/2006/chart') - - series_elements = chart_type_elem.findall('.//c:ser', ns) - if not series_elements: - series_elements = chart_type_elem.findall(f'.//{{{ns_c}}}ser') - - categories_extracted = False - - for ser_elem in series_elements: - series_data = { - 'name': None, - 'values': [], - } - - # 시리즈 이름 추출 - tx_elem = ser_elem.find('.//c:tx//c:v', ns) - if tx_elem is None: - tx_elem = ser_elem.find(f'.//{{{ns_c}}}tx//{{{ns_c}}}v') - if tx_elem is not None and tx_elem.text: - series_data['name'] = tx_elem.text.strip() - else: - str_ref = ser_elem.find('.//c:tx//c:strRef//c:strCache//c:pt//c:v', ns) - if str_ref is None: - str_ref = ser_elem.find(f'.//{{{ns_c}}}tx//{{{ns_c}}}strRef//{{{ns_c}}}strCache//{{{ns_c}}}pt//{{{ns_c}}}v') - if str_ref is not None and str_ref.text: - series_data['name'] = str_ref.text.strip() - - # 카테고리 레이블 추출 - if not categories_extracted: - _extract_categories(ser_elem, chart_info, ns, ns_c) - categories_extracted = True - - # 값 추출 - _extract_values(ser_elem, series_data, ns, ns_c) - - if series_data['values']: - chart_info['series'].append(series_data) - - -def _extract_categories(ser_elem, chart_info: 
Dict[str, Any], ns: Dict[str, str], ns_c: str): - """ - 시리즈 요소에서 카테고리 레이블을 추출합니다. - """ - cat_elem = ser_elem.find('.//c:cat', ns) - if cat_elem is None: - cat_elem = ser_elem.find(f'.//{{{ns_c}}}cat') - - if cat_elem is None: - return - - # strCache에서 추출 - str_cache = cat_elem.find('.//c:strCache', ns) - if str_cache is None: - str_cache = cat_elem.find(f'.//{{{ns_c}}}strCache') - - if str_cache is not None: - pts = str_cache.findall('.//c:pt', ns) - if not pts: - pts = str_cache.findall(f'.//{{{ns_c}}}pt') - - for pt in sorted(pts, key=lambda x: int(x.get('idx', 0))): - v_elem = pt.find('c:v', ns) - if v_elem is None: - v_elem = pt.find(f'{{{ns_c}}}v') - if v_elem is not None and v_elem.text: - chart_info['categories'].append(v_elem.text.strip()) - - # numCache에서 추출 (폴백) - if not chart_info['categories']: - num_cache = cat_elem.find('.//c:numCache', ns) - if num_cache is None: - num_cache = cat_elem.find(f'.//{{{ns_c}}}numCache') - - if num_cache is not None: - pts = num_cache.findall('.//c:pt', ns) - if not pts: - pts = num_cache.findall(f'.//{{{ns_c}}}pt') - - for pt in sorted(pts, key=lambda x: int(x.get('idx', 0))): - v_elem = pt.find('c:v', ns) - if v_elem is None: - v_elem = pt.find(f'{{{ns_c}}}v') - if v_elem is not None and v_elem.text: - chart_info['categories'].append(v_elem.text.strip()) - - -def _extract_values(ser_elem, series_data: Dict[str, Any], ns: Dict[str, str], ns_c: str): - """ - 시리즈 요소에서 값을 추출합니다. 
- """ - # val 요소에서 추출 - val_elem = ser_elem.find('.//c:val', ns) - if val_elem is None: - val_elem = ser_elem.find(f'.//{{{ns_c}}}val') - - if val_elem is not None: - _extract_num_cache_values(val_elem, series_data, ns, ns_c) - - # yVal 확인 (scatter/bubble 차트용) - if not series_data['values']: - yval_elem = ser_elem.find('.//c:yVal', ns) - if yval_elem is None: - yval_elem = ser_elem.find(f'.//{{{ns_c}}}yVal') - - if yval_elem is not None: - _extract_num_cache_values(yval_elem, series_data, ns, ns_c) - - -def _extract_num_cache_values(val_elem, series_data: Dict[str, Any], ns: Dict[str, str], ns_c: str): - """ - numCache에서 숫자 값을 추출합니다. - """ - num_cache = val_elem.find('.//c:numCache', ns) - if num_cache is None: - num_cache = val_elem.find(f'.//{{{ns_c}}}numCache') - - if num_cache is not None: - pts = num_cache.findall('.//c:pt', ns) - if not pts: - pts = num_cache.findall(f'.//{{{ns_c}}}pt') - - for pt in sorted(pts, key=lambda x: int(x.get('idx', 0))): - v_elem = pt.find('c:v', ns) - if v_elem is None: - v_elem = pt.find(f'{{{ns_c}}}v') - if v_elem is not None and v_elem.text: - try: - series_data['values'].append(float(v_elem.text)) - except ValueError: - series_data['values'].append(v_elem.text) - - -def extract_chart_info_basic(chart, ws) -> str: - """ - 차트 정보를 추출합니다 (openpyxl 객체에서 기본 추출). - OOXML 파싱 실패 시 폴백으로 사용됩니다. 
- - Args: - chart: openpyxl Chart 객체 - ws: openpyxl Worksheet 객체 - - Returns: - [chart]...[/chart] 형태의 문자열 - """ - try: - result_parts = ["[chart]"] - - # 차트 타입 - chart_type = type(chart).__name__ - result_parts.append(f"유형: {chart_type}") - - # 차트 제목 - if chart.title: - title_text = _extract_chart_title(chart.title) - if title_text: - result_parts.append(f"제목: {title_text}") - - # 시리즈 데이터 - if hasattr(chart, 'series'): - for i, series in enumerate(chart.series): - series_info = _extract_series_info(series, ws, i) - if series_info: - result_parts.append(series_info) - - result_parts.append("[/chart]") - return "\n".join(result_parts) - - except Exception as e: - logger.debug(f"Error extracting chart info: {e}") - return "[chart][/chart]" - - -def _extract_chart_title(title_obj) -> str: - """ - 차트 제목을 추출합니다. - """ - try: - if hasattr(title_obj, 'tx') and title_obj.tx: - if hasattr(title_obj.tx, 'rich') and title_obj.tx.rich: - # RichText에서 텍스트 추출 - texts = [] - if hasattr(title_obj.tx.rich, 'p'): - for p in title_obj.tx.rich.p: - if hasattr(p, 'r'): - for r in p.r: - if hasattr(r, 't') and r.t: - texts.append(r.t) - return "".join(texts) - return "" - except Exception: - return "" - - -def _extract_series_info(series, ws, index: int) -> str: - """ - 차트 시리즈 정보를 추출합니다. - """ - try: - parts = [f"시리즈 {index + 1}:"] - - # 시리즈 이름 - if hasattr(series, 'title') and series.title: - if hasattr(series.title, 'strRef') and series.title.strRef: - ref = series.title.strRef.f - parts.append(f" 이름 참조: {ref}") - - # 데이터 참조 - if hasattr(series, 'val') and series.val: - if hasattr(series.val, 'numRef') and series.val.numRef: - ref = series.val.numRef.f - parts.append(f" 데이터 참조: {ref}") - - # 실제 데이터 값 추출 시도 - try: - values = _get_range_values(ws, ref) - if values: - parts.append(f" 데이터: {values[:10]}{'...' 
if len(values) > 10 else ''}") - except Exception: - pass - - return "\n".join(parts) if len(parts) > 1 else "" - - except Exception: - return "" - - -def _get_range_values(ws, ref: str) -> List[Any]: - """ - 셀 범위 참조에서 값을 추출합니다. - """ - try: - # 참조 형식: 'Sheet1'!$A$1:$A$10 또는 Sheet1!A1:A10 - match = re.search(r"['\"]?([^'\"!]+)['\"]?!\$?([A-Z]+)\$?(\d+):\$?([A-Z]+)\$?(\d+)", ref) - if not match: - return [] - - _, start_col, start_row, end_col, end_row = match.groups() - start_row, end_row = int(start_row), int(end_row) - - values = [] - for row in range(start_row, end_row + 1): - cell = ws[f"{start_col}{row}"] - if cell.value is not None: - values.append(cell.value) - - return values - - except Exception: - return [] diff --git a/contextifier/core/processor/excel_helper/excel_chart_processor.py b/contextifier/core/processor/excel_helper/excel_chart_processor.py deleted file mode 100644 index 471123d..0000000 --- a/contextifier/core/processor/excel_helper/excel_chart_processor.py +++ /dev/null @@ -1,62 +0,0 @@ -""" -Excel Chart Processing Module - -Extracts chart data from Excel files and formats using ChartProcessor. -Output format: - {chart_prefix} - {chart_title} - {chart_type} - ...
- {chart_suffix} -""" - -import logging -from typing import Any, Callable, Dict, List, Optional, Set, TYPE_CHECKING - -if TYPE_CHECKING: - from contextifier.core.functions.chart_processor import ChartProcessor - -logger = logging.getLogger("document-processor") - - -def process_chart( - chart_info: Dict[str, Any], - chart_processor: "ChartProcessor" -) -> str: - """ - Process a chart using ChartProcessor. - - Args: - chart_info: Chart information dictionary containing: - - chart_type: Type of chart (bar, line, pie, etc.) - - title: Chart title (optional) - - categories: List of category labels - - series: List of series dicts with 'name' and 'values' - chart_processor: ChartProcessor instance for formatting - - Returns: - Formatted chart string with tags - """ - if not chart_info: - return chart_processor.format_chart_fallback(chart_type="Unknown") - - chart_type = chart_info.get('chart_type', 'Unknown') - title = chart_info.get('title') - categories = chart_info.get('categories', []) - series_list = chart_info.get('series', []) - - # Check if we have valid data - has_data = series_list and any(len(s.get('values', [])) > 0 for s in series_list) - - if has_data: - result = chart_processor.format_chart_data( - chart_type=chart_type, - series_data=series_list, - title=title, - categories=categories - ) - logger.debug(f"Chart '{title}' converted to table successfully") - return result - - # Fallback: no data available - return chart_processor.format_chart_fallback(chart_type=chart_type, title=title) diff --git a/contextifier/core/processor/excel_helper/excel_chart_renderer.py b/contextifier/core/processor/excel_helper/excel_chart_renderer.py deleted file mode 100644 index aaecd5a..0000000 --- a/contextifier/core/processor/excel_helper/excel_chart_renderer.py +++ /dev/null @@ -1,155 +0,0 @@ -""" -Excel 차트 이미지 렌더링 모듈 - -matplotlib를 사용하여 차트 데이터를 이미지로 렌더링합니다. -테이블 변환 실패 시 폴백으로 사용됩니다. 
-""" - -import io -import logging -from typing import Any, Dict, Optional, Set - -import matplotlib -matplotlib.use('Agg') # Non-GUI backend -import matplotlib.pyplot as plt - -logger = logging.getLogger("document-processor") - - -def render_chart_to_image( - chart_info: Dict[str, Any], - processed_images: Set[str] = None, - upload_func=None -) -> Optional[str]: - """ - 차트 데이터를 matplotlib로 이미지로 렌더링하고 로컬에 저장합니다. - - 테이블 변환 실패 시 폴백으로 사용됩니다. - - Args: - chart_info: 차트 정보 딕셔너리 - processed_images: 이미 처리된 이미지 해시 집합 - upload_func: 이미지 업로드 함수 - - Returns: - [chart] 태그로 감싸진 이미지 참조 문자열, 실패 시 None - """ - if not chart_info: - return None - - try: - categories = chart_info.get('categories', []) - series_list = chart_info.get('series', []) - chart_type = chart_info.get('chart_type', '') - title = chart_info.get('title', '차트') - - if not series_list: - return None - - # 그래프 생성 - fig, ax = plt.subplots(figsize=(10, 6)) - - # 차트 유형에 따른 렌더링 - if '파이' in chart_type or 'pie' in chart_type.lower(): - _render_pie_chart(ax, series_list, categories) - elif '선' in chart_type or 'line' in chart_type.lower(): - _render_line_chart(ax, series_list, categories) - elif '영역' in chart_type or 'area' in chart_type.lower(): - _render_area_chart(ax, series_list, categories) - else: - # 기본: 막대 차트 - _render_bar_chart(ax, series_list, categories) - - ax.set_title(title) - plt.tight_layout() - - # 이미지를 바이트로 저장 - img_buffer = io.BytesIO() - fig.savefig(img_buffer, format='png', dpi=150, bbox_inches='tight') - img_buffer.seek(0) - img_data = img_buffer.getvalue() - plt.close(fig) - - # 로컬에 저장 - if processed_images is None: - processed_images = set() - - if upload_func: - image_tag = upload_func(img_data) - - if image_tag: - result_parts = ["[chart]"] - if title: - result_parts.append(f"제목: {title}") - if chart_type: - result_parts.append(f"유형: {chart_type}") - result_parts.append(image_tag) - result_parts.append("[/chart]") - return "\n".join(result_parts) - - return None - - except Exception as e: - 
logger.warning(f"Error rendering chart to image: {e}") - if 'fig' in locals(): - plt.close(fig) - return None - - -def _render_pie_chart(ax, series_list, categories): - """ - 파이 차트를 렌더링합니다. - """ - if series_list and series_list[0].get('values'): - values = series_list[0]['values'] - labels = categories if categories else [f'항목 {i+1}' for i in range(len(values))] - ax.pie(values, labels=labels, autopct='%1.1f%%') - - -def _render_line_chart(ax, series_list, categories): - """ - 선 차트를 렌더링합니다. - """ - x = categories if categories else list(range(len(series_list[0].get('values', [])))) - for series in series_list: - values = series.get('values', []) - name = series.get('name', '시리즈') - if values: - ax.plot(x[:len(values)], values, marker='o', label=name) - ax.legend() - ax.grid(True, alpha=0.3) - - -def _render_area_chart(ax, series_list, categories): - """ - 영역 차트를 렌더링합니다. - """ - for series in series_list: - values = series.get('values', []) - name = series.get('name', '시리즈') - if values: - ax.fill_between(range(len(values)), values, alpha=0.5, label=name) - ax.plot(values, marker='o', label=f'{name} (선)') - ax.legend() - ax.grid(True, alpha=0.3) - - -def _render_bar_chart(ax, series_list, categories): - """ - 막대 차트를 렌더링합니다. 
- """ - x = categories if categories else [f'항목 {i+1}' for i in range(len(series_list[0].get('values', [])))] - width = 0.8 / len(series_list) if len(series_list) > 1 else 0.6 - - for idx, series in enumerate(series_list): - values = series.get('values', []) - name = series.get('name', f'시리즈 {idx+1}') - if values: - offset = (idx - len(series_list) / 2 + 0.5) * width - positions = [i + offset for i in range(len(values))] - ax.bar(positions, values, width=width, label=name) - - ax.set_xticks(range(len(x))) - ax.set_xticklabels(x, rotation=45, ha='right') - ax.legend() - ax.grid(True, alpha=0.3, axis='y') diff --git a/contextifier/core/processor/excel_helper/excel_file_converter.py b/contextifier/core/processor/excel_helper/excel_file_converter.py new file mode 100644 index 0000000..c845917 --- /dev/null +++ b/contextifier/core/processor/excel_helper/excel_file_converter.py @@ -0,0 +1,156 @@ +# libs/core/processor/excel_helper/excel_file_converter.py +""" +ExcelFileConverter - Excel file format converter + +Converts binary Excel data to Workbook object. +Supports both XLSX and XLS formats. +""" +from io import BytesIO +from typing import Any, Optional, BinaryIO, Union + +from contextifier.core.functions.file_converter import BaseFileConverter + + +class XLSXFileConverter(BaseFileConverter): + """ + XLSX file converter using openpyxl. + + Converts binary XLSX data to openpyxl Workbook object. + """ + + # ZIP magic number (XLSX is a ZIP file) + ZIP_MAGIC = b'PK\x03\x04' + + def convert( + self, + file_data: bytes, + file_stream: Optional[BinaryIO] = None, + data_only: bool = True, + **kwargs + ) -> Any: + """ + Convert binary XLSX data to Workbook object. 
+ + Args: + file_data: Raw binary XLSX data + file_stream: Optional file stream + data_only: If True, return calculated values instead of formulas + **kwargs: Additional options + + Returns: + openpyxl.Workbook object + """ + from openpyxl import load_workbook + + stream = file_stream if file_stream is not None else BytesIO(file_data) + stream.seek(0) + return load_workbook(stream, data_only=data_only) + + def get_format_name(self) -> str: + """Return format name.""" + return "XLSX Workbook" + + def validate(self, file_data: bytes) -> bool: + """Validate if data is a valid XLSX.""" + if not file_data or len(file_data) < 4: + return False + return file_data[:4] == self.ZIP_MAGIC + + +class XLSFileConverter(BaseFileConverter): + """ + XLS file converter using xlrd. + + Converts binary XLS data to xlrd Workbook object. + """ + + # OLE magic number (XLS is an OLE file) + OLE_MAGIC = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1' + + def convert( + self, + file_data: bytes, + file_stream: Optional[BinaryIO] = None, + **kwargs + ) -> Any: + """ + Convert binary XLS data to xlrd Workbook object. + + Args: + file_data: Raw binary XLS data + file_stream: Optional file stream (not used) + **kwargs: Additional options + + Returns: + xlrd.Book object + """ + import xlrd + return xlrd.open_workbook(file_contents=file_data) + + def get_format_name(self) -> str: + """Return format name.""" + return "XLS Workbook" + + def validate(self, file_data: bytes) -> bool: + """Validate if data is a valid XLS.""" + if not file_data or len(file_data) < 8: + return False + return file_data[:8] == self.OLE_MAGIC + + +class ExcelFileConverter(BaseFileConverter): + """ + Unified Excel file converter. + + Auto-detects format (XLSX/XLS) and uses appropriate converter. 
+ """ + + def __init__(self): + """Initialize with both converters.""" + self._xlsx_converter = XLSXFileConverter() + self._xls_converter = XLSFileConverter() + self._used_converter: Optional[BaseFileConverter] = None + + def convert( + self, + file_data: bytes, + file_stream: Optional[BinaryIO] = None, + extension: Optional[str] = None, + **kwargs + ) -> Any: + """ + Convert binary Excel data to Workbook object. + + Args: + file_data: Raw binary Excel data + file_stream: Optional file stream + extension: File extension hint ('xlsx' or 'xls') + **kwargs: Additional options + + Returns: + Workbook object (openpyxl or xlrd) + """ + # Determine format from extension or magic number + if extension: + ext = extension.lower().lstrip('.') + if ext == 'xlsx': + self._used_converter = self._xlsx_converter + elif ext == 'xls': + self._used_converter = self._xls_converter + else: + # Auto-detect + if self._xlsx_converter.validate(file_data): + self._used_converter = self._xlsx_converter + elif self._xls_converter.validate(file_data): + self._used_converter = self._xls_converter + else: + # Default to XLSX + self._used_converter = self._xlsx_converter + + return self._used_converter.convert(file_data, file_stream, **kwargs) + + def get_format_name(self) -> str: + """Return format name based on detected type.""" + if self._used_converter: + return self._used_converter.get_format_name() + return "Excel Workbook" diff --git a/contextifier/core/processor/excel_helper/excel_image.py b/contextifier/core/processor/excel_helper/excel_image.py deleted file mode 100644 index ea59829..0000000 --- a/contextifier/core/processor/excel_helper/excel_image.py +++ /dev/null @@ -1,88 +0,0 @@ -""" -XLSX 이미지 추출 모듈 - -Excel 파일에서 임베디드 이미지를 추출합니다. 
-""" - -import os -import logging -import zipfile -from typing import Dict, List, Tuple - -logger = logging.getLogger("document-processor") - -# PIL에서 지원하는 이미지 형식만 추출 -SUPPORTED_IMAGE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff'] - -# 지원하지 않는 형식 (EMF, WMF 등) -UNSUPPORTED_IMAGE_EXTENSIONS = ['.emf', '.wmf'] - - -def extract_images_from_xlsx(file_path: str) -> Dict[str, bytes]: - """ - XLSX 파일에서 이미지를 추출합니다 (ZIP 직접 접근). - EMF, WMF 등 PIL에서 지원하지 않는 형식은 제외합니다. - - Args: - file_path: XLSX 파일 경로 - - Returns: - {이미지 경로: 이미지 바이트} 딕셔너리 - """ - images = {} - - try: - with zipfile.ZipFile(file_path, 'r') as zf: - for name in zf.namelist(): - if name.startswith('xl/media/'): - # 이미지 파일 - ext = os.path.splitext(name)[1].lower() - if ext in SUPPORTED_IMAGE_EXTENSIONS: - images[name] = zf.read(name) - elif ext in UNSUPPORTED_IMAGE_EXTENSIONS: - logger.debug(f"Skipping unsupported image format: {name}") - - return images - - except Exception as e: - logger.warning(f"Error extracting images from XLSX: {e}") - return {} - - -def get_sheet_images(ws, images_data: Dict[str, bytes], file_path: str) -> List[Tuple[bytes, str]]: - """ - 시트에 포함된 이미지를 가져옵니다. 
- - Args: - ws: openpyxl Worksheet 객체 - images_data: extract_images_from_xlsx에서 추출한 이미지 딕셔너리 - file_path: XLSX 파일 경로 - - Returns: - [(이미지 바이트, 앵커 정보)] 리스트 - """ - result = [] - - try: - # openpyxl의 _images 속성 사용 - if hasattr(ws, '_images') and ws._images: - for img in ws._images: - try: - # 이미지 데이터 접근 - if hasattr(img, '_data') and callable(img._data): - img_data = img._data() - anchor = str(img.anchor) if hasattr(img, 'anchor') else "" - result.append((img_data, anchor)) - except Exception as e: - logger.debug(f"Error accessing image data: {e}") - - # 직접 추출한 이미지 사용 (위에서 못 가져온 경우) - if not result and images_data: - for name, data in images_data.items(): - result.append((data, name)) - - return result - - except Exception as e: - logger.warning(f"Error getting sheet images: {e}") - return [] diff --git a/contextifier/core/processor/excel_helper/excel_image_processor.py b/contextifier/core/processor/excel_helper/excel_image_processor.py new file mode 100644 index 0000000..1f8f816 --- /dev/null +++ b/contextifier/core/processor/excel_helper/excel_image_processor.py @@ -0,0 +1,316 @@ +# contextifier/core/processor/excel_helper/excel_image_processor.py +""" +Excel Image Processor + +Provides Excel-specific image processing that inherits from ImageProcessor. +Handles embedded images, chart images, and drawing images for XLSX/XLS files. 
+ +This class consolidates all Excel image extraction logic including: +- XLSX ZIP-based image extraction +- openpyxl Image object processing +- Sheet image extraction +""" +import os +import logging +import zipfile +from typing import Any, Dict, List, Optional, Set, Tuple, TYPE_CHECKING + +from contextifier.core.functions.img_processor import ImageProcessor +from contextifier.core.functions.storage_backend import BaseStorageBackend + +if TYPE_CHECKING: + from openpyxl.workbook import Workbook + from openpyxl.worksheet.worksheet import Worksheet + from openpyxl.drawing.image import Image + +logger = logging.getLogger("contextify.image_processor.excel") + +# Image formats supported by PIL +SUPPORTED_IMAGE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff'] + +# Unsupported formats (EMF, WMF, etc.) +UNSUPPORTED_IMAGE_EXTENSIONS = ['.emf', '.wmf'] + + +class ExcelImageProcessor(ImageProcessor): + """ + Excel-specific image processor. + + Inherits from ImageProcessor and provides Excel-specific processing. + + Handles: + - Embedded worksheet images + - Drawing images + - Chart images + - Shape images + + Example: + processor = ExcelImageProcessor() + + # Process worksheet image + tag = processor.process_image(image_data, sheet_name="Sheet1") + + # Process from openpyxl Image object + tag = processor.process_openpyxl_image(image_obj) + """ + + def __init__( + self, + directory_path: str = "temp/images", + tag_prefix: str = "[Image:", + tag_suffix: str = "]", + storage_backend: Optional[BaseStorageBackend] = None, + ): + """ + Initialize ExcelImageProcessor. 
+ + Args: + directory_path: Image save directory + tag_prefix: Tag prefix for image references + tag_suffix: Tag suffix for image references + storage_backend: Storage backend for saving images + """ + super().__init__( + directory_path=directory_path, + tag_prefix=tag_prefix, + tag_suffix=tag_suffix, + storage_backend=storage_backend, + ) + + def process_image( + self, + image_data: bytes, + sheet_name: Optional[str] = None, + image_index: Optional[int] = None, + **kwargs + ) -> Optional[str]: + """ + Process and save Excel image data. + + Args: + image_data: Raw image binary data + sheet_name: Source sheet name (for naming) + image_index: Image index in sheet (for naming) + **kwargs: Additional options + + Returns: + Image tag string, or None on failure + """ + custom_name = None + if sheet_name is not None: + safe_sheet = sheet_name.replace(' ', '_').replace('/', '_') + if image_index is not None: + custom_name = f"excel_{safe_sheet}_{image_index}" + else: + custom_name = f"excel_{safe_sheet}" + + return self.save_image(image_data, custom_name=custom_name) + + def process_openpyxl_image( + self, + image: "Image", + sheet_name: Optional[str] = None, + image_index: Optional[int] = None, + ) -> Optional[str]: + """ + Process openpyxl Image object. 
+ + Args: + image: openpyxl Image object + sheet_name: Source sheet name + image_index: Image index + + Returns: + Image tag string, or None on failure + """ + try: + # Get image data from openpyxl Image + if hasattr(image, '_data'): + image_data = image._data() + elif hasattr(image, 'ref'): + # For embedded images with reference + image_data = image.ref.blob + else: + self._logger.warning("Cannot extract data from openpyxl Image") + return None + + return self.process_image( + image_data, + sheet_name=sheet_name, + image_index=image_index + ) + + except Exception as e: + self._logger.warning(f"Failed to process openpyxl image: {e}") + return None + + def process_embedded_image( + self, + image_data: bytes, + image_name: Optional[str] = None, + sheet_name: Optional[str] = None, + **kwargs + ) -> Optional[str]: + """ + Process embedded Excel image. + + Args: + image_data: Image binary data + image_name: Original image filename + sheet_name: Source sheet name + **kwargs: Additional options + + Returns: + Image tag string, or None on failure + """ + custom_name = image_name + if custom_name is None and sheet_name is not None: + safe_sheet = sheet_name.replace(' ', '_').replace('/', '_') + custom_name = f"excel_embed_{safe_sheet}" + + return self.save_image(image_data, custom_name=custom_name) + + def process_chart_image( + self, + chart_data: bytes, + chart_name: Optional[str] = None, + sheet_name: Optional[str] = None, + chart_index: Optional[int] = None, + **kwargs + ) -> Optional[str]: + """ + Process Excel chart as image. 
+ + Args: + chart_data: Chart image binary data + chart_name: Chart title/name + sheet_name: Source sheet name + chart_index: Chart index in sheet + **kwargs: Additional options + + Returns: + Image tag string, or None on failure + """ + custom_name = chart_name + if custom_name is None: + if sheet_name is not None: + safe_sheet = sheet_name.replace(' ', '_').replace('/', '_') + if chart_index is not None: + custom_name = f"excel_chart_{safe_sheet}_{chart_index}" + else: + custom_name = f"excel_chart_{safe_sheet}" + elif chart_index is not None: + custom_name = f"excel_chart_{chart_index}" + + return self.save_image(chart_data, custom_name=custom_name) + + def extract_images_from_xlsx( + self, + file_path: str, + ) -> Dict[str, bytes]: + """ + Extract images from XLSX file (direct ZIP access). + Excludes formats not supported by PIL (EMF, WMF, etc.). + + Args: + file_path: Path to XLSX file + + Returns: + {image_path: image_bytes} dictionary + """ + images = {} + + try: + with zipfile.ZipFile(file_path, 'r') as zf: + for name in zf.namelist(): + if name.startswith('xl/media/'): + ext = os.path.splitext(name)[1].lower() + if ext in SUPPORTED_IMAGE_EXTENSIONS: + images[name] = zf.read(name) + elif ext in UNSUPPORTED_IMAGE_EXTENSIONS: + logger.debug(f"Skipping unsupported image format: {name}") + + return images + + except Exception as e: + logger.warning(f"Error extracting images from XLSX: {e}") + return {} + + def get_sheet_images( + self, + ws: "Worksheet", + images_data: Dict[str, bytes], + file_path: str, + ) -> List[Tuple[bytes, str]]: + """ + Get images contained in a sheet. 
+ + Args: + ws: openpyxl Worksheet object + images_data: Image dictionary from extract_images_from_xlsx + file_path: Path to XLSX file + + Returns: + [(image_bytes, anchor_info)] list + """ + result = [] + + try: + # Use openpyxl's _images attribute + if hasattr(ws, '_images') and ws._images: + for img in ws._images: + try: + if hasattr(img, '_data') and callable(img._data): + img_data = img._data() + anchor = str(img.anchor) if hasattr(img, 'anchor') else "" + result.append((img_data, anchor)) + except Exception as e: + logger.debug(f"Error accessing image data: {e}") + + # Use directly extracted images (if not obtained above) + if not result and images_data: + for name, data in images_data.items(): + result.append((data, name)) + + return result + + except Exception as e: + logger.warning(f"Error getting sheet images: {e}") + return [] + + def process_sheet_images( + self, + ws: "Worksheet", + sheet_name: str, + images_data: Optional[Dict[str, bytes]] = None, + file_path: Optional[str] = None, + ) -> str: + """ + Process all images in a sheet. 
+ + Args: + ws: openpyxl Worksheet object + sheet_name: Sheet name + images_data: Pre-extracted image dictionary + file_path: Path to XLSX file + + Returns: + Joined image tag strings + """ + results = [] + + if images_data is None and file_path: + images_data = self.extract_images_from_xlsx(file_path) + + images_data = images_data or {} + sheet_images = self.get_sheet_images(ws, images_data, file_path or "") + + for idx, (img_data, anchor) in enumerate(sheet_images): + tag = self.process_image(img_data, sheet_name=sheet_name, image_index=idx) + if tag: + results.append(tag) + + return "\n\n".join(results) + + +__all__ = ["ExcelImageProcessor"] diff --git a/contextifier/core/processor/excel_helper/excel_metadata.py b/contextifier/core/processor/excel_helper/excel_metadata.py index 2627e47..f8375e4 100644 --- a/contextifier/core/processor/excel_helper/excel_metadata.py +++ b/contextifier/core/processor/excel_helper/excel_metadata.py @@ -1,129 +1,145 @@ +# contextifier/core/processor/excel_helper/excel_metadata.py """ -XLSX/XLS 메타데이터 추출 모듈 +Excel Metadata Extraction Module -Excel 문서에서 메타데이터(제목, 작성자, 주제, 키워드, 작성일, 수정일 등)를 추출합니다. +Provides ExcelMetadataExtractor classes for extracting metadata from Excel documents. +Supports both XLSX (openpyxl) and XLS (xlrd) formats. +Implements BaseMetadataExtractor interface. """ - import logging -from datetime import datetime -from typing import Any, Dict +from typing import Any, Optional + +from contextifier.core.functions.metadata_extractor import ( + BaseMetadataExtractor, + DocumentMetadata, +) logger = logging.getLogger("document-processor") -def extract_xlsx_metadata(wb) -> Dict[str, Any]: +class XLSXMetadataExtractor(BaseMetadataExtractor): """ - XLSX 문서에서 메타데이터를 추출합니다. 
- - openpyxl의 properties를 통해 다음 정보를 추출합니다: - - 제목 (title) - - 주제 (subject) - - 작성자 (creator) - - 키워드 (keywords) - - 설명 (description) - - 마지막 수정자 (lastModifiedBy) - - 작성일 (created) - - 수정일 (modified) - - Args: - wb: openpyxl Workbook 객체 - - Returns: - 메타데이터 딕셔너리 + XLSX Metadata Extractor. + + Extracts metadata from openpyxl Workbook objects. + + Supported fields: + - title, subject, author (creator), keywords + - comments (description), last_saved_by + - create_time, last_saved_time + + Usage: + extractor = XLSXMetadataExtractor() + metadata = extractor.extract(workbook) + text = extractor.format(metadata) """ - metadata = {} - - try: - props = wb.properties - - if props.title: - metadata['title'] = props.title.strip() - if props.subject: - metadata['subject'] = props.subject.strip() - if props.creator: - metadata['author'] = props.creator.strip() - if props.keywords: - metadata['keywords'] = props.keywords.strip() - if props.description: - metadata['comments'] = props.description.strip() - if props.lastModifiedBy: - metadata['last_saved_by'] = props.lastModifiedBy.strip() - if props.created: - metadata['create_time'] = props.created - if props.modified: - metadata['last_saved_time'] = props.modified - - logger.debug(f"Extracted XLSX metadata: {list(metadata.keys())}") - - except Exception as e: - logger.warning(f"Failed to extract XLSX metadata: {e}") - - return metadata - - -def extract_xls_metadata(wb) -> Dict[str, Any]: + + def extract(self, source: Any) -> DocumentMetadata: + """ + Extract metadata from XLSX document. + + Args: + source: openpyxl Workbook object + + Returns: + DocumentMetadata instance containing extracted metadata. 
+ """ + try: + props = source.properties + + return DocumentMetadata( + title=self._get_stripped(props.title), + subject=self._get_stripped(props.subject), + author=self._get_stripped(props.creator), + keywords=self._get_stripped(props.keywords), + comments=self._get_stripped(props.description), + last_saved_by=self._get_stripped(props.lastModifiedBy), + create_time=props.created, + last_saved_time=props.modified, + ) + except Exception as e: + self.logger.warning(f"Failed to extract XLSX metadata: {e}") + return DocumentMetadata() + + def _get_stripped(self, value: Optional[str]) -> Optional[str]: + """Return stripped string value, or None if empty.""" + return value.strip() if value else None + + +class XLSMetadataExtractor(BaseMetadataExtractor): """ - XLS 문서에서 메타데이터를 추출합니다. - - xlrd는 제한된 메타데이터만 지원합니다. - - Args: - wb: xlrd Workbook 객체 - - Returns: - 메타데이터 딕셔너리 + XLS Metadata Extractor. + + Extracts metadata from xlrd Workbook objects. + Note: xlrd has limited metadata support. + + Supported fields: + - author (user_name) + + Usage: + extractor = XLSMetadataExtractor() + metadata = extractor.extract(workbook) + text = extractor.format(metadata) """ - metadata = {} - - try: - # xlrd는 제한된 메타데이터 접근만 가능 - if hasattr(wb, 'user_name') and wb.user_name: - metadata['author'] = wb.user_name - - logger.debug(f"Extracted XLS metadata: {list(metadata.keys())}") - - except Exception as e: - logger.warning(f"Failed to extract XLS metadata: {e}") - - return metadata - - -def format_metadata(metadata: Dict[str, Any]) -> str: + + def extract(self, source: Any) -> DocumentMetadata: + """ + Extract metadata from XLS document. + + Args: + source: xlrd Workbook object + + Returns: + DocumentMetadata instance containing extracted metadata. 
+ """ + try: + author = None + if hasattr(source, 'user_name') and source.user_name: + author = source.user_name + + return DocumentMetadata(author=author) + except Exception as e: + self.logger.warning(f"Failed to extract XLS metadata: {e}") + return DocumentMetadata() + + +class ExcelMetadataExtractor(BaseMetadataExtractor): """ - 메타데이터 딕셔너리를 읽기 쉬운 문자열로 변환합니다. - - Args: - metadata: 메타데이터 딕셔너리 - - Returns: - 포맷된 메타데이터 문자열 + Unified Excel Metadata Extractor. + + Selects appropriate extractor based on file format. + + Usage: + extractor = ExcelMetadataExtractor() + # For XLSX + metadata = extractor.extract(xlsx_workbook, file_type='xlsx') + # For XLS + metadata = extractor.extract(xls_workbook, file_type='xls') """ - if not metadata: - return "" - - lines = [""] - - field_names = { - 'title': '제목', - 'subject': '주제', - 'author': '작성자', - 'keywords': '키워드', - 'comments': '설명', - 'last_saved_by': '마지막 저장자', - 'create_time': '작성일', - 'last_saved_time': '수정일', - } - - for key, label in field_names.items(): - if key in metadata and metadata[key]: - value = metadata[key] - - # datetime 객체 포맷팅 - if isinstance(value, datetime): - value = value.strftime('%Y-%m-%d %H:%M:%S') - - lines.append(f" {label}: {value}") - - lines.append("") - - return "\n".join(lines) + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self._xlsx_extractor = XLSXMetadataExtractor(**kwargs) + self._xls_extractor = XLSMetadataExtractor(**kwargs) + + def extract(self, source: Any, file_type: str = 'xlsx') -> DocumentMetadata: + """ + Extract metadata from Excel document. + + Args: + source: openpyxl Workbook or xlrd Workbook object + file_type: File format ('xlsx' or 'xls') + + Returns: + DocumentMetadata instance containing extracted metadata. 
+ """ + if file_type.lower() == 'xls': + return self._xls_extractor.extract(source) + return self._xlsx_extractor.extract(source) + + +__all__ = [ + 'ExcelMetadataExtractor', + 'XLSXMetadataExtractor', + 'XLSMetadataExtractor', +] diff --git a/contextifier/core/processor/excel_helper/excel_preprocessor.py b/contextifier/core/processor/excel_helper/excel_preprocessor.py new file mode 100644 index 0000000..1ddead0 --- /dev/null +++ b/contextifier/core/processor/excel_helper/excel_preprocessor.py @@ -0,0 +1,83 @@ +# contextifier/core/processor/excel_helper/excel_preprocessor.py +""" +Excel Preprocessor - Process Excel workbook after conversion. + +Processing Pipeline Position: + 1. ExcelFileConverter.convert() → openpyxl.Workbook or xlrd.Book + 2. ExcelPreprocessor.preprocess() → PreprocessedData (THIS STEP) + 3. ExcelMetadataExtractor.extract() → DocumentMetadata + 4. Content extraction (sheets, cells, images, charts) + +Current Implementation: + - Pass-through (Excel uses openpyxl/xlrd objects directly) +""" +import logging +from typing import Any, Dict + +from contextifier.core.functions.preprocessor import ( + BasePreprocessor, + PreprocessedData, +) + +logger = logging.getLogger("contextify.excel.preprocessor") + + +class ExcelPreprocessor(BasePreprocessor): + """ + Excel Workbook Preprocessor. + + Currently a pass-through implementation as Excel processing + is handled during the content extraction phase using openpyxl/xlrd. + """ + + def preprocess( + self, + converted_data: Any, + **kwargs + ) -> PreprocessedData: + """ + Preprocess the converted Excel workbook. 
+ + Args: + converted_data: openpyxl.Workbook or xlrd.Book from ExcelFileConverter + **kwargs: Additional options + + Returns: + PreprocessedData with the workbook and any extracted resources + """ + metadata: Dict[str, Any] = {} + + # Detect workbook type and extract info + if hasattr(converted_data, 'sheetnames'): + # openpyxl Workbook + metadata['format'] = 'xlsx' + metadata['sheet_count'] = len(converted_data.sheetnames) + metadata['sheet_names'] = converted_data.sheetnames + elif hasattr(converted_data, 'sheet_names'): + # xlrd Book + metadata['format'] = 'xls' + metadata['sheet_count'] = converted_data.nsheets + metadata['sheet_names'] = converted_data.sheet_names() + + logger.debug("Excel preprocessor: pass-through, metadata=%s", metadata) + + # clean_content is the TRUE SOURCE - contains the Workbook + return PreprocessedData( + raw_content=converted_data, + clean_content=converted_data, # TRUE SOURCE - openpyxl.Workbook or xlrd.Book + encoding="utf-8", + extracted_resources={}, + metadata=metadata, + ) + + def get_format_name(self) -> str: + """Return format name.""" + return "Excel Preprocessor" + + def validate(self, data: Any) -> bool: + """Validate if data is an Excel Workbook object.""" + # openpyxl or xlrd + return hasattr(data, 'sheetnames') or hasattr(data, 'sheet_names') + + +__all__ = ['ExcelPreprocessor'] diff --git a/contextifier/core/processor/html_helper/__init__.py b/contextifier/core/processor/html_helper/__init__.py new file mode 100644 index 0000000..9cf09be --- /dev/null +++ b/contextifier/core/processor/html_helper/__init__.py @@ -0,0 +1,6 @@ +# libs/core/processor/html_helper/__init__.py +"""HTML helper module for HTML file processing.""" + +from contextifier.core.processor.html_helper.html_file_converter import HTMLFileConverter + +__all__ = ['HTMLFileConverter'] diff --git a/contextifier/core/processor/html_helper/html_file_converter.py b/contextifier/core/processor/html_helper/html_file_converter.py new file mode 100644 index 
0000000..86a63e0 --- /dev/null +++ b/contextifier/core/processor/html_helper/html_file_converter.py @@ -0,0 +1,91 @@ +# libs/core/processor/html_helper/html_file_converter.py +""" +HTMLFileConverter - HTML file format converter + +Converts binary HTML data to BeautifulSoup object. +""" +from typing import Any, Optional, BinaryIO + +from contextifier.core.functions.file_converter import BaseFileConverter + + +class HTMLFileConverter(BaseFileConverter): + """ + HTML file converter using BeautifulSoup. + + Converts binary HTML data to BeautifulSoup object. + """ + + DEFAULT_ENCODINGS = ['utf-8', 'utf-8-sig', 'cp949', 'euc-kr', 'latin-1'] + + def __init__(self, parser: str = 'html.parser'): + """ + Initialize HTMLFileConverter. + + Args: + parser: BeautifulSoup parser to use + """ + self._parser = parser + self._detected_encoding: Optional[str] = None + + def convert( + self, + file_data: bytes, + file_stream: Optional[BinaryIO] = None, + encoding: Optional[str] = None, + **kwargs + ) -> Any: + """ + Convert binary HTML data to BeautifulSoup object. 
+ + Args: + file_data: Raw binary HTML data + file_stream: Ignored + encoding: Specific encoding to use + **kwargs: Additional options + + Returns: + BeautifulSoup object + """ + from bs4 import BeautifulSoup + + # Decode to text first + text = self._decode(file_data, encoding) + return BeautifulSoup(text, self._parser) + + def _decode(self, file_data: bytes, encoding: Optional[str] = None) -> str: + """Decode bytes to string.""" + if encoding: + try: + self._detected_encoding = encoding + return file_data.decode(encoding) + except UnicodeDecodeError: + pass + + for enc in self.DEFAULT_ENCODINGS: + try: + self._detected_encoding = enc + return file_data.decode(enc) + except UnicodeDecodeError: + continue + + # Fallback + self._detected_encoding = 'utf-8' + return file_data.decode('utf-8', errors='replace') + + def get_format_name(self) -> str: + """Return format name.""" + return "HTML Document" + + def validate(self, file_data: bytes) -> bool: + """Validate if data appears to be HTML.""" + if not file_data: + return False + + header = file_data[:100].lower() + return ( + b' PreprocessedData: + """ + Preprocess the converted HTML content. 
+ + Args: + converted_data: BeautifulSoup object from HTMLFileConverter + **kwargs: Additional options + + Returns: + PreprocessedData with the BeautifulSoup object + """ + metadata: Dict[str, Any] = {} + + if hasattr(converted_data, 'find_all'): + # Count some basic elements + metadata['table_count'] = len(converted_data.find_all('table')) + metadata['image_count'] = len(converted_data.find_all('img')) + metadata['link_count'] = len(converted_data.find_all('a')) + + logger.debug("HTML preprocessor: pass-through, metadata=%s", metadata) + + return PreprocessedData( + raw_content=b"", + clean_content=b"", + encoding="utf-8", + extracted_resources={"soup": converted_data}, + metadata=metadata, + ) + + def get_format_name(self) -> str: + """Return format name.""" + return "HTML Preprocessor" + + def validate(self, data: Any) -> bool: + """Validate if data is a BeautifulSoup object.""" + return hasattr(data, 'find_all') and hasattr(data, 'get_text') + + +__all__ = ['HTMLPreprocessor'] diff --git a/contextifier/core/processor/hwp_handler.py b/contextifier/core/processor/hwp_handler.py index aed0944..a33d6fa 100644 --- a/contextifier/core/processor/hwp_handler.py +++ b/contextifier/core/processor/hwp_handler.py @@ -24,12 +24,6 @@ HWPTAG_TABLE, HwpRecord, decompress_section, - extract_metadata, - format_metadata, - find_bindata_stream, - extract_bindata_index, - extract_and_upload_image, - process_images_from_bindata, parse_doc_info, parse_table, extract_text_from_stream_raw, @@ -38,6 +32,8 @@ check_file_signature, ) from contextifier.core.processor.hwp_helper.hwp_chart_extractor import HWPChartExtractor +from contextifier.core.processor.hwp_helper.hwp_metadata import HWPMetadataExtractor +from contextifier.core.processor.hwp_helper.hwp_image_processor import HWPImageProcessor if TYPE_CHECKING: from contextifier.core.document_processor import CurrentFile @@ -48,11 +44,34 @@ class HWPHandler(BaseHandler): """HWP 5.0 OLE Format File Processing Handler Class""" - + + def 
_create_file_converter(self): + """Create HWP-specific file converter.""" + from contextifier.core.processor.hwp_helper.hwp_file_converter import HWPFileConverter + return HWPFileConverter() + + def _create_preprocessor(self): + """Create HWP-specific preprocessor.""" + from contextifier.core.processor.hwp_helper.hwp_preprocessor import HWPPreprocessor + return HWPPreprocessor() + def _create_chart_extractor(self) -> BaseChartExtractor: """Create HWP-specific chart extractor.""" return HWPChartExtractor(self._chart_processor) - + + def _create_metadata_extractor(self): + """Create HWP-specific metadata extractor.""" + return HWPMetadataExtractor() + + def _create_format_image_processor(self): + """Create HWP-specific image processor.""" + return HWPImageProcessor( + directory_path=self._image_processor.config.directory_path, + tag_prefix=self._image_processor.config.tag_prefix, + tag_suffix=self._image_processor.config.tag_suffix, + storage_backend=self._image_processor.storage_backend, + ) + def extract_text( self, current_file: "CurrentFile", @@ -61,69 +80,82 @@ def extract_text( ) -> str: """ Extract text from HWP file. 
- + Args: current_file: CurrentFile dict containing file info and binary data extract_metadata: Whether to extract metadata **kwargs: Additional options - + Returns: Extracted text """ file_path = current_file.get("file_path", "unknown") file_data = current_file.get("file_data", b"") - - # Check if it's an OLE file using bytes - if not self._is_ole_file(file_data): + + # Check if it's an OLE file using file_converter.validate() + if not self.file_converter.validate(file_data): return self._handle_non_ole_file(current_file, extract_metadata) - + text_content = [] processed_images: Set[str] = set() - + try: - # Open OLE file from stream + # Step 1: Open OLE file using file_converter file_stream = self.get_file_stream(current_file) - + # Pre-extract all charts using ChartExtractor chart_data_list = self.chart_extractor.extract_all_from_file(file_stream) - - file_stream.seek(0) - - with olefile.OleFileIO(file_stream) as ole: + + # Convert binary to OLE object using file_converter + ole = self.file_converter.convert(file_data, file_stream) + + # Step 2: Preprocess - may transform ole in the future + preprocessed = self.preprocess(ole) + ole = preprocessed.clean_content # TRUE SOURCE + + try: if extract_metadata: metadata_text = self._extract_metadata(ole) if metadata_text: text_content.append(metadata_text) text_content.append("") - + bin_data_map = self._parse_docinfo(ole) section_texts = self._extract_body_text(ole, bin_data_map, processed_images) text_content.extend(section_texts) - - image_text = process_images_from_bindata(ole, processed_images=processed_images, image_processor=self.image_processor) + + # Use format_image_processor directly + image_processor = self.format_image_processor + if hasattr(image_processor, 'process_images_from_bindata'): + image_text = image_processor.process_images_from_bindata(ole, processed_images=processed_images) + else: + image_text = "" if image_text: text_content.append("\n\n=== Extracted Images (Not Inline) ===\n") 
text_content.append(image_text) - + # Add pre-extracted charts for chart_data in chart_data_list: chart_text = self._format_chart_data(chart_data) if chart_text: text_content.append(chart_text) - + finally: + # Close OLE object using file_converter + self.file_converter.close(ole) + except Exception as e: self.logger.error(f"Error processing HWP file: {e}") return f"Error processing HWP file: {str(e)}" - + return "\n".join(text_content) - + def _format_chart_data(self, chart_data: "ChartData") -> str: """Format ChartData using ChartProcessor.""" from contextifier.core.functions.chart_extractor import ChartData - + if not isinstance(chart_data, ChartData): return "" - + if chart_data.has_data(): return self.chart_processor.format_chart_data( chart_type=chart_data.chart_type, @@ -136,68 +168,61 @@ def _format_chart_data(self, chart_data: "ChartData") -> str: chart_type=chart_data.chart_type, title=chart_data.title ) - - def _is_ole_file(self, file_data: bytes) -> bool: - """Check if file data is OLE format.""" - # OLE file signature: D0 CF 11 E0 A1 B1 1A E1 - ole_signature = b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1' - return file_data[:8] == ole_signature - + def _handle_non_ole_file(self, current_file: "CurrentFile", extract_metadata: bool) -> str: """Handle non-OLE file.""" file_path = current_file.get("file_path", "unknown") file_data = current_file.get("file_data", b"") - + # Check if it's a ZIP file (HWPX) if file_data[:4] == b'PK\x03\x04': self.logger.info(f"File {file_path} is a Zip file. 
Processing as HWPX.") - from contextifier.core.processor.hwps_handler import HWPXHandler - hwpx_handler = HWPXHandler(config=self.config, image_processor=self.image_processor) + from contextifier.core.processor.hwpx_handler import HWPXHandler + hwpx_handler = HWPXHandler(config=self.config, image_processor=self.format_image_processor) return hwpx_handler.extract_text(current_file, extract_metadata=extract_metadata) - + # Check HWP 3.0 format if b'HWP Document File' in file_data[:32]: return "[HWP 3.0 Format - Not Supported]" - + return self._process_corrupted_hwp(current_file) - + def _extract_metadata(self, ole: olefile.OleFileIO) -> str: """Extract metadata from OLE file.""" - metadata = extract_metadata(ole) - return format_metadata(metadata) - + return self.extract_and_format_metadata(ole) + def _parse_docinfo(self, ole: olefile.OleFileIO) -> Dict: """Parse DocInfo stream.""" bin_data_by_storage_id, bin_data_list = parse_doc_info(ole) return {'by_storage_id': bin_data_by_storage_id, 'by_index': bin_data_list} - + def _extract_body_text(self, ole: olefile.OleFileIO, bin_data_map: Dict, processed_images: Set[str]) -> List[str]: """Extract text from BodyText sections.""" text_content = [] - + body_text_sections = [ entry for entry in ole.listdir() if entry[0] == "BodyText" and entry[1].startswith("Section") ] body_text_sections.sort(key=lambda x: int(x[1].replace("Section", ""))) - + for section in body_text_sections: stream = ole.openstream(section) data = stream.read() - + decompressed_data, success = decompress_section(data) if not success: continue - + section_text = self._parse_section(decompressed_data, ole, bin_data_map, processed_images) - + if not section_text or not section_text.strip(): section_text = extract_text_from_stream_raw(decompressed_data) - + text_content.append(section_text) - + return text_content - + def _parse_section(self, data: bytes, ole=None, bin_data_map=None, processed_images=None) -> str: """Parse a section.""" try: @@ -206,49 
+231,49 @@ def _parse_section(self, data: bytes, ole=None, bin_data_map=None, processed_ima except Exception as e: self.logger.error(f"Error parsing HWP section: {e}") return "" - + def _traverse_tree(self, record: 'HwpRecord', ole=None, bin_data_map=None, processed_images=None) -> str: """Traverse record tree.""" parts = [] - + if record.tag_id == HWPTAG_PARA_HEADER: return self._process_paragraph(record, ole, bin_data_map, processed_images) - + if record.tag_id == HWPTAG_CTRL_HEADER: result = self._process_control(record, ole, bin_data_map, processed_images) if result: return result - + if record.tag_id == HWPTAG_SHAPE_COMPONENT_PICTURE: result = self._process_picture(record, ole, bin_data_map, processed_images) if result: return result - + if record.tag_id == HWPTAG_PARA_TEXT: text = record.get_text().replace('\x0b', '') if text: parts.append(text) - + for child in record.children: child_text = self._traverse_tree(child, ole, bin_data_map, processed_images) if child_text: parts.append(child_text) - + if record.tag_id == HWPTAG_PARA_HEADER: parts.append("\n") - + return "".join(parts) - + def _process_paragraph(self, record: 'HwpRecord', ole, bin_data_map, processed_images) -> str: """Process PARA_HEADER record.""" parts = [] - + text_rec = next((c for c in record.children if c.tag_id == HWPTAG_PARA_TEXT), None) text_content = text_rec.get_text() if text_rec else "" - + control_tags = [HWPTAG_CTRL_HEADER, HWPTAG_TABLE] controls = [c for c in record.children if c.tag_id in control_tags] - + if '\x0b' in text_content: segments = text_content.split('\x0b') for i, segment in enumerate(segments): @@ -261,25 +286,25 @@ def _process_paragraph(self, record: 'HwpRecord', ole, bin_data_map, processed_i parts.append(text_content) for c in controls: parts.append(self._traverse_tree(c, ole, bin_data_map, processed_images)) - + parts.append("\n") return "".join(parts) - + def _process_control(self, record: 'HwpRecord', ole, bin_data_map, processed_images) -> Optional[str]: 
"""Process CTRL_HEADER record.""" if len(record.payload) < 4: return None - + ctrl_id = record.payload[:4][::-1] - + if ctrl_id == b'tbl ': return parse_table(record, self._traverse_tree, ole, bin_data_map, processed_images) - + if ctrl_id == b'gso ': return self._process_gso(record, ole, bin_data_map, processed_images) - + return None - + def _process_gso(self, record: 'HwpRecord', ole, bin_data_map, processed_images) -> Optional[str]: """Process GSO (Graphic Shape Object) record.""" def find_pictures(rec): @@ -289,7 +314,7 @@ def find_pictures(rec): for child in rec.children: results.extend(find_pictures(child)) return results - + pictures = find_pictures(record) if pictures: image_parts = [] @@ -299,74 +324,77 @@ def find_pictures(rec): image_parts.append(img_result) if image_parts: return "".join(image_parts) - + return None - + def _process_picture(self, record: 'HwpRecord', ole, bin_data_map, processed_images) -> Optional[str]: """Process SHAPE_COMPONENT_PICTURE record.""" if not bin_data_map or not ole: return None - + bin_data_list = bin_data_map.get('by_index', []) if not bin_data_list: return None - - bindata_index = extract_bindata_index(record.payload, len(bin_data_list)) - + + image_processor = self.format_image_processor + + # Use image processor methods directly + bindata_index = image_processor.extract_bindata_index(record.payload, len(bin_data_list)) + if bindata_index and 0 < bindata_index <= len(bin_data_list): storage_id, ext = bin_data_list[bindata_index - 1] if storage_id > 0: - target_stream = find_bindata_stream(ole, storage_id, ext) + target_stream = image_processor.find_bindata_stream(ole, storage_id, ext) if target_stream: - return extract_and_upload_image(ole, target_stream, processed_images, image_processor=self.image_processor) - + return image_processor.extract_and_save_image(ole, target_stream, processed_images) + if len(bin_data_list) == 1: storage_id, ext = bin_data_list[0] if storage_id > 0: - target_stream = 
find_bindata_stream(ole, storage_id, ext) + target_stream = image_processor.find_bindata_stream(ole, storage_id, ext) if target_stream: - return extract_and_upload_image(ole, target_stream, processed_images, image_processor=self.image_processor) - + return image_processor.extract_and_save_image(ole, target_stream, processed_images) + return None - + def _process_corrupted_hwp(self, current_file: "CurrentFile") -> str: """Attempt forensic recovery of corrupted HWP file.""" file_path = current_file.get("file_path", "unknown") file_data = current_file.get("file_data", b"") - + self.logger.info(f"Starting forensic recovery for: {file_path}") text_content = [] - + try: raw_data = file_data - + file_type = check_file_signature(raw_data) if file_type == "HWP3.0": return "[HWP 3.0 Format - Not Supported]" - + zlib_chunks = find_zlib_streams(raw_data, min_size=50) - + for offset, decompressed in zlib_chunks: parsed_text = self._parse_section(decompressed) if not parsed_text or not parsed_text.strip(): parsed_text = extract_text_from_stream_raw(decompressed) if parsed_text and len(parsed_text.strip()) > 0: text_content.append(parsed_text) - + if not text_content: plain_text = extract_text_from_stream_raw(raw_data) if plain_text and len(plain_text) > 100: text_content.append(plain_text) - - image_text = recover_images_from_raw(raw_data, image_processor=self.image_processor) + + image_text = recover_images_from_raw(raw_data, image_processor=self.format_image_processor) if image_text: text_content.append(f"\n\n=== Recovered Images ===\n{image_text}") - + except Exception as e: self.logger.error(f"Forensic recovery failed: {e}") return f"Forensic recovery failed: {str(e)}" - + if not text_content: return "[Forensic Recovery: No text found]" - + return "\n".join(text_content) diff --git a/contextifier/core/processor/hwp_helper/__init__.py b/contextifier/core/processor/hwp_helper/__init__.py index 519af8a..6e44835 100644 --- a/contextifier/core/processor/hwp_helper/__init__.py +++ 
b/contextifier/core/processor/hwp_helper/__init__.py @@ -45,22 +45,12 @@ # Metadata from contextifier.core.processor.hwp_helper.hwp_metadata import ( - extract_metadata, + HWPMetadataExtractor, parse_hwp_summary_information, - format_metadata, - MetadataHelper, ) -# Image -from contextifier.core.processor.hwp_helper.hwp_image import ( - try_decompress_image, - save_image_to_local, - find_bindata_stream, - extract_bindata_index, - extract_and_upload_image, - process_images_from_bindata, - ImageHelper, -) +# Image Processor (replaces hwp_image.py utility functions) +from contextifier.core.processor.hwp_helper.hwp_image_processor import HWPImageProcessor # Chart Extractor from contextifier.core.processor.hwp_helper.hwp_chart_extractor import HWPChartExtractor @@ -109,18 +99,10 @@ 'decompress_stream', 'decompress_section', # Metadata - 'extract_metadata', + 'HWPMetadataExtractor', 'parse_hwp_summary_information', - 'format_metadata', - 'MetadataHelper', - # Image - 'try_decompress_image', - 'save_image_to_local', - 'find_bindata_stream', - 'extract_bindata_index', - 'extract_and_upload_image', - 'process_images_from_bindata', - 'ImageHelper', + # Image Processor + 'HWPImageProcessor', # Chart Extractor 'HWPChartExtractor', # DocInfo diff --git a/contextifier/core/processor/hwp_helper/hwp_file_converter.py b/contextifier/core/processor/hwp_helper/hwp_file_converter.py new file mode 100644 index 0000000..ed5059f --- /dev/null +++ b/contextifier/core/processor/hwp_helper/hwp_file_converter.py @@ -0,0 +1,59 @@ +# libs/core/processor/hwp_helper/hwp_file_converter.py +""" +HWPFileConverter - HWP file format converter + +Converts binary HWP data to OLE file object. +""" +from io import BytesIO +from typing import Any, Optional, BinaryIO + +from contextifier.core.functions.file_converter import BaseFileConverter + + +class HWPFileConverter(BaseFileConverter): + """ + HWP file converter using olefile. + + Converts binary HWP (OLE format) data to OleFileIO object. 
+ """ + + # OLE magic number + OLE_MAGIC = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1' + + def convert( + self, + file_data: bytes, + file_stream: Optional[BinaryIO] = None, + **kwargs + ) -> Any: + """ + Convert binary HWP data to OleFileIO object. + + Args: + file_data: Raw binary HWP data + file_stream: Optional file stream + **kwargs: Additional options + + Returns: + olefile.OleFileIO object + """ + import olefile + + stream = file_stream if file_stream is not None else BytesIO(file_data) + stream.seek(0) + return olefile.OleFileIO(stream) + + def get_format_name(self) -> str: + """Return format name.""" + return "HWP Document (OLE)" + + def validate(self, file_data: bytes) -> bool: + """Validate if data is a valid OLE file.""" + if not file_data or len(file_data) < 8: + return False + return file_data[:8] == self.OLE_MAGIC + + def close(self, converted_object: Any) -> None: + """Close the OLE file.""" + if converted_object is not None and hasattr(converted_object, 'close'): + converted_object.close() diff --git a/contextifier/core/processor/hwp_helper/hwp_image.py b/contextifier/core/processor/hwp_helper/hwp_image.py deleted file mode 100644 index 3b08cc5..0000000 --- a/contextifier/core/processor/hwp_helper/hwp_image.py +++ /dev/null @@ -1,293 +0,0 @@ -# libs/core/processor/hwp_helper/hwp_image.py -""" -HWP 이미지 처리 유틸리티 - -HWP 5.0 OLE 파일에서 이미지를 추출하고 로컬에 저장합니다. 
-- try_decompress_image: zlib 압축 이미지 해제 -- find_bindata_stream: BinData 스트림 경로 찾기 -- extract_bindata_index: SHAPE_COMPONENT_PICTURE에서 BinData 인덱스 추출 -- extract_and_upload_image: 이미지 추출 및 로컬 저장 -- process_images_from_bindata: BinData에서 모든 이미지 추출 -""" -import io -import os -import zlib -import struct -import logging -import traceback -from typing import Optional, List, Dict, Set - -import olefile -from PIL import Image - -from contextifier.core.functions.img_processor import ImageProcessor - -logger = logging.getLogger("document-processor") - - -def try_decompress_image(data: bytes) -> bytes: - """ - HWP 이미지 데이터 압축 해제를 시도합니다. - - HWP 파일에서 이미지가 zlib으로 압축되어 있을 수 있으므로, - 다양한 전략으로 압축 해제를 시도합니다. - - Args: - data: 원본 이미지 데이터 (압축되었을 수 있음) - - Returns: - 압축 해제된 이미지 데이터 (또는 원본 데이터) - """ - # 1. zlib 헤더가 있으면 zlib 압축 해제 시도 - if data.startswith(b'\x78'): - try: - return zlib.decompress(data) - except Exception: - pass - - # 2. 이미 유효한 이미지인지 확인 - try: - with Image.open(io.BytesIO(data)) as img: - img.verify() - return data # 유효한 이미지 - except Exception: - pass - - # 3. raw deflate (헤더 없음) 시도 - try: - return zlib.decompress(data, -15) - except Exception: - pass - - return data - - -def save_image_to_local( - image_data: bytes, - image_processor: ImageProcessor -) -> Optional[str]: - """ - 이미지를 로컬에 저장합니다. - - Args: - image_data: 이미지 바이너리 데이터 - image_processor: 이미지 프로세서 인스턴스 - - Returns: - 이미지 태그 문자열 또는 None - """ - return image_processor.save_image(image_data) - - -def find_bindata_stream(ole: olefile.OleFileIO, storage_id: int, ext: str) -> Optional[List[str]]: - """ - OLE 컨테이너에서 storage_id와 확장자로 BinData 스트림을 찾습니다. 
- - Args: - ole: OLE 파일 객체 - storage_id: BinData 스토리지 ID - ext: 파일 확장자 - - Returns: - 찾은 스트림 경로 또는 None - """ - ole_dirs = ole.listdir() - - candidates = [ - f"BinData/BIN{storage_id:04X}.{ext}", - f"BinData/BIN{storage_id:04x}.{ext}", - f"BinData/Bin{storage_id:04X}.{ext}", - f"BinData/Bin{storage_id:04x}.{ext}", - f"BinData/BIN{storage_id:04X}.{ext.lower()}", - f"BinData/BIN{storage_id:04x}.{ext.lower()}", - ] - - # 패턴 매칭으로 찾기 - for entry in ole_dirs: - if entry[0] == "BinData" and len(entry) > 1: - fname = entry[1].lower() - expected_patterns = [ - f"bin{storage_id:04x}", - f"bin{storage_id:04X}", - ] - for pattern in expected_patterns: - if pattern.lower() in fname.lower(): - logger.debug(f"Found stream by pattern match: {entry}") - return entry - - # 정확한 경로 매칭 - for candidate in candidates: - candidate_parts = candidate.split('/') - if candidate_parts in ole_dirs: - return candidate_parts - - # 대소문자 무시 매칭 - for entry in ole_dirs: - if entry[0] == "BinData" and len(entry) > 1: - fname = entry[1] - for candidate in candidates: - if fname.lower() == candidate.split('/')[-1].lower(): - return entry - - return None - - -def extract_bindata_index(payload: bytes, bin_data_list_len: int) -> Optional[int]: - """ - SHAPE_COMPONENT_PICTURE 레코드 payload에서 BinData 인덱스를 추출합니다. - - 여러 HWP 버전 호환을 위해 다양한 오프셋 전략을 시도합니다. - - Args: - payload: SHAPE_COMPONENT_PICTURE 레코드의 payload - bin_data_list_len: bin_data_list의 길이 (유효 범위 검증용) - - Returns: - BinData 인덱스 (1-based) 또는 None - """ - if bin_data_list_len == 0: - return None - - bindata_index = None - - # Strategy 1: 오프셋 79 (HWP 5.0.3.x+ 스펙) - if len(payload) >= 81: - test_id = struct.unpack('= 10: - test_id = struct.unpack('= offset + 2: - test_id = struct.unpack(' Optional[str]: - """ - OLE 스트림에서 이미지를 추출하여 로컬에 저장합니다. 
- - Args: - ole: OLE 파일 객체 - target_stream: 스트림 경로 - processed_images: 처리된 이미지 경로 집합 - image_processor: 이미지 프로세서 인스턴스 - - Returns: - 이미지 태그 문자열 또는 None - """ - try: - stream = ole.openstream(target_stream) - image_data = stream.read() - image_data = try_decompress_image(image_data) - - image_tag = save_image_to_local(image_data, image_processor) - if image_tag: - if processed_images is not None: - processed_images.add("/".join(target_stream)) - logger.info(f"Successfully extracted inline image: {image_tag}") - return f"\n{image_tag}\n" - except Exception as e: - logger.warning(f"Failed to process inline HWP image {target_stream}: {e}") - logger.debug(traceback.format_exc()) - - return None - - -def process_images_from_bindata( - ole: olefile.OleFileIO, - processed_images: Optional[Set[str]], - image_processor: ImageProcessor -) -> str: - """ - BinData 스토리지에서 이미지를 추출하여 로컬에 저장합니다. - - Args: - ole: OLE 파일 객체 - processed_images: 이미 처리된 이미지 경로 집합 (스킵용) - image_processor: 이미지 프로세서 인스턴스 - - Returns: - 이미지 태그들을 결합한 문자열 - """ - results = [] - - try: - bindata_streams = [ - entry for entry in ole.listdir() - if entry[0] == "BinData" - ] - - for stream_path in bindata_streams: - if processed_images and "/".join(stream_path) in processed_images: - continue - - stream_name = stream_path[-1] - ext = os.path.splitext(stream_name)[1].lower() - if ext in ['.jpg', '.jpeg', '.png', '.bmp', '.gif']: - stream = ole.openstream(stream_path) - image_data = stream.read() - image_data = try_decompress_image(image_data) - - image_tag = save_image_to_local(image_data, image_processor) - if image_tag: - results.append(image_tag) - - except Exception as e: - logger.warning(f"Error processing HWP images: {e}") - - return "\n\n".join(results) - - -class ImageHelper: - """HWP 이미지 처리 유틸리티""" - - @staticmethod - def try_decompress_image(data: bytes) -> bytes: - return try_decompress_image(data) - - @staticmethod - def save_image_to_local( - image_data: bytes, - image_processor: ImageProcessor - ) -> 
Optional[str]: - return save_image_to_local(image_data, image_processor) - - -__all__ = [ - 'try_decompress_image', - 'save_image_to_local', - 'find_bindata_stream', - 'extract_bindata_index', - 'extract_and_upload_image', - 'process_images_from_bindata', - 'ImageHelper', -] diff --git a/contextifier/core/processor/hwp_helper/hwp_image_processor.py b/contextifier/core/processor/hwp_helper/hwp_image_processor.py new file mode 100644 index 0000000..40c0f5a --- /dev/null +++ b/contextifier/core/processor/hwp_helper/hwp_image_processor.py @@ -0,0 +1,413 @@ +# contextifier/core/processor/hwp_helper/hwp_image_processor.py +""" +HWP Image Processor + +Provides HWP-specific image processing that inherits from ImageProcessor. +Handles BinData stream images and embedded images in HWP 5.0 OLE format. + +This class consolidates all HWP image extraction logic including: +- zlib decompression for compressed images +- BinData stream finding and extraction +- OLE storage image processing +""" +import io +import os +import zlib +import struct +import logging +from typing import Any, Dict, List, Optional, Set, TYPE_CHECKING + +from PIL import Image + +from contextifier.core.functions.img_processor import ImageProcessor +from contextifier.core.functions.storage_backend import BaseStorageBackend + +if TYPE_CHECKING: + import olefile + +logger = logging.getLogger("contextify.image_processor.hwp") + + +class HWPImageProcessor(ImageProcessor): + """ + HWP-specific image processor. + + Inherits from ImageProcessor and provides HWP-specific processing. 
+ + Handles: + - BinData stream images + - Compressed images (zlib) + - Embedded OLE images + + Example: + processor = HWPImageProcessor() + + # Process BinData image + tag = processor.process_image(image_data, bindata_id="BIN0001") + + # Process from OLE stream + tag = processor.process_bindata_stream(ole, stream_path) + """ + + def __init__( + self, + directory_path: str = "temp/images", + tag_prefix: str = "[Image:", + tag_suffix: str = "]", + storage_backend: Optional[BaseStorageBackend] = None, + ): + """ + Initialize HWPImageProcessor. + + Args: + directory_path: Image save directory + tag_prefix: Tag prefix for image references + tag_suffix: Tag suffix for image references + storage_backend: Storage backend for saving images + """ + super().__init__( + directory_path=directory_path, + tag_prefix=tag_prefix, + tag_suffix=tag_suffix, + storage_backend=storage_backend, + ) + + def process_image( + self, + image_data: bytes, + bindata_id: Optional[str] = None, + image_index: Optional[int] = None, + **kwargs + ) -> Optional[str]: + """ + Process and save HWP image data. + + Args: + image_data: Raw image binary data + bindata_id: BinData ID (e.g., "BIN0001") + image_index: Image index (for naming) + **kwargs: Additional options + + Returns: + Image tag string, or None on failure + """ + custom_name = None + if bindata_id is not None: + custom_name = f"hwp_{bindata_id}" + elif image_index is not None: + custom_name = f"hwp_image_{image_index}" + + return self.save_image(image_data, custom_name=custom_name) + + def process_bindata_stream( + self, + ole: "olefile.OleFileIO", + stream_path: str, + is_compressed: bool = True, + ) -> Optional[str]: + """ + Process image from HWP BinData OLE stream. 
+ + Args: + ole: OleFileIO object + stream_path: Path to BinData stream + is_compressed: Whether data is zlib compressed + + Returns: + Image tag string, or None on failure + """ + try: + import zlib + + stream_data = ole.openstream(stream_path).read() + + if is_compressed: + try: + image_data = zlib.decompress(stream_data, -15) + except zlib.error: + # Try without negative windowBits + try: + image_data = zlib.decompress(stream_data) + except zlib.error: + # Not compressed after all + image_data = stream_data + else: + image_data = stream_data + + # Extract bindata ID from path + bindata_id = stream_path.split('/')[-1] if '/' in stream_path else stream_path + + return self.process_image(image_data, bindata_id=bindata_id) + + except Exception as e: + self._logger.warning(f"Failed to process BinData stream {stream_path}: {e}") + return None + + def process_embedded_image( + self, + image_data: bytes, + image_name: Optional[str] = None, + bindata_id: Optional[str] = None, + **kwargs + ) -> Optional[str]: + """ + Process embedded HWP image. + + Args: + image_data: Image binary data + image_name: Original image filename + bindata_id: BinData ID + **kwargs: Additional options + + Returns: + Image tag string, or None on failure + """ + custom_name = image_name + if custom_name is None and bindata_id is not None: + custom_name = f"hwp_embed_{bindata_id}" + + return self.save_image(image_data, custom_name=custom_name) + + def decompress_and_process( + self, + compressed_data: bytes, + bindata_id: Optional[str] = None, + ) -> Optional[str]: + """ + Decompress and process zlib-compressed image data. + + Args: + compressed_data: zlib compressed image data + bindata_id: BinData ID + + Returns: + Image tag string, or None on failure + """ + image_data = self.try_decompress_image(compressed_data) + return self.process_image(image_data, bindata_id=bindata_id) + + @staticmethod + def try_decompress_image(data: bytes) -> bytes: + """ + Attempt to decompress HWP image data. 
+ + HWP files may contain zlib-compressed images, so this method + tries various decompression strategies. + + Args: + data: Original image data (possibly compressed) + + Returns: + Decompressed image data (or original if not compressed) + """ + # 1. Try zlib decompression if zlib header present + if data.startswith(b'\x78'): + try: + return zlib.decompress(data) + except Exception: + pass + + # 2. Check if already a valid image + try: + with Image.open(io.BytesIO(data)) as img: + img.verify() + return data # Valid image + except Exception: + pass + + # 3. Try raw deflate (no header) + try: + return zlib.decompress(data, -15) + except Exception: + pass + + return data + + @staticmethod + def find_bindata_stream(ole: "olefile.OleFileIO", storage_id: int, ext: str) -> Optional[List[str]]: + """ + Find BinData stream in OLE container by storage_id and extension. + + Args: + ole: OLE file object + storage_id: BinData storage ID + ext: File extension + + Returns: + Stream path if found, None otherwise + """ + ole_dirs = ole.listdir() + + candidates = [ + f"BinData/BIN{storage_id:04X}.{ext}", + f"BinData/BIN{storage_id:04x}.{ext}", + f"BinData/Bin{storage_id:04X}.{ext}", + f"BinData/Bin{storage_id:04x}.{ext}", + f"BinData/BIN{storage_id:04X}.{ext.lower()}", + f"BinData/BIN{storage_id:04x}.{ext.lower()}", + ] + + # Pattern matching + for entry in ole_dirs: + if entry[0] == "BinData" and len(entry) > 1: + fname = entry[1].lower() + expected_patterns = [ + f"bin{storage_id:04x}", + f"bin{storage_id:04X}", + ] + for pattern in expected_patterns: + if pattern.lower() in fname.lower(): + logger.debug(f"Found stream by pattern match: {entry}") + return entry + + # Exact path matching + for candidate in candidates: + candidate_parts = candidate.split('/') + if candidate_parts in ole_dirs: + return candidate_parts + + # Case-insensitive matching + for entry in ole_dirs: + if entry[0] == "BinData" and len(entry) > 1: + fname = entry[1] + for candidate in candidates: + if 
 fname.lower() == candidate.split('/')[-1].lower(): + return entry + + return None + + @staticmethod + def extract_bindata_index(payload: bytes, bin_data_list_len: int) -> Optional[int]: + """ + Extract BinData index from SHAPE_COMPONENT_PICTURE record payload. + + Tries various offset strategies for compatibility with different HWP versions. + + Args: + payload: SHAPE_COMPONENT_PICTURE record payload + bin_data_list_len: Length of bin_data_list (for validation) + + Returns: + BinData index (1-based) or None + """ + if bin_data_list_len == 0: + return None + + bindata_index = None + + # NOTE(review): the strategy blocks below were reconstructed from a corrupted + # diff (the '<H' struct format strings and adjacent lines were stripped); + # verify the exact offsets against the original source before applying. + # Strategy 1: offset 79 (HWP 5.0.3.x+ spec) + if len(payload) >= 81: + test_id = struct.unpack('<H', payload[79:81])[0] + if 1 <= test_id <= bin_data_list_len: + bindata_index = test_id + + # Strategy 2: trailing 2 bytes of the payload + if bindata_index is None and len(payload) >= 10: + test_id = struct.unpack('<H', payload[-2:])[0] + if 1 <= test_id <= bin_data_list_len: + bindata_index = test_id + + # Strategy 3: scan candidate offsets (older HWP layouts) + if bindata_index is None: + for offset in (69, 71, 73, 75, 77): + if len(payload) >= offset + 2: + test_id = struct.unpack('<H', payload[offset:offset + 2])[0] + if 1 <= test_id <= bin_data_list_len: + bindata_index = test_id + break + + return bindata_index + + def extract_and_upload_image( + self, + ole: "olefile.OleFileIO", + target_stream: List[str], + processed_images: Optional[Set[str]] = None, + ) -> Optional[str]: + """ + Extract image from OLE stream and save locally. + + Args: + ole: OLE file object + target_stream: Stream path + processed_images: Set of processed image paths + + Returns: + Image tag string or None + """ + try: + stream = ole.openstream(target_stream) + image_data = stream.read() + image_data = self.try_decompress_image(image_data) + + bindata_id = target_stream[-1] if target_stream else None + image_tag = self.process_image(image_data, bindata_id=bindata_id) + + if image_tag: + if processed_images is not None: + processed_images.add("/".join(target_stream)) + logger.info(f"Successfully extracted inline image: {image_tag}") + return f"\n{image_tag}\n" + except Exception as e: + logger.warning(f"Failed to process inline HWP image {target_stream}: {e}") + + return None + + def process_images_from_bindata( + self, + ole: "olefile.OleFileIO", + processed_images: Optional[Set[str]] = None, + ) -> str: + """ + Extract images from BinData storage and save locally. 
+ + Args: + ole: OLE file object + processed_images: Set of already processed image paths (to skip) + + Returns: + Joined image tag strings + """ + results = [] + + try: + bindata_streams = [ + entry for entry in ole.listdir() + if entry[0] == "BinData" + ] + + for stream_path in bindata_streams: + if processed_images and "/".join(stream_path) in processed_images: + continue + + stream_name = stream_path[-1] + ext = os.path.splitext(stream_name)[1].lower() + if ext in ['.jpg', '.jpeg', '.png', '.bmp', '.gif']: + stream = ole.openstream(stream_path) + image_data = stream.read() + image_data = self.try_decompress_image(image_data) + + bindata_id = stream_name + image_tag = self.process_image(image_data, bindata_id=bindata_id) + if image_tag: + results.append(image_tag) + + except Exception as e: + logger.warning(f"Error processing HWP images: {e}") + + return "\n\n".join(results) + + +__all__ = ["HWPImageProcessor"] diff --git a/contextifier/core/processor/hwp_helper/hwp_metadata.py b/contextifier/core/processor/hwp_helper/hwp_metadata.py index 938f270..438ae0d 100644 --- a/contextifier/core/processor/hwp_helper/hwp_metadata.py +++ b/contextifier/core/processor/hwp_helper/hwp_metadata.py @@ -1,99 +1,131 @@ -# service/document_processor/processor/hwp_helper/hwp_metadata.py +# contextifier/core/processor/hwp_helper/hwp_metadata.py """ -HWP 메타데이터 추출 유틸리티 +HWP Metadata Extraction Module -HWP 5.0 OLE 파일에서 메타데이터를 추출합니다. -- extract_metadata: OLE 표준 메타데이터 + HwpSummaryInformation 추출 -- parse_hwp_summary_information: HWP 고유 Property Set 파싱 -- format_metadata: 메타데이터를 문자열로 포맷팅 +Provides HWPMetadataExtractor class for extracting metadata from HWP 5.0 OLE files. +Implements BaseMetadataExtractor interface. + +Extraction methods: +1. olefile's get_metadata() - OLE standard metadata +2. HwpSummaryInformation stream direct parsing - HWP-specific metadata + +Note: HWP is a Korean-native document format, so Korean metadata labels +are preserved in output for proper display. 
""" import struct import logging from datetime import datetime -from typing import Dict, Any +from typing import Dict, Any, Optional import olefile +from contextifier.core.functions.metadata_extractor import ( + BaseMetadataExtractor, + DocumentMetadata, +) + logger = logging.getLogger("document-processor") -def extract_metadata(ole: olefile.OleFileIO) -> Dict[str, Any]: +class HWPMetadataExtractor(BaseMetadataExtractor): """ - HWP 파일의 메타데이터를 추출합니다. + HWP Metadata Extractor. - 두 가지 방법으로 메타데이터를 추출합니다: - 1. olefile의 get_metadata() - OLE 표준 메타데이터 - 2. HwpSummaryInformation 스트림 직접 파싱 - HWP 고유 메타데이터 + Extracts metadata from olefile OleFileIO objects. + Supports both OLE standard metadata and HWP-specific HwpSummaryInformation. - Args: - ole: OLE 파일 객체 - - Returns: - 추출된 메타데이터 딕셔너리 + Supported fields: + - title, subject, author, keywords, comments + - last_saved_by, create_time, last_saved_time + + Usage: + extractor = HWPMetadataExtractor() + metadata = extractor.extract(ole_file) + text = extractor.format(metadata) """ - metadata = {} - # Method 1: olefile의 get_metadata() 사용 - try: - ole_meta = ole.get_metadata() - - if ole_meta: - if ole_meta.title: - metadata['title'] = ole_meta.title - if ole_meta.subject: - metadata['subject'] = ole_meta.subject - if ole_meta.author: - metadata['author'] = ole_meta.author - if ole_meta.keywords: - metadata['keywords'] = ole_meta.keywords - if ole_meta.comments: - metadata['comments'] = ole_meta.comments - if ole_meta.last_saved_by: - metadata['last_saved_by'] = ole_meta.last_saved_by - if ole_meta.create_time: - metadata['create_time'] = ole_meta.create_time - if ole_meta.last_saved_time: - metadata['last_saved_time'] = ole_meta.last_saved_time + def extract(self, source: olefile.OleFileIO) -> DocumentMetadata: + """ + Extract metadata from HWP file. - logger.info(f"Extracted OLE metadata: {metadata}") + Args: + source: olefile OleFileIO object + + Returns: + DocumentMetadata instance containing extracted metadata. 
+ """ + metadata_dict: Dict[str, Any] = {} - except Exception as e: - logger.warning(f"Failed to extract OLE metadata: {e}") - - # Method 2: HwpSummaryInformation 스트림 직접 파싱 - try: - hwp_summary_stream = '\x05HwpSummaryInformation' - if ole.exists(hwp_summary_stream): - logger.debug("Found HwpSummaryInformation stream, attempting to parse...") - stream = ole.openstream(hwp_summary_stream) - data = stream.read() - hwp_meta = parse_hwp_summary_information(data) + # Method 1: Use olefile's get_metadata() + try: + ole_meta = source.get_metadata() - # HWP 특화 메타데이터가 우선 - for key, value in hwp_meta.items(): - if value: - metadata[key] = value - - except Exception as e: - logger.debug(f"Failed to parse HwpSummaryInformation: {e}") - - return metadata + if ole_meta: + if ole_meta.title: + metadata_dict['title'] = ole_meta.title + if ole_meta.subject: + metadata_dict['subject'] = ole_meta.subject + if ole_meta.author: + metadata_dict['author'] = ole_meta.author + if ole_meta.keywords: + metadata_dict['keywords'] = ole_meta.keywords + if ole_meta.comments: + metadata_dict['comments'] = ole_meta.comments + if ole_meta.last_saved_by: + metadata_dict['last_saved_by'] = ole_meta.last_saved_by + if ole_meta.create_time: + metadata_dict['create_time'] = ole_meta.create_time + if ole_meta.last_saved_time: + metadata_dict['last_saved_time'] = ole_meta.last_saved_time + + self.logger.debug(f"Extracted OLE metadata: {list(metadata_dict.keys())}") + + except Exception as e: + self.logger.warning(f"Failed to extract OLE metadata: {e}") + + # Method 2: Parse HwpSummaryInformation stream directly + try: + hwp_summary_stream = '\x05HwpSummaryInformation' + if source.exists(hwp_summary_stream): + self.logger.debug("Found HwpSummaryInformation stream, attempting to parse...") + stream = source.openstream(hwp_summary_stream) + data = stream.read() + hwp_meta = parse_hwp_summary_information(data) + + # HWP-specific metadata takes priority + for key, value in hwp_meta.items(): + if value: + 
metadata_dict[key] = value + + except Exception as e: + self.logger.debug(f"Failed to parse HwpSummaryInformation: {e}") + + return DocumentMetadata( + title=metadata_dict.get('title'), + subject=metadata_dict.get('subject'), + author=metadata_dict.get('author'), + keywords=metadata_dict.get('keywords'), + comments=metadata_dict.get('comments'), + last_saved_by=metadata_dict.get('last_saved_by'), + create_time=metadata_dict.get('create_time'), + last_saved_time=metadata_dict.get('last_saved_time'), + ) def parse_hwp_summary_information(data: bytes) -> Dict[str, Any]: """ - HwpSummaryInformation 스트림을 파싱합니다. (OLE Property Set 형식) + Parse HwpSummaryInformation stream (OLE Property Set format). - OLE Property Set 구조: + OLE Property Set structure: - Header (28 bytes) - Section(s) containing property ID/offset pairs - - Property values (string, datetime 등) + - Property values (string, datetime, etc.) Args: - data: HwpSummaryInformation 스트림 바이너리 데이터 + data: HwpSummaryInformation stream binary data Returns: - 파싱된 메타데이터 딕셔너리 + Dictionary containing parsed metadata. 
""" metadata = {} @@ -103,7 +135,7 @@ def parse_hwp_summary_information(data: bytes) -> Dict[str, Any]: pos = 0 _byte_order = struct.unpack(' Dict[str, Any]: if section_offset >= len(data): return metadata - # Section 파싱 + # Parse section pos = section_offset if len(data) < pos + 8: return metadata @@ -123,7 +155,7 @@ def parse_hwp_summary_information(data: bytes) -> Dict[str, Any]: num_properties = struct.unpack(' Dict[str, Any]: properties.append((prop_id, prop_offset)) pos += 8 - # Property 값 읽기 + # Read property values for prop_id, prop_offset in properties: abs_offset = section_offset + prop_offset if abs_offset + 4 >= len(data): @@ -173,7 +205,7 @@ def parse_hwp_summary_information(data: bytes) -> Dict[str, Any]: except Exception: pass - # Property ID 매핑 + # Property ID mapping if value: if prop_id == 0x02: metadata['title'] = value @@ -198,59 +230,7 @@ def parse_hwp_summary_information(data: bytes) -> Dict[str, Any]: return metadata -def format_metadata(metadata: Dict[str, Any]) -> str: - """ - 메타데이터 딕셔너리를 읽기 쉬운 문자열로 포맷팅합니다. 
- - Args: - metadata: 메타데이터 딕셔너리 - - Returns: - 포맷팅된 메타데이터 문자열 - """ - if not metadata: - return "" - - lines = [""] - - field_names = { - 'title': '제목', - 'subject': '주제', - 'author': '작성자', - 'keywords': '키워드', - 'comments': '설명', - 'last_saved_by': '마지막 저장자', - 'create_time': '작성일', - 'last_saved_time': '수정일', - } - - for key, label in field_names.items(): - if key in metadata and metadata[key]: - value = metadata[key] - - # Format datetime objects - if isinstance(value, datetime): - value = value.strftime('%Y-%m-%d %H:%M:%S') - - lines.append(f" {label}: {value}") - - lines.append("") - - return "\n".join(lines) - - -# 하위 호환성을 위한 클래스 래퍼 -class MetadataHelper: - """메타데이터 처리 관련 유틸리티 (하위 호환성)""" - - @staticmethod - def format_metadata(metadata: Dict[str, Any]) -> str: - return format_metadata(metadata) - - __all__ = [ - 'extract_metadata', + 'HWPMetadataExtractor', 'parse_hwp_summary_information', - 'format_metadata', - 'MetadataHelper', ] diff --git a/contextifier/core/processor/hwp_helper/hwp_preprocessor.py b/contextifier/core/processor/hwp_helper/hwp_preprocessor.py new file mode 100644 index 0000000..986ee2e --- /dev/null +++ b/contextifier/core/processor/hwp_helper/hwp_preprocessor.py @@ -0,0 +1,82 @@ +# contextifier/core/processor/hwp_helper/hwp_preprocessor.py +""" +HWP Preprocessor - Process HWP OLE document after conversion. + +Processing Pipeline Position: + 1. HWPFileConverter.convert() → olefile.OleFileIO + 2. HWPPreprocessor.preprocess() → PreprocessedData (THIS STEP) + 3. HWPMetadataExtractor.extract() → DocumentMetadata + 4. Content extraction (body text, tables, images) + +Current Implementation: + - Pass-through (HWP uses olefile object directly) +""" +import logging +from typing import Any, Dict + +from contextifier.core.functions.preprocessor import ( + BasePreprocessor, + PreprocessedData, +) + +logger = logging.getLogger("contextify.hwp.preprocessor") + + +class HWPPreprocessor(BasePreprocessor): + """ + HWP OLE Document Preprocessor. 
+ + Currently a pass-through implementation as HWP processing + is handled during the content extraction phase using olefile. + """ + + def preprocess( + self, + converted_data: Any, + **kwargs + ) -> PreprocessedData: + """ + Preprocess the converted HWP OLE document. + + Args: + converted_data: olefile.OleFileIO object from HWPFileConverter + **kwargs: Additional options + + Returns: + PreprocessedData with the OLE object and any extracted resources + """ + metadata: Dict[str, Any] = {} + + if hasattr(converted_data, 'listdir'): + try: + streams = converted_data.listdir() + metadata['stream_count'] = len(streams) + # Check for common HWP streams + has_body = any('BodyText' in '/'.join(s) for s in streams) + has_docinfo = any('DocInfo' in '/'.join(s) for s in streams) + metadata['has_body_text'] = has_body + metadata['has_doc_info'] = has_docinfo + except Exception: + pass + + logger.debug("HWP preprocessor: pass-through, metadata=%s", metadata) + + # clean_content is the TRUE SOURCE - contains the OLE object + return PreprocessedData( + raw_content=converted_data, + clean_content=converted_data, # TRUE SOURCE - olefile.OleFileIO + encoding="utf-8", + extracted_resources={}, + metadata=metadata, + ) + + def get_format_name(self) -> str: + """Return format name.""" + return "HWP Preprocessor" + + def validate(self, data: Any) -> bool: + """Validate if data is an OLE file object.""" + return hasattr(data, 'listdir') and hasattr(data, 'openstream') + + +__all__ = ['HWPPreprocessor'] diff --git a/contextifier/core/processor/hwps_handler.py b/contextifier/core/processor/hwpx_handler.py similarity index 60% rename from contextifier/core/processor/hwps_handler.py rename to contextifier/core/processor/hwpx_handler.py index 5c4f7ea..a835d26 100644 --- a/contextifier/core/processor/hwps_handler.py +++ b/contextifier/core/processor/hwpx_handler.py @@ -5,21 +5,18 @@ Class-based handler for HWPX files inheriting from BaseHandler. 
""" import io -import zipfile import logging from typing import Dict, Any, Set, TYPE_CHECKING from contextifier.core.processor.base_handler import BaseHandler from contextifier.core.functions.chart_extractor import BaseChartExtractor -from contextifier.core.processor.hwp_helper import MetadataHelper from contextifier.core.processor.hwpx_helper import ( - extract_hwpx_metadata, parse_bin_item_map, parse_hwpx_section, - process_hwpx_images, - get_remaining_images, ) from contextifier.core.processor.hwpx_helper.hwpx_chart_extractor import HWPXChartExtractor +from contextifier.core.processor.hwpx_helper.hwpx_metadata import HWPXMetadataExtractor +from contextifier.core.processor.hwpx_helper.hwpx_image_processor import HWPXImageProcessor if TYPE_CHECKING: from contextifier.core.document_processor import CurrentFile @@ -30,11 +27,34 @@ class HWPXHandler(BaseHandler): """HWPX (ZIP/XML based Korean document) Processing Handler Class""" - + + def _create_file_converter(self): + """Create HWPX-specific file converter.""" + from contextifier.core.processor.hwpx_helper.hwpx_file_converter import HWPXFileConverter + return HWPXFileConverter() + + def _create_preprocessor(self): + """Create HWPX-specific preprocessor.""" + from contextifier.core.processor.hwpx_helper.hwpx_preprocessor import HWPXPreprocessor + return HWPXPreprocessor() + def _create_chart_extractor(self) -> BaseChartExtractor: """Create HWPX-specific chart extractor.""" return HWPXChartExtractor(self._chart_processor) - + + def _create_metadata_extractor(self): + """Create HWPX-specific metadata extractor.""" + return HWPXMetadataExtractor() + + def _create_format_image_processor(self): + """Create HWPX-specific image processor.""" + return HWPXImageProcessor( + directory_path=self._image_processor.config.directory_path, + tag_prefix=self._image_processor.config.tag_prefix, + tag_suffix=self._image_processor.config.tag_suffix, + storage_backend=self._image_processor.storage_backend, + ) + def extract_text( self, 
current_file: "CurrentFile", @@ -43,34 +63,32 @@ def extract_text( ) -> str: """ Extract text from HWPX file. - + Args: current_file: CurrentFile dict containing file info and binary data extract_metadata: Whether to extract metadata **kwargs: Additional options - + Returns: Extracted text """ file_path = current_file.get("file_path", "unknown") + file_data = current_file.get("file_data", b"") text_content = [] - + + # Check if it's a valid ZIP file using file_converter.validate() + if not self.file_converter.validate(file_data): + self.logger.error("Not a valid Zip file: %s", file_path) + return "" + try: - # Open ZIP from stream + # Get file stream file_stream = self.get_file_stream(current_file) - - # Check if valid ZIP - if not self._is_valid_zip(file_stream): - self.logger.error(f"Not a valid Zip file: {file_path}") - return "" - - # Reset stream position - file_stream.seek(0) - + # Pre-extract all charts using ChartExtractor chart_data_list = self.chart_extractor.extract_all_from_file(file_stream) chart_idx = [0] # Mutable container for closure - + def get_next_chart() -> str: """Callback to get the next pre-extracted chart content.""" if chart_idx[0] < len(chart_data_list): @@ -78,52 +96,62 @@ def get_next_chart() -> str: chart_idx[0] += 1 return self._format_chart_data(chart_data) return "" - - file_stream.seek(0) - - with zipfile.ZipFile(file_stream, 'r') as zf: + + # Step 1: Convert binary to ZipFile using file_converter + zf = self.file_converter.convert(file_data, file_stream) + + # Step 2: Preprocess - clean_content is the TRUE SOURCE + preprocessed = self.preprocess(zf) + zf = preprocessed.clean_content # TRUE SOURCE + + try: if extract_metadata: - metadata = extract_hwpx_metadata(zf) - metadata_text = MetadataHelper.format_metadata(metadata) + metadata_text = self.extract_and_format_metadata(zf) if metadata_text: text_content.append(metadata_text) text_content.append("") - + bin_item_map = parse_bin_item_map(zf) - + section_files = [ - f for f in 
zf.namelist() + f for f in zf.namelist() if f.startswith("Contents/section") and f.endswith(".xml") ] section_files.sort(key=lambda x: int(x.replace("Contents/section", "").replace(".xml", ""))) - + processed_images: Set[str] = set() - + for sec_file in section_files: with zf.open(sec_file) as f: xml_content = f.read() - section_text = parse_hwpx_section(xml_content, zf, bin_item_map, processed_images, image_processor=self.image_processor) + section_text = parse_hwpx_section(xml_content, zf, bin_item_map, processed_images, image_processor=self.format_image_processor) text_content.append(section_text) - - remaining_images = get_remaining_images(zf, processed_images) - if remaining_images: - image_text = process_hwpx_images(zf, remaining_images, image_processor=self.image_processor) - if image_text: - text_content.append("\n\n=== Extracted Images (Not Inline) ===\n") - text_content.append(image_text) - + + # Use format_image_processor directly + image_processor = self.format_image_processor + if hasattr(image_processor, 'get_remaining_images'): + remaining_images = image_processor.get_remaining_images(zf, processed_images) + if remaining_images and hasattr(image_processor, 'process_images'): + image_text = image_processor.process_images(zf, remaining_images) + if image_text: + text_content.append("\n\n=== Extracted Images (Not Inline) ===\n") + text_content.append(image_text) + # Add pre-extracted charts while chart_idx[0] < len(chart_data_list): chart_text = get_next_chart() if chart_text: text_content.append(chart_text) - - except Exception as e: - self.logger.error(f"Error processing HWPX file: {e}") + finally: + # Close ZipFile using file_converter + self.file_converter.close(zf) + + except Exception as e: # noqa: BLE001 + self.logger.error("Error processing HWPX file: %s", e) return f"Error processing HWPX file: {str(e)}" - + return "\n".join(text_content) - + def _is_valid_zip(self, file_stream: io.BytesIO) -> bool: """Check if stream is a valid ZIP file.""" 
try: @@ -131,16 +159,16 @@ def _is_valid_zip(self, file_stream: io.BytesIO) -> bool: header = file_stream.read(4) file_stream.seek(0) return header == b'PK\x03\x04' - except: + except Exception: # noqa: BLE001 return False - + def _format_chart_data(self, chart_data: "ChartData") -> str: """Format ChartData using ChartProcessor.""" from contextifier.core.functions.chart_extractor import ChartData - + if not isinstance(chart_data, ChartData): return "" - + if chart_data.has_data(): return self.chart_processor.format_chart_data( chart_type=chart_data.chart_type, diff --git a/contextifier/core/processor/hwpx_helper/__init__.py b/contextifier/core/processor/hwpx_helper/__init__.py index faf2642..d3b64b5 100644 --- a/contextifier/core/processor/hwpx_helper/__init__.py +++ b/contextifier/core/processor/hwpx_helper/__init__.py @@ -25,7 +25,7 @@ # Metadata from contextifier.core.processor.hwpx_helper.hwpx_metadata import ( - extract_hwpx_metadata, + HWPXMetadataExtractor, parse_bin_item_map, ) @@ -40,10 +40,9 @@ parse_hwpx_section, ) -# Image -from contextifier.core.processor.hwpx_helper.hwpx_image import ( - process_hwpx_images, - get_remaining_images, +# Image Processor (replaces hwpx_image.py utility functions) +from contextifier.core.processor.hwpx_helper.hwpx_image_processor import ( + HWPXImageProcessor, ) # Chart Extractor @@ -60,16 +59,15 @@ "HEADER_FILE_PATHS", "HPF_PATH", # Metadata - "extract_hwpx_metadata", + "HWPXMetadataExtractor", "parse_bin_item_map", # Table "parse_hwpx_table", "extract_cell_content", # Section "parse_hwpx_section", - # Image - "process_hwpx_images", - "get_remaining_images", + # Image Processor + "HWPXImageProcessor", # Chart Extractor "HWPXChartExtractor", ] diff --git a/contextifier/core/processor/hwpx_helper/hwpx_file_converter.py b/contextifier/core/processor/hwpx_helper/hwpx_file_converter.py new file mode 100644 index 0000000..e404434 --- /dev/null +++ b/contextifier/core/processor/hwpx_helper/hwpx_file_converter.py @@ -0,0 +1,69 @@ 
+# libs/core/processor/hwpx_helper/hwpx_file_converter.py +""" +HWPXFileConverter - HWPX file format converter + +Converts binary HWPX data to ZipFile object. +""" +from io import BytesIO +from typing import Any, Optional, BinaryIO +import zipfile + +from contextifier.core.functions.file_converter import BaseFileConverter + + +class HWPXFileConverter(BaseFileConverter): + """ + HWPX file converter. + + Converts binary HWPX (ZIP format) data to ZipFile object. + """ + + # ZIP magic number + ZIP_MAGIC = b'PK\x03\x04' + + def convert( + self, + file_data: bytes, + file_stream: Optional[BinaryIO] = None, + **kwargs + ) -> zipfile.ZipFile: + """ + Convert binary HWPX data to ZipFile object. + + Args: + file_data: Raw binary HWPX data + file_stream: Optional file stream + **kwargs: Additional options + + Returns: + zipfile.ZipFile object + """ + stream = file_stream if file_stream is not None else BytesIO(file_data) + stream.seek(0) + return zipfile.ZipFile(stream, 'r') + + def get_format_name(self) -> str: + """Return format name.""" + return "HWPX Document (ZIP/XML)" + + def validate(self, file_data: bytes) -> bool: + """Validate if data is a valid ZIP file.""" + if not file_data or len(file_data) < 4: + return False + + if file_data[:4] != self.ZIP_MAGIC: + return False + + # Verify it's a valid ZIP + try: + with zipfile.ZipFile(BytesIO(file_data), 'r') as zf: + # HWPX should have specific structure + namelist = zf.namelist() + return len(namelist) > 0 + except zipfile.BadZipFile: + return False + + def close(self, converted_object: Any) -> None: + """Close the ZipFile.""" + if converted_object is not None and hasattr(converted_object, 'close'): + converted_object.close() diff --git a/contextifier/core/processor/hwpx_helper/hwpx_image.py b/contextifier/core/processor/hwpx_helper/hwpx_image.py deleted file mode 100644 index 867244c..0000000 --- a/contextifier/core/processor/hwpx_helper/hwpx_image.py +++ /dev/null @@ -1,77 +0,0 @@ -# hwpx_helper/hwpx_image.py -""" -HWPX 
이미지 처리 - -HWPX 문서의 이미지를 추출하고 로컬에 저장합니다. -""" -import logging -import os -import zipfile -from typing import List, Optional - -from contextifier.core.processor.hwpx_helper.hwpx_constants import SUPPORTED_IMAGE_EXTENSIONS -from contextifier.core.functions.img_processor import ImageProcessor - -logger = logging.getLogger("document-processor") - - -def process_hwpx_images( - zf: zipfile.ZipFile, - image_files: List[str], - image_processor: ImageProcessor -) -> str: - """ - HWPX zip에서 이미지를 추출하여 로컬에 저장합니다. - - Args: - zf: 열린 ZipFile 객체 - image_files: 처리할 이미지 파일 경로 목록 - image_processor: 이미지 프로세서 인스턴스 - - Returns: - 이미지 태그 문자열들을 줄바꿈으로 연결한 결과 - """ - results = [] - - for img_path in image_files: - ext = os.path.splitext(img_path)[1].lower() - if ext in SUPPORTED_IMAGE_EXTENSIONS: - try: - with zf.open(img_path) as f: - image_data = f.read() - - image_tag = image_processor.save_image(image_data) - if image_tag: - results.append(image_tag) - - except Exception as e: - logger.warning(f"Error processing HWPX image {img_path}: {e}") - - return "\n\n".join(results) - - -def get_remaining_images( - zf: zipfile.ZipFile, - processed_images: set -) -> List[str]: - """ - 아직 처리되지 않은 이미지 파일 목록을 반환합니다. 
- - Args: - zf: 열린 ZipFile 객체 - processed_images: 이미 처리된 이미지 경로 집합 - - Returns: - 처리되지 않은 이미지 파일 경로 목록 - """ - image_files = [ - f for f in zf.namelist() - if f.startswith("BinData/") and not f.endswith("/") - ] - - remaining_images = [] - for img in image_files: - if img not in processed_images: - remaining_images.append(img) - - return remaining_images diff --git a/contextifier/core/processor/hwpx_helper/hwpx_image_processor.py b/contextifier/core/processor/hwpx_helper/hwpx_image_processor.py new file mode 100644 index 0000000..7d326b4 --- /dev/null +++ b/contextifier/core/processor/hwpx_helper/hwpx_image_processor.py @@ -0,0 +1,258 @@ +# contextifier/core/processor/hwpx_helper/hwpx_image_processor.py +""" +HWPX Image Processor + +Provides HWPX-specific image processing that inherits from ImageProcessor. +Handles images in HWPX (ZIP/XML based) Korean document format. + +This class consolidates all HWPX image extraction logic including: +- BinData images extraction from ZIP +- Remaining images processing +- Image filtering by extension +""" +import logging +import os +from typing import Any, Dict, List, Optional, Set, TYPE_CHECKING +import zipfile + +from contextifier.core.functions.img_processor import ImageProcessor +from contextifier.core.functions.storage_backend import BaseStorageBackend + +logger = logging.getLogger("contextify.image_processor.hwpx") + +# Supported image extensions +SUPPORTED_IMAGE_EXTENSIONS = frozenset(['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff']) + + +class HWPXImageProcessor(ImageProcessor): + """ + HWPX-specific image processor. + + Inherits from ImageProcessor and provides HWPX-specific processing. 
+ + Handles: + - BinData images in HWPX ZIP structure + - Embedded images + - Referenced images via bin_item_map + + Example: + processor = HWPXImageProcessor() + + # Process image from ZIP + with zipfile.ZipFile(file_stream, 'r') as zf: + tag = processor.process_from_zip(zf, "BinData/image1.png") + """ + + def __init__( + self, + directory_path: str = "temp/images", + tag_prefix: str = "[Image:", + tag_suffix: str = "]", + storage_backend: Optional[BaseStorageBackend] = None, + ): + """ + Initialize HWPXImageProcessor. + + Args: + directory_path: Image save directory + tag_prefix: Tag prefix for image references + tag_suffix: Tag suffix for image references + storage_backend: Storage backend for saving images + """ + super().__init__( + directory_path=directory_path, + tag_prefix=tag_prefix, + tag_suffix=tag_suffix, + storage_backend=storage_backend, + ) + + def process_image( + self, + image_data: bytes, + bin_item_id: Optional[str] = None, + image_path: Optional[str] = None, + **kwargs + ) -> Optional[str]: + """ + Process and save HWPX image data. + + Args: + image_data: Raw image binary data + bin_item_id: BinItem ID from HWPX + image_path: Original path in ZIP (for naming) + **kwargs: Additional options + + Returns: + Image tag string, or None on failure + """ + custom_name = None + if bin_item_id is not None: + custom_name = f"hwpx_{bin_item_id}" + elif image_path is not None: + # Extract filename from path + filename = image_path.split('/')[-1] if '/' in image_path else image_path + # Remove extension and sanitize + name_base = filename.rsplit('.', 1)[0] if '.' in filename else filename + custom_name = f"hwpx_{name_base}" + + return self.save_image(image_data, custom_name=custom_name) + + def process_from_zip( + self, + zf: zipfile.ZipFile, + image_path: str, + bin_item_id: Optional[str] = None, + ) -> Optional[str]: + """ + Process image from HWPX ZIP archive. 
+ + Args: + zf: ZipFile object + image_path: Path to image in ZIP + bin_item_id: BinItem ID + + Returns: + Image tag string, or None on failure + """ + try: + with zf.open(image_path) as f: + image_data = f.read() + + return self.process_image( + image_data, + bin_item_id=bin_item_id, + image_path=image_path + ) + + except Exception as e: + self._logger.warning(f"Failed to process image from ZIP {image_path}: {e}") + return None + + def process_embedded_image( + self, + image_data: bytes, + image_name: Optional[str] = None, + bin_item_id: Optional[str] = None, + **kwargs + ) -> Optional[str]: + """ + Process embedded HWPX image. + + Args: + image_data: Image binary data + image_name: Original image filename + bin_item_id: BinItem ID + **kwargs: Additional options + + Returns: + Image tag string, or None on failure + """ + custom_name = image_name + if custom_name is None and bin_item_id is not None: + custom_name = f"hwpx_embed_{bin_item_id}" + + return self.save_image(image_data, custom_name=custom_name) + + def process_bindata_images( + self, + zf: zipfile.ZipFile, + bin_item_map: Dict[str, str], + exclude_processed: Optional[Set[str]] = None, + ) -> Dict[str, str]: + """ + Process all BinData images from HWPX. + + Args: + zf: ZipFile object + bin_item_map: Mapping of bin_item_id to path + exclude_processed: Set of already processed IDs to skip + + Returns: + Dictionary mapping bin_item_id to image tag + """ + exclude = exclude_processed or set() + result = {} + + for bin_id, image_path in bin_item_map.items(): + if bin_id in exclude: + continue + + tag = self.process_from_zip(zf, image_path, bin_item_id=bin_id) + if tag: + result[bin_id] = tag + + return result + + def process_images( + self, + zf: zipfile.ZipFile, + image_files: List[str], + ) -> str: + """ + Extract images from HWPX zip and save locally. 
+ + Args: + zf: Open ZipFile object + image_files: List of image file paths to process + + Returns: + Image tag strings joined by newlines + """ + results = [] + + for img_path in image_files: + ext = os.path.splitext(img_path)[1].lower() + if ext in SUPPORTED_IMAGE_EXTENSIONS: + tag = self.process_from_zip(zf, img_path) + if tag: + results.append(tag) + + return "\n\n".join(results) + + def get_remaining_images( + self, + zf: zipfile.ZipFile, + processed_images: Set[str], + ) -> List[str]: + """ + Return list of image files not yet processed. + + Args: + zf: Open ZipFile object + processed_images: Set of already processed image paths + + Returns: + List of unprocessed image file paths + """ + image_files = [ + f for f in zf.namelist() + if f.startswith("BinData/") and not f.endswith("/") + ] + + remaining_images = [] + for img in image_files: + if img not in processed_images: + remaining_images.append(img) + + return remaining_images + + def process_remaining_images( + self, + zf: zipfile.ZipFile, + processed_images: Set[str], + ) -> str: + """ + Process all images not yet processed. + + Args: + zf: Open ZipFile object + processed_images: Set of already processed image paths + + Returns: + Image tag strings joined by newlines + """ + remaining = self.get_remaining_images(zf, processed_images) + return self.process_images(zf, remaining) + + +__all__ = ["HWPXImageProcessor"] diff --git a/contextifier/core/processor/hwpx_helper/hwpx_metadata.py b/contextifier/core/processor/hwpx_helper/hwpx_metadata.py index 4bc26c9..57e1b1e 100644 --- a/contextifier/core/processor/hwpx_helper/hwpx_metadata.py +++ b/contextifier/core/processor/hwpx_helper/hwpx_metadata.py @@ -1,104 +1,139 @@ -# hwpx_helper/hwpx_metadata.py +# contextifier/core/processor/hwpx_helper/hwpx_metadata.py """ -HWPX 메타데이터 추출 +HWPX Metadata Extraction Module -HWPX 파일에서 메타데이터를 추출합니다. 
-메타데이터는 다음 파일에 저장됩니다: -- version.xml: 문서 버전 정보 -- META-INF/container.xml: 컨테이너 정보 -- Contents/header.xml: 문서 속성 (작성자, 날짜 등) +Provides HWPXMetadataExtractor class for extracting metadata from HWPX files. +Implements BaseMetadataExtractor interface. + +Metadata locations in HWPX: +- version.xml: Document version information +- META-INF/container.xml: Container information +- Contents/header.xml: Document properties (author, date, etc.) + +Note: HWPX is a Korean-native document format, so Korean metadata labels +are preserved in output for proper display. """ import logging import xml.etree.ElementTree as ET import zipfile from typing import Any, Dict +from contextifier.core.functions.metadata_extractor import ( + BaseMetadataExtractor, + DocumentMetadata, +) from contextifier.core.processor.hwpx_helper.hwpx_constants import HWPX_NAMESPACES, HEADER_FILE_PATHS logger = logging.getLogger("document-processor") -def extract_hwpx_metadata(zf: zipfile.ZipFile) -> Dict[str, Any]: +class HWPXMetadataExtractor(BaseMetadataExtractor): """ - HWPX 파일에서 메타데이터를 추출합니다. - - HWPX stores metadata in: - - version.xml: Document version info - - META-INF/container.xml: Container info - - Contents/header.xml: Document properties (작성자, 날짜 등) - - Args: - zf: 열린 ZipFile 객체 - - Returns: - 추출된 메타데이터 딕셔너리 + HWPX Metadata Extractor. + + Extracts HWPX metadata from zipfile.ZipFile objects. + + Supported fields: + - Standard fields: title, subject, author, keywords, comments, etc. + - HWPX-specific: version, media_type, etc. 
(stored in custom fields) + + Usage: + extractor = HWPXMetadataExtractor() + metadata = extractor.extract(zip_file) + text = extractor.format(metadata) """ - metadata = {} - - try: - # Try to read header.xml for document properties - for header_path in HEADER_FILE_PATHS: - if header_path in zf.namelist(): - with zf.open(header_path) as f: - header_content = f.read() - header_root = ET.fromstring(header_content) - - # Try to find document properties - # contains metadata - doc_info = header_root.find('.//hh:docInfo', HWPX_NAMESPACES) - if doc_info is not None: - # Get properties - for prop in doc_info: - tag = prop.tag.split('}')[-1] if '}' in prop.tag else prop.tag - if prop.text: - metadata[tag.lower()] = prop.text - break - - # Try to read version.xml - if 'version.xml' in zf.namelist(): - with zf.open('version.xml') as f: - version_content = f.read() - version_root = ET.fromstring(version_content) - - # Get version info - if version_root.text: - metadata['version'] = version_root.text - for attr in version_root.attrib: - metadata[f'version_{attr}'] = version_root.get(attr) - - # Try to read META-INF/manifest.xml for additional info - if 'META-INF/manifest.xml' in zf.namelist(): - with zf.open('META-INF/manifest.xml') as f: - manifest_content = f.read() - manifest_root = ET.fromstring(manifest_content) - - # Get mimetype and other info - for child in manifest_root: - tag = child.tag.split('}')[-1] if '}' in child.tag else child.tag - if tag == 'file-entry': - full_path = child.get('full-path', child.get('{urn:oasis:names:tc:opendocument:xmlns:manifest:1.0}full-path', '')) - if full_path == '/': - media_type = child.get('media-type', child.get('{urn:oasis:names:tc:opendocument:xmlns:manifest:1.0}media-type', '')) - if media_type: - metadata['media_type'] = media_type - - logger.info(f"Extracted HWPX metadata: {metadata}") - - except Exception as e: - logger.warning(f"Failed to extract HWPX metadata: {e}") - - return metadata + + def extract(self, source: 
zipfile.ZipFile) -> DocumentMetadata: + """ + Extract metadata from HWPX file. + + Args: + source: Open zipfile.ZipFile object + + Returns: + DocumentMetadata instance containing extracted metadata. + """ + raw_metadata: Dict[str, Any] = {} + + try: + # Try to read header.xml for document properties + for header_path in HEADER_FILE_PATHS: + if header_path in source.namelist(): + with source.open(header_path) as f: + header_content = f.read() + header_root = ET.fromstring(header_content) + + # Try to find document properties + # contains metadata + doc_info = header_root.find('.//hh:docInfo', HWPX_NAMESPACES) + if doc_info is not None: + # Get properties + for prop in doc_info: + tag = prop.tag.split('}')[-1] if '}' in prop.tag else prop.tag + if prop.text: + raw_metadata[tag.lower()] = prop.text + break + + # Try to read version.xml + if 'version.xml' in source.namelist(): + with source.open('version.xml') as f: + version_content = f.read() + version_root = ET.fromstring(version_content) + + # Get version info + if version_root.text: + raw_metadata['version'] = version_root.text + for attr in version_root.attrib: + raw_metadata[f'version_{attr}'] = version_root.get(attr) + + # Try to read META-INF/manifest.xml for additional info + if 'META-INF/manifest.xml' in source.namelist(): + with source.open('META-INF/manifest.xml') as f: + manifest_content = f.read() + manifest_root = ET.fromstring(manifest_content) + + # Get mimetype and other info + for child in manifest_root: + tag = child.tag.split('}')[-1] if '}' in child.tag else child.tag + if tag == 'file-entry': + full_path = child.get('full-path', child.get('{urn:oasis:names:tc:opendocument:xmlns:manifest:1.0}full-path', '')) + if full_path == '/': + media_type = child.get('media-type', child.get('{urn:oasis:names:tc:opendocument:xmlns:manifest:1.0}media-type', '')) + if media_type: + raw_metadata['media_type'] = media_type + + self.logger.debug(f"Extracted HWPX metadata: {list(raw_metadata.keys())}") + + except 
Exception as e: + self.logger.warning(f"Failed to extract HWPX metadata: {e}") + + # Separate standard fields from custom fields + standard_fields = {'title', 'subject', 'author', 'keywords', 'comments', + 'last_saved_by', 'create_time', 'last_saved_time'} + custom_fields = {k: v for k, v in raw_metadata.items() if k not in standard_fields} + + return DocumentMetadata( + title=raw_metadata.get('title'), + subject=raw_metadata.get('subject'), + author=raw_metadata.get('author'), + keywords=raw_metadata.get('keywords'), + comments=raw_metadata.get('comments'), + last_saved_by=raw_metadata.get('last_saved_by'), + create_time=raw_metadata.get('create_time'), + last_saved_time=raw_metadata.get('last_saved_time'), + custom=custom_fields, + ) def parse_bin_item_map(zf: zipfile.ZipFile) -> Dict[str, str]: """ - content.hpf 파일을 파싱하여 BinItem ID와 파일 경로 매핑을 생성합니다. + Parse content.hpf file to create BinItem ID to file path mapping. Args: - zf: 열린 ZipFile 객체 + zf: Open ZipFile object Returns: - BinItem ID -> 파일 경로 매핑 딕셔너리 + Dictionary mapping BinItem ID to file path. """ from .hwpx_constants import HPF_PATH, OPF_NAMESPACES @@ -120,3 +155,9 @@ def parse_bin_item_map(zf: zipfile.ZipFile) -> Dict[str, str]: logger.warning(f"Failed to parse content.hpf: {e}") return bin_item_map + + +__all__ = [ + 'HWPXMetadataExtractor', + 'parse_bin_item_map', +] diff --git a/contextifier/core/processor/hwpx_helper/hwpx_preprocessor.py b/contextifier/core/processor/hwpx_helper/hwpx_preprocessor.py new file mode 100644 index 0000000..7433097 --- /dev/null +++ b/contextifier/core/processor/hwpx_helper/hwpx_preprocessor.py @@ -0,0 +1,80 @@ +# contextifier/core/processor/hwpx_helper/hwpx_preprocessor.py +""" +HWPX Preprocessor - Process HWPX ZIP document after conversion. + +Processing Pipeline Position: + 1. HWPXFileConverter.convert() → zipfile.ZipFile + 2. HWPXPreprocessor.preprocess() → PreprocessedData (THIS STEP) + 3. HWPXMetadataExtractor.extract() → DocumentMetadata + 4. 
Content extraction (sections, tables, images) + +Current Implementation: + - Pass-through (HWPX uses zipfile object directly) +""" +import logging +from typing import Any, Dict + +from contextifier.core.functions.preprocessor import ( + BasePreprocessor, + PreprocessedData, +) + +logger = logging.getLogger("contextify.hwpx.preprocessor") + + +class HWPXPreprocessor(BasePreprocessor): + """ + HWPX ZIP Document Preprocessor. + + Currently a pass-through implementation as HWPX processing + is handled during the content extraction phase. + """ + + def preprocess( + self, + converted_data: Any, + **kwargs + ) -> PreprocessedData: + """ + Preprocess the converted HWPX ZIP document. + + Args: + converted_data: zipfile.ZipFile object from HWPXFileConverter + **kwargs: Additional options + + Returns: + PreprocessedData with the ZIP object and any extracted resources + """ + metadata: Dict[str, Any] = {} + + if hasattr(converted_data, 'namelist'): + try: + files = converted_data.namelist() + metadata['file_count'] = len(files) + # Check for section files + sections = [f for f in files if 'section' in f.lower() and f.endswith('.xml')] + metadata['section_count'] = len(sections) + except Exception: # noqa: BLE001 + pass + + logger.debug("HWPX preprocessor: pass-through, metadata=%s", metadata) + + # clean_content is the TRUE SOURCE - contains the ZipFile + return PreprocessedData( + raw_content=converted_data, + clean_content=converted_data, # TRUE SOURCE - zipfile.ZipFile + encoding="utf-8", + extracted_resources={}, + metadata=metadata, + ) + + def get_format_name(self) -> str: + """Return format name.""" + return "HWPX Preprocessor" + + def validate(self, data: Any) -> bool: + """Validate if data is a ZipFile object.""" + return hasattr(data, 'namelist') and hasattr(data, 'open') + + +__all__ = ['HWPXPreprocessor'] diff --git a/contextifier/core/processor/image_file_handler.py b/contextifier/core/processor/image_file_handler.py index b9cf4ba..6f0ca33 100644 --- 
a/contextifier/core/processor/image_file_handler.py +++ b/contextifier/core/processor/image_file_handler.py @@ -12,6 +12,8 @@ from contextifier.core.processor.base_handler import BaseHandler from contextifier.core.functions.chart_extractor import BaseChartExtractor, NullChartExtractor +from contextifier.core.processor.image_file_helper.image_file_image_processor import ImageFileImageProcessor +from contextifier.core.functions.img_processor import ImageProcessor if TYPE_CHECKING: from contextifier.core.document_processor import CurrentFile @@ -27,61 +29,82 @@ class ImageFileHandler(BaseHandler): """ Image File Processing Handler Class. - + Processes standalone image files by converting them to text using OCR. Requires an OCR engine to be provided for actual text extraction. - + Args: config: Configuration dictionary (passed from DocumentProcessor) image_processor: ImageProcessor instance (passed from DocumentProcessor) page_tag_processor: PageTagProcessor instance (passed from DocumentProcessor) ocr_engine: OCR engine instance (BaseOCR subclass) for image-to-text conversion - + Example: >>> from contextifier.ocr.ocr_engine import OpenAIOCR >>> ocr = OpenAIOCR(api_key="sk-...", model="gpt-4o") >>> handler = ImageFileHandler(ocr_engine=ocr) >>> text = handler.extract_text(current_file) """ - + + def _create_file_converter(self): + """Create image-file-specific file converter.""" + from contextifier.core.processor.image_file_helper.image_file_converter import ImageFileConverter + return ImageFileConverter() + + def _create_preprocessor(self): + """Create image-file-specific preprocessor.""" + from contextifier.core.processor.image_file_helper.image_file_preprocessor import ImageFilePreprocessor + return ImageFilePreprocessor() + def _create_chart_extractor(self) -> BaseChartExtractor: """Image files do not contain charts. 
Return NullChartExtractor.""" return NullChartExtractor(self._chart_processor) - + + def _create_metadata_extractor(self): + """Image files do not have document metadata. Return None (uses NullMetadataExtractor).""" + return None + + def _create_format_image_processor(self) -> ImageProcessor: + """Create image-file-specific image processor.""" + return ImageFileImageProcessor() + def __init__( self, config: Optional[dict] = None, image_processor: Optional[Any] = None, page_tag_processor: Optional[Any] = None, + chart_processor: Optional[Any] = None, ocr_engine: Optional["BaseOCR"] = None ): """ Initialize ImageFileHandler. - + Args: config: Configuration dictionary (passed from DocumentProcessor) image_processor: ImageProcessor instance (passed from DocumentProcessor) page_tag_processor: PageTagProcessor instance (passed from DocumentProcessor) + chart_processor: ChartProcessor instance (passed from DocumentProcessor) ocr_engine: OCR engine instance (BaseOCR subclass) for image-to-text conversion. If None, images cannot be converted to text. """ super().__init__( config=config, image_processor=image_processor, - page_tag_processor=page_tag_processor + page_tag_processor=page_tag_processor, + chart_processor=chart_processor ) self._ocr_engine = ocr_engine - + @property def ocr_engine(self) -> Optional["BaseOCR"]: """Current OCR engine instance.""" return self._ocr_engine - + @ocr_engine.setter def ocr_engine(self, engine: Optional["BaseOCR"]) -> None: """Set OCR engine instance.""" self._ocr_engine = engine - + def extract_text( self, current_file: "CurrentFile", @@ -90,66 +113,72 @@ def extract_text( ) -> str: """ Extract text from image file using OCR. - + Converts the image file to text using the configured OCR engine. If no OCR engine is available, returns an error message. 
- + Args: current_file: CurrentFile dict containing file info and binary data extract_metadata: Whether to extract metadata (not used for images) **kwargs: Additional options (not used) - + Returns: Extracted text from image, or error message if OCR is not available - + Raises: ValueError: If OCR engine is not configured """ file_path = current_file.get("file_path", "unknown") file_name = current_file.get("file_name", "unknown") file_extension = current_file.get("file_extension", "").lower() - + file_data = current_file.get("file_data", b"") + self.logger.info(f"Processing image file: {file_name}") - + + # Step 1: No file_converter for image files (direct processing) + # Step 2: Preprocess - clean_content is the TRUE SOURCE + preprocessed = self.preprocess(file_data) + file_data = preprocessed.clean_content # TRUE SOURCE + # Validate file extension if file_extension not in SUPPORTED_IMAGE_EXTENSIONS: self.logger.warning(f"Unsupported image extension: {file_extension}") return f"[Unsupported image format: {file_extension}]" - + # If OCR engine is not available, return image tag format # This allows the image to be processed later when OCR is available if self._ocr_engine is None: self.logger.debug(f"OCR engine not available, returning image tag: {file_name}") # Use ImageProcessor's tag format (e.g., [Image:path] or custom format) return self._build_image_tag(file_path) - + # Use OCR engine to convert image to text try: # Use the file path directly for OCR conversion result = self._ocr_engine.convert_image_to_text(file_path) - + if result is None: self.logger.error(f"OCR returned None for image: {file_name}") return f"[Image OCR failed: {file_name}]" - + if result.startswith("[Image conversion error:"): self.logger.error(f"OCR error for image {file_name}: {result}") return result - + self.logger.info(f"Successfully extracted text from image: {file_name}") return result - + except Exception as e: self.logger.error(f"Error processing image {file_name}: {e}") return 
f"[Image processing error: {str(e)}]" - + def is_supported(self, file_extension: str) -> bool: """ Check if file extension is supported. - + Args: file_extension: File extension (with or without dot) - + Returns: True if extension is supported, False otherwise """ @@ -159,23 +188,23 @@ def is_supported(self, file_extension: str) -> bool: def _build_image_tag(self, file_path: str) -> str: """ Build image tag using ImageProcessor's tag format. - + Uses the configured tag_prefix and tag_suffix from ImageProcessor to create a consistent image tag format. - + Args: file_path: Path to the image file - + Returns: Image tag string (e.g., "[Image:path]" or custom format) """ # Normalize path separators (Windows -> Unix style) path_str = file_path.replace("\\", "/") - + # Use ImageProcessor's tag format prefix = self.image_processor.config.tag_prefix suffix = self.image_processor.config.tag_suffix - + return f"{prefix}{path_str}{suffix}" diff --git a/contextifier/core/processor/image_file_helper/__init__.py b/contextifier/core/processor/image_file_helper/__init__.py new file mode 100644 index 0000000..e6bbb1b --- /dev/null +++ b/contextifier/core/processor/image_file_helper/__init__.py @@ -0,0 +1,17 @@ +# contextifier/core/processor/image_file_helper/__init__.py +""" +Image File Helper 모듈 + +이미지 파일 처리에 필요한 유틸리티를 제공합니다. 
+
+모듈 구성:
+- image_file_image_processor: 이미지 파일용 이미지 프로세서
+"""
+
+from contextifier.core.processor.image_file_helper.image_file_image_processor import (
+    ImageFileImageProcessor,
+)
+
+__all__ = [
+    "ImageFileImageProcessor",
+]
diff --git a/contextifier/core/processor/image_file_helper/image_file_converter.py b/contextifier/core/processor/image_file_helper/image_file_converter.py
new file mode 100644
index 0000000..c9ea732
--- /dev/null
+++ b/contextifier/core/processor/image_file_helper/image_file_converter.py
@@ -0,0 +1,68 @@
+# contextifier/core/processor/image_file_helper/image_file_converter.py
+"""
+ImageFileConverter - Image file format converter
+
+Pass-through converter for image files.
+Images are kept as binary data.
+"""
+from typing import Any, Optional, BinaryIO
+
+from contextifier.core.functions.file_converter import NullFileConverter
+
+
+class ImageFileConverter(NullFileConverter):
+    """
+    Image file converter.
+
+    Images don't need conversion - returns raw bytes.
+    This is a pass-through converter.
+    """
+
+    # Common image magic numbers
+    MAGIC_JPEG = b'\xff\xd8\xff'
+    MAGIC_PNG = b'\x89PNG\r\n\x1a\n'
+    MAGIC_GIF = b'GIF8'
+    MAGIC_BMP = b'BM'
+    MAGIC_WEBP = b'RIFF'
+
+    def get_format_name(self) -> str:
+        """Return format name."""
+        return "Image File"
+
+    def validate(self, file_data: bytes) -> bool:
+        """Validate if data is an image."""
+        if not file_data or len(file_data) < 4:
+            return False
+
+        return (
+            file_data[:3] == self.MAGIC_JPEG or
+            file_data[:8] == self.MAGIC_PNG or
+            file_data[:4] == self.MAGIC_GIF or
+            file_data[:2] == self.MAGIC_BMP or
+            file_data[:4] == self.MAGIC_WEBP
+        )
+
+    def detect_image_type(self, file_data: bytes) -> Optional[str]:
+        """
+        Detect image type from binary data.
+ + Args: + file_data: Raw binary image data + + Returns: + Image type string (jpeg, png, gif, bmp, webp) or None + """ + if not file_data or len(file_data) < 8: + return None + + if file_data[:3] == self.MAGIC_JPEG: + return "jpeg" + elif file_data[:8] == self.MAGIC_PNG: + return "png" + elif file_data[:4] == self.MAGIC_GIF: + return "gif" + elif file_data[:2] == self.MAGIC_BMP: + return "bmp" + elif file_data[:4] == self.MAGIC_WEBP: + return "webp" + return None diff --git a/contextifier/core/processor/image_file_helper/image_file_image_processor.py b/contextifier/core/processor/image_file_helper/image_file_image_processor.py new file mode 100644 index 0000000..b465f5f --- /dev/null +++ b/contextifier/core/processor/image_file_helper/image_file_image_processor.py @@ -0,0 +1,123 @@ +# contextifier/core/processor/image_file_helper/image_file_image_processor.py +""" +Image File Image Processor + +Provides image-file-specific processing that inherits from ImageProcessor. +Handles standalone image files (jpg, png, gif, bmp, webp, etc.). +""" +import logging +from typing import Any, Optional + +from contextifier.core.functions.img_processor import ImageProcessor +from contextifier.core.functions.storage_backend import BaseStorageBackend + +logger = logging.getLogger("contextify.image_processor.image_file") + + +class ImageFileImageProcessor(ImageProcessor): + """ + Image file-specific image processor. + + Inherits from ImageProcessor and provides image file-specific processing. + Handles standalone image files that are the document themselves. 
+ + Handles: + - Standalone image files (jpg, jpeg, png, gif, bmp, webp) + - Image saving with metadata preservation + - Format conversion if needed + + Example: + processor = ImageFileImageProcessor() + + # Process standalone image + tag = processor.process_image(image_data, source_path="/path/to/image.png") + + # Process with original filename + tag = processor.process_standalone_image(image_data, original_name="photo.jpg") + """ + + def __init__( + self, + directory_path: str = "temp/images", + tag_prefix: str = "[Image:", + tag_suffix: str = "]", + storage_backend: Optional[BaseStorageBackend] = None, + preserve_original_name: bool = False, + ): + """ + Initialize ImageFileImageProcessor. + + Args: + directory_path: Image save directory + tag_prefix: Tag prefix for image references + tag_suffix: Tag suffix for image references + storage_backend: Storage backend for saving images + preserve_original_name: Whether to preserve original filename + """ + super().__init__( + directory_path=directory_path, + tag_prefix=tag_prefix, + tag_suffix=tag_suffix, + storage_backend=storage_backend, + ) + self._preserve_original_name = preserve_original_name + + @property + def preserve_original_name(self) -> bool: + """Whether to preserve original filename.""" + return self._preserve_original_name + + def process_image( + self, + image_data: bytes, + source_path: Optional[str] = None, + original_name: Optional[str] = None, + **kwargs + ) -> Optional[str]: + """ + Process and save image file data. 
+ + Args: + image_data: Raw image binary data + source_path: Original file path + original_name: Original filename + **kwargs: Additional options + + Returns: + Image tag string or None if processing failed + """ + # Use original name if preserve option is set + custom_name = None + if self._preserve_original_name and original_name: + import os + custom_name = os.path.splitext(original_name)[0] + elif source_path: + import os + custom_name = os.path.splitext(os.path.basename(source_path))[0] + + return self.save_image(image_data, custom_name=custom_name) + + def process_standalone_image( + self, + image_data: bytes, + original_name: Optional[str] = None, + **kwargs + ) -> Optional[str]: + """ + Process standalone image file. + + Specialized method for processing image files that are the document. + + Args: + image_data: Raw image binary data + original_name: Original filename + **kwargs: Additional options + + Returns: + Image tag string or None if processing failed + """ + return self.process_image( + image_data, + original_name=original_name, + **kwargs + ) diff --git a/contextifier/core/processor/image_file_helper/image_file_preprocessor.py b/contextifier/core/processor/image_file_helper/image_file_preprocessor.py new file mode 100644 index 0000000..531758d --- /dev/null +++ b/contextifier/core/processor/image_file_helper/image_file_preprocessor.py @@ -0,0 +1,84 @@ +# contextifier/core/processor/image_file_helper/image_file_preprocessor.py +""" +Image File Preprocessor - Process image file after conversion. + +Processing Pipeline Position: + 1. ImageFileConverter.convert() → bytes (raw image data) + 2. ImageFilePreprocessor.preprocess() → PreprocessedData (THIS STEP) + 3. ImageFileMetadataExtractor.extract() → DocumentMetadata + 4. 
OCR processing (if OCR engine available) + +Current Implementation: + - Pass-through (Image uses raw bytes directly for OCR) +""" +import logging +from typing import Any, Dict + +from contextifier.core.functions.preprocessor import ( + BasePreprocessor, + PreprocessedData, +) + +logger = logging.getLogger("contextify.image_file.preprocessor") + + +class ImageFilePreprocessor(BasePreprocessor): + """ + Image File Preprocessor. + + Currently a pass-through implementation as image processing + is handled by the OCR engine. + """ + + def preprocess( + self, + converted_data: Any, + **kwargs + ) -> PreprocessedData: + """ + Preprocess the converted image data. + + Args: + converted_data: Image bytes from ImageFileConverter + **kwargs: Additional options + + Returns: + PreprocessedData with the image data + """ + metadata: Dict[str, Any] = {} + + if isinstance(converted_data, bytes): + metadata['size_bytes'] = len(converted_data) + # Try to detect image format from magic bytes + if converted_data.startswith(b'\xff\xd8\xff'): + metadata['format'] = 'jpeg' + elif converted_data.startswith(b'\x89PNG'): + metadata['format'] = 'png' + elif converted_data.startswith(b'GIF'): + metadata['format'] = 'gif' + elif converted_data.startswith(b'BM'): + metadata['format'] = 'bmp' + elif converted_data.startswith(b'RIFF') and b'WEBP' in converted_data[:12]: + metadata['format'] = 'webp' + + logger.debug("Image file preprocessor: pass-through, metadata=%s", metadata) + + # clean_content is the TRUE SOURCE - contains the image bytes + return PreprocessedData( + raw_content=converted_data, + clean_content=converted_data, # TRUE SOURCE - image bytes + encoding="binary", + extracted_resources={}, + metadata=metadata, + ) + + def get_format_name(self) -> str: + """Return format name.""" + return "Image File Preprocessor" + + def validate(self, data: Any) -> bool: + """Validate if data is image bytes.""" + return isinstance(data, bytes) and len(data) > 0 + + +__all__ = 
['ImageFilePreprocessor'] diff --git a/contextifier/core/processor/pdf_handler.py b/contextifier/core/processor/pdf_handler.py index 81b4490..66ca487 100644 --- a/contextifier/core/processor/pdf_handler.py +++ b/contextifier/core/processor/pdf_handler.py @@ -57,15 +57,14 @@ # Import from new modular helpers from contextifier.core.processor.pdf_helpers.pdf_metadata import ( - extract_pdf_metadata, - format_metadata, + PDFMetadataExtractor, +) +from contextifier.core.processor.pdf_helpers.pdf_image_processor import ( + PDFImageProcessor, ) from contextifier.core.processor.pdf_helpers.pdf_utils import ( bbox_overlaps, ) -from contextifier.core.processor.pdf_helpers.pdf_image import ( - extract_images_from_page, -) from contextifier.core.processor.pdf_helpers.pdf_text_extractor import ( extract_text_blocks, ) @@ -124,20 +123,43 @@ class PDFHandler(BaseHandler): """ PDF Document Handler - + Inherits from BaseHandler to manage config and image_processor at instance level. All internal methods access these via self.config, self.image_processor. - + Usage: handler = PDFHandler(config=config, image_processor=image_processor) text = handler.extract_text(current_file) """ - + + def _create_file_converter(self): + """Create PDF-specific file converter.""" + from contextifier.core.processor.pdf_helpers.pdf_file_converter import PDFFileConverter + return PDFFileConverter() + + def _create_preprocessor(self): + """Create PDF-specific preprocessor.""" + from contextifier.core.processor.pdf_helpers.pdf_preprocessor import PDFPreprocessor + return PDFPreprocessor() + def _create_chart_extractor(self): """PDF chart extraction not yet implemented. 
Return NullChartExtractor.""" from contextifier.core.functions.chart_extractor import NullChartExtractor return NullChartExtractor(self._chart_processor) - + + def _create_metadata_extractor(self): + """Create PDF-specific metadata extractor.""" + return PDFMetadataExtractor() + + def _create_format_image_processor(self): + """Create PDF-specific image processor.""" + return PDFImageProcessor( + directory_path=self._image_processor.config.directory_path, + tag_prefix=self._image_processor.config.tag_prefix, + tag_suffix=self._image_processor.config.tag_suffix, + storage_backend=self._image_processor.storage_backend, + ) + def extract_text( self, current_file: "CurrentFile", @@ -146,19 +168,19 @@ def extract_text( ) -> str: """ Extract text from PDF file. - + Args: current_file: CurrentFile dict containing file info and binary data extract_metadata: Whether to extract metadata **kwargs: Additional options - + Returns: Extracted text """ file_path = current_file.get("file_path", "unknown") self.logger.info(f"[PDF] Processing: {file_path}") return self._extract_pdf(current_file, extract_metadata) - + def _extract_pdf( self, current_file: "CurrentFile", @@ -166,27 +188,31 @@ def _extract_pdf( ) -> str: """ Enhanced PDF processing - adaptive complexity-based. 
- + Args: current_file: CurrentFile dict containing file info and binary data extract_metadata: Whether to extract metadata - + Returns: Extracted text """ file_path = current_file.get("file_path", "unknown") - + file_data = current_file.get("file_data", b"") + try: - # Open PDF from stream to avoid path encoding issues - file_stream = self.get_file_stream(current_file) - doc = fitz.open(stream=file_stream, filetype="pdf") + # Step 1: Use FileConverter to convert binary to fitz.Document + doc = self.file_converter.convert(file_data) + + # Step 2: Preprocess - may transform doc in the future + preprocessed = self.preprocess(doc) + doc = preprocessed.clean_content # TRUE SOURCE + all_pages_text = [] processed_images: Set[int] = set() # Extract metadata if extract_metadata: - metadata = extract_pdf_metadata(doc) - metadata_text = format_metadata(metadata) + metadata_text = self.extract_and_format_metadata(doc) if metadata_text: all_pages_text.append(metadata_text) @@ -326,7 +352,7 @@ def _process_page_hybrid( page_elements.append(elem) if complex_bboxes: - block_engine = BlockImageEngine(page, page_num, image_processor=self.image_processor) + block_engine = BlockImageEngine(page, page_num, image_processor=self.format_image_processor) for complex_bbox in complex_bboxes: result = block_engine.process_region(complex_bbox, region_type="complex_region") @@ -361,7 +387,7 @@ def _process_page_block_ocr( table_bboxes = [elem.bbox for elem in page_tables] if complex_regions: - block_engine = BlockImageEngine(page, page_num, image_processor=self.image_processor) + block_engine = BlockImageEngine(page, page_num, image_processor=self.format_image_processor) for complex_bbox in complex_regions: if any(bbox_overlaps(complex_bbox, tb) for tb in table_bboxes): @@ -443,7 +469,7 @@ def _process_page_full_ocr( return merge_page_elements(page_elements) # Smart block processing - block_engine = BlockImageEngine(page, page_num, image_processor=self.image_processor) + block_engine = 
BlockImageEngine(page, page_num, image_processor=self.format_image_processor) multi_result: MultiBlockResult = block_engine.process_page_smart() if multi_result.success and multi_result.block_results: @@ -501,12 +527,25 @@ def _extract_images_from_page( min_image_size: int = 50, min_image_area: int = 2500 ) -> List[PageElement]: - """Extract images from page using instance's image_processor.""" - return extract_images_from_page( - page, page_num, doc, processed_images, table_bboxes, - image_processor=self.image_processor, - min_image_size=min_image_size, min_image_area=min_image_area - ) + """Extract images from page using instance's format_image_processor.""" + # Use PDFImageProcessor's integrated method + image_processor = self.format_image_processor + if hasattr(image_processor, 'extract_images_from_page'): + elements_dicts = image_processor.extract_images_from_page( + page, page_num, doc, processed_images, table_bboxes, + min_image_size=min_image_size, min_image_area=min_image_area + ) + # Convert dicts to PageElement + return [ + PageElement( + element_type=ElementType.IMAGE, + content=e['content'], + bbox=e['bbox'], + page_num=e['page_num'] + ) + for e in elements_dicts + ] + return [] # ============================================================================ @@ -520,7 +559,7 @@ def extract_text_from_pdf( ) -> str: """ PDF text extraction (legacy function interface). - + This function creates a PDFHandler instance and delegates to it. For new code, consider using PDFHandler class directly. 
@@ -534,13 +573,13 @@ def extract_text_from_pdf( """ if current_config is None: current_config = {} - + # Extract image_processor from config if available image_processor = current_config.get("image_processor") - + # Create handler instance with config and image_processor handler = PDFHandler(config=current_config, image_processor=image_processor) - + return handler.extract_text(file_path, extract_metadata=extract_default_metadata) @@ -555,4 +594,3 @@ def _extract_pdf( ) -> str: """Deprecated: Use PDFHandler.extract_text() instead.""" return extract_text_from_pdf(file_path, current_config, extract_default_metadata) - diff --git a/contextifier/core/processor/pdf_helpers/__init__.py b/contextifier/core/processor/pdf_helpers/__init__.py index 22cfef3..ea8db7d 100644 --- a/contextifier/core/processor/pdf_helpers/__init__.py +++ b/contextifier/core/processor/pdf_helpers/__init__.py @@ -4,10 +4,9 @@ Contains helper modules for PDF processing. """ -# Backward compatibility - import from new modules +# Metadata - class-based extractor from contextifier.core.processor.pdf_helpers.pdf_metadata import ( - extract_pdf_metadata, - format_metadata, + PDFMetadataExtractor, parse_pdf_date, ) @@ -20,8 +19,9 @@ bbox_overlaps, ) -from contextifier.core.processor.pdf_helpers.pdf_image import ( - extract_images_from_page, +# Image Processor (replaces pdf_image.py utility functions) +from contextifier.core.processor.pdf_helpers.pdf_image_processor import ( + PDFImageProcessor, ) from contextifier.core.processor.pdf_helpers.pdf_text_extractor import ( @@ -201,8 +201,8 @@ 'find_image_position', 'get_text_lines_with_positions', 'bbox_overlaps', - # pdf_image - 'extract_images_from_page', + # Image Processor + 'PDFImageProcessor', # pdf_text_extractor 'extract_text_blocks', 'split_ocr_text_to_blocks', diff --git a/contextifier/core/processor/pdf_helpers/pdf_file_converter.py b/contextifier/core/processor/pdf_helpers/pdf_file_converter.py new file mode 100644 index 0000000..ca45542 --- 
/dev/null +++ b/contextifier/core/processor/pdf_helpers/pdf_file_converter.py @@ -0,0 +1,71 @@ +# contextifier/core/processor/pdf_helpers/pdf_file_converter.py +""" +PDFFileConverter - PDF file format converter + +Converts binary PDF data to fitz.Document object using PyMuPDF. +""" +from typing import Any, Optional, BinaryIO + +from contextifier.core.functions.file_converter import BaseFileConverter + + +class PDFFileConverter(BaseFileConverter): + """ + PDF file converter using PyMuPDF (fitz). + + Converts binary PDF data to fitz.Document object. + """ + + # PDF magic number + PDF_MAGIC = b'%PDF' + + def convert( + self, + file_data: bytes, + file_stream: Optional[BinaryIO] = None, + **kwargs + ) -> Any: + """ + Convert binary PDF data to fitz.Document. + + Args: + file_data: Raw binary PDF data + file_stream: Optional file stream (not used, fitz prefers bytes) + **kwargs: Additional options + + Returns: + fitz.Document object + + Raises: + RuntimeError: If PDF cannot be opened + """ + import fitz + return fitz.open(stream=file_data, filetype="pdf") + + def get_format_name(self) -> str: + """Return format name.""" + return "PDF Document" + + def validate(self, file_data: bytes) -> bool: + """ + Validate if data is a valid PDF. + + Args: + file_data: Raw binary file data + + Returns: + True if file appears to be a PDF + """ + if not file_data or len(file_data) < 4: + return False + return file_data[:4] == self.PDF_MAGIC + + def close(self, converted_object: Any) -> None: + """ + Close the fitz.Document. 
+ + Args: + converted_object: fitz.Document to close + """ + if converted_object is not None and hasattr(converted_object, 'close'): + converted_object.close() diff --git a/contextifier/core/processor/pdf_helpers/pdf_image.py b/contextifier/core/processor/pdf_helpers/pdf_image.py deleted file mode 100644 index e401a9f..0000000 --- a/contextifier/core/processor/pdf_helpers/pdf_image.py +++ /dev/null @@ -1,100 +0,0 @@ -# libs/core/processor/pdf_helpers/pdf_image.py -""" -PDF Image Extraction Module - -Provides functions for extracting images from PDF pages. -""" -import logging -from typing import Any, Dict, List, Optional, Set, Tuple - -from contextifier.core.processor.pdf_helpers.types import ( - ElementType, - PageElement, -) -from contextifier.core.processor.pdf_helpers.pdf_utils import ( - find_image_position, - is_inside_any_bbox, -) -from contextifier.core.functions.img_processor import ImageProcessor - -logger = logging.getLogger("document-processor") - - -def extract_images_from_page( - page, - page_num: int, - doc, - processed_images: Set[int], - table_bboxes: List[Tuple[float, float, float, float]], - image_processor: ImageProcessor, - min_image_size: int = 50, - min_image_area: int = 2500 -) -> List[PageElement]: - """ - Extract images from page and save locally. 
- - Args: - page: PyMuPDF page object - page_num: Page number (0-indexed) - doc: PyMuPDF document object - processed_images: Set of already processed image xrefs - table_bboxes: List of table bounding boxes to exclude - image_processor: ImageProcessor instance for saving images - min_image_size: Minimum image dimension (width/height) - min_image_area: Minimum image area - - Returns: - List of PageElement for extracted images - """ - elements = [] - - try: - image_list = page.get_images() - - for img_info in image_list: - xref = img_info[0] - - if xref in processed_images: - continue - - try: - base_image = doc.extract_image(xref) - if not base_image: - continue - - image_bytes = base_image.get("image") - width = base_image.get("width", 0) - height = base_image.get("height", 0) - - if width < min_image_size or height < min_image_size: - continue - if width * height < min_image_area: - continue - - img_bbox = find_image_position(page, xref) - if img_bbox is None: - continue - - if is_inside_any_bbox(img_bbox, table_bboxes, threshold=0.7): - continue - - image_tag = image_processor.save_image(image_bytes) - - if image_tag: - processed_images.add(xref) - - elements.append(PageElement( - element_type=ElementType.IMAGE, - content=f'\n{image_tag}\n', - bbox=img_bbox, - page_num=page_num - )) - - except Exception as e: - logger.debug(f"[PDF] Error extracting image xref={xref}: {e}") - continue - - except Exception as e: - logger.warning(f"[PDF] Error extracting images: {e}") - - return elements diff --git a/contextifier/core/processor/pdf_helpers/pdf_image_processor.py b/contextifier/core/processor/pdf_helpers/pdf_image_processor.py new file mode 100644 index 0000000..32c20b8 --- /dev/null +++ b/contextifier/core/processor/pdf_helpers/pdf_image_processor.py @@ -0,0 +1,321 @@ +# contextifier/core/processor/pdf_helpers/pdf_image_processor.py +""" +PDF Image Processor + +Provides PDF-specific image processing that inherits from ImageProcessor. 
+Handles XRef images, inline images, and page rendering for complex regions. + +This class consolidates all PDF image extraction logic including: +- XRef-based image extraction +- Page region rendering +- Image filtering by size/position +""" +import logging +from typing import Any, Dict, List, Optional, Set, Tuple, TYPE_CHECKING + +from contextifier.core.functions.img_processor import ImageProcessor +from contextifier.core.functions.storage_backend import BaseStorageBackend + +if TYPE_CHECKING: + import fitz + +logger = logging.getLogger("contextify.image_processor.pdf") + + +class PDFImageProcessor(ImageProcessor): + """ + PDF-specific image processor. + + Inherits from ImageProcessor and provides PDF-specific processing. + + Handles: + - XRef images (embedded images with XRef references) + - Inline images + - Page region rendering for complex areas + - Image extraction from PyMuPDF objects + + Example: + processor = PDFImageProcessor() + + # Process XRef image + tag = processor.process_image(image_data, xref=123) + + # Process page region + tag = processor.process_page_region(page, rect) + """ + + def __init__( + self, + directory_path: str = "temp/images", + tag_prefix: str = "[Image:", + tag_suffix: str = "]", + storage_backend: Optional[BaseStorageBackend] = None, + dpi: int = 150, + ): + """ + Initialize PDFImageProcessor. 
+ + Args: + directory_path: Image save directory + tag_prefix: Tag prefix for image references + tag_suffix: Tag suffix for image references + storage_backend: Storage backend for saving images + dpi: DPI for page rendering + """ + super().__init__( + directory_path=directory_path, + tag_prefix=tag_prefix, + tag_suffix=tag_suffix, + storage_backend=storage_backend, + ) + self._dpi = dpi + + @property + def dpi(self) -> int: + """DPI for page rendering.""" + return self._dpi + + @dpi.setter + def dpi(self, value: int) -> None: + """Set DPI for page rendering.""" + self._dpi = value + + def process_image( + self, + image_data: bytes, + xref: Optional[int] = None, + page_num: Optional[int] = None, + **kwargs + ) -> Optional[str]: + """ + Process and save PDF image data. + + Args: + image_data: Raw image binary data + xref: Image XRef number (for naming) + page_num: Page number (for naming) + **kwargs: Additional options + + Returns: + Image tag string, or None on failure + """ + # Generate custom name based on XRef or page + custom_name = None + if xref is not None: + custom_name = f"pdf_xref_{xref}" + elif page_num is not None: + custom_name = f"pdf_page_{page_num}" + + return self.save_image(image_data, custom_name=custom_name) + + def process_xref_image( + self, + doc: "fitz.Document", + xref: int, + ) -> Optional[str]: + """ + Extract and save image by XRef number. 
+ + Args: + doc: PyMuPDF document object + xref: Image XRef number + + Returns: + Image tag string, or None on failure + """ + try: + import fitz + + image_dict = doc.extract_image(xref) + if not image_dict: + return None + + image_data = image_dict.get("image") + if not image_data: + return None + + return self.process_image(image_data, xref=xref) + + except Exception as e: + self._logger.warning(f"Failed to extract XRef image {xref}: {e}") + return None + + def process_page_region( + self, + page: "fitz.Page", + rect: "fitz.Rect", + region_name: Optional[str] = None, + ) -> Optional[str]: + """ + Render and save a page region as image. + + Used for complex regions that can't be represented as text. + + Args: + page: PyMuPDF page object + rect: Region rectangle to render + region_name: Optional name for the region + + Returns: + Image tag string, or None on failure + """ + try: + import fitz + + # Calculate zoom for DPI + zoom = self._dpi / 72.0 + mat = fitz.Matrix(zoom, zoom) + + # Clip to region + clip = rect + pix = page.get_pixmap(matrix=mat, clip=clip, alpha=False) + image_data = pix.tobytes("png") + + custom_name = region_name or f"pdf_page{page.number}_region" + return self.save_image(image_data, custom_name=custom_name) + + except Exception as e: + self._logger.warning(f"Failed to render page region: {e}") + return None + + def process_embedded_image( + self, + image_data: bytes, + image_name: Optional[str] = None, + xref: Optional[int] = None, + **kwargs + ) -> Optional[str]: + """ + Process embedded PDF image. 
+ + Args: + image_data: Image binary data + image_name: Original image name + xref: Image XRef number + **kwargs: Additional options + + Returns: + Image tag string, or None on failure + """ + custom_name = image_name + if custom_name is None and xref is not None: + custom_name = f"pdf_embedded_{xref}" + + return self.save_image(image_data, custom_name=custom_name) + + def render_page( + self, + page: "fitz.Page", + alpha: bool = False, + ) -> Optional[str]: + """ + Render entire page as image. + + Args: + page: PyMuPDF page object + alpha: Include alpha channel + + Returns: + Image tag string, or None on failure + """ + try: + import fitz + + zoom = self._dpi / 72.0 + mat = fitz.Matrix(zoom, zoom) + pix = page.get_pixmap(matrix=mat, alpha=alpha) + image_data = pix.tobytes("png") + + custom_name = f"pdf_page_{page.number + 1}_full" + return self.save_image(image_data, custom_name=custom_name) + + except Exception as e: + self._logger.warning(f"Failed to render page: {e}") + return None + + def extract_images_from_page( + self, + page: "fitz.Page", + page_num: int, + doc: "fitz.Document", + processed_images: Set[int], + table_bboxes: List[Tuple[float, float, float, float]], + min_image_size: int = 50, + min_image_area: int = 2500 + ) -> List[Dict[str, Any]]: + """ + Extract images from PDF page. + + This consolidates the logic from pdf_image.py extract_images_from_page(). 
+ + Args: + page: PyMuPDF page object + page_num: Page number (0-indexed) + doc: PyMuPDF document object + processed_images: Set of already processed image xrefs + table_bboxes: List of table bounding boxes to exclude + min_image_size: Minimum image dimension + min_image_area: Minimum image area + + Returns: + List of dicts with 'content', 'bbox', 'page_num' keys + """ + from contextifier.core.processor.pdf_helpers.pdf_utils import ( + find_image_position, + is_inside_any_bbox, + ) + + elements = [] + + try: + image_list = page.get_images() + + for img_info in image_list: + xref = img_info[0] + + if xref in processed_images: + continue + + try: + base_image = doc.extract_image(xref) + if not base_image: + continue + + image_bytes = base_image.get("image") + width = base_image.get("width", 0) + height = base_image.get("height", 0) + + if width < min_image_size or height < min_image_size: + continue + if width * height < min_image_area: + continue + + img_bbox = find_image_position(page, xref) + if img_bbox is None: + continue + + if is_inside_any_bbox(img_bbox, table_bboxes, threshold=0.7): + continue + + # Use format-specific process_image method + image_tag = self.process_image(image_bytes, xref=xref, page_num=page_num) + + if image_tag: + processed_images.add(xref) + elements.append({ + 'content': f'\n{image_tag}\n', + 'bbox': img_bbox, + 'page_num': page_num + }) + + except Exception as e: + logger.debug(f"[PDF] Error extracting image xref={xref}: {e}") + continue + + except Exception as e: + logger.warning(f"[PDF] Error extracting images: {e}") + + return elements + + +__all__ = ["PDFImageProcessor"] diff --git a/contextifier/core/processor/pdf_helpers/pdf_metadata.py b/contextifier/core/processor/pdf_helpers/pdf_metadata.py index a226e98..ad1693d 100644 --- a/contextifier/core/processor/pdf_helpers/pdf_metadata.py +++ b/contextifier/core/processor/pdf_helpers/pdf_metadata.py @@ -2,61 +2,71 @@ """ PDF Metadata Extraction Module -Provides functions for 
extracting and formatting PDF document metadata. +Provides PDFMetadataExtractor class for extracting and formatting PDF document metadata. +Implements BaseMetadataExtractor interface from contextifier.core.functions. """ import logging from datetime import datetime from typing import Any, Dict, Optional +from contextifier.core.functions.metadata_extractor import ( + BaseMetadataExtractor, + DocumentMetadata, +) + logger = logging.getLogger("document-processor") -def extract_pdf_metadata(doc) -> Dict[str, Any]: +class PDFMetadataExtractor(BaseMetadataExtractor): """ - Extract metadata from a PDF document. - - Args: - doc: PyMuPDF document object - - Returns: - Metadata dictionary + PDF Metadata Extractor. + + Extracts metadata from PyMuPDF (fitz) document objects. + + Supported fields: + - title, subject, author, keywords + - create_time, last_saved_time + + Usage: + extractor = PDFMetadataExtractor() + metadata = extractor.extract(pdf_doc) + text = extractor.format(metadata) """ - metadata = {} - - try: - pdf_meta = doc.metadata - if not pdf_meta: - return metadata - - if pdf_meta.get('title'): - metadata['title'] = pdf_meta['title'].strip() - - if pdf_meta.get('subject'): - metadata['subject'] = pdf_meta['subject'].strip() - - if pdf_meta.get('author'): - metadata['author'] = pdf_meta['author'].strip() - - if pdf_meta.get('keywords'): - metadata['keywords'] = pdf_meta['keywords'].strip() - - if pdf_meta.get('creationDate'): - create_time = parse_pdf_date(pdf_meta['creationDate']) - if create_time: - metadata['create_time'] = create_time - - if pdf_meta.get('modDate'): - mod_time = parse_pdf_date(pdf_meta['modDate']) - if mod_time: - metadata['last_saved_time'] = mod_time - - except Exception as e: - logger.debug(f"[PDF] Error extracting metadata: {e}") - - return metadata - - -def parse_pdf_date(date_str: str) -> Optional[datetime]: + + def extract(self, source: Any) -> DocumentMetadata: + """ + Extract metadata from PDF document. 
+ + Args: + source: PyMuPDF document object (fitz.Document) + + Returns: + DocumentMetadata instance containing extracted metadata. + """ + try: + pdf_meta = source.metadata + if not pdf_meta: + return DocumentMetadata() + + return DocumentMetadata( + title=self._get_stripped(pdf_meta, 'title'), + subject=self._get_stripped(pdf_meta, 'subject'), + author=self._get_stripped(pdf_meta, 'author'), + keywords=self._get_stripped(pdf_meta, 'keywords'), + create_time=parse_pdf_date(pdf_meta.get('creationDate')), + last_saved_time=parse_pdf_date(pdf_meta.get('modDate')), + ) + except Exception as e: + self.logger.debug(f"[PDF] Error extracting metadata: {e}") + return DocumentMetadata() + + def _get_stripped(self, meta: Dict[str, Any], key: str) -> Optional[str]: + """Get stripped string value from metadata dict.""" + value = meta.get(key) + return value.strip() if value else None + + +def parse_pdf_date(date_str: Optional[str]) -> Optional[datetime]: """ Convert a PDF date string to datetime. @@ -84,37 +94,7 @@ def parse_pdf_date(date_str: str) -> Optional[datetime]: return None -def format_metadata(metadata: Dict[str, Any]) -> str: - """ - Format metadata as a string. 
- - Args: - metadata: Metadata dictionary - - Returns: - Formatted metadata string - """ - if not metadata: - return "" - - lines = [""] - - field_names = { - 'title': 'Title', - 'subject': 'Subject', - 'author': 'Author', - 'keywords': 'Keywords', - 'create_time': 'Created', - 'last_saved_time': 'Last Modified' - } - - for key, label in field_names.items(): - value = metadata.get(key) - if value: - if isinstance(value, datetime): - value = value.strftime("%Y-%m-%d %H:%M:%S") - lines.append(f" {label}: {value}") - - lines.append("\n") - - return "\n".join(lines) +__all__ = [ + "PDFMetadataExtractor", + "parse_pdf_date", +] diff --git a/contextifier/core/processor/pdf_helpers/pdf_preprocessor.py b/contextifier/core/processor/pdf_helpers/pdf_preprocessor.py new file mode 100644 index 0000000..8198617 --- /dev/null +++ b/contextifier/core/processor/pdf_helpers/pdf_preprocessor.py @@ -0,0 +1,106 @@ +# contextifier/core/processor/pdf_helpers/pdf_preprocessor.py +""" +PDF Preprocessor - Process PDF document after conversion. + +This preprocessor handles PDF-specific processing after the document +has been converted from binary to fitz.Document. + +Processing Pipeline Position: + 1. PDFFileConverter.convert() → fitz.Document + 2. PDFPreprocessor.preprocess() → PreprocessedData (THIS STEP) + 3. PDFMetadataExtractor.extract() → DocumentMetadata + 4. Content extraction (text, images, tables) + +Current Implementation: + - Pass-through (no special preprocessing needed for PDF) + - PDF processing is done during content extraction phase + +Future Enhancements: + - Page rotation normalization + - Damaged page recovery + - Font embedding analysis + - Document structure analysis +""" +import logging +from typing import Any, Dict + +from contextifier.core.functions.preprocessor import ( + BasePreprocessor, + PreprocessedData, +) + +logger = logging.getLogger("contextify.pdf.preprocessor") + + +class PDFPreprocessor(BasePreprocessor): + """ + PDF Document Preprocessor. 
+ + Currently a pass-through implementation as PDF processing + is handled during the content extraction phase. + + The fitz.Document object from PDFFileConverter already provides + a clean interface for accessing pages, text, and images. + """ + + def preprocess( + self, + converted_data: Any, + **kwargs + ) -> PreprocessedData: + """ + Preprocess the converted PDF document. + + Args: + converted_data: fitz.Document object from PDFFileConverter + **kwargs: Additional options + - analyze_structure: Whether to analyze document structure + - normalize_rotation: Whether to normalize page rotation + + Returns: + PreprocessedData with the document and any extracted resources + """ + # For now, PDF preprocessing is a pass-through + # The fitz.Document is already in a workable state + + # Store the document reference for downstream processing + metadata: Dict[str, Any] = {} + + # If it's a fitz.Document, extract some basic info + if hasattr(converted_data, 'page_count'): + metadata['page_count'] = converted_data.page_count + metadata['is_encrypted'] = getattr(converted_data, 'is_encrypted', False) + metadata['is_pdf'] = getattr(converted_data, 'is_pdf', True) + + logger.debug("PDF preprocessor: pass-through, metadata=%s", metadata) + + # clean_content is the TRUE SOURCE - contains the fitz.Document + return PreprocessedData( + raw_content=converted_data, + clean_content=converted_data, # TRUE SOURCE - fitz.Document + encoding="binary", + extracted_resources={"document": converted_data}, + metadata=metadata, + ) + + def get_format_name(self) -> str: + """Return format name.""" + return "PDF Preprocessor" + + def validate(self, data: Any) -> bool: + """ + Validate if the data can be preprocessed. 
+ + Args: + data: fitz.Document object or bytes + + Returns: + True if valid PDF document + """ + # Check if it's a fitz.Document + if hasattr(data, 'page_count') and hasattr(data, 'load_page'): + return True + return False + + +__all__ = ['PDFPreprocessor'] diff --git a/contextifier/core/processor/ppt_handler.py b/contextifier/core/processor/ppt_handler.py index de1890c..a1a22a8 100644 --- a/contextifier/core/processor/ppt_handler.py +++ b/contextifier/core/processor/ppt_handler.py @@ -7,15 +7,11 @@ import logging from typing import Any, Dict, List, Optional, Set, TYPE_CHECKING -from pptx import Presentation - from contextifier.core.processor.base_handler import BaseHandler from contextifier.core.functions.chart_extractor import BaseChartExtractor from contextifier.core.processor.ppt_helper import ( ElementType, SlideElement, - extract_ppt_metadata, - format_metadata, extract_text_with_bullets, is_simple_table, extract_simple_table_as_text, @@ -29,6 +25,8 @@ merge_slide_elements, ) from contextifier.core.processor.ppt_helper.ppt_chart_extractor import PPTChartExtractor +from contextifier.core.processor.ppt_helper.ppt_metadata import PPTMetadataExtractor +from contextifier.core.processor.ppt_helper.ppt_image_processor import PPTImageProcessor if TYPE_CHECKING: from contextifier.core.document_processor import CurrentFile @@ -39,11 +37,34 @@ class PPTHandler(BaseHandler): """PPT/PPTX File Processing Handler Class""" - + + def _create_file_converter(self): + """Create PPT-specific file converter.""" + from contextifier.core.processor.ppt_helper.ppt_file_converter import PPTFileConverter + return PPTFileConverter() + + def _create_preprocessor(self): + """Create PPT-specific preprocessor.""" + from contextifier.core.processor.ppt_helper.ppt_preprocessor import PPTPreprocessor + return PPTPreprocessor() + def _create_chart_extractor(self) -> BaseChartExtractor: """Create PPT-specific chart extractor.""" return PPTChartExtractor(self._chart_processor) - + + def 
_create_metadata_extractor(self): + """Create PPT-specific metadata extractor.""" + return PPTMetadataExtractor() + + def _create_format_image_processor(self): + """Create PPT-specific image processor.""" + return PPTImageProcessor( + directory_path=self._image_processor.config.directory_path, + tag_prefix=self._image_processor.config.tag_prefix, + tag_suffix=self._image_processor.config.tag_suffix, + storage_backend=self._image_processor.storage_backend, + ) + def extract_text( self, current_file: "CurrentFile", @@ -52,39 +73,45 @@ def extract_text( ) -> str: """ Extract text from PPT/PPTX file. - + Args: current_file: CurrentFile dict containing file info and binary data extract_metadata: Whether to extract metadata **kwargs: Additional options - + Returns: Extracted text """ file_path = current_file.get("file_path", "unknown") self.logger.info(f"PPT processing: {file_path}") return self._extract_ppt_enhanced(current_file, extract_metadata) - + def _extract_ppt_enhanced(self, current_file: "CurrentFile", extract_metadata: bool = True) -> str: """Enhanced PPT processing with pre-extracted charts.""" file_path = current_file.get("file_path", "unknown") self.logger.info(f"Enhanced PPT processing: {file_path}") - + try: - # Open from stream to avoid path encoding issues + # Step 1: Convert to Presentation using file_converter + file_data = current_file.get("file_data", b"") file_stream = self.get_file_stream(current_file) - prs = Presentation(file_stream) + prs = self.file_converter.convert(file_data, file_stream) + + # Step 2: Preprocess - may transform prs in the future + preprocessed = self.preprocess(prs) + prs = preprocessed.clean_content # TRUE SOURCE + result_parts = [] processed_images: Set[str] = set() total_tables = 0 total_images = 0 total_charts = 0 - + # Pre-extract all charts using ChartExtractor file_stream.seek(0) chart_data_list = self.chart_extractor.extract_all_from_file(file_stream) chart_idx = [0] # Mutable container for closure - + def 
get_next_chart() -> str: """Callback to get the next pre-extracted chart content.""" if chart_idx[0] < len(chart_data_list): @@ -92,25 +119,24 @@ def get_next_chart() -> str: chart_idx[0] += 1 return self._format_chart_data(chart_data) return "" - + if extract_metadata: - metadata = extract_ppt_metadata(prs) - metadata_text = format_metadata(metadata) + metadata_text = self.extract_and_format_metadata(prs) if metadata_text: result_parts.append(metadata_text) result_parts.append("") - + for slide_idx, slide in enumerate(prs.slides): slide_tag = self.create_slide_tag(slide_idx + 1) result_parts.append(f"\n{slide_tag}\n") - + elements: List[SlideElement] = [] - + for shape in slide.shapes: try: position = get_shape_position(shape) shape_id = shape.shape_id if hasattr(shape, 'shape_id') else id(shape) - + if shape.has_table: if is_simple_table(shape.table): simple_text = extract_simple_table_as_text(shape.table) @@ -131,9 +157,9 @@ def get_next_chart() -> str: position=position, shape_id=shape_id )) - + elif is_picture_shape(shape): - image_tag = process_image_shape(shape, processed_images, self.image_processor) + image_tag = process_image_shape(shape, processed_images, self.format_image_processor) if image_tag: total_images += 1 elements.append(SlideElement( @@ -142,7 +168,7 @@ def get_next_chart() -> str: position=position, shape_id=shape_id )) - + elif shape.has_chart: # Use pre-extracted chart via callback chart_text = get_next_chart() @@ -154,7 +180,7 @@ def get_next_chart() -> str: position=position, shape_id=shape_id )) - + elif hasattr(shape, "text_frame") and shape.text_frame: text_content = extract_text_with_bullets(shape.text_frame) if text_content: @@ -164,7 +190,7 @@ def get_next_chart() -> str: position=position, shape_id=shape_id )) - + elif hasattr(shape, "text") and shape.text.strip(): elements.append(SlideElement( element_type=ElementType.TEXT, @@ -172,46 +198,46 @@ def get_next_chart() -> str: position=position, shape_id=shape_id )) - + elif 
hasattr(shape, "shapes"): - group_elements = process_group_shape(shape, processed_images, self.image_processor) + group_elements = process_group_shape(shape, processed_images, self.format_image_processor) elements.extend(group_elements) - + except Exception as shape_e: self.logger.warning(f"Error processing shape in slide {slide_idx + 1}: {shape_e}") continue - + elements.sort(key=lambda e: e.sort_key) slide_content = merge_slide_elements(elements) - + if slide_content.strip(): result_parts.append(slide_content) else: result_parts.append("[Empty Slide]\n") - + notes_text = extract_slide_notes(slide) if notes_text: result_parts.append(f"\n[Slide Notes]\n{notes_text}\n") - + result = "".join(result_parts) self.logger.info(f"Enhanced PPT: {len(prs.slides)} slides, {total_tables} tables, " f"{total_images} images, {total_charts} charts") - + return result - + except Exception as e: self.logger.error(f"Error in enhanced PPT processing: {e}") import traceback self.logger.debug(traceback.format_exc()) return self._extract_ppt_simple(current_file) - + def _format_chart_data(self, chart_data: "ChartData") -> str: """Format ChartData using ChartProcessor.""" from contextifier.core.functions.chart_extractor import ChartData - + if not isinstance(chart_data, ChartData): return "" - + if chart_data.has_data(): return self.chart_processor.format_chart_data( chart_type=chart_data.chart_type, @@ -224,18 +250,19 @@ def _format_chart_data(self, chart_data: "ChartData") -> str: chart_type=chart_data.chart_type, title=chart_data.title ) - + def _extract_ppt_simple(self, current_file: "CurrentFile") -> str: """Simple text extraction (fallback).""" try: + file_data = current_file.get("file_data", b"") file_stream = self.get_file_stream(current_file) - prs = Presentation(file_stream) + prs = self.file_converter.convert(file_data, file_stream) result_parts = [] - + for slide_idx, slide in enumerate(prs.slides): slide_tag = self.create_slide_tag(slide_idx + 1) 
result_parts.append(f"\n{slide_tag}\n") - + slide_texts = [] for shape in slide.shapes: try: @@ -247,14 +274,14 @@ def _extract_ppt_simple(self, current_file: "CurrentFile") -> str: slide_texts.append(table_text) except: continue - + if slide_texts: result_parts.append("\n".join(slide_texts) + "\n") else: result_parts.append("[Empty Slide]\n") - + return "".join(result_parts) - + except Exception as e: self.logger.error(f"Error in simple PPT extraction: {e}") return f"[PPT file processing failed: {str(e)}]" diff --git a/contextifier/core/processor/ppt_helper/__init__.py b/contextifier/core/processor/ppt_helper/__init__.py index 75c07fb..4612d62 100644 --- a/contextifier/core/processor/ppt_helper/__init__.py +++ b/contextifier/core/processor/ppt_helper/__init__.py @@ -24,8 +24,7 @@ # === Metadata === from contextifier.core.processor.ppt_helper.ppt_metadata import ( - extract_ppt_metadata, - format_metadata, + PPTMetadataExtractor, ) # === Bullet/Numbering === diff --git a/contextifier/core/processor/ppt_helper/ppt_file_converter.py b/contextifier/core/processor/ppt_helper/ppt_file_converter.py new file mode 100644 index 0000000..d7246a7 --- /dev/null +++ b/contextifier/core/processor/ppt_helper/ppt_file_converter.py @@ -0,0 +1,54 @@ +# libs/core/processor/ppt_helper/ppt_file_converter.py +""" +PPTFileConverter - PPT/PPTX file format converter + +Converts binary PPT/PPTX data to python-pptx Presentation object. +""" +from io import BytesIO +from typing import Any, Optional, BinaryIO + +from contextifier.core.functions.file_converter import BaseFileConverter + + +class PPTFileConverter(BaseFileConverter): + """ + PPT/PPTX file converter using python-pptx. + + Converts binary PPT/PPTX data to Presentation object. + """ + + # ZIP magic number (PPTX is a ZIP file) + ZIP_MAGIC = b'PK\x03\x04' + + def convert( + self, + file_data: bytes, + file_stream: Optional[BinaryIO] = None, + **kwargs + ) -> Any: + """ + Convert binary PPT/PPTX data to Presentation object. 
+ + Args: + file_data: Raw binary PPT/PPTX data + file_stream: Optional file stream + **kwargs: Additional options + + Returns: + pptx.Presentation object + """ + from pptx import Presentation + + stream = file_stream if file_stream is not None else BytesIO(file_data) + stream.seek(0) + return Presentation(stream) + + def get_format_name(self) -> str: + """Return format name.""" + return "PPT/PPTX Presentation" + + def validate(self, file_data: bytes) -> bool: + """Validate if data is a valid PPTX.""" + if not file_data or len(file_data) < 4: + return False + return file_data[:4] == self.ZIP_MAGIC diff --git a/contextifier/core/processor/ppt_helper/ppt_image_processor.py b/contextifier/core/processor/ppt_helper/ppt_image_processor.py new file mode 100644 index 0000000..af05972 --- /dev/null +++ b/contextifier/core/processor/ppt_helper/ppt_image_processor.py @@ -0,0 +1,196 @@ +# contextifier/core/processor/ppt_helper/ppt_image_processor.py +""" +PPT Image Processor + +Provides PPT/PPTX-specific image processing that inherits from ImageProcessor. +Handles slide images, shape images, and embedded pictures. +""" +import logging +from typing import Any, Dict, Optional, Set, TYPE_CHECKING + +from contextifier.core.functions.img_processor import ImageProcessor +from contextifier.core.functions.storage_backend import BaseStorageBackend + +if TYPE_CHECKING: + from pptx import Presentation + from pptx.slide import Slide + from pptx.shapes.base import BaseShape + +logger = logging.getLogger("contextify.image_processor.ppt") + + +class PPTImageProcessor(ImageProcessor): + """ + PPT/PPTX-specific image processor. + + Inherits from ImageProcessor and provides PPT-specific processing. 
+ + Handles: + - Picture shapes + - Embedded images + - Group shape images + - Background images + + Example: + processor = PPTImageProcessor() + + # Process slide image + tag = processor.process_image(image_data, slide_num=1) + + # Process from shape + tag = processor.process_picture_shape(shape) + """ + + def __init__( + self, + directory_path: str = "temp/images", + tag_prefix: str = "[Image:", + tag_suffix: str = "]", + storage_backend: Optional[BaseStorageBackend] = None, + ): + """ + Initialize PPTImageProcessor. + + Args: + directory_path: Image save directory + tag_prefix: Tag prefix for image references + tag_suffix: Tag suffix for image references + storage_backend: Storage backend for saving images + """ + super().__init__( + directory_path=directory_path, + tag_prefix=tag_prefix, + tag_suffix=tag_suffix, + storage_backend=storage_backend, + ) + + def process_image( + self, + image_data: bytes, + slide_num: Optional[int] = None, + shape_id: Optional[int] = None, + **kwargs + ) -> Optional[str]: + """ + Process and save PPT image data. + + Args: + image_data: Raw image binary data + slide_num: Source slide number (for naming) + shape_id: Shape ID (for naming) + **kwargs: Additional options + + Returns: + Image tag string, or None on failure + """ + custom_name = None + if slide_num is not None: + if shape_id is not None: + custom_name = f"ppt_slide{slide_num}_shape{shape_id}" + else: + custom_name = f"ppt_slide{slide_num}" + elif shape_id is not None: + custom_name = f"ppt_shape{shape_id}" + + return self.save_image(image_data, custom_name=custom_name) + + def process_picture_shape( + self, + shape: "BaseShape", + slide_num: Optional[int] = None, + ) -> Optional[str]: + """ + Process python-pptx picture shape. 
+ + Args: + shape: Picture shape object + slide_num: Source slide number + + Returns: + Image tag string, or None on failure + """ + try: + if not hasattr(shape, 'image'): + return None + + image = shape.image + image_data = image.blob + + if not image_data: + return None + + shape_id = shape.shape_id if hasattr(shape, 'shape_id') else None + + return self.process_image( + image_data, + slide_num=slide_num, + shape_id=shape_id + ) + + except Exception as e: + self._logger.warning(f"Failed to process picture shape: {e}") + return None + + def process_embedded_image( + self, + image_data: bytes, + image_name: Optional[str] = None, + slide_num: Optional[int] = None, + **kwargs + ) -> Optional[str]: + """ + Process embedded PPT image. + + Args: + image_data: Image binary data + image_name: Original image filename + slide_num: Source slide number + **kwargs: Additional options + + Returns: + Image tag string, or None on failure + """ + custom_name = image_name + if custom_name is None and slide_num is not None: + custom_name = f"ppt_embed_slide{slide_num}" + + return self.save_image(image_data, custom_name=custom_name) + + def process_group_shape_images( + self, + group_shape: "BaseShape", + slide_num: Optional[int] = None, + ) -> list: + """ + Process all images in a group shape. 
+ + Args: + group_shape: Group shape containing other shapes + slide_num: Source slide number + + Returns: + List of image tags + """ + tags = [] + + try: + if not hasattr(group_shape, 'shapes'): + return tags + + for shape in group_shape.shapes: + if hasattr(shape, 'image'): + tag = self.process_picture_shape(shape, slide_num) + if tag: + tags.append(tag) + elif hasattr(shape, 'shapes'): + # Nested group + nested_tags = self.process_group_shape_images(shape, slide_num) + tags.extend(nested_tags) + + except Exception as e: + self._logger.warning(f"Failed to process group shape: {e}") + + return tags + + +__all__ = ["PPTImageProcessor"] diff --git a/contextifier/core/processor/ppt_helper/ppt_metadata.py b/contextifier/core/processor/ppt_helper/ppt_metadata.py index 9d8f487..ed12d94 100644 --- a/contextifier/core/processor/ppt_helper/ppt_metadata.py +++ b/contextifier/core/processor/ppt_helper/ppt_metadata.py @@ -1,105 +1,71 @@ +# contextifier/core/processor/ppt_helper/ppt_metadata.py """ -PPT 메타데이터 추출 모듈 +PPT Metadata Extraction Module -포함 함수: -- extract_ppt_metadata(): PPT에서 메타데이터 추출 -- format_metadata(): 메타데이터를 읽기 쉬운 문자열로 변환 +Provides PPTMetadataExtractor class for extracting metadata from PowerPoint documents. +Implements BaseMetadataExtractor interface. """ import logging -from datetime import datetime -from typing import Any, Dict +from typing import Any, Optional from pptx import Presentation -logger = logging.getLogger("document-processor") - - -def extract_ppt_metadata(prs: Presentation) -> Dict[str, Any]: - """ - PPT 파일에서 메타데이터를 추출합니다. 
- - python-pptx의 core_properties를 통해 다음 정보를 추출합니다: - - 제목 (title) - - 주제 (subject) - - 작성자 (author) - - 키워드 (keywords) - - 설명 (comments) - - 마지막 수정자 (last_modified_by) - - 작성일 (created) - - 수정일 (modified) - - Args: - prs: python-pptx Presentation 객체 - - Returns: - 메타데이터 딕셔너리 - """ - metadata = {} - - try: - props = prs.core_properties - - if props.title: - metadata['title'] = props.title - if props.subject: - metadata['subject'] = props.subject - if props.author: - metadata['author'] = props.author - if props.keywords: - metadata['keywords'] = props.keywords - if props.comments: - metadata['comments'] = props.comments - if props.last_modified_by: - metadata['last_saved_by'] = props.last_modified_by - if props.created: - metadata['create_time'] = props.created - if props.modified: - metadata['last_saved_time'] = props.modified - - logger.info(f"Extracted PPT metadata: {metadata}") +from contextifier.core.functions.metadata_extractor import ( + BaseMetadataExtractor, + DocumentMetadata, +) - except Exception as e: - logger.warning(f"Failed to extract PPT metadata: {e}") - - return metadata +logger = logging.getLogger("document-processor") -def format_metadata(metadata: Dict[str, Any]) -> str: +class PPTMetadataExtractor(BaseMetadataExtractor): """ - 메타데이터 딕셔너리를 읽기 쉬운 문자열로 변환합니다. - - Args: - metadata: 메타데이터 딕셔너리 - - Returns: - 포맷팅된 메타데이터 문자열 + PPT/PPTX Metadata Extractor. + + Extracts metadata from python-pptx Presentation objects. 
+ + Supported fields: + - title, subject, author, keywords, comments + - last_saved_by, create_time, last_saved_time + + Usage: + extractor = PPTMetadataExtractor() + metadata = extractor.extract(presentation) + text = extractor.format(metadata) """ - if not metadata: - return "" - - lines = [""] - - field_names = { - 'title': '제목', - 'subject': '주제', - 'author': '작성자', - 'keywords': '키워드', - 'comments': '설명', - 'last_saved_by': '마지막 저장자', - 'create_time': '작성일', - 'last_saved_time': '수정일', - } - - for key, label in field_names.items(): - if key in metadata and metadata[key]: - value = metadata[key] - - # datetime 객체 포맷팅 - if isinstance(value, datetime): - value = value.strftime('%Y-%m-%d %H:%M:%S') - - lines.append(f" {label}: {value}") - - lines.append("") - - return "\n".join(lines) + + def extract(self, source: Presentation) -> DocumentMetadata: + """ + Extract metadata from PPT document. + + Args: + source: python-pptx Presentation object + + Returns: + DocumentMetadata instance containing extracted metadata. 
+ """ + try: + props = source.core_properties + + return DocumentMetadata( + title=self._get_value(props.title), + subject=self._get_value(props.subject), + author=self._get_value(props.author), + keywords=self._get_value(props.keywords), + comments=self._get_value(props.comments), + last_saved_by=self._get_value(props.last_modified_by), + create_time=props.created, + last_saved_time=props.modified, + ) + except Exception as e: + self.logger.warning(f"Failed to extract PPT metadata: {e}") + return DocumentMetadata() + + def _get_value(self, value: Optional[str]) -> Optional[str]: + """Return value if present, None otherwise.""" + return value if value else None + + +__all__ = [ + 'PPTMetadataExtractor', +] diff --git a/contextifier/core/processor/ppt_helper/ppt_preprocessor.py b/contextifier/core/processor/ppt_helper/ppt_preprocessor.py new file mode 100644 index 0000000..4a28b0d --- /dev/null +++ b/contextifier/core/processor/ppt_helper/ppt_preprocessor.py @@ -0,0 +1,77 @@ +# contextifier/core/processor/ppt_helper/ppt_preprocessor.py +""" +PPT Preprocessor - Process PPT/PPTX presentation after conversion. + +Processing Pipeline Position: + 1. PPTFileConverter.convert() → pptx.Presentation + 2. PPTPreprocessor.preprocess() → PreprocessedData (THIS STEP) + 3. PPTMetadataExtractor.extract() → DocumentMetadata + 4. Content extraction (slides, shapes, images, charts) + +Current Implementation: + - Pass-through (PPT uses python-pptx Presentation object directly) +""" +import logging +from typing import Any, Dict + +from contextifier.core.functions.preprocessor import ( + BasePreprocessor, + PreprocessedData, +) + +logger = logging.getLogger("contextify.ppt.preprocessor") + + +class PPTPreprocessor(BasePreprocessor): + """ + PPT/PPTX Presentation Preprocessor. + + Currently a pass-through implementation as PPT processing + is handled during the content extraction phase using python-pptx. 
+ """ + + def preprocess( + self, + converted_data: Any, + **kwargs + ) -> PreprocessedData: + """ + Preprocess the converted PPT presentation. + + Args: + converted_data: pptx.Presentation object from PPTFileConverter + **kwargs: Additional options + + Returns: + PreprocessedData with the presentation and any extracted resources + """ + metadata: Dict[str, Any] = {} + + if hasattr(converted_data, 'slides'): + metadata['slide_count'] = len(converted_data.slides) + + if hasattr(converted_data, 'slide_width'): + metadata['slide_width'] = converted_data.slide_width + metadata['slide_height'] = converted_data.slide_height + + logger.debug("PPT preprocessor: pass-through, metadata=%s", metadata) + + # clean_content is the TRUE SOURCE - contains the Presentation + return PreprocessedData( + raw_content=converted_data, + clean_content=converted_data, # TRUE SOURCE - pptx.Presentation + encoding="utf-8", + extracted_resources={}, + metadata=metadata, + ) + + def get_format_name(self) -> str: + """Return format name.""" + return "PPT Preprocessor" + + def validate(self, data: Any) -> bool: + """Validate if data is a PPT Presentation object.""" + return hasattr(data, 'slides') and hasattr(data, 'slide_layouts') + + +__all__ = ['PPTPreprocessor'] diff --git a/contextifier/core/processor/rtf_handler.py b/contextifier/core/processor/rtf_handler.py new file mode 100644 index 0000000..5cd8420 --- /dev/null +++ b/contextifier/core/processor/rtf_handler.py @@ -0,0 +1,290 @@ +# contextifier/core/processor/rtf_handler.py +""" +RTF Handler + +Class-based handler for RTF files. +Follows the correct architecture: +1. Converter: Pass through (RTF uses raw binary) +2. Preprocessor: Binary preprocessing (image extraction, \\bin removal) +3. 
Handler: Sequential processing (metadata → tables → content → result) +""" +import logging +import re +from pathlib import Path +from typing import Any, Dict, Optional, TYPE_CHECKING + +from striprtf.striprtf import rtf_to_text + +from contextifier.core.processor.base_handler import BaseHandler +from contextifier.core.functions.img_processor import ImageProcessor +from contextifier.core.functions.chart_extractor import BaseChartExtractor, NullChartExtractor + +# Import from rtf_helper +from contextifier.core.processor.rtf_helper import ( + RTFFileConverter, + RTFConvertedData, + RTFMetadataExtractor, + RTFSourceInfo, + RTFPreprocessor, + extract_tables_with_positions, + extract_inline_content, + extract_text_only, + decode_content, + detect_encoding, +) + +if TYPE_CHECKING: + from contextifier.core.document_processor import CurrentFile + +logger = logging.getLogger("contextify.rtf.handler") + + +class RTFHandler(BaseHandler): + """ + RTF Document Processing Handler. + + Processing flow: + 1. file_converter.convert() → bytes (pass through) + 2. preprocessor.preprocess() → PreprocessedData (image extraction, binary cleanup) + 3. decode content → string + 4. metadata_extractor.extract() → DocumentMetadata + 5. extract_tables_with_positions() → List[RTFTable] + 6. extract_inline_content() → str + 7. Build result string + """ + + def _create_file_converter(self) -> RTFFileConverter: + """Create RTF-specific file converter.""" + return RTFFileConverter() + + def _create_preprocessor(self) -> RTFPreprocessor: + """Create RTF-specific preprocessor.""" + return RTFPreprocessor() + + def _create_chart_extractor(self) -> BaseChartExtractor: + """RTF files do not contain charts. 
Return NullChartExtractor.""" + return NullChartExtractor(self._chart_processor) + + def _create_metadata_extractor(self) -> RTFMetadataExtractor: + """Create RTF-specific metadata extractor.""" + return RTFMetadataExtractor() + + def _create_format_image_processor(self) -> ImageProcessor: + """Create RTF-specific image processor (use base for now).""" + return self._image_processor + + def extract_text( + self, + current_file: "CurrentFile", + extract_metadata: bool = True, + **kwargs + ) -> str: + """ + Extract text from RTF file. + + Args: + current_file: CurrentFile dict containing file info and binary data + extract_metadata: Whether to extract metadata + **kwargs: Additional options + + Returns: + Extracted text + """ + file_path = current_file.get("file_path", "unknown") + file_data = current_file.get("file_data", b"") + + self.logger.info(f"RTF processing: {file_path}") + + if not file_data: + self.logger.error(f"Empty file data: {file_path}") + return f"[RTF file is empty: {file_path}]" + + # Validate RTF format + if not file_data.strip().startswith(b'{\\rtf'): + self.logger.warning(f"Invalid RTF format: {file_path}") + return self._extract_fallback(file_data, extract_metadata) + + try: + # Step 1: Converter - pass through (RTF uses raw binary) + raw_data: bytes = self.file_converter.convert(file_data) + + # Step 2: Preprocessor - extract images, remove binary data + output_dir = self._get_output_dir(file_path) + doc_name = Path(file_path).stem if file_path != "unknown" else "document" + + preprocessed = self.preprocessor.preprocess( + raw_data, + output_dir=output_dir, + doc_name=doc_name, + ) + + clean_content = preprocessed.clean_content + image_tags = preprocessed.extracted_resources.get("image_tags", []) + encoding = preprocessed.encoding or "cp949" + + # Step 3: Decode to string if still bytes + if isinstance(clean_content, bytes): + encoding = detect_encoding(clean_content) or encoding + content = decode_content(clean_content, encoding) + else: + 
content = clean_content + + # Build RTFConvertedData for downstream processing + converted = RTFConvertedData( + content=content, + encoding=encoding, + image_tags=image_tags, + original_size=len(file_data), + ) + + self.logger.debug( + f"RTF preprocessed: encoding={encoding}, " + f"images={len(image_tags)}, size={len(file_data)}" + ) + + # Step 4: Extract content + return self._extract_from_converted( + converted, + current_file, + extract_metadata, + ) + + except Exception as e: + self.logger.error(f"Error in RTF processing: {e}", exc_info=True) + return self._extract_fallback(file_data, extract_metadata) + + def _extract_from_converted( + self, + converted: RTFConvertedData, + current_file: "CurrentFile", + extract_metadata: bool, + ) -> str: + """ + Internal method to extract content from RTFConvertedData. + + Args: + converted: RTFConvertedData object + current_file: CurrentFile dict + extract_metadata: Whether to extract metadata + + Returns: + Extracted text + """ + content = converted.content + encoding = converted.encoding + + result_parts = [] + + # Step 2: Extract metadata + if extract_metadata: + source = RTFSourceInfo(content=content, encoding=encoding) + metadata = self.metadata_extractor.extract(source) + metadata_str = self.metadata_extractor.format(metadata) + if metadata_str: + result_parts.append(metadata_str + "\n\n") + + # Add page tag + page_tag = self.create_page_tag(1) + result_parts.append(f"{page_tag}\n") + + # Step 3: Extract tables with positions + tables, table_regions = extract_tables_with_positions(content, encoding) + + # Step 4: Extract inline content (preserves table positions) + inline_content = extract_inline_content(content, table_regions, encoding) + + if inline_content: + result_parts.append(inline_content) + else: + # Fallback: separate text and tables + text_only = extract_text_only(content, encoding) + if text_only: + result_parts.append(text_only) + + for table in tables: + if not table.rows: + continue + if 
table.is_real_table(): + result_parts.append("\n" + table.to_html() + "\n") + else: + result_parts.append("\n" + table.to_text_list() + "\n") + + # Step 5: Add image tags + if converted.image_tags: + result_parts.append("\n") + for tag in converted.image_tags: + result_parts.append(tag + "\n") + + result = "\n".join(result_parts) + + # Clean up invalid image tags + result = re.sub(r'\[image:[^\]]*uploads/\.[^\]]*\]', '', result) + + return result + + def _extract_fallback( + self, + file_data: bytes, + extract_metadata: bool, + ) -> str: + """ + Fallback extraction using striprtf library. + + Args: + file_data: Raw binary data + extract_metadata: Whether to extract metadata + + Returns: + Extracted text + """ + # Try different encodings + content = None + for encoding in ['utf-8', 'cp949', 'euc-kr', 'cp1252', 'latin-1']: + try: + content = file_data.decode(encoding) + break + except (UnicodeDecodeError, UnicodeError): + continue + + if content is None: + content = file_data.decode('cp1252', errors='replace') + + result_parts = [] + + # Extract metadata from raw content + if extract_metadata: + source = RTFSourceInfo(content=content, encoding='cp1252') + metadata = self.metadata_extractor.extract(source) + metadata_str = self.extract_and_format_metadata(metadata) + if metadata_str: + result_parts.append(metadata_str + "\n\n") + + # Add page tag + page_tag = self.create_page_tag(1) + result_parts.append(f"{page_tag}\n") + + # Extract text using striprtf + try: + text = rtf_to_text(content) + except Exception: + # Manual cleanup + text = re.sub(r'\\[a-z]+\d*\s?', '', content) + text = re.sub(r"\\'[0-9a-fA-F]{2}", '', text) + text = re.sub(r'[{}]', '', text) + + if text: + text = re.sub(r'\n{3,}', '\n\n', text) + result_parts.append(text.strip()) + + return "\n".join(result_parts) + + def _get_output_dir(self, file_path: str) -> Optional[Path]: + """Get output directory for images.""" + if hasattr(self._image_processor, 'config'): + dir_path = 
self._image_processor.config.directory_path + if dir_path: + return Path(dir_path) + return None + + +__all__ = ['RTFHandler'] diff --git a/contextifier/core/processor/rtf_helper/__init__.py b/contextifier/core/processor/rtf_helper/__init__.py new file mode 100644 index 0000000..6a558a9 --- /dev/null +++ b/contextifier/core/processor/rtf_helper/__init__.py @@ -0,0 +1,128 @@ +# contextifier/core/processor/rtf_helper/__init__.py +""" +RTF Helper Module + +Provides RTF parsing and extraction utilities with proper interface separation. + +Architecture: + - RTFPreprocessor: Binary preprocessing (image extraction, \\bin handling) + - RTFFileConverter: Pass through (RTF uses raw binary) + - RTFMetadataExtractor: Metadata extraction + - Table extraction: extract_tables_with_positions() + - Content extraction: extract_inline_content(), extract_text_only() + +Usage: + from contextifier.core.processor.rtf_helper import ( + RTFFileConverter, + RTFConvertedData, + RTFPreprocessor, + RTFMetadataExtractor, + RTFSourceInfo, + extract_tables_with_positions, + extract_inline_content, + extract_text_only, + ) +""" + +# Converter +from contextifier.core.processor.rtf_helper.rtf_file_converter import ( + RTFFileConverter, + RTFConvertedData, +) + +# Preprocessor +from contextifier.core.processor.rtf_helper.rtf_preprocessor import ( + RTFPreprocessor, +) + +# Metadata +from contextifier.core.processor.rtf_helper.rtf_metadata_extractor import ( + RTFMetadataExtractor, + RTFSourceInfo, +) + +# Table extraction +from contextifier.core.processor.rtf_helper.rtf_table_extractor import ( + RTFCellInfo, + RTFTable, + extract_tables_with_positions, +) + +# Content extraction +from contextifier.core.processor.rtf_helper.rtf_content_extractor import ( + extract_inline_content, + extract_text_only, +) + +# Decoder utilities +from contextifier.core.processor.rtf_helper.rtf_decoder import ( + detect_encoding, + decode_content, + decode_bytes, + decode_hex_escapes, +) + +# Text cleaning utilities 
+from contextifier.core.processor.rtf_helper.rtf_text_cleaner import ( + clean_rtf_text, + remove_destination_groups, + remove_shape_groups, + remove_shape_property_groups, + remove_shprslt_blocks, +) + +# Region finder utilities +from contextifier.core.processor.rtf_helper.rtf_region_finder import ( + find_excluded_regions, + is_in_excluded_region, +) + +# Constants +from contextifier.core.processor.rtf_helper.rtf_constants import ( + SHAPE_PROPERTY_NAMES, + SKIP_DESTINATIONS, + EXCLUDE_DESTINATION_KEYWORDS, + IMAGE_DESTINATIONS, + CODEPAGE_ENCODING_MAP, + DEFAULT_ENCODINGS, +) + + +__all__ = [ + # Converter + 'RTFFileConverter', + 'RTFConvertedData', + # Preprocessor + 'RTFPreprocessor', + # Metadata + 'RTFMetadataExtractor', + 'RTFSourceInfo', + # Table + 'RTFCellInfo', + 'RTFTable', + 'extract_tables_with_positions', + # Content + 'extract_inline_content', + 'extract_text_only', + # Decoder + 'detect_encoding', + 'decode_content', + 'decode_bytes', + 'decode_hex_escapes', + # Text cleaner + 'clean_rtf_text', + 'remove_destination_groups', + 'remove_shape_groups', + 'remove_shape_property_groups', + 'remove_shprslt_blocks', + # Region finder + 'find_excluded_regions', + 'is_in_excluded_region', + # Constants + 'SHAPE_PROPERTY_NAMES', + 'SKIP_DESTINATIONS', + 'EXCLUDE_DESTINATION_KEYWORDS', + 'IMAGE_DESTINATIONS', + 'CODEPAGE_ENCODING_MAP', + 'DEFAULT_ENCODINGS', +] diff --git a/contextifier/core/processor/rtf_helper/rtf_constants.py b/contextifier/core/processor/rtf_helper/rtf_constants.py new file mode 100644 index 0000000..a9121a2 --- /dev/null +++ b/contextifier/core/processor/rtf_helper/rtf_constants.py @@ -0,0 +1,94 @@ +# contextifier/core/processor/rtf_helper/rtf_constants.py +""" +RTF Constants + +Constants used for RTF parsing. 
+""" + +# Shape property names (to be removed) +SHAPE_PROPERTY_NAMES = [ + 'shapeType', 'fFlipH', 'fFlipV', 'rotation', + 'posh', 'posrelh', 'posv', 'posrelv', + 'fLayoutInCell', 'fAllowOverlap', 'fBehindDocument', + 'fPseudoInline', 'fLockAnchor', 'fLockPosition', + 'fLockAspectRatio', 'fLockRotation', 'fLockAgainstSelect', + 'fLockCropping', 'fLockVerticies', 'fLockText', + 'fLockAdjustHandles', 'fLockAgainstGrouping', + 'geoLeft', 'geoTop', 'geoRight', 'geoBottom', + 'shapePath', 'pWrapPolygonVertices', 'dxWrapDistLeft', + 'dyWrapDistTop', 'dxWrapDistRight', 'dyWrapDistBottom', + 'fLine', 'fFilled', 'fillType', 'fillColor', + 'fillOpacity', 'fillBackColor', 'fillBackOpacity', + 'lineColor', 'lineOpacity', 'lineWidth', 'lineStyle', + 'lineDashing', 'lineStartArrowhead', 'lineStartArrowWidth', + 'lineStartArrowLength', 'lineEndArrowhead', 'lineEndArrowWidth', + 'lineEndArrowLength', 'shadowType', 'shadowColor', + 'shadowOpacity', 'shadowOffsetX', 'shadowOffsetY', +] + +# RTF destination 키워드 (제외 대상) +EXCLUDE_DESTINATION_KEYWORDS = [ + 'fonttbl', 'colortbl', 'stylesheet', 'listtable', + 'listoverridetable', 'revtbl', 'rsidtbl', 'generator', + 'info', 'xmlnstbl', 'mmathPr', 'themedata', 'colorschememapping', + 'datastore', 'latentstyles', 'pgptbl', 'protusertbl', +] + +# RTF skip destinations +SKIP_DESTINATIONS = { + 'fonttbl', 'colortbl', 'stylesheet', 'listtable', + 'listoverridetable', 'revtbl', 'rsidtbl', 'generator', + 'xmlnstbl', 'mmathPr', 'themedata', 'colorschememapping', + 'datastore', 'latentstyles', 'pgptbl', 'protusertbl', + 'bookmarkstart', 'bookmarkend', 'bkmkstart', 'bkmkend', + 'fldinst', 'fldrslt', # field instructions and results +} + +# Image-related destinations +IMAGE_DESTINATIONS = { + 'pict', 'shppict', 'nonshppict', 'blipuid', +} + +# Codepage to encoding mapping +CODEPAGE_ENCODING_MAP = { + 437: 'cp437', + 850: 'cp850', + 852: 'cp852', + 855: 'cp855', + 857: 'cp857', + 860: 'cp860', + 861: 'cp861', + 863: 'cp863', + 865: 'cp865', + 866: 
'cp866', + 869: 'cp869', + 874: 'cp874', + 932: 'cp932', # Japanese + 936: 'gb2312', # Simplified Chinese + 949: 'cp949', # Korean + 950: 'big5', # Traditional Chinese + 1250: 'cp1250', # Central European + 1251: 'cp1251', # Cyrillic + 1252: 'cp1252', # Western European + 1253: 'cp1253', # Greek + 1254: 'cp1254', # Turkish + 1255: 'cp1255', # Hebrew + 1256: 'cp1256', # Arabic + 1257: 'cp1257', # Baltic + 1258: 'cp1258', # Vietnamese + 10000: 'mac_roman', + 65001: 'utf-8', +} + +# Default encodings to try +DEFAULT_ENCODINGS = ['utf-8', 'cp949', 'euc-kr', 'cp1252', 'latin-1'] + + +__all__ = [ + 'SHAPE_PROPERTY_NAMES', + 'EXCLUDE_DESTINATION_KEYWORDS', + 'SKIP_DESTINATIONS', + 'IMAGE_DESTINATIONS', + 'CODEPAGE_ENCODING_MAP', + 'DEFAULT_ENCODINGS', +] diff --git a/contextifier/core/processor/doc_helpers/rtf_content_extractor.py b/contextifier/core/processor/rtf_helper/rtf_content_extractor.py similarity index 55% rename from contextifier/core/processor/doc_helpers/rtf_content_extractor.py rename to contextifier/core/processor/rtf_helper/rtf_content_extractor.py index 3b02262..f1d1e7f 100644 --- a/contextifier/core/processor/doc_helpers/rtf_content_extractor.py +++ b/contextifier/core/processor/rtf_helper/rtf_content_extractor.py @@ -1,89 +1,77 @@ -# service/document_processor/processor/doc_helpers/rtf_content_extractor.py +# contextifier/core/processor/rtf_helper/rtf_content_extractor.py """ -RTF 콘텐츠 추출기 +RTF Content Extractor -RTF 문서에서 인라인 콘텐츠(텍스트 + 테이블)를 추출하는 기능을 제공합니다. +Extracts inline content (text + tables) from RTF documents. 
""" import logging import re from typing import List, Tuple -from contextifier.core.processor.doc_helpers.rtf_models import ( - RTFTable, - RTFContentPart, -) -from contextifier.core.processor.doc_helpers.rtf_decoder import ( +from contextifier.core.processor.rtf_helper.rtf_decoder import ( decode_hex_escapes, ) -from contextifier.core.processor.doc_helpers.rtf_text_cleaner import ( +from contextifier.core.processor.rtf_helper.rtf_text_cleaner import ( clean_rtf_text, remove_destination_groups, remove_shape_groups, remove_shape_property_groups, ) -from contextifier.core.processor.doc_helpers.rtf_region_finder import ( +from contextifier.core.processor.rtf_helper.rtf_region_finder import ( find_excluded_regions, ) +from contextifier.core.processor.rtf_helper.rtf_table_extractor import ( + RTFTable, +) -logger = logging.getLogger("document-processor") +logger = logging.getLogger("contextify.rtf.content") def extract_inline_content( content: str, table_regions: List[Tuple[int, int, RTFTable]], encoding: str = "cp949" -) -> List[RTFContentPart]: +) -> str: """ - RTF에서 인라인 콘텐츠를 추출합니다. - - 테이블은 원래 위치에 배치됩니다. - + Extract inline content from RTF with tables in original positions. + Args: - content: RTF 문자열 콘텐츠 - table_regions: 테이블 영역 리스트 [(start, end, table), ...] - encoding: 사용할 인코딩 - + content: RTF string content + table_regions: Table region list [(start, end, table), ...] + encoding: Encoding to use + Returns: - 콘텐츠 파트 리스트 + Content string with tables inline """ - content_parts = [] - - # 헤더 영역 제거 (fonttbl, colortbl, stylesheet, info 등) - # 첫 번째 \pard 이전은 헤더로 간주 + # Find header end (before first \pard) header_end = 0 pard_match = re.search(r'\\pard\b', content) if pard_match: header_end = pard_match.start() - - # 제외 영역 찾기 (header, footer, footnote 등) + + # Find excluded regions (header, footer, footnote, etc.) 
excluded_regions = find_excluded_regions(content) - + def clean_segment(segment: str, start_pos: int) -> str: - """세그먼트를 정리하되 제외 영역은 건너뜁니다.""" + """Clean a segment while respecting excluded regions.""" if not excluded_regions: - # 제외 영역이 없으면 전체 정리 segment = remove_destination_groups(segment) decoded = decode_hex_escapes(segment, encoding) return clean_rtf_text(decoded, encoding) - - # 세그먼트 내에서 제외 영역을 마스킹 + result_parts = [] seg_pos = 0 - + for excl_start, excl_end in excluded_regions: - # 세그먼트 기준 상대 위치로 변환 rel_start = excl_start - start_pos rel_end = excl_end - start_pos - - # 세그먼트 범위 내에 있는지 확인 + if rel_end <= 0 or rel_start >= len(segment): - continue # 범위 밖 - - # 범위 조정 + continue + rel_start = max(0, rel_start) rel_end = min(len(segment), rel_end) - - # 제외 영역 전 텍스트 처리 + if rel_start > seg_pos: part = segment[seg_pos:rel_start] part = remove_destination_groups(part) @@ -91,10 +79,9 @@ def clean_segment(segment: str, start_pos: int) -> str: clean = clean_rtf_text(decoded, encoding) if clean.strip(): result_parts.append(clean) - + seg_pos = rel_end - - # 마지막 제외 영역 이후 텍스트 + if seg_pos < len(segment): part = segment[seg_pos:] part = remove_destination_groups(part) @@ -102,110 +89,100 @@ def clean_segment(segment: str, start_pos: int) -> str: clean = clean_rtf_text(decoded, encoding) if clean.strip(): result_parts.append(clean) - + return ' '.join(result_parts) - - # 테이블 영역이 없으면 전체 텍스트만 추출 + + result_parts = [] + + # No tables - just extract text if not table_regions: clean = clean_segment(content[header_end:], header_end) if clean.strip(): - content_parts.append(RTFContentPart( - content_type="text", - position=0, - text=clean - )) - return content_parts - - # 헤더 오프셋 적용 + result_parts.append(clean) + return '\n\n'.join(result_parts) + + # Adjust regions for header offset adjusted_regions = [] for start_pos, end_pos, table in table_regions: - # 헤더 이후 영역만 처리 if end_pos > header_end: adj_start = max(start_pos, header_end) adjusted_regions.append((adj_start, end_pos, 
table)) - - # 콘텐츠 파트 생성 + + # Build content parts last_end = header_end - + for start_pos, end_pos, table in adjusted_regions: - # 테이블 전 텍스트 + # Text before table if start_pos > last_end: segment = content[last_end:start_pos] clean = clean_segment(segment, last_end) if clean.strip(): - content_parts.append(RTFContentPart( - content_type="text", - position=last_end, - text=clean - )) - - # 테이블 - content_parts.append(RTFContentPart( - content_type="table", - position=start_pos, - table=table - )) - + result_parts.append(clean) + + # Table + if table.is_real_table(): + result_parts.append(table.to_html()) + else: + text_list = table.to_text_list() + if text_list: + result_parts.append(text_list) + last_end = end_pos - - # 마지막 부분 (테이블 이후 텍스트) + + # Text after last table if last_end < len(content): segment = content[last_end:] clean = clean_segment(segment, last_end) if clean.strip(): - content_parts.append(RTFContentPart( - content_type="text", - position=last_end, - text=clean - )) + result_parts.append(clean) + + return '\n\n'.join(result_parts) - return content_parts - -def extract_text_legacy(content: str, encoding: str = "cp949") -> str: +def extract_text_only(content: str, encoding: str = "cp949") -> str: """ - RTF에서 일반 텍스트를 추출합니다. - 테이블 영역은 제외하고 추출합니다. - (레거시 호환성을 위해 유지) - + Extract only text from RTF (exclude tables). + + Legacy compatibility function. + Args: - content: RTF 문자열 콘텐츠 - encoding: 사용할 인코딩 - + content: RTF string content + encoding: Encoding to use + Returns: - 추출된 텍스트 + Extracted text """ - # 헤더 영역 제거 (fonttbl, colortbl, stylesheet 등) + # Remove header (fonttbl, colortbl, stylesheet, etc.) 
pard_match = re.search(r'\\pard\b', content) if pard_match: content = content[pard_match.start():] - - # destination 그룹 제거 (latentstyles, themedata 등) + + # Remove destination groups content = remove_destination_groups(content) - - # Shape 그룹 처리 (shptxt 내용은 보존) + + # Handle shape groups (preserve shptxt content) content = remove_shape_groups(content) - - # Shape 속성 그룹 제거 + + # Remove shape property groups content = remove_shape_property_groups(content) - - # 테이블 영역 찾기 및 마킹 + + # Find table regions table_regions = [] for match in re.finditer(r'\\trowd.*?\\row', content, re.DOTALL): table_regions.append((match.start(), match.end())) - - # 테이블 영역을 병합 (인접한 테이블들) + + # Merge adjacent tables merged_regions = [] for start, end in table_regions: if merged_regions and start - merged_regions[-1][1] < 100: merged_regions[-1] = (merged_regions[-1][0], end) else: merged_regions.append((start, end)) - - # 테이블 영역을 제외한 텍스트 추출 + + # Extract text excluding table regions text_parts = [] last_end = 0 - + for start, end in merged_regions: if start > last_end: segment = content[last_end:start] @@ -214,17 +191,21 @@ def extract_text_legacy(content: str, encoding: str = "cp949") -> str: if clean: text_parts.append(clean) last_end = end - - # 마지막 부분 + if last_end < len(content): segment = content[last_end:] decoded = decode_hex_escapes(segment, encoding) clean = clean_rtf_text(decoded, encoding) if clean: text_parts.append(clean) - - # 연속된 빈 줄 정리 + text = '\n'.join(text_parts) text = re.sub(r'\n{3,}', '\n\n', text) - + return text.strip() + + +__all__ = [ + 'extract_inline_content', + 'extract_text_only', +] diff --git a/contextifier/core/processor/doc_helpers/rtf_decoder.py b/contextifier/core/processor/rtf_helper/rtf_decoder.py similarity index 51% rename from contextifier/core/processor/doc_helpers/rtf_decoder.py rename to contextifier/core/processor/rtf_helper/rtf_decoder.py index 4cd8bad..259825f 100644 --- a/contextifier/core/processor/doc_helpers/rtf_decoder.py +++ 
b/contextifier/core/processor/rtf_helper/rtf_decoder.py @@ -1,36 +1,37 @@ -# service/document_processor/processor/doc_helpers/rtf_decoder.py +# contextifier/core/processor/rtf_helper/rtf_decoder.py """ -RTF 디코딩 유틸리티 +RTF Decoding Utilities -RTF 인코딩 감지 및 디코딩 관련 함수들을 제공합니다. +Encoding detection and decoding functions for RTF content. """ import logging import re -from typing import List, Tuple +from typing import List -from contextifier.core.processor.doc_helpers.rtf_constants import ( +from contextifier.core.processor.rtf_helper.rtf_constants import ( CODEPAGE_ENCODING_MAP, DEFAULT_ENCODINGS, ) -logger = logging.getLogger("document-processor") +logger = logging.getLogger("contextify.rtf.decoder") def detect_encoding(content: bytes, default_encoding: str = "cp949") -> str: """ - RTF 콘텐츠에서 인코딩을 감지합니다. - + Detect encoding from RTF content. + + Looks for \\ansicpgXXXX pattern in the header. + Args: - content: RTF 바이트 데이터 - default_encoding: 기본 인코딩 - + content: RTF binary data + default_encoding: Fallback encoding + Returns: - 감지된 인코딩 문자열 + Detected encoding string """ try: text = content[:1000].decode('ascii', errors='ignore') - - # \ansicpgXXXX 패턴 찾기 + match = re.search(r'\\ansicpg(\d+)', text) if match: codepage = int(match.group(1)) @@ -39,44 +40,44 @@ def detect_encoding(content: bytes, default_encoding: str = "cp949") -> str: return encoding except Exception as e: logger.debug(f"Encoding detection failed: {e}") - + return default_encoding def decode_content(content: bytes, encoding: str = "cp949") -> str: """ - RTF 바이너리를 문자열로 디코딩합니다. - - 여러 인코딩을 시도하여 성공하는 첫 번째 결과를 반환합니다. - + Decode RTF binary to string. + + Tries multiple encodings and returns first successful result. 
+ Args: - content: RTF 바이트 데이터 - encoding: 우선 시도할 인코딩 - + content: RTF binary data + encoding: Preferred encoding to try first + Returns: - 디코딩된 문자열 + Decoded string """ encodings = [encoding] + [e for e in DEFAULT_ENCODINGS if e != encoding] - + for enc in encodings: try: return content.decode(enc) except (UnicodeDecodeError, LookupError): continue - + return content.decode('cp1252', errors='replace') def decode_bytes(byte_list: List[int], encoding: str = "cp949") -> str: """ - 바이트 리스트를 문자열로 디코딩합니다. - + Decode byte list to string. + Args: - byte_list: 바이트 값 리스트 - encoding: 사용할 인코딩 - + byte_list: List of byte values + encoding: Encoding to use + Returns: - 디코딩된 문자열 + Decoded string """ try: return bytes(byte_list).decode(encoding) @@ -89,44 +90,52 @@ def decode_bytes(byte_list: List[int], encoding: str = "cp949") -> str: def decode_hex_escapes(text: str, encoding: str = "cp949") -> str: """ - RTF hex escape (\'XX) 시퀀스를 디코딩합니다. - + Decode RTF hex escape sequences (\\'XX). + Args: - text: RTF 텍스트 - encoding: 사용할 인코딩 - + text: RTF text with hex escapes + encoding: Encoding for decoding + Returns: - 디코딩된 텍스트 + Decoded text """ + if "\\'" not in text: + return text + result = [] byte_buffer = [] i = 0 - - while i < len(text): - if text[i:i+2] == "\\'": - # hex escape 발견 + n = len(text) + + while i < n: + if i + 3 < n and text[i:i+2] == "\\'": try: hex_val = text[i+2:i+4] byte_val = int(hex_val, 16) byte_buffer.append(byte_val) i += 4 - except (ValueError, IndexError): - # 잘못된 escape, 그대로 추가 - if byte_buffer: - result.append(decode_bytes(byte_buffer, encoding)) - byte_buffer = [] - result.append(text[i]) - i += 1 - else: - # 일반 문자 - if byte_buffer: - result.append(decode_bytes(byte_buffer, encoding)) - byte_buffer = [] - result.append(text[i]) - i += 1 - - # 남은 바이트 처리 + continue + except ValueError: + pass + + # Flush byte buffer + if byte_buffer: + result.append(decode_bytes(byte_buffer, encoding)) + byte_buffer = [] + + result.append(text[i]) + i += 1 + + # Flush 
remaining bytes if byte_buffer: result.append(decode_bytes(byte_buffer, encoding)) - + return ''.join(result) + + +__all__ = [ + 'detect_encoding', + 'decode_content', + 'decode_bytes', + 'decode_hex_escapes', +] diff --git a/contextifier/core/processor/rtf_helper/rtf_file_converter.py b/contextifier/core/processor/rtf_helper/rtf_file_converter.py new file mode 100644 index 0000000..fecd7b5 --- /dev/null +++ b/contextifier/core/processor/rtf_helper/rtf_file_converter.py @@ -0,0 +1,87 @@ +# contextifier/core/processor/rtf_helper/rtf_file_converter.py +""" +RTF File Converter + +RTF uses raw binary directly, so converter just passes through. +All actual processing is done by Preprocessor in Handler. +""" +import logging +from dataclasses import dataclass, field +from typing import Any, BinaryIO, List, Optional + +from contextifier.core.functions.file_converter import BaseFileConverter + +logger = logging.getLogger("contextify.rtf.converter") + + +@dataclass +class RTFConvertedData: + """ + RTF converted data container. + + Attributes: + content: RTF content string (after preprocessing) + encoding: Detected encoding + image_tags: List of image tags from preprocessing + original_size: Original binary data size + has_images: Whether images were extracted + """ + content: str + encoding: str = "cp949" + image_tags: List[str] = field(default_factory=list) + original_size: int = 0 + has_images: bool = False + + def __post_init__(self): + """Set has_images based on image_tags.""" + if self.image_tags: + self.has_images = True + + +class RTFFileConverter(BaseFileConverter): + """ + RTF file converter. + + RTF uses raw binary directly, so this converter just passes through. + All actual processing (image extraction, binary removal, decoding) + is done by RTFPreprocessor called from Handler. 
+ """ + + def __init__(self): + """Initialize RTFFileConverter.""" + self.logger = logger + + def convert( + self, + file_data: bytes, + file_stream: Optional[BinaryIO] = None, + **kwargs + ) -> bytes: + """ + Pass through binary data. + + RTF processing uses raw binary, so just return as-is. + + Args: + file_data: Raw binary RTF data + file_stream: Optional file stream (not used) + **kwargs: Not used + + Returns: + Original bytes (pass through) + """ + return file_data + + def get_format_name(self) -> str: + """Return format name.""" + return "RTF Document" + + def close(self, converted_object: Any) -> None: + """Nothing to close.""" + pass + + +__all__ = [ + 'RTFFileConverter', + 'RTFConvertedData', +] diff --git a/contextifier/core/processor/rtf_helper/rtf_metadata_extractor.py b/contextifier/core/processor/rtf_helper/rtf_metadata_extractor.py new file mode 100644 index 0000000..633632a --- /dev/null +++ b/contextifier/core/processor/rtf_helper/rtf_metadata_extractor.py @@ -0,0 +1,179 @@ +# contextifier/core/processor/rtf_helper/rtf_metadata_extractor.py +""" +RTF Metadata Extractor + +Extracts metadata from RTF content. +Implements BaseMetadataExtractor interface. +""" +import logging +import re +from dataclasses import dataclass +from datetime import datetime +from typing import Any, Dict, Optional, Union + +from contextifier.core.functions.metadata_extractor import ( + BaseMetadataExtractor, + DocumentMetadata, +) +from contextifier.core.processor.rtf_helper.rtf_decoder import ( + decode_hex_escapes, +) +from contextifier.core.processor.rtf_helper.rtf_text_cleaner import ( + clean_rtf_text, +) + +logger = logging.getLogger("contextify.rtf.metadata") + + +@dataclass +class RTFSourceInfo: + """ + Source information for RTF metadata extraction. + + Container for data passed to RTFMetadataExtractor.extract(). + """ + content: str + encoding: str = "cp949" + + +class RTFMetadataExtractor(BaseMetadataExtractor): + """ + RTF Metadata Extractor. 
+ + Extracts metadata from RTF content. + + Supported fields: + - title, subject, author, keywords, comments + - last_saved_by, create_time, last_saved_time + + Usage: + extractor = RTFMetadataExtractor() + source = RTFSourceInfo(content=rtf_content, encoding="cp949") + metadata = extractor.extract(source) + text = extractor.format(metadata) + """ + + def extract(self, source: Union[RTFSourceInfo, Dict[str, Any]]) -> DocumentMetadata: + """ + Extract metadata from RTF content. + + Args: + source: RTFSourceInfo object (content string and encoding) + OR Dict[str, Any] (pre-parsed metadata) + + Returns: + DocumentMetadata instance + """ + if isinstance(source, dict): + return self._from_dict(source) + + content = source.content + encoding = source.encoding + + title = None + subject = None + author = None + keywords = None + comments = None + last_saved_by = None + create_time = None + last_saved_time = None + + # Find \info group + info_match = re.search(r'\\info\s*\{([^}]*(?:\{[^}]*\}[^}]*)*)\}', content) + if info_match: + info_content = info_match.group(1) + + # Extract each metadata field + field_patterns = { + 'title': r'\\title\s*\{([^}]*)\}', + 'subject': r'\\subject\s*\{([^}]*)\}', + 'author': r'\\author\s*\{([^}]*)\}', + 'keywords': r'\\keywords\s*\{([^}]*)\}', + 'comments': r'\\doccomm\s*\{([^}]*)\}', + 'last_saved_by': r'\\operator\s*\{([^}]*)\}', + } + + for key, pattern in field_patterns.items(): + match = re.search(pattern, info_content) + if match: + value = decode_hex_escapes(match.group(1), encoding) + value = clean_rtf_text(value, encoding) + if value: + if key == 'title': + title = value + elif key == 'subject': + subject = value + elif key == 'author': + author = value + elif key == 'keywords': + keywords = value + elif key == 'comments': + comments = value + elif key == 'last_saved_by': + last_saved_by = value + + # Extract dates + create_time = self._extract_date( + content, + r'\\creatim\\yr(\d+)\\mo(\d+)\\dy(\d+)(?:\\hr(\d+))?(?:\\min(\d+))?' 
+ ) + last_saved_time = self._extract_date( + content, + r'\\revtim\\yr(\d+)\\mo(\d+)\\dy(\d+)(?:\\hr(\d+))?(?:\\min(\d+))?' + ) + + self.logger.debug("Extracted RTF metadata fields") + + return DocumentMetadata( + title=title, + subject=subject, + author=author, + keywords=keywords, + comments=comments, + last_saved_by=last_saved_by, + create_time=create_time, + last_saved_time=last_saved_time, + ) + + def _extract_date(self, content: str, pattern: str) -> Optional[datetime]: + """Extract datetime from RTF date pattern.""" + match = re.search(pattern, content) + if match: + try: + year = int(match.group(1)) + month = int(match.group(2)) + day = int(match.group(3)) + hour = int(match.group(4)) if match.group(4) else 0 + minute = int(match.group(5)) if match.group(5) else 0 + return datetime(year, month, day, hour, minute) + except (ValueError, TypeError): + pass + return None + + def _from_dict(self, metadata: Dict[str, Any]) -> DocumentMetadata: + """ + Convert pre-parsed metadata dict to DocumentMetadata. 
+ + Args: + metadata: Pre-parsed metadata dict + + Returns: + DocumentMetadata instance + """ + return DocumentMetadata( + title=metadata.get('title'), + subject=metadata.get('subject'), + author=metadata.get('author'), + keywords=metadata.get('keywords'), + comments=metadata.get('comments'), + last_saved_by=metadata.get('last_saved_by'), + create_time=metadata.get('create_time'), + last_saved_time=metadata.get('last_saved_time'), + ) + + +__all__ = [ + 'RTFMetadataExtractor', + 'RTFSourceInfo', +] diff --git a/contextifier/core/processor/rtf_helper/rtf_preprocessor.py b/contextifier/core/processor/rtf_helper/rtf_preprocessor.py new file mode 100644 index 0000000..2c7ebdf --- /dev/null +++ b/contextifier/core/processor/rtf_helper/rtf_preprocessor.py @@ -0,0 +1,426 @@ +# contextifier/core/processor/rtf_helper/rtf_preprocessor.py +""" +RTF Preprocessor + +Preprocesses RTF binary data before conversion: +- \\binN tag processing (skip N bytes of raw binary data) +- \\pict group image extraction +- Image saving and tag generation +- Encoding detection + +Implements BasePreprocessor interface. 
+""" +import hashlib +import logging +import re +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional, Set, Tuple + +from contextifier.core.functions.preprocessor import ( + BasePreprocessor, + PreprocessedData, +) +from contextifier.core.functions.img_processor import ImageProcessor +from contextifier.core.functions.storage_backend import BaseStorageBackend +from contextifier.core.processor.rtf_helper.rtf_decoder import ( + detect_encoding, +) + +logger = logging.getLogger("contextify.rtf.preprocessor") + + +# Image format magic numbers +IMAGE_SIGNATURES = { + b'\xff\xd8\xff': 'jpeg', + b'\x89PNG\r\n\x1a\n': 'png', + b'GIF87a': 'gif', + b'GIF89a': 'gif', + b'BM': 'bmp', + b'\xd7\xcd\xc6\x9a': 'wmf', + b'\x01\x00\x09\x00': 'wmf', + b'\x01\x00\x00\x00': 'emf', +} + +# RTF image type mapping +RTF_IMAGE_TYPES = { + 'jpegblip': 'jpeg', + 'pngblip': 'png', + 'wmetafile': 'wmf', + 'emfblip': 'emf', + 'dibitmap': 'bmp', + 'wbitmap': 'bmp', +} + +# Supported image formats for saving +SUPPORTED_IMAGE_FORMATS = {'jpeg', 'png', 'gif', 'bmp'} + + +@dataclass +class RTFBinaryRegion: + """RTF binary data region information.""" + start_pos: int + end_pos: int + bin_type: str # "bin" or "pict" + data_size: int + image_format: str = "" + image_data: bytes = b"" + + +class RTFPreprocessor(BasePreprocessor): + """ + RTF-specific preprocessor. 
+ + Handles RTF binary preprocessing: + - Removes \\bin tag binary data + - Extracts embedded images + - Detects encoding + - Returns clean content ready for parsing + + Usage: + preprocessor = RTFPreprocessor(image_processor=img_proc) + result = preprocessor.preprocess(rtf_bytes) + + # result.clean_content - bytes ready for parsing + # result.encoding - detected encoding + # result.extracted_resources["image_tags"] - list of image tags + """ + + RTF_MAGIC = b'{\\rtf' + + def __init__( + self, + image_processor: Optional[ImageProcessor] = None, + processed_images: Optional[Set[str]] = None, + ): + """ + Initialize RTFPreprocessor. + + Args: + image_processor: Image processor for saving images + processed_images: Set of already processed image hashes + """ + self._image_processor = image_processor + self._processed_images = processed_images if processed_images is not None else set() + + def preprocess( + self, + converted_data: Any, + **kwargs + ) -> PreprocessedData: + """ + Preprocess RTF data. + + For RTF, the converter returns raw bytes (pass-through), + so converted_data is the original RTF binary data. 
+ + Args: + converted_data: RTF binary data (bytes) from RTFFileConverter + **kwargs: Additional options + + Returns: + PreprocessedData with clean content, encoding, and image tags + """ + # Handle bytes input + if isinstance(converted_data, bytes): + file_data = converted_data + elif hasattr(converted_data, 'read'): + # Handle file-like objects + file_data = converted_data.read() + else: + return PreprocessedData( + raw_content=b"", + clean_content=b"", + encoding="cp949", + ) + + if not file_data: + return PreprocessedData( + raw_content=b"", + clean_content=b"", + encoding="cp949", + ) + + # Get options from kwargs + image_processor = kwargs.get('image_processor', self._image_processor) + processed_images = kwargs.get('processed_images', self._processed_images) + + # Detect encoding + detected_encoding = detect_encoding(file_data, "cp949") + + # Process binary data (extract images, clean content) + clean_content, image_tags = self._process_binary_content( + file_data, + image_processor, + processed_images + ) + + # Filter valid image tags + valid_tags = [ + tag for tag in image_tags + if tag and tag.strip() and '/uploads/.' not in tag + ] + + return PreprocessedData( + raw_content=file_data, + clean_content=clean_content, + encoding=detected_encoding, + extracted_resources={ + "image_tags": valid_tags, + } + ) + + def get_format_name(self) -> str: + """Return format name.""" + return "RTF Preprocessor" + + def validate(self, data: Any) -> bool: + """Validate if data is valid RTF content.""" + if isinstance(data, bytes): + if len(data) < 5: + return False + return data[:5] == self.RTF_MAGIC + return False + + def _process_binary_content( + self, + content: bytes, + image_processor: Optional[ImageProcessor], + processed_images: Set[str] + ) -> Tuple[bytes, List[str]]: + """ + Process RTF binary content. 
+ + Args: + content: RTF binary content + image_processor: Image processor instance + processed_images: Set of processed image hashes + + Returns: + Tuple of (clean_content, list of image tags) + """ + image_tags: Dict[int, str] = {} + + # Find \bin tag regions + bin_regions = self._find_bin_regions(content) + + # Find \pict regions (excluding bin regions) + pict_regions = self._find_pict_regions(content, bin_regions) + + # Merge and sort all regions + all_regions = bin_regions + pict_regions + all_regions.sort(key=lambda r: r.start_pos) + + # Process images and generate tags + for region in all_regions: + if not region.image_data: + continue + + # Check for duplicates + image_hash = hashlib.md5(region.image_data).hexdigest() + if image_hash in processed_images: + image_tags[region.start_pos] = "" + continue + + processed_images.add(image_hash) + + if region.image_format in SUPPORTED_IMAGE_FORMATS and image_processor: + tag = image_processor.save_image(region.image_data) + if tag: + image_tags[region.start_pos] = f"\n{tag}\n" + logger.info( + f"Saved RTF image: {tag} " + f"(format={region.image_format}, size={region.data_size})" + ) + else: + image_tags[region.start_pos] = "" + else: + image_tags[region.start_pos] = "" + + # Remove binary data from content + clean_content = self._remove_binary_data(content, all_regions, image_tags) + + # Collect all image tags as list + tag_list = [tag for tag in image_tags.values() if tag and tag.strip()] + + return clean_content, tag_list + + def _find_bin_regions(self, content: bytes) -> List[RTFBinaryRegion]: + """Find \\binN tags and identify binary regions.""" + regions = [] + pattern = rb'\\bin(\d+)' + + for match in re.finditer(pattern, content): + try: + bin_size = int(match.group(1)) + bin_tag_start = match.start() + bin_tag_end = match.end() + + data_start = bin_tag_end + if data_start < len(content) and content[data_start:data_start+1] == b' ': + data_start += 1 + + data_end = data_start + bin_size + + if data_end <= 
len(content): + binary_data = content[data_start:data_end] + image_format = self._detect_image_format(binary_data) + + # Find parent \shppict group + group_start = bin_tag_start + group_end = data_end + + search_start = max(0, bin_tag_start - 500) + search_area = content[search_start:bin_tag_start] + + shppict_pos = search_area.rfind(b'\\shppict') + if shppict_pos != -1: + abs_pos = search_start + shppict_pos + brace_pos = abs_pos + while brace_pos > 0 and content[brace_pos:brace_pos+1] != b'{': + brace_pos -= 1 + group_start = brace_pos + + depth = 1 + j = data_end + while j < len(content) and depth > 0: + if content[j:j+1] == b'{': + depth += 1 + elif content[j:j+1] == b'}': + depth -= 1 + j += 1 + group_end = j + + regions.append(RTFBinaryRegion( + start_pos=group_start, + end_pos=group_end, + bin_type="bin", + data_size=bin_size, + image_format=image_format, + image_data=binary_data + )) + except (ValueError, IndexError): + continue + + return regions + + def _find_pict_regions( + self, + content: bytes, + exclude_regions: List[RTFBinaryRegion] + ) -> List[RTFBinaryRegion]: + """Find hex-encoded \\pict regions.""" + regions = [] + + bin_tag_positions = {r.start_pos for r in exclude_regions if r.bin_type == "bin"} + excluded_ranges = [(r.start_pos, r.end_pos) for r in exclude_regions] + + def is_excluded(pos: int) -> bool: + return any(start <= pos < end for start, end in excluded_ranges) + + def has_bin_nearby(pict_pos: int) -> bool: + return any(pict_pos < bp < pict_pos + 200 for bp in bin_tag_positions) + + try: + text_content = content.decode('cp1252', errors='replace') + pict_pattern = r'\\pict\s*((?:\\[a-zA-Z]+\d*\s*)*)' + + for match in re.finditer(pict_pattern, text_content): + start_pos = match.start() + + if is_excluded(start_pos) or has_bin_nearby(start_pos): + continue + + attrs = match.group(1) + image_format = "" + for rtf_type, fmt in RTF_IMAGE_TYPES.items(): + if rtf_type in attrs: + image_format = fmt + break + + # Extract hex data + hex_start = 
match.end() + hex_data = [] + i = hex_start + + while i < len(text_content): + ch = text_content[i] + if ch in '0123456789abcdefABCDEF': + hex_data.append(ch) + elif ch in ' \t\r\n': + pass + elif ch == '}': + break + elif ch == '\\': + if text_content[i:i+4] == '\\bin': + hex_data = [] + break + while i < len(text_content) and text_content[i] not in ' \t\r\n}': + i += 1 + continue + else: + break + i += 1 + + hex_str = ''.join(hex_data) + + if len(hex_str) >= 32: + try: + image_data = bytes.fromhex(hex_str) + if not image_format: + image_format = self._detect_image_format(image_data) + + if image_format: + regions.append(RTFBinaryRegion( + start_pos=start_pos, + end_pos=i, + bin_type="pict", + data_size=len(image_data), + image_format=image_format, + image_data=image_data + )) + except ValueError: + continue + except Exception as e: + logger.warning(f"Error finding pict regions: {e}") + + return regions + + def _detect_image_format(self, data: bytes) -> str: + """Detect image format from binary data.""" + if not data or len(data) < 4: + return "" + + for signature, format_name in IMAGE_SIGNATURES.items(): + if data.startswith(signature): + return format_name + + if len(data) >= 2 and data[0:2] == b'\xff\xd8': + return 'jpeg' + + return "" + + def _remove_binary_data( + self, + content: bytes, + regions: List[RTFBinaryRegion], + image_tags: Dict[int, str] + ) -> bytes: + """Remove binary data regions from content.""" + if not regions: + return content + + sorted_regions = sorted(regions, key=lambda r: r.start_pos, reverse=True) + result = bytearray(content) + + for region in sorted_regions: + replacement = b'' + if region.start_pos in image_tags: + tag = image_tags[region.start_pos] + if tag: + replacement = tag.encode('ascii', errors='replace') + result[region.start_pos:region.end_pos] = replacement + + return bytes(result) + + +__all__ = ['RTFPreprocessor', 'RTFBinaryRegion'] diff --git a/contextifier/core/processor/rtf_helper/rtf_region_finder.py 
b/contextifier/core/processor/rtf_helper/rtf_region_finder.py new file mode 100644 index 0000000..b508962 --- /dev/null +++ b/contextifier/core/processor/rtf_helper/rtf_region_finder.py @@ -0,0 +1,91 @@ +# contextifier/core/processor/rtf_helper/rtf_region_finder.py +""" +RTF Region Finder + +Functions for finding excluded regions (header, footer, footnote, etc.) in RTF. +""" +import re +from typing import List, Tuple + + +def find_excluded_regions(content: str) -> List[Tuple[int, int]]: + """ + Find regions to exclude from content extraction. + + Finds header, footer, footnote, and other special regions + that should not be part of main content. + + Args: + content: RTF content string + + Returns: + List of (start, end) position tuples + """ + regions = [] + + # Header/footer patterns + patterns = [ + (r'\\header[lrf]?\b', r'\\par\s*\}'), # Headers + (r'\\footer[lrf]?\b', r'\\par\s*\}'), # Footers + (r'\\footnote\b', r'\}'), # Footnotes + (r'\\annotation\b', r'\}'), # Annotations + (r'\{\\headerf', r'\}'), # First page header + (r'\{\\footerf', r'\}'), # First page footer + ] + + for start_pattern, end_pattern in patterns: + for match in re.finditer(start_pattern, content): + start_pos = match.start() + + # Find matching closing brace + depth = 0 + i = start_pos + found_start = False + + while i < len(content): + if content[i] == '{': + if not found_start: + found_start = True + depth += 1 + elif content[i] == '}': + depth -= 1 + if found_start and depth == 0: + regions.append((start_pos, i + 1)) + break + i += 1 + + # Merge overlapping regions + if regions: + regions.sort(key=lambda x: x[0]) + merged = [regions[0]] + for start, end in regions[1:]: + if start <= merged[-1][1]: + merged[-1] = (merged[-1][0], max(merged[-1][1], end)) + else: + merged.append((start, end)) + return merged + + return regions + + +def is_in_excluded_region(position: int, regions: List[Tuple[int, int]]) -> bool: + """ + Check if a position is within an excluded region. 
+ + Args: + position: Position to check + regions: List of (start, end) tuples + + Returns: + True if position is in an excluded region + """ + for start, end in regions: + if start <= position < end: + return True + return False + + +__all__ = [ + 'find_excluded_regions', + 'is_in_excluded_region', +] diff --git a/contextifier/core/processor/rtf_helper/rtf_table_extractor.py b/contextifier/core/processor/rtf_helper/rtf_table_extractor.py new file mode 100644 index 0000000..51f4e61 --- /dev/null +++ b/contextifier/core/processor/rtf_helper/rtf_table_extractor.py @@ -0,0 +1,482 @@ +# contextifier/core/processor/rtf_helper/rtf_table_extractor.py +""" +RTF Table Extractor + +Extracts and parses tables from RTF content. +Includes RTFCellInfo and RTFTable data models. +""" +import logging +import re +from dataclasses import dataclass, field +from typing import List, NamedTuple, Optional, Tuple + +from contextifier.core.processor.rtf_helper.rtf_decoder import ( + decode_hex_escapes, +) +from contextifier.core.processor.rtf_helper.rtf_text_cleaner import ( + clean_rtf_text, +) +from contextifier.core.processor.rtf_helper.rtf_region_finder import ( + find_excluded_regions, + is_in_excluded_region, +) + +logger = logging.getLogger("contextify.rtf.table") + + +# ============================================================================= +# Data Models +# ============================================================================= + +class RTFCellInfo(NamedTuple): + """RTF cell information with merge info.""" + text: str # Cell text content + h_merge_first: bool # Horizontal merge start (clmgf) + h_merge_cont: bool # Horizontal merge continue (clmrg) + v_merge_first: bool # Vertical merge start (clvmgf) + v_merge_cont: bool # Vertical merge continue (clvmrg) + right_boundary: int # Cell right boundary (twips) + + +@dataclass +class RTFTable: + """RTF table structure with merge cell support.""" + rows: List[List[RTFCellInfo]] = field(default_factory=list) + col_count: int = 
0 + position: int = 0 # Start position in document + end_position: int = 0 # End position in document + + def is_real_table(self) -> bool: + """ + Determine if this is a real table. + + n rows x 1 column is considered a list, not a table. + """ + if not self.rows: + return False + + effective_cols = self._get_effective_col_count() + return effective_cols >= 2 + + def _get_effective_col_count(self) -> int: + """Calculate effective column count (excluding empty columns).""" + if not self.rows: + return 0 + + effective_counts = [] + for row in self.rows: + non_empty_cells = [] + for i, cell in enumerate(row): + if cell.h_merge_cont: + continue + if cell.text.strip() or cell.v_merge_first: + non_empty_cells.append(i) + + if non_empty_cells: + effective_counts.append(max(non_empty_cells) + 1) + + return max(effective_counts) if effective_counts else 0 + + def to_html(self) -> str: + """Convert table to HTML with merge cell support.""" + if not self.rows: + return "" + + merge_info = self._calculate_merge_info() + html_parts = [''] + + for row_idx, row in enumerate(self.rows): + html_parts.append('') + + for col_idx, cell in enumerate(row): + if col_idx < len(merge_info[row_idx]): + colspan, rowspan = merge_info[row_idx][col_idx] + + if colspan == 0 or rowspan == 0: + continue + + cell_text = re.sub(r'\s+', ' ', cell.text).strip() + + attrs = [] + if colspan > 1: + attrs.append(f'colspan="{colspan}"') + if rowspan > 1: + attrs.append(f'rowspan="{rowspan}"') + + attr_str = ' ' + ' '.join(attrs) if attrs else '' + html_parts.append(f'{cell_text}') + else: + cell_text = re.sub(r'\s+', ' ', cell.text).strip() + html_parts.append(f'') + + html_parts.append('') + + html_parts.append('
{cell_text}
') + return '\n'.join(html_parts) + + def to_text_list(self) -> str: + """ + Convert 1-column table to text list. + + - 1x1 table: Return cell content only (container table) + - nx1 table: Return rows separated by blank lines + """ + if not self.rows: + return "" + + if len(self.rows) == 1 and len(self.rows[0]) == 1: + return self.rows[0][0].text + + lines = [] + for row in self.rows: + if row: + cell_text = row[0].text + if cell_text: + lines.append(cell_text) + + return '\n\n'.join(lines) + + def _calculate_merge_info(self) -> List[List[tuple]]: + """Calculate colspan and rowspan for each cell.""" + if not self.rows: + return [] + + num_rows = len(self.rows) + max_cols = max(len(row) for row in self.rows) if self.rows else 0 + + if max_cols == 0: + return [] + + # Initialize with (1, 1) for all cells + merge_info = [[(1, 1) for _ in range(max_cols)] for _ in range(num_rows)] + + # Process horizontal merges + for row_idx, row in enumerate(self.rows): + col_idx = 0 + while col_idx < len(row): + cell = row[col_idx] + + if cell.h_merge_first: + colspan = 1 + for next_col in range(col_idx + 1, len(row)): + if row[next_col].h_merge_cont: + colspan += 1 + merge_info[row_idx][next_col] = (0, 0) + else: + break + merge_info[row_idx][col_idx] = (colspan, 1) + + col_idx += 1 + + # Process vertical merges + for col_idx in range(max_cols): + row_idx = 0 + while row_idx < num_rows: + if col_idx >= len(self.rows[row_idx]): + row_idx += 1 + continue + + cell = self.rows[row_idx][col_idx] + + if cell.v_merge_first: + rowspan = 1 + for next_row in range(row_idx + 1, num_rows): + if col_idx < len(self.rows[next_row]) and self.rows[next_row][col_idx].v_merge_cont: + rowspan += 1 + merge_info[next_row][col_idx] = (0, 0) + else: + break + + current_colspan = merge_info[row_idx][col_idx][0] + merge_info[row_idx][col_idx] = (current_colspan, rowspan) + row_idx += rowspan + elif cell.v_merge_cont: + merge_info[row_idx][col_idx] = (0, 0) + row_idx += 1 + else: + row_idx += 1 + + return 
merge_info + + +# ============================================================================= +# Table Extraction Functions +# ============================================================================= + +def extract_tables_with_positions( + content: str, + encoding: str = "cp949" +) -> Tuple[List[RTFTable], List[Tuple[int, int, RTFTable]]]: + """ + Extract tables from RTF content with position information. + + RTF table structure: + - \\trowd: Table row start (row definition) + - \\cellxN: Cell boundary position + - \\clmgf: Horizontal merge start + - \\clmrg: Horizontal merge continue + - \\clvmgf: Vertical merge start + - \\clvmrg: Vertical merge continue + - \\intbl: Paragraph in cell + - \\cell: Cell end + - \\row: Row end + + Args: + content: RTF string content + encoding: Encoding to use + + Returns: + Tuple of (table list, table region list [(start, end, table), ...]) + """ + tables = [] + table_regions = [] + + # Find excluded regions (header, footer, footnote, etc.) + excluded_regions = find_excluded_regions(content) + + # Step 1: Find all \row positions + row_positions = [] + for match in re.finditer(r'\\row(?![a-z])', content): + row_positions.append(match.end()) + + if not row_positions: + return tables, table_regions + + # Step 2: Find \trowd before each \row + all_rows = [] + for i, row_end in enumerate(row_positions): + if i == 0: + search_start = 0 + else: + search_start = row_positions[i - 1] + + segment = content[search_start:row_end] + trowd_match = re.search(r'\\trowd', segment) + + if trowd_match: + row_start = search_start + trowd_match.start() + + # Skip rows in excluded regions + if is_in_excluded_region(row_start, excluded_regions): + logger.debug(f"Skipping table row at {row_start} (in header/footer/footnote)") + continue + + row_text = content[row_start:row_end] + all_rows.append((row_start, row_end, row_text)) + + if not all_rows: + return tables, table_regions + + # Group consecutive rows into tables + table_groups = [] + 
current_table = [] + current_start = -1 + current_end = -1 + prev_end = -1 + + for row_start, row_end, row_text in all_rows: + # Rows within 150 chars are same table + if prev_end == -1 or row_start - prev_end < 150: + if current_start == -1: + current_start = row_start + current_table.append(row_text) + current_end = row_end + else: + if current_table: + table_groups.append((current_start, current_end, current_table)) + current_table = [row_text] + current_start = row_start + current_end = row_end + prev_end = row_end + + if current_table: + table_groups.append((current_start, current_end, current_table)) + + logger.info(f"Found {len(table_groups)} table groups") + + # Parse each table group + for start_pos, end_pos, table_rows in table_groups: + table = _parse_table_with_merge(table_rows, encoding) + if table and table.rows: + table.position = start_pos + table.end_position = end_pos + tables.append(table) + table_regions.append((start_pos, end_pos, table)) + + logger.info(f"Extracted {len(tables)} tables") + return tables, table_regions + + +def _parse_table_with_merge(rows: List[str], encoding: str = "cp949") -> Optional[RTFTable]: + """ + Parse table rows to RTFTable object with merge support. + + Args: + rows: Table row text list + encoding: Encoding to use + + Returns: + RTFTable object + """ + table = RTFTable() + + for row_text in rows: + cells = _extract_cells_with_merge(row_text, encoding) + if cells: + table.rows.append(cells) + if len(cells) > table.col_count: + table.col_count = len(cells) + + return table if table.rows else None + + +def _extract_cells_with_merge(row_text: str, encoding: str = "cp949") -> List[RTFCellInfo]: + """ + Extract cell content and merge information from table row. 
+ + Args: + row_text: Table row RTF text + encoding: Encoding to use + + Returns: + List of RTFCellInfo + """ + cells = [] + + # Step 1: Parse cell definitions (attributes before cellx) + cell_defs = [] + + # Find first \cell that is not \cellx + first_cell_idx = -1 + pos = 0 + while True: + idx = row_text.find('\\cell', pos) + if idx == -1: + first_cell_idx = len(row_text) + break + if idx + 5 < len(row_text) and row_text[idx + 5] == 'x': + pos = idx + 1 + continue + first_cell_idx = idx + break + + def_part = row_text[:first_cell_idx] + + current_def = { + 'h_merge_first': False, + 'h_merge_cont': False, + 'v_merge_first': False, + 'v_merge_cont': False, + 'right_boundary': 0 + } + + cell_def_pattern = r'\\cl(?:mgf|mrg|vmgf|vmrg)|\\cellx(-?\d+)' + + for match in re.finditer(cell_def_pattern, def_part): + token = match.group() + if token == '\\clmgf': + current_def['h_merge_first'] = True + elif token == '\\clmrg': + current_def['h_merge_cont'] = True + elif token == '\\clvmgf': + current_def['v_merge_first'] = True + elif token == '\\clvmrg': + current_def['v_merge_cont'] = True + elif token.startswith('\\cellx'): + if match.group(1): + current_def['right_boundary'] = int(match.group(1)) + cell_defs.append(current_def.copy()) + current_def = { + 'h_merge_first': False, + 'h_merge_cont': False, + 'v_merge_first': False, + 'v_merge_cont': False, + 'right_boundary': 0 + } + + # Step 2: Extract cell texts + cell_texts = _extract_cell_texts(row_text, encoding) + + # Step 3: Match cell definitions with content + for i, cell_text in enumerate(cell_texts): + if i < len(cell_defs): + cell_def = cell_defs[i] + else: + cell_def = { + 'h_merge_first': False, + 'h_merge_cont': False, + 'v_merge_first': False, + 'v_merge_cont': False, + 'right_boundary': 0 + } + + cells.append(RTFCellInfo( + text=cell_text, + h_merge_first=cell_def['h_merge_first'], + h_merge_cont=cell_def['h_merge_cont'], + v_merge_first=cell_def['v_merge_first'], + v_merge_cont=cell_def['v_merge_cont'], + 
right_boundary=cell_def['right_boundary'] + )) + + return cells + + +def _extract_cell_texts(row_text: str, encoding: str = "cp949") -> List[str]: + """ + Extract cell texts from row. + + Args: + row_text: Table row RTF text + encoding: Encoding to use + + Returns: + List of cell texts + """ + cell_texts = [] + + # Step 1: Find all \cell positions (not \cellx) + cell_positions = [] + pos = 0 + while True: + idx = row_text.find('\\cell', pos) + if idx == -1: + break + next_pos = idx + 5 + if next_pos < len(row_text) and row_text[next_pos] == 'x': + pos = idx + 1 + continue + cell_positions.append(idx) + pos = idx + 1 + + if not cell_positions: + return cell_texts + + # Step 2: Find last \cellx before first \cell + first_cell_pos = cell_positions[0] + def_part = row_text[:first_cell_pos] + + last_cellx_end = 0 + for match in re.finditer(r'\\cellx-?\d+', def_part): + last_cellx_end = match.end() + + # Step 3: Extract each cell content + prev_end = last_cellx_end + for cell_end in cell_positions: + cell_content = row_text[prev_end:cell_end] + + # RTF decoding and cleaning + decoded = decode_hex_escapes(cell_content, encoding) + clean = clean_rtf_text(decoded, encoding) + cell_texts.append(clean) + + prev_end = cell_end + 5 # len('\\cell') = 5 + + return cell_texts + + +__all__ = [ + 'RTFCellInfo', + 'RTFTable', + 'extract_tables_with_positions', +] diff --git a/contextifier/core/processor/doc_helpers/rtf_text_cleaner.py b/contextifier/core/processor/rtf_helper/rtf_text_cleaner.py similarity index 66% rename from contextifier/core/processor/doc_helpers/rtf_text_cleaner.py rename to contextifier/core/processor/rtf_helper/rtf_text_cleaner.py index ebeb61d..5aff9c4 100644 --- a/contextifier/core/processor/doc_helpers/rtf_text_cleaner.py +++ b/contextifier/core/processor/rtf_helper/rtf_text_cleaner.py @@ -1,63 +1,60 @@ -# service/document_processor/processor/doc_helpers/rtf_text_cleaner.py +# contextifier/core/processor/rtf_helper/rtf_text_cleaner.py """ -RTF 텍스트 정리 유틸리티 
+RTF Text Cleaner -RTF 제어 코드 제거 및 텍스트 정리 관련 함수들을 제공합니다. +Functions for removing RTF control codes and cleaning text. """ import re from typing import List -from contextifier.core.processor.doc_helpers.rtf_constants import ( +from contextifier.core.processor.rtf_helper.rtf_constants import ( SHAPE_PROPERTY_NAMES, + SKIP_DESTINATIONS, + IMAGE_DESTINATIONS, ) -from contextifier.core.processor.doc_helpers.rtf_decoder import ( +from contextifier.core.processor.rtf_helper.rtf_decoder import ( decode_bytes, ) def clean_rtf_text(text: str, encoding: str = "cp949") -> str: """ - RTF 제어 코드를 안전하게 제거하고 순수 텍스트만 추출합니다. - - 토큰 기반 파싱으로 내용 유실을 방지합니다. - + Remove RTF control codes and extract pure text. + + Uses token-based parsing to prevent content loss. + Args: - text: RTF 텍스트 - encoding: 사용할 인코딩 - + text: RTF text + encoding: Encoding for decoding + Returns: - 정리된 텍스트 + Cleaned text """ if not text: return "" - - # 전처리: 이미지 태그 보호 (임시 마커로 치환) + + # Protect image tags (replace with temporary markers) image_tags = [] def save_image_tag(m): image_tags.append(m.group()) return f'\x00IMG{len(image_tags)-1}\x00' - + text = re.sub(r'\[image:[^\]]+\]', save_image_tag, text) - - # 전처리: Shape 속성 제거 ({\sp{\sn name}{\sv value}} 형식) + + # Remove shape properties text = re.sub(r'\{\\sp\{\\sn\s*\w+\}\{\\sv\s*[^}]*\}\}', '', text) - - # Shape 속성이 직접 출력된 경우도 제거 (shapeType202fFlipH0... 
형태) text = re.sub(r'shapeType\d+[a-zA-Z0-9]+(?:posrelh\d+posrelv\d+)?', '', text) - - # \shp 관련 제어 워드 제거 text = re.sub(r'\\shp(?:inst|txt|left|right|top|bottom|bx\w+|by\w+|wr\d+|fblwtxt\d+|z\d+|lid\d+)\b\d*', '', text) - + result = [] i = 0 n = len(text) - + while i < n: ch = text[i] - - # 이미지 태그 마커 복원 + + # Restore image tag markers if ch == '\x00' and i + 3 < n and text[i+1:i+4] == 'IMG': - # \x00IMGn\x00 패턴 찾기 end_idx = text.find('\x00', i + 4) if end_idx != -1: try: @@ -67,13 +64,12 @@ def save_image_tag(m): continue except (ValueError, IndexError): pass - + if ch == '\\': - # 제어 워드 또는 제어 기호 if i + 1 < n: next_ch = text[i + 1] - - # 특수 이스케이프 처리 + + # Special escapes if next_ch == '\\': result.append('\\') i += 2 @@ -99,12 +95,11 @@ def save_image_tag(m): i += 2 continue elif next_ch == "'": - # hex escape \'XX + # Hex escape \'XX if i + 3 < n: try: hex_val = text[i+2:i+4] byte_val = int(hex_val, 16) - # 단일 바이트 디코딩 시도 try: result.append(bytes([byte_val]).decode(encoding)) except: @@ -119,33 +114,32 @@ def save_image_tag(m): i += 1 continue elif next_ch == '*': - # \* - destination 마커, 건너뛰기 + # \* destination marker, skip i += 2 continue elif next_ch.isalpha(): - # 제어 워드: \word[N][delimiter] + # Control word: \word[N][delimiter] j = i + 1 while j < n and text[j].isalpha(): j += 1 - + control_word = text[i+1:j] - - # 숫자 파라미터 스킵 + + # Skip numeric parameter while j < n and (text[j].isdigit() or text[j] == '-'): j += 1 - - # 구분자 처리 (공백은 제어 워드의 일부) + + # Handle delimiter (space is part of control word) if j < n and text[j] == ' ': j += 1 - - # 특별 처리가 필요한 제어 워드 + + # Special control words if control_word in ('par', 'line'): result.append('\n') elif control_word == 'tab': result.append('\t') elif control_word == 'u': - # 유니코드: \uN? - # 이미 파라미터를 스킵했으므로 다시 파싱 + # Unicode: \uN? 
um = re.match(r'\\u(-?\d+)\??', text[i:]) if um: try: @@ -156,65 +150,55 @@ def save_image_tag(m): except: pass j = i + um.end() - # 다른 제어 워드는 무시 - + i = j continue - + i += 1 elif ch == '{' or ch == '}': - # 중괄호는 건너뛰기 i += 1 elif ch == '\r' or ch == '\n': - # RTF에서 줄바꿈 문자는 무시 (\par가 실제 줄바꿈) i += 1 else: - # 일반 텍스트 result.append(ch) i += 1 - - # 최종 정리 + text_result = ''.join(result) - - # Shape 속성 이름 제거 + + # Remove shape property names shape_name_pattern = r'\b(' + '|'.join(SHAPE_PROPERTY_NAMES) + r')\b' text_result = re.sub(shape_name_pattern, '', text_result) - - # 숫자만 있는 쓰레기 제거 (예: -231, -1, -5 등) + + # Remove garbage numbers text_result = re.sub(r'\s*-\d+\s*', ' ', text_result) - - # Binary/Hex 데이터 제거 + + # Remove hex data outside image tags text_result = _remove_hex_outside_image_tags(text_result) - - # 여러 공백을 하나로 + + # Normalize whitespace text_result = re.sub(r'\s+', ' ', text_result) - + return text_result.strip() def _remove_hex_outside_image_tags(text: str) -> str: - """이미지 태그 외부의 긴 hex 문자열만 제거""" - # 이미지 태그 위치 찾기 + """Remove long hex strings outside image tags.""" protected_ranges = [] for m in re.finditer(r'\[image:[^\]]+\]', text): protected_ranges.append((m.start(), m.end())) - + if not protected_ranges: - # 이미지 태그가 없으면 그냥 제거 return re.sub(r'(? str: def remove_destination_groups(content: str) -> str: - r""" - RTF destination 그룹 {\*\destination...}을 제거합니다. - - 문서 끝에 나타나는 themedata, colorschememapping, latentstyles, datastore 등을 - 제거하여 메타데이터가 텍스트로 추출되는 것을 방지합니다. - + """ + Remove RTF destination groups {\\*\\destination...}. + + Removes themedata, colorschememapping, latentstyles, datastore, etc. + to prevent metadata from being extracted as text. 
+ Args: - content: RTF 콘텐츠 - + content: RTF content + Returns: - destination 그룹이 제거된 콘텐츠 + Content with destination groups removed """ - from contextifier.core.processor.doc_helpers.rtf_constants import ( - SKIP_DESTINATIONS, - IMAGE_DESTINATIONS, - ) - result = [] i = 0 n = len(content) - + while i < n: - # {\* 패턴 감지 if content[i:i+3] == '{\\*': - # destination 이름 추출 j = i + 3 while j < n and content[j] in ' \t\r\n': j += 1 - + if j < n and content[j] == '\\': - # 제어 워드 추출 k = j + 1 while k < n and content[k].isalpha(): k += 1 ctrl_word = content[j+1:k] - + if ctrl_word in SKIP_DESTINATIONS: - # 이 그룹 전체를 건너뛰기 depth = 1 - i += 1 # '{' 다음으로 + i += 1 while i < n and depth > 0: if content[i] == '{': depth += 1 @@ -269,70 +244,62 @@ def remove_destination_groups(content: str) -> str: depth -= 1 i += 1 continue - + if ctrl_word in IMAGE_DESTINATIONS: - # 이미지 태그는 보존하면서 그룹 제거 depth = 1 group_start = i - i += 1 # '{' 다음으로 + i += 1 while i < n and depth > 0: if content[i] == '{': depth += 1 elif content[i] == '}': depth -= 1 i += 1 - - # 그룹 내에서 유효한 이미지 태그만 추출 + group_content = content[group_start:i] image_tag_match = re.search(r'\[image:[^\]]+\]', group_content) if image_tag_match: tag = image_tag_match.group() - # 유효한 태그인지 확인 if '/uploads/.' not in tag and 'uploads/.' not in tag: result.append(tag) continue - + result.append(content[i]) i += 1 - + return ''.join(result) def remove_shape_groups(content: str) -> str: """ - Shape 그룹을 제거하되, shptxt 내의 텍스트는 보존합니다. - - RTF Shape 구조: - {\\shp{\\*\\shpinst...{\\sp{\\sn xxx}{\\sv yyy}}...{\\shptxt 실제텍스트}}} - + Remove shape groups but preserve text in shptxt. 
+ + RTF Shape structure: + {\\shp{\\*\\shpinst...{\\sp{\\sn xxx}{\\sv yyy}}...{\\shptxt actual_text}}} + Args: - content: RTF 콘텐츠 - + content: RTF content + Returns: - Shape 그룹이 정리된 콘텐츠 + Content with shape groups cleaned """ result = [] i = 0 - + while i < len(content): - # \shp 시작 감지 if content[i:i+5] == '{\\shp' or content[i:i+10] == '{\\*\\shpinst': - # Shape 그룹 시작 - # shptxt 내용만 추출하고 나머지는 건너뛰기 depth = 1 - start = i i += 1 shptxt_content = [] in_shptxt = False shptxt_depth = 0 - + while i < len(content) and depth > 0: if content[i] == '{': - # \shptxt 시작 확인 if content[i:i+8] == '{\\shptxt': in_shptxt = True shptxt_depth = depth + 1 - i += 8 # '{\\shptxt' 건너뛰기 + i += 8 continue depth += 1 elif content[i] == '}': @@ -342,78 +309,63 @@ def remove_shape_groups(content: str) -> str: elif in_shptxt: shptxt_content.append(content[i]) i += 1 - - # shptxt 내용이 있으면 추가 + if shptxt_content: - shptxt_text = ''.join(shptxt_content) - result.append(shptxt_text) + result.append(''.join(shptxt_content)) else: result.append(content[i]) i += 1 - + return ''.join(result) def remove_shape_property_groups(content: str) -> str: """ - Shape 속성 그룹 {\\sp{\\sn xxx}{\\sv yyy}}를 제거합니다. - + Remove shape property groups {\\sp{\\sn xxx}{\\sv yyy}}. + Args: - content: RTF 콘텐츠 - + content: RTF content + Returns: - Shape 속성이 제거된 콘텐츠 + Content with shape properties removed """ - # {\\sp{\\sn ...}{\\sv ...}} 패턴 제거 content = re.sub(r'\{\\sp\{\\sn\s*[^}]*\}\{\\sv\s*[^}]*\}\}', '', content) - - # 개별 {\\sp ...} 패턴도 제거 content = re.sub(r'\{\\sp\s*[^}]*\}', '', content) - - # {\\sn ...} 패턴 제거 content = re.sub(r'\{\\sn\s*[^}]*\}', '', content) - - # {\\sv ...} 패턴 제거 content = re.sub(r'\{\\sv\s*[^}]*\}', '', content) - return content def remove_shprslt_blocks(content: str) -> str: - r""" - \shprslt{...} 블록을 제거합니다. - - Word는 Shape (도형/테이블)를 \shp 블록으로 저장하고, - 이전 버전 호환성을 위해 \shprslt 블록에 동일한 내용을 중복 저장합니다. - + """ + Remove \\shprslt{...} blocks. 
+ + Word saves Shape (drawing/table) in \\shp block and duplicates + the same content in \\shprslt block for backward compatibility. + Args: - content: RTF 콘텐츠 - + content: RTF content + Returns: - \shprslt 블록이 제거된 콘텐츠 + Content with \\shprslt blocks removed """ result = [] i = 0 pattern = '\\shprslt' - + while i < len(content): - # \shprslt 찾기 idx = content.find(pattern, i) if idx == -1: result.append(content[i:]) break - - # \shprslt 전까지 추가 + result.append(content[i:idx]) - - # \shprslt{ 다음의 중괄호 블록 건너뛰기 + brace_start = content.find('{', idx) if brace_start == -1: - # 중괄호가 없으면 \shprslt만 건너뛰기 i = idx + len(pattern) continue - - # 매칭되는 닫는 중괄호 찾기 + depth = 1 j = brace_start + 1 while j < len(content) and depth > 0: @@ -422,7 +374,16 @@ def remove_shprslt_blocks(content: str) -> str: elif content[j] == '}': depth -= 1 j += 1 - + i = j - + return ''.join(result) + + +__all__ = [ + 'clean_rtf_text', + 'remove_destination_groups', + 'remove_shape_groups', + 'remove_shape_property_groups', + 'remove_shprslt_blocks', +] diff --git a/contextifier/core/processor/text_handler.py b/contextifier/core/processor/text_handler.py index 28e2e27..0393c0a 100644 --- a/contextifier/core/processor/text_handler.py +++ b/contextifier/core/processor/text_handler.py @@ -10,6 +10,8 @@ from contextifier.core.processor.base_handler import BaseHandler from contextifier.core.functions.utils import clean_text, clean_code_text from contextifier.core.functions.chart_extractor import BaseChartExtractor, NullChartExtractor +from contextifier.core.processor.text_helper.text_image_processor import TextImageProcessor +from contextifier.core.functions.img_processor import ImageProcessor if TYPE_CHECKING: from contextifier.core.document_processor import CurrentFile @@ -22,11 +24,29 @@ class TextHandler(BaseHandler): """Text File Processing Handler Class""" - + + def _create_file_converter(self): + """Create text-specific file converter.""" + from contextifier.core.processor.text_helper.text_file_converter 
import TextFileConverter + return TextFileConverter() + + def _create_preprocessor(self): + """Create text-specific preprocessor.""" + from contextifier.core.processor.text_helper.text_preprocessor import TextPreprocessor + return TextPreprocessor() + def _create_chart_extractor(self) -> BaseChartExtractor: """Text files do not contain charts. Return NullChartExtractor.""" return NullChartExtractor(self._chart_processor) - + + def _create_metadata_extractor(self): + """Text files do not have embedded metadata. Return None (uses NullMetadataExtractor).""" + return None + + def _create_format_image_processor(self) -> ImageProcessor: + """Create text-specific image processor.""" + return TextImageProcessor() + def extract_text( self, current_file: "CurrentFile", @@ -38,7 +58,7 @@ def extract_text( ) -> str: """ Extract text from text file. - + Args: current_file: CurrentFile dict containing file info and binary data extract_metadata: Whether to extract metadata (ignored for text files) @@ -46,14 +66,19 @@ def extract_text( encodings: List of encodings to try is_code: Whether this is a code file **kwargs: Additional options - + Returns: Extracted text """ file_path = current_file.get("file_path", "unknown") file_data = current_file.get("file_data", b"") enc = encodings or DEFAULT_ENCODINGS - + + # Step 1: No file_converter for text files (direct decode) + # Step 2: Preprocess - clean_content is the TRUE SOURCE + preprocessed = self.preprocess(file_data) + file_data = preprocessed.clean_content # TRUE SOURCE + for e in enc: try: text = file_data.decode(e) @@ -65,5 +90,5 @@ def extract_text( except Exception as ex: self.logger.error(f"Error decoding file {file_path} with {e}: {ex}") continue - + raise Exception(f"Could not decode file {file_path} with any supported encoding") diff --git a/contextifier/core/processor/text_helper/__init__.py b/contextifier/core/processor/text_helper/__init__.py new file mode 100644 index 0000000..f0723e1 --- /dev/null +++ 
b/contextifier/core/processor/text_helper/__init__.py @@ -0,0 +1,17 @@ +# contextifier/core/processor/text_helper/__init__.py +""" +Text Helper 모듈 + +텍스트 파일 처리에 필요한 유틸리티를 제공합니다. + +모듈 구성: +- text_image_processor: 텍스트 파일용 이미지 프로세서 +""" + +from contextifier.core.processor.text_helper.text_image_processor import ( + TextImageProcessor, +) + +__all__ = [ + "TextImageProcessor", +] diff --git a/contextifier/core/processor/text_helper/text_file_converter.py b/contextifier/core/processor/text_helper/text_file_converter.py new file mode 100644 index 0000000..165c133 --- /dev/null +++ b/contextifier/core/processor/text_helper/text_file_converter.py @@ -0,0 +1,27 @@ +# libs/core/processor/text_helper/text_file_converter.py +""" +TextFileConverter - Text file format converter + +Converts binary text data to string with encoding detection. +""" +from typing import Optional, BinaryIO + +from contextifier.core.functions.file_converter import TextFileConverter as BaseTextFileConverter + + +class TextFileConverter(BaseTextFileConverter): + """ + Text file converter. + + Converts binary text data to decoded string. + Inherits from base TextFileConverter. + """ + + def __init__(self): + """Initialize with common text encodings.""" + super().__init__(encodings=['utf-8', 'utf-8-sig', 'cp949', 'euc-kr', 'latin-1', 'ascii']) + + def get_format_name(self) -> str: + """Return format name.""" + enc = self._detected_encoding or 'unknown' + return f"Text File ({enc})" diff --git a/contextifier/core/processor/text_helper/text_image_processor.py b/contextifier/core/processor/text_helper/text_image_processor.py new file mode 100644 index 0000000..e6498d3 --- /dev/null +++ b/contextifier/core/processor/text_helper/text_image_processor.py @@ -0,0 +1,75 @@ +# contextifier/core/processor/text_helper/text_image_processor.py +""" +Text Image Processor + +Provides text-specific image processing that inherits from ImageProcessor. 
+Text files do not contain embedded images, so this is a minimal implementation. +""" +import logging +from typing import Any, Optional + +from contextifier.core.functions.img_processor import ImageProcessor +from contextifier.core.functions.storage_backend import BaseStorageBackend + +logger = logging.getLogger("contextify.image_processor.text") + + +class TextImageProcessor(ImageProcessor): + """ + Text-specific image processor. + + Inherits from ImageProcessor and provides text-specific processing. + Text files do not contain embedded images, so this processor + provides a consistent interface without additional functionality. + + This class exists to maintain interface consistency across all handlers. + + Example: + processor = TextImageProcessor() + + # No images in text files, but interface is consistent + tag = processor.process_image(image_data) # Falls back to base implementation + """ + + def __init__( + self, + directory_path: str = "temp/images", + tag_prefix: str = "[Image:", + tag_suffix: str = "]", + storage_backend: Optional[BaseStorageBackend] = None, + ): + """ + Initialize TextImageProcessor. + + Args: + directory_path: Image save directory + tag_prefix: Tag prefix for image references + tag_suffix: Tag suffix for image references + storage_backend: Storage backend for saving images + """ + super().__init__( + directory_path=directory_path, + tag_prefix=tag_prefix, + tag_suffix=tag_suffix, + storage_backend=storage_backend, + ) + + def process_image( + self, + image_data: bytes, + **kwargs + ) -> Optional[str]: + """ + Process and save image data. + + Text files do not contain embedded images, so this method + delegates to the base implementation. 
+ + Args: + image_data: Raw image binary data + **kwargs: Additional options + + Returns: + Image tag string or None if processing failed + """ + return super().process_image(image_data, **kwargs) diff --git a/contextifier/core/processor/text_helper/text_preprocessor.py b/contextifier/core/processor/text_helper/text_preprocessor.py new file mode 100644 index 0000000..961077f --- /dev/null +++ b/contextifier/core/processor/text_helper/text_preprocessor.py @@ -0,0 +1,82 @@ +# contextifier/core/processor/text_helper/text_preprocessor.py +""" +Text Preprocessor - Process text content after conversion. + +Processing Pipeline Position: + 1. TextFileConverter.convert() → str + 2. TextPreprocessor.preprocess() → PreprocessedData (THIS STEP) + 3. TextMetadataExtractor.extract() → DocumentMetadata (if any) + 4. Content extraction + +Current Implementation: + - Pass-through (Text uses decoded string content directly) +""" +import logging +from typing import Any, Dict + +from contextifier.core.functions.preprocessor import ( + BasePreprocessor, + PreprocessedData, +) + +logger = logging.getLogger("contextify.text.preprocessor") + + +class TextPreprocessor(BasePreprocessor): + """ + Text Content Preprocessor. + + Currently a pass-through implementation as text processing + is straightforward. + """ + + def preprocess( + self, + converted_data: Any, + **kwargs + ) -> PreprocessedData: + """ + Preprocess the converted text content. 
+ + Args: + converted_data: Text string from TextFileConverter + **kwargs: Additional options + + Returns: + PreprocessedData with the content + """ + metadata: Dict[str, Any] = {} + + content = "" + encoding = kwargs.get("encoding", "utf-8") + + if isinstance(converted_data, str): + content = converted_data + metadata['char_count'] = len(content) + metadata['line_count'] = len(content.split('\n')) + elif isinstance(converted_data, bytes): + content = converted_data.decode(encoding, errors='replace') + metadata['char_count'] = len(content) + metadata['line_count'] = len(content.split('\n')) + + logger.debug("Text preprocessor: pass-through, metadata=%s", metadata) + + # clean_content is the TRUE SOURCE - contains the processed text/bytes + return PreprocessedData( + raw_content=converted_data, + clean_content=converted_data, # TRUE SOURCE - bytes or str + encoding=encoding, + extracted_resources={}, + metadata=metadata, + ) + + def get_format_name(self) -> str: + """Return format name.""" + return "Text Preprocessor" + + def validate(self, data: Any) -> bool: + """Validate if data is text content.""" + return isinstance(data, (str, bytes)) + + +__all__ = ['TextPreprocessor'] diff --git a/pyproject.toml b/pyproject.toml index 568e3ea..81698c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "contextifier" -version = "0.1.6" +version = "0.2.0" description = "Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking" readme = "README.md" requires-python = ">=3.12" @@ -75,7 +75,6 @@ dependencies = [ "pdf2image==1.17.0", "pytesseract==0.3.13", "striprtf==0.0.29", - "matplotlib==3.10.8", "cachetools==6.2.4", ] diff --git a/uv.lock b/uv.lock index a766275..1d59561 100644 --- a/uv.lock +++ b/uv.lock @@ -373,7 +373,7 @@ wheels = [ [[package]] name = "contextifier" -version = "0.1.5" +version = "0.2.0" source = { editable = "." 
} dependencies = [ { name = "beautifulsoup4" }, @@ -390,7 +390,6 @@ dependencies = [ { name = "langchain-text-splitters" }, { name = "langgraph" }, { name = "langsmith" }, - { name = "matplotlib" }, { name = "olefile" }, { name = "openpyxl" }, { name = "orjson" }, @@ -430,7 +429,6 @@ requires-dist = [ { name = "langchain-text-splitters", specifier = "==1.1.0" }, { name = "langgraph", specifier = "==1.0.5" }, { name = "langsmith", specifier = "==0.6.2" }, - { name = "matplotlib", specifier = "==3.10.8" }, { name = "olefile", specifier = "==0.47" }, { name = "openpyxl", specifier = "==3.1.5" }, { name = "orjson", specifier = "==3.11.5" }, @@ -454,72 +452,6 @@ requires-dist = [ { name = "xlrd", specifier = "==2.0.2" }, ] -[[package]] -name = "contourpy" -version = "1.3.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/58/01/1253e6698a07380cd31a736d248a3f2a50a7c88779a1813da27503cadc2a/contourpy-1.3.3.tar.gz", hash = "sha256:083e12155b210502d0bca491432bb04d56dc3432f95a979b429f2848c3dbe880", size = 13466174, upload-time = "2025-07-26T12:03:12.549Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/be/45/adfee365d9ea3d853550b2e735f9d66366701c65db7855cd07621732ccfc/contourpy-1.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b08a32ea2f8e42cf1d4be3169a98dd4be32bafe4f22b6c4cb4ba810fa9e5d2cb", size = 293419, upload-time = "2025-07-26T12:01:21.16Z" }, - { url = "https://files.pythonhosted.org/packages/53/3e/405b59cfa13021a56bba395a6b3aca8cec012b45bf177b0eaf7a202cde2c/contourpy-1.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:556dba8fb6f5d8742f2923fe9457dbdd51e1049c4a43fd3986a0b14a1d815fc6", size = 273979, upload-time = "2025-07-26T12:01:22.448Z" }, - { url = "https://files.pythonhosted.org/packages/d4/1c/a12359b9b2ca3a845e8f7f9ac08bdf776114eb931392fcad91743e2ea17b/contourpy-1.3.3-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", 
hash = "sha256:92d9abc807cf7d0e047b95ca5d957cf4792fcd04e920ca70d48add15c1a90ea7", size = 332653, upload-time = "2025-07-26T12:01:24.155Z" }, - { url = "https://files.pythonhosted.org/packages/63/12/897aeebfb475b7748ea67b61e045accdfcf0d971f8a588b67108ed7f5512/contourpy-1.3.3-cp312-cp312-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b2e8faa0ed68cb29af51edd8e24798bb661eac3bd9f65420c1887b6ca89987c8", size = 379536, upload-time = "2025-07-26T12:01:25.91Z" }, - { url = "https://files.pythonhosted.org/packages/43/8a/a8c584b82deb248930ce069e71576fc09bd7174bbd35183b7943fb1064fd/contourpy-1.3.3-cp312-cp312-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:626d60935cf668e70a5ce6ff184fd713e9683fb458898e4249b63be9e28286ea", size = 384397, upload-time = "2025-07-26T12:01:27.152Z" }, - { url = "https://files.pythonhosted.org/packages/cc/8f/ec6289987824b29529d0dfda0d74a07cec60e54b9c92f3c9da4c0ac732de/contourpy-1.3.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4d00e655fcef08aba35ec9610536bfe90267d7ab5ba944f7032549c55a146da1", size = 362601, upload-time = "2025-07-26T12:01:28.808Z" }, - { url = "https://files.pythonhosted.org/packages/05/0a/a3fe3be3ee2dceb3e615ebb4df97ae6f3828aa915d3e10549ce016302bd1/contourpy-1.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:451e71b5a7d597379ef572de31eeb909a87246974d960049a9848c3bc6c41bf7", size = 1331288, upload-time = "2025-07-26T12:01:31.198Z" }, - { url = "https://files.pythonhosted.org/packages/33/1d/acad9bd4e97f13f3e2b18a3977fe1b4a37ecf3d38d815333980c6c72e963/contourpy-1.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:459c1f020cd59fcfe6650180678a9993932d80d44ccde1fa1868977438f0b411", size = 1403386, upload-time = "2025-07-26T12:01:33.947Z" }, - { url = "https://files.pythonhosted.org/packages/cf/8f/5847f44a7fddf859704217a99a23a4f6417b10e5ab1256a179264561540e/contourpy-1.3.3-cp312-cp312-win32.whl", hash = 
"sha256:023b44101dfe49d7d53932be418477dba359649246075c996866106da069af69", size = 185018, upload-time = "2025-07-26T12:01:35.64Z" }, - { url = "https://files.pythonhosted.org/packages/19/e8/6026ed58a64563186a9ee3f29f41261fd1828f527dd93d33b60feca63352/contourpy-1.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:8153b8bfc11e1e4d75bcb0bff1db232f9e10b274e0929de9d608027e0d34ff8b", size = 226567, upload-time = "2025-07-26T12:01:36.804Z" }, - { url = "https://files.pythonhosted.org/packages/d1/e2/f05240d2c39a1ed228d8328a78b6f44cd695f7ef47beb3e684cf93604f86/contourpy-1.3.3-cp312-cp312-win_arm64.whl", hash = "sha256:07ce5ed73ecdc4a03ffe3e1b3e3c1166db35ae7584be76f65dbbe28a7791b0cc", size = 193655, upload-time = "2025-07-26T12:01:37.999Z" }, - { url = "https://files.pythonhosted.org/packages/68/35/0167aad910bbdb9599272bd96d01a9ec6852f36b9455cf2ca67bd4cc2d23/contourpy-1.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:177fb367556747a686509d6fef71d221a4b198a3905fe824430e5ea0fda54eb5", size = 293257, upload-time = "2025-07-26T12:01:39.367Z" }, - { url = "https://files.pythonhosted.org/packages/96/e4/7adcd9c8362745b2210728f209bfbcf7d91ba868a2c5f40d8b58f54c509b/contourpy-1.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d002b6f00d73d69333dac9d0b8d5e84d9724ff9ef044fd63c5986e62b7c9e1b1", size = 274034, upload-time = "2025-07-26T12:01:40.645Z" }, - { url = "https://files.pythonhosted.org/packages/73/23/90e31ceeed1de63058a02cb04b12f2de4b40e3bef5e082a7c18d9c8ae281/contourpy-1.3.3-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:348ac1f5d4f1d66d3322420f01d42e43122f43616e0f194fc1c9f5d830c5b286", size = 334672, upload-time = "2025-07-26T12:01:41.942Z" }, - { url = "https://files.pythonhosted.org/packages/ed/93/b43d8acbe67392e659e1d984700e79eb67e2acb2bd7f62012b583a7f1b55/contourpy-1.3.3-cp313-cp313-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:655456777ff65c2c548b7c454af9c6f33f16c8884f11083244b5819cc214f1b5", size = 381234, 
upload-time = "2025-07-26T12:01:43.499Z" }, - { url = "https://files.pythonhosted.org/packages/46/3b/bec82a3ea06f66711520f75a40c8fc0b113b2a75edb36aa633eb11c4f50f/contourpy-1.3.3-cp313-cp313-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:644a6853d15b2512d67881586bd03f462c7ab755db95f16f14d7e238f2852c67", size = 385169, upload-time = "2025-07-26T12:01:45.219Z" }, - { url = "https://files.pythonhosted.org/packages/4b/32/e0f13a1c5b0f8572d0ec6ae2f6c677b7991fafd95da523159c19eff0696a/contourpy-1.3.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4debd64f124ca62069f313a9cb86656ff087786016d76927ae2cf37846b006c9", size = 362859, upload-time = "2025-07-26T12:01:46.519Z" }, - { url = "https://files.pythonhosted.org/packages/33/71/e2a7945b7de4e58af42d708a219f3b2f4cff7386e6b6ab0a0fa0033c49a9/contourpy-1.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a15459b0f4615b00bbd1e91f1b9e19b7e63aea7483d03d804186f278c0af2659", size = 1332062, upload-time = "2025-07-26T12:01:48.964Z" }, - { url = "https://files.pythonhosted.org/packages/12/fc/4e87ac754220ccc0e807284f88e943d6d43b43843614f0a8afa469801db0/contourpy-1.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca0fdcd73925568ca027e0b17ab07aad764be4706d0a925b89227e447d9737b7", size = 1403932, upload-time = "2025-07-26T12:01:51.979Z" }, - { url = "https://files.pythonhosted.org/packages/a6/2e/adc197a37443f934594112222ac1aa7dc9a98faf9c3842884df9a9d8751d/contourpy-1.3.3-cp313-cp313-win32.whl", hash = "sha256:b20c7c9a3bf701366556e1b1984ed2d0cedf999903c51311417cf5f591d8c78d", size = 185024, upload-time = "2025-07-26T12:01:53.245Z" }, - { url = "https://files.pythonhosted.org/packages/18/0b/0098c214843213759692cc638fce7de5c289200a830e5035d1791d7a2338/contourpy-1.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:1cadd8b8969f060ba45ed7c1b714fe69185812ab43bd6b86a9123fe8f99c3263", size = 226578, upload-time = "2025-07-26T12:01:54.422Z" }, - { url = 
"https://files.pythonhosted.org/packages/8a/9a/2f6024a0c5995243cd63afdeb3651c984f0d2bc727fd98066d40e141ad73/contourpy-1.3.3-cp313-cp313-win_arm64.whl", hash = "sha256:fd914713266421b7536de2bfa8181aa8c699432b6763a0ea64195ebe28bff6a9", size = 193524, upload-time = "2025-07-26T12:01:55.73Z" }, - { url = "https://files.pythonhosted.org/packages/c0/b3/f8a1a86bd3298513f500e5b1f5fd92b69896449f6cab6a146a5d52715479/contourpy-1.3.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:88df9880d507169449d434c293467418b9f6cbe82edd19284aa0409e7fdb933d", size = 306730, upload-time = "2025-07-26T12:01:57.051Z" }, - { url = "https://files.pythonhosted.org/packages/3f/11/4780db94ae62fc0c2053909b65dc3246bd7cecfc4f8a20d957ad43aa4ad8/contourpy-1.3.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:d06bb1f751ba5d417047db62bca3c8fde202b8c11fb50742ab3ab962c81e8216", size = 287897, upload-time = "2025-07-26T12:01:58.663Z" }, - { url = "https://files.pythonhosted.org/packages/ae/15/e59f5f3ffdd6f3d4daa3e47114c53daabcb18574a26c21f03dc9e4e42ff0/contourpy-1.3.3-cp313-cp313t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e4e6b05a45525357e382909a4c1600444e2a45b4795163d3b22669285591c1ae", size = 326751, upload-time = "2025-07-26T12:02:00.343Z" }, - { url = "https://files.pythonhosted.org/packages/0f/81/03b45cfad088e4770b1dcf72ea78d3802d04200009fb364d18a493857210/contourpy-1.3.3-cp313-cp313t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ab3074b48c4e2cf1a960e6bbeb7f04566bf36b1861d5c9d4d8ac04b82e38ba20", size = 375486, upload-time = "2025-07-26T12:02:02.128Z" }, - { url = "https://files.pythonhosted.org/packages/0c/ba/49923366492ffbdd4486e970d421b289a670ae8cf539c1ea9a09822b371a/contourpy-1.3.3-cp313-cp313t-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6c3d53c796f8647d6deb1abe867daeb66dcc8a97e8455efa729516b997b8ed99", size = 388106, upload-time = "2025-07-26T12:02:03.615Z" }, - { url = 
"https://files.pythonhosted.org/packages/9f/52/5b00ea89525f8f143651f9f03a0df371d3cbd2fccd21ca9b768c7a6500c2/contourpy-1.3.3-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:50ed930df7289ff2a8d7afeb9603f8289e5704755c7e5c3bbd929c90c817164b", size = 352548, upload-time = "2025-07-26T12:02:05.165Z" }, - { url = "https://files.pythonhosted.org/packages/32/1d/a209ec1a3a3452d490f6b14dd92e72280c99ae3d1e73da74f8277d4ee08f/contourpy-1.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4feffb6537d64b84877da813a5c30f1422ea5739566abf0bd18065ac040e120a", size = 1322297, upload-time = "2025-07-26T12:02:07.379Z" }, - { url = "https://files.pythonhosted.org/packages/bc/9e/46f0e8ebdd884ca0e8877e46a3f4e633f6c9c8c4f3f6e72be3fe075994aa/contourpy-1.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:2b7e9480ffe2b0cd2e787e4df64270e3a0440d9db8dc823312e2c940c167df7e", size = 1391023, upload-time = "2025-07-26T12:02:10.171Z" }, - { url = "https://files.pythonhosted.org/packages/b9/70/f308384a3ae9cd2209e0849f33c913f658d3326900d0ff5d378d6a1422d2/contourpy-1.3.3-cp313-cp313t-win32.whl", hash = "sha256:283edd842a01e3dcd435b1c5116798d661378d83d36d337b8dde1d16a5fc9ba3", size = 196157, upload-time = "2025-07-26T12:02:11.488Z" }, - { url = "https://files.pythonhosted.org/packages/b2/dd/880f890a6663b84d9e34a6f88cded89d78f0091e0045a284427cb6b18521/contourpy-1.3.3-cp313-cp313t-win_amd64.whl", hash = "sha256:87acf5963fc2b34825e5b6b048f40e3635dd547f590b04d2ab317c2619ef7ae8", size = 240570, upload-time = "2025-07-26T12:02:12.754Z" }, - { url = "https://files.pythonhosted.org/packages/80/99/2adc7d8ffead633234817ef8e9a87115c8a11927a94478f6bb3d3f4d4f7d/contourpy-1.3.3-cp313-cp313t-win_arm64.whl", hash = "sha256:3c30273eb2a55024ff31ba7d052dde990d7d8e5450f4bbb6e913558b3d6c2301", size = 199713, upload-time = "2025-07-26T12:02:14.4Z" }, - { url = 
"https://files.pythonhosted.org/packages/72/8b/4546f3ab60f78c514ffb7d01a0bd743f90de36f0019d1be84d0a708a580a/contourpy-1.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:fde6c716d51c04b1c25d0b90364d0be954624a0ee9d60e23e850e8d48353d07a", size = 292189, upload-time = "2025-07-26T12:02:16.095Z" }, - { url = "https://files.pythonhosted.org/packages/fd/e1/3542a9cb596cadd76fcef413f19c79216e002623158befe6daa03dbfa88c/contourpy-1.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:cbedb772ed74ff5be440fa8eee9bd49f64f6e3fc09436d9c7d8f1c287b121d77", size = 273251, upload-time = "2025-07-26T12:02:17.524Z" }, - { url = "https://files.pythonhosted.org/packages/b1/71/f93e1e9471d189f79d0ce2497007731c1e6bf9ef6d1d61b911430c3db4e5/contourpy-1.3.3-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:22e9b1bd7a9b1d652cd77388465dc358dafcd2e217d35552424aa4f996f524f5", size = 335810, upload-time = "2025-07-26T12:02:18.9Z" }, - { url = "https://files.pythonhosted.org/packages/91/f9/e35f4c1c93f9275d4e38681a80506b5510e9327350c51f8d4a5a724d178c/contourpy-1.3.3-cp314-cp314-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a22738912262aa3e254e4f3cb079a95a67132fc5a063890e224393596902f5a4", size = 382871, upload-time = "2025-07-26T12:02:20.418Z" }, - { url = "https://files.pythonhosted.org/packages/b5/71/47b512f936f66a0a900d81c396a7e60d73419868fba959c61efed7a8ab46/contourpy-1.3.3-cp314-cp314-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:afe5a512f31ee6bd7d0dda52ec9864c984ca3d66664444f2d72e0dc4eb832e36", size = 386264, upload-time = "2025-07-26T12:02:21.916Z" }, - { url = "https://files.pythonhosted.org/packages/04/5f/9ff93450ba96b09c7c2b3f81c94de31c89f92292f1380261bd7195bea4ea/contourpy-1.3.3-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f64836de09927cba6f79dcd00fdd7d5329f3fccc633468507079c829ca4db4e3", size = 363819, upload-time = "2025-07-26T12:02:23.759Z" }, - { url = 
"https://files.pythonhosted.org/packages/3e/a6/0b185d4cc480ee494945cde102cb0149ae830b5fa17bf855b95f2e70ad13/contourpy-1.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:1fd43c3be4c8e5fd6e4f2baeae35ae18176cf2e5cced681cca908addf1cdd53b", size = 1333650, upload-time = "2025-07-26T12:02:26.181Z" }, - { url = "https://files.pythonhosted.org/packages/43/d7/afdc95580ca56f30fbcd3060250f66cedbde69b4547028863abd8aa3b47e/contourpy-1.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6afc576f7b33cf00996e5c1102dc2a8f7cc89e39c0b55df93a0b78c1bd992b36", size = 1404833, upload-time = "2025-07-26T12:02:28.782Z" }, - { url = "https://files.pythonhosted.org/packages/e2/e2/366af18a6d386f41132a48f033cbd2102e9b0cf6345d35ff0826cd984566/contourpy-1.3.3-cp314-cp314-win32.whl", hash = "sha256:66c8a43a4f7b8df8b71ee1840e4211a3c8d93b214b213f590e18a1beca458f7d", size = 189692, upload-time = "2025-07-26T12:02:30.128Z" }, - { url = "https://files.pythonhosted.org/packages/7d/c2/57f54b03d0f22d4044b8afb9ca0e184f8b1afd57b4f735c2fa70883dc601/contourpy-1.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:cf9022ef053f2694e31d630feaacb21ea24224be1c3ad0520b13d844274614fd", size = 232424, upload-time = "2025-07-26T12:02:31.395Z" }, - { url = "https://files.pythonhosted.org/packages/18/79/a9416650df9b525737ab521aa181ccc42d56016d2123ddcb7b58e926a42c/contourpy-1.3.3-cp314-cp314-win_arm64.whl", hash = "sha256:95b181891b4c71de4bb404c6621e7e2390745f887f2a026b2d99e92c17892339", size = 198300, upload-time = "2025-07-26T12:02:32.956Z" }, - { url = "https://files.pythonhosted.org/packages/1f/42/38c159a7d0f2b7b9c04c64ab317042bb6952b713ba875c1681529a2932fe/contourpy-1.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:33c82d0138c0a062380332c861387650c82e4cf1747aaa6938b9b6516762e772", size = 306769, upload-time = "2025-07-26T12:02:34.2Z" }, - { url = 
"https://files.pythonhosted.org/packages/c3/6c/26a8205f24bca10974e77460de68d3d7c63e282e23782f1239f226fcae6f/contourpy-1.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ea37e7b45949df430fe649e5de8351c423430046a2af20b1c1961cae3afcda77", size = 287892, upload-time = "2025-07-26T12:02:35.807Z" }, - { url = "https://files.pythonhosted.org/packages/66/06/8a475c8ab718ebfd7925661747dbb3c3ee9c82ac834ccb3570be49d129f4/contourpy-1.3.3-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d304906ecc71672e9c89e87c4675dc5c2645e1f4269a5063b99b0bb29f232d13", size = 326748, upload-time = "2025-07-26T12:02:37.193Z" }, - { url = "https://files.pythonhosted.org/packages/b4/a3/c5ca9f010a44c223f098fccd8b158bb1cb287378a31ac141f04730dc49be/contourpy-1.3.3-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ca658cd1a680a5c9ea96dc61cdbae1e85c8f25849843aa799dfd3cb370ad4fbe", size = 375554, upload-time = "2025-07-26T12:02:38.894Z" }, - { url = "https://files.pythonhosted.org/packages/80/5b/68bd33ae63fac658a4145088c1e894405e07584a316738710b636c6d0333/contourpy-1.3.3-cp314-cp314t-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ab2fd90904c503739a75b7c8c5c01160130ba67944a7b77bbf36ef8054576e7f", size = 388118, upload-time = "2025-07-26T12:02:40.642Z" }, - { url = "https://files.pythonhosted.org/packages/40/52/4c285a6435940ae25d7410a6c36bda5145839bc3f0beb20c707cda18b9d2/contourpy-1.3.3-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b7301b89040075c30e5768810bc96a8e8d78085b47d8be6e4c3f5a0b4ed478a0", size = 352555, upload-time = "2025-07-26T12:02:42.25Z" }, - { url = "https://files.pythonhosted.org/packages/24/ee/3e81e1dd174f5c7fefe50e85d0892de05ca4e26ef1c9a59c2a57e43b865a/contourpy-1.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:2a2a8b627d5cc6b7c41a4beff6c5ad5eb848c88255fda4a8745f7e901b32d8e4", size = 1322295, upload-time = "2025-07-26T12:02:44.668Z" }, - { url = 
"https://files.pythonhosted.org/packages/3c/b2/6d913d4d04e14379de429057cd169e5e00f6c2af3bb13e1710bcbdb5da12/contourpy-1.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:fd6ec6be509c787f1caf6b247f0b1ca598bef13f4ddeaa126b7658215529ba0f", size = 1391027, upload-time = "2025-07-26T12:02:47.09Z" }, - { url = "https://files.pythonhosted.org/packages/93/8a/68a4ec5c55a2971213d29a9374913f7e9f18581945a7a31d1a39b5d2dfe5/contourpy-1.3.3-cp314-cp314t-win32.whl", hash = "sha256:e74a9a0f5e3fff48fb5a7f2fd2b9b70a3fe014a67522f79b7cca4c0c7e43c9ae", size = 202428, upload-time = "2025-07-26T12:02:48.691Z" }, - { url = "https://files.pythonhosted.org/packages/fa/96/fd9f641ffedc4fa3ace923af73b9d07e869496c9cc7a459103e6e978992f/contourpy-1.3.3-cp314-cp314t-win_amd64.whl", hash = "sha256:13b68d6a62db8eafaebb8039218921399baf6e47bf85006fd8529f2a08ef33fc", size = 250331, upload-time = "2025-07-26T12:02:50.137Z" }, - { url = "https://files.pythonhosted.org/packages/ae/8c/469afb6465b853afff216f9528ffda78a915ff880ed58813ba4faf4ba0b6/contourpy-1.3.3-cp314-cp314t-win_arm64.whl", hash = "sha256:b7448cb5a725bb1e35ce88771b86fba35ef418952474492cf7c764059933ff8b", size = 203831, upload-time = "2025-07-26T12:02:51.449Z" }, -] - [[package]] name = "cryptography" version = "46.0.3" @@ -576,15 +508,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e8/cb/2da4cc83f5edb9c3257d09e1e7ab7b23f049c7962cae8d842bbef0a9cec9/cryptography-46.0.3-cp38-abi3-win_arm64.whl", hash = "sha256:d89c3468de4cdc4f08a57e214384d0471911a3830fcdaf7a8cc587e42a866372", size = 2918740, upload-time = "2025-10-15T23:18:12.277Z" }, ] -[[package]] -name = "cycler" -version = "0.12.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a9/95/a3dbbb5028f35eafb79008e7522a75244477d2838f38cbb722248dabc2a8/cycler-0.12.1.tar.gz", hash = "sha256:88bb128f02ba341da8ef447245a9e138fae777f6a23943da4540077d3601eb1c", size = 7615, upload-time = "2023-10-07T05:32:18.335Z" } 
-wheels = [ - { url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321, upload-time = "2023-10-07T05:32:16.783Z" }, -] - [[package]] name = "dataclasses-json" version = "0.6.7" @@ -648,47 +571,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25", size = 19970, upload-time = "2022-11-02T17:34:01.425Z" }, ] -[[package]] -name = "fonttools" -version = "4.61.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ec/ca/cf17b88a8df95691275a3d77dc0a5ad9907f328ae53acbe6795da1b2f5ed/fonttools-4.61.1.tar.gz", hash = "sha256:6675329885c44657f826ef01d9e4fb33b9158e9d93c537d84ad8399539bc6f69", size = 3565756, upload-time = "2025-12-12T17:31:24.246Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6f/16/7decaa24a1bd3a70c607b2e29f0adc6159f36a7e40eaba59846414765fd4/fonttools-4.61.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f3cb4a569029b9f291f88aafc927dd53683757e640081ca8c412781ea144565e", size = 2851593, upload-time = "2025-12-12T17:30:04.225Z" }, - { url = "https://files.pythonhosted.org/packages/94/98/3c4cb97c64713a8cf499b3245c3bf9a2b8fd16a3e375feff2aed78f96259/fonttools-4.61.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41a7170d042e8c0024703ed13b71893519a1a6d6e18e933e3ec7507a2c26a4b2", size = 2400231, upload-time = "2025-12-12T17:30:06.47Z" }, - { url = "https://files.pythonhosted.org/packages/b7/37/82dbef0f6342eb01f54bca073ac1498433d6ce71e50c3c3282b655733b31/fonttools-4.61.1-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = 
"sha256:10d88e55330e092940584774ee5e8a6971b01fc2f4d3466a1d6c158230880796", size = 4954103, upload-time = "2025-12-12T17:30:08.432Z" }, - { url = "https://files.pythonhosted.org/packages/6c/44/f3aeac0fa98e7ad527f479e161aca6c3a1e47bb6996b053d45226fe37bf2/fonttools-4.61.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:15acc09befd16a0fb8a8f62bc147e1a82817542d72184acca9ce6e0aeda9fa6d", size = 5004295, upload-time = "2025-12-12T17:30:10.56Z" }, - { url = "https://files.pythonhosted.org/packages/14/e8/7424ced75473983b964d09f6747fa09f054a6d656f60e9ac9324cf40c743/fonttools-4.61.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e6bcdf33aec38d16508ce61fd81838f24c83c90a1d1b8c68982857038673d6b8", size = 4944109, upload-time = "2025-12-12T17:30:12.874Z" }, - { url = "https://files.pythonhosted.org/packages/c8/8b/6391b257fa3d0b553d73e778f953a2f0154292a7a7a085e2374b111e5410/fonttools-4.61.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5fade934607a523614726119164ff621e8c30e8fa1ffffbbd358662056ba69f0", size = 5093598, upload-time = "2025-12-12T17:30:15.79Z" }, - { url = "https://files.pythonhosted.org/packages/d9/71/fd2ea96cdc512d92da5678a1c98c267ddd4d8c5130b76d0f7a80f9a9fde8/fonttools-4.61.1-cp312-cp312-win32.whl", hash = "sha256:75da8f28eff26defba42c52986de97b22106cb8f26515b7c22443ebc9c2d3261", size = 2269060, upload-time = "2025-12-12T17:30:18.058Z" }, - { url = "https://files.pythonhosted.org/packages/80/3b/a3e81b71aed5a688e89dfe0e2694b26b78c7d7f39a5ffd8a7d75f54a12a8/fonttools-4.61.1-cp312-cp312-win_amd64.whl", hash = "sha256:497c31ce314219888c0e2fce5ad9178ca83fe5230b01a5006726cdf3ac9f24d9", size = 2319078, upload-time = "2025-12-12T17:30:22.862Z" }, - { url = "https://files.pythonhosted.org/packages/4b/cf/00ba28b0990982530addb8dc3e9e6f2fa9cb5c20df2abdda7baa755e8fe1/fonttools-4.61.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8c56c488ab471628ff3bfa80964372fc13504ece601e0d97a78ee74126b2045c", size 
= 2846454, upload-time = "2025-12-12T17:30:24.938Z" }, - { url = "https://files.pythonhosted.org/packages/5a/ca/468c9a8446a2103ae645d14fee3f610567b7042aba85031c1c65e3ef7471/fonttools-4.61.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dc492779501fa723b04d0ab1f5be046797fee17d27700476edc7ee9ae535a61e", size = 2398191, upload-time = "2025-12-12T17:30:27.343Z" }, - { url = "https://files.pythonhosted.org/packages/a3/4b/d67eedaed19def5967fade3297fed8161b25ba94699efc124b14fb68cdbc/fonttools-4.61.1-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:64102ca87e84261419c3747a0d20f396eb024bdbeb04c2bfb37e2891f5fadcb5", size = 4928410, upload-time = "2025-12-12T17:30:29.771Z" }, - { url = "https://files.pythonhosted.org/packages/b0/8d/6fb3494dfe61a46258cd93d979cf4725ded4eb46c2a4ca35e4490d84daea/fonttools-4.61.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c1b526c8d3f615a7b1867f38a9410849c8f4aef078535742198e942fba0e9bd", size = 4984460, upload-time = "2025-12-12T17:30:32.073Z" }, - { url = "https://files.pythonhosted.org/packages/f7/f1/a47f1d30b3dc00d75e7af762652d4cbc3dff5c2697a0dbd5203c81afd9c3/fonttools-4.61.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:41ed4b5ec103bd306bb68f81dc166e77409e5209443e5773cb4ed837bcc9b0d3", size = 4925800, upload-time = "2025-12-12T17:30:34.339Z" }, - { url = "https://files.pythonhosted.org/packages/a7/01/e6ae64a0981076e8a66906fab01539799546181e32a37a0257b77e4aa88b/fonttools-4.61.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b501c862d4901792adaec7c25b1ecc749e2662543f68bb194c42ba18d6eec98d", size = 5067859, upload-time = "2025-12-12T17:30:36.593Z" }, - { url = "https://files.pythonhosted.org/packages/73/aa/28e40b8d6809a9b5075350a86779163f074d2b617c15d22343fce81918db/fonttools-4.61.1-cp313-cp313-win32.whl", hash = "sha256:4d7092bb38c53bbc78e9255a59158b150bcdc115a1e3b3ce0b5f267dc35dd63c", size = 2267821, 
upload-time = "2025-12-12T17:30:38.478Z" }, - { url = "https://files.pythonhosted.org/packages/1a/59/453c06d1d83dc0951b69ef692d6b9f1846680342927df54e9a1ca91c6f90/fonttools-4.61.1-cp313-cp313-win_amd64.whl", hash = "sha256:21e7c8d76f62ab13c9472ccf74515ca5b9a761d1bde3265152a6dc58700d895b", size = 2318169, upload-time = "2025-12-12T17:30:40.951Z" }, - { url = "https://files.pythonhosted.org/packages/32/8f/4e7bf82c0cbb738d3c2206c920ca34ca74ef9dabde779030145d28665104/fonttools-4.61.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:fff4f534200a04b4a36e7ae3cb74493afe807b517a09e99cb4faa89a34ed6ecd", size = 2846094, upload-time = "2025-12-12T17:30:43.511Z" }, - { url = "https://files.pythonhosted.org/packages/71/09/d44e45d0a4f3a651f23a1e9d42de43bc643cce2971b19e784cc67d823676/fonttools-4.61.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:d9203500f7c63545b4ce3799319fe4d9feb1a1b89b28d3cb5abd11b9dd64147e", size = 2396589, upload-time = "2025-12-12T17:30:45.681Z" }, - { url = "https://files.pythonhosted.org/packages/89/18/58c64cafcf8eb677a99ef593121f719e6dcbdb7d1c594ae5a10d4997ca8a/fonttools-4.61.1-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fa646ecec9528bef693415c79a86e733c70a4965dd938e9a226b0fc64c9d2e6c", size = 4877892, upload-time = "2025-12-12T17:30:47.709Z" }, - { url = "https://files.pythonhosted.org/packages/8a/ec/9e6b38c7ba1e09eb51db849d5450f4c05b7e78481f662c3b79dbde6f3d04/fonttools-4.61.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:11f35ad7805edba3aac1a3710d104592df59f4b957e30108ae0ba6c10b11dd75", size = 4972884, upload-time = "2025-12-12T17:30:49.656Z" }, - { url = "https://files.pythonhosted.org/packages/5e/87/b5339da8e0256734ba0dbbf5b6cdebb1dd79b01dc8c270989b7bcd465541/fonttools-4.61.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b931ae8f62db78861b0ff1ac017851764602288575d65b8e8ff1963fed419063", size = 4924405, 
upload-time = "2025-12-12T17:30:51.735Z" }, - { url = "https://files.pythonhosted.org/packages/0b/47/e3409f1e1e69c073a3a6fd8cb886eb18c0bae0ee13db2c8d5e7f8495e8b7/fonttools-4.61.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b148b56f5de675ee16d45e769e69f87623a4944f7443850bf9a9376e628a89d2", size = 5035553, upload-time = "2025-12-12T17:30:54.823Z" }, - { url = "https://files.pythonhosted.org/packages/bf/b6/1f6600161b1073a984294c6c031e1a56ebf95b6164249eecf30012bb2e38/fonttools-4.61.1-cp314-cp314-win32.whl", hash = "sha256:9b666a475a65f4e839d3d10473fad6d47e0a9db14a2f4a224029c5bfde58ad2c", size = 2271915, upload-time = "2025-12-12T17:30:57.913Z" }, - { url = "https://files.pythonhosted.org/packages/52/7b/91e7b01e37cc8eb0e1f770d08305b3655e4f002fc160fb82b3390eabacf5/fonttools-4.61.1-cp314-cp314-win_amd64.whl", hash = "sha256:4f5686e1fe5fce75d82d93c47a438a25bf0d1319d2843a926f741140b2b16e0c", size = 2323487, upload-time = "2025-12-12T17:30:59.804Z" }, - { url = "https://files.pythonhosted.org/packages/39/5c/908ad78e46c61c3e3ed70c3b58ff82ab48437faf84ec84f109592cabbd9f/fonttools-4.61.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:e76ce097e3c57c4bcb67c5aa24a0ecdbd9f74ea9219997a707a4061fbe2707aa", size = 2929571, upload-time = "2025-12-12T17:31:02.574Z" }, - { url = "https://files.pythonhosted.org/packages/bd/41/975804132c6dea64cdbfbaa59f3518a21c137a10cccf962805b301ac6ab2/fonttools-4.61.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:9cfef3ab326780c04d6646f68d4b4742aae222e8b8ea1d627c74e38afcbc9d91", size = 2435317, upload-time = "2025-12-12T17:31:04.974Z" }, - { url = "https://files.pythonhosted.org/packages/b0/5a/aef2a0a8daf1ebaae4cfd83f84186d4a72ee08fd6a8451289fcd03ffa8a4/fonttools-4.61.1-cp314-cp314t-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a75c301f96db737e1c5ed5fd7d77d9c34466de16095a266509e13da09751bd19", size = 4882124, upload-time = "2025-12-12T17:31:07.456Z" }, - { url = 
"https://files.pythonhosted.org/packages/80/33/d6db3485b645b81cea538c9d1c9219d5805f0877fda18777add4671c5240/fonttools-4.61.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:91669ccac46bbc1d09e9273546181919064e8df73488ea087dcac3e2968df9ba", size = 5100391, upload-time = "2025-12-12T17:31:09.732Z" }, - { url = "https://files.pythonhosted.org/packages/6c/d6/675ba631454043c75fcf76f0ca5463eac8eb0666ea1d7badae5fea001155/fonttools-4.61.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c33ab3ca9d3ccd581d58e989d67554e42d8d4ded94ab3ade3508455fe70e65f7", size = 4978800, upload-time = "2025-12-12T17:31:11.681Z" }, - { url = "https://files.pythonhosted.org/packages/7f/33/d3ec753d547a8d2bdaedd390d4a814e8d5b45a093d558f025c6b990b554c/fonttools-4.61.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:664c5a68ec406f6b1547946683008576ef8b38275608e1cee6c061828171c118", size = 5006426, upload-time = "2025-12-12T17:31:13.764Z" }, - { url = "https://files.pythonhosted.org/packages/b4/40/cc11f378b561a67bea850ab50063366a0d1dd3f6d0a30ce0f874b0ad5664/fonttools-4.61.1-cp314-cp314t-win32.whl", hash = "sha256:aed04cabe26f30c1647ef0e8fbb207516fd40fe9472e9439695f5c6998e60ac5", size = 2335377, upload-time = "2025-12-12T17:31:16.49Z" }, - { url = "https://files.pythonhosted.org/packages/e4/ff/c9a2b66b39f8628531ea58b320d66d951267c98c6a38684daa8f50fb02f8/fonttools-4.61.1-cp314-cp314t-win_amd64.whl", hash = "sha256:2180f14c141d2f0f3da43f3a81bc8aa4684860f6b0e6f9e165a4831f24e6a23b", size = 2400613, upload-time = "2025-12-12T17:31:18.769Z" }, - { url = "https://files.pythonhosted.org/packages/c7/4e/ce75a57ff3aebf6fc1f4e9d508b8e5810618a33d900ad6c19eb30b290b97/fonttools-4.61.1-py3-none-any.whl", hash = "sha256:17d2bf5d541add43822bcf0c43d7d847b160c9bb01d15d5007d84e2217aaa371", size = 1148996, upload-time = "2025-12-12T17:31:21.03Z" }, -] - [[package]] name = "frozenlist" version = "1.8.0" @@ -1005,78 +887,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/71/92/5e77f98553e9e75130c78900d000368476aed74276eb8ae8796f65f00918/jsonpointer-3.0.0-py2.py3-none-any.whl", hash = "sha256:13e088adc14fca8b6aa8177c044e12701e6ad4b28ff10e65f2267a90109c9942", size = 7595, upload-time = "2024-06-10T19:24:40.698Z" }, ] -[[package]] -name = "kiwisolver" -version = "1.4.9" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/5c/3c/85844f1b0feb11ee581ac23fe5fce65cd049a200c1446708cc1b7f922875/kiwisolver-1.4.9.tar.gz", hash = "sha256:c3b22c26c6fd6811b0ae8363b95ca8ce4ea3c202d3d0975b2914310ceb1bcc4d", size = 97564, upload-time = "2025-08-10T21:27:49.279Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/86/c9/13573a747838aeb1c76e3267620daa054f4152444d1f3d1a2324b78255b5/kiwisolver-1.4.9-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ac5a486ac389dddcc5bef4f365b6ae3ffff2c433324fb38dd35e3fab7c957999", size = 123686, upload-time = "2025-08-10T21:26:10.034Z" }, - { url = "https://files.pythonhosted.org/packages/51/ea/2ecf727927f103ffd1739271ca19c424d0e65ea473fbaeea1c014aea93f6/kiwisolver-1.4.9-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f2ba92255faa7309d06fe44c3a4a97efe1c8d640c2a79a5ef728b685762a6fd2", size = 66460, upload-time = "2025-08-10T21:26:11.083Z" }, - { url = "https://files.pythonhosted.org/packages/5b/5a/51f5464373ce2aeb5194508298a508b6f21d3867f499556263c64c621914/kiwisolver-1.4.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a2899935e724dd1074cb568ce7ac0dce28b2cd6ab539c8e001a8578eb106d14", size = 64952, upload-time = "2025-08-10T21:26:12.058Z" }, - { url = "https://files.pythonhosted.org/packages/70/90/6d240beb0f24b74371762873e9b7f499f1e02166a2d9c5801f4dbf8fa12e/kiwisolver-1.4.9-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f6008a4919fdbc0b0097089f67a1eb55d950ed7e90ce2cc3e640abadd2757a04", size = 1474756, upload-time = "2025-08-10T21:26:13.096Z" }, - { url = 
"https://files.pythonhosted.org/packages/12/42/f36816eaf465220f683fb711efdd1bbf7a7005a2473d0e4ed421389bd26c/kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:67bb8b474b4181770f926f7b7d2f8c0248cbcb78b660fdd41a47054b28d2a752", size = 1276404, upload-time = "2025-08-10T21:26:14.457Z" }, - { url = "https://files.pythonhosted.org/packages/2e/64/bc2de94800adc830c476dce44e9b40fd0809cddeef1fde9fcf0f73da301f/kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2327a4a30d3ee07d2fbe2e7933e8a37c591663b96ce42a00bc67461a87d7df77", size = 1294410, upload-time = "2025-08-10T21:26:15.73Z" }, - { url = "https://files.pythonhosted.org/packages/5f/42/2dc82330a70aa8e55b6d395b11018045e58d0bb00834502bf11509f79091/kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:7a08b491ec91b1d5053ac177afe5290adacf1f0f6307d771ccac5de30592d198", size = 1343631, upload-time = "2025-08-10T21:26:17.045Z" }, - { url = "https://files.pythonhosted.org/packages/22/fd/f4c67a6ed1aab149ec5a8a401c323cee7a1cbe364381bb6c9c0d564e0e20/kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d8fc5c867c22b828001b6a38d2eaeb88160bf5783c6cb4a5e440efc981ce286d", size = 2224963, upload-time = "2025-08-10T21:26:18.737Z" }, - { url = "https://files.pythonhosted.org/packages/45/aa/76720bd4cb3713314677d9ec94dcc21ced3f1baf4830adde5bb9b2430a5f/kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:3b3115b2581ea35bb6d1f24a4c90af37e5d9b49dcff267eeed14c3893c5b86ab", size = 2321295, upload-time = "2025-08-10T21:26:20.11Z" }, - { url = "https://files.pythonhosted.org/packages/80/19/d3ec0d9ab711242f56ae0dc2fc5d70e298bb4a1f9dfab44c027668c673a1/kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:858e4c22fb075920b96a291928cb7dea5644e94c0ee4fcd5af7e865655e4ccf2", size = 2487987, upload-time = "2025-08-10T21:26:21.49Z" }, - { url = 
"https://files.pythonhosted.org/packages/39/e9/61e4813b2c97e86b6fdbd4dd824bf72d28bcd8d4849b8084a357bc0dd64d/kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ed0fecd28cc62c54b262e3736f8bb2512d8dcfdc2bcf08be5f47f96bf405b145", size = 2291817, upload-time = "2025-08-10T21:26:22.812Z" }, - { url = "https://files.pythonhosted.org/packages/a0/41/85d82b0291db7504da3c2defe35c9a8a5c9803a730f297bd823d11d5fb77/kiwisolver-1.4.9-cp312-cp312-win_amd64.whl", hash = "sha256:f68208a520c3d86ea51acf688a3e3002615a7f0238002cccc17affecc86a8a54", size = 73895, upload-time = "2025-08-10T21:26:24.37Z" }, - { url = "https://files.pythonhosted.org/packages/e2/92/5f3068cf15ee5cb624a0c7596e67e2a0bb2adee33f71c379054a491d07da/kiwisolver-1.4.9-cp312-cp312-win_arm64.whl", hash = "sha256:2c1a4f57df73965f3f14df20b80ee29e6a7930a57d2d9e8491a25f676e197c60", size = 64992, upload-time = "2025-08-10T21:26:25.732Z" }, - { url = "https://files.pythonhosted.org/packages/31/c1/c2686cda909742ab66c7388e9a1a8521a59eb89f8bcfbee28fc980d07e24/kiwisolver-1.4.9-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a5d0432ccf1c7ab14f9949eec60c5d1f924f17c037e9f8b33352fa05799359b8", size = 123681, upload-time = "2025-08-10T21:26:26.725Z" }, - { url = "https://files.pythonhosted.org/packages/ca/f0/f44f50c9f5b1a1860261092e3bc91ecdc9acda848a8b8c6abfda4a24dd5c/kiwisolver-1.4.9-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efb3a45b35622bb6c16dbfab491a8f5a391fe0e9d45ef32f4df85658232ca0e2", size = 66464, upload-time = "2025-08-10T21:26:27.733Z" }, - { url = "https://files.pythonhosted.org/packages/2d/7a/9d90a151f558e29c3936b8a47ac770235f436f2120aca41a6d5f3d62ae8d/kiwisolver-1.4.9-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1a12cf6398e8a0a001a059747a1cbf24705e18fe413bc22de7b3d15c67cffe3f", size = 64961, upload-time = "2025-08-10T21:26:28.729Z" }, - { url = 
"https://files.pythonhosted.org/packages/e9/e9/f218a2cb3a9ffbe324ca29a9e399fa2d2866d7f348ec3a88df87fc248fc5/kiwisolver-1.4.9-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b67e6efbf68e077dd71d1a6b37e43e1a99d0bff1a3d51867d45ee8908b931098", size = 1474607, upload-time = "2025-08-10T21:26:29.798Z" }, - { url = "https://files.pythonhosted.org/packages/d9/28/aac26d4c882f14de59041636292bc838db8961373825df23b8eeb807e198/kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5656aa670507437af0207645273ccdfee4f14bacd7f7c67a4306d0dcaeaf6eed", size = 1276546, upload-time = "2025-08-10T21:26:31.401Z" }, - { url = "https://files.pythonhosted.org/packages/8b/ad/8bfc1c93d4cc565e5069162f610ba2f48ff39b7de4b5b8d93f69f30c4bed/kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:bfc08add558155345129c7803b3671cf195e6a56e7a12f3dde7c57d9b417f525", size = 1294482, upload-time = "2025-08-10T21:26:32.721Z" }, - { url = "https://files.pythonhosted.org/packages/da/f1/6aca55ff798901d8ce403206d00e033191f63d82dd708a186e0ed2067e9c/kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:40092754720b174e6ccf9e845d0d8c7d8e12c3d71e7fc35f55f3813e96376f78", size = 1343720, upload-time = "2025-08-10T21:26:34.032Z" }, - { url = "https://files.pythonhosted.org/packages/d1/91/eed031876c595c81d90d0f6fc681ece250e14bf6998c3d7c419466b523b7/kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:497d05f29a1300d14e02e6441cf0f5ee81c1ff5a304b0d9fb77423974684e08b", size = 2224907, upload-time = "2025-08-10T21:26:35.824Z" }, - { url = "https://files.pythonhosted.org/packages/e9/ec/4d1925f2e49617b9cca9c34bfa11adefad49d00db038e692a559454dfb2e/kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:bdd1a81a1860476eb41ac4bc1e07b3f07259e6d55bbf739b79c8aaedcf512799", size = 2321334, upload-time = "2025-08-10T21:26:37.534Z" }, - { url = 
"https://files.pythonhosted.org/packages/43/cb/450cd4499356f68802750c6ddc18647b8ea01ffa28f50d20598e0befe6e9/kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:e6b93f13371d341afee3be9f7c5964e3fe61d5fa30f6a30eb49856935dfe4fc3", size = 2488313, upload-time = "2025-08-10T21:26:39.191Z" }, - { url = "https://files.pythonhosted.org/packages/71/67/fc76242bd99f885651128a5d4fa6083e5524694b7c88b489b1b55fdc491d/kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d75aa530ccfaa593da12834b86a0724f58bff12706659baa9227c2ccaa06264c", size = 2291970, upload-time = "2025-08-10T21:26:40.828Z" }, - { url = "https://files.pythonhosted.org/packages/75/bd/f1a5d894000941739f2ae1b65a32892349423ad49c2e6d0771d0bad3fae4/kiwisolver-1.4.9-cp313-cp313-win_amd64.whl", hash = "sha256:dd0a578400839256df88c16abddf9ba14813ec5f21362e1fe65022e00c883d4d", size = 73894, upload-time = "2025-08-10T21:26:42.33Z" }, - { url = "https://files.pythonhosted.org/packages/95/38/dce480814d25b99a391abbddadc78f7c117c6da34be68ca8b02d5848b424/kiwisolver-1.4.9-cp313-cp313-win_arm64.whl", hash = "sha256:d4188e73af84ca82468f09cadc5ac4db578109e52acb4518d8154698d3a87ca2", size = 64995, upload-time = "2025-08-10T21:26:43.889Z" }, - { url = "https://files.pythonhosted.org/packages/e2/37/7d218ce5d92dadc5ebdd9070d903e0c7cf7edfe03f179433ac4d13ce659c/kiwisolver-1.4.9-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:5a0f2724dfd4e3b3ac5a82436a8e6fd16baa7d507117e4279b660fe8ca38a3a1", size = 126510, upload-time = "2025-08-10T21:26:44.915Z" }, - { url = "https://files.pythonhosted.org/packages/23/b0/e85a2b48233daef4b648fb657ebbb6f8367696a2d9548a00b4ee0eb67803/kiwisolver-1.4.9-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:1b11d6a633e4ed84fc0ddafd4ebfd8ea49b3f25082c04ad12b8315c11d504dc1", size = 67903, upload-time = "2025-08-10T21:26:45.934Z" }, - { url = 
"https://files.pythonhosted.org/packages/44/98/f2425bc0113ad7de24da6bb4dae1343476e95e1d738be7c04d31a5d037fd/kiwisolver-1.4.9-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:61874cdb0a36016354853593cffc38e56fc9ca5aa97d2c05d3dcf6922cd55a11", size = 66402, upload-time = "2025-08-10T21:26:47.101Z" }, - { url = "https://files.pythonhosted.org/packages/98/d8/594657886df9f34c4177cc353cc28ca7e6e5eb562d37ccc233bff43bbe2a/kiwisolver-1.4.9-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:60c439763a969a6af93b4881db0eed8fadf93ee98e18cbc35bc8da868d0c4f0c", size = 1582135, upload-time = "2025-08-10T21:26:48.665Z" }, - { url = "https://files.pythonhosted.org/packages/5c/c6/38a115b7170f8b306fc929e166340c24958347308ea3012c2b44e7e295db/kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92a2f997387a1b79a75e7803aa7ded2cfbe2823852ccf1ba3bcf613b62ae3197", size = 1389409, upload-time = "2025-08-10T21:26:50.335Z" }, - { url = "https://files.pythonhosted.org/packages/bf/3b/e04883dace81f24a568bcee6eb3001da4ba05114afa622ec9b6fafdc1f5e/kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a31d512c812daea6d8b3be3b2bfcbeb091dbb09177706569bcfc6240dcf8b41c", size = 1401763, upload-time = "2025-08-10T21:26:51.867Z" }, - { url = "https://files.pythonhosted.org/packages/9f/80/20ace48e33408947af49d7d15c341eaee69e4e0304aab4b7660e234d6288/kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:52a15b0f35dad39862d376df10c5230155243a2c1a436e39eb55623ccbd68185", size = 1453643, upload-time = "2025-08-10T21:26:53.592Z" }, - { url = "https://files.pythonhosted.org/packages/64/31/6ce4380a4cd1f515bdda976a1e90e547ccd47b67a1546d63884463c92ca9/kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a30fd6fdef1430fd9e1ba7b3398b5ee4e2887783917a687d86ba69985fb08748", size = 2330818, upload-time = "2025-08-10T21:26:55.051Z" }, - { url = 
"https://files.pythonhosted.org/packages/fa/e9/3f3fcba3bcc7432c795b82646306e822f3fd74df0ee81f0fa067a1f95668/kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:cc9617b46837c6468197b5945e196ee9ca43057bb7d9d1ae688101e4e1dddf64", size = 2419963, upload-time = "2025-08-10T21:26:56.421Z" }, - { url = "https://files.pythonhosted.org/packages/99/43/7320c50e4133575c66e9f7dadead35ab22d7c012a3b09bb35647792b2a6d/kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:0ab74e19f6a2b027ea4f845a78827969af45ce790e6cb3e1ebab71bdf9f215ff", size = 2594639, upload-time = "2025-08-10T21:26:57.882Z" }, - { url = "https://files.pythonhosted.org/packages/65/d6/17ae4a270d4a987ef8a385b906d2bdfc9fce502d6dc0d3aea865b47f548c/kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:dba5ee5d3981160c28d5490f0d1b7ed730c22470ff7f6cc26cfcfaacb9896a07", size = 2391741, upload-time = "2025-08-10T21:26:59.237Z" }, - { url = "https://files.pythonhosted.org/packages/2a/8f/8f6f491d595a9e5912971f3f863d81baddccc8a4d0c3749d6a0dd9ffc9df/kiwisolver-1.4.9-cp313-cp313t-win_arm64.whl", hash = "sha256:0749fd8f4218ad2e851e11cc4dc05c7cbc0cbc4267bdfdb31782e65aace4ee9c", size = 68646, upload-time = "2025-08-10T21:27:00.52Z" }, - { url = "https://files.pythonhosted.org/packages/6b/32/6cc0fbc9c54d06c2969faa9c1d29f5751a2e51809dd55c69055e62d9b426/kiwisolver-1.4.9-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:9928fe1eb816d11ae170885a74d074f57af3a0d65777ca47e9aeb854a1fba386", size = 123806, upload-time = "2025-08-10T21:27:01.537Z" }, - { url = "https://files.pythonhosted.org/packages/b2/dd/2bfb1d4a4823d92e8cbb420fe024b8d2167f72079b3bb941207c42570bdf/kiwisolver-1.4.9-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:d0005b053977e7b43388ddec89fa567f43d4f6d5c2c0affe57de5ebf290dc552", size = 66605, upload-time = "2025-08-10T21:27:03.335Z" }, - { url = 
"https://files.pythonhosted.org/packages/f7/69/00aafdb4e4509c2ca6064646cba9cd4b37933898f426756adb2cb92ebbed/kiwisolver-1.4.9-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:2635d352d67458b66fd0667c14cb1d4145e9560d503219034a18a87e971ce4f3", size = 64925, upload-time = "2025-08-10T21:27:04.339Z" }, - { url = "https://files.pythonhosted.org/packages/43/dc/51acc6791aa14e5cb6d8a2e28cefb0dc2886d8862795449d021334c0df20/kiwisolver-1.4.9-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:767c23ad1c58c9e827b649a9ab7809fd5fd9db266a9cf02b0e926ddc2c680d58", size = 1472414, upload-time = "2025-08-10T21:27:05.437Z" }, - { url = "https://files.pythonhosted.org/packages/3d/bb/93fa64a81db304ac8a246f834d5094fae4b13baf53c839d6bb6e81177129/kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:72d0eb9fba308b8311685c2268cf7d0a0639a6cd027d8128659f72bdd8a024b4", size = 1281272, upload-time = "2025-08-10T21:27:07.063Z" }, - { url = "https://files.pythonhosted.org/packages/70/e6/6df102916960fb8d05069d4bd92d6d9a8202d5a3e2444494e7cd50f65b7a/kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f68e4f3eeca8fb22cc3d731f9715a13b652795ef657a13df1ad0c7dc0e9731df", size = 1298578, upload-time = "2025-08-10T21:27:08.452Z" }, - { url = "https://files.pythonhosted.org/packages/7c/47/e142aaa612f5343736b087864dbaebc53ea8831453fb47e7521fa8658f30/kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d84cd4061ae292d8ac367b2c3fa3aad11cb8625a95d135fe93f286f914f3f5a6", size = 1345607, upload-time = "2025-08-10T21:27:10.125Z" }, - { url = "https://files.pythonhosted.org/packages/54/89/d641a746194a0f4d1a3670fb900d0dbaa786fb98341056814bc3f058fa52/kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:a60ea74330b91bd22a29638940d115df9dc00af5035a9a2a6ad9399ffb4ceca5", size = 2230150, upload-time = "2025-08-10T21:27:11.484Z" }, - { url = 
"https://files.pythonhosted.org/packages/aa/6b/5ee1207198febdf16ac11f78c5ae40861b809cbe0e6d2a8d5b0b3044b199/kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:ce6a3a4e106cf35c2d9c4fa17c05ce0b180db622736845d4315519397a77beaf", size = 2325979, upload-time = "2025-08-10T21:27:12.917Z" }, - { url = "https://files.pythonhosted.org/packages/fc/ff/b269eefd90f4ae14dcc74973d5a0f6d28d3b9bb1afd8c0340513afe6b39a/kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:77937e5e2a38a7b48eef0585114fe7930346993a88060d0bf886086d2aa49ef5", size = 2491456, upload-time = "2025-08-10T21:27:14.353Z" }, - { url = "https://files.pythonhosted.org/packages/fc/d4/10303190bd4d30de547534601e259a4fbf014eed94aae3e5521129215086/kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:24c175051354f4a28c5d6a31c93906dc653e2bf234e8a4bbfb964892078898ce", size = 2294621, upload-time = "2025-08-10T21:27:15.808Z" }, - { url = "https://files.pythonhosted.org/packages/28/e0/a9a90416fce5c0be25742729c2ea52105d62eda6c4be4d803c2a7be1fa50/kiwisolver-1.4.9-cp314-cp314-win_amd64.whl", hash = "sha256:0763515d4df10edf6d06a3c19734e2566368980d21ebec439f33f9eb936c07b7", size = 75417, upload-time = "2025-08-10T21:27:17.436Z" }, - { url = "https://files.pythonhosted.org/packages/1f/10/6949958215b7a9a264299a7db195564e87900f709db9245e4ebdd3c70779/kiwisolver-1.4.9-cp314-cp314-win_arm64.whl", hash = "sha256:0e4e2bf29574a6a7b7f6cb5fa69293b9f96c928949ac4a53ba3f525dffb87f9c", size = 66582, upload-time = "2025-08-10T21:27:18.436Z" }, - { url = "https://files.pythonhosted.org/packages/ec/79/60e53067903d3bc5469b369fe0dfc6b3482e2133e85dae9daa9527535991/kiwisolver-1.4.9-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:d976bbb382b202f71c67f77b0ac11244021cfa3f7dfd9e562eefcea2df711548", size = 126514, upload-time = "2025-08-10T21:27:19.465Z" }, - { url = 
"https://files.pythonhosted.org/packages/25/d1/4843d3e8d46b072c12a38c97c57fab4608d36e13fe47d47ee96b4d61ba6f/kiwisolver-1.4.9-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2489e4e5d7ef9a1c300a5e0196e43d9c739f066ef23270607d45aba368b91f2d", size = 67905, upload-time = "2025-08-10T21:27:20.51Z" }, - { url = "https://files.pythonhosted.org/packages/8c/ae/29ffcbd239aea8b93108de1278271ae764dfc0d803a5693914975f200596/kiwisolver-1.4.9-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:e2ea9f7ab7fbf18fffb1b5434ce7c69a07582f7acc7717720f1d69f3e806f90c", size = 66399, upload-time = "2025-08-10T21:27:21.496Z" }, - { url = "https://files.pythonhosted.org/packages/a1/ae/d7ba902aa604152c2ceba5d352d7b62106bedbccc8e95c3934d94472bfa3/kiwisolver-1.4.9-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b34e51affded8faee0dfdb705416153819d8ea9250bbbf7ea1b249bdeb5f1122", size = 1582197, upload-time = "2025-08-10T21:27:22.604Z" }, - { url = "https://files.pythonhosted.org/packages/f2/41/27c70d427eddb8bc7e4f16420a20fefc6f480312122a59a959fdfe0445ad/kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8aacd3d4b33b772542b2e01beb50187536967b514b00003bdda7589722d2a64", size = 1390125, upload-time = "2025-08-10T21:27:24.036Z" }, - { url = "https://files.pythonhosted.org/packages/41/42/b3799a12bafc76d962ad69083f8b43b12bf4fe78b097b12e105d75c9b8f1/kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7cf974dd4e35fa315563ac99d6287a1024e4dc2077b8a7d7cd3d2fb65d283134", size = 1402612, upload-time = "2025-08-10T21:27:25.773Z" }, - { url = "https://files.pythonhosted.org/packages/d2/b5/a210ea073ea1cfaca1bb5c55a62307d8252f531beb364e18aa1e0888b5a0/kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:85bd218b5ecfbee8c8a82e121802dcb519a86044c9c3b2e4aef02fa05c6da370", size = 1453990, upload-time = "2025-08-10T21:27:27.089Z" }, - { url = 
"https://files.pythonhosted.org/packages/5f/ce/a829eb8c033e977d7ea03ed32fb3c1781b4fa0433fbadfff29e39c676f32/kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:0856e241c2d3df4efef7c04a1e46b1936b6120c9bcf36dd216e3acd84bc4fb21", size = 2331601, upload-time = "2025-08-10T21:27:29.343Z" }, - { url = "https://files.pythonhosted.org/packages/e0/4b/b5e97eb142eb9cd0072dacfcdcd31b1c66dc7352b0f7c7255d339c0edf00/kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:9af39d6551f97d31a4deebeac6f45b156f9755ddc59c07b402c148f5dbb6482a", size = 2422041, upload-time = "2025-08-10T21:27:30.754Z" }, - { url = "https://files.pythonhosted.org/packages/40/be/8eb4cd53e1b85ba4edc3a9321666f12b83113a178845593307a3e7891f44/kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:bb4ae2b57fc1d8cbd1cf7b1d9913803681ffa903e7488012be5b76dedf49297f", size = 2594897, upload-time = "2025-08-10T21:27:32.803Z" }, - { url = "https://files.pythonhosted.org/packages/99/dd/841e9a66c4715477ea0abc78da039832fbb09dac5c35c58dc4c41a407b8a/kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:aedff62918805fb62d43a4aa2ecd4482c380dc76cd31bd7c8878588a61bd0369", size = 2391835, upload-time = "2025-08-10T21:27:34.23Z" }, - { url = "https://files.pythonhosted.org/packages/0c/28/4b2e5c47a0da96896fdfdb006340ade064afa1e63675d01ea5ac222b6d52/kiwisolver-1.4.9-cp314-cp314t-win_amd64.whl", hash = "sha256:1fa333e8b2ce4d9660f2cda9c0e1b6bafcfb2457a9d259faa82289e73ec24891", size = 79988, upload-time = "2025-08-10T21:27:35.587Z" }, - { url = "https://files.pythonhosted.org/packages/80/be/3578e8afd18c88cdf9cb4cffde75a96d2be38c5a903f1ed0ceec061bd09e/kiwisolver-1.4.9-cp314-cp314t-win_arm64.whl", hash = "sha256:4a48a2ce79d65d363597ef7b567ce3d14d68783d2b2263d98db3d9477805ba32", size = 70260, upload-time = "2025-08-10T21:27:36.606Z" }, -] - [[package]] name = "langchain" version = "1.2.3" @@ -1388,60 +1198,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/be/2f/5108cb3ee4ba6501748c4908b908e55f42a5b66245b4cfe0c99326e1ef6e/marshmallow-3.26.2-py3-none-any.whl", hash = "sha256:013fa8a3c4c276c24d26d84ce934dc964e2aa794345a0f8c7e5a7191482c8a73", size = 50964, upload-time = "2025-12-22T06:53:51.801Z" }, ] -[[package]] -name = "matplotlib" -version = "3.10.8" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "contourpy" }, - { name = "cycler" }, - { name = "fonttools" }, - { name = "kiwisolver" }, - { name = "numpy" }, - { name = "packaging" }, - { name = "pillow" }, - { name = "pyparsing" }, - { name = "python-dateutil" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/8a/76/d3c6e3a13fe484ebe7718d14e269c9569c4eb0020a968a327acb3b9a8fe6/matplotlib-3.10.8.tar.gz", hash = "sha256:2299372c19d56bcd35cf05a2738308758d32b9eaed2371898d8f5bd33f084aa3", size = 34806269, upload-time = "2025-12-10T22:56:51.155Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9e/67/f997cdcbb514012eb0d10cd2b4b332667997fb5ebe26b8d41d04962fa0e6/matplotlib-3.10.8-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:64fcc24778ca0404ce0cb7b6b77ae1f4c7231cdd60e6778f999ee05cbd581b9a", size = 8260453, upload-time = "2025-12-10T22:55:30.709Z" }, - { url = "https://files.pythonhosted.org/packages/7e/65/07d5f5c7f7c994f12c768708bd2e17a4f01a2b0f44a1c9eccad872433e2e/matplotlib-3.10.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b9a5ca4ac220a0cdd1ba6bcba3608547117d30468fefce49bb26f55c1a3d5c58", size = 8148321, upload-time = "2025-12-10T22:55:33.265Z" }, - { url = "https://files.pythonhosted.org/packages/3e/f3/c5195b1ae57ef85339fd7285dfb603b22c8b4e79114bae5f4f0fcf688677/matplotlib-3.10.8-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3ab4aabc72de4ff77b3ec33a6d78a68227bf1123465887f9905ba79184a1cc04", size = 8716944, upload-time = "2025-12-10T22:55:34.922Z" }, - { url = 
"https://files.pythonhosted.org/packages/00/f9/7638f5cc82ec8a7aa005de48622eecc3ed7c9854b96ba15bd76b7fd27574/matplotlib-3.10.8-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:24d50994d8c5816ddc35411e50a86ab05f575e2530c02752e02538122613371f", size = 9550099, upload-time = "2025-12-10T22:55:36.789Z" }, - { url = "https://files.pythonhosted.org/packages/57/61/78cd5920d35b29fd2a0fe894de8adf672ff52939d2e9b43cb83cd5ce1bc7/matplotlib-3.10.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:99eefd13c0dc3b3c1b4d561c1169e65fe47aab7b8158754d7c084088e2329466", size = 9613040, upload-time = "2025-12-10T22:55:38.715Z" }, - { url = "https://files.pythonhosted.org/packages/30/4e/c10f171b6e2f44d9e3a2b96efa38b1677439d79c99357600a62cc1e9594e/matplotlib-3.10.8-cp312-cp312-win_amd64.whl", hash = "sha256:dd80ecb295460a5d9d260df63c43f4afbdd832d725a531f008dad1664f458adf", size = 8142717, upload-time = "2025-12-10T22:55:41.103Z" }, - { url = "https://files.pythonhosted.org/packages/f1/76/934db220026b5fef85f45d51a738b91dea7d70207581063cd9bd8fafcf74/matplotlib-3.10.8-cp312-cp312-win_arm64.whl", hash = "sha256:3c624e43ed56313651bc18a47f838b60d7b8032ed348911c54906b130b20071b", size = 8012751, upload-time = "2025-12-10T22:55:42.684Z" }, - { url = "https://files.pythonhosted.org/packages/3d/b9/15fd5541ef4f5b9a17eefd379356cf12175fe577424e7b1d80676516031a/matplotlib-3.10.8-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3f2e409836d7f5ac2f1c013110a4d50b9f7edc26328c108915f9075d7d7a91b6", size = 8261076, upload-time = "2025-12-10T22:55:44.648Z" }, - { url = "https://files.pythonhosted.org/packages/8d/a0/2ba3473c1b66b9c74dc7107c67e9008cb1782edbe896d4c899d39ae9cf78/matplotlib-3.10.8-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:56271f3dac49a88d7fca5060f004d9d22b865f743a12a23b1e937a0be4818ee1", size = 8148794, upload-time = "2025-12-10T22:55:46.252Z" }, - { url = 
"https://files.pythonhosted.org/packages/75/97/a471f1c3eb1fd6f6c24a31a5858f443891d5127e63a7788678d14e249aea/matplotlib-3.10.8-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a0a7f52498f72f13d4a25ea70f35f4cb60642b466cbb0a9be951b5bc3f45a486", size = 8718474, upload-time = "2025-12-10T22:55:47.864Z" }, - { url = "https://files.pythonhosted.org/packages/01/be/cd478f4b66f48256f42927d0acbcd63a26a893136456cd079c0cc24fbabf/matplotlib-3.10.8-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:646d95230efb9ca614a7a594d4fcacde0ac61d25e37dd51710b36477594963ce", size = 9549637, upload-time = "2025-12-10T22:55:50.048Z" }, - { url = "https://files.pythonhosted.org/packages/5d/7c/8dc289776eae5109e268c4fb92baf870678dc048a25d4ac903683b86d5bf/matplotlib-3.10.8-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f89c151aab2e2e23cb3fe0acad1e8b82841fd265379c4cecd0f3fcb34c15e0f6", size = 9613678, upload-time = "2025-12-10T22:55:52.21Z" }, - { url = "https://files.pythonhosted.org/packages/64/40/37612487cc8a437d4dd261b32ca21fe2d79510fe74af74e1f42becb1bdb8/matplotlib-3.10.8-cp313-cp313-win_amd64.whl", hash = "sha256:e8ea3e2d4066083e264e75c829078f9e149fa119d27e19acd503de65e0b13149", size = 8142686, upload-time = "2025-12-10T22:55:54.253Z" }, - { url = "https://files.pythonhosted.org/packages/66/52/8d8a8730e968185514680c2a6625943f70269509c3dcfc0dcf7d75928cb8/matplotlib-3.10.8-cp313-cp313-win_arm64.whl", hash = "sha256:c108a1d6fa78a50646029cb6d49808ff0fc1330fda87fa6f6250c6b5369b6645", size = 8012917, upload-time = "2025-12-10T22:55:56.268Z" }, - { url = "https://files.pythonhosted.org/packages/b5/27/51fe26e1062f298af5ef66343d8ef460e090a27fea73036c76c35821df04/matplotlib-3.10.8-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:ad3d9833a64cf48cc4300f2b406c3d0f4f4724a91c0bd5640678a6ba7c102077", size = 8305679, upload-time = "2025-12-10T22:55:57.856Z" }, - { url = 
"https://files.pythonhosted.org/packages/2c/1e/4de865bc591ac8e3062e835f42dd7fe7a93168d519557837f0e37513f629/matplotlib-3.10.8-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:eb3823f11823deade26ce3b9f40dcb4a213da7a670013929f31d5f5ed1055b22", size = 8198336, upload-time = "2025-12-10T22:55:59.371Z" }, - { url = "https://files.pythonhosted.org/packages/c6/cb/2f7b6e75fb4dce87ef91f60cac4f6e34f4c145ab036a22318ec837971300/matplotlib-3.10.8-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d9050fee89a89ed57b4fb2c1bfac9a3d0c57a0d55aed95949eedbc42070fea39", size = 8731653, upload-time = "2025-12-10T22:56:01.032Z" }, - { url = "https://files.pythonhosted.org/packages/46/b3/bd9c57d6ba670a37ab31fb87ec3e8691b947134b201f881665b28cc039ff/matplotlib-3.10.8-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b44d07310e404ba95f8c25aa5536f154c0a8ec473303535949e52eb71d0a1565", size = 9561356, upload-time = "2025-12-10T22:56:02.95Z" }, - { url = "https://files.pythonhosted.org/packages/c0/3d/8b94a481456dfc9dfe6e39e93b5ab376e50998cddfd23f4ae3b431708f16/matplotlib-3.10.8-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0a33deb84c15ede243aead39f77e990469fff93ad1521163305095b77b72ce4a", size = 9614000, upload-time = "2025-12-10T22:56:05.411Z" }, - { url = "https://files.pythonhosted.org/packages/bd/cd/bc06149fe5585ba800b189a6a654a75f1f127e8aab02fd2be10df7fa500c/matplotlib-3.10.8-cp313-cp313t-win_amd64.whl", hash = "sha256:3a48a78d2786784cc2413e57397981fb45c79e968d99656706018d6e62e57958", size = 8220043, upload-time = "2025-12-10T22:56:07.551Z" }, - { url = "https://files.pythonhosted.org/packages/e3/de/b22cf255abec916562cc04eef457c13e58a1990048de0c0c3604d082355e/matplotlib-3.10.8-cp313-cp313t-win_arm64.whl", hash = "sha256:15d30132718972c2c074cd14638c7f4592bd98719e2308bccea40e0538bc0cb5", size = 8062075, upload-time = "2025-12-10T22:56:09.178Z" }, - { url = 
"https://files.pythonhosted.org/packages/3c/43/9c0ff7a2f11615e516c3b058e1e6e8f9614ddeca53faca06da267c48345d/matplotlib-3.10.8-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:b53285e65d4fa4c86399979e956235deb900be5baa7fc1218ea67fbfaeaadd6f", size = 8262481, upload-time = "2025-12-10T22:56:10.885Z" }, - { url = "https://files.pythonhosted.org/packages/6f/ca/e8ae28649fcdf039fda5ef554b40a95f50592a3c47e6f7270c9561c12b07/matplotlib-3.10.8-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:32f8dce744be5569bebe789e46727946041199030db8aeb2954d26013a0eb26b", size = 8151473, upload-time = "2025-12-10T22:56:12.377Z" }, - { url = "https://files.pythonhosted.org/packages/f1/6f/009d129ae70b75e88cbe7e503a12a4c0670e08ed748a902c2568909e9eb5/matplotlib-3.10.8-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4cf267add95b1c88300d96ca837833d4112756045364f5c734a2276038dae27d", size = 9553896, upload-time = "2025-12-10T22:56:14.432Z" }, - { url = "https://files.pythonhosted.org/packages/f5/26/4221a741eb97967bc1fd5e4c52b9aa5a91b2f4ec05b59f6def4d820f9df9/matplotlib-3.10.8-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2cf5bd12cecf46908f286d7838b2abc6c91cda506c0445b8223a7c19a00df008", size = 9824193, upload-time = "2025-12-10T22:56:16.29Z" }, - { url = "https://files.pythonhosted.org/packages/1f/f3/3abf75f38605772cf48a9daf5821cd4f563472f38b4b828c6fba6fa6d06e/matplotlib-3.10.8-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:41703cc95688f2516b480f7f339d8851a6035f18e100ee6a32bc0b8536a12a9c", size = 9615444, upload-time = "2025-12-10T22:56:18.155Z" }, - { url = "https://files.pythonhosted.org/packages/93/a5/de89ac80f10b8dc615807ee1133cd99ac74082581196d4d9590bea10690d/matplotlib-3.10.8-cp314-cp314-win_amd64.whl", hash = "sha256:83d282364ea9f3e52363da262ce32a09dfe241e4080dcedda3c0db059d3c1f11", size = 8272719, upload-time = "2025-12-10T22:56:20.366Z" }, - { url = 
"https://files.pythonhosted.org/packages/69/ce/b006495c19ccc0a137b48083168a37bd056392dee02f87dba0472f2797fe/matplotlib-3.10.8-cp314-cp314-win_arm64.whl", hash = "sha256:2c1998e92cd5999e295a731bcb2911c75f597d937341f3030cc24ef2733d78a8", size = 8144205, upload-time = "2025-12-10T22:56:22.239Z" }, - { url = "https://files.pythonhosted.org/packages/68/d9/b31116a3a855bd313c6fcdb7226926d59b041f26061c6c5b1be66a08c826/matplotlib-3.10.8-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:b5a2b97dbdc7d4f353ebf343744f1d1f1cca8aa8bfddb4262fcf4306c3761d50", size = 8305785, upload-time = "2025-12-10T22:56:24.218Z" }, - { url = "https://files.pythonhosted.org/packages/1e/90/6effe8103f0272685767ba5f094f453784057072f49b393e3ea178fe70a5/matplotlib-3.10.8-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:3f5c3e4da343bba819f0234186b9004faba952cc420fbc522dc4e103c1985908", size = 8198361, upload-time = "2025-12-10T22:56:26.787Z" }, - { url = "https://files.pythonhosted.org/packages/d7/65/a73188711bea603615fc0baecca1061429ac16940e2385433cc778a9d8e7/matplotlib-3.10.8-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5f62550b9a30afde8c1c3ae450e5eb547d579dd69b25c2fc7a1c67f934c1717a", size = 9561357, upload-time = "2025-12-10T22:56:28.953Z" }, - { url = "https://files.pythonhosted.org/packages/f4/3d/b5c5d5d5be8ce63292567f0e2c43dde9953d3ed86ac2de0a72e93c8f07a1/matplotlib-3.10.8-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:495672de149445ec1b772ff2c9ede9b769e3cb4f0d0aa7fa730d7f59e2d4e1c1", size = 9823610, upload-time = "2025-12-10T22:56:31.455Z" }, - { url = "https://files.pythonhosted.org/packages/4d/4b/e7beb6bbd49f6bae727a12b270a2654d13c397576d25bd6786e47033300f/matplotlib-3.10.8-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:595ba4d8fe983b88f0eec8c26a241e16d6376fe1979086232f481f8f3f67494c", size = 9614011, upload-time = "2025-12-10T22:56:33.85Z" }, - { url = 
"https://files.pythonhosted.org/packages/7c/e6/76f2813d31f032e65f6f797e3f2f6e4aab95b65015924b1c51370395c28a/matplotlib-3.10.8-cp314-cp314t-win_amd64.whl", hash = "sha256:25d380fe8b1dc32cf8f0b1b448470a77afb195438bafdf1d858bfb876f3edf7b", size = 8362801, upload-time = "2025-12-10T22:56:36.107Z" }, - { url = "https://files.pythonhosted.org/packages/5d/49/d651878698a0b67f23aa28e17f45a6d6dd3d3f933fa29087fa4ce5947b5a/matplotlib-3.10.8-cp314-cp314t-win_arm64.whl", hash = "sha256:113bb52413ea508ce954a02c10ffd0d565f9c3bc7f2eddc27dfe1731e71c7b5f", size = 8192560, upload-time = "2025-12-10T22:56:38.008Z" }, -] - [[package]] name = "multidict" version = "6.7.0" @@ -2193,15 +1949,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c6/96/fd59c1532891762ea4815e73956c532053d5e26d56969e1e5d1e4ca4b207/pymupdf-1.26.5-cp39-abi3-win_amd64.whl", hash = "sha256:39a6fb58182b27b51ea8150a0cd2e4ee7e0cf71e9d6723978f28699b42ee61ae", size = 18747258, upload-time = "2025-10-10T14:01:37.346Z" }, ] -[[package]] -name = "pyparsing" -version = "3.3.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/33/c1/1d9de9aeaa1b89b0186e5fe23294ff6517fce1bc69149185577cd31016b2/pyparsing-3.3.1.tar.gz", hash = "sha256:47fad0f17ac1e2cad3de3b458570fbc9b03560aa029ed5e16ee5554da9a2251c", size = 1550512, upload-time = "2025-12-23T03:14:04.391Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8b/40/2614036cdd416452f5bf98ec037f38a1afb17f327cb8e6b652d4729e0af8/pyparsing-3.3.1-py3-none-any.whl", hash = "sha256:023b5e7e5520ad96642e2c6db4cb683d3970bd640cdf7115049a6e9c3682df82", size = 121793, upload-time = "2025-12-23T03:14:02.103Z" }, -] - [[package]] name = "pypdfium2" version = "5.3.0"