diff --git a/DocParser/TexSoup/app/conversion.py b/DocParser/TexSoup/app/conversion.py index 474c228..3ffe746 100644 --- a/DocParser/TexSoup/app/conversion.py +++ b/DocParser/TexSoup/app/conversion.py @@ -1,11 +1,11 @@ import re -from DocParser.TexSoup.TexSoup import TexSoup -from DocParser.TexSoup.TexSoup.data import TexEnv, TexText, TexCmd, TexGroup +from TexSoup.TexSoup import TexSoup +from TexSoup.TexSoup.data import TexEnv, TexText, TexCmd, TexGroup -from DocParser.vrdu import logger -from DocParser.vrdu.config import envs +from vrdu import logger +from vrdu.config import envs log = logger.get_logger(__name__) diff --git a/DocParser/main.py b/DocParser/main.py index 2abedd8..09c6c34 100644 --- a/DocParser/main.py +++ b/DocParser/main.py @@ -3,6 +3,7 @@ import os import shutil from tqdm import tqdm +import re from vrdu import logger @@ -58,6 +59,10 @@ def remove_redundant_stuff(main_directory: str) -> None: for file in redundant_files: os.remove(file) + redundant_bib_files = glob.glob(f"{main_directory}/bib_*") + for file in redundant_bib_files: + os.remove(file) + # remove useless pdf and image files # TODO: move this name pattern into config redundant_folders = glob.glob( @@ -86,6 +91,17 @@ def process_one_file(file_name: str) -> None: main_directory = os.path.dirname(file_name) log.info(f"[VRDU] file: {file_name}, start processing.") + # use unsrt as the default bibliography style + with open(file_name, "r") as file: + content = file.read() + # if cant find bibliographystyle, add it + if not re.search(r"\\bibliographystyle", content): + content = re.sub(r"\\end{document}", "\\\\bibliographystyle{unsrt}\n\\\\end{document}", content) + else: + content = re.sub(r"\\bibliographystyle\s*{\s*([^}]+)\s*}", "\\\\bibliographystyle{unsrt}", content) + with open(file_name, "w") as file: + file.write(content) + # check if this paper has been processed quality_report_file = os.path.join( main_directory, "output/result/quality_report.json" @@ -103,17 +119,26 @@ def process_one_file(file_name: str) -> None: if os.path.exists(output_directory): shutil.rmtree(output_directory) - # output_directory stores the intermediate results - # result_directory stores the final results - os.makedirs(os.path.join(main_directory, "output/result")) - cwd = os.getcwd() try: - # change the working directory to the main directory of the paper + # # change the working directory to the main directory of the paper + # os.chdir(main_directory) + # # create output folder + # os.makedirs(os.path.join(main_directory, "output/result")) + + # Save current working directory + cwd = os.getcwd() + + # Change the working directory to the main directory of the paper os.chdir(main_directory) - # create output folder - os.makedirs(os.path.join(main_directory, "output/result")) + + # Create output folder if it doesn't exist + output_folder = os.path.join(main_directory, "output/result") + if not os.path.exists(output_folder): + os.makedirs(output_folder) + else: + print(f"Output folder '{output_folder}' already exists.") # step 1: preprocess the paper preprocess.run(original_tex) @@ -144,11 +169,12 @@ def process_one_file(file_name: str) -> None: log.info(f"[VRDU] file: {original_tex}, successfully processed.") except Exception as e: - error_type = e.__class__.__name__ - error_info = str(e) - log.error( - f"[VRDU] file: {file_name}, type: {error_type}, message: {error_info}" - ) + # error_type = e.__class__.__name__ + # error_info = str(e) + # log.error( + # f"[VRDU] file: {file_name}, type: {error_type}, message: {error_info}" + # ) 
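Review note: `process_one_file` above now forces the `unsrt` bibliography style before compilation. A self-contained sketch of that normalization, for reference only — `force_unsrt_style` is an illustrative name, not part of this diff:

```python
import re

def force_unsrt_style(content: str) -> str:
    """Force the unsrt bibliography style, adding the command if it is missing."""
    if not re.search(r"\\bibliographystyle", content):
        # No style declared: insert one just before \end{document}.
        return re.sub(r"\\end{document}",
                      r"\\bibliographystyle{unsrt}" + "\n" + r"\\end{document}",
                      content)
    # Otherwise normalize whatever style the paper declared to unsrt.
    return re.sub(r"\\bibliographystyle\s*{\s*([^}]+)\s*}",
                  r"\\bibliographystyle{unsrt}", content)

print(force_unsrt_style(r"\bibliographystyle{ plain }" + "\n" + r"\end{document}"))
# \bibliographystyle{unsrt}
# \end{document}
```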
+ raise e finally: # remove redundant files diff --git a/DocParser/vrdu/config/config.py b/DocParser/vrdu/config/config.py index 081f4ec..9028fba 100644 --- a/DocParser/vrdu/config/config.py +++ b/DocParser/vrdu/config/config.py @@ -79,6 +79,7 @@ "Equation", "Footnote", "List", + "Reference" ] diff --git a/DocParser/vrdu/layout_annotation.py b/DocParser/vrdu/layout_annotation.py index 31b7a1f..6fbb8c2 100644 --- a/DocParser/vrdu/layout_annotation.py +++ b/DocParser/vrdu/layout_annotation.py @@ -278,97 +278,107 @@ def generate_non_figure_bb(self) -> Dict[int, List[Block]]: log.debug(f"category: {category}, index: {index}") elements = [] - for image_pair in image_pairs: - page_index = image_pair[0] + try: + for image_pair in image_pairs: + page_index = image_pair[0] - image1_array = np.array(plt.imread(image_pair[1]), dtype=np.uint8) - image2_array = np.array(plt.imread(image_pair[2]), dtype=np.uint8) + image1_array = np.array(plt.imread(image_pair[1]), dtype=np.uint8) + image2_array = np.array(plt.imread(image_pair[2]), dtype=np.uint8) - diff_image = np.abs(image2_array - image1_array, dtype=np.uint8) - if np.all(diff_image == 0): - continue - labeled_image, num = label( - diff_image > config.threshold, return_num=True - ) - if num == 0: - continue - - regions = regionprops(labeled_image) - bounding_boxes = [region.bbox for region in regions] - - if len(bounding_boxes) == 0: - continue - - separations = self.layout_metadata[page_index]["separations"] - top_margin = self.layout_metadata[page_index]["top_margin"] - - # We do not consider the cross column case for these envs. - if category in envs.one_column_envs: - bounding_boxes = [bb for bb in bounding_boxes] - if len(bounding_boxes) == 0: + diff_image = np.abs(image2_array - image1_array, dtype=np.uint8) + if np.all(diff_image == 0): continue - element = Block( - bounding_box=BoundingBox.from_list(bounding_boxes), - source_code=self.text_info[category][index], - category=config.name2category[category], - page_index=page_index, + labeled_image, num = label( + diff_image > config.threshold, return_num=True ) - if elements: - element.parent_block = elements[-1].block_id - elements.append(element) - continue + if num == 0: + continue + + regions = regionprops(labeled_image) + bounding_boxes = [region.bbox for region in regions] - # consider possible cross column case - for column in range(self.layout_metadata["num_columns"]): - # min_x: bb[1], min_y: bb[0], max_x: bb[4], max_y: bb[3] - column_boxes = [ - bb - for bb in bounding_boxes - if bb[1] >= separations[column] - and bb[1] <= separations[column + 1] - ] - if not column_boxes: + if len(bounding_boxes) == 0: continue - element = Block( - bounding_box=BoundingBox.from_list(column_boxes), - source_code=self.text_info[category][index], - category=config.name2category[category], - page_index=page_index, - ) - if elements: - element.parent_block = elements[-1].block_id - - if ( - len(elements) > 0 - and elements[-1].category == element.category - and elements[-1].page_index == element.page_index - and elements[-1].source_code == element.source_code - and elements[-1].bbox.overlap(element.bbox) - ): - elements[-1].bbox = BoundingBox( - min( - elements[-1].bbox.x0, - element.bbox.x0, - ), - min( - elements[-1].bbox.y0, - element.bbox.y0, - ), - max( - elements[-1].bbox.x1, - element.bbox.x1, - ), - max( - elements[-1].bbox.y1, - element.bbox.y1, - ), + separations = self.layout_metadata[page_index]["separations"] + top_margin = self.layout_metadata[page_index]["top_margin"] + + # We do not 
consider the cross column case for these envs. + if category in envs.one_column_envs: + bounding_boxes = [bb for bb in bounding_boxes] + if len(bounding_boxes) == 0: + continue + element = Block( + bounding_box=BoundingBox.from_list(bounding_boxes), + source_code=self.text_info[category][index], + category=config.name2category[category], + page_index=page_index, ) + if elements: + element.parent_block = elements[-1].block_id + elements.append(element) continue - elements.append(element) - for element in elements: - layout_info[element.page_index].append(element) + # consider possible cross column case + for column in range(self.layout_metadata["num_columns"]): + try: + column_boxes = [ + bb + for bb in bounding_boxes + if bb[1] >= separations[column] + and bb[1] <= separations[column + 1] + ] + if not column_boxes: + continue + + element = Block( + bounding_box=BoundingBox.from_list(column_boxes), + source_code=self.text_info[category][index], + category=config.name2category[category], + page_index=page_index, + ) + if elements: + element.parent_block = elements[-1].block_id + + if ( + len(elements) > 0 + and elements[-1].category == element.category + and elements[-1].page_index == element.page_index + and elements[-1].source_code == element.source_code + and elements[-1].bbox.overlap(element.bbox) + ): + elements[-1].bbox = BoundingBox( + min( + elements[-1].bbox.x0, + element.bbox.x0, + ), + min( + elements[-1].bbox.y0, + element.bbox.y0, + ), + max( + elements[-1].bbox.x1, + element.bbox.x1, + ), + max( + elements[-1].bbox.y1, + element.bbox.y1, + ), + ) + continue + elements.append(element) + except IndexError: + log.error(f"IndexError: {column}") + continue # Skip processing for this column if index is out of range + + for element in elements: + layout_info[element.page_index].append(element) + + except Exception as e: + # Handle the exception as per your application's requirements + log.error(f"Error processing block directory {block_directory}: {str(e)}") + # Optionally, you can raise the exception to stop further processing + # raise return layout_info @@ -399,66 +409,6 @@ def generate_layout_info(self) -> Dict[int, List[Block]]: layout_info[page_index].extend(figure_layout_info[page_index]) return layout_info - def generate_reading_annotation( - self, layout_info: Dict[int, List[Block]] - ) -> DefaultDict[str, List]: - """Generate a reading annotation based on the layout information. - - Args: - layout_info (Dict[int, List[Block]]): A dictionary containing the layout information - for each page index. The keys are the page indices and the values are lists of - `Block` objects representing the blocks on each page. - - Returns: - DefaultDict[str, List]: A defaultdict containing the reading annotation. The keys - of the defaultdict are the page indices and the values are lists of dictionaries - representing the reading annotation for each block on the page. Each dictionary - contains the following keys: - - "source_code": The source code of the block. - - "image_path": The path to the saved image of the block. - - "category": The category of the block. - - The defaultdict also contains the following keys: - - "categories": A list of dictionaries representing the categories. Each - dictionary contains the following keys: - - "id": The ID of the category. - - "name": The name of the category. - - "macros": A dictionary containing the macro definitions extracted from - the original tex file. 
- """ - reading_annotation = defaultdict(list) - - # sort all images by page index, see utils.pdf2jpg for details - image_files = sorted( - glob.glob(os.path.join(self.pdf_images_path, "*.jpg")), - key=lambda x: x[-6:-4], - ) - count = 0 - for page_index in layout_info.keys(): - page_image = Image.open(image_files[page_index]) - for block in layout_info[page_index]: - cropped_image = page_image.crop(block.bbox) - - image_name = config.folder_prefix + str(count).zfill(4) + ".jpg" - count += 1 - image_path = os.path.join(self.result_directory, image_name) - cropped_image.save(image_path) - reading_annotation[page_index].append( - { - "source_code": block.source_code, - "image_path": image_name, - "category": block.category, - } - ) - page_image.close() - - reading_annotation["categories"] = [ - {"id": index, "name": category} - for index, category, _ in config.config["category_name"] - ] - - return reading_annotation - def generate_image_annotation( self, layout_info: Dict[int, List[Block]] ) -> Dict[int, Dict[str, Any]]: @@ -536,13 +486,6 @@ def annotate(self): layout_info, image_annotation, file_path=layout_annotation_file ) - # step3: generate reading annotation - reading_annotation = self.generate_reading_annotation(layout_info) - reading_annotation_file = os.path.join( - self.result_directory, "reading_annotation.json" - ) - utils.export_to_json(reading_annotation, reading_annotation_file) - def get_image_pairs(dir1: str, dir2: str): """ diff --git a/DocParser/vrdu/preprocess.py b/DocParser/vrdu/preprocess.py index f4f4003..9e825bf 100644 --- a/DocParser/vrdu/preprocess.py +++ b/DocParser/vrdu/preprocess.py @@ -1,5 +1,6 @@ import os import re +from typing import Dict from arxiv_cleaner.cleaner import Cleaner @@ -31,6 +32,25 @@ def remove_comments(original_tex: str) -> None: with open(original_tex, "w") as file: file.write(removed_comments) +def remove_skip(original_tex: str) -> None: + """ + Removes skip from a TeX file. + + Args: + original_tex (str): The path to the original TeX file. + + Returns: + None + """ + with open(original_tex, "r") as file: + content = file.read() + + pattern = r"\\vskip .*|\\vspace{.*}|\\vglue .*" + removed_skip = re.sub(pattern, '', content) + + with open(original_tex, "w") as file: + file.write(removed_skip) + def clean_tex(original_tex: str) -> None: """ @@ -60,10 +80,13 @@ def clean_tex(original_tex: str) -> None: # remove comments remove_comments(original_tex) + # remove skip + remove_skip(original_tex) -def replace_pdf_ps_figures_with_png(original_tex: str) -> None: + +def replace_non_png_jpg_figures(original_tex: str) -> None: """ - Replaces PDF, ps, eps figures with PNG figures in a TeX file + Replaces PDF, ps, eps figures' extension with PNG in a TeX file to support pdfminer detecting bounding box. Args: @@ -71,79 +94,80 @@ def replace_pdf_ps_figures_with_png(original_tex: str) -> None: Returns: None: This function does not return anything. - - Raises: - FileNotFoundError: If a PDF file specified in the TeX file is not found. """ - - # FIXME: use more robust way, since the path to images may not exists. 
main_directory = os.path.dirname(original_tex) - with open(original_tex) as f: + image_extensions = [".eps", ".ps", ".jpg", ".jpeg", ".png", ".pdf"] + image_files = {} + for root, _, files in os.walk(main_directory): + for file in files: + if any(file.endswith(ext) for ext in image_extensions): + image_name, ext = os.path.splitext(file) + # Store the relative path of the image as the value + image_files[image_name] = os.path.relpath( + os.path.join(root, file), main_directory + ) + + replace_figures_in_tex_files(original_tex, image_files) + replace_figures_in_folders(image_files) + + +def replace_figures_in_tex_files( + original_tex: str, image_files: Dict[str, str] +) -> None: + with open(original_tex, "r") as f: content = f.read() - graphicspath_pattern = r"\\graphicspath\{\{(.+?)}" - match = re.search(graphicspath_pattern, content, re.DOTALL) - if match: - graphic_path = match.group(1) - else: - graphic_path = "" - - # Replace \psfig{...} with \includegraphics{...} - content = re.sub(r"\\psfig{([^}]*)}", r"\\includegraphics{\1}", content) - - # Replace \epsfig{...} with \includegraphics{...} - content = re.sub(r"\\epsfig{([^}]*)}", r"\\includegraphics{\1}", content) - - # Regular expression pattern to match \includegraphics - # commands with PDF files - pattern = r"\\includegraphics(\[.*?\])?\{(.*?)\}" - - # Find all matches of \includegraphics with PDF files - matches = re.findall(pattern, content) - - # Replace PDF paths with PNG paths - ext_patterns = [".eps", ".ps", ".jpg", ".jpeg", ".png", ".pdf"] - for match in matches: - image_name = match[1] - if not any(ext in image_name for ext in ext_patterns): - for ext in ext_patterns: - image_file = os.path.join(main_directory, graphic_path, image_name, ext) - if os.path.exists(image_file): - image_name = image_name + ext - break - - # detectable image type, see pdfminer.six for details - if any(ext in image_name for ext in [".jpg", ".jpeg", "png"]): - content = content.replace(match[1], image_name) - continue - - # convert eps to pdf - if any(ext in image_name for ext in [".eps", ".ps"]): - eps_image = os.path.join(main_directory, graphic_path, image_name) - if not os.path.exists(eps_image): - log.error(f"File not found: {eps_image}") - continue - pdf_image = os.path.splitext(eps_image)[0] + ".pdf" - utils.convert_eps_image_to_pdf_image(eps_image, pdf_image) - image_name = os.path.basename(pdf_image) - - # convert pdf to png - if image_name.endswith(".pdf"): - pdf_image = os.path.join(main_directory, graphic_path, image_name) - if not os.path.exists(pdf_image): - log.error(f"File not found: {pdf_image}") - continue - png_image = os.path.splitext(pdf_image)[0] + ".png" - utils.convert_pdf_figure_to_png_image(pdf_image, png_image) - image_name = os.path.splitext(image_name)[0] + ".png" - - # replace the reference in tex file - content = content.replace(match[1], image_name) - + # Replace \psfig and \epsfig commands with \includegraphics command + def custom_replace(match): + options = match.group(1) or "" + filepath = match.group(2) + if options: + return f"\\includegraphics[{options}]{{{filepath}}}" + else: + return f"\\includegraphics{{{filepath}}}" + + content = re.sub(r"\\psfig(?:\[(.*?)\])?{(.+?)}", custom_replace, content) + content = re.sub(r"\\epsfig(?:\[(.*?)\])?{(.+?)}", custom_replace, content) + + # Traverse the image_files dictionary to update file extensions + for image_name, file_path in image_files.items(): + base_name, current_extension = os.path.splitext(image_name) + correct_extension = os.path.splitext(file_path)[1] + + 
if correct_extension not in [".jpg", ".jpeg"]: + correct_extension = ".png" + + # Build a regular expression to match image files including optional extensions + pattern = re.compile( + r"(\\includegraphics(?:\[[^\]]*\])?\{.*?" + + re.escape(base_name) + + r")(\.\w+)?\}" + ) + replacement = rf"\1{correct_extension}}}" + content = pattern.sub(replacement, content) + + # Write the updated content back to the file with open(original_tex, "w") as f: f.write(content) +def replace_figures_in_folders(image_files: Dict[str, str]) -> None: + for image_name, file_path in image_files.items(): + if file_path.endswith(".eps") or file_path.endswith(".ps"): + output_png = os.path.join(os.path.dirname(file_path), image_name + ".png") + temp_pdf = os.path.join(os.path.dirname(file_path), image_name + ".pdf") + # convert eps to pdf + utils.convert_eps_image_to_pdf_image(file_path, temp_pdf) + # convert pdf to png + utils.convert_pdf_figure_to_png_image(temp_pdf, output_png) + # remove redundant files + os.remove(temp_pdf) + elif file_path.endswith(".pdf"): + output_png = os.path.join(os.path.dirname(file_path), image_name + ".png") + # convert pdf to png + utils.convert_pdf_figure_to_png_image(file_path, output_png) + + def delete_table_of_contents(original_tex: str) -> None: """ Deletes the table of contents from the given original_tex file. @@ -183,8 +207,8 @@ def run(original_tex: str) -> None: # Step 0: clean tex clean_tex(original_tex) - # Step 2: process images - replace_pdf_ps_figures_with_png(original_tex) + # Step 1: process images + replace_non_png_jpg_figures(original_tex) - # Step 3: delete table of contents + # Step 2: delete table of contents delete_table_of_contents(original_tex) diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py index 5737bc7..88c26d9 100644 --- a/DocParser/vrdu/renderer.py +++ b/DocParser/vrdu/renderer.py @@ -5,16 +5,15 @@ import re -import DocParser.vrdu.utils as utils -import DocParser.vrdu.logger as logger -from DocParser.vrdu.config import config, envs +import vrdu.utils as utils +import vrdu.logger as logger +from vrdu.config import config, envs -from DocParser.TexSoup.TexSoup import TexSoup -import DocParser.TexSoup.app.conversion as conversion +from TexSoup.TexSoup import TexSoup +import TexSoup.app.conversion as conversion log = logger.get_logger(__name__) - class Renderer: def __init__(self) -> None: self.texts = defaultdict(list) @@ -74,6 +73,7 @@ def render_all_env(self, color_tex: str) -> None: """ self.render_simple_envs(color_tex) self.render_float_envs(color_tex) + self.render_reference(color_tex) def render_simple_envs(self, color_tex: str) -> None: """Renders simple environments in a LaTeX file. @@ -411,6 +411,7 @@ def modify_color_definitions(self, input_file: str, output_file: str) -> None: r"\\definecolor{" + color_name + r"}{RGB}{255, 255, 255}", content, ) + content = re.sub(r"\\bibliography\s*{\s*([^}]+)\s*}", "\\\\bibliography{bib_white}", content) with open(output_file, "w") as file: file.write(content) @@ -435,6 +436,27 @@ def get_env_orders(self, tex_file: str) -> List[str]: # the definitions are discarded return matches[len(colors) :] + + def get_bib_env_orders(self, tex_file: str) -> List[str]: + """Returns a list of environment orders based on the contents of the given `tex_file`. + + Args: + tex_file (str): The path to the .tex file. + + Returns: + List[str]: A list of environment orders. 
+ """ + with open(tex_file) as f: + contents = f.read() + colors = list(config.name2color.values()) + matches = [] + + pattern = "|".join(rf"\b{re.escape(term)}\b" for term in colors) + for m in re.finditer(pattern, contents): + matches.append(m.group(0)) + + # the definitions are discarded + return matches def render_one_env(self, main_directory: str) -> None: """Render one environment by modifying the corresponding rendering color to black. @@ -445,6 +467,7 @@ def render_one_env(self, main_directory: str) -> None: Returns: None: This function does not return anything. """ + # handle latex file color_tex_file = os.path.join(main_directory, "paper_colored.tex") white_tex_file = os.path.join(main_directory, "paper_white.tex") self.modify_color_definitions(color_tex_file, white_tex_file) @@ -470,6 +493,51 @@ def render_one_env(self, main_directory: str) -> None: with open(output_file, "w") as f: f.write(new_content) + # handle bib file + paper_bib_white = os.path.join(main_directory, "paper_bib_white.tex") + shutil.copyfile(white_tex_file, paper_bib_white) + color_bib_file = os.path.join(main_directory, "bib_colored.bib") + white_bib_file = os.path.join(main_directory, "bib_white.bib") + self.modify_color_definitions(color_bib_file, white_bib_file) + ordered_env_colors = self.get_bib_env_orders(white_bib_file) + # print(ordered_env_colors) + + with open(white_bib_file, "r") as f: + bib_content = f.read() + + index_map = defaultdict(int) + + for index, env_color in enumerate(ordered_env_colors): + env = env_color[: -len(suffix)] + bib_new_content = replace_nth( + bib_content, "{" + env_color + "}", r"{black}", index_map[env] + 1 + ) + bib_new_content = bib_new_content.replace("{" + env_color + "}", "{white}") + bib_output_file = os.path.join( + main_directory, + f"bib_{config.folder_prefix}_{str(index).zfill(5)}_{env}_{str(index_map[env]).zfill(5)}.bib", + ) + + # change the bib file name in paper_bib_white.tex + with open(paper_bib_white, "r") as f: + tex_content = f.read() + + bib_file_name = os.path.basename(bib_output_file).split(".")[0] + tex_new_content = re.sub(r"\\bibliography\s*{\s*([^}]+)\s*}", "\\\\bibliography{{{}}}".format(bib_file_name), tex_content) + + tex_output_file = os.path.join( + main_directory, + f"paper_{config.folder_prefix}_{str(index).zfill(5)}_{env}_{str(index_map[env]).zfill(5)}.tex", + ) + + index_map[env] += 1 + with open(bib_output_file, "w") as f: + f.write(bib_new_content) + + with open(tex_output_file, "w", encoding='utf-8') as f: + f.write(tex_new_content) + + def render_caption(self, tex_file: str) -> None: """Renders captions in a LaTeX file. @@ -616,6 +684,110 @@ def render_abstract(self, tex_file: str) -> None: with open(tex_file, "w") as f: f.write(result) + def render_reference(self, tex_file: str) -> None: + """ + Renders the reference section based on a BibTeX (.bib) file. + + Args: + tex_file (str): The path to the LaTex file. 
+ + Returns: + None + """ + bib_pattern = r'\\bibliography\s*{\s*([^}]+)\s*}' + # Extract directory and filename from LaTeX file path + tex_dir, tex_filename = os.path.split(tex_file) + + # Extract BibTeX file path from LaTeX file + bib_file = None + with open(tex_file, 'r', encoding='utf-8') as tex_f: + tex_content = tex_f.read() + + # Search for bibliography command + match = re.search(bib_pattern, tex_content) + if match: + bib_filename = match.group(1) + '.bib' + bib_file = os.path.join(tex_dir, bib_filename) + + if not bib_file: + print("BibTeX file not found in the LaTeX file.") + return + main_directory = os.path.dirname(tex_file) + + # copy the original tex file + color_bib = os.path.join(main_directory, "bib_colored.bib") + white_bib = os.path.join(main_directory, "bib_white.bib") + shutil.copyfile(bib_file, color_bib) + shutil.copyfile(bib_file, white_bib) + + # Define colorize function inline + def colorize(text: str, category_name: str) -> str: + if category_name == "Reference": + # Define regex patterns + author_pattern = re.compile(r"\bauthor\s*=\s*[\{\"]") + year_pattern = re.compile(r"\byear\s*=\s*[\{\"]") + + # Find the position of the author and year + author_match = author_pattern.search(text) + if author_match: + # Find the start of the author field + author_start = author_match.end() - 1 + author_end = text.find("}", author_start) + author_mid = text.find(",", author_start) + if author_end == -1: + author_end = text.find("\"", author_start) + if author_end == -1: + author_end = text.find("\"", author_start) + 1 + # Replace author field with colorized version + if author_end != -1: + if author_mid != -1 and author_mid < author_end: + text = text[:author_mid] + ",\\color{Reference_color}" + text[author_mid + 1:] + else: + text = text[:author_start + 1] + "\\color{Reference_color}" + text[author_start + 1:] + + year_match = year_pattern.search(text) + if year_match: + # Find the start of the year field + year_start = year_match.end() - 1 + year_end_1 = text.find("\"", year_start + 1) + year_end_2 = text.find("}", year_start + 1) + # find the before year_end + if year_end_1 != -1 and year_end_2 != -1: + year_end = min(year_end_1, year_end_2) + else: + year_end = max(year_end_1, year_end_2) + # Replace year field with black color + if year_end != -1: + text = text[:year_end] + "\\color{white}" + text[year_end:] + + return text + + with open(white_bib, 'r') as bib_file: + bib_content = bib_file.read() + + # use bibtexparser to parse the bib file + bib_entries = re.findall(r'@.*?\{([^,]*),\n(.*?)[\n, \"]\}', bib_content, re.DOTALL) + for item in bib_entries: + self.texts["Reference"].append(item) + + # Read BibTeX file + with open(color_bib, 'r', encoding='utf-8') as bib_f: + bibtex_entries = bib_f.readlines() + + # Colorize and format references in LaTeX format + colored_references = [] + for entry in bibtex_entries: + if entry.strip().startswith('@'): + formatted_entry = f"{entry.strip()}" + else: + formatted_entry = f" {entry.strip()}" + colored_ref = colorize(formatted_entry, "Reference") + colored_references.append(colored_ref) + # Write back to the BibTeX file + with open(color_bib, 'w', encoding='utf-8') as bib_f: + for ref in colored_references: + bib_f.write(ref + "\n") + def render_tabular(self, tex_file: str) -> None: """Renders tabular environments in a LaTeX file. 
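Review note: the new `get_bib_env_orders` scans the whitened .bib file for color tokens in document order. A self-contained version of that scan, decoupled from `config` — the sample entry and names are illustrative only:

```python
import re

def find_color_tokens(contents: str, color_names):
    """Return every occurrence of the given color names, in document order."""
    pattern = "|".join(rf"\b{re.escape(name)}\b" for name in color_names)
    return [m.group(0) for m in re.finditer(pattern, contents)]

bib = "@article{a, author = {\\color{Reference_color} Doe}, year = {\\color{white}1999}}"
print(find_color_tokens(bib, ["Reference_color", "white"]))
# ['Reference_color', 'white']
```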
diff --git a/DocParser/vrdu/utils.py b/DocParser/vrdu/utils.py index be6fa51..a4d1e9f 100755 --- a/DocParser/vrdu/utils.py +++ b/DocParser/vrdu/utils.py @@ -7,8 +7,8 @@ from pdf2image import pdf2image from pdf2image import generators -from DocParser.vrdu.block import Block -from DocParser.vrdu.config import config +from vrdu.block import Block +from vrdu.config import config def export_to_json(data: Union[Dict, List], file_path: str) -> None: @@ -40,7 +40,7 @@ def load_json(file_path: str) -> Union[Dict, List]: def compile_latex(file: str) -> None: """ - Compile a LaTeX file using pdflatex engine. + Compile a LaTeX file using pdflatex and bibtex engines. Parameters: file (str): The path to the LaTeX file to be compiled. @@ -49,19 +49,38 @@ def compile_latex(file: str) -> None: None """ file_name = os.path.basename(file) + base_name, _ = os.path.splitext(file_name) + # First compilation with SyncTeX subprocess.run( ["pdflatex", "-interaction=nonstopmode", file_name], timeout=1000, stdout=subprocess.DEVNULL, ) + # Compile BibTeX if .aux file exists + if os.path.exists(base_name + ".aux"): + subprocess.run( + ["bibtex", base_name], + timeout=1000, + stdout=subprocess.DEVNULL, + ) + + # Second compilation to include bibliography subprocess.run( ["pdflatex", "-interaction=nonstopmode", file_name], timeout=1000, stdout=subprocess.DEVNULL, ) + # Third compilation to finalize references and SyncTeX + subprocess.run( + ["pdflatex", "-interaction=nonstopmode", file_name], + timeout=1000, + stdout=subprocess.DEVNULL, + ) + + # Additional compilation for specific file if file_name == "paper_colored.tex": subprocess.run( ["pdflatex", "-interaction=nonstopmode", "-synctex=1", file_name], @@ -69,7 +88,6 @@ def compile_latex(file: str) -> None: stdout=subprocess.DEVNULL, ) - def pdf2jpg(pdf_path: str, output_directory: str) -> None: """ Convert a PDF file into a series of jpg images. 
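For reference, `compile_latex` above now follows the classic multi-pass sequence. A stripped-down sketch of the same sequence with comments on why each pass exists (simplified: no SyncTeX pass, return codes are not checked, and, as in the original, it assumes the caller has already changed into the paper's directory):

```python
import os
import subprocess

def compile_with_bibliography(tex_path: str, timeout: int = 1000) -> None:
    """pdflatex / bibtex / pdflatex x2: the sequence used in compile_latex, simplified."""
    tex_file = os.path.basename(tex_path)
    base, _ = os.path.splitext(tex_file)

    def run(cmd):
        subprocess.run(cmd, timeout=timeout, stdout=subprocess.DEVNULL)

    run(["pdflatex", "-interaction=nonstopmode", tex_file])   # 1st pass writes the .aux file
    if os.path.exists(base + ".aux"):
        run(["bibtex", base])                                  # bibtex reads .aux, writes .bbl
    run(["pdflatex", "-interaction=nonstopmode", tex_file])   # 2nd pass pulls the .bbl into the PDF
    run(["pdflatex", "-interaction=nonstopmode", tex_file])   # 3rd pass settles \cite/\ref numbers
```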
@@ -251,4 +269,5 @@ def colorize(text: str, category_name: str) -> str: if category_name == "Code": return "{\\color{" + color + "}" + text + "}" + raise NotImplementedError(f"Invalid category name: {category_name}") diff --git a/scripts/app.py b/scripts/app.py index 54b4a1c..549d682 100644 --- a/scripts/app.py +++ b/scripts/app.py @@ -3,8 +3,8 @@ import glob from PIL import Image, ImageDraw -from DocParser.vrdu import utils -from DocParser.vrdu.config import config +from vrdu import utils +from vrdu.config import config pn.extension() diff --git a/scripts/arxiv_download.py b/scripts/arxiv_download.py index 971f779..c7c9e10 100644 --- a/scripts/arxiv_download.py +++ b/scripts/arxiv_download.py @@ -5,7 +5,7 @@ import tarfile -from DocParser.vrdu import logger +from vrdu import logger log = logger.setup_app_level_logger(logger_name="arxiv_download.log") diff --git a/scripts/batch_process.py b/scripts/batch_process.py index 78dbe8d..e357ac4 100644 --- a/scripts/batch_process.py +++ b/scripts/batch_process.py @@ -5,8 +5,8 @@ from typing import List import pandas as pd -from DocParser.vrdu import logger -from DocParser.main import process_one_file +from vrdu import logger +from main import process_one_file log = logger.setup_app_level_logger(file_name="batch_process.log", level="INFO") diff --git a/scripts/export_to_dataset.py b/scripts/export_to_dataset.py index f8c41d8..fafb3d2 100644 --- a/scripts/export_to_dataset.py +++ b/scripts/export_to_dataset.py @@ -6,7 +6,7 @@ import pandas as pd import multiprocessing -from DocParser.vrdu import logger +from vrdu import logger log = logger.setup_app_level_logger(file_name="export_to_dataset.log") diff --git a/scripts/generate_reading_annotation.py b/scripts/generate_reading_annotation.py index f098d64..a4104b7 100644 --- a/scripts/generate_reading_annotation.py +++ b/scripts/generate_reading_annotation.py @@ -4,8 +4,8 @@ import os from pathlib import Path -from DocParser.vrdu import utils -from DocParser.vrdu import logger +from vrdu import utils +from vrdu import logger log = logger.setup_app_level_logger(file_name="generate_reading_annotation.log") diff --git a/scripts/retrieve_metadata.py b/scripts/retrieve_metadata.py index 6897c67..bf97df2 100644 --- a/scripts/retrieve_metadata.py +++ b/scripts/retrieve_metadata.py @@ -6,8 +6,8 @@ import argparse -from DocParser.vrdu import utils -from DocParser.vrdu import logger +from vrdu import utils +from vrdu import logger log = logger.setup_app_level_logger(file_name="retrieve_metadata.log") diff --git a/scripts/visualize_order_annotations.py b/scripts/visualize_order_annotations.py index b59b365..2f5bc5b 100644 --- a/scripts/visualize_order_annotations.py +++ b/scripts/visualize_order_annotations.py @@ -7,7 +7,7 @@ from PIL import Image, ImageDraw from matplotlib import pyplot as plt -from DocParser.vrdu import utils +from vrdu import utils def draw_arrow_line( diff --git a/setup.py b/setup.py index ad473aa..ba3749e 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages setup( - name="vrdu_data_process", + name="DocParser", version="1.0.0", description="process the academic papers with .tex source files", author="Mao Song", diff --git a/tests/test_add_definitions.py b/tests/test_add_definitions.py index 096ca65..f3ca221 100644 --- a/tests/test_add_definitions.py +++ b/tests/test_add_definitions.py @@ -1,7 +1,7 @@ import unittest import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer def 
test_add_color_definition1(): diff --git a/tests/test_extract_graphics.py b/tests/test_extract_graphics.py index 14a2cd5..8335db3 100644 --- a/tests/test_extract_graphics.py +++ b/tests/test_extract_graphics.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestGraphics(unittest.TestCase): diff --git a/tests/test_is_text_eq.py b/tests/test_is_text_eq.py index 3baa280..6426411 100644 --- a/tests/test_is_text_eq.py +++ b/tests/test_is_text_eq.py @@ -1,6 +1,6 @@ import unittest -from DocParser.vrdu.renderer import is_text_eq +from vrdu.renderer import is_text_eq class TestTextEq(unittest.TestCase): diff --git a/tests/test_remove_hyperref_color.py b/tests/test_remove_predefined_color.py similarity index 90% rename from tests/test_remove_hyperref_color.py rename to tests/test_remove_predefined_color.py index 3b6a287..fdc1b34 100644 --- a/tests/test_remove_hyperref_color.py +++ b/tests/test_remove_predefined_color.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestHyperref(unittest.TestCase): @@ -21,7 +21,7 @@ def test1(self): new=unittest.mock.mock_open(read_data=self.mock_file_content1), create=True, ) as file_mock: - self.renderer.remove_hyperref_color(file_mock) + self.renderer.remove_predefined_color(file_mock) file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( """\\documentclass{article}\\begin{document}\\end{document}""" @@ -33,7 +33,7 @@ def test2(self): new=unittest.mock.mock_open(read_data=self.mock_file_content2), create=True, ) as file_mock: - self.renderer.remove_hyperref_color(file_mock) + self.renderer.remove_predefined_color(file_mock) file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( """\\documentclass{article}\\usepackage{hyperref}\\hypersetup{colorlinks=false}\n\\begin{document}\\end{document}""" @@ -45,7 +45,7 @@ def test3(self): new=unittest.mock.mock_open(read_data=self.mock_file_content3), create=True, ) as file_mock: - self.renderer.remove_hyperref_color(file_mock) + self.renderer.remove_predefined_color(file_mock) file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( """\\documentclass{article}\\usepackage[color_links=true]{hyperref}\\hypersetup{colorlinks=false}\n\\begin{document}\\end{document}""" @@ -57,7 +57,7 @@ def test4(self): new=unittest.mock.mock_open(read_data=self.mock_file_content4), create=True, ) as file_mock: - self.renderer.remove_hyperref_color(file_mock) + self.renderer.remove_predefined_color(file_mock) file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( """\\documentclass{article}\\usepackage[color_links=true]{hyperref}\\usepackage{amsmath}\\hypersetup{colorlinks=false}\n\\begin{document}\\end{document}""" diff --git a/tests/test_render_abstract.py b/tests/test_render_abstract.py index 405f6da..16f2cb9 100644 --- a/tests/test_render_abstract.py +++ b/tests/test_render_abstract.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestAbstract(unittest.TestCase): diff --git a/tests/test_render_algorithm.py b/tests/test_render_algorithm.py index a4cf6ad..c15821e 100644 --- a/tests/test_render_algorithm.py +++ b/tests/test_render_algorithm.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestAlgorithm(unittest.TestCase): 
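Review note: the renamed tests above all share one pattern — patch `builtins.open` with `mock_open`, run the read–transform–write helper, and assert on the final `write` call. A minimal standalone version of that pattern, with a toy `rewrite` function that is illustrative only and not part of this diff:

```python
import unittest
import unittest.mock

def rewrite(path: str) -> None:
    """Toy helper in the same shape as the renderer helpers: read, transform, write."""
    with open(path) as f:
        content = f.read()
    with open(path, "w") as f:
        f.write(content.replace("colorlinks=true", "colorlinks=false"))

class TestRewrite(unittest.TestCase):
    def test_rewrite(self):
        mocked = unittest.mock.mock_open(read_data=r"\usepackage[colorlinks=true]{hyperref}")
        with unittest.mock.patch("builtins.open", mocked):
            rewrite("paper.tex")
        # The patched open records the final write; assert on its argument.
        mocked().write.assert_called_with(r"\usepackage[colorlinks=false]{hyperref}")

if __name__ == "__main__":
    unittest.main()
```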
diff --git a/tests/test_render_caption.py b/tests/test_render_caption.py index b526f60..eb21de8 100644 --- a/tests/test_render_caption.py +++ b/tests/test_render_caption.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestCaption(unittest.TestCase): diff --git a/tests/test_render_code.py b/tests/test_render_code.py index 55082de..79dae23 100644 --- a/tests/test_render_code.py +++ b/tests/test_render_code.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestCode(unittest.TestCase): @@ -71,7 +71,7 @@ def test_no_lstset(self): new=unittest.mock.mock_open(read_data=self.mock_file_content1), create=True, ) as file_mock: - self.renderer.remove_lstlisting_color(file_mock) + self.renderer.remove_predefined_color(file_mock) file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( """\\documentclass{article}\\begin{document}\\end{document}""" @@ -83,7 +83,7 @@ def test_remove_lstset(self): new=unittest.mock.mock_open(read_data=self.mock_file_content5), create=True, ) as file_mock: - self.renderer.remove_lstlisting_color(file_mock) + self.renderer.remove_predefined_color(file_mock) file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( r"""\documentclass{article}\n\usepackage{listings}\n\usepackage{xcolor}\n\n\definecolor{codegreen}{rgb}{0,0.6,0}\n\definecolor{codegray}{rgb}{0.5,0.5,0.5}\n\definecolor{codepurple}{rgb}{0.58,0,0.82}\n\definecolor{backcolour}{rgb}{0.95,0.95,0.92}\n\n\lstdefinestyle{mystyle}{\n backgroundcolor=\color{backcolour}, \n commentstyle=\color{codegreen},\n keywordstyle=\color{magenta},\n numberstyle=\tiny\color{codegray},\n stringstyle=\color{codepurple},\n basicstyle=\ttfamily\footnotesize,\n breakatwhitespace=false, \n breaklines=true, \n captionpos=b, \n keepspaces=true, \n numbers=left, \n numbersep=5pt, \n showspaces=false, \n showstringspaces=false,\n showtabs=false, \n tabsize=2\n}\n\n\n\n\begin{document}\nThe next code will be directly imported from a file\n\n\lstinputlisting[language=Octave]{BitXorMatrix.m}\n\end{document}""" diff --git a/tests/test_render_footnote.py b/tests/test_render_footnote.py index e81e0fd..e0fcebd 100644 --- a/tests/test_render_footnote.py +++ b/tests/test_render_footnote.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestFootnote(unittest.TestCase): diff --git a/tests/test_render_tabular.py b/tests/test_render_tabular.py index e57f363..7cb1e52 100644 --- a/tests/test_render_tabular.py +++ b/tests/test_render_tabular.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestTabular(unittest.TestCase): diff --git a/tests/test_render_title.py b/tests/test_render_title.py index 122063b..343714e 100644 --- a/tests/test_render_title.py +++ b/tests/test_render_title.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestTitle(unittest.TestCase): diff --git a/tests/test_replace_figures_in_folders.py b/tests/test_replace_figures_in_folders.py new file mode 100644 index 0000000..fcfec6a --- /dev/null +++ b/tests/test_replace_figures_in_folders.py @@ -0,0 +1,33 @@ +import unittest +from unittest.mock import patch +from DocParser.vrdu.preprocess import replace_figures_in_folders + + +class 
TestGeneratePngFigure(unittest.TestCase): + def setUp(self): + # Simulate image files + self.image_files = { + "file1": "dir1/file1.eps", + "file2": "dir/dir2/file2.png", + "file3": "dir1/file3.jpg", + "file4": "file4.jpeg", + "file5": "dir/dir2/dir5/file5.ps", + "file6": "dir/dir2/dir5/file6.pdf", + } + + @patch("vrdu.utils.convert_eps_image_to_pdf_image") + @patch("vrdu.utils.convert_pdf_figure_to_png_image") + @patch("os.remove") + def test_png_generation( + self, mock_os_remove, mock_convert_pdf_to_png, mock_convert_eps_to_pdf + ): + + # Mock os.remove to do nothing + mock_os_remove.side_effect = lambda x: None + + replace_figures_in_folders(self.image_files) + + # Test the number of times the file conversion function is called + self.assertEqual(mock_convert_eps_to_pdf.call_count, 2) + self.assertEqual(mock_os_remove.call_count, 2) + self.assertEqual(mock_convert_pdf_to_png.call_count, 3) diff --git a/tests/test_replace_figures_in_tex_file.py b/tests/test_replace_figures_in_tex_file.py new file mode 100644 index 0000000..f39fca6 --- /dev/null +++ b/tests/test_replace_figures_in_tex_file.py @@ -0,0 +1,51 @@ +import unittest +import unittest.mock +from DocParser.vrdu.preprocess import replace_figures_in_tex_files + + +class TestAbstract(unittest.TestCase): + def setUp(self): + self.initial_content = """ + \\begin{figure}[ht] + \\centerline{\\includegraphics[width=\\columnwidth]{dir1/time_vs_dimension.pdf}} + \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{dir2/iterate_constraint.jpg}} + \\subfigure[]{\\epsfig{dir2/iterate_error.eps}} + \\subfigure[]{\\psfig[width=0.48\\columnwidth]{time_constraint.es}} + \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{dir3/dir4/iterate_correct.png}} + \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{dir3/time_error}} + \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{dir3/time_error_1}} + \\label{fig:iteration_information} + """ + + # Simulate image files with correct extensions + self.image_files = { + "time_vs_dimension": "dir1/time_vs_dimension.pdf", + "iterate_constraint": "dir2/iterate_constraint.jpg", + "iterate_error": "dir2/iterate_error.eps", + "time_constraint": "time_constraint.es", + "iterate_correct": "dir3/dir4/iterate_correct.png", + "time_error": "dir3/time_error.pdf", + "time_error_1": "dir3/time_error_1.jpeg", + } + + def test_replace_figures(self): + expected_content = """ + \\begin{figure}[ht] + \\centerline{\\includegraphics[width=\\columnwidth]{dir1/time_vs_dimension.png}} + \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{dir2/iterate_constraint.jpg}} + \\subfigure[]{\\includegraphics{dir2/iterate_error.png}} + \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{time_constraint.png}} + \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{dir3/dir4/iterate_correct.png}} + \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{dir3/time_error.png}} + \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{dir3/time_error_1.jpeg}} + \\label{fig:iteration_information} + """ + + with unittest.mock.patch( + "builtins.open", + new=unittest.mock.mock_open(read_data=self.initial_content), + create=True, + ) as file_mock: + replace_figures_in_tex_files(file_mock, self.image_files) + file_mock.assert_called_with(file_mock, "w") + file_mock().write.assert_called_with(expected_content)
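The new tests above exercise `replace_figures_in_folders` and `replace_figures_in_tex_files`; the core of the latter is a regex that rewrites the extension inside `\includegraphics` arguments. A minimal sketch of that rewrite in isolation — the function name is illustrative, and the mapping rule mirrors the diff: keep `.jpg`/`.jpeg`, otherwise point at `.png`:

```python
import os
import re

def normalize_graphics_extension(content: str, image_name: str, real_path: str) -> str:
    """Point an \\includegraphics reference at the .png/.jpg that will actually exist."""
    ext = os.path.splitext(real_path)[1]
    if ext not in (".jpg", ".jpeg"):
        ext = ".png"  # everything else is converted to PNG by the folder pass
    pattern = re.compile(
        r"(\\includegraphics(?:\[[^\]]*\])?\{.*?" + re.escape(image_name) + r")(\.\w+)?\}"
    )
    return pattern.sub(rf"\g<1>{ext}}}", content)

tex = r"\includegraphics[width=\columnwidth]{figs/time_vs_dimension.pdf}"
print(normalize_graphics_extension(tex, "time_vs_dimension", "figs/time_vs_dimension.pdf"))
# \includegraphics[width=\columnwidth]{figs/time_vs_dimension.png}
```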