From 4e8c728760cbe24091c7bbb52c6bcc97b48f3910 Mon Sep 17 00:00:00 2001 From: CHEN YANG <1402375027@qq.com> Date: Thu, 20 Jun 2024 11:12:27 +0800 Subject: [PATCH 01/39] refactor(preprocess.py): use a more robust to replace figure --- vrdu/preprocess.py | 111 +++++++++++++++++---------------------------- 1 file changed, 41 insertions(+), 70 deletions(-) diff --git a/vrdu/preprocess.py b/vrdu/preprocess.py index f4f4003..a3dc264 100644 --- a/vrdu/preprocess.py +++ b/vrdu/preprocess.py @@ -61,9 +61,9 @@ def clean_tex(original_tex: str) -> None: remove_comments(original_tex) -def replace_pdf_ps_figures_with_png(original_tex: str) -> None: +def replace_figures_extension_with_png(original_tex: str) -> None: """ - Replaces PDF, ps, eps figures with PNG figures in a TeX file + Replaces PDF, ps, eps figures' extension with PNG in a TeX file to support pdfminer detecting bounding box. Args: @@ -71,76 +71,47 @@ def replace_pdf_ps_figures_with_png(original_tex: str) -> None: Returns: None: This function does not return anything. - - Raises: - FileNotFoundError: If a PDF file specified in the TeX file is not found. """ - - # FIXME: use more robust way, since the path to images may not exists. main_directory = os.path.dirname(original_tex) - with open(original_tex) as f: + image_extensions = [".eps", ".ps", ".jpg", ".jpeg", ".png", ".pdf"] + image_files = {} + for root, _, files in os.walk(main_directory): + for file in files: + if any(file.endswith(ext) for ext in image_extensions): + image_name, ext = os.path.splitext(file) + # Store the relative path of the image as the value + image_files[image_name] = os.path.relpath(os.path.join(root, file), main_directory) + + with open(original_tex, 'r') as f: content = f.read() - graphicspath_pattern = r"\\graphicspath\{\{(.+?)}" - match = re.search(graphicspath_pattern, content, re.DOTALL) - if match: - graphic_path = match.group(1) - else: - graphic_path = "" - - # Replace \psfig{...} with \includegraphics{...} - content = re.sub(r"\\psfig{([^}]*)}", r"\\includegraphics{\1}", content) - - # Replace \epsfig{...} with \includegraphics{...} - content = re.sub(r"\\epsfig{([^}]*)}", r"\\includegraphics{\1}", content) - - # Regular expression pattern to match \includegraphics - # commands with PDF files - pattern = r"\\includegraphics(\[.*?\])?\{(.*?)\}" - - # Find all matches of \includegraphics with PDF files - matches = re.findall(pattern, content) - - # Replace PDF paths with PNG paths - ext_patterns = [".eps", ".ps", ".jpg", ".jpeg", ".png", ".pdf"] - for match in matches: - image_name = match[1] - if not any(ext in image_name for ext in ext_patterns): - for ext in ext_patterns: - image_file = os.path.join(main_directory, graphic_path, image_name, ext) - if os.path.exists(image_file): - image_name = image_name + ext - break - - # detectable image type, see pdfminer.six for details - if any(ext in image_name for ext in [".jpg", ".jpeg", "png"]): - content = content.replace(match[1], image_name) - continue - - # convert eps to pdf - if any(ext in image_name for ext in [".eps", ".ps"]): - eps_image = os.path.join(main_directory, graphic_path, image_name) - if not os.path.exists(eps_image): - log.error(f"File not found: {eps_image}") - continue - pdf_image = os.path.splitext(eps_image)[0] + ".pdf" - utils.convert_eps_image_to_pdf_image(eps_image, pdf_image) - image_name = os.path.basename(pdf_image) - - # convert pdf to png - if image_name.endswith(".pdf"): - pdf_image = os.path.join(main_directory, graphic_path, image_name) - if not os.path.exists(pdf_image): - log.error(f"File not found: {pdf_image}") - continue - png_image = os.path.splitext(pdf_image)[0] + ".png" - utils.convert_pdf_figure_to_png_image(pdf_image, png_image) - image_name = os.path.splitext(image_name)[0] + ".png" - - # replace the reference in tex file - content = content.replace(match[1], image_name) - - with open(original_tex, "w") as f: + # Replace \psfig and \epsfig commands with \includegraphics command + def custom_replace(match): + options = match.group(1) or '' + filepath = match.group(2) + if options: + return f"\\includegraphics[{options}]{{{filepath}}}" + else: + return f"\\includegraphics{{{filepath}}}" + + content = re.sub(r"\\psfig(?:\[(.*?)\])?{(.+?)}", custom_replace, content) + content = re.sub(r"\\epsfig(?:\[(.*?)\])?{(.+?)}", custom_replace, content) + + # Traverse the image_files dictionary to update file extensions + for image_name, file_path in image_files.items(): + base_name, current_extension = os.path.splitext(image_name) + correct_extension = os.path.splitext(file_path)[1] + + if correct_extension not in ['.jpg', '.jpeg']: + correct_extension = '.png' + + # Build a regular expression to match image files including optional extensions + pattern = re.compile(r'(\\includegraphics(?:\[[^\]]*\])?\{.*?' + re.escape(base_name) + r')(\.\w+)?\}') + replacement = rf'\1{correct_extension}}}' + content = pattern.sub(replacement, content) + + # Write the updated content back to the file + with open(original_tex, 'w') as f: f.write(content) @@ -183,8 +154,8 @@ def run(original_tex: str) -> None: # Step 0: clean tex clean_tex(original_tex) - # Step 2: process images - replace_pdf_ps_figures_with_png(original_tex) + # Step 1: process images + replace_figures_extension_with_png(original_tex) # Step 3: delete table of contents delete_table_of_contents(original_tex) From 278d6644714b9106771d796fc8afcbaefc4f35f7 Mon Sep 17 00:00:00 2001 From: CHEN YANG <1402375027@qq.com> Date: Thu, 20 Jun 2024 11:18:02 +0800 Subject: [PATCH 02/39] feat(preprocess.py): generate png figures --- vrdu/preprocess.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/vrdu/preprocess.py b/vrdu/preprocess.py index a3dc264..d776b4d 100644 --- a/vrdu/preprocess.py +++ b/vrdu/preprocess.py @@ -115,6 +115,41 @@ def custom_replace(match): f.write(content) + +def generate_png_figure(original_tex: str) -> None: + """ + Generate PNG figures for PDF, ps, eps figures. + + Args: + original_tex (str): The path to the original TeX file. + + Returns: + None: This function does not return anything. + """ + main_directory = os.path.dirname(original_tex) + image_extensions = [".eps", ".ps", ".jpg", ".jpeg", ".png", ".pdf"] + image_files = {} + for root, _, files in os.walk(main_directory): + for file in files: + if any(file.endswith(ext) for ext in image_extensions): + image_name, ext = os.path.splitext(file) + image_files[image_name] = os.path.join(root, file) + + for image_name, file_path in image_files.items(): + if file_path.endswith(".eps") or file_path.endswith(".ps"): + output_png = os.path.join(os.path.dirname(file_path), image_name + ".png") + temp_pdf = os.path.join(os.path.dirname(file_path), image_name + ".pdf") + # convert eps to pdf + utils.convert_eps_image_to_pdf_image(file_path, temp_pdf) + # convert pdf to png + utils.convert_pdf_figure_to_png_image(temp_pdf, output_png) + elif file_path.endswith(".pdf"): + output_png = os.path.join(os.path.dirname(file_path), image_name + ".png") + # convert pdf to png + utils.convert_pdf_figure_to_png_image(file_path, output_png) + + + def delete_table_of_contents(original_tex: str) -> None: """ Deletes the table of contents from the given original_tex file. @@ -157,5 +192,8 @@ def run(original_tex: str) -> None: # Step 1: process images replace_figures_extension_with_png(original_tex) + # Step 2: generate png figures + generate_png_figure(original_tex) + # Step 3: delete table of contents delete_table_of_contents(original_tex) From 4293732b914286721bd4c35b1465f2e0ba802016 Mon Sep 17 00:00:00 2001 From: CHEN YANG <1402375027@qq.com> Date: Thu, 20 Jun 2024 11:19:36 +0800 Subject: [PATCH 03/39] test(test_extension.py): add test for replace_figures_extension_with_png --- tests/test_extension.py | 60 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 tests/test_extension.py diff --git a/tests/test_extension.py b/tests/test_extension.py new file mode 100644 index 0000000..243fc3c --- /dev/null +++ b/tests/test_extension.py @@ -0,0 +1,60 @@ +import unittest +import os +import unittest.mock + + +from replace_figure_extension import replace_figures_extension_with_png + + +class TestAbstract(unittest.TestCase): + def setUp(self) -> None: + + # 测试环境的设置,包括创建测试文件夹和文件 + self.test_dir = 'test_directory' + self.original_tex = os.path.join(self.test_dir, 'test.tex') + os.makedirs(self.test_dir, exist_ok=True) + with open(self.original_tex, 'w') as f: + f.write(r''' + \\begin{figure}[ht] + \\centerline{\\includegraphics[width=\\columnwidth]{figures/time_vs_dimension.pdf}} + \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/iterate_constraint.jpg}} + \\subfigure[]{\\epsfig{figures/iterate_error.eps}} + \\subfigure[]{\\psfig[width=0.48\\columnwidth]{figures/time_constraint.ps}} + \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/iterate_correct.png}} + \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{figures/time_error}} + \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{figures/time_error_1}} + \\label{fig:iteration_information} + ''') + + # 模拟图片文件 + self.image_files = [ + 'time_vs_dimension.pdf', 'iterate_constraint.jpg', 'iterate_error.eps', 'time_constraint.ps', 'iterate_correct.png', 'time_error.pdf', 'time_error_1.jpeg' + ] + for file_name in self.image_files: + with open(os.path.join(self.test_dir, file_name), 'w') as f: + f.write('dummy content') + + def tearDown(self): + # 清理测试创建的文件和目录 + for root, dirs, files in os.walk(self.test_dir, topdown=False): + for name in files: + os.remove(os.path.join(root, name)) + for name in dirs: + os.rmdir(os.path.join(root, name)) + os.rmdir(self.test_dir) + + def test(self): + replace_figures_extension_with_png(self.original_tex) + with open(self.original_tex, 'r') as f: + content = f.read() + self.assertEqual(content, r''' + \\begin{figure}[ht] + \\centerline{\\includegraphics[width=\\columnwidth]{figures/time_vs_dimension.png}} + \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/iterate_constraint.jpg}} + \\subfigure[]{\\includegraphics{figures/iterate_error.png}} + \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/time_constraint.png}} + \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/iterate_correct.png}} + \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{figures/time_error.png}} + \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{figures/time_error_1.jpeg}} + \\label{fig:iteration_information} + ''') From 94b64554e5642253aebee2f30e08755b291d776a Mon Sep 17 00:00:00 2001 From: CHEN YANG <1402375027@qq.com> Date: Thu, 20 Jun 2024 11:20:05 +0800 Subject: [PATCH 04/39] test(test_folder.py): add test for generate_png_figure --- tests/test_folder.py | 50 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 tests/test_folder.py diff --git a/tests/test_folder.py b/tests/test_folder.py new file mode 100644 index 0000000..40386e0 --- /dev/null +++ b/tests/test_folder.py @@ -0,0 +1,50 @@ +import unittest +import os +from unittest.mock import patch, MagicMock +from generate_figure import generate_png_figure + +class TestGeneratePngFigure(unittest.TestCase): + def setUp(self): + # 设置测试环境,模拟有各种类型文件的目录 + self.test_dir = 'test_directory_1' + self.original_tex = os.path.join(self.test_dir, 'test.tex') + os.makedirs(self.test_dir, exist_ok=True) + self.image_files = [ + 'image1.eps', 'image2.ps', 'image3.jpg', 'image4.jpeg', 'image5.png', 'image6.pdf' + ] + for file_name in self.image_files: + with open(os.path.join(self.test_dir, file_name), 'w') as f: + f.write('dummy content') + + def tearDown(self): + # 清理测试创建的文件和目录 + for root, dirs, files in os.walk(self.test_dir, topdown=False): + for name in files: + os.remove(os.path.join(root, name)) + for name in dirs: + os.rmdir(os.path.join(root, name)) + os.rmdir(self.test_dir) + + @patch('vrdu.utils.convert_eps_image_to_pdf_image') + @patch('vrdu.utils.convert_pdf_figure_to_png_image') + def test_png_generation(self, mock_convert_pdf_to_png, mock_convert_eps_to_pdf): + generate_png_figure(self.original_tex) + + # 检查文件生成情况 + expected_files = [ + 'image1.eps', 'image2.ps', 'image3.jpg', 'image4.jpeg', 'image5.png', 'image6.pdf', + 'image1.png', 'image2.png', 'image6.png' + ] + # 获取当前目录下所有文件 + generated_files = os.listdir(self.test_dir) + + + # 目前模拟的测试环境中,无法真的生成文件,导致expected_files和generated_files不一致 + + # print("Expected Files:", expected_files) + # print("Generated Files:", generated_files) + # self.assertCountEqual(expected_files, generated_files) + + # 检查函数调用 + self.assertEqual(mock_convert_eps_to_pdf.call_count, 2) # 对于两个EPS/PS文件的调用 + self.assertEqual(mock_convert_pdf_to_png.call_count, 3) # 对于三个PDF文件的调用 \ No newline at end of file From d174257cb6ae63eed4274230d6c82e4229f92ee0 Mon Sep 17 00:00:00 2001 From: CHEN YANG <1402375027@qq.com> Date: Thu, 20 Jun 2024 11:12:27 +0800 Subject: [PATCH 05/39] refactor(preprocess.py): use a more robust to replace figure --- DocParser/vrdu/preprocess.py | 111 +++++++++++++---------------------- 1 file changed, 41 insertions(+), 70 deletions(-) diff --git a/DocParser/vrdu/preprocess.py b/DocParser/vrdu/preprocess.py index f4f4003..a3dc264 100644 --- a/DocParser/vrdu/preprocess.py +++ b/DocParser/vrdu/preprocess.py @@ -61,9 +61,9 @@ def clean_tex(original_tex: str) -> None: remove_comments(original_tex) -def replace_pdf_ps_figures_with_png(original_tex: str) -> None: +def replace_figures_extension_with_png(original_tex: str) -> None: """ - Replaces PDF, ps, eps figures with PNG figures in a TeX file + Replaces PDF, ps, eps figures' extension with PNG in a TeX file to support pdfminer detecting bounding box. Args: @@ -71,76 +71,47 @@ def replace_pdf_ps_figures_with_png(original_tex: str) -> None: Returns: None: This function does not return anything. - - Raises: - FileNotFoundError: If a PDF file specified in the TeX file is not found. """ - - # FIXME: use more robust way, since the path to images may not exists. main_directory = os.path.dirname(original_tex) - with open(original_tex) as f: + image_extensions = [".eps", ".ps", ".jpg", ".jpeg", ".png", ".pdf"] + image_files = {} + for root, _, files in os.walk(main_directory): + for file in files: + if any(file.endswith(ext) for ext in image_extensions): + image_name, ext = os.path.splitext(file) + # Store the relative path of the image as the value + image_files[image_name] = os.path.relpath(os.path.join(root, file), main_directory) + + with open(original_tex, 'r') as f: content = f.read() - graphicspath_pattern = r"\\graphicspath\{\{(.+?)}" - match = re.search(graphicspath_pattern, content, re.DOTALL) - if match: - graphic_path = match.group(1) - else: - graphic_path = "" - - # Replace \psfig{...} with \includegraphics{...} - content = re.sub(r"\\psfig{([^}]*)}", r"\\includegraphics{\1}", content) - - # Replace \epsfig{...} with \includegraphics{...} - content = re.sub(r"\\epsfig{([^}]*)}", r"\\includegraphics{\1}", content) - - # Regular expression pattern to match \includegraphics - # commands with PDF files - pattern = r"\\includegraphics(\[.*?\])?\{(.*?)\}" - - # Find all matches of \includegraphics with PDF files - matches = re.findall(pattern, content) - - # Replace PDF paths with PNG paths - ext_patterns = [".eps", ".ps", ".jpg", ".jpeg", ".png", ".pdf"] - for match in matches: - image_name = match[1] - if not any(ext in image_name for ext in ext_patterns): - for ext in ext_patterns: - image_file = os.path.join(main_directory, graphic_path, image_name, ext) - if os.path.exists(image_file): - image_name = image_name + ext - break - - # detectable image type, see pdfminer.six for details - if any(ext in image_name for ext in [".jpg", ".jpeg", "png"]): - content = content.replace(match[1], image_name) - continue - - # convert eps to pdf - if any(ext in image_name for ext in [".eps", ".ps"]): - eps_image = os.path.join(main_directory, graphic_path, image_name) - if not os.path.exists(eps_image): - log.error(f"File not found: {eps_image}") - continue - pdf_image = os.path.splitext(eps_image)[0] + ".pdf" - utils.convert_eps_image_to_pdf_image(eps_image, pdf_image) - image_name = os.path.basename(pdf_image) - - # convert pdf to png - if image_name.endswith(".pdf"): - pdf_image = os.path.join(main_directory, graphic_path, image_name) - if not os.path.exists(pdf_image): - log.error(f"File not found: {pdf_image}") - continue - png_image = os.path.splitext(pdf_image)[0] + ".png" - utils.convert_pdf_figure_to_png_image(pdf_image, png_image) - image_name = os.path.splitext(image_name)[0] + ".png" - - # replace the reference in tex file - content = content.replace(match[1], image_name) - - with open(original_tex, "w") as f: + # Replace \psfig and \epsfig commands with \includegraphics command + def custom_replace(match): + options = match.group(1) or '' + filepath = match.group(2) + if options: + return f"\\includegraphics[{options}]{{{filepath}}}" + else: + return f"\\includegraphics{{{filepath}}}" + + content = re.sub(r"\\psfig(?:\[(.*?)\])?{(.+?)}", custom_replace, content) + content = re.sub(r"\\epsfig(?:\[(.*?)\])?{(.+?)}", custom_replace, content) + + # Traverse the image_files dictionary to update file extensions + for image_name, file_path in image_files.items(): + base_name, current_extension = os.path.splitext(image_name) + correct_extension = os.path.splitext(file_path)[1] + + if correct_extension not in ['.jpg', '.jpeg']: + correct_extension = '.png' + + # Build a regular expression to match image files including optional extensions + pattern = re.compile(r'(\\includegraphics(?:\[[^\]]*\])?\{.*?' + re.escape(base_name) + r')(\.\w+)?\}') + replacement = rf'\1{correct_extension}}}' + content = pattern.sub(replacement, content) + + # Write the updated content back to the file + with open(original_tex, 'w') as f: f.write(content) @@ -183,8 +154,8 @@ def run(original_tex: str) -> None: # Step 0: clean tex clean_tex(original_tex) - # Step 2: process images - replace_pdf_ps_figures_with_png(original_tex) + # Step 1: process images + replace_figures_extension_with_png(original_tex) # Step 3: delete table of contents delete_table_of_contents(original_tex) From 7467858a08b012483fed8a61d596021329382e43 Mon Sep 17 00:00:00 2001 From: CHEN YANG <1402375027@qq.com> Date: Thu, 20 Jun 2024 11:18:02 +0800 Subject: [PATCH 06/39] feat(preprocess.py): generate png figures --- DocParser/vrdu/preprocess.py | 38 ++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/DocParser/vrdu/preprocess.py b/DocParser/vrdu/preprocess.py index a3dc264..d776b4d 100644 --- a/DocParser/vrdu/preprocess.py +++ b/DocParser/vrdu/preprocess.py @@ -115,6 +115,41 @@ def custom_replace(match): f.write(content) + +def generate_png_figure(original_tex: str) -> None: + """ + Generate PNG figures for PDF, ps, eps figures. + + Args: + original_tex (str): The path to the original TeX file. + + Returns: + None: This function does not return anything. + """ + main_directory = os.path.dirname(original_tex) + image_extensions = [".eps", ".ps", ".jpg", ".jpeg", ".png", ".pdf"] + image_files = {} + for root, _, files in os.walk(main_directory): + for file in files: + if any(file.endswith(ext) for ext in image_extensions): + image_name, ext = os.path.splitext(file) + image_files[image_name] = os.path.join(root, file) + + for image_name, file_path in image_files.items(): + if file_path.endswith(".eps") or file_path.endswith(".ps"): + output_png = os.path.join(os.path.dirname(file_path), image_name + ".png") + temp_pdf = os.path.join(os.path.dirname(file_path), image_name + ".pdf") + # convert eps to pdf + utils.convert_eps_image_to_pdf_image(file_path, temp_pdf) + # convert pdf to png + utils.convert_pdf_figure_to_png_image(temp_pdf, output_png) + elif file_path.endswith(".pdf"): + output_png = os.path.join(os.path.dirname(file_path), image_name + ".png") + # convert pdf to png + utils.convert_pdf_figure_to_png_image(file_path, output_png) + + + def delete_table_of_contents(original_tex: str) -> None: """ Deletes the table of contents from the given original_tex file. @@ -157,5 +192,8 @@ def run(original_tex: str) -> None: # Step 1: process images replace_figures_extension_with_png(original_tex) + # Step 2: generate png figures + generate_png_figure(original_tex) + # Step 3: delete table of contents delete_table_of_contents(original_tex) From afa40757f1c2319d99ade9e6d59a373c25d8c565 Mon Sep 17 00:00:00 2001 From: CHEN YANG <1402375027@qq.com> Date: Thu, 20 Jun 2024 11:19:36 +0800 Subject: [PATCH 07/39] test(test_extension.py): add test for replace_figures_extension_with_png --- tests/test_extension.py | 60 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 tests/test_extension.py diff --git a/tests/test_extension.py b/tests/test_extension.py new file mode 100644 index 0000000..243fc3c --- /dev/null +++ b/tests/test_extension.py @@ -0,0 +1,60 @@ +import unittest +import os +import unittest.mock + + +from replace_figure_extension import replace_figures_extension_with_png + + +class TestAbstract(unittest.TestCase): + def setUp(self) -> None: + + # 测试环境的设置,包括创建测试文件夹和文件 + self.test_dir = 'test_directory' + self.original_tex = os.path.join(self.test_dir, 'test.tex') + os.makedirs(self.test_dir, exist_ok=True) + with open(self.original_tex, 'w') as f: + f.write(r''' + \\begin{figure}[ht] + \\centerline{\\includegraphics[width=\\columnwidth]{figures/time_vs_dimension.pdf}} + \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/iterate_constraint.jpg}} + \\subfigure[]{\\epsfig{figures/iterate_error.eps}} + \\subfigure[]{\\psfig[width=0.48\\columnwidth]{figures/time_constraint.ps}} + \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/iterate_correct.png}} + \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{figures/time_error}} + \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{figures/time_error_1}} + \\label{fig:iteration_information} + ''') + + # 模拟图片文件 + self.image_files = [ + 'time_vs_dimension.pdf', 'iterate_constraint.jpg', 'iterate_error.eps', 'time_constraint.ps', 'iterate_correct.png', 'time_error.pdf', 'time_error_1.jpeg' + ] + for file_name in self.image_files: + with open(os.path.join(self.test_dir, file_name), 'w') as f: + f.write('dummy content') + + def tearDown(self): + # 清理测试创建的文件和目录 + for root, dirs, files in os.walk(self.test_dir, topdown=False): + for name in files: + os.remove(os.path.join(root, name)) + for name in dirs: + os.rmdir(os.path.join(root, name)) + os.rmdir(self.test_dir) + + def test(self): + replace_figures_extension_with_png(self.original_tex) + with open(self.original_tex, 'r') as f: + content = f.read() + self.assertEqual(content, r''' + \\begin{figure}[ht] + \\centerline{\\includegraphics[width=\\columnwidth]{figures/time_vs_dimension.png}} + \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/iterate_constraint.jpg}} + \\subfigure[]{\\includegraphics{figures/iterate_error.png}} + \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/time_constraint.png}} + \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/iterate_correct.png}} + \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{figures/time_error.png}} + \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{figures/time_error_1.jpeg}} + \\label{fig:iteration_information} + ''') From e8a9c5f19e2c0cec57c5694c54e89439474aaa2c Mon Sep 17 00:00:00 2001 From: CHEN YANG <1402375027@qq.com> Date: Thu, 20 Jun 2024 11:20:05 +0800 Subject: [PATCH 08/39] test(test_folder.py): add test for generate_png_figure --- tests/test_folder.py | 50 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 tests/test_folder.py diff --git a/tests/test_folder.py b/tests/test_folder.py new file mode 100644 index 0000000..40386e0 --- /dev/null +++ b/tests/test_folder.py @@ -0,0 +1,50 @@ +import unittest +import os +from unittest.mock import patch, MagicMock +from generate_figure import generate_png_figure + +class TestGeneratePngFigure(unittest.TestCase): + def setUp(self): + # 设置测试环境,模拟有各种类型文件的目录 + self.test_dir = 'test_directory_1' + self.original_tex = os.path.join(self.test_dir, 'test.tex') + os.makedirs(self.test_dir, exist_ok=True) + self.image_files = [ + 'image1.eps', 'image2.ps', 'image3.jpg', 'image4.jpeg', 'image5.png', 'image6.pdf' + ] + for file_name in self.image_files: + with open(os.path.join(self.test_dir, file_name), 'w') as f: + f.write('dummy content') + + def tearDown(self): + # 清理测试创建的文件和目录 + for root, dirs, files in os.walk(self.test_dir, topdown=False): + for name in files: + os.remove(os.path.join(root, name)) + for name in dirs: + os.rmdir(os.path.join(root, name)) + os.rmdir(self.test_dir) + + @patch('vrdu.utils.convert_eps_image_to_pdf_image') + @patch('vrdu.utils.convert_pdf_figure_to_png_image') + def test_png_generation(self, mock_convert_pdf_to_png, mock_convert_eps_to_pdf): + generate_png_figure(self.original_tex) + + # 检查文件生成情况 + expected_files = [ + 'image1.eps', 'image2.ps', 'image3.jpg', 'image4.jpeg', 'image5.png', 'image6.pdf', + 'image1.png', 'image2.png', 'image6.png' + ] + # 获取当前目录下所有文件 + generated_files = os.listdir(self.test_dir) + + + # 目前模拟的测试环境中,无法真的生成文件,导致expected_files和generated_files不一致 + + # print("Expected Files:", expected_files) + # print("Generated Files:", generated_files) + # self.assertCountEqual(expected_files, generated_files) + + # 检查函数调用 + self.assertEqual(mock_convert_eps_to_pdf.call_count, 2) # 对于两个EPS/PS文件的调用 + self.assertEqual(mock_convert_pdf_to_png.call_count, 3) # 对于三个PDF文件的调用 \ No newline at end of file From 2af62ad37f932682d8d77812c39705a2e198579a Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Thu, 20 Jun 2024 18:08:08 +0800 Subject: [PATCH 09/39] test(test_folder.py): test without creating actual files --- tests/test_folder.py | 57 +++++++++++++------------------------------- 1 file changed, 17 insertions(+), 40 deletions(-) diff --git a/tests/test_folder.py b/tests/test_folder.py index 40386e0..835d6a3 100644 --- a/tests/test_folder.py +++ b/tests/test_folder.py @@ -1,50 +1,27 @@ import unittest import os from unittest.mock import patch, MagicMock -from generate_figure import generate_png_figure -class TestGeneratePngFigure(unittest.TestCase): - def setUp(self): - # 设置测试环境,模拟有各种类型文件的目录 - self.test_dir = 'test_directory_1' - self.original_tex = os.path.join(self.test_dir, 'test.tex') - os.makedirs(self.test_dir, exist_ok=True) - self.image_files = [ - 'image1.eps', 'image2.ps', 'image3.jpg', 'image4.jpeg', 'image5.png', 'image6.pdf' - ] - for file_name in self.image_files: - with open(os.path.join(self.test_dir, file_name), 'w') as f: - f.write('dummy content') - def tearDown(self): - # 清理测试创建的文件和目录 - for root, dirs, files in os.walk(self.test_dir, topdown=False): - for name in files: - os.remove(os.path.join(root, name)) - for name in dirs: - os.rmdir(os.path.join(root, name)) - os.rmdir(self.test_dir) +from DocParser.vrdu.preprocess import generate_png_figure - @patch('vrdu.utils.convert_eps_image_to_pdf_image') - @patch('vrdu.utils.convert_pdf_figure_to_png_image') - def test_png_generation(self, mock_convert_pdf_to_png, mock_convert_eps_to_pdf): - generate_png_figure(self.original_tex) - # 检查文件生成情况 - expected_files = [ - 'image1.eps', 'image2.ps', 'image3.jpg', 'image4.jpeg', 'image5.png', 'image6.pdf', - 'image1.png', 'image2.png', 'image6.png' +class TestGeneratePngFigure(unittest.TestCase): + @patch("os.path.dirname", return_value="/mocked/dir/") + @patch("os.walk") + @patch("DocParser.vrdu.utils.convert_pdf_figure_to_png_image") + def test_single_pdf_generation(self, mock_save, mock_walk, mock_dirname): + mocked_file = "/mocked/dir/original.tex" + mock_walk.return_value = [ + ("/mocked/dir/", ["dir1", "dir2"], ["file1.txt", "file2.csv"]), + ("/mocked/dir/dir1", [], ["file3.json"]), + ("/mocked/dir/dir2", [], ["file4.pdf"]), ] - # 获取当前目录下所有文件 - generated_files = os.listdir(self.test_dir) - - - # 目前模拟的测试环境中,无法真的生成文件,导致expected_files和generated_files不一致 + generate_png_figure(mocked_file) + # mock_dirname.assert_called_once_with(mocked_file) - # print("Expected Files:", expected_files) - # print("Generated Files:", generated_files) - # self.assertCountEqual(expected_files, generated_files) + mock_walk.assert_called_once_with("/mocked/dir/") - # 检查函数调用 - self.assertEqual(mock_convert_eps_to_pdf.call_count, 2) # 对于两个EPS/PS文件的调用 - self.assertEqual(mock_convert_pdf_to_png.call_count, 3) # 对于三个PDF文件的调用 \ No newline at end of file + mock_save.assert_called_once_with( + "/mocked/dir/dir2/file4.pdf", "/mocked/dir/dir2/file4.png" + ) From 001847a49518bd129f5642003bba905c1708344f Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Fri, 21 Jun 2024 10:36:52 +0800 Subject: [PATCH 10/39] refactor(preprocess.py): use image_files as argument to prevent repeat code --- DocParser/vrdu/preprocess.py | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/DocParser/vrdu/preprocess.py b/DocParser/vrdu/preprocess.py index d776b4d..2bb179a 100644 --- a/DocParser/vrdu/preprocess.py +++ b/DocParser/vrdu/preprocess.py @@ -115,26 +115,7 @@ def custom_replace(match): f.write(content) - -def generate_png_figure(original_tex: str) -> None: - """ - Generate PNG figures for PDF, ps, eps figures. - - Args: - original_tex (str): The path to the original TeX file. - - Returns: - None: This function does not return anything. - """ - main_directory = os.path.dirname(original_tex) - image_extensions = [".eps", ".ps", ".jpg", ".jpeg", ".png", ".pdf"] - image_files = {} - for root, _, files in os.walk(main_directory): - for file in files: - if any(file.endswith(ext) for ext in image_extensions): - image_name, ext = os.path.splitext(file) - image_files[image_name] = os.path.join(root, file) - +def replace_figures_in_folders(image_files: Dict[str, str]) -> None: for image_name, file_path in image_files.items(): if file_path.endswith(".eps") or file_path.endswith(".ps"): output_png = os.path.join(os.path.dirname(file_path), image_name + ".png") From 3ba9de545fb10cf831497e17763f3ab13cbc4ee3 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Fri, 21 Jun 2024 10:39:06 +0800 Subject: [PATCH 11/39] refactor(preprocess.py): enclose replacing figures in tex file as a function --- DocParser/vrdu/preprocess.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/DocParser/vrdu/preprocess.py b/DocParser/vrdu/preprocess.py index 2bb179a..cee78ea 100644 --- a/DocParser/vrdu/preprocess.py +++ b/DocParser/vrdu/preprocess.py @@ -80,14 +80,21 @@ def replace_figures_extension_with_png(original_tex: str) -> None: if any(file.endswith(ext) for ext in image_extensions): image_name, ext = os.path.splitext(file) # Store the relative path of the image as the value - image_files[image_name] = os.path.relpath(os.path.join(root, file), main_directory) + image_files[image_name] = os.path.relpath( + os.path.join(root, file), main_directory + ) - with open(original_tex, 'r') as f: + replace_figures_in_tex_files(original_tex, image_files) + +def replace_figures_in_tex_files( + original_tex: str, image_files: Dict[str, str] +) -> None: + with open(original_tex, "r") as f: content = f.read() # Replace \psfig and \epsfig commands with \includegraphics command def custom_replace(match): - options = match.group(1) or '' + options = match.group(1) or "" filepath = match.group(2) if options: return f"\\includegraphics[{options}]{{{filepath}}}" @@ -102,16 +109,20 @@ def custom_replace(match): base_name, current_extension = os.path.splitext(image_name) correct_extension = os.path.splitext(file_path)[1] - if correct_extension not in ['.jpg', '.jpeg']: - correct_extension = '.png' + if correct_extension not in [".jpg", ".jpeg"]: + correct_extension = ".png" # Build a regular expression to match image files including optional extensions - pattern = re.compile(r'(\\includegraphics(?:\[[^\]]*\])?\{.*?' + re.escape(base_name) + r')(\.\w+)?\}') - replacement = rf'\1{correct_extension}}}' + pattern = re.compile( + r"(\\includegraphics(?:\[[^\]]*\])?\{.*?" + + re.escape(base_name) + + r")(\.\w+)?\}" + ) + replacement = rf"\1{correct_extension}}}" content = pattern.sub(replacement, content) # Write the updated content back to the file - with open(original_tex, 'w') as f: + with open(original_tex, "w") as f: f.write(content) From b69c55aed083770c489fc8abe49cb27bf9e08ca3 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Fri, 21 Jun 2024 10:40:15 +0800 Subject: [PATCH 12/39] feat(preprocess.py): remove intermediate generated pdf --- DocParser/vrdu/preprocess.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/DocParser/vrdu/preprocess.py b/DocParser/vrdu/preprocess.py index cee78ea..6483b65 100644 --- a/DocParser/vrdu/preprocess.py +++ b/DocParser/vrdu/preprocess.py @@ -1,5 +1,6 @@ import os import re +from typing import Dict from arxiv_cleaner.cleaner import Cleaner @@ -85,6 +86,8 @@ def replace_figures_extension_with_png(original_tex: str) -> None: ) replace_figures_in_tex_files(original_tex, image_files) + replace_figures_in_folders(image_files) + def replace_figures_in_tex_files( original_tex: str, image_files: Dict[str, str] @@ -135,13 +138,14 @@ def replace_figures_in_folders(image_files: Dict[str, str]) -> None: utils.convert_eps_image_to_pdf_image(file_path, temp_pdf) # convert pdf to png utils.convert_pdf_figure_to_png_image(temp_pdf, output_png) + # remove redundant files + os.remove(temp_pdf) elif file_path.endswith(".pdf"): output_png = os.path.join(os.path.dirname(file_path), image_name + ".png") # convert pdf to png utils.convert_pdf_figure_to_png_image(file_path, output_png) - def delete_table_of_contents(original_tex: str) -> None: """ Deletes the table of contents from the given original_tex file. From 3c41811b64178a510b38811ac46882d9f39d8d09 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Fri, 21 Jun 2024 10:42:19 +0800 Subject: [PATCH 13/39] refactor(preprocess.py): use meaningful function name --- DocParser/vrdu/preprocess.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DocParser/vrdu/preprocess.py b/DocParser/vrdu/preprocess.py index 6483b65..c63a1c2 100644 --- a/DocParser/vrdu/preprocess.py +++ b/DocParser/vrdu/preprocess.py @@ -62,7 +62,7 @@ def clean_tex(original_tex: str) -> None: remove_comments(original_tex) -def replace_figures_extension_with_png(original_tex: str) -> None: +def replace_non_png_jpg_figures(original_tex: str) -> None: """ Replaces PDF, ps, eps figures' extension with PNG in a TeX file to support pdfminer detecting bounding box. @@ -186,7 +186,7 @@ def run(original_tex: str) -> None: clean_tex(original_tex) # Step 1: process images - replace_figures_extension_with_png(original_tex) + replace_non_png_jpg_figures(original_tex) # Step 2: generate png figures generate_png_figure(original_tex) From c3e0ba6b295213b441b2f6e420bd300ad14a3bd2 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Fri, 21 Jun 2024 10:42:44 +0800 Subject: [PATCH 14/39] refactor(preprocess.py): delete unused function call --- DocParser/vrdu/preprocess.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/DocParser/vrdu/preprocess.py b/DocParser/vrdu/preprocess.py index c63a1c2..4d02e9c 100644 --- a/DocParser/vrdu/preprocess.py +++ b/DocParser/vrdu/preprocess.py @@ -188,8 +188,5 @@ def run(original_tex: str) -> None: # Step 1: process images replace_non_png_jpg_figures(original_tex) - # Step 2: generate png figures - generate_png_figure(original_tex) - - # Step 3: delete table of contents + # Step 2: delete table of contents delete_table_of_contents(original_tex) From c565dfccf5aeb8e66a3811e34ba0343cc000c797 Mon Sep 17 00:00:00 2001 From: CHEN YANG <1402375027@qq.com> Date: Fri, 21 Jun 2024 18:30:53 +0800 Subject: [PATCH 15/39] test(test_extension): modify the content to test the function of replace_figures_in_tex_files --- tests/test_extension.py | 97 +++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 53 deletions(-) diff --git a/tests/test_extension.py b/tests/test_extension.py index 243fc3c..2c00812 100644 --- a/tests/test_extension.py +++ b/tests/test_extension.py @@ -1,60 +1,51 @@ import unittest -import os import unittest.mock - - -from replace_figure_extension import replace_figures_extension_with_png - - +from DocParser.vrdu.preprocess import replace_figures_in_tex_files class TestAbstract(unittest.TestCase): def setUp(self) -> None: - - # 测试环境的设置,包括创建测试文件夹和文件 - self.test_dir = 'test_directory' - self.original_tex = os.path.join(self.test_dir, 'test.tex') - os.makedirs(self.test_dir, exist_ok=True) - with open(self.original_tex, 'w') as f: - f.write(r''' - \\begin{figure}[ht] - \\centerline{\\includegraphics[width=\\columnwidth]{figures/time_vs_dimension.pdf}} - \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/iterate_constraint.jpg}} - \\subfigure[]{\\epsfig{figures/iterate_error.eps}} - \\subfigure[]{\\psfig[width=0.48\\columnwidth]{figures/time_constraint.ps}} - \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/iterate_correct.png}} - \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{figures/time_error}} - \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{figures/time_error_1}} - \\label{fig:iteration_information} - ''') + self.initial_content = """ + \\begin{figure}[ht] + \\centerline{\\includegraphics[width=\\columnwidth]{dir1/time_vs_dimension.pdf}} + \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{dir2/iterate_constraint.jpg}} + \\subfigure[]{\\epsfig{dir2/iterate_error.eps}} + \\subfigure[]{\\psfig[width=0.48\\columnwidth]{time_constraint.es}} + \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{dir3/dir4/iterate_correct.png}} + \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{dir3/time_error}} + \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{dir3/time_error_1}} + \\label{fig:iteration_information} + """ - # 模拟图片文件 - self.image_files = [ - 'time_vs_dimension.pdf', 'iterate_constraint.jpg', 'iterate_error.eps', 'time_constraint.ps', 'iterate_correct.png', 'time_error.pdf', 'time_error_1.jpeg' - ] - for file_name in self.image_files: - with open(os.path.join(self.test_dir, file_name), 'w') as f: - f.write('dummy content') + # Simulate image files with correct extensions + self.image_files = { + 'time_vs_dimension': 'dir1/time_vs_dimension.pdf', + 'iterate_constraint': 'dir2/iterate_constraint.jpg', + 'iterate_error': 'dir2/iterate_error.eps', + 'time_constraint': 'time_constraint.es', + 'iterate_correct': 'dir3/dir4/iterate_correct.png', + 'time_error': 'dir3/time_error.pdf', + 'time_error_1': 'dir3/time_error_1.jpeg' + } - def tearDown(self): - # 清理测试创建的文件和目录 - for root, dirs, files in os.walk(self.test_dir, topdown=False): - for name in files: - os.remove(os.path.join(root, name)) - for name in dirs: - os.rmdir(os.path.join(root, name)) - os.rmdir(self.test_dir) + def test_replace_figures(self): + expected_content = """ + \\begin{figure}[ht] + \\centerline{\\includegraphics[width=\\columnwidth]{dir1/time_vs_dimension.png}} + \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{dir2/iterate_constraint.jpg}} + \\subfigure[]{\\includegraphics{dir2/iterate_error.png}} + \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{time_constraint.png}} + \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{dir3/dir4/iterate_correct.png}} + \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{dir3/time_error.png}} + \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{dir3/time_error_1.jpeg}} + \\label{fig:iteration_information} + """ - def test(self): - replace_figures_extension_with_png(self.original_tex) - with open(self.original_tex, 'r') as f: - content = f.read() - self.assertEqual(content, r''' - \\begin{figure}[ht] - \\centerline{\\includegraphics[width=\\columnwidth]{figures/time_vs_dimension.png}} - \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/iterate_constraint.jpg}} - \\subfigure[]{\\includegraphics{figures/iterate_error.png}} - \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/time_constraint.png}} - \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/iterate_correct.png}} - \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{figures/time_error.png}} - \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{figures/time_error_1.jpeg}} - \\label{fig:iteration_information} - ''') + with unittest.mock.patch( + "builtins.open", + new=unittest.mock.mock_open(read_data=self.initial_content), + create=True, + ) as file_mock: + replace_figures_in_tex_files(file_mock,self.image_files) + file_mock.assert_called_with(file_mock, "w") + file_mock().write.assert_called_with( + expected_content + ) \ No newline at end of file From 8d4bc5f582cb34575fe232f25c7e51f06e9a24df Mon Sep 17 00:00:00 2001 From: CHEN YANG <1402375027@qq.com> Date: Fri, 21 Jun 2024 18:31:39 +0800 Subject: [PATCH 16/39] test(test_folder): modify the content to test the function of replace_figures_in_folders --- tests/test_folder.py | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/tests/test_folder.py b/tests/test_folder.py index 835d6a3..34e3b0a 100644 --- a/tests/test_folder.py +++ b/tests/test_folder.py @@ -1,27 +1,30 @@ import unittest import os from unittest.mock import patch, MagicMock +from DocParser.vrdu.preprocess import replace_figures_in_folders +class TestGeneratePngFigure(unittest.TestCase): + def setUp(self): + # Simulate image files + self.image_files = { + "file1": "dir1/file1.eps", + "file2": "dir/dir2/file2.png", + "file3": "dir1/file3.jpg", + "file4": "file4.jpeg", + "file5": "dir/dir2/dir5/file5.ps", + "file6": "dir/dir2/dir5/file6.pdf" + } -from DocParser.vrdu.preprocess import generate_png_figure - + @patch('vrdu.utils.convert_eps_image_to_pdf_image') + @patch('vrdu.utils.convert_pdf_figure_to_png_image') + @patch('os.remove') + def test_png_generation(self, mock_os_remove, mock_convert_pdf_to_png, mock_convert_eps_to_pdf): -class TestGeneratePngFigure(unittest.TestCase): - @patch("os.path.dirname", return_value="/mocked/dir/") - @patch("os.walk") - @patch("DocParser.vrdu.utils.convert_pdf_figure_to_png_image") - def test_single_pdf_generation(self, mock_save, mock_walk, mock_dirname): - mocked_file = "/mocked/dir/original.tex" - mock_walk.return_value = [ - ("/mocked/dir/", ["dir1", "dir2"], ["file1.txt", "file2.csv"]), - ("/mocked/dir/dir1", [], ["file3.json"]), - ("/mocked/dir/dir2", [], ["file4.pdf"]), - ] - generate_png_figure(mocked_file) - # mock_dirname.assert_called_once_with(mocked_file) + # Mock os.remove to do nothing + mock_os_remove.side_effect = lambda x: None - mock_walk.assert_called_once_with("/mocked/dir/") + replace_figures_in_folders(self.image_files) - mock_save.assert_called_once_with( - "/mocked/dir/dir2/file4.pdf", "/mocked/dir/dir2/file4.png" - ) + # Test the number of times the file conversion function is called + self.assertEqual(mock_convert_eps_to_pdf.call_count, 2) + self.assertEqual(mock_convert_pdf_to_png.call_count, 3) From b57be7892746a42968ed181934a346d1e05ad289 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Mon, 24 Jun 2024 10:39:06 +0800 Subject: [PATCH 17/39] test(test_folder.py): assert temp pdf is deleted --- tests/test_folder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_folder.py b/tests/test_folder.py index 34e3b0a..381ba77 100644 --- a/tests/test_folder.py +++ b/tests/test_folder.py @@ -27,4 +27,5 @@ def test_png_generation(self, mock_os_remove, mock_convert_pdf_to_png, mock_conv # Test the number of times the file conversion function is called self.assertEqual(mock_convert_eps_to_pdf.call_count, 2) + self.assertEqual(mock_os_remove.call_count, 2) self.assertEqual(mock_convert_pdf_to_png.call_count, 3) From 4625088161a52ffabfd745a511222ce79e81d3f3 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Mon, 24 Jun 2024 10:39:46 +0800 Subject: [PATCH 18/39] style(test_folder.py, test_extension.py): format file --- tests/test_extension.py | 24 ++++++++++++------------ tests/test_folder.py | 16 +++++++++------- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/tests/test_extension.py b/tests/test_extension.py index 2c00812..f39fca6 100644 --- a/tests/test_extension.py +++ b/tests/test_extension.py @@ -1,8 +1,10 @@ import unittest import unittest.mock from DocParser.vrdu.preprocess import replace_figures_in_tex_files + + class TestAbstract(unittest.TestCase): - def setUp(self) -> None: + def setUp(self): self.initial_content = """ \\begin{figure}[ht] \\centerline{\\includegraphics[width=\\columnwidth]{dir1/time_vs_dimension.pdf}} @@ -17,13 +19,13 @@ def setUp(self) -> None: # Simulate image files with correct extensions self.image_files = { - 'time_vs_dimension': 'dir1/time_vs_dimension.pdf', - 'iterate_constraint': 'dir2/iterate_constraint.jpg', - 'iterate_error': 'dir2/iterate_error.eps', - 'time_constraint': 'time_constraint.es', - 'iterate_correct': 'dir3/dir4/iterate_correct.png', - 'time_error': 'dir3/time_error.pdf', - 'time_error_1': 'dir3/time_error_1.jpeg' + "time_vs_dimension": "dir1/time_vs_dimension.pdf", + "iterate_constraint": "dir2/iterate_constraint.jpg", + "iterate_error": "dir2/iterate_error.eps", + "time_constraint": "time_constraint.es", + "iterate_correct": "dir3/dir4/iterate_correct.png", + "time_error": "dir3/time_error.pdf", + "time_error_1": "dir3/time_error_1.jpeg", } def test_replace_figures(self): @@ -44,8 +46,6 @@ def test_replace_figures(self): new=unittest.mock.mock_open(read_data=self.initial_content), create=True, ) as file_mock: - replace_figures_in_tex_files(file_mock,self.image_files) + replace_figures_in_tex_files(file_mock, self.image_files) file_mock.assert_called_with(file_mock, "w") - file_mock().write.assert_called_with( - expected_content - ) \ No newline at end of file + file_mock().write.assert_called_with(expected_content) diff --git a/tests/test_folder.py b/tests/test_folder.py index 381ba77..fcfec6a 100644 --- a/tests/test_folder.py +++ b/tests/test_folder.py @@ -1,8 +1,8 @@ import unittest -import os -from unittest.mock import patch, MagicMock +from unittest.mock import patch from DocParser.vrdu.preprocess import replace_figures_in_folders + class TestGeneratePngFigure(unittest.TestCase): def setUp(self): # Simulate image files @@ -12,13 +12,15 @@ def setUp(self): "file3": "dir1/file3.jpg", "file4": "file4.jpeg", "file5": "dir/dir2/dir5/file5.ps", - "file6": "dir/dir2/dir5/file6.pdf" + "file6": "dir/dir2/dir5/file6.pdf", } - @patch('vrdu.utils.convert_eps_image_to_pdf_image') - @patch('vrdu.utils.convert_pdf_figure_to_png_image') - @patch('os.remove') - def test_png_generation(self, mock_os_remove, mock_convert_pdf_to_png, mock_convert_eps_to_pdf): + @patch("vrdu.utils.convert_eps_image_to_pdf_image") + @patch("vrdu.utils.convert_pdf_figure_to_png_image") + @patch("os.remove") + def test_png_generation( + self, mock_os_remove, mock_convert_pdf_to_png, mock_convert_eps_to_pdf + ): # Mock os.remove to do nothing mock_os_remove.side_effect = lambda x: None From aafd06d244d3f05ca41c734a3802cf00cd45c606 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Mon, 24 Jun 2024 10:40:48 +0800 Subject: [PATCH 19/39] refactor(tests/): use meaningful file name --- tests/{test_folder.py => test_replace_figures_in_folders.py} | 0 tests/{test_extension.py => test_replace_figures_in_tex_file.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/{test_folder.py => test_replace_figures_in_folders.py} (100%) rename tests/{test_extension.py => test_replace_figures_in_tex_file.py} (100%) diff --git a/tests/test_folder.py b/tests/test_replace_figures_in_folders.py similarity index 100% rename from tests/test_folder.py rename to tests/test_replace_figures_in_folders.py diff --git a/tests/test_extension.py b/tests/test_replace_figures_in_tex_file.py similarity index 100% rename from tests/test_extension.py rename to tests/test_replace_figures_in_tex_file.py From 762aa2e9f95b16476426f2a81710f4a9e7bd67ff Mon Sep 17 00:00:00 2001 From: CHEN YANG <1402375027@qq.com> Date: Thu, 4 Jul 2024 10:51:31 +0800 Subject: [PATCH 20/39] feat(feat(render.py): add function to handle reference render): --- DocParser/vrdu/renderer.py | 157 +++++++++++++++++++++++++++++++++++++ 1 file changed, 157 insertions(+) diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py index 5737bc7..a9ac1b6 100644 --- a/DocParser/vrdu/renderer.py +++ b/DocParser/vrdu/renderer.py @@ -12,6 +12,10 @@ from DocParser.TexSoup.TexSoup import TexSoup import DocParser.TexSoup.app.conversion as conversion +import bibtexparser +from bibtexparser.bparser import BibTexParser +from bibtexparser.customization import convert_to_unicode + log = logger.get_logger(__name__) @@ -74,6 +78,7 @@ def render_all_env(self, color_tex: str) -> None: """ self.render_simple_envs(color_tex) self.render_float_envs(color_tex) + self.render_reference(color_tex) def render_simple_envs(self, color_tex: str) -> None: """Renders simple environments in a LaTeX file. @@ -445,8 +450,14 @@ def render_one_env(self, main_directory: str) -> None: Returns: None: This function does not return anything. """ + + # handle latex file color_tex_file = os.path.join(main_directory, "paper_colored.tex") white_tex_file = os.path.join(main_directory, "paper_white.tex") + + paper_bib_white = os.path.join(main_directory, "paper_bib_white.tex") + shutil.copyfile(color_tex_file, paper_bib_white) + self.modify_color_definitions(color_tex_file, white_tex_file) ordered_env_colors = self.get_env_orders(white_tex_file) suffix = "_color" @@ -470,6 +481,50 @@ def render_one_env(self, main_directory: str) -> None: with open(output_file, "w") as f: f.write(new_content) + # handle bib file + color_bib_file = os.path.join(main_directory, "bib_colored.bib") + white_bib_file = os.path.join(main_directory, "bib_white.bib") + self.modify_color_definitions(color_bib_file, white_bib_file) + ordered_env_colors = self.get_env_orders(white_bib_file) + index_map = defaultdict(int) + + with open(white_bib_file, "r") as f: + bib_content = f.read() + + for index, env_color in enumerate(ordered_env_colors): + env = env_color[: -len(suffix)] + # the first one is the color definition, skip it + bib_new_content = replace_nth( + bib_content, "{" + env_color + "}", r"{black}", index_map[env] + 2 + ) + + bib_output_file = os.path.join( + main_directory, + f"bib_{config.folder_prefix}_{str(index).zfill(5)}_{env}_{str(index_map[env]).zfill(5)}.bib", + ) + + # change the bib file name in paper_bib_white.tex + # \bibliographystyle{bib file name} + with open(paper_bib_white, "r") as f: + tex_content = f.read() + + bib_file_name = os.path.basename(bib_output_file).split(".")[0] + # tex_new_content = re.sub(r"\\bibliography\s*{\s*([^}]+)\s*}", f"\\bibliography{{{bib_file_name}}}", tex_content) + tex_new_content = re.sub(r"\\bibliography\s*{\s*([^}]+)\s*}", "\\\\bibliography{{{}}}".format(bib_file_name), tex_content) + + tex_output_file = os.path.join( + main_directory, + f"paper_{config.folder_prefix}_{str(index).zfill(5)}_{env}_{str(index_map[env]).zfill(5)}.tex", + ) + + index_map[env] += 1 + with open(bib_output_file, "w") as f: + f.write(bib_new_content) + + with open(tex_output_file, "w", encoding='utf-8') as f: + f.write(tex_new_content) + + def render_caption(self, tex_file: str) -> None: """Renders captions in a LaTeX file. @@ -616,6 +671,108 @@ def render_abstract(self, tex_file: str) -> None: with open(tex_file, "w") as f: f.write(result) + def render_reference(self, tex_file: str) -> None: + """ + Renders the reference section based on a BibTeX (.bib) file. + + Args: + tex_file (str): The path to the LaTex file. + + Returns: + None + """ + bib_pattern = r'\\bibliography\s*{\s*([^}]+)\s*}' + # Extract directory and filename from LaTeX file path + tex_dir, tex_filename = os.path.split(tex_file) + + # Extract BibTeX file path from LaTeX file + bib_file = None + with open(tex_file, 'r', encoding='utf-8') as tex_f: + tex_content = tex_f.read() + + # Search for bibliography command + match = re.search(bib_pattern, tex_content) + if match: + bib_filename = match.group(1) + '.bib' + bib_file = os.path.join(tex_dir, bib_filename) + + if not bib_file: + print("BibTeX file not found in the LaTeX file.") + return + main_directory = os.path.dirname(tex_file) + + # copy the original tex file + color_bib = os.path.join(main_directory, "bib_colored.bib") + white_bib = os.path.join(main_directory, "bib_white.bib") + shutil.copyfile(bib_file, color_bib) + shutil.copyfile(bib_file, white_bib) + + # Define colorize function inline + def colorize(text: str, category_name: str) -> str: + if category_name == "Reference": + # Define regex patterns + author_pattern = re.compile(r"\bauthor\s*=\s*[\{\"]") + note_pattern = re.compile(r"\bnote\s*=\s*[\{\"]") + + # Find the position of the author and year + author_match = author_pattern.search(text) + if author_match: + # Find the start of the author field + author_start = author_match.end() - 1 + author_end = text.find("}", author_start) + if author_end == -1: + author_end = text.find("\"", author_start) + if author_end == -1: + author_end = text.find("\"", author_start) + 1 + # Replace author field with colorized version + if author_end != -1: + text = text[:author_start + 1] + "\\color{Reference_color}" + text[author_start + 1:] + + note_match = note_pattern.search(text) + if note_match: + # Find the start of the year field + year_start = note_match.end() - 1 + year_end_1 = text.find("\"", year_start + 1) + year_end_2 = text.find("}", year_start + 1) + # find the before year_end + if year_end_1 != -1 and year_end_2 != -1: + year_end = min(year_end_1, year_end_2) + else: + year_end = max(year_end_1, year_end_2) + # Replace year field with black color + if year_end != -1: + text = text[:year_end] + "\\color{black}" + text[year_end:] + + else: + # Check if text ends with "}" + if text.endswith("}"): + # Check if the character before the last "}" is "," + if text[-2] == ",": + text = text[:-2] + ",note={\\color{black}}}" + else: + text = text[:-1] + ",note={\\color{black}}}" + + return text + + # Read BibTeX file + with open(color_bib, 'r', encoding='utf-8') as bib_f: + bibtex_entries = bib_f.readlines() + + # Colorize and format references in LaTeX format + colored_references = [] + for entry in bibtex_entries: + if entry.strip().startswith('@'): + formatted_entry = f"{entry.strip()}" + else: + formatted_entry = f" {entry.strip()}" + self.texts["Reference"].append(formatted_entry) + colored_ref = colorize(formatted_entry, "Reference") + colored_references.append(colored_ref) + # Write back to the BibTeX file + with open(color_bib, 'w', encoding='utf-8') as bib_f: + for ref in colored_references: + bib_f.write(ref + "\n") + def render_tabular(self, tex_file: str) -> None: """Renders tabular environments in a LaTeX file. From e0dbb67094d47311c4e7e6ba520cc4051fd7ae57 Mon Sep 17 00:00:00 2001 From: CHEN YANG <1402375027@qq.com> Date: Thu, 4 Jul 2024 10:54:54 +0800 Subject: [PATCH 21/39] refactor(utils.py): add bib compilation during compilation --- DocParser/vrdu/utils.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/DocParser/vrdu/utils.py b/DocParser/vrdu/utils.py index be6fa51..a5f300f 100755 --- a/DocParser/vrdu/utils.py +++ b/DocParser/vrdu/utils.py @@ -40,7 +40,7 @@ def load_json(file_path: str) -> Union[Dict, List]: def compile_latex(file: str) -> None: """ - Compile a LaTeX file using pdflatex engine. + Compile a LaTeX file using pdflatex and bibtex engines. Parameters: file (str): The path to the LaTeX file to be compiled. @@ -49,19 +49,38 @@ def compile_latex(file: str) -> None: None """ file_name = os.path.basename(file) + base_name, _ = os.path.splitext(file_name) + # First compilation with SyncTeX subprocess.run( ["pdflatex", "-interaction=nonstopmode", file_name], timeout=1000, stdout=subprocess.DEVNULL, ) + # Compile BibTeX if .aux file exists + if os.path.exists(base_name + ".aux"): + subprocess.run( + ["bibtex", base_name], + timeout=1000, + stdout=subprocess.DEVNULL, + ) + + # Second compilation to include bibliography subprocess.run( ["pdflatex", "-interaction=nonstopmode", file_name], timeout=1000, stdout=subprocess.DEVNULL, ) + # Third compilation to finalize references and SyncTeX + subprocess.run( + ["pdflatex", "-interaction=nonstopmode", file_name], + timeout=1000, + stdout=subprocess.DEVNULL, + ) + + # Additional compilation for specific file if file_name == "paper_colored.tex": subprocess.run( ["pdflatex", "-interaction=nonstopmode", "-synctex=1", file_name], @@ -69,7 +88,6 @@ def compile_latex(file: str) -> None: stdout=subprocess.DEVNULL, ) - def pdf2jpg(pdf_path: str, output_directory: str) -> None: """ Convert a PDF file into a series of jpg images. @@ -251,4 +269,5 @@ def colorize(text: str, category_name: str) -> str: if category_name == "Code": return "{\\color{" + color + "}" + text + "}" + raise NotImplementedError(f"Invalid category name: {category_name}") From 9b1bdb1362c1108938d9c94e6f08c0a51d652d86 Mon Sep 17 00:00:00 2001 From: CHEN YANG <1402375027@qq.com> Date: Thu, 4 Jul 2024 10:58:04 +0800 Subject: [PATCH 22/39] fix(main.py): fix folder already exists --- DocParser/main.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/DocParser/main.py b/DocParser/main.py index 2abedd8..299f13a 100644 --- a/DocParser/main.py +++ b/DocParser/main.py @@ -58,6 +58,10 @@ def remove_redundant_stuff(main_directory: str) -> None: for file in redundant_files: os.remove(file) + redundant_bib_files = glob.glob(f"{main_directory}/bib_*") + for file in redundant_bib_files: + os.remove(file) + # remove useless pdf and image files # TODO: move this name pattern into config redundant_folders = glob.glob( @@ -110,10 +114,23 @@ def process_one_file(file_name: str) -> None: cwd = os.getcwd() try: - # change the working directory to the main directory of the paper + # # change the working directory to the main directory of the paper + # os.chdir(main_directory) + # # create output folder + # os.makedirs(os.path.join(main_directory, "output/result")) + + # Save current working directory + cwd = os.getcwd() + + # Change the working directory to the main directory of the paper os.chdir(main_directory) - # create output folder - os.makedirs(os.path.join(main_directory, "output/result")) + + # Create output folder if it doesn't exist + output_folder = os.path.join(main_directory, "output/result") + if not os.path.exists(output_folder): + os.makedirs(output_folder) + else: + print(f"Output folder '{output_folder}' already exists.") # step 1: preprocess the paper preprocess.run(original_tex) From 497da712c3cd0dd97e0ec99f5eee4192fb2c5e51 Mon Sep 17 00:00:00 2001 From: CHEN YANG <1402375027@qq.com> Date: Thu, 4 Jul 2024 10:59:19 +0800 Subject: [PATCH 23/39] refactor(config.py): add reference category --- DocParser/vrdu/config/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/DocParser/vrdu/config/config.py b/DocParser/vrdu/config/config.py index 081f4ec..9028fba 100644 --- a/DocParser/vrdu/config/config.py +++ b/DocParser/vrdu/config/config.py @@ -79,6 +79,7 @@ "Equation", "Footnote", "List", + "Reference" ] From 201295638f33bc4ffe882c343c05dd4106a9892f Mon Sep 17 00:00:00 2001 From: CHEN YANG <1402375027@qq.com> Date: Mon, 8 Jul 2024 15:26:02 +0800 Subject: [PATCH 24/39] refactor(renderer.py): modify the method of handling Reference --- DocParser/vrdu/renderer.py | 48 +++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py index a9ac1b6..be5b9cd 100644 --- a/DocParser/vrdu/renderer.py +++ b/DocParser/vrdu/renderer.py @@ -416,6 +416,7 @@ def modify_color_definitions(self, input_file: str, output_file: str) -> None: r"\\definecolor{" + color_name + r"}{RGB}{255, 255, 255}", content, ) + content = re.sub(r"\\bibliography\s*{\s*([^}]+)\s*}", "\\\\bibliography{bib_white}", content) with open(output_file, "w") as file: file.write(content) @@ -450,14 +451,9 @@ def render_one_env(self, main_directory: str) -> None: Returns: None: This function does not return anything. """ - # handle latex file color_tex_file = os.path.join(main_directory, "paper_colored.tex") white_tex_file = os.path.join(main_directory, "paper_white.tex") - - paper_bib_white = os.path.join(main_directory, "paper_bib_white.tex") - shutil.copyfile(color_tex_file, paper_bib_white) - self.modify_color_definitions(color_tex_file, white_tex_file) ordered_env_colors = self.get_env_orders(white_tex_file) suffix = "_color" @@ -482,22 +478,24 @@ def render_one_env(self, main_directory: str) -> None: f.write(new_content) # handle bib file + paper_bib_white = os.path.join(main_directory, "paper_bib_white.tex") + shutil.copyfile(white_tex_file, paper_bib_white) color_bib_file = os.path.join(main_directory, "bib_colored.bib") white_bib_file = os.path.join(main_directory, "bib_white.bib") self.modify_color_definitions(color_bib_file, white_bib_file) ordered_env_colors = self.get_env_orders(white_bib_file) - index_map = defaultdict(int) with open(white_bib_file, "r") as f: - bib_content = f.read() + bib_content = f.read() + + index_map = defaultdict(int) for index, env_color in enumerate(ordered_env_colors): env = env_color[: -len(suffix)] - # the first one is the color definition, skip it bib_new_content = replace_nth( - bib_content, "{" + env_color + "}", r"{black}", index_map[env] + 2 + bib_content, "{" + env_color + "}", r"{black}", index_map[env] + 1 ) - + bib_new_content = bib_new_content.replace("{" + env_color + "}", "{white}") bib_output_file = os.path.join( main_directory, f"bib_{config.folder_prefix}_{str(index).zfill(5)}_{env}_{str(index_map[env]).zfill(5)}.bib", @@ -509,7 +507,6 @@ def render_one_env(self, main_directory: str) -> None: tex_content = f.read() bib_file_name = os.path.basename(bib_output_file).split(".")[0] - # tex_new_content = re.sub(r"\\bibliography\s*{\s*([^}]+)\s*}", f"\\bibliography{{{bib_file_name}}}", tex_content) tex_new_content = re.sub(r"\\bibliography\s*{\s*([^}]+)\s*}", "\\\\bibliography{{{}}}".format(bib_file_name), tex_content) tex_output_file = os.path.join( @@ -712,7 +709,7 @@ def colorize(text: str, category_name: str) -> str: if category_name == "Reference": # Define regex patterns author_pattern = re.compile(r"\bauthor\s*=\s*[\{\"]") - note_pattern = re.compile(r"\bnote\s*=\s*[\{\"]") + year_pattern = re.compile(r"\byear\s*=\s*[\{\"]") # Find the position of the author and year author_match = author_pattern.search(text) @@ -728,10 +725,10 @@ def colorize(text: str, category_name: str) -> str: if author_end != -1: text = text[:author_start + 1] + "\\color{Reference_color}" + text[author_start + 1:] - note_match = note_pattern.search(text) - if note_match: + year_match = year_pattern.search(text) + if year_match: # Find the start of the year field - year_start = note_match.end() - 1 + year_start = year_match.end() - 1 year_end_1 = text.find("\"", year_start + 1) year_end_2 = text.find("}", year_start + 1) # find the before year_end @@ -741,19 +738,18 @@ def colorize(text: str, category_name: str) -> str: year_end = max(year_end_1, year_end_2) # Replace year field with black color if year_end != -1: - text = text[:year_end] + "\\color{black}" + text[year_end:] - - else: - # Check if text ends with "}" - if text.endswith("}"): - # Check if the character before the last "}" is "," - if text[-2] == ",": - text = text[:-2] + ",note={\\color{black}}}" - else: - text = text[:-1] + ",note={\\color{black}}}" + text = text[:year_end] + "\\color{white}" + text[year_end:] return text + with open(white_bib, 'r') as bib_file: + bib_content = bib_file.read() + + # use bibtexparser to parse the bib file + bib_entries = re.findall(r'@.*?\{([^,]*),\n(.*?)[\n, \"]\}', bib_content, re.DOTALL) + for item in bib_entries: + self.texts["Reference"].append(item) + # Read BibTeX file with open(color_bib, 'r', encoding='utf-8') as bib_f: bibtex_entries = bib_f.readlines() @@ -765,7 +761,7 @@ def colorize(text: str, category_name: str) -> str: formatted_entry = f"{entry.strip()}" else: formatted_entry = f" {entry.strip()}" - self.texts["Reference"].append(formatted_entry) + # self.texts["Reference"].append(formatted_entry) colored_ref = colorize(formatted_entry, "Reference") colored_references.append(colored_ref) # Write back to the BibTeX file From d59eae1b8363578e7eaed15cee7165dabbc6ec52 Mon Sep 17 00:00:00 2001 From: CHEN YANG <1402375027@qq.com> Date: Mon, 8 Jul 2024 18:03:12 +0800 Subject: [PATCH 25/39] refactor(main.py): use unsrt as the default bibliography style --- DocParser/main.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/DocParser/main.py b/DocParser/main.py index 299f13a..18873a7 100644 --- a/DocParser/main.py +++ b/DocParser/main.py @@ -3,6 +3,7 @@ import os import shutil from tqdm import tqdm +import re from vrdu import logger @@ -90,6 +91,13 @@ def process_one_file(file_name: str) -> None: main_directory = os.path.dirname(file_name) log.info(f"[VRDU] file: {file_name}, start processing.") + # use unsrt as the default bibliography style + with open(file_name, "r") as file: + content = file.read() + content = re.sub(r"\\bibliographystyle\s*{\s*([^}]+)\s*}", "\\\\bibliographystyle{unsrt}", content) + with open(file_name, "w") as file: + file.write(content) + # check if this paper has been processed quality_report_file = os.path.join( main_directory, "output/result/quality_report.json" From 36af9cf8068d76f52ef407b69ec106fc9697c46b Mon Sep 17 00:00:00 2001 From: CHEN YANG <1402375027@qq.com> Date: Fri, 12 Jul 2024 09:50:55 +0800 Subject: [PATCH 26/39] feat(render.py): add a function for bib to get env orders --- DocParser/vrdu/renderer.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py index be5b9cd..5636ece 100644 --- a/DocParser/vrdu/renderer.py +++ b/DocParser/vrdu/renderer.py @@ -441,6 +441,27 @@ def get_env_orders(self, tex_file: str) -> List[str]: # the definitions are discarded return matches[len(colors) :] + + def get_bib_env_orders(self, tex_file: str) -> List[str]: + """Returns a list of environment orders based on the contents of the given `tex_file`. + + Args: + tex_file (str): The path to the .tex file. + + Returns: + List[str]: A list of environment orders. + """ + with open(tex_file) as f: + contents = f.read() + colors = list(config.name2color.values()) + matches = [] + + pattern = "|".join(rf"\b{re.escape(term)}\b" for term in colors) + for m in re.finditer(pattern, contents): + matches.append(m.group(0)) + + # the definitions are discarded + return matches def render_one_env(self, main_directory: str) -> None: """Render one environment by modifying the corresponding rendering color to black. @@ -483,10 +504,10 @@ def render_one_env(self, main_directory: str) -> None: color_bib_file = os.path.join(main_directory, "bib_colored.bib") white_bib_file = os.path.join(main_directory, "bib_white.bib") self.modify_color_definitions(color_bib_file, white_bib_file) - ordered_env_colors = self.get_env_orders(white_bib_file) + ordered_env_colors = self.get_bib_env_orders(white_bib_file) with open(white_bib_file, "r") as f: - bib_content = f.read() + bib_content = f.read() index_map = defaultdict(int) From 6bd14c134d351a5566e6dbc576090c8da10bf08c Mon Sep 17 00:00:00 2001 From: CHEN YANG <1402375027@qq.com> Date: Fri, 12 Jul 2024 09:51:51 +0800 Subject: [PATCH 27/39] refactor(render.py): modify the method color the author --- DocParser/vrdu/renderer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py index 5636ece..86906f7 100644 --- a/DocParser/vrdu/renderer.py +++ b/DocParser/vrdu/renderer.py @@ -738,13 +738,17 @@ def colorize(text: str, category_name: str) -> str: # Find the start of the author field author_start = author_match.end() - 1 author_end = text.find("}", author_start) + author_mid = text.find(",", author_start) if author_end == -1: author_end = text.find("\"", author_start) if author_end == -1: author_end = text.find("\"", author_start) + 1 # Replace author field with colorized version if author_end != -1: - text = text[:author_start + 1] + "\\color{Reference_color}" + text[author_start + 1:] + if author_mid != -1 and author_mid < author_end: + text = text[:author_mid] + ",\\color{Reference_color}" + text[author_mid + 1:] + else: + text = text[:author_start + 1] + "\\color{Reference_color}" + text[author_start + 1:] year_match = year_pattern.search(text) if year_match: From 6aafbb753e8744f2136b7e0be9632ec6fb9be828 Mon Sep 17 00:00:00 2001 From: CHEN YANG <1402375027@qq.com> Date: Fri, 12 Jul 2024 09:55:03 +0800 Subject: [PATCH 28/39] refactor(layout_annotation.py): add error catching mechanism --- DocParser/vrdu/layout_annotation.py | 172 +++++++++++++++------------- 1 file changed, 91 insertions(+), 81 deletions(-) diff --git a/DocParser/vrdu/layout_annotation.py b/DocParser/vrdu/layout_annotation.py index 31b7a1f..772d952 100644 --- a/DocParser/vrdu/layout_annotation.py +++ b/DocParser/vrdu/layout_annotation.py @@ -278,97 +278,107 @@ def generate_non_figure_bb(self) -> Dict[int, List[Block]]: log.debug(f"category: {category}, index: {index}") elements = [] - for image_pair in image_pairs: - page_index = image_pair[0] + try: + for image_pair in image_pairs: + page_index = image_pair[0] - image1_array = np.array(plt.imread(image_pair[1]), dtype=np.uint8) - image2_array = np.array(plt.imread(image_pair[2]), dtype=np.uint8) + image1_array = np.array(plt.imread(image_pair[1]), dtype=np.uint8) + image2_array = np.array(plt.imread(image_pair[2]), dtype=np.uint8) - diff_image = np.abs(image2_array - image1_array, dtype=np.uint8) - if np.all(diff_image == 0): - continue - labeled_image, num = label( - diff_image > config.threshold, return_num=True - ) - if num == 0: - continue - - regions = regionprops(labeled_image) - bounding_boxes = [region.bbox for region in regions] - - if len(bounding_boxes) == 0: - continue - - separations = self.layout_metadata[page_index]["separations"] - top_margin = self.layout_metadata[page_index]["top_margin"] - - # We do not consider the cross column case for these envs. - if category in envs.one_column_envs: - bounding_boxes = [bb for bb in bounding_boxes] - if len(bounding_boxes) == 0: + diff_image = np.abs(image2_array - image1_array, dtype=np.uint8) + if np.all(diff_image == 0): continue - element = Block( - bounding_box=BoundingBox.from_list(bounding_boxes), - source_code=self.text_info[category][index], - category=config.name2category[category], - page_index=page_index, + labeled_image, num = label( + diff_image > config.threshold, return_num=True ) - if elements: - element.parent_block = elements[-1].block_id - elements.append(element) - continue + if num == 0: + continue - # consider possible cross column case - for column in range(self.layout_metadata["num_columns"]): - # min_x: bb[1], min_y: bb[0], max_x: bb[4], max_y: bb[3] - column_boxes = [ - bb - for bb in bounding_boxes - if bb[1] >= separations[column] - and bb[1] <= separations[column + 1] - ] - if not column_boxes: + regions = regionprops(labeled_image) + bounding_boxes = [region.bbox for region in regions] + + if len(bounding_boxes) == 0: continue - element = Block( - bounding_box=BoundingBox.from_list(column_boxes), - source_code=self.text_info[category][index], - category=config.name2category[category], - page_index=page_index, - ) - if elements: - element.parent_block = elements[-1].block_id - - if ( - len(elements) > 0 - and elements[-1].category == element.category - and elements[-1].page_index == element.page_index - and elements[-1].source_code == element.source_code - and elements[-1].bbox.overlap(element.bbox) - ): - elements[-1].bbox = BoundingBox( - min( - elements[-1].bbox.x0, - element.bbox.x0, - ), - min( - elements[-1].bbox.y0, - element.bbox.y0, - ), - max( - elements[-1].bbox.x1, - element.bbox.x1, - ), - max( - elements[-1].bbox.y1, - element.bbox.y1, - ), + separations = self.layout_metadata[page_index]["separations"] + top_margin = self.layout_metadata[page_index]["top_margin"] + + # We do not consider the cross column case for these envs. + if category in envs.one_column_envs: + bounding_boxes = [bb for bb in bounding_boxes] + if len(bounding_boxes) == 0: + continue + element = Block( + bounding_box=BoundingBox.from_list(bounding_boxes), + source_code=self.text_info[category][index], + category=config.name2category[category], + page_index=page_index, ) + if elements: + element.parent_block = elements[-1].block_id + elements.append(element) continue - elements.append(element) - for element in elements: - layout_info[element.page_index].append(element) + # consider possible cross column case + for column in range(self.layout_metadata["num_columns"]): + try: + column_boxes = [ + bb + for bb in bounding_boxes + if bb[1] >= separations[column] + and bb[1] <= separations[column + 1] + ] + if not column_boxes: + continue + + element = Block( + bounding_box=BoundingBox.from_list(column_boxes), + source_code=self.text_info[category][index], + category=config.name2category[category], + page_index=page_index, + ) + if elements: + element.parent_block = elements[-1].block_id + + if ( + len(elements) > 0 + and elements[-1].category == element.category + and elements[-1].page_index == element.page_index + and elements[-1].source_code == element.source_code + and elements[-1].bbox.overlap(element.bbox) + ): + elements[-1].bbox = BoundingBox( + min( + elements[-1].bbox.x0, + element.bbox.x0, + ), + min( + elements[-1].bbox.y0, + element.bbox.y0, + ), + max( + elements[-1].bbox.x1, + element.bbox.x1, + ), + max( + elements[-1].bbox.y1, + element.bbox.y1, + ), + ) + continue + elements.append(element) + except IndexError: + log.error(f"IndexError: {column}") + continue # Skip processing for this column if index is out of range + + for element in elements: + layout_info[element.page_index].append(element) + + except Exception as e: + # Handle the exception as per your application's requirements + log.error(f"Error processing block directory {block_directory}: {str(e)}") + # Optionally, you can raise the exception to stop further processing + # raise return layout_info From bd3eb24d18005cd1b8923bdf1c629dcaa3da9922 Mon Sep 17 00:00:00 2001 From: CHEN YANG <1402375027@qq.com> Date: Fri, 12 Jul 2024 09:56:36 +0800 Subject: [PATCH 29/39] refactor(main.py): modify the method of change bibliographystyle --- DocParser/main.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/DocParser/main.py b/DocParser/main.py index 18873a7..9082faa 100644 --- a/DocParser/main.py +++ b/DocParser/main.py @@ -94,7 +94,11 @@ def process_one_file(file_name: str) -> None: # use unsrt as the default bibliography style with open(file_name, "r") as file: content = file.read() - content = re.sub(r"\\bibliographystyle\s*{\s*([^}]+)\s*}", "\\\\bibliographystyle{unsrt}", content) + # if cant find bibliographystyle, add it + if not re.search(r"\\bibliographystyle", content): + content = re.sub(r"\\end{document}", "\\\\bibliographystyle{unsrt}\n\\\\end{document}", content) + else: + content = re.sub(r"\\bibliographystyle\s*{\s*([^}]+)\s*}", "\\\\bibliographystyle{unsrt}", content) with open(file_name, "w") as file: file.write(content) @@ -169,11 +173,12 @@ def process_one_file(file_name: str) -> None: log.info(f"[VRDU] file: {original_tex}, successfully processed.") except Exception as e: - error_type = e.__class__.__name__ - error_info = str(e) - log.error( - f"[VRDU] file: {file_name}, type: {error_type}, message: {error_info}" - ) + # error_type = e.__class__.__name__ + # error_info = str(e) + # log.error( + # f"[VRDU] file: {file_name}, type: {error_type}, message: {error_info}" + # ) + raise e finally: # remove redundant files From 77d06dc94180b5502665ce540e1489785a5526ea Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Mon, 17 Jun 2024 11:29:25 +0800 Subject: [PATCH 30/39] refactor(layout_annotation.py): rm reading_annotation the reading annotation result is already contained in order annotation --- DocParser/vrdu/layout_annotation.py | 67 ----------------------------- 1 file changed, 67 deletions(-) diff --git a/DocParser/vrdu/layout_annotation.py b/DocParser/vrdu/layout_annotation.py index 772d952..6fbb8c2 100644 --- a/DocParser/vrdu/layout_annotation.py +++ b/DocParser/vrdu/layout_annotation.py @@ -409,66 +409,6 @@ def generate_layout_info(self) -> Dict[int, List[Block]]: layout_info[page_index].extend(figure_layout_info[page_index]) return layout_info - def generate_reading_annotation( - self, layout_info: Dict[int, List[Block]] - ) -> DefaultDict[str, List]: - """Generate a reading annotation based on the layout information. - - Args: - layout_info (Dict[int, List[Block]]): A dictionary containing the layout information - for each page index. The keys are the page indices and the values are lists of - `Block` objects representing the blocks on each page. - - Returns: - DefaultDict[str, List]: A defaultdict containing the reading annotation. The keys - of the defaultdict are the page indices and the values are lists of dictionaries - representing the reading annotation for each block on the page. Each dictionary - contains the following keys: - - "source_code": The source code of the block. - - "image_path": The path to the saved image of the block. - - "category": The category of the block. - - The defaultdict also contains the following keys: - - "categories": A list of dictionaries representing the categories. Each - dictionary contains the following keys: - - "id": The ID of the category. - - "name": The name of the category. - - "macros": A dictionary containing the macro definitions extracted from - the original tex file. - """ - reading_annotation = defaultdict(list) - - # sort all images by page index, see utils.pdf2jpg for details - image_files = sorted( - glob.glob(os.path.join(self.pdf_images_path, "*.jpg")), - key=lambda x: x[-6:-4], - ) - count = 0 - for page_index in layout_info.keys(): - page_image = Image.open(image_files[page_index]) - for block in layout_info[page_index]: - cropped_image = page_image.crop(block.bbox) - - image_name = config.folder_prefix + str(count).zfill(4) + ".jpg" - count += 1 - image_path = os.path.join(self.result_directory, image_name) - cropped_image.save(image_path) - reading_annotation[page_index].append( - { - "source_code": block.source_code, - "image_path": image_name, - "category": block.category, - } - ) - page_image.close() - - reading_annotation["categories"] = [ - {"id": index, "name": category} - for index, category, _ in config.config["category_name"] - ] - - return reading_annotation - def generate_image_annotation( self, layout_info: Dict[int, List[Block]] ) -> Dict[int, Dict[str, Any]]: @@ -546,13 +486,6 @@ def annotate(self): layout_info, image_annotation, file_path=layout_annotation_file ) - # step3: generate reading annotation - reading_annotation = self.generate_reading_annotation(layout_info) - reading_annotation_file = os.path.join( - self.result_directory, "reading_annotation.json" - ) - utils.export_to_json(reading_annotation, reading_annotation_file) - def get_image_pairs(dir1: str, dir2: str): """ From 39b37f9967a3486e636657dc98a9b784fe25a47b Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Mon, 17 Jun 2024 15:12:56 +0800 Subject: [PATCH 31/39] fix(main.py): make dirs twice --- DocParser/main.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/DocParser/main.py b/DocParser/main.py index 9082faa..09c6c34 100644 --- a/DocParser/main.py +++ b/DocParser/main.py @@ -119,10 +119,6 @@ def process_one_file(file_name: str) -> None: if os.path.exists(output_directory): shutil.rmtree(output_directory) - # output_directory stores the intermediate results - # result_directory stores the final results - os.makedirs(os.path.join(main_directory, "output/result")) - cwd = os.getcwd() try: From e6c750a845d18075b114d8af7eed64528ffeabf0 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Mon, 17 Jun 2024 15:26:58 +0800 Subject: [PATCH 32/39] refactor(renderer.py): merge logic of processing predefined color --- DocParser/vrdu/renderer.py | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py index 86906f7..638f056 100644 --- a/DocParser/vrdu/renderer.py +++ b/DocParser/vrdu/renderer.py @@ -367,30 +367,13 @@ def remove_hyperref_color(self, color_tex: str) -> None: if re.search(pattern, content[:preamble_loc]): content = content[:preamble_loc] + hyper_setup + content[preamble_loc:] - # Write the modified content back to the input file - with open(color_tex, "w") as file: - file.write(content) - - def remove_lstlisting_color(self, color_tex: str) -> None: - """Remove color definitions from a LaTeX file. - - Args: - color_tex (str): The path to the LaTeX file. - - Returns: - None - """ - # Read the content of the input file - with open(color_tex, "r") as file: - content = file.read() - - # delete the color definitions + # delete the lstlisting color definitions pattern = r"\\lstset\{.*?\}" - modified_content = re.sub(pattern, "", content) + content = re.sub(pattern, "", content) - # Write the modified content to the output file + # Write the modified content back to the input file with open(color_tex, "w") as file: - file.write(modified_content) + file.write(content) def modify_color_definitions(self, input_file: str, output_file: str) -> None: """Modify the pre-defined color definitions in the input file and write the modified content to the output file. From 4e46a5b566ff2e42c88d21bece090c6bc7268bef Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Mon, 17 Jun 2024 15:27:33 +0800 Subject: [PATCH 33/39] refactor(renderer.py, test/): use more meaningful name --- DocParser/vrdu/renderer.py | 7 +++---- ..._hyperref_color.py => test_remove_predefined_color.py} | 8 ++++---- 2 files changed, 7 insertions(+), 8 deletions(-) rename tests/{test_remove_hyperref_color.py => test_remove_predefined_color.py} (91%) diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py index 638f056..f7da7d9 100644 --- a/DocParser/vrdu/renderer.py +++ b/DocParser/vrdu/renderer.py @@ -55,8 +55,7 @@ def render(self, origin_tex: str) -> None: self.add_layout_definition(color_tex) # remove color definitions to prevent conflict - self.remove_hyperref_color(color_tex) - self.remove_lstlisting_color(color_tex) + self.remove_predefined_color(color_tex) self.render_all_env(color_tex) @@ -335,8 +334,8 @@ def add_layout_definition(self, color_tex: str) -> None: with open(color_tex, "w") as f: f.write(content) - def remove_hyperref_color(self, color_tex: str) -> None: - """Removes hyperref color settings from a LaTeX file. + def remove_predefined_color(self, color_tex: str) -> None: + """Removes hyperref and lstlisting color settings from a LaTeX file. Args: color_tex (str): The path to the LaTeX file to modify. diff --git a/tests/test_remove_hyperref_color.py b/tests/test_remove_predefined_color.py similarity index 91% rename from tests/test_remove_hyperref_color.py rename to tests/test_remove_predefined_color.py index 3b6a287..356f378 100644 --- a/tests/test_remove_hyperref_color.py +++ b/tests/test_remove_predefined_color.py @@ -21,7 +21,7 @@ def test1(self): new=unittest.mock.mock_open(read_data=self.mock_file_content1), create=True, ) as file_mock: - self.renderer.remove_hyperref_color(file_mock) + self.renderer.remove_predefined_color(file_mock) file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( """\\documentclass{article}\\begin{document}\\end{document}""" @@ -33,7 +33,7 @@ def test2(self): new=unittest.mock.mock_open(read_data=self.mock_file_content2), create=True, ) as file_mock: - self.renderer.remove_hyperref_color(file_mock) + self.renderer.remove_predefined_color(file_mock) file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( """\\documentclass{article}\\usepackage{hyperref}\\hypersetup{colorlinks=false}\n\\begin{document}\\end{document}""" @@ -45,7 +45,7 @@ def test3(self): new=unittest.mock.mock_open(read_data=self.mock_file_content3), create=True, ) as file_mock: - self.renderer.remove_hyperref_color(file_mock) + self.renderer.remove_predefined_color(file_mock) file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( """\\documentclass{article}\\usepackage[color_links=true]{hyperref}\\hypersetup{colorlinks=false}\n\\begin{document}\\end{document}""" @@ -57,7 +57,7 @@ def test4(self): new=unittest.mock.mock_open(read_data=self.mock_file_content4), create=True, ) as file_mock: - self.renderer.remove_hyperref_color(file_mock) + self.renderer.remove_predefined_color(file_mock) file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( """\\documentclass{article}\\usepackage[color_links=true]{hyperref}\\usepackage{amsmath}\\hypersetup{colorlinks=false}\n\\begin{document}\\end{document}""" From 98bed2fc19854f5683dad27f7d9e31cf670c6d02 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Mon, 15 Jul 2024 16:55:15 +0800 Subject: [PATCH 34/39] fix(all): Module DocParser not found --- DocParser/TexSoup/app/conversion.py | 8 ++++---- DocParser/vrdu/renderer.py | 10 +++++----- DocParser/vrdu/utils.py | 4 ++-- scripts/app.py | 4 ++-- scripts/arxiv_download.py | 2 +- scripts/batch_process.py | 4 ++-- scripts/export_to_dataset.py | 2 +- scripts/generate_reading_annotation.py | 4 ++-- scripts/retrieve_metadata.py | 4 ++-- scripts/visualize_order_annotations.py | 2 +- setup.py | 2 +- tests/test_add_definitions.py | 2 +- tests/test_extract_graphics.py | 2 +- tests/test_is_text_eq.py | 2 +- tests/test_remove_predefined_color.py | 2 +- tests/test_render_abstract.py | 2 +- tests/test_render_algorithm.py | 2 +- tests/test_render_caption.py | 2 +- tests/test_render_code.py | 6 +++--- tests/test_render_footnote.py | 2 +- tests/test_render_tabular.py | 2 +- tests/test_render_title.py | 2 +- 22 files changed, 36 insertions(+), 36 deletions(-) diff --git a/DocParser/TexSoup/app/conversion.py b/DocParser/TexSoup/app/conversion.py index 474c228..3ffe746 100644 --- a/DocParser/TexSoup/app/conversion.py +++ b/DocParser/TexSoup/app/conversion.py @@ -1,11 +1,11 @@ import re -from DocParser.TexSoup.TexSoup import TexSoup -from DocParser.TexSoup.TexSoup.data import TexEnv, TexText, TexCmd, TexGroup +from TexSoup.TexSoup import TexSoup +from TexSoup.TexSoup.data import TexEnv, TexText, TexCmd, TexGroup -from DocParser.vrdu import logger -from DocParser.vrdu.config import envs +from vrdu import logger +from vrdu.config import envs log = logger.get_logger(__name__) diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py index f7da7d9..9e7a408 100644 --- a/DocParser/vrdu/renderer.py +++ b/DocParser/vrdu/renderer.py @@ -5,12 +5,12 @@ import re -import DocParser.vrdu.utils as utils -import DocParser.vrdu.logger as logger -from DocParser.vrdu.config import config, envs +import vrdu.utils as utils +import vrdu.logger as logger +from vrdu.config import config, envs -from DocParser.TexSoup.TexSoup import TexSoup -import DocParser.TexSoup.app.conversion as conversion +from TexSoup.TexSoup import TexSoup +import TexSoup.app.conversion as conversion import bibtexparser from bibtexparser.bparser import BibTexParser diff --git a/DocParser/vrdu/utils.py b/DocParser/vrdu/utils.py index a5f300f..a4d1e9f 100755 --- a/DocParser/vrdu/utils.py +++ b/DocParser/vrdu/utils.py @@ -7,8 +7,8 @@ from pdf2image import pdf2image from pdf2image import generators -from DocParser.vrdu.block import Block -from DocParser.vrdu.config import config +from vrdu.block import Block +from vrdu.config import config def export_to_json(data: Union[Dict, List], file_path: str) -> None: diff --git a/scripts/app.py b/scripts/app.py index 54b4a1c..549d682 100644 --- a/scripts/app.py +++ b/scripts/app.py @@ -3,8 +3,8 @@ import glob from PIL import Image, ImageDraw -from DocParser.vrdu import utils -from DocParser.vrdu.config import config +from vrdu import utils +from vrdu.config import config pn.extension() diff --git a/scripts/arxiv_download.py b/scripts/arxiv_download.py index 971f779..c7c9e10 100644 --- a/scripts/arxiv_download.py +++ b/scripts/arxiv_download.py @@ -5,7 +5,7 @@ import tarfile -from DocParser.vrdu import logger +from vrdu import logger log = logger.setup_app_level_logger(logger_name="arxiv_download.log") diff --git a/scripts/batch_process.py b/scripts/batch_process.py index 78dbe8d..e357ac4 100644 --- a/scripts/batch_process.py +++ b/scripts/batch_process.py @@ -5,8 +5,8 @@ from typing import List import pandas as pd -from DocParser.vrdu import logger -from DocParser.main import process_one_file +from vrdu import logger +from main import process_one_file log = logger.setup_app_level_logger(file_name="batch_process.log", level="INFO") diff --git a/scripts/export_to_dataset.py b/scripts/export_to_dataset.py index f8c41d8..fafb3d2 100644 --- a/scripts/export_to_dataset.py +++ b/scripts/export_to_dataset.py @@ -6,7 +6,7 @@ import pandas as pd import multiprocessing -from DocParser.vrdu import logger +from vrdu import logger log = logger.setup_app_level_logger(file_name="export_to_dataset.log") diff --git a/scripts/generate_reading_annotation.py b/scripts/generate_reading_annotation.py index f098d64..a4104b7 100644 --- a/scripts/generate_reading_annotation.py +++ b/scripts/generate_reading_annotation.py @@ -4,8 +4,8 @@ import os from pathlib import Path -from DocParser.vrdu import utils -from DocParser.vrdu import logger +from vrdu import utils +from vrdu import logger log = logger.setup_app_level_logger(file_name="generate_reading_annotation.log") diff --git a/scripts/retrieve_metadata.py b/scripts/retrieve_metadata.py index 6897c67..bf97df2 100644 --- a/scripts/retrieve_metadata.py +++ b/scripts/retrieve_metadata.py @@ -6,8 +6,8 @@ import argparse -from DocParser.vrdu import utils -from DocParser.vrdu import logger +from vrdu import utils +from vrdu import logger log = logger.setup_app_level_logger(file_name="retrieve_metadata.log") diff --git a/scripts/visualize_order_annotations.py b/scripts/visualize_order_annotations.py index b59b365..2f5bc5b 100644 --- a/scripts/visualize_order_annotations.py +++ b/scripts/visualize_order_annotations.py @@ -7,7 +7,7 @@ from PIL import Image, ImageDraw from matplotlib import pyplot as plt -from DocParser.vrdu import utils +from vrdu import utils def draw_arrow_line( diff --git a/setup.py b/setup.py index ad473aa..ba3749e 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages setup( - name="vrdu_data_process", + name="DocParser", version="1.0.0", description="process the academic papers with .tex source files", author="Mao Song", diff --git a/tests/test_add_definitions.py b/tests/test_add_definitions.py index 096ca65..f3ca221 100644 --- a/tests/test_add_definitions.py +++ b/tests/test_add_definitions.py @@ -1,7 +1,7 @@ import unittest import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer def test_add_color_definition1(): diff --git a/tests/test_extract_graphics.py b/tests/test_extract_graphics.py index 14a2cd5..8335db3 100644 --- a/tests/test_extract_graphics.py +++ b/tests/test_extract_graphics.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestGraphics(unittest.TestCase): diff --git a/tests/test_is_text_eq.py b/tests/test_is_text_eq.py index 3baa280..6426411 100644 --- a/tests/test_is_text_eq.py +++ b/tests/test_is_text_eq.py @@ -1,6 +1,6 @@ import unittest -from DocParser.vrdu.renderer import is_text_eq +from vrdu.renderer import is_text_eq class TestTextEq(unittest.TestCase): diff --git a/tests/test_remove_predefined_color.py b/tests/test_remove_predefined_color.py index 356f378..fdc1b34 100644 --- a/tests/test_remove_predefined_color.py +++ b/tests/test_remove_predefined_color.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestHyperref(unittest.TestCase): diff --git a/tests/test_render_abstract.py b/tests/test_render_abstract.py index 405f6da..16f2cb9 100644 --- a/tests/test_render_abstract.py +++ b/tests/test_render_abstract.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestAbstract(unittest.TestCase): diff --git a/tests/test_render_algorithm.py b/tests/test_render_algorithm.py index a4cf6ad..c15821e 100644 --- a/tests/test_render_algorithm.py +++ b/tests/test_render_algorithm.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestAlgorithm(unittest.TestCase): diff --git a/tests/test_render_caption.py b/tests/test_render_caption.py index b526f60..eb21de8 100644 --- a/tests/test_render_caption.py +++ b/tests/test_render_caption.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestCaption(unittest.TestCase): diff --git a/tests/test_render_code.py b/tests/test_render_code.py index 55082de..79dae23 100644 --- a/tests/test_render_code.py +++ b/tests/test_render_code.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestCode(unittest.TestCase): @@ -71,7 +71,7 @@ def test_no_lstset(self): new=unittest.mock.mock_open(read_data=self.mock_file_content1), create=True, ) as file_mock: - self.renderer.remove_lstlisting_color(file_mock) + self.renderer.remove_predefined_color(file_mock) file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( """\\documentclass{article}\\begin{document}\\end{document}""" @@ -83,7 +83,7 @@ def test_remove_lstset(self): new=unittest.mock.mock_open(read_data=self.mock_file_content5), create=True, ) as file_mock: - self.renderer.remove_lstlisting_color(file_mock) + self.renderer.remove_predefined_color(file_mock) file_mock.assert_called_with(file_mock, "w") file_mock().write.assert_called_with( r"""\documentclass{article}\n\usepackage{listings}\n\usepackage{xcolor}\n\n\definecolor{codegreen}{rgb}{0,0.6,0}\n\definecolor{codegray}{rgb}{0.5,0.5,0.5}\n\definecolor{codepurple}{rgb}{0.58,0,0.82}\n\definecolor{backcolour}{rgb}{0.95,0.95,0.92}\n\n\lstdefinestyle{mystyle}{\n backgroundcolor=\color{backcolour}, \n commentstyle=\color{codegreen},\n keywordstyle=\color{magenta},\n numberstyle=\tiny\color{codegray},\n stringstyle=\color{codepurple},\n basicstyle=\ttfamily\footnotesize,\n breakatwhitespace=false, \n breaklines=true, \n captionpos=b, \n keepspaces=true, \n numbers=left, \n numbersep=5pt, \n showspaces=false, \n showstringspaces=false,\n showtabs=false, \n tabsize=2\n}\n\n\n\n\begin{document}\nThe next code will be directly imported from a file\n\n\lstinputlisting[language=Octave]{BitXorMatrix.m}\n\end{document}""" diff --git a/tests/test_render_footnote.py b/tests/test_render_footnote.py index e81e0fd..e0fcebd 100644 --- a/tests/test_render_footnote.py +++ b/tests/test_render_footnote.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestFootnote(unittest.TestCase): diff --git a/tests/test_render_tabular.py b/tests/test_render_tabular.py index e57f363..7cb1e52 100644 --- a/tests/test_render_tabular.py +++ b/tests/test_render_tabular.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestTabular(unittest.TestCase): diff --git a/tests/test_render_title.py b/tests/test_render_title.py index 122063b..343714e 100644 --- a/tests/test_render_title.py +++ b/tests/test_render_title.py @@ -2,7 +2,7 @@ import unittest.mock -from DocParser.vrdu.renderer import Renderer +from vrdu.renderer import Renderer class TestTitle(unittest.TestCase): From e7f903dcea4ed381a3b5a46a9665ecdbbf7e94a6 Mon Sep 17 00:00:00 2001 From: CHEN YANG <1402375027@qq.com> Date: Thu, 20 Jun 2024 11:12:27 +0800 Subject: [PATCH 35/39] refactor(preprocess.py): use a more robust to replace figure --- vrdu/preprocess.py | 161 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 vrdu/preprocess.py diff --git a/vrdu/preprocess.py b/vrdu/preprocess.py new file mode 100644 index 0000000..a3dc264 --- /dev/null +++ b/vrdu/preprocess.py @@ -0,0 +1,161 @@ +import os +import re + +from arxiv_cleaner.cleaner import Cleaner + +from vrdu.config import envs, config +from vrdu import utils +import vrdu.logger as logger + + +log = logger.get_logger(__name__) + + +def remove_comments(original_tex: str) -> None: + """ + Removes comments from a TeX file. + + Args: + original_tex (str): The path to the original TeX file. + + Returns: + None + """ + with open(original_tex, "r") as file: + content = file.read() + + # Remove LaTeX comments + pattern = r"\\begin{comment}(.*?)\\end{comment}" + removed_comments = re.sub(pattern, "", content, flags=re.DOTALL) + + with open(original_tex, "w") as file: + file.write(removed_comments) + + +def clean_tex(original_tex: str) -> None: + """ + Clean the given TeX file by creating a cleaner object and running the clean method. + + Args: + original_tex (str): The path to the original TeX file. + + Returns: + None + """ + main_directory = os.path.dirname(original_tex) + tex = os.path.basename(original_tex) + + # Create the cleaner + cleaner = Cleaner( + input_dir=main_directory, + output_dir=main_directory, + tex=tex, + command_options=config.command_options, + verbose=False, + ) + + # Run the cleaner + cleaner.clean() + + # remove comments + remove_comments(original_tex) + + +def replace_figures_extension_with_png(original_tex: str) -> None: + """ + Replaces PDF, ps, eps figures' extension with PNG in a TeX file + to support pdfminer detecting bounding box. + + Args: + original_tex (str): The path to the original TeX file. + + Returns: + None: This function does not return anything. + """ + main_directory = os.path.dirname(original_tex) + image_extensions = [".eps", ".ps", ".jpg", ".jpeg", ".png", ".pdf"] + image_files = {} + for root, _, files in os.walk(main_directory): + for file in files: + if any(file.endswith(ext) for ext in image_extensions): + image_name, ext = os.path.splitext(file) + # Store the relative path of the image as the value + image_files[image_name] = os.path.relpath(os.path.join(root, file), main_directory) + + with open(original_tex, 'r') as f: + content = f.read() + + # Replace \psfig and \epsfig commands with \includegraphics command + def custom_replace(match): + options = match.group(1) or '' + filepath = match.group(2) + if options: + return f"\\includegraphics[{options}]{{{filepath}}}" + else: + return f"\\includegraphics{{{filepath}}}" + + content = re.sub(r"\\psfig(?:\[(.*?)\])?{(.+?)}", custom_replace, content) + content = re.sub(r"\\epsfig(?:\[(.*?)\])?{(.+?)}", custom_replace, content) + + # Traverse the image_files dictionary to update file extensions + for image_name, file_path in image_files.items(): + base_name, current_extension = os.path.splitext(image_name) + correct_extension = os.path.splitext(file_path)[1] + + if correct_extension not in ['.jpg', '.jpeg']: + correct_extension = '.png' + + # Build a regular expression to match image files including optional extensions + pattern = re.compile(r'(\\includegraphics(?:\[[^\]]*\])?\{.*?' + re.escape(base_name) + r')(\.\w+)?\}') + replacement = rf'\1{correct_extension}}}' + content = pattern.sub(replacement, content) + + # Write the updated content back to the file + with open(original_tex, 'w') as f: + f.write(content) + + +def delete_table_of_contents(original_tex: str) -> None: + """ + Deletes the table of contents from the given original_tex file. + This includes table of contents, list of figures, list of tables, and list of algorithms. + + Parameters: + original_tex (str): The path to the original .tex file. + + Returns: + None + """ + with open(original_tex, "r") as file: + latex_content = file.read() + + pattern = r"\\(" + "|".join(envs.table_of_contents) + r")" + modified_content = re.sub(pattern, "", latex_content) + + with open(original_tex, "w") as file: + file.write(modified_content) + + +def run(original_tex: str) -> None: + """ + Generates a modified version of the given LaTeX document by performing the following steps: + + Step 0: Clean the LaTeX document with arxiv_cleaner package. + Step 1: Replace EPS figures with PDF to make the LaTeX document compilable with pdflatex. + Step 2: Replace PDF figures with PNG to make pdfminer work. + Step 3: Delete the table of contents from the LaTeX document. + + Args: + original_tex (str): The original LaTeX document. + + Returns: + None + """ + # Step 0: clean tex + clean_tex(original_tex) + + # Step 1: process images + replace_figures_extension_with_png(original_tex) + + # Step 3: delete table of contents + delete_table_of_contents(original_tex) From 8b80b23c1083c9d57c9a3d121e28b8f83a4b2f8c Mon Sep 17 00:00:00 2001 From: CHEN YANG <1402375027@qq.com> Date: Thu, 4 Jul 2024 10:51:31 +0800 Subject: [PATCH 36/39] feat(feat(render.py): add function to handle reference render): --- DocParser/vrdu/renderer.py | 52 +++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py index 9e7a408..49b3b39 100644 --- a/DocParser/vrdu/renderer.py +++ b/DocParser/vrdu/renderer.py @@ -457,6 +457,10 @@ def render_one_env(self, main_directory: str) -> None: # handle latex file color_tex_file = os.path.join(main_directory, "paper_colored.tex") white_tex_file = os.path.join(main_directory, "paper_white.tex") + + paper_bib_white = os.path.join(main_directory, "paper_bib_white.tex") + shutil.copyfile(color_tex_file, paper_bib_white) + self.modify_color_definitions(color_tex_file, white_tex_file) ordered_env_colors = self.get_env_orders(white_tex_file) suffix = "_color" @@ -481,24 +485,22 @@ def render_one_env(self, main_directory: str) -> None: f.write(new_content) # handle bib file - paper_bib_white = os.path.join(main_directory, "paper_bib_white.tex") - shutil.copyfile(white_tex_file, paper_bib_white) color_bib_file = os.path.join(main_directory, "bib_colored.bib") white_bib_file = os.path.join(main_directory, "bib_white.bib") self.modify_color_definitions(color_bib_file, white_bib_file) - ordered_env_colors = self.get_bib_env_orders(white_bib_file) + ordered_env_colors = self.get_env_orders(white_bib_file) + index_map = defaultdict(int) with open(white_bib_file, "r") as f: bib_content = f.read() - - index_map = defaultdict(int) for index, env_color in enumerate(ordered_env_colors): env = env_color[: -len(suffix)] + # the first one is the color definition, skip it bib_new_content = replace_nth( - bib_content, "{" + env_color + "}", r"{black}", index_map[env] + 1 + bib_content, "{" + env_color + "}", r"{black}", index_map[env] + 2 ) - bib_new_content = bib_new_content.replace("{" + env_color + "}", "{white}") + bib_output_file = os.path.join( main_directory, f"bib_{config.folder_prefix}_{str(index).zfill(5)}_{env}_{str(index_map[env]).zfill(5)}.bib", @@ -510,6 +512,7 @@ def render_one_env(self, main_directory: str) -> None: tex_content = f.read() bib_file_name = os.path.basename(bib_output_file).split(".")[0] + # tex_new_content = re.sub(r"\\bibliography\s*{\s*([^}]+)\s*}", f"\\bibliography{{{bib_file_name}}}", tex_content) tex_new_content = re.sub(r"\\bibliography\s*{\s*([^}]+)\s*}", "\\\\bibliography{{{}}}".format(bib_file_name), tex_content) tex_output_file = os.path.join( @@ -712,7 +715,7 @@ def colorize(text: str, category_name: str) -> str: if category_name == "Reference": # Define regex patterns author_pattern = re.compile(r"\bauthor\s*=\s*[\{\"]") - year_pattern = re.compile(r"\byear\s*=\s*[\{\"]") + note_pattern = re.compile(r"\bnote\s*=\s*[\{\"]") # Find the position of the author and year author_match = author_pattern.search(text) @@ -720,22 +723,18 @@ def colorize(text: str, category_name: str) -> str: # Find the start of the author field author_start = author_match.end() - 1 author_end = text.find("}", author_start) - author_mid = text.find(",", author_start) if author_end == -1: author_end = text.find("\"", author_start) if author_end == -1: author_end = text.find("\"", author_start) + 1 # Replace author field with colorized version if author_end != -1: - if author_mid != -1 and author_mid < author_end: - text = text[:author_mid] + ",\\color{Reference_color}" + text[author_mid + 1:] - else: - text = text[:author_start + 1] + "\\color{Reference_color}" + text[author_start + 1:] + text = text[:author_start + 1] + "\\color{Reference_color}" + text[author_start + 1:] - year_match = year_pattern.search(text) - if year_match: + note_match = note_pattern.search(text) + if note_match: # Find the start of the year field - year_start = year_match.end() - 1 + year_start = note_match.end() - 1 year_end_1 = text.find("\"", year_start + 1) year_end_2 = text.find("}", year_start + 1) # find the before year_end @@ -745,18 +744,19 @@ def colorize(text: str, category_name: str) -> str: year_end = max(year_end_1, year_end_2) # Replace year field with black color if year_end != -1: - text = text[:year_end] + "\\color{white}" + text[year_end:] + text = text[:year_end] + "\\color{black}" + text[year_end:] + + else: + # Check if text ends with "}" + if text.endswith("}"): + # Check if the character before the last "}" is "," + if text[-2] == ",": + text = text[:-2] + ",note={\\color{black}}}" + else: + text = text[:-1] + ",note={\\color{black}}}" return text - with open(white_bib, 'r') as bib_file: - bib_content = bib_file.read() - - # use bibtexparser to parse the bib file - bib_entries = re.findall(r'@.*?\{([^,]*),\n(.*?)[\n, \"]\}', bib_content, re.DOTALL) - for item in bib_entries: - self.texts["Reference"].append(item) - # Read BibTeX file with open(color_bib, 'r', encoding='utf-8') as bib_f: bibtex_entries = bib_f.readlines() @@ -768,7 +768,7 @@ def colorize(text: str, category_name: str) -> str: formatted_entry = f"{entry.strip()}" else: formatted_entry = f" {entry.strip()}" - # self.texts["Reference"].append(formatted_entry) + self.texts["Reference"].append(formatted_entry) colored_ref = colorize(formatted_entry, "Reference") colored_references.append(colored_ref) # Write back to the BibTeX file From 05a3b5f42c6adf3f611c1d728258473099225e5e Mon Sep 17 00:00:00 2001 From: CHEN YANG <1402375027@qq.com> Date: Tue, 16 Jul 2024 11:21:01 +0800 Subject: [PATCH 37/39] feat(preprocess.py): add function remove vskip --- DocParser/vrdu/preprocess.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/DocParser/vrdu/preprocess.py b/DocParser/vrdu/preprocess.py index 4d02e9c..9e825bf 100644 --- a/DocParser/vrdu/preprocess.py +++ b/DocParser/vrdu/preprocess.py @@ -32,6 +32,25 @@ def remove_comments(original_tex: str) -> None: with open(original_tex, "w") as file: file.write(removed_comments) +def remove_skip(original_tex: str) -> None: + """ + Removes skip from a TeX file. + + Args: + original_tex (str): The path to the original TeX file. + + Returns: + None + """ + with open(original_tex, "r") as file: + content = file.read() + + pattern = r"\\vskip .*|\\vspace{.*}|\\vglue .*" + removed_skip = re.sub(pattern, '', content) + + with open(original_tex, "w") as file: + file.write(removed_skip) + def clean_tex(original_tex: str) -> None: """ @@ -61,6 +80,9 @@ def clean_tex(original_tex: str) -> None: # remove comments remove_comments(original_tex) + # remove skip + remove_skip(original_tex) + def replace_non_png_jpg_figures(original_tex: str) -> None: """ From d693f4735e51eb7e25106e2a1721fe1b86a79fa8 Mon Sep 17 00:00:00 2001 From: CHEN YANG <1402375027@qq.com> Date: Tue, 16 Jul 2024 11:26:11 +0800 Subject: [PATCH 38/39] refactor(render.py): modify the method handle bib_file --- DocParser/vrdu/renderer.py | 92 +++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 40 deletions(-) diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py index 49b3b39..88c26d9 100644 --- a/DocParser/vrdu/renderer.py +++ b/DocParser/vrdu/renderer.py @@ -12,13 +12,8 @@ from TexSoup.TexSoup import TexSoup import TexSoup.app.conversion as conversion -import bibtexparser -from bibtexparser.bparser import BibTexParser -from bibtexparser.customization import convert_to_unicode - log = logger.get_logger(__name__) - class Renderer: def __init__(self) -> None: self.texts = defaultdict(list) @@ -55,7 +50,8 @@ def render(self, origin_tex: str) -> None: self.add_layout_definition(color_tex) # remove color definitions to prevent conflict - self.remove_predefined_color(color_tex) + self.remove_hyperref_color(color_tex) + self.remove_lstlisting_color(color_tex) self.render_all_env(color_tex) @@ -334,8 +330,8 @@ def add_layout_definition(self, color_tex: str) -> None: with open(color_tex, "w") as f: f.write(content) - def remove_predefined_color(self, color_tex: str) -> None: - """Removes hyperref and lstlisting color settings from a LaTeX file. + def remove_hyperref_color(self, color_tex: str) -> None: + """Removes hyperref color settings from a LaTeX file. Args: color_tex (str): The path to the LaTeX file to modify. @@ -366,14 +362,31 @@ def remove_predefined_color(self, color_tex: str) -> None: if re.search(pattern, content[:preamble_loc]): content = content[:preamble_loc] + hyper_setup + content[preamble_loc:] - # delete the lstlisting color definitions - pattern = r"\\lstset\{.*?\}" - content = re.sub(pattern, "", content) - # Write the modified content back to the input file with open(color_tex, "w") as file: file.write(content) + def remove_lstlisting_color(self, color_tex: str) -> None: + """Remove color definitions from a LaTeX file. + + Args: + color_tex (str): The path to the LaTeX file. + + Returns: + None + """ + # Read the content of the input file + with open(color_tex, "r") as file: + content = file.read() + + # delete the color definitions + pattern = r"\\lstset\{.*?\}" + modified_content = re.sub(pattern, "", content) + + # Write the modified content to the output file + with open(color_tex, "w") as file: + file.write(modified_content) + def modify_color_definitions(self, input_file: str, output_file: str) -> None: """Modify the pre-defined color definitions in the input file and write the modified content to the output file. @@ -457,10 +470,6 @@ def render_one_env(self, main_directory: str) -> None: # handle latex file color_tex_file = os.path.join(main_directory, "paper_colored.tex") white_tex_file = os.path.join(main_directory, "paper_white.tex") - - paper_bib_white = os.path.join(main_directory, "paper_bib_white.tex") - shutil.copyfile(color_tex_file, paper_bib_white) - self.modify_color_definitions(color_tex_file, white_tex_file) ordered_env_colors = self.get_env_orders(white_tex_file) suffix = "_color" @@ -485,34 +494,35 @@ def render_one_env(self, main_directory: str) -> None: f.write(new_content) # handle bib file + paper_bib_white = os.path.join(main_directory, "paper_bib_white.tex") + shutil.copyfile(white_tex_file, paper_bib_white) color_bib_file = os.path.join(main_directory, "bib_colored.bib") white_bib_file = os.path.join(main_directory, "bib_white.bib") self.modify_color_definitions(color_bib_file, white_bib_file) - ordered_env_colors = self.get_env_orders(white_bib_file) - index_map = defaultdict(int) + ordered_env_colors = self.get_bib_env_orders(white_bib_file) + # print(ordered_env_colors) with open(white_bib_file, "r") as f: bib_content = f.read() + + index_map = defaultdict(int) for index, env_color in enumerate(ordered_env_colors): env = env_color[: -len(suffix)] - # the first one is the color definition, skip it bib_new_content = replace_nth( - bib_content, "{" + env_color + "}", r"{black}", index_map[env] + 2 + bib_content, "{" + env_color + "}", r"{black}", index_map[env] + 1 ) - + bib_new_content = bib_new_content.replace("{" + env_color + "}", "{white}") bib_output_file = os.path.join( main_directory, f"bib_{config.folder_prefix}_{str(index).zfill(5)}_{env}_{str(index_map[env]).zfill(5)}.bib", ) # change the bib file name in paper_bib_white.tex - # \bibliographystyle{bib file name} with open(paper_bib_white, "r") as f: tex_content = f.read() bib_file_name = os.path.basename(bib_output_file).split(".")[0] - # tex_new_content = re.sub(r"\\bibliography\s*{\s*([^}]+)\s*}", f"\\bibliography{{{bib_file_name}}}", tex_content) tex_new_content = re.sub(r"\\bibliography\s*{\s*([^}]+)\s*}", "\\\\bibliography{{{}}}".format(bib_file_name), tex_content) tex_output_file = os.path.join( @@ -715,7 +725,7 @@ def colorize(text: str, category_name: str) -> str: if category_name == "Reference": # Define regex patterns author_pattern = re.compile(r"\bauthor\s*=\s*[\{\"]") - note_pattern = re.compile(r"\bnote\s*=\s*[\{\"]") + year_pattern = re.compile(r"\byear\s*=\s*[\{\"]") # Find the position of the author and year author_match = author_pattern.search(text) @@ -723,18 +733,22 @@ def colorize(text: str, category_name: str) -> str: # Find the start of the author field author_start = author_match.end() - 1 author_end = text.find("}", author_start) + author_mid = text.find(",", author_start) if author_end == -1: author_end = text.find("\"", author_start) if author_end == -1: author_end = text.find("\"", author_start) + 1 # Replace author field with colorized version if author_end != -1: - text = text[:author_start + 1] + "\\color{Reference_color}" + text[author_start + 1:] - - note_match = note_pattern.search(text) - if note_match: + if author_mid != -1 and author_mid < author_end: + text = text[:author_mid] + ",\\color{Reference_color}" + text[author_mid + 1:] + else: + text = text[:author_start + 1] + "\\color{Reference_color}" + text[author_start + 1:] + + year_match = year_pattern.search(text) + if year_match: # Find the start of the year field - year_start = note_match.end() - 1 + year_start = year_match.end() - 1 year_end_1 = text.find("\"", year_start + 1) year_end_2 = text.find("}", year_start + 1) # find the before year_end @@ -744,19 +758,18 @@ def colorize(text: str, category_name: str) -> str: year_end = max(year_end_1, year_end_2) # Replace year field with black color if year_end != -1: - text = text[:year_end] + "\\color{black}" + text[year_end:] - - else: - # Check if text ends with "}" - if text.endswith("}"): - # Check if the character before the last "}" is "," - if text[-2] == ",": - text = text[:-2] + ",note={\\color{black}}}" - else: - text = text[:-1] + ",note={\\color{black}}}" + text = text[:year_end] + "\\color{white}" + text[year_end:] return text + with open(white_bib, 'r') as bib_file: + bib_content = bib_file.read() + + # use bibtexparser to parse the bib file + bib_entries = re.findall(r'@.*?\{([^,]*),\n(.*?)[\n, \"]\}', bib_content, re.DOTALL) + for item in bib_entries: + self.texts["Reference"].append(item) + # Read BibTeX file with open(color_bib, 'r', encoding='utf-8') as bib_f: bibtex_entries = bib_f.readlines() @@ -768,7 +781,6 @@ def colorize(text: str, category_name: str) -> str: formatted_entry = f"{entry.strip()}" else: formatted_entry = f" {entry.strip()}" - self.texts["Reference"].append(formatted_entry) colored_ref = colorize(formatted_entry, "Reference") colored_references.append(colored_ref) # Write back to the BibTeX file From 16a2cf8f920c6abb23a7a7c94ee56888b4544c44 Mon Sep 17 00:00:00 2001 From: MaoSong2022 Date: Wed, 17 Jul 2024 14:59:18 +0800 Subject: [PATCH 39/39] refactor(vrdu/): remove redundant folders --- vrdu/preprocess.py | 161 --------------------------------------------- 1 file changed, 161 deletions(-) delete mode 100644 vrdu/preprocess.py diff --git a/vrdu/preprocess.py b/vrdu/preprocess.py deleted file mode 100644 index a3dc264..0000000 --- a/vrdu/preprocess.py +++ /dev/null @@ -1,161 +0,0 @@ -import os -import re - -from arxiv_cleaner.cleaner import Cleaner - -from vrdu.config import envs, config -from vrdu import utils -import vrdu.logger as logger - - -log = logger.get_logger(__name__) - - -def remove_comments(original_tex: str) -> None: - """ - Removes comments from a TeX file. - - Args: - original_tex (str): The path to the original TeX file. - - Returns: - None - """ - with open(original_tex, "r") as file: - content = file.read() - - # Remove LaTeX comments - pattern = r"\\begin{comment}(.*?)\\end{comment}" - removed_comments = re.sub(pattern, "", content, flags=re.DOTALL) - - with open(original_tex, "w") as file: - file.write(removed_comments) - - -def clean_tex(original_tex: str) -> None: - """ - Clean the given TeX file by creating a cleaner object and running the clean method. - - Args: - original_tex (str): The path to the original TeX file. - - Returns: - None - """ - main_directory = os.path.dirname(original_tex) - tex = os.path.basename(original_tex) - - # Create the cleaner - cleaner = Cleaner( - input_dir=main_directory, - output_dir=main_directory, - tex=tex, - command_options=config.command_options, - verbose=False, - ) - - # Run the cleaner - cleaner.clean() - - # remove comments - remove_comments(original_tex) - - -def replace_figures_extension_with_png(original_tex: str) -> None: - """ - Replaces PDF, ps, eps figures' extension with PNG in a TeX file - to support pdfminer detecting bounding box. - - Args: - original_tex (str): The path to the original TeX file. - - Returns: - None: This function does not return anything. - """ - main_directory = os.path.dirname(original_tex) - image_extensions = [".eps", ".ps", ".jpg", ".jpeg", ".png", ".pdf"] - image_files = {} - for root, _, files in os.walk(main_directory): - for file in files: - if any(file.endswith(ext) for ext in image_extensions): - image_name, ext = os.path.splitext(file) - # Store the relative path of the image as the value - image_files[image_name] = os.path.relpath(os.path.join(root, file), main_directory) - - with open(original_tex, 'r') as f: - content = f.read() - - # Replace \psfig and \epsfig commands with \includegraphics command - def custom_replace(match): - options = match.group(1) or '' - filepath = match.group(2) - if options: - return f"\\includegraphics[{options}]{{{filepath}}}" - else: - return f"\\includegraphics{{{filepath}}}" - - content = re.sub(r"\\psfig(?:\[(.*?)\])?{(.+?)}", custom_replace, content) - content = re.sub(r"\\epsfig(?:\[(.*?)\])?{(.+?)}", custom_replace, content) - - # Traverse the image_files dictionary to update file extensions - for image_name, file_path in image_files.items(): - base_name, current_extension = os.path.splitext(image_name) - correct_extension = os.path.splitext(file_path)[1] - - if correct_extension not in ['.jpg', '.jpeg']: - correct_extension = '.png' - - # Build a regular expression to match image files including optional extensions - pattern = re.compile(r'(\\includegraphics(?:\[[^\]]*\])?\{.*?' + re.escape(base_name) + r')(\.\w+)?\}') - replacement = rf'\1{correct_extension}}}' - content = pattern.sub(replacement, content) - - # Write the updated content back to the file - with open(original_tex, 'w') as f: - f.write(content) - - -def delete_table_of_contents(original_tex: str) -> None: - """ - Deletes the table of contents from the given original_tex file. - This includes table of contents, list of figures, list of tables, and list of algorithms. - - Parameters: - original_tex (str): The path to the original .tex file. - - Returns: - None - """ - with open(original_tex, "r") as file: - latex_content = file.read() - - pattern = r"\\(" + "|".join(envs.table_of_contents) + r")" - modified_content = re.sub(pattern, "", latex_content) - - with open(original_tex, "w") as file: - file.write(modified_content) - - -def run(original_tex: str) -> None: - """ - Generates a modified version of the given LaTeX document by performing the following steps: - - Step 0: Clean the LaTeX document with arxiv_cleaner package. - Step 1: Replace EPS figures with PDF to make the LaTeX document compilable with pdflatex. - Step 2: Replace PDF figures with PNG to make pdfminer work. - Step 3: Delete the table of contents from the LaTeX document. - - Args: - original_tex (str): The original LaTeX document. - - Returns: - None - """ - # Step 0: clean tex - clean_tex(original_tex) - - # Step 1: process images - replace_figures_extension_with_png(original_tex) - - # Step 3: delete table of contents - delete_table_of_contents(original_tex)