From 4e8c728760cbe24091c7bbb52c6bcc97b48f3910 Mon Sep 17 00:00:00 2001
From: CHEN YANG <1402375027@qq.com>
Date: Thu, 20 Jun 2024 11:12:27 +0800
Subject: [PATCH 01/39] refactor(preprocess.py): use a more robust way to
 replace figures

---
 vrdu/preprocess.py | 111 +++++++++++++++++----------------------------
 1 file changed, 41 insertions(+), 70 deletions(-)

diff --git a/vrdu/preprocess.py b/vrdu/preprocess.py
index f4f4003..a3dc264 100644
--- a/vrdu/preprocess.py
+++ b/vrdu/preprocess.py
@@ -61,9 +61,9 @@ def clean_tex(original_tex: str) -> None:
     remove_comments(original_tex)
 
 
-def replace_pdf_ps_figures_with_png(original_tex: str) -> None:
+def replace_figures_extension_with_png(original_tex: str) -> None:
     """
-    Replaces PDF, ps, eps figures with PNG figures in a TeX file
+    Replaces PDF, ps, eps figures' extension with PNG in a TeX file
     to support pdfminer detecting bounding box.
 
     Args:
@@ -71,76 +71,47 @@ def replace_pdf_ps_figures_with_png(original_tex: str) -> None:
 
     Returns:
         None: This function does not return anything.
-
-    Raises:
-        FileNotFoundError: If a PDF file specified in the TeX file is not found.
     """
-
-    # FIXME: use more robust way, since the path to images may not exists.
     main_directory = os.path.dirname(original_tex)
-    with open(original_tex) as f:
+    image_extensions = [".eps", ".ps", ".jpg", ".jpeg", ".png", ".pdf"]
+    image_files = {}
+    for root, _, files in os.walk(main_directory):
+        for file in files:
+            if any(file.endswith(ext) for ext in image_extensions):
+                image_name, ext = os.path.splitext(file)
+                # Store the relative path of the image as the value
+                image_files[image_name] = os.path.relpath(os.path.join(root, file), main_directory)
+
+    with open(original_tex, 'r') as f:
         content = f.read()
 
-    graphicspath_pattern = r"\\graphicspath\{\{(.+?)}"
-    match = re.search(graphicspath_pattern, content, re.DOTALL)
-    if match:
-        graphic_path = match.group(1)
-    else:
-        graphic_path = ""
-
-    # Replace \psfig{...} with \includegraphics{...}
-    content = re.sub(r"\\psfig{([^}]*)}", r"\\includegraphics{\1}", content)
-
-    # Replace \epsfig{...} with \includegraphics{...}
-    content = re.sub(r"\\epsfig{([^}]*)}", r"\\includegraphics{\1}", content)
-
-    # Regular expression pattern to match \includegraphics
-    # commands with PDF files
-    pattern = r"\\includegraphics(\[.*?\])?\{(.*?)\}"
-
-    # Find all matches of \includegraphics with PDF files
-    matches = re.findall(pattern, content)
-
-    # Replace PDF paths with PNG paths
-    ext_patterns = [".eps", ".ps", ".jpg", ".jpeg", ".png", ".pdf"]
-    for match in matches:
-        image_name = match[1]
-        if not any(ext in image_name for ext in ext_patterns):
-            for ext in ext_patterns:
-                image_file = os.path.join(main_directory, graphic_path, image_name, ext)
-                if os.path.exists(image_file):
-                    image_name = image_name + ext
-                    break
-
-        # detectable image type, see pdfminer.six for details
-        if any(ext in image_name for ext in [".jpg", ".jpeg", "png"]):
-            content = content.replace(match[1], image_name)
-            continue
-
-        # convert eps to pdf
-        if any(ext in image_name for ext in [".eps", ".ps"]):
-            eps_image = os.path.join(main_directory, graphic_path, image_name)
-            if not os.path.exists(eps_image):
-                log.error(f"File not found: {eps_image}")
-                continue
-            pdf_image = os.path.splitext(eps_image)[0] + ".pdf"
-            utils.convert_eps_image_to_pdf_image(eps_image, pdf_image)
-            image_name = os.path.basename(pdf_image)
-
-        # convert pdf to png
-        if image_name.endswith(".pdf"):
-            pdf_image = os.path.join(main_directory, graphic_path, image_name)
-            if not os.path.exists(pdf_image):
-                log.error(f"File not found: {pdf_image}")
-                continue
-            png_image = os.path.splitext(pdf_image)[0] + ".png"
-            utils.convert_pdf_figure_to_png_image(pdf_image, png_image)
-            image_name = os.path.splitext(image_name)[0] + ".png"
-
-        # replace the reference in tex file
-        content = content.replace(match[1], image_name)
-
-    with open(original_tex, "w") as f:
+    # Replace \psfig and \epsfig commands with \includegraphics command
+    def custom_replace(match):
+        options = match.group(1) or ''
+        filepath = match.group(2)
+        if options:
+            return f"\\includegraphics[{options}]{{{filepath}}}"
+        else:
+            return f"\\includegraphics{{{filepath}}}"
+
+    content = re.sub(r"\\psfig(?:\[(.*?)\])?{(.+?)}", custom_replace, content)
+    content = re.sub(r"\\epsfig(?:\[(.*?)\])?{(.+?)}", custom_replace, content)
+
+    # Traverse the image_files dictionary to update file extensions
+    for image_name, file_path in image_files.items():
+        base_name, current_extension = os.path.splitext(image_name)
+        correct_extension = os.path.splitext(file_path)[1]
+
+        if correct_extension not in ['.jpg', '.jpeg']:
+            correct_extension = '.png'
+
+        # Build a regular expression to match image files including optional extensions
+        pattern = re.compile(r'(\\includegraphics(?:\[[^\]]*\])?\{.*?' + re.escape(base_name) + r')(\.\w+)?\}')
+        replacement = rf'\1{correct_extension}}}'
+        content = pattern.sub(replacement, content)
+
+    # Write the updated content back to the file
+    with open(original_tex, 'w') as f:
         f.write(content)
 
 
@@ -183,8 +154,8 @@ def run(original_tex: str) -> None:
     # Step 0: clean tex
     clean_tex(original_tex)
 
-    # Step 2: process images
-    replace_pdf_ps_figures_with_png(original_tex)
+    # Step 1: process images
+    replace_figures_extension_with_png(original_tex)
 
     # Step 3: delete table of contents
     delete_table_of_contents(original_tex)

From 278d6644714b9106771d796fc8afcbaefc4f35f7 Mon Sep 17 00:00:00 2001
From: CHEN YANG <1402375027@qq.com>
Date: Thu, 20 Jun 2024 11:18:02 +0800
Subject: [PATCH 02/39] feat(preprocess.py): generate png figures

---
 vrdu/preprocess.py | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/vrdu/preprocess.py b/vrdu/preprocess.py
index a3dc264..d776b4d 100644
--- a/vrdu/preprocess.py
+++ b/vrdu/preprocess.py
@@ -115,6 +115,41 @@ def custom_replace(match):
         f.write(content)
 
 
+
+def generate_png_figure(original_tex: str) -> None:
+    """
+    Generate PNG figures for PDF, ps, eps figures.
+
+    Args:
+        original_tex (str): The path to the original TeX file.
+
+    Returns:
+        None: This function does not return anything.
+    """
+    main_directory = os.path.dirname(original_tex)
+    image_extensions = [".eps", ".ps", ".jpg", ".jpeg", ".png", ".pdf"]
+    image_files = {}
+    for root, _, files in os.walk(main_directory):
+        for file in files:
+            if any(file.endswith(ext) for ext in image_extensions):
+                image_name, ext = os.path.splitext(file)
+                image_files[image_name] = os.path.join(root, file)
+
+    for image_name, file_path in image_files.items():
+        if file_path.endswith(".eps") or file_path.endswith(".ps"):
+            output_png = os.path.join(os.path.dirname(file_path), image_name + ".png")
+            temp_pdf = os.path.join(os.path.dirname(file_path), image_name + ".pdf")
+            # convert eps to pdf
+            utils.convert_eps_image_to_pdf_image(file_path, temp_pdf)
+            # convert pdf to png
+            utils.convert_pdf_figure_to_png_image(temp_pdf, output_png)
+        elif file_path.endswith(".pdf"):
+            output_png = os.path.join(os.path.dirname(file_path), image_name + ".png")
+            # convert pdf to png
+            utils.convert_pdf_figure_to_png_image(file_path, output_png)
+
+
+
 def delete_table_of_contents(original_tex: str) -> None:
     """
     Deletes the table of contents from the given original_tex file.
@@ -157,5 +192,8 @@ def run(original_tex: str) -> None:
     # Step 1: process images
     replace_figures_extension_with_png(original_tex)
 
+    # Step 2: generate png figures
+    generate_png_figure(original_tex)
+
     # Step 3: delete table of contents
     delete_table_of_contents(original_tex)

From 4293732b914286721bd4c35b1465f2e0ba802016 Mon Sep 17 00:00:00 2001
From: CHEN YANG <1402375027@qq.com>
Date: Thu, 20 Jun 2024 11:19:36 +0800
Subject: [PATCH 03/39] test(test_extension.py): add test for
 replace_figures_extension_with_png

---
 tests/test_extension.py | 60 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 tests/test_extension.py

diff --git a/tests/test_extension.py b/tests/test_extension.py
new file mode 100644
index 0000000..243fc3c
--- /dev/null
+++ b/tests/test_extension.py
@@ -0,0 +1,60 @@
+import unittest
+import os
+import unittest.mock
+
+
+from replace_figure_extension import replace_figures_extension_with_png
+
+
+class TestAbstract(unittest.TestCase):
+    def setUp(self) -> None:
+        
+        # 测试环境的设置，包括创建测试文件夹和文件
+        self.test_dir = 'test_directory'
+        self.original_tex = os.path.join(self.test_dir, 'test.tex')
+        os.makedirs(self.test_dir, exist_ok=True)
+        with open(self.original_tex, 'w') as f:
+            f.write(r'''
+                    \\begin{figure}[ht]
+                    \\centerline{\\includegraphics[width=\\columnwidth]{figures/time_vs_dimension.pdf}}
+                    \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/iterate_constraint.jpg}} 
+                    \\subfigure[]{\\epsfig{figures/iterate_error.eps}} 
+                    \\subfigure[]{\\psfig[width=0.48\\columnwidth]{figures/time_constraint.ps}}
+                    \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/iterate_correct.png}}
+                    \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{figures/time_error}}
+                    \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{figures/time_error_1}}
+                    \\label{fig:iteration_information}
+                    ''')
+
+        # 模拟图片文件
+        self.image_files = [
+            'time_vs_dimension.pdf', 'iterate_constraint.jpg', 'iterate_error.eps', 'time_constraint.ps', 'iterate_correct.png', 'time_error.pdf', 'time_error_1.jpeg'
+        ]
+        for file_name in self.image_files:
+            with open(os.path.join(self.test_dir, file_name), 'w') as f:
+                f.write('dummy content')
+
+    def tearDown(self):
+        # 清理测试创建的文件和目录
+        for root, dirs, files in os.walk(self.test_dir, topdown=False):
+            for name in files:
+                os.remove(os.path.join(root, name))
+            for name in dirs:
+                os.rmdir(os.path.join(root, name))
+        os.rmdir(self.test_dir)
+
+    def test(self):
+        replace_figures_extension_with_png(self.original_tex)
+        with open(self.original_tex, 'r') as f:
+            content = f.read()
+        self.assertEqual(content, r'''
+                    \\begin{figure}[ht]
+                    \\centerline{\\includegraphics[width=\\columnwidth]{figures/time_vs_dimension.png}}
+                    \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/iterate_constraint.jpg}} 
+                    \\subfigure[]{\\includegraphics{figures/iterate_error.png}} 
+                    \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/time_constraint.png}}
+                    \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/iterate_correct.png}}
+                    \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{figures/time_error.png}}
+                    \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{figures/time_error_1.jpeg}}
+                    \\label{fig:iteration_information}
+                    ''')

From 94b64554e5642253aebee2f30e08755b291d776a Mon Sep 17 00:00:00 2001
From: CHEN YANG <1402375027@qq.com>
Date: Thu, 20 Jun 2024 11:20:05 +0800
Subject: [PATCH 04/39] test(test_folder.py): add test for generate_png_figure

---
 tests/test_folder.py | 50 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 tests/test_folder.py

diff --git a/tests/test_folder.py b/tests/test_folder.py
new file mode 100644
index 0000000..40386e0
--- /dev/null
+++ b/tests/test_folder.py
@@ -0,0 +1,50 @@
+import unittest
+import os
+from unittest.mock import patch, MagicMock
+from generate_figure import generate_png_figure
+
+class TestGeneratePngFigure(unittest.TestCase):
+    def setUp(self):
+        # 设置测试环境，模拟有各种类型文件的目录
+        self.test_dir = 'test_directory_1'
+        self.original_tex = os.path.join(self.test_dir, 'test.tex')
+        os.makedirs(self.test_dir, exist_ok=True)
+        self.image_files = [
+            'image1.eps', 'image2.ps', 'image3.jpg', 'image4.jpeg', 'image5.png', 'image6.pdf'
+        ]
+        for file_name in self.image_files:
+            with open(os.path.join(self.test_dir, file_name), 'w') as f:
+                f.write('dummy content')
+
+    def tearDown(self):
+        # 清理测试创建的文件和目录
+        for root, dirs, files in os.walk(self.test_dir, topdown=False):
+            for name in files:
+                os.remove(os.path.join(root, name))
+            for name in dirs:
+                os.rmdir(os.path.join(root, name))
+        os.rmdir(self.test_dir)
+
+    @patch('vrdu.utils.convert_eps_image_to_pdf_image')
+    @patch('vrdu.utils.convert_pdf_figure_to_png_image')
+    def test_png_generation(self, mock_convert_pdf_to_png, mock_convert_eps_to_pdf):
+        generate_png_figure(self.original_tex)
+
+        # 检查文件生成情况
+        expected_files = [
+            'image1.eps', 'image2.ps', 'image3.jpg', 'image4.jpeg', 'image5.png', 'image6.pdf', 
+            'image1.png', 'image2.png', 'image6.png'
+        ]
+        # 获取当前目录下所有文件
+        generated_files = os.listdir(self.test_dir)
+    
+
+        # 目前模拟的测试环境中，无法真的生成文件,导致expected_files和generated_files不一致
+
+        # print("Expected Files:", expected_files)
+        # print("Generated Files:", generated_files)
+        # self.assertCountEqual(expected_files, generated_files)
+
+        # 检查函数调用
+        self.assertEqual(mock_convert_eps_to_pdf.call_count, 2)  # 对于两个EPS/PS文件的调用
+        self.assertEqual(mock_convert_pdf_to_png.call_count, 3)  # 对于三个PDF文件的调用
\ No newline at end of file

From d174257cb6ae63eed4274230d6c82e4229f92ee0 Mon Sep 17 00:00:00 2001
From: CHEN YANG <1402375027@qq.com>
Date: Thu, 20 Jun 2024 11:12:27 +0800
Subject: [PATCH 05/39] refactor(preprocess.py): use a more robust way to
 replace figures

---
 DocParser/vrdu/preprocess.py | 111 +++++++++++++----------------------
 1 file changed, 41 insertions(+), 70 deletions(-)

diff --git a/DocParser/vrdu/preprocess.py b/DocParser/vrdu/preprocess.py
index f4f4003..a3dc264 100644
--- a/DocParser/vrdu/preprocess.py
+++ b/DocParser/vrdu/preprocess.py
@@ -61,9 +61,9 @@ def clean_tex(original_tex: str) -> None:
     remove_comments(original_tex)
 
 
-def replace_pdf_ps_figures_with_png(original_tex: str) -> None:
+def replace_figures_extension_with_png(original_tex: str) -> None:
     """
-    Replaces PDF, ps, eps figures with PNG figures in a TeX file
+    Replaces PDF, ps, eps figures' extension with PNG in a TeX file
     to support pdfminer detecting bounding box.
 
     Args:
@@ -71,76 +71,47 @@ def replace_pdf_ps_figures_with_png(original_tex: str) -> None:
 
     Returns:
         None: This function does not return anything.
-
-    Raises:
-        FileNotFoundError: If a PDF file specified in the TeX file is not found.
     """
-
-    # FIXME: use more robust way, since the path to images may not exists.
     main_directory = os.path.dirname(original_tex)
-    with open(original_tex) as f:
+    image_extensions = [".eps", ".ps", ".jpg", ".jpeg", ".png", ".pdf"]
+    image_files = {}
+    for root, _, files in os.walk(main_directory):
+        for file in files:
+            if any(file.endswith(ext) for ext in image_extensions):
+                image_name, ext = os.path.splitext(file)
+                # Store the relative path of the image as the value
+                image_files[image_name] = os.path.relpath(os.path.join(root, file), main_directory)
+
+    with open(original_tex, 'r') as f:
         content = f.read()
 
-    graphicspath_pattern = r"\\graphicspath\{\{(.+?)}"
-    match = re.search(graphicspath_pattern, content, re.DOTALL)
-    if match:
-        graphic_path = match.group(1)
-    else:
-        graphic_path = ""
-
-    # Replace \psfig{...} with \includegraphics{...}
-    content = re.sub(r"\\psfig{([^}]*)}", r"\\includegraphics{\1}", content)
-
-    # Replace \epsfig{...} with \includegraphics{...}
-    content = re.sub(r"\\epsfig{([^}]*)}", r"\\includegraphics{\1}", content)
-
-    # Regular expression pattern to match \includegraphics
-    # commands with PDF files
-    pattern = r"\\includegraphics(\[.*?\])?\{(.*?)\}"
-
-    # Find all matches of \includegraphics with PDF files
-    matches = re.findall(pattern, content)
-
-    # Replace PDF paths with PNG paths
-    ext_patterns = [".eps", ".ps", ".jpg", ".jpeg", ".png", ".pdf"]
-    for match in matches:
-        image_name = match[1]
-        if not any(ext in image_name for ext in ext_patterns):
-            for ext in ext_patterns:
-                image_file = os.path.join(main_directory, graphic_path, image_name, ext)
-                if os.path.exists(image_file):
-                    image_name = image_name + ext
-                    break
-
-        # detectable image type, see pdfminer.six for details
-        if any(ext in image_name for ext in [".jpg", ".jpeg", "png"]):
-            content = content.replace(match[1], image_name)
-            continue
-
-        # convert eps to pdf
-        if any(ext in image_name for ext in [".eps", ".ps"]):
-            eps_image = os.path.join(main_directory, graphic_path, image_name)
-            if not os.path.exists(eps_image):
-                log.error(f"File not found: {eps_image}")
-                continue
-            pdf_image = os.path.splitext(eps_image)[0] + ".pdf"
-            utils.convert_eps_image_to_pdf_image(eps_image, pdf_image)
-            image_name = os.path.basename(pdf_image)
-
-        # convert pdf to png
-        if image_name.endswith(".pdf"):
-            pdf_image = os.path.join(main_directory, graphic_path, image_name)
-            if not os.path.exists(pdf_image):
-                log.error(f"File not found: {pdf_image}")
-                continue
-            png_image = os.path.splitext(pdf_image)[0] + ".png"
-            utils.convert_pdf_figure_to_png_image(pdf_image, png_image)
-            image_name = os.path.splitext(image_name)[0] + ".png"
-
-        # replace the reference in tex file
-        content = content.replace(match[1], image_name)
-
-    with open(original_tex, "w") as f:
+    # Replace \psfig and \epsfig commands with \includegraphics command
+    def custom_replace(match):
+        options = match.group(1) or ''
+        filepath = match.group(2)
+        if options:
+            return f"\\includegraphics[{options}]{{{filepath}}}"
+        else:
+            return f"\\includegraphics{{{filepath}}}"
+
+    content = re.sub(r"\\psfig(?:\[(.*?)\])?{(.+?)}", custom_replace, content)
+    content = re.sub(r"\\epsfig(?:\[(.*?)\])?{(.+?)}", custom_replace, content)
+
+    # Traverse the image_files dictionary to update file extensions
+    for image_name, file_path in image_files.items():
+        base_name, current_extension = os.path.splitext(image_name)
+        correct_extension = os.path.splitext(file_path)[1]
+
+        if correct_extension not in ['.jpg', '.jpeg']:
+            correct_extension = '.png'
+
+        # Build a regular expression to match image files including optional extensions
+        pattern = re.compile(r'(\\includegraphics(?:\[[^\]]*\])?\{.*?' + re.escape(base_name) + r')(\.\w+)?\}')
+        replacement = rf'\1{correct_extension}}}'
+        content = pattern.sub(replacement, content)
+
+    # Write the updated content back to the file
+    with open(original_tex, 'w') as f:
         f.write(content)
 
 
@@ -183,8 +154,8 @@ def run(original_tex: str) -> None:
     # Step 0: clean tex
     clean_tex(original_tex)
 
-    # Step 2: process images
-    replace_pdf_ps_figures_with_png(original_tex)
+    # Step 1: process images
+    replace_figures_extension_with_png(original_tex)
 
     # Step 3: delete table of contents
     delete_table_of_contents(original_tex)

From 7467858a08b012483fed8a61d596021329382e43 Mon Sep 17 00:00:00 2001
From: CHEN YANG <1402375027@qq.com>
Date: Thu, 20 Jun 2024 11:18:02 +0800
Subject: [PATCH 06/39] feat(preprocess.py): generate png figures

---
 DocParser/vrdu/preprocess.py | 38 ++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/DocParser/vrdu/preprocess.py b/DocParser/vrdu/preprocess.py
index a3dc264..d776b4d 100644
--- a/DocParser/vrdu/preprocess.py
+++ b/DocParser/vrdu/preprocess.py
@@ -115,6 +115,41 @@ def custom_replace(match):
         f.write(content)
 
 
+
+def generate_png_figure(original_tex: str) -> None:
+    """
+    Generate PNG figures for PDF, ps, eps figures.
+
+    Args:
+        original_tex (str): The path to the original TeX file.
+
+    Returns:
+        None: This function does not return anything.
+    """
+    main_directory = os.path.dirname(original_tex)
+    image_extensions = [".eps", ".ps", ".jpg", ".jpeg", ".png", ".pdf"]
+    image_files = {}
+    for root, _, files in os.walk(main_directory):
+        for file in files:
+            if any(file.endswith(ext) for ext in image_extensions):
+                image_name, ext = os.path.splitext(file)
+                image_files[image_name] = os.path.join(root, file)
+
+    for image_name, file_path in image_files.items():
+        if file_path.endswith(".eps") or file_path.endswith(".ps"):
+            output_png = os.path.join(os.path.dirname(file_path), image_name + ".png")
+            temp_pdf = os.path.join(os.path.dirname(file_path), image_name + ".pdf")
+            # convert eps to pdf
+            utils.convert_eps_image_to_pdf_image(file_path, temp_pdf)
+            # convert pdf to png
+            utils.convert_pdf_figure_to_png_image(temp_pdf, output_png)
+        elif file_path.endswith(".pdf"):
+            output_png = os.path.join(os.path.dirname(file_path), image_name + ".png")
+            # convert pdf to png
+            utils.convert_pdf_figure_to_png_image(file_path, output_png)
+
+
+
 def delete_table_of_contents(original_tex: str) -> None:
     """
     Deletes the table of contents from the given original_tex file.
@@ -157,5 +192,8 @@ def run(original_tex: str) -> None:
     # Step 1: process images
     replace_figures_extension_with_png(original_tex)
 
+    # Step 2: generate png figures
+    generate_png_figure(original_tex)
+
     # Step 3: delete table of contents
     delete_table_of_contents(original_tex)

From afa40757f1c2319d99ade9e6d59a373c25d8c565 Mon Sep 17 00:00:00 2001
From: CHEN YANG <1402375027@qq.com>
Date: Thu, 20 Jun 2024 11:19:36 +0800
Subject: [PATCH 07/39] test(test_extension.py): add test for
 replace_figures_extension_with_png

---
 tests/test_extension.py | 60 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 tests/test_extension.py

diff --git a/tests/test_extension.py b/tests/test_extension.py
new file mode 100644
index 0000000..243fc3c
--- /dev/null
+++ b/tests/test_extension.py
@@ -0,0 +1,60 @@
+import unittest
+import os
+import unittest.mock
+
+
+from replace_figure_extension import replace_figures_extension_with_png
+
+
+class TestAbstract(unittest.TestCase):
+    def setUp(self) -> None:
+        
+        # 测试环境的设置，包括创建测试文件夹和文件
+        self.test_dir = 'test_directory'
+        self.original_tex = os.path.join(self.test_dir, 'test.tex')
+        os.makedirs(self.test_dir, exist_ok=True)
+        with open(self.original_tex, 'w') as f:
+            f.write(r'''
+                    \\begin{figure}[ht]
+                    \\centerline{\\includegraphics[width=\\columnwidth]{figures/time_vs_dimension.pdf}}
+                    \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/iterate_constraint.jpg}} 
+                    \\subfigure[]{\\epsfig{figures/iterate_error.eps}} 
+                    \\subfigure[]{\\psfig[width=0.48\\columnwidth]{figures/time_constraint.ps}}
+                    \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/iterate_correct.png}}
+                    \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{figures/time_error}}
+                    \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{figures/time_error_1}}
+                    \\label{fig:iteration_information}
+                    ''')
+
+        # 模拟图片文件
+        self.image_files = [
+            'time_vs_dimension.pdf', 'iterate_constraint.jpg', 'iterate_error.eps', 'time_constraint.ps', 'iterate_correct.png', 'time_error.pdf', 'time_error_1.jpeg'
+        ]
+        for file_name in self.image_files:
+            with open(os.path.join(self.test_dir, file_name), 'w') as f:
+                f.write('dummy content')
+
+    def tearDown(self):
+        # 清理测试创建的文件和目录
+        for root, dirs, files in os.walk(self.test_dir, topdown=False):
+            for name in files:
+                os.remove(os.path.join(root, name))
+            for name in dirs:
+                os.rmdir(os.path.join(root, name))
+        os.rmdir(self.test_dir)
+
+    def test(self):
+        replace_figures_extension_with_png(self.original_tex)
+        with open(self.original_tex, 'r') as f:
+            content = f.read()
+        self.assertEqual(content, r'''
+                    \\begin{figure}[ht]
+                    \\centerline{\\includegraphics[width=\\columnwidth]{figures/time_vs_dimension.png}}
+                    \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/iterate_constraint.jpg}} 
+                    \\subfigure[]{\\includegraphics{figures/iterate_error.png}} 
+                    \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/time_constraint.png}}
+                    \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/iterate_correct.png}}
+                    \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{figures/time_error.png}}
+                    \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{figures/time_error_1.jpeg}}
+                    \\label{fig:iteration_information}
+                    ''')

From e8a9c5f19e2c0cec57c5694c54e89439474aaa2c Mon Sep 17 00:00:00 2001
From: CHEN YANG <1402375027@qq.com>
Date: Thu, 20 Jun 2024 11:20:05 +0800
Subject: [PATCH 08/39] test(test_folder.py): add test for generate_png_figure

---
 tests/test_folder.py | 50 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 tests/test_folder.py

diff --git a/tests/test_folder.py b/tests/test_folder.py
new file mode 100644
index 0000000..40386e0
--- /dev/null
+++ b/tests/test_folder.py
@@ -0,0 +1,50 @@
+import unittest
+import os
+from unittest.mock import patch, MagicMock
+from generate_figure import generate_png_figure
+
+class TestGeneratePngFigure(unittest.TestCase):
+    def setUp(self):
+        # 设置测试环境，模拟有各种类型文件的目录
+        self.test_dir = 'test_directory_1'
+        self.original_tex = os.path.join(self.test_dir, 'test.tex')
+        os.makedirs(self.test_dir, exist_ok=True)
+        self.image_files = [
+            'image1.eps', 'image2.ps', 'image3.jpg', 'image4.jpeg', 'image5.png', 'image6.pdf'
+        ]
+        for file_name in self.image_files:
+            with open(os.path.join(self.test_dir, file_name), 'w') as f:
+                f.write('dummy content')
+
+    def tearDown(self):
+        # 清理测试创建的文件和目录
+        for root, dirs, files in os.walk(self.test_dir, topdown=False):
+            for name in files:
+                os.remove(os.path.join(root, name))
+            for name in dirs:
+                os.rmdir(os.path.join(root, name))
+        os.rmdir(self.test_dir)
+
+    @patch('vrdu.utils.convert_eps_image_to_pdf_image')
+    @patch('vrdu.utils.convert_pdf_figure_to_png_image')
+    def test_png_generation(self, mock_convert_pdf_to_png, mock_convert_eps_to_pdf):
+        generate_png_figure(self.original_tex)
+
+        # 检查文件生成情况
+        expected_files = [
+            'image1.eps', 'image2.ps', 'image3.jpg', 'image4.jpeg', 'image5.png', 'image6.pdf', 
+            'image1.png', 'image2.png', 'image6.png'
+        ]
+        # 获取当前目录下所有文件
+        generated_files = os.listdir(self.test_dir)
+    
+
+        # 目前模拟的测试环境中，无法真的生成文件,导致expected_files和generated_files不一致
+
+        # print("Expected Files:", expected_files)
+        # print("Generated Files:", generated_files)
+        # self.assertCountEqual(expected_files, generated_files)
+
+        # 检查函数调用
+        self.assertEqual(mock_convert_eps_to_pdf.call_count, 2)  # 对于两个EPS/PS文件的调用
+        self.assertEqual(mock_convert_pdf_to_png.call_count, 3)  # 对于三个PDF文件的调用
\ No newline at end of file

From 2af62ad37f932682d8d77812c39705a2e198579a Mon Sep 17 00:00:00 2001
From: MaoSong2022 <MaoSong2022@pjlab.org.cn>
Date: Thu, 20 Jun 2024 18:08:08 +0800
Subject: [PATCH 09/39] test(test_folder.py): test without creating actual
 files

---
 tests/test_folder.py | 57 +++++++++++++-------------------------------
 1 file changed, 17 insertions(+), 40 deletions(-)

diff --git a/tests/test_folder.py b/tests/test_folder.py
index 40386e0..835d6a3 100644
--- a/tests/test_folder.py
+++ b/tests/test_folder.py
@@ -1,50 +1,27 @@
 import unittest
 import os
 from unittest.mock import patch, MagicMock
-from generate_figure import generate_png_figure
 
-class TestGeneratePngFigure(unittest.TestCase):
-    def setUp(self):
-        # 设置测试环境，模拟有各种类型文件的目录
-        self.test_dir = 'test_directory_1'
-        self.original_tex = os.path.join(self.test_dir, 'test.tex')
-        os.makedirs(self.test_dir, exist_ok=True)
-        self.image_files = [
-            'image1.eps', 'image2.ps', 'image3.jpg', 'image4.jpeg', 'image5.png', 'image6.pdf'
-        ]
-        for file_name in self.image_files:
-            with open(os.path.join(self.test_dir, file_name), 'w') as f:
-                f.write('dummy content')
 
-    def tearDown(self):
-        # 清理测试创建的文件和目录
-        for root, dirs, files in os.walk(self.test_dir, topdown=False):
-            for name in files:
-                os.remove(os.path.join(root, name))
-            for name in dirs:
-                os.rmdir(os.path.join(root, name))
-        os.rmdir(self.test_dir)
+from DocParser.vrdu.preprocess import generate_png_figure
 
-    @patch('vrdu.utils.convert_eps_image_to_pdf_image')
-    @patch('vrdu.utils.convert_pdf_figure_to_png_image')
-    def test_png_generation(self, mock_convert_pdf_to_png, mock_convert_eps_to_pdf):
-        generate_png_figure(self.original_tex)
 
-        # 检查文件生成情况
-        expected_files = [
-            'image1.eps', 'image2.ps', 'image3.jpg', 'image4.jpeg', 'image5.png', 'image6.pdf', 
-            'image1.png', 'image2.png', 'image6.png'
+class TestGeneratePngFigure(unittest.TestCase):
+    @patch("os.path.dirname", return_value="/mocked/dir/")
+    @patch("os.walk")
+    @patch("DocParser.vrdu.utils.convert_pdf_figure_to_png_image")
+    def test_single_pdf_generation(self, mock_save, mock_walk, mock_dirname):
+        mocked_file = "/mocked/dir/original.tex"
+        mock_walk.return_value = [
+            ("/mocked/dir/", ["dir1", "dir2"], ["file1.txt", "file2.csv"]),
+            ("/mocked/dir/dir1", [], ["file3.json"]),
+            ("/mocked/dir/dir2", [], ["file4.pdf"]),
         ]
-        # 获取当前目录下所有文件
-        generated_files = os.listdir(self.test_dir)
-    
-
-        # 目前模拟的测试环境中，无法真的生成文件,导致expected_files和generated_files不一致
+        generate_png_figure(mocked_file)
+        # mock_dirname.assert_called_once_with(mocked_file)
 
-        # print("Expected Files:", expected_files)
-        # print("Generated Files:", generated_files)
-        # self.assertCountEqual(expected_files, generated_files)
+        mock_walk.assert_called_once_with("/mocked/dir/")
 
-        # 检查函数调用
-        self.assertEqual(mock_convert_eps_to_pdf.call_count, 2)  # 对于两个EPS/PS文件的调用
-        self.assertEqual(mock_convert_pdf_to_png.call_count, 3)  # 对于三个PDF文件的调用
\ No newline at end of file
+        mock_save.assert_called_once_with(
+            "/mocked/dir/dir2/file4.pdf", "/mocked/dir/dir2/file4.png"
+        )

From 001847a49518bd129f5642003bba905c1708344f Mon Sep 17 00:00:00 2001
From: MaoSong2022 <MaoSong2022@pjlab.org.cn>
Date: Fri, 21 Jun 2024 10:36:52 +0800
Subject: [PATCH 10/39] refactor(preprocess.py): pass image_files as an
 argument to avoid duplicated code

---
 DocParser/vrdu/preprocess.py | 21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

diff --git a/DocParser/vrdu/preprocess.py b/DocParser/vrdu/preprocess.py
index d776b4d..2bb179a 100644
--- a/DocParser/vrdu/preprocess.py
+++ b/DocParser/vrdu/preprocess.py
@@ -115,26 +115,7 @@ def custom_replace(match):
         f.write(content)
 
 
-
-def generate_png_figure(original_tex: str) -> None:
-    """
-    Generate PNG figures for PDF, ps, eps figures.
-
-    Args:
-        original_tex (str): The path to the original TeX file.
-
-    Returns:
-        None: This function does not return anything.
-    """
-    main_directory = os.path.dirname(original_tex)
-    image_extensions = [".eps", ".ps", ".jpg", ".jpeg", ".png", ".pdf"]
-    image_files = {}
-    for root, _, files in os.walk(main_directory):
-        for file in files:
-            if any(file.endswith(ext) for ext in image_extensions):
-                image_name, ext = os.path.splitext(file)
-                image_files[image_name] = os.path.join(root, file)
-
+def replace_figures_in_folders(image_files: Dict[str, str]) -> None:
     for image_name, file_path in image_files.items():
         if file_path.endswith(".eps") or file_path.endswith(".ps"):
             output_png = os.path.join(os.path.dirname(file_path), image_name + ".png")

From 3ba9de545fb10cf831497e17763f3ab13cbc4ee3 Mon Sep 17 00:00:00 2001
From: MaoSong2022 <MaoSong2022@pjlab.org.cn>
Date: Fri, 21 Jun 2024 10:39:06 +0800
Subject: [PATCH 11/39] refactor(preprocess.py): enclose replacing figures in
 tex file as a function

---
 DocParser/vrdu/preprocess.py | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/DocParser/vrdu/preprocess.py b/DocParser/vrdu/preprocess.py
index 2bb179a..cee78ea 100644
--- a/DocParser/vrdu/preprocess.py
+++ b/DocParser/vrdu/preprocess.py
@@ -80,14 +80,21 @@ def replace_figures_extension_with_png(original_tex: str) -> None:
             if any(file.endswith(ext) for ext in image_extensions):
                 image_name, ext = os.path.splitext(file)
                 # Store the relative path of the image as the value
-                image_files[image_name] = os.path.relpath(os.path.join(root, file), main_directory)
+                image_files[image_name] = os.path.relpath(
+                    os.path.join(root, file), main_directory
+                )
 
-    with open(original_tex, 'r') as f:
+    replace_figures_in_tex_files(original_tex, image_files)
+
+def replace_figures_in_tex_files(
+    original_tex: str, image_files: Dict[str, str]
+) -> None:
+    with open(original_tex, "r") as f:
         content = f.read()
 
     # Replace \psfig and \epsfig commands with \includegraphics command
     def custom_replace(match):
-        options = match.group(1) or ''
+        options = match.group(1) or ""
         filepath = match.group(2)
         if options:
             return f"\\includegraphics[{options}]{{{filepath}}}"
@@ -102,16 +109,20 @@ def custom_replace(match):
         base_name, current_extension = os.path.splitext(image_name)
         correct_extension = os.path.splitext(file_path)[1]
 
-        if correct_extension not in ['.jpg', '.jpeg']:
-            correct_extension = '.png'
+        if correct_extension not in [".jpg", ".jpeg"]:
+            correct_extension = ".png"
 
         # Build a regular expression to match image files including optional extensions
-        pattern = re.compile(r'(\\includegraphics(?:\[[^\]]*\])?\{.*?' + re.escape(base_name) + r')(\.\w+)?\}')
-        replacement = rf'\1{correct_extension}}}'
+        pattern = re.compile(
+            r"(\\includegraphics(?:\[[^\]]*\])?\{.*?"
+            + re.escape(base_name)
+            + r")(\.\w+)?\}"
+        )
+        replacement = rf"\1{correct_extension}}}"
         content = pattern.sub(replacement, content)
 
     # Write the updated content back to the file
-    with open(original_tex, 'w') as f:
+    with open(original_tex, "w") as f:
         f.write(content)
 
 

From b69c55aed083770c489fc8abe49cb27bf9e08ca3 Mon Sep 17 00:00:00 2001
From: MaoSong2022 <MaoSong2022@pjlab.org.cn>
Date: Fri, 21 Jun 2024 10:40:15 +0800
Subject: [PATCH 12/39] feat(preprocess.py): remove intermediate generated pdf

---
 DocParser/vrdu/preprocess.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/DocParser/vrdu/preprocess.py b/DocParser/vrdu/preprocess.py
index cee78ea..6483b65 100644
--- a/DocParser/vrdu/preprocess.py
+++ b/DocParser/vrdu/preprocess.py
@@ -1,5 +1,6 @@
 import os
 import re
+from typing import Dict
 
 from arxiv_cleaner.cleaner import Cleaner
 
@@ -85,6 +86,8 @@ def replace_figures_extension_with_png(original_tex: str) -> None:
                 )
 
     replace_figures_in_tex_files(original_tex, image_files)
+    replace_figures_in_folders(image_files)
+
 
 def replace_figures_in_tex_files(
     original_tex: str, image_files: Dict[str, str]
@@ -135,13 +138,14 @@ def replace_figures_in_folders(image_files: Dict[str, str]) -> None:
             utils.convert_eps_image_to_pdf_image(file_path, temp_pdf)
             # convert pdf to png
             utils.convert_pdf_figure_to_png_image(temp_pdf, output_png)
+            # remove redundant files
+            os.remove(temp_pdf)
         elif file_path.endswith(".pdf"):
             output_png = os.path.join(os.path.dirname(file_path), image_name + ".png")
             # convert pdf to png
             utils.convert_pdf_figure_to_png_image(file_path, output_png)
 
 
-
 def delete_table_of_contents(original_tex: str) -> None:
     """
     Deletes the table of contents from the given original_tex file.

From 3c41811b64178a510b38811ac46882d9f39d8d09 Mon Sep 17 00:00:00 2001
From: MaoSong2022 <MaoSong2022@pjlab.org.cn>
Date: Fri, 21 Jun 2024 10:42:19 +0800
Subject: [PATCH 13/39] refactor(preprocess.py): use meaningful function name

---
 DocParser/vrdu/preprocess.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/DocParser/vrdu/preprocess.py b/DocParser/vrdu/preprocess.py
index 6483b65..c63a1c2 100644
--- a/DocParser/vrdu/preprocess.py
+++ b/DocParser/vrdu/preprocess.py
@@ -62,7 +62,7 @@ def clean_tex(original_tex: str) -> None:
     remove_comments(original_tex)
 
 
-def replace_figures_extension_with_png(original_tex: str) -> None:
+def replace_non_png_jpg_figures(original_tex: str) -> None:
     """
     Replaces PDF, ps, eps figures' extension with PNG in a TeX file
     to support pdfminer detecting bounding box.
@@ -186,7 +186,7 @@ def run(original_tex: str) -> None:
     clean_tex(original_tex)
 
     # Step 1: process images
-    replace_figures_extension_with_png(original_tex)
+    replace_non_png_jpg_figures(original_tex)
 
     # Step 2: generate png figures
     generate_png_figure(original_tex)

From c3e0ba6b295213b441b2f6e420bd300ad14a3bd2 Mon Sep 17 00:00:00 2001
From: MaoSong2022 <MaoSong2022@pjlab.org.cn>
Date: Fri, 21 Jun 2024 10:42:44 +0800
Subject: [PATCH 14/39] refactor(preprocess.py): delete unused function call

---
 DocParser/vrdu/preprocess.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/DocParser/vrdu/preprocess.py b/DocParser/vrdu/preprocess.py
index c63a1c2..4d02e9c 100644
--- a/DocParser/vrdu/preprocess.py
+++ b/DocParser/vrdu/preprocess.py
@@ -188,8 +188,5 @@ def run(original_tex: str) -> None:
     # Step 1: process images
     replace_non_png_jpg_figures(original_tex)
 
-    # Step 2: generate png figures
-    generate_png_figure(original_tex)
-
-    # Step 3: delete table of contents
+    # Step 2: delete table of contents
     delete_table_of_contents(original_tex)

From c565dfccf5aeb8e66a3811e34ba0343cc000c797 Mon Sep 17 00:00:00 2001
From: CHEN YANG <1402375027@qq.com>
Date: Fri, 21 Jun 2024 18:30:53 +0800
Subject: [PATCH 15/39] test(test_extension): modify the content to test the
 function of replace_figures_in_tex_files

---
 tests/test_extension.py | 97 +++++++++++++++++++----------------------
 1 file changed, 44 insertions(+), 53 deletions(-)

diff --git a/tests/test_extension.py b/tests/test_extension.py
index 243fc3c..2c00812 100644
--- a/tests/test_extension.py
+++ b/tests/test_extension.py
@@ -1,60 +1,51 @@
 import unittest
-import os
 import unittest.mock
-
-
-from replace_figure_extension import replace_figures_extension_with_png
-
-
+from DocParser.vrdu.preprocess import replace_figures_in_tex_files
 class TestAbstract(unittest.TestCase):
     def setUp(self) -> None:
-        
-        # 测试环境的设置，包括创建测试文件夹和文件
-        self.test_dir = 'test_directory'
-        self.original_tex = os.path.join(self.test_dir, 'test.tex')
-        os.makedirs(self.test_dir, exist_ok=True)
-        with open(self.original_tex, 'w') as f:
-            f.write(r'''
-                    \\begin{figure}[ht]
-                    \\centerline{\\includegraphics[width=\\columnwidth]{figures/time_vs_dimension.pdf}}
-                    \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/iterate_constraint.jpg}} 
-                    \\subfigure[]{\\epsfig{figures/iterate_error.eps}} 
-                    \\subfigure[]{\\psfig[width=0.48\\columnwidth]{figures/time_constraint.ps}}
-                    \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/iterate_correct.png}}
-                    \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{figures/time_error}}
-                    \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{figures/time_error_1}}
-                    \\label{fig:iteration_information}
-                    ''')
+        self.initial_content = """
+            \\begin{figure}[ht]
+            \\centerline{\\includegraphics[width=\\columnwidth]{dir1/time_vs_dimension.pdf}}
+            \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{dir2/iterate_constraint.jpg}} 
+            \\subfigure[]{\\epsfig{dir2/iterate_error.eps}} 
+            \\subfigure[]{\\psfig[width=0.48\\columnwidth]{time_constraint.es}}
+            \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{dir3/dir4/iterate_correct.png}}
+            \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{dir3/time_error}}
+            \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{dir3/time_error_1}}
+            \\label{fig:iteration_information}
+        """
 
-        # 模拟图片文件
-        self.image_files = [
-            'time_vs_dimension.pdf', 'iterate_constraint.jpg', 'iterate_error.eps', 'time_constraint.ps', 'iterate_correct.png', 'time_error.pdf', 'time_error_1.jpeg'
-        ]
-        for file_name in self.image_files:
-            with open(os.path.join(self.test_dir, file_name), 'w') as f:
-                f.write('dummy content')
+        # Simulate image files with correct extensions
+        self.image_files = {
+            'time_vs_dimension': 'dir1/time_vs_dimension.pdf',
+            'iterate_constraint': 'dir2/iterate_constraint.jpg',
+            'iterate_error': 'dir2/iterate_error.eps',
+            'time_constraint': 'time_constraint.es',
+            'iterate_correct': 'dir3/dir4/iterate_correct.png',
+            'time_error': 'dir3/time_error.pdf',
+            'time_error_1': 'dir3/time_error_1.jpeg'
+        }
 
-    def tearDown(self):
-        # 清理测试创建的文件和目录
-        for root, dirs, files in os.walk(self.test_dir, topdown=False):
-            for name in files:
-                os.remove(os.path.join(root, name))
-            for name in dirs:
-                os.rmdir(os.path.join(root, name))
-        os.rmdir(self.test_dir)
+    def test_replace_figures(self):
+        expected_content = """
+            \\begin{figure}[ht]
+            \\centerline{\\includegraphics[width=\\columnwidth]{dir1/time_vs_dimension.png}}
+            \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{dir2/iterate_constraint.jpg}} 
+            \\subfigure[]{\\includegraphics{dir2/iterate_error.png}} 
+            \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{time_constraint.png}}
+            \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{dir3/dir4/iterate_correct.png}}
+            \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{dir3/time_error.png}}
+            \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{dir3/time_error_1.jpeg}}
+            \\label{fig:iteration_information}
+        """
 
-    def test(self):
-        replace_figures_extension_with_png(self.original_tex)
-        with open(self.original_tex, 'r') as f:
-            content = f.read()
-        self.assertEqual(content, r'''
-                    \\begin{figure}[ht]
-                    \\centerline{\\includegraphics[width=\\columnwidth]{figures/time_vs_dimension.png}}
-                    \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/iterate_constraint.jpg}} 
-                    \\subfigure[]{\\includegraphics{figures/iterate_error.png}} 
-                    \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/time_constraint.png}}
-                    \\subfigure[]{\\includegraphics[width=0.48\\columnwidth]{figures/iterate_correct.png}}
-                    \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{figures/time_error.png}}
-                    \\subfigure[]{\\includegraphics[width=0.5\\columnwidth]{figures/time_error_1.jpeg}}
-                    \\label{fig:iteration_information}
-                    ''')
+        with unittest.mock.patch(
+            "builtins.open",
+            new=unittest.mock.mock_open(read_data=self.initial_content),
+            create=True,
+        ) as file_mock:
+            replace_figures_in_tex_files(file_mock,self.image_files)
+            file_mock.assert_called_with(file_mock, "w")
+            file_mock().write.assert_called_with(
+                expected_content
+            )
\ No newline at end of file

From 8d4bc5f582cb34575fe232f25c7e51f06e9a24df Mon Sep 17 00:00:00 2001
From: CHEN YANG <1402375027@qq.com>
Date: Fri, 21 Jun 2024 18:31:39 +0800
Subject: [PATCH 16/39] test(test_folder): modify the content to test the
 function of replace_figures_in_folders

---
 tests/test_folder.py | 41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/tests/test_folder.py b/tests/test_folder.py
index 835d6a3..34e3b0a 100644
--- a/tests/test_folder.py
+++ b/tests/test_folder.py
@@ -1,27 +1,30 @@
 import unittest
 import os
 from unittest.mock import patch, MagicMock
+from DocParser.vrdu.preprocess import replace_figures_in_folders
 
+class TestGeneratePngFigure(unittest.TestCase):
+    def setUp(self):
+        # Simulate image files
+        self.image_files = {
+            "file1": "dir1/file1.eps",
+            "file2": "dir/dir2/file2.png",
+            "file3": "dir1/file3.jpg",
+            "file4": "file4.jpeg",
+            "file5": "dir/dir2/dir5/file5.ps",
+            "file6": "dir/dir2/dir5/file6.pdf"
+        }
 
-from DocParser.vrdu.preprocess import generate_png_figure
-
+    @patch('vrdu.utils.convert_eps_image_to_pdf_image')
+    @patch('vrdu.utils.convert_pdf_figure_to_png_image')
+    @patch('os.remove')
+    def test_png_generation(self, mock_os_remove, mock_convert_pdf_to_png, mock_convert_eps_to_pdf):
 
-class TestGeneratePngFigure(unittest.TestCase):
-    @patch("os.path.dirname", return_value="/mocked/dir/")
-    @patch("os.walk")
-    @patch("DocParser.vrdu.utils.convert_pdf_figure_to_png_image")
-    def test_single_pdf_generation(self, mock_save, mock_walk, mock_dirname):
-        mocked_file = "/mocked/dir/original.tex"
-        mock_walk.return_value = [
-            ("/mocked/dir/", ["dir1", "dir2"], ["file1.txt", "file2.csv"]),
-            ("/mocked/dir/dir1", [], ["file3.json"]),
-            ("/mocked/dir/dir2", [], ["file4.pdf"]),
-        ]
-        generate_png_figure(mocked_file)
-        # mock_dirname.assert_called_once_with(mocked_file)
+        # Mock os.remove to do nothing
+        mock_os_remove.side_effect = lambda x: None
 
-        mock_walk.assert_called_once_with("/mocked/dir/")
+        replace_figures_in_folders(self.image_files)
 
-        mock_save.assert_called_once_with(
-            "/mocked/dir/dir2/file4.pdf", "/mocked/dir/dir2/file4.png"
-        )
+        # Test the number of times the file conversion function is called
+        self.assertEqual(mock_convert_eps_to_pdf.call_count, 2)
+        self.assertEqual(mock_convert_pdf_to_png.call_count, 3)

From b57be7892746a42968ed181934a346d1e05ad289 Mon Sep 17 00:00:00 2001
From: MaoSong2022 <MaoSong2022@pjlab.org.cn>
Date: Mon, 24 Jun 2024 10:39:06 +0800
Subject: [PATCH 17/39] test(test_folder.py): assert temp pdf is deleted

---
 tests/test_folder.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_folder.py b/tests/test_folder.py
index 34e3b0a..381ba77 100644
--- a/tests/test_folder.py
+++ b/tests/test_folder.py
@@ -27,4 +27,5 @@ def test_png_generation(self, mock_os_remove, mock_convert_pdf_to_png, mock_conv
 
         # Test the number of times the file conversion function is called
         self.assertEqual(mock_convert_eps_to_pdf.call_count, 2)
+        self.assertEqual(mock_os_remove.call_count, 2)
         self.assertEqual(mock_convert_pdf_to_png.call_count, 3)

From 4625088161a52ffabfd745a511222ce79e81d3f3 Mon Sep 17 00:00:00 2001
From: MaoSong2022 <MaoSong2022@pjlab.org.cn>
Date: Mon, 24 Jun 2024 10:39:46 +0800
Subject: [PATCH 18/39] style(test_folder.py, test_extension.py): format file

---
 tests/test_extension.py | 24 ++++++++++++------------
 tests/test_folder.py    | 16 +++++++++-------
 2 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/tests/test_extension.py b/tests/test_extension.py
index 2c00812..f39fca6 100644
--- a/tests/test_extension.py
+++ b/tests/test_extension.py
@@ -1,8 +1,10 @@
 import unittest
 import unittest.mock
 from DocParser.vrdu.preprocess import replace_figures_in_tex_files
+
+
 class TestAbstract(unittest.TestCase):
-    def setUp(self) -> None:
+    def setUp(self):
         self.initial_content = """
             \\begin{figure}[ht]
             \\centerline{\\includegraphics[width=\\columnwidth]{dir1/time_vs_dimension.pdf}}
@@ -17,13 +19,13 @@ def setUp(self) -> None:
 
         # Simulate image files with correct extensions
         self.image_files = {
-            'time_vs_dimension': 'dir1/time_vs_dimension.pdf',
-            'iterate_constraint': 'dir2/iterate_constraint.jpg',
-            'iterate_error': 'dir2/iterate_error.eps',
-            'time_constraint': 'time_constraint.es',
-            'iterate_correct': 'dir3/dir4/iterate_correct.png',
-            'time_error': 'dir3/time_error.pdf',
-            'time_error_1': 'dir3/time_error_1.jpeg'
+            "time_vs_dimension": "dir1/time_vs_dimension.pdf",
+            "iterate_constraint": "dir2/iterate_constraint.jpg",
+            "iterate_error": "dir2/iterate_error.eps",
+            "time_constraint": "time_constraint.es",
+            "iterate_correct": "dir3/dir4/iterate_correct.png",
+            "time_error": "dir3/time_error.pdf",
+            "time_error_1": "dir3/time_error_1.jpeg",
         }
 
     def test_replace_figures(self):
@@ -44,8 +46,6 @@ def test_replace_figures(self):
             new=unittest.mock.mock_open(read_data=self.initial_content),
             create=True,
         ) as file_mock:
-            replace_figures_in_tex_files(file_mock,self.image_files)
+            replace_figures_in_tex_files(file_mock, self.image_files)
             file_mock.assert_called_with(file_mock, "w")
-            file_mock().write.assert_called_with(
-                expected_content
-            )
\ No newline at end of file
+            file_mock().write.assert_called_with(expected_content)
diff --git a/tests/test_folder.py b/tests/test_folder.py
index 381ba77..fcfec6a 100644
--- a/tests/test_folder.py
+++ b/tests/test_folder.py
@@ -1,8 +1,8 @@
 import unittest
-import os
-from unittest.mock import patch, MagicMock
+from unittest.mock import patch
 from DocParser.vrdu.preprocess import replace_figures_in_folders
 
+
 class TestGeneratePngFigure(unittest.TestCase):
     def setUp(self):
         # Simulate image files
@@ -12,13 +12,15 @@ def setUp(self):
             "file3": "dir1/file3.jpg",
             "file4": "file4.jpeg",
             "file5": "dir/dir2/dir5/file5.ps",
-            "file6": "dir/dir2/dir5/file6.pdf"
+            "file6": "dir/dir2/dir5/file6.pdf",
         }
 
-    @patch('vrdu.utils.convert_eps_image_to_pdf_image')
-    @patch('vrdu.utils.convert_pdf_figure_to_png_image')
-    @patch('os.remove')
-    def test_png_generation(self, mock_os_remove, mock_convert_pdf_to_png, mock_convert_eps_to_pdf):
+    @patch("vrdu.utils.convert_eps_image_to_pdf_image")
+    @patch("vrdu.utils.convert_pdf_figure_to_png_image")
+    @patch("os.remove")
+    def test_png_generation(
+        self, mock_os_remove, mock_convert_pdf_to_png, mock_convert_eps_to_pdf
+    ):
 
         # Mock os.remove to do nothing
         mock_os_remove.side_effect = lambda x: None

From aafd06d244d3f05ca41c734a3802cf00cd45c606 Mon Sep 17 00:00:00 2001
From: MaoSong2022 <MaoSong2022@pjlab.org.cn>
Date: Mon, 24 Jun 2024 10:40:48 +0800
Subject: [PATCH 19/39] refactor(tests/): use meaningful file name

---
 tests/{test_folder.py => test_replace_figures_in_folders.py}     | 0
 tests/{test_extension.py => test_replace_figures_in_tex_file.py} | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename tests/{test_folder.py => test_replace_figures_in_folders.py} (100%)
 rename tests/{test_extension.py => test_replace_figures_in_tex_file.py} (100%)

diff --git a/tests/test_folder.py b/tests/test_replace_figures_in_folders.py
similarity index 100%
rename from tests/test_folder.py
rename to tests/test_replace_figures_in_folders.py
diff --git a/tests/test_extension.py b/tests/test_replace_figures_in_tex_file.py
similarity index 100%
rename from tests/test_extension.py
rename to tests/test_replace_figures_in_tex_file.py

From 762aa2e9f95b16476426f2a81710f4a9e7bd67ff Mon Sep 17 00:00:00 2001
From: CHEN YANG <1402375027@qq.com>
Date: Thu, 4 Jul 2024 10:51:31 +0800
Subject: [PATCH 20/39] feat(renderer.py): add function to handle reference
 rendering

---
 DocParser/vrdu/renderer.py | 157 +++++++++++++++++++++++++++++++++++++
 1 file changed, 157 insertions(+)

diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py
index 5737bc7..a9ac1b6 100644
--- a/DocParser/vrdu/renderer.py
+++ b/DocParser/vrdu/renderer.py
@@ -12,6 +12,10 @@
 from DocParser.TexSoup.TexSoup import TexSoup
 import DocParser.TexSoup.app.conversion as conversion
 
+import bibtexparser
+from bibtexparser.bparser import BibTexParser
+from bibtexparser.customization import convert_to_unicode
+
 log = logger.get_logger(__name__)
 
 
@@ -74,6 +78,7 @@ def render_all_env(self, color_tex: str) -> None:
         """
         self.render_simple_envs(color_tex)
         self.render_float_envs(color_tex)
+        self.render_reference(color_tex)
 
     def render_simple_envs(self, color_tex: str) -> None:
         """Renders simple environments in a LaTeX file.
@@ -445,8 +450,14 @@ def render_one_env(self, main_directory: str) -> None:
         Returns:
             None: This function does not return anything.
         """
+
+        # handle latex file
         color_tex_file = os.path.join(main_directory, "paper_colored.tex")
         white_tex_file = os.path.join(main_directory, "paper_white.tex")
+        
+        paper_bib_white = os.path.join(main_directory, "paper_bib_white.tex")
+        shutil.copyfile(color_tex_file, paper_bib_white)
+
         self.modify_color_definitions(color_tex_file, white_tex_file)
         ordered_env_colors = self.get_env_orders(white_tex_file)
         suffix = "_color"
@@ -470,6 +481,50 @@ def render_one_env(self, main_directory: str) -> None:
             with open(output_file, "w") as f:
                 f.write(new_content)
 
+        # handle bib file
+        color_bib_file = os.path.join(main_directory, "bib_colored.bib")
+        white_bib_file = os.path.join(main_directory, "bib_white.bib")
+        self.modify_color_definitions(color_bib_file, white_bib_file)
+        ordered_env_colors = self.get_env_orders(white_bib_file)
+        index_map = defaultdict(int)
+
+        with open(white_bib_file, "r") as f:
+            bib_content = f.read()
+        
+        for index, env_color in enumerate(ordered_env_colors):
+            env = env_color[: -len(suffix)]
+            # the first one is the color definition, skip it
+            bib_new_content = replace_nth(
+                bib_content, "{" + env_color + "}", r"{black}", index_map[env] + 2
+            )
+
+            bib_output_file = os.path.join(
+                main_directory,
+                f"bib_{config.folder_prefix}_{str(index).zfill(5)}_{env}_{str(index_map[env]).zfill(5)}.bib",
+            )
+
+            # change the bib file name in paper_bib_white.tex
+            # \bibliographystyle{bib file name}
+            with open(paper_bib_white, "r") as f:
+                tex_content = f.read()
+
+            bib_file_name = os.path.basename(bib_output_file).split(".")[0]
+            # tex_new_content = re.sub(r"\\bibliography\s*{\s*([^}]+)\s*}", f"\\bibliography{{{bib_file_name}}}", tex_content)
+            tex_new_content = re.sub(r"\\bibliography\s*{\s*([^}]+)\s*}", "\\\\bibliography{{{}}}".format(bib_file_name), tex_content)
+
+            tex_output_file = os.path.join(
+                main_directory,
+                f"paper_{config.folder_prefix}_{str(index).zfill(5)}_{env}_{str(index_map[env]).zfill(5)}.tex",
+            )
+            
+            index_map[env] += 1
+            with open(bib_output_file, "w") as f:
+                f.write(bib_new_content)
+
+            with open(tex_output_file, "w", encoding='utf-8') as f:
+                f.write(tex_new_content)
+        
+
     def render_caption(self, tex_file: str) -> None:
         """Renders captions in a LaTeX file.
 
@@ -616,6 +671,108 @@ def render_abstract(self, tex_file: str) -> None:
         with open(tex_file, "w") as f:
             f.write(result)
 
+    def render_reference(self, tex_file: str) -> None:
+        """
+        Renders the reference section based on a BibTeX (.bib) file.
+
+        Args:
+            tex_file (str): The path to the LaTex file.
+
+        Returns:
+            None
+        """
+        bib_pattern = r'\\bibliography\s*{\s*([^}]+)\s*}'
+        # Extract directory and filename from LaTeX file path
+        tex_dir, tex_filename = os.path.split(tex_file)
+
+        # Extract BibTeX file path from LaTeX file
+        bib_file = None
+        with open(tex_file, 'r', encoding='utf-8') as tex_f:
+            tex_content = tex_f.read()
+
+            # Search for bibliography command
+            match = re.search(bib_pattern, tex_content)
+            if match:
+                bib_filename = match.group(1) + '.bib'
+                bib_file = os.path.join(tex_dir, bib_filename)
+
+        if not bib_file:
+            print("BibTeX file not found in the LaTeX file.")
+            return
+        main_directory = os.path.dirname(tex_file)
+
+        # copy the original tex file
+        color_bib = os.path.join(main_directory, "bib_colored.bib")
+        white_bib = os.path.join(main_directory, "bib_white.bib")
+        shutil.copyfile(bib_file, color_bib)
+        shutil.copyfile(bib_file, white_bib)
+        
+        # Define colorize function inline
+        def colorize(text: str, category_name: str) -> str:
+            if category_name == "Reference":
+                # Define regex patterns
+                author_pattern = re.compile(r"\bauthor\s*=\s*[\{\"]")
+                note_pattern = re.compile(r"\bnote\s*=\s*[\{\"]")
+
+                # Find the position of the author and year
+                author_match = author_pattern.search(text)
+                if author_match:
+                    # Find the start of the author field
+                    author_start = author_match.end() - 1
+                    author_end = text.find("}", author_start)
+                    if author_end == -1:
+                        author_end = text.find("\"", author_start)
+                        if author_end == -1:
+                            author_end = text.find("\"", author_start) + 1
+                    # Replace author field with colorized version
+                    if author_end != -1:
+                        text = text[:author_start + 1] + "\\color{Reference_color}" + text[author_start + 1:]
+
+                note_match = note_pattern.search(text)
+                if note_match:
+                    # Find the start of the year field
+                    year_start = note_match.end() - 1
+                    year_end_1 = text.find("\"", year_start + 1)
+                    year_end_2 = text.find("}", year_start + 1)
+                    # find the before year_end
+                    if year_end_1 != -1 and year_end_2 != -1:
+                        year_end = min(year_end_1, year_end_2)
+                    else:
+                        year_end = max(year_end_1, year_end_2)
+                    # Replace year field with black color
+                    if year_end != -1:
+                        text = text[:year_end] + "\\color{black}" + text[year_end:]
+                
+                else:
+                    # Check if text ends with "}"
+                    if text.endswith("}"):
+                        # Check if the character before the last "}" is ","
+                        if text[-2] == ",":
+                            text = text[:-2] + ",note={\\color{black}}}"
+                        else:
+                            text = text[:-1] + ",note={\\color{black}}}"
+
+            return text
+
+        # Read BibTeX file
+        with open(color_bib, 'r', encoding='utf-8') as bib_f:
+            bibtex_entries = bib_f.readlines()
+
+        # Colorize and format references in LaTeX format
+        colored_references = []
+        for entry in bibtex_entries:
+            if entry.strip().startswith('@'):
+                formatted_entry = f"{entry.strip()}"
+            else:
+                formatted_entry = f"  {entry.strip()}"
+            self.texts["Reference"].append(formatted_entry)
+            colored_ref = colorize(formatted_entry, "Reference")
+            colored_references.append(colored_ref)
+        # Write back to the BibTeX file
+        with open(color_bib, 'w', encoding='utf-8') as bib_f:
+            for ref in colored_references:
+                bib_f.write(ref + "\n")
+
     def render_tabular(self, tex_file: str) -> None:
         """Renders tabular environments in a LaTeX file.
 

From e0dbb67094d47311c4e7e6ba520cc4051fd7ae57 Mon Sep 17 00:00:00 2001
From: CHEN YANG <1402375027@qq.com>
Date: Thu, 4 Jul 2024 10:54:54 +0800
Subject: [PATCH 21/39] refactor(utils.py): run bibtex during LaTeX
 compilation

---
 DocParser/vrdu/utils.py | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/DocParser/vrdu/utils.py b/DocParser/vrdu/utils.py
index be6fa51..a5f300f 100755
--- a/DocParser/vrdu/utils.py
+++ b/DocParser/vrdu/utils.py
@@ -40,7 +40,7 @@ def load_json(file_path: str) -> Union[Dict, List]:
 
 def compile_latex(file: str) -> None:
     """
-    Compile a LaTeX file using  pdflatex engine.
+    Compile a LaTeX file using pdflatex and bibtex engines.
 
     Parameters:
         file (str): The path to the LaTeX file to be compiled.
@@ -49,19 +49,38 @@ def compile_latex(file: str) -> None:
         None
     """
     file_name = os.path.basename(file)
+    base_name, _ = os.path.splitext(file_name)
 
+    # First compilation with SyncTeX
     subprocess.run(
         ["pdflatex", "-interaction=nonstopmode", file_name],
         timeout=1000,
         stdout=subprocess.DEVNULL,
     )
 
+    # Compile BibTeX if .aux file exists
+    if os.path.exists(base_name + ".aux"):
+        subprocess.run(
+            ["bibtex", base_name],
+            timeout=1000,
+            stdout=subprocess.DEVNULL,
+        )
+
+    # Second compilation to include bibliography
     subprocess.run(
         ["pdflatex", "-interaction=nonstopmode", file_name],
         timeout=1000,
         stdout=subprocess.DEVNULL,
     )
 
+    # Third compilation to finalize references and SyncTeX
+    subprocess.run(
+        ["pdflatex", "-interaction=nonstopmode",  file_name],
+        timeout=1000,
+        stdout=subprocess.DEVNULL,
+    )
+
+    # Additional compilation for specific file
     if file_name == "paper_colored.tex":
         subprocess.run(
             ["pdflatex", "-interaction=nonstopmode", "-synctex=1", file_name],
@@ -69,7 +88,6 @@ def compile_latex(file: str) -> None:
             stdout=subprocess.DEVNULL,
         )
 
-
 def pdf2jpg(pdf_path: str, output_directory: str) -> None:
     """
     Convert a PDF file into a series of jpg images.
@@ -251,4 +269,5 @@ def colorize(text: str, category_name: str) -> str:
     if category_name == "Code":
         return "{\\color{" + color + "}" + text + "}"
 
+
     raise NotImplementedError(f"Invalid category name: {category_name}")

From 9b1bdb1362c1108938d9c94e6f08c0a51d652d86 Mon Sep 17 00:00:00 2001
From: CHEN YANG <1402375027@qq.com>
Date: Thu, 4 Jul 2024 10:58:04 +0800
Subject: [PATCH 22/39] fix(main.py): do not fail when output folder exists

---
 DocParser/main.py | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/DocParser/main.py b/DocParser/main.py
index 2abedd8..299f13a 100644
--- a/DocParser/main.py
+++ b/DocParser/main.py
@@ -58,6 +58,10 @@ def remove_redundant_stuff(main_directory: str) -> None:
     for file in redundant_files:
         os.remove(file)
 
+    redundant_bib_files = glob.glob(f"{main_directory}/bib_*")
+    for file in redundant_bib_files:
+        os.remove(file)
+
     # remove useless pdf and image files
     # TODO: move this name pattern into config
     redundant_folders = glob.glob(
@@ -110,10 +114,23 @@ def process_one_file(file_name: str) -> None:
     cwd = os.getcwd()
 
     try:
-        # change the working directory to the main directory of the paper
+        # # change the working directory to the main directory of the paper
+        # os.chdir(main_directory)
+        # # create output folder
+        # os.makedirs(os.path.join(main_directory, "output/result"))
+
+        # Save current working directory
+        cwd = os.getcwd()
+
+        # Change the working directory to the main directory of the paper
         os.chdir(main_directory)
-        # create output folder
-        os.makedirs(os.path.join(main_directory, "output/result"))
+
+        # Create output folder if it doesn't exist
+        output_folder = os.path.join(main_directory, "output/result")
+        if not os.path.exists(output_folder):
+            os.makedirs(output_folder)
+        else:
+            print(f"Output folder '{output_folder}' already exists.")
 
         # step 1: preprocess the paper
         preprocess.run(original_tex)

From 497da712c3cd0dd97e0ec99f5eee4192fb2c5e51 Mon Sep 17 00:00:00 2001
From: CHEN YANG <1402375027@qq.com>
Date: Thu, 4 Jul 2024 10:59:19 +0800
Subject: [PATCH 23/39] refactor(config.py): add reference category

---
 DocParser/vrdu/config/config.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/DocParser/vrdu/config/config.py b/DocParser/vrdu/config/config.py
index 081f4ec..9028fba 100644
--- a/DocParser/vrdu/config/config.py
+++ b/DocParser/vrdu/config/config.py
@@ -79,6 +79,7 @@
     "Equation",
     "Footnote",
     "List",
+    "Reference"
 ]
 
 

From 201295638f33bc4ffe882c343c05dd4106a9892f Mon Sep 17 00:00:00 2001
From: CHEN YANG <1402375027@qq.com>
Date: Mon, 8 Jul 2024 15:26:02 +0800
Subject: [PATCH 24/39] refactor(renderer.py): modify the method of handling
 Reference

---
 DocParser/vrdu/renderer.py | 48 +++++++++++++++++---------------------
 1 file changed, 22 insertions(+), 26 deletions(-)

diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py
index a9ac1b6..be5b9cd 100644
--- a/DocParser/vrdu/renderer.py
+++ b/DocParser/vrdu/renderer.py
@@ -416,6 +416,7 @@ def modify_color_definitions(self, input_file: str, output_file: str) -> None:
                 r"\\definecolor{" + color_name + r"}{RGB}{255, 255, 255}",
                 content,
             )
+        content = re.sub(r"\\bibliography\s*{\s*([^}]+)\s*}", "\\\\bibliography{bib_white}", content)
 
         with open(output_file, "w") as file:
             file.write(content)
@@ -450,14 +451,9 @@ def render_one_env(self, main_directory: str) -> None:
         Returns:
             None: This function does not return anything.
         """
-
         # handle latex file
         color_tex_file = os.path.join(main_directory, "paper_colored.tex")
         white_tex_file = os.path.join(main_directory, "paper_white.tex")
-        
-        paper_bib_white = os.path.join(main_directory, "paper_bib_white.tex")
-        shutil.copyfile(color_tex_file, paper_bib_white)
-
         self.modify_color_definitions(color_tex_file, white_tex_file)
         ordered_env_colors = self.get_env_orders(white_tex_file)
         suffix = "_color"
@@ -482,22 +478,24 @@ def render_one_env(self, main_directory: str) -> None:
                 f.write(new_content)
 
         # handle bib file
+        paper_bib_white = os.path.join(main_directory, "paper_bib_white.tex")
+        shutil.copyfile(white_tex_file, paper_bib_white)
         color_bib_file = os.path.join(main_directory, "bib_colored.bib")
         white_bib_file = os.path.join(main_directory, "bib_white.bib")
         self.modify_color_definitions(color_bib_file, white_bib_file)
         ordered_env_colors = self.get_env_orders(white_bib_file)
-        index_map = defaultdict(int)
 
         with open(white_bib_file, "r") as f:
-            bib_content = f.read()
+                bib_content = f.read()
+
+        index_map = defaultdict(int)
         
         for index, env_color in enumerate(ordered_env_colors):
             env = env_color[: -len(suffix)]
-            # the first one is the color definition, skip it
             bib_new_content = replace_nth(
-                bib_content, "{" + env_color + "}", r"{black}", index_map[env] + 2
+                bib_content, "{" + env_color + "}", r"{black}", index_map[env] + 1 
             )
-
+            bib_new_content = bib_new_content.replace("{" + env_color + "}", "{white}")
             bib_output_file = os.path.join(
                 main_directory,
                 f"bib_{config.folder_prefix}_{str(index).zfill(5)}_{env}_{str(index_map[env]).zfill(5)}.bib",
@@ -509,7 +507,6 @@ def render_one_env(self, main_directory: str) -> None:
                 tex_content = f.read()
 
             bib_file_name = os.path.basename(bib_output_file).split(".")[0]
-            # tex_new_content = re.sub(r"\\bibliography\s*{\s*([^}]+)\s*}", f"\\bibliography{{{bib_file_name}}}", tex_content)
             tex_new_content = re.sub(r"\\bibliography\s*{\s*([^}]+)\s*}", "\\\\bibliography{{{}}}".format(bib_file_name), tex_content)
 
             tex_output_file = os.path.join(
@@ -712,7 +709,7 @@ def colorize(text: str, category_name: str) -> str:
             if category_name == "Reference":
                 # Define regex patterns
                 author_pattern = re.compile(r"\bauthor\s*=\s*[\{\"]")
-                note_pattern = re.compile(r"\bnote\s*=\s*[\{\"]")
+                year_pattern = re.compile(r"\byear\s*=\s*[\{\"]")
 
                 # Find the position of the author and year
                 author_match = author_pattern.search(text)
@@ -728,10 +725,10 @@ def colorize(text: str, category_name: str) -> str:
                     if author_end != -1:
                         text = text[:author_start + 1] + "\\color{Reference_color}" + text[author_start + 1:]
 
-                note_match = note_pattern.search(text)
-                if note_match:
+                year_match = year_pattern.search(text)
+                if year_match:
                     # Find the start of the year field
-                    year_start = note_match.end() - 1
+                    year_start = year_match.end() - 1
                     year_end_1 = text.find("\"", year_start + 1)
                     year_end_2 = text.find("}", year_start + 1)
                     # find the before year_end
@@ -741,19 +738,18 @@ def colorize(text: str, category_name: str) -> str:
                         year_end = max(year_end_1, year_end_2)
                     # Replace year field with black color
                     if year_end != -1:
-                        text = text[:year_end] + "\\color{black}" + text[year_end:]
-                
-                else:
-                    # Check if text ends with "}"
-                    if text.endswith("}"):
-                        # Check if the character before the last "}" is ","
-                        if text[-2] == ",":
-                            text = text[:-2] + ",note={\\color{black}}}"
-                        else:
-                            text = text[:-1] + ",note={\\color{black}}}"
+                        text = text[:year_end] + "\\color{white}" + text[year_end:]
 
             return text
 
+        with open(white_bib, 'r') as bib_file:
+            bib_content = bib_file.read()
+
+        # extract (citation key, entry body) pairs from the bib file with a regex
+        bib_entries = re.findall(r'@.*?\{([^,]*),\n(.*?)[\n, \"]\}', bib_content, re.DOTALL)
+        for item in bib_entries:
+            self.texts["Reference"].append(item)
+
         # Read BibTeX file
         with open(color_bib, 'r', encoding='utf-8') as bib_f:
             bibtex_entries = bib_f.readlines()
@@ -765,7 +761,7 @@ def colorize(text: str, category_name: str) -> str:
                 formatted_entry = f"{entry.strip()}"
             else:
                 formatted_entry = f"  {entry.strip()}"
-            self.texts["Reference"].append(formatted_entry)
+            # self.texts["Reference"].append(formatted_entry)
             colored_ref = colorize(formatted_entry, "Reference")
             colored_references.append(colored_ref)
         # Write back to the BibTeX file

From d59eae1b8363578e7eaed15cee7165dabbc6ec52 Mon Sep 17 00:00:00 2001
From: CHEN YANG <1402375027@qq.com>
Date: Mon, 8 Jul 2024 18:03:12 +0800
Subject: [PATCH 25/39] refactor(main.py): use unsrt as the default
 bibliography style

---
 DocParser/main.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/DocParser/main.py b/DocParser/main.py
index 299f13a..18873a7 100644
--- a/DocParser/main.py
+++ b/DocParser/main.py
@@ -3,6 +3,7 @@
 import os
 import shutil
 from tqdm import tqdm
+import re
 
 
 from vrdu import logger
@@ -90,6 +91,13 @@ def process_one_file(file_name: str) -> None:
     main_directory = os.path.dirname(file_name)
     log.info(f"[VRDU] file: {file_name}, start processing.")
 
+    # use unsrt as the default bibliography style
+    with open(file_name, "r") as file:
+            content = file.read()
+    content = re.sub(r"\\bibliographystyle\s*{\s*([^}]+)\s*}", "\\\\bibliographystyle{unsrt}", content)
+    with open(file_name, "w") as file:
+        file.write(content)
+
     # check if this paper has been processed
     quality_report_file = os.path.join(
         main_directory, "output/result/quality_report.json"

From 36af9cf8068d76f52ef407b69ec106fc9697c46b Mon Sep 17 00:00:00 2001
From: CHEN YANG <1402375027@qq.com>
Date: Fri, 12 Jul 2024 09:50:55 +0800
Subject: [PATCH 26/39] feat(render.py): add a function for bib to get env
 orders

---
 DocParser/vrdu/renderer.py | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py
index be5b9cd..5636ece 100644
--- a/DocParser/vrdu/renderer.py
+++ b/DocParser/vrdu/renderer.py
@@ -441,6 +441,27 @@ def get_env_orders(self, tex_file: str) -> List[str]:
 
         # the definitions are discarded
         return matches[len(colors) :]
+    
+    def get_bib_env_orders(self, tex_file: str) -> List[str]:
+        """Returns every occurrence of a configured color name in `tex_file`, in order of appearance.
+
+        Args:
+            tex_file (str): The path to the .tex file.
+
+        Returns:
+            List[str]: A list of environment orders.
+        """
+        with open(tex_file) as f:
+            contents = f.read()
+        colors = list(config.name2color.values())
+        matches = []
+
+        pattern = "|".join(rf"\b{re.escape(term)}\b" for term in colors)
+        for m in re.finditer(pattern, contents):
+            matches.append(m.group(0))
+
+        # unlike get_env_orders, every match is kept (no definitions are skipped)
+        return matches
 
     def render_one_env(self, main_directory: str) -> None:
         """Render one environment by modifying the corresponding rendering color to black.
@@ -483,10 +504,10 @@ def render_one_env(self, main_directory: str) -> None:
         color_bib_file = os.path.join(main_directory, "bib_colored.bib")
         white_bib_file = os.path.join(main_directory, "bib_white.bib")
         self.modify_color_definitions(color_bib_file, white_bib_file)
-        ordered_env_colors = self.get_env_orders(white_bib_file)
+        ordered_env_colors = self.get_bib_env_orders(white_bib_file)
 
         with open(white_bib_file, "r") as f:
-                bib_content = f.read()
+            bib_content = f.read()
 
         index_map = defaultdict(int)
         

From 6bd14c134d351a5566e6dbc576090c8da10bf08c Mon Sep 17 00:00:00 2001
From: CHEN YANG <1402375027@qq.com>
Date: Fri, 12 Jul 2024 09:51:51 +0800
Subject: [PATCH 27/39] refactor(render.py): modify how the author is colored

---
 DocParser/vrdu/renderer.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py
index 5636ece..86906f7 100644
--- a/DocParser/vrdu/renderer.py
+++ b/DocParser/vrdu/renderer.py
@@ -738,13 +738,17 @@ def colorize(text: str, category_name: str) -> str:
                     # Find the start of the author field
                     author_start = author_match.end() - 1
                     author_end = text.find("}", author_start)
+                    author_mid = text.find(",", author_start)
                     if author_end == -1:
                         author_end = text.find("\"", author_start)
                         if author_end == -1:
                             author_end = text.find("\"", author_start) + 1
                     # Replace author field with colorized version
                     if author_end != -1:
-                        text = text[:author_start + 1] + "\\color{Reference_color}" + text[author_start + 1:]
+                        if author_mid != -1 and author_mid < author_end:
+                            text = text[:author_mid] + ",\\color{Reference_color}" + text[author_mid + 1:]
+                        else:
+                            text = text[:author_start + 1] + "\\color{Reference_color}" + text[author_start + 1:]
 
                 year_match = year_pattern.search(text)
                 if year_match:

From 6aafbb753e8744f2136b7e0be9632ec6fb9be828 Mon Sep 17 00:00:00 2001
From: CHEN YANG <1402375027@qq.com>
Date: Fri, 12 Jul 2024 09:55:03 +0800
Subject: [PATCH 28/39] refactor(layout_annotation.py): add error catching
 mechanism

---
 DocParser/vrdu/layout_annotation.py | 172 +++++++++++++++-------------
 1 file changed, 91 insertions(+), 81 deletions(-)

diff --git a/DocParser/vrdu/layout_annotation.py b/DocParser/vrdu/layout_annotation.py
index 31b7a1f..772d952 100644
--- a/DocParser/vrdu/layout_annotation.py
+++ b/DocParser/vrdu/layout_annotation.py
@@ -278,97 +278,107 @@ def generate_non_figure_bb(self) -> Dict[int, List[Block]]:
             log.debug(f"category: {category}, index: {index}")
 
             elements = []
-            for image_pair in image_pairs:
-                page_index = image_pair[0]
+            try:
+                for image_pair in image_pairs:
+                    page_index = image_pair[0]
 
-                image1_array = np.array(plt.imread(image_pair[1]), dtype=np.uint8)
-                image2_array = np.array(plt.imread(image_pair[2]), dtype=np.uint8)
+                    image1_array = np.array(plt.imread(image_pair[1]), dtype=np.uint8)
+                    image2_array = np.array(plt.imread(image_pair[2]), dtype=np.uint8)
 
-                diff_image = np.abs(image2_array - image1_array, dtype=np.uint8)
-                if np.all(diff_image == 0):
-                    continue
-                labeled_image, num = label(
-                    diff_image > config.threshold, return_num=True
-                )
-                if num == 0:
-                    continue
-
-                regions = regionprops(labeled_image)
-                bounding_boxes = [region.bbox for region in regions]
-
-                if len(bounding_boxes) == 0:
-                    continue
-
-                separations = self.layout_metadata[page_index]["separations"]
-                top_margin = self.layout_metadata[page_index]["top_margin"]
-
-                # We do not consider the cross column case for these envs.
-                if category in envs.one_column_envs:
-                    bounding_boxes = [bb for bb in bounding_boxes]
-                    if len(bounding_boxes) == 0:
+                    diff_image = np.abs(image2_array - image1_array, dtype=np.uint8)
+                    if np.all(diff_image == 0):
                         continue
-                    element = Block(
-                        bounding_box=BoundingBox.from_list(bounding_boxes),
-                        source_code=self.text_info[category][index],
-                        category=config.name2category[category],
-                        page_index=page_index,
+                    labeled_image, num = label(
+                        diff_image > config.threshold, return_num=True
                     )
-                    if elements:
-                        element.parent_block = elements[-1].block_id
-                    elements.append(element)
-                    continue
+                    if num == 0:
+                        continue
 
-                # consider possible cross column case
-                for column in range(self.layout_metadata["num_columns"]):
-                    # min_x: bb[1], min_y: bb[0], max_x: bb[4], max_y: bb[3]
-                    column_boxes = [
-                        bb
-                        for bb in bounding_boxes
-                        if bb[1] >= separations[column]
-                        and bb[1] <= separations[column + 1]
-                    ]
-                    if not column_boxes:
+                    regions = regionprops(labeled_image)
+                    bounding_boxes = [region.bbox for region in regions]
+
+                    if len(bounding_boxes) == 0:
                         continue
 
-                    element = Block(
-                        bounding_box=BoundingBox.from_list(column_boxes),
-                        source_code=self.text_info[category][index],
-                        category=config.name2category[category],
-                        page_index=page_index,
-                    )
-                    if elements:
-                        element.parent_block = elements[-1].block_id
-
-                    if (
-                        len(elements) > 0
-                        and elements[-1].category == element.category
-                        and elements[-1].page_index == element.page_index
-                        and elements[-1].source_code == element.source_code
-                        and elements[-1].bbox.overlap(element.bbox)
-                    ):
-                        elements[-1].bbox = BoundingBox(
-                            min(
-                                elements[-1].bbox.x0,
-                                element.bbox.x0,
-                            ),
-                            min(
-                                elements[-1].bbox.y0,
-                                element.bbox.y0,
-                            ),
-                            max(
-                                elements[-1].bbox.x1,
-                                element.bbox.x1,
-                            ),
-                            max(
-                                elements[-1].bbox.y1,
-                                element.bbox.y1,
-                            ),
+                    separations = self.layout_metadata[page_index]["separations"]
+                    top_margin = self.layout_metadata[page_index]["top_margin"]
+
+                    # We do not consider the cross column case for these envs.
+                    if category in envs.one_column_envs:
+                        bounding_boxes = [bb for bb in bounding_boxes]
+                        if len(bounding_boxes) == 0:
+                            continue
+                        element = Block(
+                            bounding_box=BoundingBox.from_list(bounding_boxes),
+                            source_code=self.text_info[category][index],
+                            category=config.name2category[category],
+                            page_index=page_index,
                         )
+                        if elements:
+                            element.parent_block = elements[-1].block_id
+                        elements.append(element)
                         continue
-                    elements.append(element)
 
-            for element in elements:
-                layout_info[element.page_index].append(element)
+                    # consider possible cross column case
+                    for column in range(self.layout_metadata["num_columns"]):
+                        try:
+                            column_boxes = [
+                                bb
+                                for bb in bounding_boxes
+                                if bb[1] >= separations[column]
+                                and bb[1] <= separations[column + 1]
+                            ]
+                            if not column_boxes:
+                                continue
+
+                            element = Block(
+                                bounding_box=BoundingBox.from_list(column_boxes),
+                                source_code=self.text_info[category][index],
+                                category=config.name2category[category],
+                                page_index=page_index,
+                            )
+                            if elements:
+                                element.parent_block = elements[-1].block_id
+
+                            if (
+                                len(elements) > 0
+                                and elements[-1].category == element.category
+                                and elements[-1].page_index == element.page_index
+                                and elements[-1].source_code == element.source_code
+                                and elements[-1].bbox.overlap(element.bbox)
+                            ):
+                                elements[-1].bbox = BoundingBox(
+                                    min(
+                                        elements[-1].bbox.x0,
+                                        element.bbox.x0,
+                                    ),
+                                    min(
+                                        elements[-1].bbox.y0,
+                                        element.bbox.y0,
+                                    ),
+                                    max(
+                                        elements[-1].bbox.x1,
+                                        element.bbox.x1,
+                                    ),
+                                    max(
+                                        elements[-1].bbox.y1,
+                                        element.bbox.y1,
+                                    ),
+                                )
+                                continue
+                            elements.append(element)
+                        except IndexError:
+                            log.error(f"IndexError: {column}")
+                            continue  # Skip processing for this column if index is out of range
+
+                for element in elements:
+                    layout_info[element.page_index].append(element)
+
+            except Exception as e:
+                # Handle the exception as per your application's requirements
+                log.error(f"Error processing block directory {block_directory}: {str(e)}")
+                # Optionally, you can raise the exception to stop further processing
+                # raise
 
         return layout_info
 

From bd3eb24d18005cd1b8923bdf1c629dcaa3da9922 Mon Sep 17 00:00:00 2001
From: CHEN YANG <1402375027@qq.com>
Date: Fri, 12 Jul 2024 09:56:36 +0800
Subject: [PATCH 29/39] refactor(main.py): change how the bibliographystyle
 is set

---
 DocParser/main.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/DocParser/main.py b/DocParser/main.py
index 18873a7..9082faa 100644
--- a/DocParser/main.py
+++ b/DocParser/main.py
@@ -94,7 +94,11 @@ def process_one_file(file_name: str) -> None:
     # use unsrt as the default bibliography style
     with open(file_name, "r") as file:
             content = file.read()
-    content = re.sub(r"\\bibliographystyle\s*{\s*([^}]+)\s*}", "\\\\bibliographystyle{unsrt}", content)
+    # if no \bibliographystyle is found, add one before \end{document}
+    if not re.search(r"\\bibliographystyle", content):
+        content = re.sub(r"\\end{document}", "\\\\bibliographystyle{unsrt}\n\\\\end{document}", content)
+    else:
+        content = re.sub(r"\\bibliographystyle\s*{\s*([^}]+)\s*}", "\\\\bibliographystyle{unsrt}", content)
     with open(file_name, "w") as file:
         file.write(content)
 
@@ -169,11 +173,12 @@ def process_one_file(file_name: str) -> None:
         log.info(f"[VRDU] file: {original_tex}, successfully processed.")
 
     except Exception as e:
-        error_type = e.__class__.__name__
-        error_info = str(e)
-        log.error(
-            f"[VRDU] file: {file_name}, type: {error_type}, message: {error_info}"
-        )
+        # error_type = e.__class__.__name__
+        # error_info = str(e)
+        # log.error(
+        #     f"[VRDU] file: {file_name}, type: {error_type}, message: {error_info}"
+        # )
+        raise e
 
     finally:
         # remove redundant files

From 77d06dc94180b5502665ce540e1489785a5526ea Mon Sep 17 00:00:00 2001
From: MaoSong2022 <maosong@pjlab.org.cn>
Date: Mon, 17 Jun 2024 11:29:25 +0800
Subject: [PATCH 30/39] refactor(layout_annotation.py): rm reading_annotation

the reading annotation result is already contained in order annotation
---
 DocParser/vrdu/layout_annotation.py | 67 -----------------------------
 1 file changed, 67 deletions(-)

diff --git a/DocParser/vrdu/layout_annotation.py b/DocParser/vrdu/layout_annotation.py
index 772d952..6fbb8c2 100644
--- a/DocParser/vrdu/layout_annotation.py
+++ b/DocParser/vrdu/layout_annotation.py
@@ -409,66 +409,6 @@ def generate_layout_info(self) -> Dict[int, List[Block]]:
             layout_info[page_index].extend(figure_layout_info[page_index])
         return layout_info
 
-    def generate_reading_annotation(
-        self, layout_info: Dict[int, List[Block]]
-    ) -> DefaultDict[str, List]:
-        """Generate a reading annotation based on the layout information.
-
-        Args:
-            layout_info (Dict[int, List[Block]]): A dictionary containing the layout information
-                for each page index. The keys are the page indices and the values are lists of
-                `Block` objects representing the blocks on each page.
-
-        Returns:
-            DefaultDict[str, List]: A defaultdict containing the reading annotation. The keys
-            of the defaultdict are the page indices and the values are lists of dictionaries
-            representing the reading annotation for each block on the page. Each dictionary
-            contains the following keys:
-                - "source_code": The source code of the block.
-                - "image_path": The path to the saved image of the block.
-                - "category": The category of the block.
-
-            The defaultdict also contains the following keys:
-                - "categories": A list of dictionaries representing the categories. Each
-                  dictionary contains the following keys:
-                      - "id": The ID of the category.
-                      - "name": The name of the category.
-                - "macros": A dictionary containing the macro definitions extracted from
-                  the original tex file.
-        """
-        reading_annotation = defaultdict(list)
-
-        # sort all images by page index, see utils.pdf2jpg for details
-        image_files = sorted(
-            glob.glob(os.path.join(self.pdf_images_path, "*.jpg")),
-            key=lambda x: x[-6:-4],
-        )
-        count = 0
-        for page_index in layout_info.keys():
-            page_image = Image.open(image_files[page_index])
-            for block in layout_info[page_index]:
-                cropped_image = page_image.crop(block.bbox)
-
-                image_name = config.folder_prefix + str(count).zfill(4) + ".jpg"
-                count += 1
-                image_path = os.path.join(self.result_directory, image_name)
-                cropped_image.save(image_path)
-                reading_annotation[page_index].append(
-                    {
-                        "source_code": block.source_code,
-                        "image_path": image_name,
-                        "category": block.category,
-                    }
-                )
-            page_image.close()
-
-        reading_annotation["categories"] = [
-            {"id": index, "name": category}
-            for index, category, _ in config.config["category_name"]
-        ]
-
-        return reading_annotation
-
     def generate_image_annotation(
         self, layout_info: Dict[int, List[Block]]
     ) -> Dict[int, Dict[str, Any]]:
@@ -546,13 +486,6 @@ def annotate(self):
             layout_info, image_annotation, file_path=layout_annotation_file
         )
 
-        # step3: generate reading annotation
-        reading_annotation = self.generate_reading_annotation(layout_info)
-        reading_annotation_file = os.path.join(
-            self.result_directory, "reading_annotation.json"
-        )
-        utils.export_to_json(reading_annotation, reading_annotation_file)
-
 
 def get_image_pairs(dir1: str, dir2: str):
     """

From 39b37f9967a3486e636657dc98a9b784fe25a47b Mon Sep 17 00:00:00 2001
From: MaoSong2022 <maosong@pjlab.org.cn>
Date: Mon, 17 Jun 2024 15:12:56 +0800
Subject: [PATCH 31/39] fix(main.py): avoid creating the output dirs twice

---
 DocParser/main.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/DocParser/main.py b/DocParser/main.py
index 9082faa..09c6c34 100644
--- a/DocParser/main.py
+++ b/DocParser/main.py
@@ -119,10 +119,6 @@ def process_one_file(file_name: str) -> None:
     if os.path.exists(output_directory):
         shutil.rmtree(output_directory)
 
-    # output_directory stores the intermediate results
-    # result_directory stores the final results
-    os.makedirs(os.path.join(main_directory, "output/result"))
-
     cwd = os.getcwd()
 
     try:

From e6c750a845d18075b114d8af7eed64528ffeabf0 Mon Sep 17 00:00:00 2001
From: MaoSong2022 <maosong@pjlab.org.cn>
Date: Mon, 17 Jun 2024 15:26:58 +0800
Subject: [PATCH 32/39] refactor(renderer.py): merge logic of processing
 predefined color

---
 DocParser/vrdu/renderer.py | 25 ++++---------------------
 1 file changed, 4 insertions(+), 21 deletions(-)

diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py
index 86906f7..638f056 100644
--- a/DocParser/vrdu/renderer.py
+++ b/DocParser/vrdu/renderer.py
@@ -367,30 +367,13 @@ def remove_hyperref_color(self, color_tex: str) -> None:
         if re.search(pattern, content[:preamble_loc]):
             content = content[:preamble_loc] + hyper_setup + content[preamble_loc:]
 
-        # Write the modified content back to the input file
-        with open(color_tex, "w") as file:
-            file.write(content)
-
-    def remove_lstlisting_color(self, color_tex: str) -> None:
-        """Remove color definitions from a LaTeX file.
-
-        Args:
-            color_tex (str): The path to the LaTeX file.
-
-        Returns:
-            None
-        """
-        # Read the content of the input file
-        with open(color_tex, "r") as file:
-            content = file.read()
-
-        # delete the color definitions
+        # delete the lstlisting color definitions
         pattern = r"\\lstset\{.*?\}"
-        modified_content = re.sub(pattern, "", content)
+        content = re.sub(pattern, "", content)
 
-        # Write the modified content to the output file
+        # Write the modified content back to the input file
         with open(color_tex, "w") as file:
-            file.write(modified_content)
+            file.write(content)
 
     def modify_color_definitions(self, input_file: str, output_file: str) -> None:
         """Modify the pre-defined color definitions in the input file and write the modified content to the output file.

From 4e46a5b566ff2e42c88d21bece090c6bc7268bef Mon Sep 17 00:00:00 2001
From: MaoSong2022 <maosong@pjlab.org.cn>
Date: Mon, 17 Jun 2024 15:27:33 +0800
Subject: [PATCH 33/39] refactor(renderer.py, test/): use more meaningful name

---
 DocParser/vrdu/renderer.py                                | 7 +++----
 ..._hyperref_color.py => test_remove_predefined_color.py} | 8 ++++----
 2 files changed, 7 insertions(+), 8 deletions(-)
 rename tests/{test_remove_hyperref_color.py => test_remove_predefined_color.py} (91%)

diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py
index 638f056..f7da7d9 100644
--- a/DocParser/vrdu/renderer.py
+++ b/DocParser/vrdu/renderer.py
@@ -55,8 +55,7 @@ def render(self, origin_tex: str) -> None:
         self.add_layout_definition(color_tex)
 
         # remove color definitions to prevent conflict
-        self.remove_hyperref_color(color_tex)
-        self.remove_lstlisting_color(color_tex)
+        self.remove_predefined_color(color_tex)
 
         self.render_all_env(color_tex)
 
@@ -335,8 +334,8 @@ def add_layout_definition(self, color_tex: str) -> None:
         with open(color_tex, "w") as f:
             f.write(content)
 
-    def remove_hyperref_color(self, color_tex: str) -> None:
-        """Removes hyperref color settings from a LaTeX file.
+    def remove_predefined_color(self, color_tex: str) -> None:
+        """Removes hyperref and lstlisting color settings from a LaTeX file.
 
         Args:
             color_tex (str): The path to the LaTeX file to modify.
diff --git a/tests/test_remove_hyperref_color.py b/tests/test_remove_predefined_color.py
similarity index 91%
rename from tests/test_remove_hyperref_color.py
rename to tests/test_remove_predefined_color.py
index 3b6a287..356f378 100644
--- a/tests/test_remove_hyperref_color.py
+++ b/tests/test_remove_predefined_color.py
@@ -21,7 +21,7 @@ def test1(self):
             new=unittest.mock.mock_open(read_data=self.mock_file_content1),
             create=True,
         ) as file_mock:
-            self.renderer.remove_hyperref_color(file_mock)
+            self.renderer.remove_predefined_color(file_mock)
             file_mock.assert_called_with(file_mock, "w")
             file_mock().write.assert_called_with(
                 """\\documentclass{article}\\begin{document}\\end{document}"""
@@ -33,7 +33,7 @@ def test2(self):
             new=unittest.mock.mock_open(read_data=self.mock_file_content2),
             create=True,
         ) as file_mock:
-            self.renderer.remove_hyperref_color(file_mock)
+            self.renderer.remove_predefined_color(file_mock)
             file_mock.assert_called_with(file_mock, "w")
             file_mock().write.assert_called_with(
                 """\\documentclass{article}\\usepackage{hyperref}\\hypersetup{colorlinks=false}\n\\begin{document}\\end{document}"""
@@ -45,7 +45,7 @@ def test3(self):
             new=unittest.mock.mock_open(read_data=self.mock_file_content3),
             create=True,
         ) as file_mock:
-            self.renderer.remove_hyperref_color(file_mock)
+            self.renderer.remove_predefined_color(file_mock)
             file_mock.assert_called_with(file_mock, "w")
             file_mock().write.assert_called_with(
                 """\\documentclass{article}\\usepackage[color_links=true]{hyperref}\\hypersetup{colorlinks=false}\n\\begin{document}\\end{document}"""
@@ -57,7 +57,7 @@ def test4(self):
             new=unittest.mock.mock_open(read_data=self.mock_file_content4),
             create=True,
         ) as file_mock:
-            self.renderer.remove_hyperref_color(file_mock)
+            self.renderer.remove_predefined_color(file_mock)
             file_mock.assert_called_with(file_mock, "w")
             file_mock().write.assert_called_with(
                 """\\documentclass{article}\\usepackage[color_links=true]{hyperref}\\usepackage{amsmath}\\hypersetup{colorlinks=false}\n\\begin{document}\\end{document}"""

From 98bed2fc19854f5683dad27f7d9e31cf670c6d02 Mon Sep 17 00:00:00 2001
From: MaoSong2022 <maosong@pjlab.org.cn>
Date: Mon, 15 Jul 2024 16:55:15 +0800
Subject: [PATCH 34/39] fix(all): Module DocParser not found

---
 DocParser/TexSoup/app/conversion.py    |  8 ++++----
 DocParser/vrdu/renderer.py             | 10 +++++-----
 DocParser/vrdu/utils.py                |  4 ++--
 scripts/app.py                         |  4 ++--
 scripts/arxiv_download.py              |  2 +-
 scripts/batch_process.py               |  4 ++--
 scripts/export_to_dataset.py           |  2 +-
 scripts/generate_reading_annotation.py |  4 ++--
 scripts/retrieve_metadata.py           |  4 ++--
 scripts/visualize_order_annotations.py |  2 +-
 setup.py                               |  2 +-
 tests/test_add_definitions.py          |  2 +-
 tests/test_extract_graphics.py         |  2 +-
 tests/test_is_text_eq.py               |  2 +-
 tests/test_remove_predefined_color.py  |  2 +-
 tests/test_render_abstract.py          |  2 +-
 tests/test_render_algorithm.py         |  2 +-
 tests/test_render_caption.py           |  2 +-
 tests/test_render_code.py              |  6 +++---
 tests/test_render_footnote.py          |  2 +-
 tests/test_render_tabular.py           |  2 +-
 tests/test_render_title.py             |  2 +-
 22 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/DocParser/TexSoup/app/conversion.py b/DocParser/TexSoup/app/conversion.py
index 474c228..3ffe746 100644
--- a/DocParser/TexSoup/app/conversion.py
+++ b/DocParser/TexSoup/app/conversion.py
@@ -1,11 +1,11 @@
 import re
 
-from DocParser.TexSoup.TexSoup import TexSoup
-from DocParser.TexSoup.TexSoup.data import TexEnv, TexText, TexCmd, TexGroup
+from TexSoup.TexSoup import TexSoup
+from TexSoup.TexSoup.data import TexEnv, TexText, TexCmd, TexGroup
 
 
-from DocParser.vrdu import logger
-from DocParser.vrdu.config import envs
+from vrdu import logger
+from vrdu.config import envs
 
 log = logger.get_logger(__name__)
 
diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py
index f7da7d9..9e7a408 100644
--- a/DocParser/vrdu/renderer.py
+++ b/DocParser/vrdu/renderer.py
@@ -5,12 +5,12 @@
 import re
 
 
-import DocParser.vrdu.utils as utils
-import DocParser.vrdu.logger as logger
-from DocParser.vrdu.config import config, envs
+import vrdu.utils as utils
+import vrdu.logger as logger
+from vrdu.config import config, envs
 
-from DocParser.TexSoup.TexSoup import TexSoup
-import DocParser.TexSoup.app.conversion as conversion
+from TexSoup.TexSoup import TexSoup
+import TexSoup.app.conversion as conversion
 
 import bibtexparser
 from bibtexparser.bparser import BibTexParser
diff --git a/DocParser/vrdu/utils.py b/DocParser/vrdu/utils.py
index a5f300f..a4d1e9f 100755
--- a/DocParser/vrdu/utils.py
+++ b/DocParser/vrdu/utils.py
@@ -7,8 +7,8 @@
 from pdf2image import pdf2image
 from pdf2image import generators
 
-from DocParser.vrdu.block import Block
-from DocParser.vrdu.config import config
+from vrdu.block import Block
+from vrdu.config import config
 
 
 def export_to_json(data: Union[Dict, List], file_path: str) -> None:
diff --git a/scripts/app.py b/scripts/app.py
index 54b4a1c..549d682 100644
--- a/scripts/app.py
+++ b/scripts/app.py
@@ -3,8 +3,8 @@
 import glob
 from PIL import Image, ImageDraw
 
-from DocParser.vrdu import utils
-from DocParser.vrdu.config import config
+from vrdu import utils
+from vrdu.config import config
 
 pn.extension()
 
diff --git a/scripts/arxiv_download.py b/scripts/arxiv_download.py
index 971f779..c7c9e10 100644
--- a/scripts/arxiv_download.py
+++ b/scripts/arxiv_download.py
@@ -5,7 +5,7 @@
 import tarfile
 
 
-from DocParser.vrdu import logger
+from vrdu import logger
 
 
 log = logger.setup_app_level_logger(logger_name="arxiv_download.log")
diff --git a/scripts/batch_process.py b/scripts/batch_process.py
index 78dbe8d..e357ac4 100644
--- a/scripts/batch_process.py
+++ b/scripts/batch_process.py
@@ -5,8 +5,8 @@
 from typing import List
 import pandas as pd
 
-from DocParser.vrdu import logger
-from DocParser.main import process_one_file
+from vrdu import logger
+from main import process_one_file
 
 log = logger.setup_app_level_logger(file_name="batch_process.log", level="INFO")
 
diff --git a/scripts/export_to_dataset.py b/scripts/export_to_dataset.py
index f8c41d8..fafb3d2 100644
--- a/scripts/export_to_dataset.py
+++ b/scripts/export_to_dataset.py
@@ -6,7 +6,7 @@
 import pandas as pd
 import multiprocessing
 
-from DocParser.vrdu import logger
+from vrdu import logger
 
 log = logger.setup_app_level_logger(file_name="export_to_dataset.log")
 
diff --git a/scripts/generate_reading_annotation.py b/scripts/generate_reading_annotation.py
index f098d64..a4104b7 100644
--- a/scripts/generate_reading_annotation.py
+++ b/scripts/generate_reading_annotation.py
@@ -4,8 +4,8 @@
 import os
 from pathlib import Path
 
-from DocParser.vrdu import utils
-from DocParser.vrdu import logger
+from vrdu import utils
+from vrdu import logger
 
 log = logger.setup_app_level_logger(file_name="generate_reading_annotation.log")
 
diff --git a/scripts/retrieve_metadata.py b/scripts/retrieve_metadata.py
index 6897c67..bf97df2 100644
--- a/scripts/retrieve_metadata.py
+++ b/scripts/retrieve_metadata.py
@@ -6,8 +6,8 @@
 import argparse
 
 
-from DocParser.vrdu import utils
-from DocParser.vrdu import logger
+from vrdu import utils
+from vrdu import logger
 
 log = logger.setup_app_level_logger(file_name="retrieve_metadata.log")
 
diff --git a/scripts/visualize_order_annotations.py b/scripts/visualize_order_annotations.py
index b59b365..2f5bc5b 100644
--- a/scripts/visualize_order_annotations.py
+++ b/scripts/visualize_order_annotations.py
@@ -7,7 +7,7 @@
 from PIL import Image, ImageDraw
 from matplotlib import pyplot as plt
 
-from DocParser.vrdu import utils
+from vrdu import utils
 
 
 def draw_arrow_line(
diff --git a/setup.py b/setup.py
index ad473aa..ba3749e 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup, find_packages
 
 setup(
-    name="vrdu_data_process",
+    name="DocParser",
     version="1.0.0",
     description="process the academic papers with .tex source files",
     author="Mao Song",
diff --git a/tests/test_add_definitions.py b/tests/test_add_definitions.py
index 096ca65..f3ca221 100644
--- a/tests/test_add_definitions.py
+++ b/tests/test_add_definitions.py
@@ -1,7 +1,7 @@
 import unittest
 import unittest.mock
 
-from DocParser.vrdu.renderer import Renderer
+from vrdu.renderer import Renderer
 
 
 def test_add_color_definition1():
diff --git a/tests/test_extract_graphics.py b/tests/test_extract_graphics.py
index 14a2cd5..8335db3 100644
--- a/tests/test_extract_graphics.py
+++ b/tests/test_extract_graphics.py
@@ -2,7 +2,7 @@
 import unittest.mock
 
 
-from DocParser.vrdu.renderer import Renderer
+from vrdu.renderer import Renderer
 
 
 class TestGraphics(unittest.TestCase):
diff --git a/tests/test_is_text_eq.py b/tests/test_is_text_eq.py
index 3baa280..6426411 100644
--- a/tests/test_is_text_eq.py
+++ b/tests/test_is_text_eq.py
@@ -1,6 +1,6 @@
 import unittest
 
-from DocParser.vrdu.renderer import is_text_eq
+from vrdu.renderer import is_text_eq
 
 
 class TestTextEq(unittest.TestCase):
diff --git a/tests/test_remove_predefined_color.py b/tests/test_remove_predefined_color.py
index 356f378..fdc1b34 100644
--- a/tests/test_remove_predefined_color.py
+++ b/tests/test_remove_predefined_color.py
@@ -2,7 +2,7 @@
 import unittest.mock
 
 
-from DocParser.vrdu.renderer import Renderer
+from vrdu.renderer import Renderer
 
 
 class TestHyperref(unittest.TestCase):
diff --git a/tests/test_render_abstract.py b/tests/test_render_abstract.py
index 405f6da..16f2cb9 100644
--- a/tests/test_render_abstract.py
+++ b/tests/test_render_abstract.py
@@ -2,7 +2,7 @@
 import unittest.mock
 
 
-from DocParser.vrdu.renderer import Renderer
+from vrdu.renderer import Renderer
 
 
 class TestAbstract(unittest.TestCase):
diff --git a/tests/test_render_algorithm.py b/tests/test_render_algorithm.py
index a4cf6ad..c15821e 100644
--- a/tests/test_render_algorithm.py
+++ b/tests/test_render_algorithm.py
@@ -2,7 +2,7 @@
 import unittest.mock
 
 
-from DocParser.vrdu.renderer import Renderer
+from vrdu.renderer import Renderer
 
 
 class TestAlgorithm(unittest.TestCase):
diff --git a/tests/test_render_caption.py b/tests/test_render_caption.py
index b526f60..eb21de8 100644
--- a/tests/test_render_caption.py
+++ b/tests/test_render_caption.py
@@ -2,7 +2,7 @@
 import unittest.mock
 
 
-from DocParser.vrdu.renderer import Renderer
+from vrdu.renderer import Renderer
 
 
 class TestCaption(unittest.TestCase):
diff --git a/tests/test_render_code.py b/tests/test_render_code.py
index 55082de..79dae23 100644
--- a/tests/test_render_code.py
+++ b/tests/test_render_code.py
@@ -2,7 +2,7 @@
 import unittest.mock
 
 
-from DocParser.vrdu.renderer import Renderer
+from vrdu.renderer import Renderer
 
 
 class TestCode(unittest.TestCase):
@@ -71,7 +71,7 @@ def test_no_lstset(self):
             new=unittest.mock.mock_open(read_data=self.mock_file_content1),
             create=True,
         ) as file_mock:
-            self.renderer.remove_lstlisting_color(file_mock)
+            self.renderer.remove_predefined_color(file_mock)
             file_mock.assert_called_with(file_mock, "w")
             file_mock().write.assert_called_with(
                 """\\documentclass{article}\\begin{document}\\end{document}"""
@@ -83,7 +83,7 @@ def test_remove_lstset(self):
             new=unittest.mock.mock_open(read_data=self.mock_file_content5),
             create=True,
         ) as file_mock:
-            self.renderer.remove_lstlisting_color(file_mock)
+            self.renderer.remove_predefined_color(file_mock)
             file_mock.assert_called_with(file_mock, "w")
             file_mock().write.assert_called_with(
                 r"""\documentclass{article}\n\usepackage{listings}\n\usepackage{xcolor}\n\n\definecolor{codegreen}{rgb}{0,0.6,0}\n\definecolor{codegray}{rgb}{0.5,0.5,0.5}\n\definecolor{codepurple}{rgb}{0.58,0,0.82}\n\definecolor{backcolour}{rgb}{0.95,0.95,0.92}\n\n\lstdefinestyle{mystyle}{\n    backgroundcolor=\color{backcolour},   \n    commentstyle=\color{codegreen},\n    keywordstyle=\color{magenta},\n    numberstyle=\tiny\color{codegray},\n    stringstyle=\color{codepurple},\n    basicstyle=\ttfamily\footnotesize,\n    breakatwhitespace=false,         \n    breaklines=true,                 \n    captionpos=b,                    \n    keepspaces=true,                 \n    numbers=left,                    \n    numbersep=5pt,                  \n    showspaces=false,                \n    showstringspaces=false,\n    showtabs=false,                  \n    tabsize=2\n}\n\n\n\n\begin{document}\nThe next code will be directly imported from a file\n\n\lstinputlisting[language=Octave]{BitXorMatrix.m}\n\end{document}"""
diff --git a/tests/test_render_footnote.py b/tests/test_render_footnote.py
index e81e0fd..e0fcebd 100644
--- a/tests/test_render_footnote.py
+++ b/tests/test_render_footnote.py
@@ -2,7 +2,7 @@
 import unittest.mock
 
 
-from DocParser.vrdu.renderer import Renderer
+from vrdu.renderer import Renderer
 
 
 class TestFootnote(unittest.TestCase):
diff --git a/tests/test_render_tabular.py b/tests/test_render_tabular.py
index e57f363..7cb1e52 100644
--- a/tests/test_render_tabular.py
+++ b/tests/test_render_tabular.py
@@ -2,7 +2,7 @@
 import unittest.mock
 
 
-from DocParser.vrdu.renderer import Renderer
+from vrdu.renderer import Renderer
 
 
 class TestTabular(unittest.TestCase):
diff --git a/tests/test_render_title.py b/tests/test_render_title.py
index 122063b..343714e 100644
--- a/tests/test_render_title.py
+++ b/tests/test_render_title.py
@@ -2,7 +2,7 @@
 import unittest.mock
 
 
-from DocParser.vrdu.renderer import Renderer
+from vrdu.renderer import Renderer
 
 
 class TestTitle(unittest.TestCase):

From e7f903dcea4ed381a3b5a46a9665ecdbbf7e94a6 Mon Sep 17 00:00:00 2001
From: CHEN YANG <1402375027@qq.com>
Date: Thu, 20 Jun 2024 11:12:27 +0800
Subject: [PATCH 35/39] refactor(preprocess.py): use a more robust way to
 replace figure extensions

---
 vrdu/preprocess.py | 161 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 161 insertions(+)
 create mode 100644 vrdu/preprocess.py

diff --git a/vrdu/preprocess.py b/vrdu/preprocess.py
new file mode 100644
index 0000000..a3dc264
--- /dev/null
+++ b/vrdu/preprocess.py
@@ -0,0 +1,161 @@
+import os
+import re
+
+from arxiv_cleaner.cleaner import Cleaner
+
+from vrdu.config import envs, config
+from vrdu import utils
+import vrdu.logger as logger
+
+
+log = logger.get_logger(__name__)
+
+
+def remove_comments(original_tex: str) -> None:
+    """
+    Removes comments from a TeX file.
+
+    Args:
+        original_tex (str): The path to the original TeX file.
+
+    Returns:
+        None
+    """
+    with open(original_tex, "r") as file:
+        content = file.read()
+
+    # Remove LaTeX comment environments (\begin{comment} ... \end{comment})
+    pattern = r"\\begin{comment}(.*?)\\end{comment}"
+    removed_comments = re.sub(pattern, "", content, flags=re.DOTALL)
+
+    with open(original_tex, "w") as file:
+        file.write(removed_comments)
+
+
+def clean_tex(original_tex: str) -> None:
+    """
+    Clean the given TeX file by creating a cleaner object and running the clean method.
+
+    Args:
+        original_tex (str): The path to the original TeX file.
+
+    Returns:
+        None
+    """
+    main_directory = os.path.dirname(original_tex)
+    tex = os.path.basename(original_tex)
+
+    # Create the cleaner
+    cleaner = Cleaner(
+        input_dir=main_directory,
+        output_dir=main_directory,
+        tex=tex,
+        command_options=config.command_options,
+        verbose=False,
+    )
+
+    # Run the cleaner
+    cleaner.clean()
+
+    # remove comments
+    remove_comments(original_tex)
+
+
+def replace_figures_extension_with_png(original_tex: str) -> None:
+    """
+    Replaces PDF, ps, eps figures' extension with PNG in a TeX file
+    to support pdfminer detecting bounding box.
+
+    Args:
+        original_tex (str): The path to the original TeX file.
+
+    Returns:
+        None: This function does not return anything.
+    """
+    main_directory = os.path.dirname(original_tex)
+    image_extensions = [".eps", ".ps", ".jpg", ".jpeg", ".png", ".pdf"]
+    image_files = {}
+    for root, _, files in os.walk(main_directory):
+        for file in files:
+            if any(file.endswith(ext) for ext in image_extensions):
+                image_name, ext = os.path.splitext(file)
+                # Store the relative path of the image as the value
+                image_files[image_name] = os.path.relpath(os.path.join(root, file), main_directory)
+
+    with open(original_tex, 'r') as f:
+        content = f.read()
+
+    # Replace \psfig and \epsfig commands with \includegraphics command
+    def custom_replace(match):
+        options = match.group(1) or ''
+        filepath = match.group(2)
+        if options:
+            return f"\\includegraphics[{options}]{{{filepath}}}"
+        else:
+            return f"\\includegraphics{{{filepath}}}"
+
+    content = re.sub(r"\\psfig(?:\[(.*?)\])?{(.+?)}", custom_replace, content)
+    content = re.sub(r"\\epsfig(?:\[(.*?)\])?{(.+?)}", custom_replace, content)
+
+    # Traverse the image_files dictionary to update file extensions
+    for image_name, file_path in image_files.items():
+        base_name, current_extension = os.path.splitext(image_name)
+        correct_extension = os.path.splitext(file_path)[1]
+
+        if correct_extension not in ['.jpg', '.jpeg']:
+            correct_extension = '.png'
+
+        # Build a regular expression to match image files including optional extensions
+        pattern = re.compile(r'(\\includegraphics(?:\[[^\]]*\])?\{.*?' + re.escape(base_name) + r')(\.\w+)?\}')
+        replacement = rf'\1{correct_extension}}}'
+        content = pattern.sub(replacement, content)
+
+    # Write the updated content back to the file
+    with open(original_tex, 'w') as f:
+        f.write(content)
+
+
+def delete_table_of_contents(original_tex: str) -> None:
+    """
+    Deletes the table of contents from the given original_tex file.
+    This includes table of contents, list of figures, list of tables, and list of algorithms.
+
+    Parameters:
+        original_tex (str): The path to the original .tex file.
+
+    Returns:
+        None
+    """
+    with open(original_tex, "r") as file:
+        latex_content = file.read()
+
+    pattern = r"\\(" + "|".join(envs.table_of_contents) + r")"
+    modified_content = re.sub(pattern, "", latex_content)
+
+    with open(original_tex, "w") as file:
+        file.write(modified_content)
+
+
+def run(original_tex: str) -> None:
+    """
+    Generates a modified version of the given LaTeX document by performing the following steps:
+
+    Step 0: Clean the LaTeX document with arxiv_cleaner package.
+    Step 1: Convert psfig/epsfig commands to includegraphics and point every
+            non-JPEG figure reference at a PNG file to make pdfminer work.
+    Step 2: Delete the table of contents from the LaTeX document.
+
+    Args:
+        original_tex (str): The path to the original LaTeX document.
+
+    Returns:
+        None
+    """
+    # Step 0: clean tex
+    clean_tex(original_tex)
+
+    # Step 1: process images
+    replace_figures_extension_with_png(original_tex)
+
+    # Step 2: delete table of contents
+    delete_table_of_contents(original_tex)

From 8b80b23c1083c9d57c9a3d121e28b8f83a4b2f8c Mon Sep 17 00:00:00 2001
From: CHEN YANG <1402375027@qq.com>
Date: Thu, 4 Jul 2024 10:51:31 +0800
Subject: [PATCH 36/39] feat(renderer.py): add function to handle reference
 rendering

---
 DocParser/vrdu/renderer.py | 52 +++++++++++++++++++-------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py
index 9e7a408..49b3b39 100644
--- a/DocParser/vrdu/renderer.py
+++ b/DocParser/vrdu/renderer.py
@@ -457,6 +457,10 @@ def render_one_env(self, main_directory: str) -> None:
         # handle latex file
         color_tex_file = os.path.join(main_directory, "paper_colored.tex")
         white_tex_file = os.path.join(main_directory, "paper_white.tex")
+        
+        paper_bib_white = os.path.join(main_directory, "paper_bib_white.tex")
+        shutil.copyfile(color_tex_file, paper_bib_white)
+
         self.modify_color_definitions(color_tex_file, white_tex_file)
         ordered_env_colors = self.get_env_orders(white_tex_file)
         suffix = "_color"
@@ -481,24 +485,22 @@ def render_one_env(self, main_directory: str) -> None:
                 f.write(new_content)
 
         # handle bib file
-        paper_bib_white = os.path.join(main_directory, "paper_bib_white.tex")
-        shutil.copyfile(white_tex_file, paper_bib_white)
         color_bib_file = os.path.join(main_directory, "bib_colored.bib")
         white_bib_file = os.path.join(main_directory, "bib_white.bib")
         self.modify_color_definitions(color_bib_file, white_bib_file)
-        ordered_env_colors = self.get_bib_env_orders(white_bib_file)
+        ordered_env_colors = self.get_env_orders(white_bib_file)
+        index_map = defaultdict(int)
 
         with open(white_bib_file, "r") as f:
             bib_content = f.read()
-
-        index_map = defaultdict(int)
         
         for index, env_color in enumerate(ordered_env_colors):
             env = env_color[: -len(suffix)]
+            # the first one is the color definition, skip it
             bib_new_content = replace_nth(
-                bib_content, "{" + env_color + "}", r"{black}", index_map[env] + 1 
+                bib_content, "{" + env_color + "}", r"{black}", index_map[env] + 2
             )
-            bib_new_content = bib_new_content.replace("{" + env_color + "}", "{white}")
+
             bib_output_file = os.path.join(
                 main_directory,
                 f"bib_{config.folder_prefix}_{str(index).zfill(5)}_{env}_{str(index_map[env]).zfill(5)}.bib",
@@ -510,6 +512,7 @@ def render_one_env(self, main_directory: str) -> None:
                 tex_content = f.read()
 
             bib_file_name = os.path.basename(bib_output_file).split(".")[0]
+            # tex_new_content = re.sub(r"\\bibliography\s*{\s*([^}]+)\s*}", f"\\bibliography{{{bib_file_name}}}", tex_content)
             tex_new_content = re.sub(r"\\bibliography\s*{\s*([^}]+)\s*}", "\\\\bibliography{{{}}}".format(bib_file_name), tex_content)
 
             tex_output_file = os.path.join(
@@ -712,7 +715,7 @@ def colorize(text: str, category_name: str) -> str:
             if category_name == "Reference":
                 # Define regex patterns
                 author_pattern = re.compile(r"\bauthor\s*=\s*[\{\"]")
-                year_pattern = re.compile(r"\byear\s*=\s*[\{\"]")
+                note_pattern = re.compile(r"\bnote\s*=\s*[\{\"]")
 
                 # Find the position of the author and year
                 author_match = author_pattern.search(text)
@@ -720,22 +723,18 @@ def colorize(text: str, category_name: str) -> str:
                     # Find the start of the author field
                     author_start = author_match.end() - 1
                     author_end = text.find("}", author_start)
-                    author_mid = text.find(",", author_start)
                     if author_end == -1:
                         author_end = text.find("\"", author_start)
                         if author_end == -1:
                             author_end = text.find("\"", author_start) + 1
                     # Replace author field with colorized version
                     if author_end != -1:
-                        if author_mid != -1 and author_mid < author_end:
-                            text = text[:author_mid] + ",\\color{Reference_color}" + text[author_mid + 1:]
-                        else:
-                            text = text[:author_start + 1] + "\\color{Reference_color}" + text[author_start + 1:]
+                        text = text[:author_start + 1] + "\\color{Reference_color}" + text[author_start + 1:]
 
-                year_match = year_pattern.search(text)
-                if year_match:
+                note_match = note_pattern.search(text)
+                if note_match:
                     # Find the start of the year field
-                    year_start = year_match.end() - 1
+                    year_start = note_match.end() - 1
                     year_end_1 = text.find("\"", year_start + 1)
                     year_end_2 = text.find("}", year_start + 1)
                     # find the before year_end
@@ -745,18 +744,19 @@ def colorize(text: str, category_name: str) -> str:
                         year_end = max(year_end_1, year_end_2)
                     # Replace year field with black color
                     if year_end != -1:
-                        text = text[:year_end] + "\\color{white}" + text[year_end:]
+                        text = text[:year_end] + "\\color{black}" + text[year_end:]
+                
+                else:
+                    # Check if text ends with "}"
+                    if text.endswith("}"):
+                        # Check if the character before the last "}" is ","
+                        if text[-2] == ",":
+                            text = text[:-2] + ",note={\\color{black}}}"
+                        else:
+                            text = text[:-1] + ",note={\\color{black}}}"
 
             return text
 
-        with open(white_bib, 'r') as bib_file:
-            bib_content = bib_file.read()
-
-        # use bibtexparser to parse the bib file
-        bib_entries = re.findall(r'@.*?\{([^,]*),\n(.*?)[\n, \"]\}', bib_content, re.DOTALL)
-        for item in bib_entries:
-            self.texts["Reference"].append(item)
-
         # Read BibTeX file
         with open(color_bib, 'r', encoding='utf-8') as bib_f:
             bibtex_entries = bib_f.readlines()
@@ -768,7 +768,7 @@ def colorize(text: str, category_name: str) -> str:
                 formatted_entry = f"{entry.strip()}"
             else:
                 formatted_entry = f"  {entry.strip()}"
-            # self.texts["Reference"].append(formatted_entry)
+            self.texts["Reference"].append(formatted_entry)
             colored_ref = colorize(formatted_entry, "Reference")
             colored_references.append(colored_ref)
         # Write back to the BibTeX file

From 05a3b5f42c6adf3f611c1d728258473099225e5e Mon Sep 17 00:00:00 2001
From: CHEN YANG <1402375027@qq.com>
Date: Tue, 16 Jul 2024 11:21:01 +0800
Subject: [PATCH 37/39] feat(preprocess.py): add function to remove vskip

---
 DocParser/vrdu/preprocess.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/DocParser/vrdu/preprocess.py b/DocParser/vrdu/preprocess.py
index 4d02e9c..9e825bf 100644
--- a/DocParser/vrdu/preprocess.py
+++ b/DocParser/vrdu/preprocess.py
@@ -32,6 +32,25 @@ def remove_comments(original_tex: str) -> None:
     with open(original_tex, "w") as file:
         file.write(removed_comments)
 
+def remove_skip(original_tex: str) -> None:
+    """
+    Removes vertical spacing commands (vskip, vspace, vglue) from a TeX file.
+
+    Args:
+        original_tex (str): The path to the original TeX file.
+
+    Returns:
+        None
+    """
+    with open(original_tex, "r") as file:
+        content = file.read()
+
+    pattern = r"\\vskip .*|\\vspace{.*}|\\vglue .*"
+    removed_skip = re.sub(pattern, '', content)
+
+    with open(original_tex, "w") as file:
+        file.write(removed_skip)
+    
 
 def clean_tex(original_tex: str) -> None:
     """
@@ -61,6 +80,9 @@ def clean_tex(original_tex: str) -> None:
     # remove comments
     remove_comments(original_tex)
 
+    # remove skip
+    remove_skip(original_tex)
+
 
 def replace_non_png_jpg_figures(original_tex: str) -> None:
     """

From d693f4735e51eb7e25106e2a1721fe1b86a79fa8 Mon Sep 17 00:00:00 2001
From: CHEN YANG <1402375027@qq.com>
Date: Tue, 16 Jul 2024 11:26:11 +0800
Subject: [PATCH 38/39] refactor(renderer.py): modify the method that handles bib_file

---
 DocParser/vrdu/renderer.py | 92 +++++++++++++++++++++-----------------
 1 file changed, 52 insertions(+), 40 deletions(-)

diff --git a/DocParser/vrdu/renderer.py b/DocParser/vrdu/renderer.py
index 49b3b39..88c26d9 100644
--- a/DocParser/vrdu/renderer.py
+++ b/DocParser/vrdu/renderer.py
@@ -12,13 +12,8 @@
 from TexSoup.TexSoup import TexSoup
 import TexSoup.app.conversion as conversion
 
-import bibtexparser
-from bibtexparser.bparser import BibTexParser
-from bibtexparser.customization import convert_to_unicode
-
 log = logger.get_logger(__name__)
 
-
 class Renderer:
     def __init__(self) -> None:
         self.texts = defaultdict(list)
@@ -55,7 +50,8 @@ def render(self, origin_tex: str) -> None:
         self.add_layout_definition(color_tex)
 
         # remove color definitions to prevent conflict
-        self.remove_predefined_color(color_tex)
+        self.remove_hyperref_color(color_tex)
+        self.remove_lstlisting_color(color_tex)
 
         self.render_all_env(color_tex)
 
@@ -334,8 +330,8 @@ def add_layout_definition(self, color_tex: str) -> None:
         with open(color_tex, "w") as f:
             f.write(content)
 
-    def remove_predefined_color(self, color_tex: str) -> None:
-        """Removes hyperref and lstlisting color settings from a LaTeX file.
+    def remove_hyperref_color(self, color_tex: str) -> None:
+        """Removes hyperref color settings from a LaTeX file.
 
         Args:
             color_tex (str): The path to the LaTeX file to modify.
@@ -366,14 +362,31 @@ def remove_predefined_color(self, color_tex: str) -> None:
         if re.search(pattern, content[:preamble_loc]):
             content = content[:preamble_loc] + hyper_setup + content[preamble_loc:]
 
-        # delete the lstlisting color definitions
-        pattern = r"\\lstset\{.*?\}"
-        content = re.sub(pattern, "", content)
-
         # Write the modified content back to the input file
         with open(color_tex, "w") as file:
             file.write(content)
 
+    def remove_lstlisting_color(self, color_tex: str) -> None:
+        """Removes lstlisting settings (lstset commands) from a LaTeX file.
+
+        Args:
+            color_tex (str): The path to the LaTeX file.
+
+        Returns:
+            None
+        """
+        # Read the content of the input file
+        with open(color_tex, "r") as file:
+            content = file.read()
+
+        # delete the lstlisting settings (\lstset{...})
+        pattern = r"\\lstset\{.*?\}"
+        modified_content = re.sub(pattern, "", content)
+
+        # Write the modified content back to the input file
+        with open(color_tex, "w") as file:
+            file.write(modified_content)
+
     def modify_color_definitions(self, input_file: str, output_file: str) -> None:
         """Modify the pre-defined color definitions in the input file and write the modified content to the output file.
 
@@ -457,10 +470,6 @@ def render_one_env(self, main_directory: str) -> None:
         # handle latex file
         color_tex_file = os.path.join(main_directory, "paper_colored.tex")
         white_tex_file = os.path.join(main_directory, "paper_white.tex")
-        
-        paper_bib_white = os.path.join(main_directory, "paper_bib_white.tex")
-        shutil.copyfile(color_tex_file, paper_bib_white)
-
         self.modify_color_definitions(color_tex_file, white_tex_file)
         ordered_env_colors = self.get_env_orders(white_tex_file)
         suffix = "_color"
@@ -485,34 +494,35 @@ def render_one_env(self, main_directory: str) -> None:
                 f.write(new_content)
 
         # handle bib file
+        paper_bib_white = os.path.join(main_directory, "paper_bib_white.tex")
+        shutil.copyfile(white_tex_file, paper_bib_white)
         color_bib_file = os.path.join(main_directory, "bib_colored.bib")
         white_bib_file = os.path.join(main_directory, "bib_white.bib")
         self.modify_color_definitions(color_bib_file, white_bib_file)
-        ordered_env_colors = self.get_env_orders(white_bib_file)
-        index_map = defaultdict(int)
+        ordered_env_colors = self.get_bib_env_orders(white_bib_file)
+        # print(ordered_env_colors)
 
         with open(white_bib_file, "r") as f:
             bib_content = f.read()
+
+        index_map = defaultdict(int)
         
         for index, env_color in enumerate(ordered_env_colors):
             env = env_color[: -len(suffix)]
-            # the first one is the color definition, skip it
             bib_new_content = replace_nth(
-                bib_content, "{" + env_color + "}", r"{black}", index_map[env] + 2
+                bib_content, "{" + env_color + "}", r"{black}", index_map[env] + 1 
             )
-
+            bib_new_content = bib_new_content.replace("{" + env_color + "}", "{white}")
             bib_output_file = os.path.join(
                 main_directory,
                 f"bib_{config.folder_prefix}_{str(index).zfill(5)}_{env}_{str(index_map[env]).zfill(5)}.bib",
             )
 
             # change the bib file name in paper_bib_white.tex
-            # \bibliographystyle{bib file name}
             with open(paper_bib_white, "r") as f:
                 tex_content = f.read()
 
             bib_file_name = os.path.basename(bib_output_file).split(".")[0]
-            # tex_new_content = re.sub(r"\\bibliography\s*{\s*([^}]+)\s*}", f"\\bibliography{{{bib_file_name}}}", tex_content)
             tex_new_content = re.sub(r"\\bibliography\s*{\s*([^}]+)\s*}", "\\\\bibliography{{{}}}".format(bib_file_name), tex_content)
 
             tex_output_file = os.path.join(
@@ -715,7 +725,7 @@ def colorize(text: str, category_name: str) -> str:
             if category_name == "Reference":
                 # Define regex patterns
                 author_pattern = re.compile(r"\bauthor\s*=\s*[\{\"]")
-                note_pattern = re.compile(r"\bnote\s*=\s*[\{\"]")
+                year_pattern = re.compile(r"\byear\s*=\s*[\{\"]")
 
                 # Find the position of the author and year
                 author_match = author_pattern.search(text)
@@ -723,18 +733,22 @@ def colorize(text: str, category_name: str) -> str:
                     # Find the start of the author field
                     author_start = author_match.end() - 1
                     author_end = text.find("}", author_start)
+                    author_mid = text.find(",", author_start)
                     if author_end == -1:
                         author_end = text.find("\"", author_start)
                         if author_end == -1:
                             author_end = text.find("\"", author_start) + 1
                     # Replace author field with colorized version
                     if author_end != -1:
-                        text = text[:author_start + 1] + "\\color{Reference_color}" + text[author_start + 1:]
-
-                note_match = note_pattern.search(text)
-                if note_match:
+                        if author_mid != -1 and author_mid < author_end:
+                            text = text[:author_mid] + ",\\color{Reference_color}" + text[author_mid + 1:]
+                        else:
+                            text = text[:author_start + 1] + "\\color{Reference_color}" + text[author_start + 1:]
+                
+                year_match = year_pattern.search(text)
+                if year_match:
                     # Find the start of the year field
-                    year_start = note_match.end() - 1
+                    year_start = year_match.end() - 1
                     year_end_1 = text.find("\"", year_start + 1)
                     year_end_2 = text.find("}", year_start + 1)
                     # find the before year_end
@@ -744,19 +758,18 @@ def colorize(text: str, category_name: str) -> str:
                         year_end = max(year_end_1, year_end_2)
                     # Replace year field with black color
                     if year_end != -1:
-                        text = text[:year_end] + "\\color{black}" + text[year_end:]
-                
-                else:
-                    # Check if text ends with "}"
-                    if text.endswith("}"):
-                        # Check if the character before the last "}" is ","
-                        if text[-2] == ",":
-                            text = text[:-2] + ",note={\\color{black}}}"
-                        else:
-                            text = text[:-1] + ",note={\\color{black}}}"
+                        text = text[:year_end] + "\\color{white}" + text[year_end:]
 
             return text
 
+        with open(white_bib, 'r') as bib_file:
+            bib_content = bib_file.read()
+
+        # extract (citation key, body) pairs from the bib file with a regex
+        bib_entries = re.findall(r'@.*?\{([^,]*),\n(.*?)[\n, \"]\}', bib_content, re.DOTALL)
+        for item in bib_entries:
+            self.texts["Reference"].append(item)
+
         # Read BibTeX file
         with open(color_bib, 'r', encoding='utf-8') as bib_f:
             bibtex_entries = bib_f.readlines()
@@ -768,7 +781,6 @@ def colorize(text: str, category_name: str) -> str:
                 formatted_entry = f"{entry.strip()}"
             else:
                 formatted_entry = f"  {entry.strip()}"
-            self.texts["Reference"].append(formatted_entry)
             colored_ref = colorize(formatted_entry, "Reference")
             colored_references.append(colored_ref)
         # Write back to the BibTeX file

From 16a2cf8f920c6abb23a7a7c94ee56888b4544c44 Mon Sep 17 00:00:00 2001
From: MaoSong2022 <maosong@pjlab.org.cn>
Date: Wed, 17 Jul 2024 14:59:18 +0800
Subject: [PATCH 39/39] refactor(vrdu/): remove redundant folders

---
 vrdu/preprocess.py | 161 ---------------------------------------------
 1 file changed, 161 deletions(-)
 delete mode 100644 vrdu/preprocess.py

diff --git a/vrdu/preprocess.py b/vrdu/preprocess.py
deleted file mode 100644
index a3dc264..0000000
--- a/vrdu/preprocess.py
+++ /dev/null
@@ -1,161 +0,0 @@
-import os
-import re
-
-from arxiv_cleaner.cleaner import Cleaner
-
-from vrdu.config import envs, config
-from vrdu import utils
-import vrdu.logger as logger
-
-
-log = logger.get_logger(__name__)
-
-
-def remove_comments(original_tex: str) -> None:
-    """
-    Removes comments from a TeX file.
-
-    Args:
-        original_tex (str): The path to the original TeX file.
-
-    Returns:
-        None
-    """
-    with open(original_tex, "r") as file:
-        content = file.read()
-
-    # Remove LaTeX comments
-    pattern = r"\\begin{comment}(.*?)\\end{comment}"
-    removed_comments = re.sub(pattern, "", content, flags=re.DOTALL)
-
-    with open(original_tex, "w") as file:
-        file.write(removed_comments)
-
-
-def clean_tex(original_tex: str) -> None:
-    """
-    Clean the given TeX file by creating a cleaner object and running the clean method.
-
-    Args:
-        original_tex (str): The path to the original TeX file.
-
-    Returns:
-        None
-    """
-    main_directory = os.path.dirname(original_tex)
-    tex = os.path.basename(original_tex)
-
-    # Create the cleaner
-    cleaner = Cleaner(
-        input_dir=main_directory,
-        output_dir=main_directory,
-        tex=tex,
-        command_options=config.command_options,
-        verbose=False,
-    )
-
-    # Run the cleaner
-    cleaner.clean()
-
-    # remove comments
-    remove_comments(original_tex)
-
-
-def replace_figures_extension_with_png(original_tex: str) -> None:
-    """
-    Replaces PDF, ps, eps figures' extension with PNG in a TeX file
-    to support pdfminer detecting bounding box.
-
-    Args:
-        original_tex (str): The path to the original TeX file.
-
-    Returns:
-        None: This function does not return anything.
-    """
-    main_directory = os.path.dirname(original_tex)
-    image_extensions = [".eps", ".ps", ".jpg", ".jpeg", ".png", ".pdf"]
-    image_files = {}
-    for root, _, files in os.walk(main_directory):
-        for file in files:
-            if any(file.endswith(ext) for ext in image_extensions):
-                image_name, ext = os.path.splitext(file)
-                # Store the relative path of the image as the value
-                image_files[image_name] = os.path.relpath(os.path.join(root, file), main_directory)
-
-    with open(original_tex, 'r') as f:
-        content = f.read()
-
-    # Replace \psfig and \epsfig commands with \includegraphics command
-    def custom_replace(match):
-        options = match.group(1) or ''
-        filepath = match.group(2)
-        if options:
-            return f"\\includegraphics[{options}]{{{filepath}}}"
-        else:
-            return f"\\includegraphics{{{filepath}}}"
-
-    content = re.sub(r"\\psfig(?:\[(.*?)\])?{(.+?)}", custom_replace, content)
-    content = re.sub(r"\\epsfig(?:\[(.*?)\])?{(.+?)}", custom_replace, content)
-
-    # Traverse the image_files dictionary to update file extensions
-    for image_name, file_path in image_files.items():
-        base_name, current_extension = os.path.splitext(image_name)
-        correct_extension = os.path.splitext(file_path)[1]
-
-        if correct_extension not in ['.jpg', '.jpeg']:
-            correct_extension = '.png'
-
-        # Build a regular expression to match image files including optional extensions
-        pattern = re.compile(r'(\\includegraphics(?:\[[^\]]*\])?\{.*?' + re.escape(base_name) + r')(\.\w+)?\}')
-        replacement = rf'\1{correct_extension}}}'
-        content = pattern.sub(replacement, content)
-
-    # Write the updated content back to the file
-    with open(original_tex, 'w') as f:
-        f.write(content)
-
-
-def delete_table_of_contents(original_tex: str) -> None:
-    """
-    Deletes the table of contents from the given original_tex file.
-    This includes table of contents, list of figures, list of tables, and list of algorithms.
-
-    Parameters:
-        original_tex (str): The path to the original .tex file.
-
-    Returns:
-        None
-    """
-    with open(original_tex, "r") as file:
-        latex_content = file.read()
-
-    pattern = r"\\(" + "|".join(envs.table_of_contents) + r")"
-    modified_content = re.sub(pattern, "", latex_content)
-
-    with open(original_tex, "w") as file:
-        file.write(modified_content)
-
-
-def run(original_tex: str) -> None:
-    """
-    Generates a modified version of the given LaTeX document by performing the following steps:
-
-    Step 0: Clean the LaTeX document with arxiv_cleaner package.
-    Step 1: Replace EPS figures with PDF to make the LaTeX document compilable with pdflatex.
-    Step 2: Replace PDF figures with PNG to make pdfminer work.
-    Step 3: Delete the table of contents from the LaTeX document.
-
-    Args:
-        original_tex (str): The original LaTeX document.
-
-    Returns:
-        None
-    """
-    # Step 0: clean tex
-    clean_tex(original_tex)
-
-    # Step 1: process images
-    replace_figures_extension_with_png(original_tex)
-
-    # Step 3: delete table of contents
-    delete_table_of_contents(original_tex)