Merged

60 commits
a60c8cf
fix(preprocess.py): variable name error
MaoSong2022 Mar 6, 2024
007809c
refactor(batch_process.py, run_statistics.py): normalize log info
MaoSong2022 Mar 6, 2024
8eed6da
refactor(run_statistics.py): remove unused log
MaoSong2022 Mar 6, 2024
3bc1be8
fix(preprocess.py): TypeError
MaoSong2022 Mar 7, 2024
f3bc245
refactor(run_statistics.py): use more robust parser
MaoSong2022 Mar 7, 2024
f3f6f53
feat(utils.py): add timeout to prevent getting stuck
MaoSong2022 Mar 11, 2024
e2217e7
feature(config.py): add paper title and abstract for order annotation
Mar 19, 2024
d2c8030
refactor(annotation.py): rewrite OrderAnnotation
Mar 19, 2024
7939562
Merge branch 'dev' of https://github.com/MaoSong2022/vrdu_data_proces…
MaoSong2022 Mar 19, 2024
11de6e8
refactor(export_to_dataset.py): simplify code logic
MaoSong2022 Mar 19, 2024
a28beb4
fix(batch_process.py): filter previous redundant files
MaoSong2022 Mar 19, 2024
ae6d448
feat(run_statistics.py): simplify code logic
MaoSong2022 Mar 19, 2024
892a1d3
fix(block.py): variable name error
MaoSong2022 Mar 22, 2024
b762f86
fix(config.py): variable name error
MaoSong2022 Mar 22, 2024
c2a96d9
fix(block.py): argument missing error
MaoSong2022 Mar 27, 2024
981605e
feat(block.py): enable setting block_id
MaoSong2022 Mar 27, 2024
cfc0344
fix(annotation.py): type error
MaoSong2022 Mar 27, 2024
1fbe06b
fix(annotation.py): variable shadow error
MaoSong2022 Mar 27, 2024
addf72f
fix(annotation.py): adjacent block relationship error
MaoSong2022 Mar 27, 2024
ce13e14
fix(annotation.py): infinite loop error
MaoSong2022 Mar 27, 2024
bbc11fd
fix(annotation.py): missing annotation for peer titles
MaoSong2022 Mar 27, 2024
fa445bd
fix(annotation.py): variable shadow error
MaoSong2022 Mar 27, 2024
309bd22
feat(block.py): add labels and references fields
MaoSong2022 Mar 28, 2024
25f9144
fix(block.py): variable type error
MaoSong2022 Mar 28, 2024
f8544b9
fix(block.py): list of group error
MaoSong2022 Mar 28, 2024
fe96622
fix(block.py): variable type error
MaoSong2022 Mar 28, 2024
2c296be
refactor(utils.py): simplify code logic
MaoSong2022 Mar 28, 2024
f8f72bf
docs(batch_process.py): add and remove docs
MaoSong2022 Mar 28, 2024
85462ba
refactor(batch_process.py): simplify code logic
MaoSong2022 Mar 28, 2024
838a59b
refactor(batch_process.py): delete unused conditions
MaoSong2022 Mar 28, 2024
26d33a0
refactor(main.py): add type hint
MaoSong2022 Mar 28, 2024
50bc528
refactor(main.py): simplify code logic
MaoSong2022 Mar 28, 2024
0066c16
feat(app.py): visualize the result
MaoSong2022 Mar 29, 2024
b156e65
fix(app.py): index error
MaoSong2022 Mar 29, 2024
0a4df79
refactor(convert_coco_to_yolo.py): reorg files
MaoSong2022 Mar 29, 2024
682cd99
refactor(annotation.py, order_annotation.py, main.py): move OrderAnno…
MaoSong2022 Mar 29, 2024
161af7b
refactor(layout_annotation.py, main.py): rename file
MaoSong2022 Mar 29, 2024
6c4d718
docs(export_to_dataset.py): add debugging info
MaoSong2022 Mar 29, 2024
950472c
build(.gitignore): ignore data folder
MaoSong2022 Mar 29, 2024
049e8a7
refactor(layout_annotation.py, quality_check.py, main.py): split logi…
MaoSong2022 Mar 29, 2024
df53e67
fix(layout_annotation.py, order_annotation.py): data transformation e…
MaoSong2022 Mar 29, 2024
69b5283
feat(quality_check.py): support order annotation loading
MaoSong2022 Apr 1, 2024
3427d25
feat(clean_data.py, extract_category_block.py): remove unrelated files
Apr 3, 2024
abe0744
refactor(utils.py): use meaningful variable names
Apr 3, 2024
79ec1c7
feat(vrdu_table_augmentation.py): remove unrelated files
Apr 3, 2024
5057830
fix(render.py): table render error
MaoSong2022 Apr 7, 2024
247b662
Merge branch 'dev' of https://github.com/MaoSong2022/vrdu_data_proces…
MaoSong2022 Apr 7, 2024
96b4424
fix(batch_process.py): file name pattern error
MaoSong2022 May 17, 2024
dddb66d
refactor(arxiv_download.py): download paper files for a given paper p…
MaoSong2022 May 17, 2024
6917d15
refactor(export_to_dataset.py): simplify code logic
MaoSong2022 May 17, 2024
56b98eb
refactor(batch_process.py): make the main_path required
MaoSong2022 May 17, 2024
39c9ec9
docs(main.py): add docstrings
MaoSong2022 May 17, 2024
5ab0d80
feat(preprocess.py): add fixme for image preprocess
MaoSong2022 May 22, 2024
4be0817
refactor(utils.py): remove xelatex compiling
MaoSong2022 May 22, 2024
4c94d9e
refactor(main.py): simplify comments
MaoSong2022 May 22, 2024
4f81dd8
refactor(main.py, preprocess.py): move mkdir to main.py for clarity
MaoSong2022 May 22, 2024
0502413
feat(scripts): remove unused scripts
MaoSong2022 Jun 12, 2024
0da6604
refactor(run_statistics.py): simplify code logic
MaoSong2022 Jun 12, 2024
931098c
refactor(export_to_dataset.py): simplify code logic
MaoSong2022 Jun 12, 2024
35b500e
feat(generate_reading_annotation.py): change the format of reading an…
MaoSong2022 Jun 12, 2024
3 changes: 3 additions & 0 deletions .gitignore
@@ -181,3 +181,6 @@ TexSoup/tests/

# jupyter notebook
*.ipynb

data/
data/discpline_info.csv
72 changes: 59 additions & 13 deletions batch_process.py
@@ -16,54 +16,83 @@
database = "data/processed_paper_database.csv"


def filter_tex_files(tex_files: List[str], main_path: str = None) -> List[str]:
    """extract all MAIN.tex files for processing, if main_path is not None, then
    only extract MAIN.tex files in the main_path (not recursive)
def filter_tex_files(tex_files: List[str], main_path: str) -> List[str]:
    """extract all MAIN.tex files for processing,
    only MAIN.tex files in the main_path (not recursive) are extracted

    Args:
        tex_files (List[str]): list of tex files
        main_path (str, optional): path to main directory. Defaults to None.
        main_path (str): path to main directory.

    Returns:
        List[str]: list of tex files that are compilable.
    """

    # TODO: move this to config
    redundant_tex_files = [
        "paper_colored.tex",
        "paper_white.tex",
        "paper_original.tex",
    ]

    result = []
    for tex_file in tex_files:
        if "paper_block_" in tex_file:
            continue

        if os.path.basename(tex_file) in redundant_tex_files:
            continue

        # ensure tex files inside a subfolder are not included
        # ex: cs.AI/1234.4567/figs/draw.tex will be excluded
        if main_path and os.path.dirname(os.path.dirname(tex_file)) != main_path:
            continue
        # prevent processing previously generated files

        # make sure the tex file is compilable (main document)
        try:
            with open(tex_file) as f:
                content = f.read()
            if "\\begin{document}" not in content:
                continue
            result.append(tex_file)
        except UnicodeDecodeError:
            log.debug(f"failed to read tex file: {tex_file}")
            log.debug(f"failed to read tex file: {tex_file} due to UnicodeDecodeError")
            continue

    log.info(f"Before filtering, Found {len(result)} tex files")
    # skip processed papers
    log.info(f"[VRDU] Before filtering, found {len(result)} tex files")
    if os.path.exists(database):
        df = pd.read_csv(database)
        processed_papers = set(df[df["status"] != "processing"]["path"])
        processed_papers = set(df["path"])
        result = [x for x in result if os.path.dirname(x) not in processed_papers]

    log.info(f"After filtering, Found {len(result)} tex files")
    log.info(f"[VRDU] After filtering, found {len(result)} tex files")
    return result
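
For context, a minimal usage sketch (not part of the diff) of how filter_tex_files is driven for one discipline directory, matching the call in process_one_discpline below. The concrete path is hypothetical; utils.extract_all_tex_files is the repo-local helper used in the surrounding code.

import os

import utils  # repo-local helper; assumed to expose extract_all_tex_files

discpline_path = "data/raw/cs.AI"  # hypothetical discipline directory
tex_files = utils.extract_all_tex_files(discpline_path)
main_files = filter_tex_files(tex_files, discpline_path)
for tex_file in main_files:
    # every surviving entry is a compilable main document directly under
    # <discpline_path>/<paper_id>/, e.g. cs.AI/1234.4567/MAIN.tex
    print(os.path.dirname(tex_file))
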


def process_one_discpline(path: str, cpu_count: int, discpline: str) -> None:
    """Process the data in a specific discipline.

    Args:
        path (str): The path to the raw data.
        cpu_count (int): The number of CPUs to use for multiprocessing.
        discpline (str): The discipline to process.

    Raises:
        Exception: If the processing fails.

    Returns:
        None
    """
    discpline_path = os.path.join(path, discpline)
    log.info(f"path to raw data: {discpline_path}")
    log.info(f"Using cpu counts: {cpu_count}")
    log.info(f"[VRDU] Path to raw data: {discpline_path}")
    log.info(f"[VRDU] Using cpu counts: {cpu_count}")
    tex_files = utils.extract_all_tex_files(discpline_path)
    tex_files = filter_tex_files(tex_files, discpline_path)
    log.info(f"Found {len(tex_files)} tex files")

    try:
        with multiprocessing.Pool(cpu_count) as pool:
            pool.map(process_one_file, tex_files)
        # save log file
    except Exception:
        log.exception(f"[VRDU] discpline: {discpline}, failed to process.")
    finally:
@@ -73,6 +102,23 @@ def process_one_discpline(path: str, cpu_count: int, discpline: str) -> None:


def main():
    """This function is the entry point of the application.

    Args:
        path (str): The path to the raw data.
        cpu_count (int): The number of CPUs to use for multiprocessing.
        discpline (str): The discipline to process.

    Raises:
        Exception: If the processing fails.

    Returns:
        None

    References:
        https://arxiv.org/category_taxonomy
    """
    # parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-p", "--path", type=str, required=True, help="path to raw data"
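
The diff view truncates the remainder of main(). For orientation, a hedged sketch of the pattern batch_process.py follows: parse CLI arguments, then fan the filtered tex files out over a multiprocessing pool. Every flag except --path, all defaults, and the stand-in worker are assumptions, not code from this PR.

import argparse
import multiprocessing
import os


def process_one_file(tex_file: str) -> None:
    # stand-in worker; the real process_one_file lives elsewhere in this repo
    print(f"processing {tex_file}")


def sketch_main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-p", "--path", type=str, required=True, help="path to raw data"
    )
    # assumed flags; the real argument list is cut off in the diff view
    parser.add_argument("-c", "--cpu_count", type=int, default=os.cpu_count())
    parser.add_argument("-d", "--discpline", type=str, default="cs.AI")
    args = parser.parse_args()

    tex_files = ["a/MAIN.tex", "b/MAIN.tex"]  # stand-in for the filtered list
    try:
        with multiprocessing.Pool(args.cpu_count) as pool:
            pool.map(process_one_file, tex_files)
    except Exception:
        print(f"discpline {args.discpline}: failed to process")


if __name__ == "__main__":
    sketch_main()

Since pool.map blocks and re-raises worker exceptions in the parent process, wrapping the with-block in try/except is enough to log a failed discipline, which matches the try/except/finally structure visible in process_one_discpline above.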