### 0. Import libraries and implement functions

In [1]:
from pylatexenc.latexwalker import LatexWalker, LatexEnvironmentNode, LatexCharsNode, LatexCommentNode,\
                                    LatexGroupNode, LatexMathNode, LatexMacroNode, LatexSpecialsNode
import sys, re, os, json

In [2]:
def find_tex_files(dir, max_depth):
    """Collect ``.tex`` files below ``dir``, descending at most ``max_depth`` levels.

    Args:
        dir: Directory to scan. Paths in the result are built as
            ``dir + "/" + name``; for ``"."`` the bare names are returned.
        max_depth: Number of directory levels to visit. ``0`` yields nothing,
            ``1`` only looks at ``dir`` itself.

    Returns:
        A list with the ``.tex`` files found directly in ``dir`` first,
        followed by the ones found in each sub-directory. An empty list is
        returned when ``dir`` is not a directory.
    """
    if max_depth == 0 or not os.path.isdir(dir):
        return []

    entries = os.listdir(dir)
    if dir != ".":
        entries = [dir + "/" + name for name in entries]

    tex_files = []
    subdirs = []
    for path in entries:
        if os.path.isdir(path):
            subdirs.append(path)
        elif os.path.isfile(path) and path.endswith(".tex"):
            tex_files.append(path)

    # Files of the current level come first, nested results afterwards.
    for subdir in subdirs:
        tex_files.extend(find_tex_files(subdir, max_depth - 1))
    return tex_files

In [3]:
def find_begin_document(files):
    r"""Return the first file whose content contains ``\begin{document}``.

    Args:
        files: Iterable of file paths, checked in order.

    Returns:
        The path of the first matching file, or ``None`` when no file matches.
    """
    for file in files:
        # The marker being searched is plain ASCII, so undecodable bytes are
        # replaced instead of aborting the scan with UnicodeDecodeError.
        with open(file, "r", encoding="utf-8", errors="replace") as f:
            if r"\begin{document}" in f.read():
                return file
    return None

In [4]:
def get_latex_nodes(fp):
    r"""Read a LaTeX file, simplify its source and parse it with pylatexenc.

    Before parsing, the text is rewritten as follows:

    * single-line ``\newcommand{\name}{body}`` definitions are removed and every
      use of ``\name`` is replaced by ``body`` (plain textual replacement; ``#1``
      style placeholders of parameterised macros are not substituted);
    * single-line ``\bibitem{key} ...`` entries are removed;
    * a fixed set of layout-only commands and ``\hspace``/``\vspace`` are removed.

    Args:
        fp: Path of the ``.tex`` file to read.

    Returns:
        The node list produced by ``LatexWalker(text).get_latex_nodes()``.
    """
    # Undecodable bytes are replaced so one stray byte does not abort parsing.
    with open(fp, "r", encoding="utf-8", errors="replace") as f:
        text = f.read()

    newcommand_pattern = r"^\\newcommand\{([^}]+)\}(?:\[[^]]+\])?\{(.+)\}$"
    newcommands = re.findall(newcommand_pattern, text, flags=re.M)

    # Remove the definitions *before* expanding the shorthands, otherwise the
    # expansion rewrites the definition lines themselves. The pattern is
    # anchored with ^/$, so it needs re.M to match individual lines.
    text = re.sub(newcommand_pattern, "", text, flags=re.M)

    for (short_hand, cmd) in newcommands:
        # The macro name is literal text, not a regex.
        macro_pattern = re.escape(short_hand)
        if short_hand[-1:].isalpha():
            # Do not touch longer macros that merely start with this name.
            macro_pattern += r"(?![A-Za-z])"
        # A callable replacement inserts the body verbatim (no escape handling).
        text = re.sub(macro_pattern, lambda _match, expansion=cmd: expansion, text)

    bibitem_pattern = r"^\\bibitem{([^}]+)}\s+(.+)$"
    text = re.sub(bibitem_pattern, "", text, flags=re.M)

    # The look-ahead keeps e.g. \hrulefill from being clipped to "fill".
    cmds_to_remove = r"\\(centering|newpage|clearpage|tableofcontents|maketitle|hrule|vfill)(?![A-Za-z])"
    text = re.sub(cmds_to_remove, "", text)
    text = re.sub(r"\\[hv]space\*?\{.*?\}", "", text)

    w = LatexWalker(text)
    nodes, _, _ = w.get_latex_nodes()
    return nodes

In [5]:
# A sentence boundary is a period followed by whitespace and an upper-case
# letter, unless the period follows a single capital letter (an initial) or
# "et al".
SENTENCE_PATTERN = re.compile(
    r"(?<!\b[A-Z])(?<![Ee][Tt] [Aa][Ll])\.\s+(?=[A-Z])"
)


def split_sentences(text, level):
    """Split ``text`` at sentence boundaries and tag each piece with ``level``.

    Args:
        text: Text to split.
        level: Value stored as the second member of every returned tuple.

    Returns:
        A list of ``(sentence, level)`` tuples. Each sentence is stripped of
        surrounding whitespace; the period that ended a sentence at a split
        point is consumed by the split.
    """
    tagged = []
    for piece in SENTENCE_PATTERN.split(text):
        tagged.append((piece.strip(), level))
    return tagged

In [6]:
# Depth assigned to each kind of token; a larger number nests deeper.
# "abstract" shares the depth of "section", and "leaf" is the deepest level.
LEVELS = dict(
    document=0,
    abstract=1,
    section=1,
    subsection=2,
    subsubsection=3,
    paragraph=4,
    subparagraph=5,
    itemize=6,
    item=7,
    leaf=8,
)

In [7]:
def hierarchy_nodes(nodes, working_dir, append_trailing=False):
    r"""Turn a pylatexenc node list into a flat list of ``(text, level)`` tokens.

    Prose (character, math, group, specials and unhandled macro nodes) is
    buffered and, whenever a structural element is met, emitted as leaf
    sentences via ``split_sentences``. Structural elements produce tokens whose
    level comes from ``LEVELS``:

    * float/equation-like environments become one verbatim leaf token;
    * ``itemize`` becomes an ``"itemize"`` token followed by ``"item"`` tokens,
      each followed by the sentences of that item;
    * ``document`` and ``abstract`` become a token of that name followed by
      their content;
    * any other environment is processed recursively without its own token;
    * ``\input`` / ``\include`` files found under ``working_dir`` are parsed
      with ``get_latex_nodes`` and processed recursively;
    * sectioning macros become a token holding the heading title;
    * ``\cite``, ``\citep`` and ``\citet`` are dropped.

    Args:
        nodes: Node list as returned by ``get_latex_nodes``.
        working_dir: Directory against which ``\input`` paths are resolved.
        append_trailing: When true, prose still buffered after the last node
            is emitted as leaf sentences; otherwise it is discarded.

    Returns:
        List of ``(text, level)`` tuples in document order.
    """
    text = ""
    tokens = []

    def flush_text():
        # Emit the buffered prose as leaf sentences and clear the buffer, so
        # it is neither repeated later nor placed after the content of the
        # structural element that follows it.
        nonlocal text
        if text != "":
            tokens.extend(split_sentences(text, LEVELS["leaf"]))
        text = ""

    for node in nodes:
        if node is None or isinstance(node, LatexCommentNode):
            continue

        if isinstance(node, LatexCharsNode):
            text += node.chars
        elif isinstance(node, LatexMathNode):
            text += node.latex_verbatim()
        elif isinstance(node, LatexGroupNode):
            text += node.latex_verbatim()
        elif isinstance(node, LatexEnvironmentNode):
            env_name = node.environmentname.lower()
            # Prose preceding an environment always belongs before it.
            flush_text()
            if env_name in ["figure", "figure*", "equation", "equation*", "align", "align*",
                            "table", "table*", "remark", "remark*"]:
                # Kept verbatim as a single leaf.
                tokens.append((node.latex_verbatim(), LEVELS["leaf"]))
            elif env_name == "itemize":
                tokens.append((env_name, LEVELS[env_name]))
                latex = node.latex_verbatim()
                pattern = r"\\(begin|end)\{itemize\}(\[[^]]+\])?"
                latex = re.sub(pattern, "", latex, flags=re.IGNORECASE).strip()

                items = re.split(r"\\item", latex, flags=re.IGNORECASE)
                for item in items:
                    item = item.strip()
                    if len(item) == 0:
                        continue

                    tokens.append(("item", LEVELS["item"]))
                    tokens.extend(split_sentences(item, LEVELS["leaf"]))

            elif env_name == "document":
                tokens.append((env_name, LEVELS[env_name]))
                sub_tokens = hierarchy_nodes(node.nodelist, working_dir, True)
                tokens.extend(sub_tokens)
            elif env_name == "abstract":
                tokens.append((env_name, LEVELS[env_name]))
                latex = node.latex_verbatim()
                latex = re.sub(r"\\(begin|end)\{abstract\}", "", latex, flags=re.IGNORECASE).strip()
                tokens.extend(split_sentences(latex, LEVELS["leaf"]))
            else:
                # Unknown environment: no token of its own, only its content.
                sub_tokens = hierarchy_nodes(node.nodelist, working_dir, True)
                tokens.extend(sub_tokens)
        elif isinstance(node, LatexMacroNode):
            if node.macroname in ["input", "include"]:
                if node.nodeargd and node.nodeargd.argnlist:
                    arg_node = node.nodeargd.argnlist[0]
                    fname = ""
                    if hasattr(arg_node, 'nodelist') and arg_node.nodelist:
                        if isinstance(arg_node.nodelist[0], LatexCharsNode):
                            fname = arg_node.nodelist[0].chars
                    elif isinstance(arg_node, LatexCharsNode):
                        fname = arg_node.chars

                    if fname:
                        if not fname.endswith('.tex'):
                            fname += '.tex'

                        full_path = os.path.join(working_dir, fname)
                        if os.path.exists(full_path):
                            print(f"    -> Parsing input: {fname}")
                            # Keep the prose written before the \input ahead
                            # of the included content.
                            flush_text()
                            try:
                                included_nodes = get_latex_nodes(full_path)
                                tokens.extend(hierarchy_nodes(included_nodes, working_dir, True))
                            except Exception as e:
                                print(f"    [Error] Failed to parse input {fname}: {e}")
            elif node.macroname in ["section", "subsection", "subsubsection", "paragraph", "subparagraph"]:
                flush_text()
                latex = node.latex_verbatim().strip()
                # Strip "\name", an optional "*", an optional "[short title]"
                # and the opening brace; then the closing brace.
                heading_prefix = r"^\\" + node.macroname + r"\s*\*?\s*(?:\[[^\]]*\])?\s*\{"
                title = re.sub(heading_prefix, "", latex, count=1, flags=re.IGNORECASE)
                if title.endswith("}"):
                    title = title[:-1]
                tokens.append((title.strip(), LEVELS[node.macroname]))
            elif node.macroname in ["label", "footnote"]:
                # Remove the macro together with its first braced argument;
                # whatever is left of the verbatim text stays in the prose.
                latex = node.latex_verbatim()
                pattern = fr"\\{node.macroname}" + "{[^}]+}"
                latex = re.sub(pattern, "", latex)
                text += latex
            elif node.macroname in ["cite", "citep", "citet"]:
                pass
            else:
                text += node.latex_verbatim()
        elif isinstance(node, LatexSpecialsNode):
            text += node.specials_chars

    if append_trailing and text != "":
        tokens.extend(split_sentences(text, LEVELS["leaf"]))

    return tokens

In [8]:
def add_node(root, stack, elements):
    """Insert the element addressed by ``stack`` into the nested dict ``root``.

    Args:
        root: Nested dict keyed by element text; modified in place.
        stack: Indices into ``elements`` describing the path from the top of
            the tree down to the element to insert (last index).
        elements: Sequence whose items hold the key text at position 0.

    The new entry is created as an empty dict, replacing any entry already
    stored under the same key. Nothing is inserted when an ancestor on the
    path is missing from the tree.
    """
    cursor = root
    for ancestor_idx in stack[:-1]:
        label = elements[ancestor_idx][0]
        if label not in cursor:
            return
        cursor = cursor[label]
    cursor[elements[stack[-1]][0]] = {}

In [9]:
def find_version_elements(version_directory):
    r"""Parse one version directory into its list of ``(text, level)`` elements.

    ``main.tex`` inside ``version_directory`` is used when it exists; otherwise
    the first ``.tex`` file directly in the directory that contains
    ``\begin{document}`` is used.

    Args:
        version_directory: Directory holding the LaTeX sources of one version.

    Returns:
        The non-empty tokens from ``hierarchy_nodes`` starting at the
        ``("document", 0)`` token (an empty list when that token is absent),
        or ``None`` when no main file could be located.
    """
    main_tex_fp = os.path.join(version_directory, "main.tex")
    if not os.path.exists(main_tex_fp):
        candidates = find_tex_files(version_directory, 1)
        main_tex_fp = find_begin_document(candidates)
        if main_tex_fp is None:
            print("Not found main tex file")
            return None

    parsed = get_latex_nodes(main_tex_fp)
    tokens = [tok for tok in hierarchy_nodes(parsed, version_directory) if len(tok[0]) > 0]

    # Everything ahead of the document token is discarded.
    start = next(
        (pos for pos, tok in enumerate(tokens) if tok == ("document", 0)),
        len(tokens),
    )
    return tokens[start:]

def hierarchy_elements(elements):
    """Build a nested dict from a flat list of ``(text, level)`` elements.

    A stack holds the indices of the currently open ancestors, with strictly
    increasing levels. For each element, every open entry whose level is not
    smaller than the element's level is closed, the element is pushed, and it
    is inserted into the tree under the remaining ancestors with ``add_node``.

    Args:
        elements: Sequence of ``(text, level)`` tuples in document order.

    Returns:
        Nested dict keyed by element text.
    """
    tree = {}
    open_path = []

    for position, element in enumerate(elements):
        depth = element[1]
        # Close siblings and deeper entries; what remains are true ancestors.
        while open_path and depth <= elements[open_path[-1]][1]:
            open_path.pop()
        open_path.append(position)
        add_node(tree, open_path, elements)

    return tree

### 1. Testing

In [10]:
def replace_with_id(root, new_root, prefix, ids):
    """Copy the tree ``root`` into ``new_root`` with every key replaced by an ID.

    Args:
        root: Nested dict keyed by element text.
        new_root: Dict that receives the converted tree; modified in place.
        prefix: String put in front of every numeric ID.
        ids: Mapping from element text to its numeric ID.

    Each key ``k`` becomes ``prefix + str(ids[k])``. Empty children are stored
    as they are; non-empty children are converted recursively.
    """
    for label, children in root.items():
        ident = prefix + str(ids[label])
        if children == {}:
            new_root[ident] = children
            continue
        converted = {}
        new_root[ident] = converted
        replace_with_id(children, converted, prefix, ids)

def flatten_tree(root):
    """Flatten a nested dict into a ``child -> parent`` mapping.

    Args:
        root: Nested dict; the keys of the outermost dict have no parent and
            therefore do not appear as keys of the result.

    Returns:
        Dict mapping every non-top-level key to the key of the dict that
        contains it, filled in depth-first order.
    """
    parents = {}
    if not isinstance(root, dict):
        return parents

    # Each frame pairs the owning key with an iterator over its children.
    pending = [(None, iter(root.items()))]
    while pending:
        owner, children = pending[-1]
        entry = next(children, None)
        if entry is None:
            pending.pop()
            continue

        name, subtree = entry
        if owner is not None:
            parents[name] = owner
        if isinstance(subtree, dict):
            pending.append((name, iter(subtree.items())))

    return parents

In [11]:
# Build one hierarchy.json per paper: every version directory of the paper is
# parsed, all distinct element texts get a paper-wide ID, and each version's
# tree is stored as a child-ID -> parent-ID mapping.
raw_prefix_dir = os.path.join("..", "..", "..", "23127247_milestone1")
parsed_prefix_dir = os.path.join(".")

# Number of papers handled by this test run.
MAX_PAPERS = 5

paper_ids = [d for d in os.listdir(raw_prefix_dir) if os.path.isdir(os.path.join(raw_prefix_dir, d))]
paper_ids.sort()

for paper_id in paper_ids[:MAX_PAPERS]:
    print(f"Processing {paper_id}...")
    version = 1

    all_version_elements = []
    parsed_versions = []  # version number of each entry in all_version_elements
    paper_elements = set()
    while True:
        version_dir = os.path.join(raw_prefix_dir, paper_id, "tex", paper_id + "v" + str(version))
        print(version_dir)
        if not os.path.exists(version_dir):
            break
        version_elements = find_version_elements(version_dir)
        if version_elements is None:
            # No main tex file in this version: skip it instead of crashing.
            print(f"    [Warning] Skipping version {version}: no main tex file")
        else:
            all_version_elements.append(version_elements)
            parsed_versions.append(version)
            paper_elements |= {ele[0] for ele in version_elements}
        version += 1

    # Sorted so that element IDs are identical on every run; iterating a set
    # of strings directly gives a different order per interpreter session.
    paper_elements = sorted(paper_elements)
    paper_elements_dict = {text: idx for idx, text in enumerate(paper_elements)}

    hierarchy = {}
    for version_number, version_elements in zip(parsed_versions, all_version_elements):
        root = hierarchy_elements(version_elements)
        new_root = {}
        replace_with_id(root, new_root, paper_id + "_", paper_elements_dict)
        new_root = flatten_tree(new_root)
        # Keyed by the real version number, also when a version was skipped.
        hierarchy[str(version_number)] = new_root

    elements = {paper_id + "_" + str(value): key for key, value in paper_elements_dict.items()}

    final_form = {
        "elements": elements,
        "hierarchy": hierarchy,
    }

    print("Saving data...")

    target_dir = os.path.join(parsed_prefix_dir, paper_id)
    os.makedirs(target_dir, exist_ok=True)
    with open(os.path.join(target_dir, "hierarchy.json"), "w", encoding="utf-8") as f:
        json.dump(final_form, f, ensure_ascii=False, indent=4)

Processing 2210.16298...
../../../23127247_milestone1/2210.16298/tex/2210.16298v1
    -> Parsing input: sections/intro.tex
    -> Parsing input: sections/methodology.tex
    -> Parsing input: sections/exp.tex
    -> Parsing input: sections/related_work.tex
    -> Parsing input: sections/discussion.tex
    -> Parsing input: sections/app.tex
../../../23127247_milestone1/2210.16298/tex/2210.16298v2
Saving data...
Processing 2210.16299...
../../../23127247_milestone1/2210.16299/tex/2210.16299v1
../../../23127247_milestone1/2210.16299/tex/2210.16299v2
../../../23127247_milestone1/2210.16299/tex/2210.16299v3
../../../23127247_milestone1/2210.16299/tex/2210.16299v4
../../../23127247_milestone1/2210.16299/tex/2210.16299v5
Saving data...
Processing 2210.16300...
../../../23127247_milestone1/2210.16300/tex/2210.16300v1
../../../23127247_milestone1/2210.16300/tex/2210.16300v2
../../../23127247_milestone1/2210.16300/tex/2210.16300v3
Saving data...
Processing 2210.16301...
../../../23127247_milesto