In [None]:
# # Notebook 1: Hierarchical Parsing & Standardization
# 
# **Objectives:**
# 1. Parse raw LaTeX into a structured JSON hierarchy.
# 2. Normalize text and math formulas.
# 3. Extract and deduplicate references (`refs.bib`), filtering only those cited in the text.
# 4. Generate unique IDs (Hashes) for content to handle deduplication across versions.

import os, re, json, hashlib, shutil, sys
from pylatexenc.latexwalker import LatexWalker, LatexEnvironmentNode, LatexCharsNode, \
                                    LatexGroupNode, LatexMathNode, LatexMacroNode, LatexCommentNode, LatexSpecialsNode
from concurrent.futures import ThreadPoolExecutor, as_completed

RAW_DATA_PATH = os.path.join("..", "data",'raw', 'auto') 
OUTPUT_PATH = os.path.join("..", 'data', 'processed', 'parsed', 'auto')

# RAW_DATA_PATH = os.path.join("..", "data",'testing', 'input') 
# OUTPUT_PATH = os.path.join("..", 'data', 'testing', 'output')

LEVELS = {
    "root": 0,
    "abstract": 1,
    "section": 2,
    "subsection": 3,
    "subsubsection": 4,
    "paragraph": 5,
    "subparagraph": 6,
    "itemize": 7,   
    "enumerate": 7,
    "item": 8,
    "leaf": 9       # Sentences, Figures, Tables, Formulas
}

print(f"Input: {os.path.abspath(RAW_DATA_PATH)}")
print(f"Output: {os.path.abspath(OUTPUT_PATH)}")

Input: /Users/thomas200905/Documents/Thomas/HCMUS/Third Year/Semester 7/Intro to Data Science/Milestones/MS02/data/testing/input
Output: /Users/thomas200905/Documents/Thomas/HCMUS/Third Year/Semester 7/Intro to Data Science/Milestones/MS02/data/testing/output


In [16]:
def clean_latex_string(text):
    if not text:
        return ""
    text = re.sub(r'\\bibinfo\{.*?\}\{(.*?)\}', r'\1', text)
    text = re.sub(r'\\[a-zA-Z]+\{(.*?)\}', r'\1', text)
    text = text.replace('\n', ' ').strip()
    text = re.sub(r'\s+', ' ', text)
    return text

def clean_body_text(text):
    if not text:
        return ""
    cmds_to_remove = r'\\(centering|newpage|clearpage|tableofcontents|maketitle|hrule|vfill)'
    text = re.sub(cmds_to_remove, '', text)
    text = re.sub(r'\\[hv]space\{.*?\}', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def normalize_math(latex_code, is_inline=True):
    clean = latex_code.strip()
    clean = re.sub(r'^\\\(|\\\)$', '', clean)
    clean = re.sub(r'^\\\[|\\\]$', '', clean)
    clean = re.sub(r'^(\$)|(\$)$', '', clean)
    
    if is_inline:
        return f"${clean}$"
    else:
        if clean.startswith(r'\begin'):
            return clean
        return f"\\begin{{equation}}{clean}\\end{{equation}}"

def generate_id(content, paper_id):
    if not content: return None
    hash_obj = hashlib.md5(content.encode('utf-8'))
    return f"{paper_id}_{hash_obj.hexdigest()}"

# Avoids splitting on "et al.", "Fig.", "Eq."
SENTENCE_PATTERN = re.compile(r"(?<!\b[A-Z])(?<![Ee][Tt] [Aa][Ll])(?<![Ff][Ii][Gg])(?<![Ee][Qq])\.\s+(?=[A-Z])")

def split_sentences(text):
    if not text: return []
    sentences = SENTENCE_PATTERN.split(text)
    return [s.strip() for s in sentences if s.strip()]

In [17]:
def get_field(field_name, entry):
    pat_braces = re.compile(fr'\b{field_name}\s*=\s*\{{((?:[^{{}}]|\{{[^{{}}]*\}})*)\}}', re.IGNORECASE | re.DOTALL)
    match = pat_braces.search(entry)
    if match: return match.group(1)
    pat_quotes = re.compile(fr'\b{field_name}\s*=\s*\"(.*?)\"', re.IGNORECASE | re.DOTALL)
    match = pat_quotes.search(entry)
    if match: return match.group(1)
    return None

ADDITIONAL_FIELDS = [
    'journal', 'booktitle', 'volume', 'number', 'pages', 
    'publisher', 'organization', 'school', 'institution', 
    'doi', 'url', 'issn', 'isbn', 'month', 'editor', 'series'
]

def extract_from_bib(file_content):
    references = []
    raw_entries = re.split(r'^@', file_content, flags=re.MULTILINE)
    
    for entry in raw_entries:
        entry = entry.strip()
        if not entry or entry.startswith('%'): continue
            
        key_match = re.search(r'^(\w+)\s*\{\s*([^,]+),', entry)
        if not key_match: continue
            
        ref_type = key_match.group(1)
        ref_id = key_match.group(2).strip()

        if ref_type.lower() in ['string', 'comment', 'preamble']: continue
        ref_data = {
            "id": ref_id,
            "type": ref_type,
            "title": "",
            "authors": [],
            "year": "",
            "source_type": "bib"
        }
        raw_authors = get_field('author', entry)
        if raw_authors:
            cleaned_auth = clean_latex_string(raw_authors)
            ref_data["authors"] = [a.strip() for a in cleaned_auth.split(' and ')]
            
        ref_data["title"] = clean_latex_string(get_field('title', entry))
        ref_data["year"] = get_field('year', entry) or ""
        
        for field in ADDITIONAL_FIELDS:
            val = get_field(field, entry)
            if val:
                ref_data[field] = clean_latex_string(val)

        references.append(ref_data)
    return references

def extract_from_bibitem(file_content):
    bib_section = re.search(r'\\begin\{thebibliography\}.*?\n(.*?)\\end\{thebibliography\}', file_content, re.DOTALL)
    if not bib_section:
        bib_section = re.search(r'\\begin\{references\}.*?\n(.*?)\\end\{references\}', file_content, re.DOTALL)
    
    content_to_parse = bib_section.group(1) if bib_section else file_content
    raw_items = re.split(r'\\bibitem', content_to_parse)
    
    references = []
    pat_bibinfo = re.compile(r'\\bibinfo\{(.*?)\}\{((?:[^{}]|\{[^{}]*\})*)\}', re.DOTALL | re.IGNORECASE)
    pat_italic = re.compile(r'(?:\{\\em\s+|\\emph\{|\\textit\{)((?:[^{}]|\{[^{}]*\})*)\}', re.IGNORECASE | re.DOTALL)
    pat_quotes = re.compile(r'``(.*?)''|"(.*?)"', re.DOTALL)
    pat_year = re.compile(r'\((\d{4})\)')

    for item in raw_items:
        if not item.strip():
            continue
        
        id_match = re.search(r'^\{([^}]+)\}', item.strip())
        if not id_match: continue
        ref_id = id_match.group(1).strip()
        id_end_idx = item.find('}') + 1
        
        ref_data = {
            "id": ref_id, "title": "", "authors": [], "year": "", 
            "journal": "", "source_type": "bibitem"
        }
        
        if r'\bibinfo' in item:
            for key, value in pat_bibinfo.findall(item):
                key = key.lower()
                val = clean_latex_string(value)
                if key == 'author': ref_data['authors'].append(val)
                else: ref_data[key] = val
            
        else:
            y_match = pat_year.search(item)
            if y_match: ref_data['year'] = y_match.group(1)
            quote_match = pat_quotes.search(item)
            italic_match = pat_italic.search(item)
            if quote_match and (not italic_match or quote_match.start() < italic_match.start()):
                raw_title = quote_match.group(1) or quote_match.group(2)
                ref_data['title'] = clean_latex_string(raw_title)
                
                title_start = quote_match.start()
                if title_start > id_end_idx:
                    raw_auth = item[id_end_idx:title_start]
                    ref_data['authors'] = [clean_latex_string(raw_auth).rstrip(',').strip()]
                    
                if italic_match:
                    ref_data['journal'] = clean_latex_string(italic_match.group(1))

            elif italic_match:
                italic_text = clean_latex_string(italic_match.group(1))
                
                italic_start = italic_match.start()
                pre_text = item[id_end_idx:italic_start].strip()
                if pre_text.endswith(',') or pre_text.endswith('.'):
                    pre_text = pre_text[:-1]
                
                last_comma = pre_text.rfind(',')
                
                if last_comma != -1:
                    potential_author = pre_text[:last_comma]
                    potential_title = pre_text[last_comma+1:]
                    
                    # too short, the comma was part of the name
                    if len(potential_title.strip()) < 5:
                        # case: Book. Italic = Title.
                        ref_data['title'] = italic_text
                        ref_data['authors'] = [clean_latex_string(pre_text)]
                    else:
                        # case: Article. Plain = Title, Italic = Journal.
                        ref_data['title'] = clean_latex_string(potential_title)
                        ref_data['authors'] = [clean_latex_string(potential_author)]
                        ref_data['journal'] = italic_text
                else:
                    # case: "Author \emph{BookTitle}"
                    ref_data['title'] = italic_text
                    ref_data['authors'] = [clean_latex_string(pre_text)]
            else:
                ref_data['title'] = clean_latex_string(item[id_end_idx:])

        references.append(ref_data)
    return references

def get_ref_fingerprint(ref):
    title = ref.get('title', '')
    if not title: return None 
    
    clean_title = re.sub(r'[^a-z0-9]', '', title.lower())
    
    year = ref.get('year', '')
    clean_year = re.sub(r'[^0-9]', '', str(year))
    
    return f"{clean_title}_{clean_year}"

def unionize_refs(target, source):
    for key, value in source.items():
        if key not in target or not target[key]:
            target[key] = value
    return target

def process_batch_get_paper_refs(extracted_list, content_map, key_map):
    for ref in extracted_list:
        ref_id = ref['id']
        fp = get_ref_fingerprint(ref)
        
        if not fp: 
            fp = f"ID_{ref_id}"

        if fp in content_map:
            master_ref = content_map[fp]
            unionize_refs(master_ref, ref)
            key_map[ref_id] = master_ref['id']
        else:
            content_map[fp] = ref
            key_map[ref_id] = ref_id

def get_paper_references(paper_id, root_path):
    paper_path = os.path.join(root_path, paper_id)
    content_map = {}
    key_map = {} 
    
    for root, _, files in os.walk(paper_path):
        for file in files:
            if file.endswith(".bib"):
                try:
                    with open(os.path.join(root, file), 'r', encoding='utf-8', errors='ignore') as f:
                        refs = extract_from_bib(f.read())
                        process_batch_get_paper_refs(refs, content_map, key_map)
                except:
                    pass
    
    for root, _, files in os.walk(paper_path):
        for file in files:
            if file.endswith(".tex"):
                try:
                    with open(os.path.join(root, file), 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()
                        if "bibitem" in content:
                            refs = extract_from_bibitem(content)
                            process_batch_get_paper_refs(refs, content_map, key_map)
                except:
                    pass
                
    return list(content_map.values()), key_map

In [18]:
def parse_latex_nodes(nodes, base_dir, current_level_name="leaf"):
    elements = []
    citations = set()
    
    for node in nodes:
        if node is None or isinstance(node, LatexCommentNode):
            continue

        if isinstance(node, LatexCharsNode):
            text = clean_body_text(node.chars)
            if text:
                for s in split_sentences(text):
                    elements.append({
                        'content': s,
                        'level': LEVELS['leaf'],
                        'type': 'sentence'
                    })

        elif isinstance(node, LatexMathNode):
            is_inline = (node.displaytype == 'inline')
            norm_math = normalize_math(node.latex_verbatim(), is_inline)
            elements.append({
                'content': norm_math,
                'level': LEVELS['leaf'],
                'type': 'math'
            })

        elif isinstance(node, LatexEnvironmentNode):
            env = node.environmentname.lower()
            if env in ['thebibliography', 'references']:
                continue
            if env in ['figure', 'table', 'figure*', 'table*']:
                elements.append({
                    'content': node.latex_verbatim(), 
                    'level': LEVELS['leaf'],
                    'type': 'figure'
                })
            elif env in ['itemize', 'enumerate']:
                elements.append({
                    'content': f"List ({env})", 
                    'level': LEVELS['itemize'],
                    'type': env
                })
                sub_elems, sub_cites = parse_latex_nodes(node.nodelist, base_dir, 'item')
                elements.extend(sub_elems)
                citations.update(sub_cites)
            elif env in ['equation', 'align', 'equation*', 'align*']:
                norm_math = normalize_math(node.latex_verbatim(), is_inline=False)
                elements.append({
                    'content': norm_math,
                    'level': LEVELS['leaf'],
                    'type': 'math_block'
                })
                
            else:
                if env == 'abstract':
                    elements.append({'content': 'Abstract', 'level': LEVELS['abstract'], 'type': 'section'})
                
                sub_elems, sub_cites = parse_latex_nodes(node.nodelist, base_dir, env)
                elements.extend(sub_elems)
                citations.update(sub_cites)

        elif isinstance(node, LatexMacroNode):
            name = node.macroname
            
            if name == 'bibliography':
                continue

            if name in ['section', 'subsection', 'subsubsection', 'paragraph', 'subparagraph']:
                if node.nodeargd and node.nodeargd.argnlist:
                    title = ""
                    for arg in node.nodeargd.argnlist:
                        if not arg:
                            continue
                        
                        if hasattr(arg, 'nodelist'):
                            title += "".join([n.chars for n in arg.nodelist if isinstance(n, LatexCharsNode)])
                        
                        elif isinstance(arg, LatexCharsNode):
                            title += arg.chars
                    
                    elements.append({
                        'content': clean_body_text(title),
                        'level': LEVELS.get(name, LEVELS['section']),
                        'type': name
                    })

            elif name == 'item':
                elements.append({
                    'content': "Item", 
                    'level': LEVELS['item'],
                    'type': 'item'
                })
            elif name in ['cite', 'citep', 'citet', 'citeauthor']:
                if node.nodeargd and node.nodeargd.argnlist:
                    for arg in node.nodeargd.argnlist:
                        if not arg:
                            continue

                        if hasattr(arg, 'nodelist'):
                            for n in arg.nodelist:
                                if isinstance(n, LatexCharsNode):
                                    keys = n.chars.split(',')
                                    for k in keys: citations.add(k.strip())
                        elif isinstance(arg, LatexCharsNode):
                            keys = arg.chars.split(',')
                            for k in keys: citations.add(k.strip())

                elements.append({
                    'content': node.latex_verbatim(), 
                    'level': LEVELS['leaf'],
                    'type': 'citation'
                })

            elif name in ['input', 'include']:
                if node.nodeargd and node.nodeargd.argnlist:
                    arg_node = node.nodeargd.argnlist[0]
                    fname = ""
                    if hasattr(arg_node, 'nodelist') and arg_node.nodelist:
                         if isinstance(arg_node.nodelist[0], LatexCharsNode):
                             fname = arg_node.nodelist[0].chars
                    elif isinstance(arg_node, LatexCharsNode):
                        fname = arg_node.chars
                        
                    if fname:
                        if not fname.endswith('.tex'):
                            fname += '.tex'
                        
                        full_path = os.path.join(base_dir, fname)
                        if os.path.exists(full_path):
                            print(f"    -> Parsing input: {fname}")
                            try:
                                with open(full_path, 'r', encoding='latin-1') as f:
                                    sub_nodes, _, _ = LatexWalker(f.read()).get_latex_nodes()
                                    sub_elems, sub_cites = parse_latex_nodes(sub_nodes, base_dir, current_level_name)
                                    elements.extend(sub_elems)
                                    citations.update(sub_cites)
                            except Exception as e:
                                print(f"    [Error] Failed to parse input {fname}: {e}")

    return elements, citations

In [19]:
def build_hierarchy_tree(flat_elements, paper_id, global_elements_store):
    hierarchy = {}
    
    root_content = f"Document Root {paper_id}"
    root_id = generate_id(root_content, paper_id)
    global_elements_store[root_id] = root_content
    
    stack = [(0, root_id)]
    
    for item in flat_elements:
        content = item['content']
        level = item['level']
        if not content:
            continue
        
        node_id = generate_id(content, paper_id)
        global_elements_store[node_id] = content

        while stack and stack[-1][0] >= level:
            stack.pop()
            
        if stack:
            parent_id = stack[-1][1]
            hierarchy[node_id] = parent_id
        
        if level < LEVELS['leaf']:
            stack.append((level, node_id))
            
    return hierarchy

def write_refs_bib(references, output_path):
    internal_keys = {'id', 'type', 'source_type', 'authors'}
    
    with open(output_path, 'w', encoding='utf-8') as f:
        for ref in references:
            entry_type = ref.get('type', 'misc').lower()
            key = ref.get('id', 'unknown')
            
            lines = [f"@{entry_type}{{{key},"]
            
            if ref.get('authors'):
                auth_str = " and ".join(ref['authors'])
                lines.append(f"  author = {{{auth_str}}},")
            
            for k, v in ref.items():
                if k not in internal_keys and v:
                    lines.append(f"  {k} = {{{v}}},")
                
            lines.append("}\n")
            f.write("\n".join(lines))

In [None]:
def process_paper(paper_id):
    source_dir = os.path.join(RAW_DATA_PATH, paper_id)
    target_dir = os.path.join(OUTPUT_PATH, paper_id)
    os.makedirs(target_dir, exist_ok=True)

    for fname in ['metadata.json', 'references.json']:
        src = os.path.join(source_dir, fname)
        if os.path.exists(src):
            shutil.copy(src, os.path.join(target_dir, fname))
            
    all_raw_refs, key_map = get_paper_references(paper_id, RAW_DATA_PATH)
    ref_lookup = {r['id']: r for r in all_raw_refs}
    
    tex_root = os.path.join(source_dir, 'tex')
    if not os.path.exists(tex_root):
        return

    global_elements = {}
    hierarchy_output = {}
    all_cited_keys = set()
    
    for v_folder in sorted(os.listdir(tex_root)):
        full_v_path = os.path.join(tex_root, v_folder)
        if not os.path.isdir(full_v_path):
            continue

        main_file = None
        for f in os.listdir(full_v_path):
            if f.endswith('.tex'):
                try:
                    with open(os.path.join(full_v_path, f), 'r', encoding='latin-1') as tf:
                        if r'\documentclass' in tf.read():
                            main_file = f
                            break
                except:
                    pass
        
        if not main_file:
            continue
        
        with open(os.path.join(full_v_path, main_file), 'r', encoding='latin-1') as f:
            w = LatexWalker(f.read())
            nodes, _, _ = w.get_latex_nodes()
            
        elements, citations = parse_latex_nodes(nodes, full_v_path)
        
        normalized_citations = set()
        for c in citations:
            master_key = key_map.get(c, c)
            normalized_citations.add(master_key)
            
        all_cited_keys.update(normalized_citations)

        version_key = v_folder.split('v')[-1]
        hierarchy_output[version_key] = build_hierarchy_tree(elements, paper_id, global_elements)

    final_json = {"elements": global_elements, "hierarchy": hierarchy_output}
    with open(os.path.join(target_dir, 'hierarchy.json'), 'w', encoding='utf-8') as f:
        json.dump(final_json, f, indent=4, ensure_ascii=False)
        
    final_refs = []
    for k in all_cited_keys:
        if k in ref_lookup:
            final_refs.append(ref_lookup[k])
    
    if not final_refs and all_raw_refs:
        final_refs = all_raw_refs
        
    write_refs_bib(final_refs, os.path.join(target_dir, 'refs.bib'))
    
    print(f"[{paper_id}] Done. Elements: {len(global_elements)}, Refs: {len(final_refs)}")

paper_ids = [d for d in os.listdir(RAW_DATA_PATH) if os.path.isdir(os.path.join(RAW_DATA_PATH, d))]
paper_ids.sort()

MAX_WORKERS = 30 

print(f"Starting parallel processing of {len(paper_ids)} papers with {MAX_WORKERS} threads...")
print("-" * 50)

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = {executor.submit(process_paper, pid): pid for pid in paper_ids}
    
    for future in as_completed(futures):
        pid = futures[future]
        try:
            future.result() 
        except Exception as e:
            print(f"\n[CRITICAL ERROR] Failed to process {pid}: {e}")

print("-" * 50)
print("All papers processed.")

Starting parallel processing of 6 papers with 30 threads...
--------------------------------------------------
[2211.03026] Done. Elements: 310, Refs: 26
[2211.03016] Done. Elements: 121, Refs: 6
    -> Parsing input: LaTeX/1_introduction.tex
    -> Parsing input: LaTeX/2_related_works.tex
    -> Parsing input: LaTeX/3_middleGAN.tex
    -> Parsing input: content/1-introduction.tex
    -> Parsing input: content/2-related-work.tex
    -> Parsing input: LaTeX/4_evaluation.tex
    -> Parsing input: content/3-method.tex
    -> Parsing input: LaTeX/6_conclusion.tex
    -> Parsing input: content/4-evaluation.tex
    -> Parsing input: content/5-discussion.tex
    -> Parsing input: content/6-conclusions.tex
[2211.03144] Done. Elements: 429, Refs: 45
    -> Parsing input: content/8-appendix.tex
[2211.03002] Done. Elements: 612, Refs: 29
[2211.03001] Done. Elements: 674, Refs: 43
    -> Parsing input: tex/abstract.tex
    -> Parsing input: tex/intro.tex
    -> Parsing input: tex/related_works.tex