Cleaning LaTeX files

In [1]:
import os, json, re, zipfile

# This notebook works locally: it switches into the folder that holds the data
# and prints the resulting working directory as a sanity check.
# The folder defaults to the original author's machine; set the DATA4TUNE_DIR
# environment variable to run the notebook elsewhere without editing this cell.
DATA_DIR = os.environ.get(
    'DATA4TUNE_DIR',
    'C:/Users/3140m/Documents/Master Thesis/data4tune/Connectionism and Eliminativism',
)
os.chdir(DATA_DIR)
print(os.getcwd())

C:\Users\3140m\Documents\Master Thesis\data4tune\Connectionism and Eliminativism


In [None]:
# The full list of literature on Connectionism and Eliminativism in PhilPapers, which consists of around 40 papers without overlap
literature_list = [
    "Philosophy and Connectionist Theory", # 0
    "Connectionism Debates on Psychological Explanation_1", # 1
    "Connectionism Debates on Psychological Explanation_2", # 2
    "The Mind as a Scientific Object", # 3
    "Folk psychology and cognitive architecture",
    "Is connectionism commonsense",
    "Connectionism and the commitments of folk psychology",
    "Connectionism and the fate of folk psychology A reply to Ramsey, Stich and Garon",
    "Connectionism, eliminativism, and the semantic view of theories",
    "Beliefs, functionally discrete states, and connectionist networks",
    "Distributed representation and causal modularity A rejoinder to Forster and Saidel",
    "On the threat of eliminativism",
    "Eliminative connectionism Its implications for a return to an empiricistbehaviorist linguistics",
    "Networks with Attitudes",
    "Connectionism isnt magic",
    "Asking Whats Inside the Head Neurophilosophy Meets the Extended Mind"
]

# Index of the first entry of `literature_list` handled by this cell; entries 0-3 are skipped.
# NOTE(review): presumably the first four files were processed separately -- confirm.
FIRST_INDEX_TO_PROCESS = 4

# Section headings (abstract, introduction, conclusion, ...) that are removed together with their contents.
# Every pattern starts at a starred (sub)section command with an optional leading number such as "1." or "2.3 ",
# and runs up to (not including) the next (sub)section command or the end of the document.
_SECTION_START = r"\\(sub)?section\*\{(\d+(\.\d+)*\.?\s*)?"
_SECTION_BODY = r".*?(?=\\(sub)?section|\Z)"
_HEADINGS_TO_REMOVE = [
    r"Abstract(s?)\}",
    r"Introduction(s?)\}",
    r"Conclusion(s?)\}",
    r"Summar(y|ies)\}",
    r"Background(s?)\}",
    r"Note(s?)\}",
    r"Acknowledg(e?)ment(s?)\}",
    r"Reference(s?)\}",
    # No closing brace here, so headings that merely start with "Appendix"/"Appendices" are matched as well.
    r"Appendi(x|ces)",
]
sections_to_remove = [_SECTION_START + heading + _SECTION_BODY for heading in _HEADINGS_TO_REMOVE]


def clean_latex(latex_content, file_name):
    """Clean the LaTeX source of one paper with a sequence of regex substitutions.

    Steps, in order: retitle the document, turn the original title into a
    section heading, drop unwanted sections, drop centered blocks, footnote
    texts and graphics, turn abstract environments into quotations, join lines
    split by forced line breaks, drop annotation indices, and drop empty or
    mid-sentence section headings.

    Parameters
    ----------
    latex_content : str
        Full text of the .tex file.
    file_name : str
        File name without extension; used to build the new title
        "processed_{file_name}".

    Returns
    -------
    str
        The cleaned LaTeX source.

    Prints the original title and the section line derived from it, or a
    notice when no title is found.
    """
    # Heavy regular expressions (regex) are used for cleaning.

    # Replace the title in TeX with "processed_{file_name}".
    # `.` does not match newlines in these two calls, so a \title{...} spanning several lines counts as "not found".
    title_match = re.search(r'\\title\{(.+?)\}', latex_content)
    if title_match:
        original_title = title_match.group(1)
        # A callable replacement is inserted verbatim: backslashes in the file name or title
        # are not interpreted as regex template escapes.
        latex_content = re.sub(
            r'\\title\{.+?\}',
            lambda _match: f'\\title{{processed_{file_name}}}',
            latex_content,
        )

        # Make the original title in TeX be a section name, because most of the original titles were set wrongly
        # by Mathpix and they are section names in most cases.
        new_section = f'\\section*{{{original_title}}}\n'
        print(original_title)
        print(new_section)
        latex_content = latex_content.replace('\\maketitle', f'\\maketitle\n{new_section}')
    else:
        print("No title found in the LaTeX content.")

    # Remove sections such as introduction and conclusion and their contents.
    for section in sections_to_remove:
        latex_content = re.sub(section, '', latex_content, flags=re.DOTALL | re.IGNORECASE)

    # Remove figures, tables, and so on, which were centered by Mathpix.
    latex_content = re.sub(r"\n*?\\begin{center}.*?\\end{center}(\n*?(?=[a-z]))?", '\n', latex_content, flags=re.DOTALL)

    # Remove footnotes.
    latex_content = re.sub(r"\n*?\\footnotetext\{.*?\n\}(\n*?(?=[a-z]))?", '\n', latex_content, flags=re.DOTALL)

    # Make sure to remove graphic elements and "Figure"/"Table" lines that interrupt a sentence.
    latex_content = re.sub(r"\n*?\\includegraphics.*?\}(\n*?(?=[a-z]))?", '\n', latex_content, flags=re.DOTALL)
    latex_content = re.sub(r"\n+?(Figure|Table).+?\n+?(?=[a-z])", ' ', latex_content)

    # Replace abstract environments with quotation environments, because Mathpix interpreted quotations as abstracts.
    latex_content = re.sub(r"\\begin\{abstract\}", r'\\begin{quotation}', latex_content)
    latex_content = re.sub(r"\\end\{abstract\}", r'\\end{quotation}', latex_content)

    # Remove line breaks caused by page breaks, except for intended breaks.
    latex_content = re.sub(r"(?<!\:|\.)\\\\\n(?!\(.)(?!.\.)(?![A-Z])", ' ', latex_content)

    # Remove indices used for annotations (numeric superscripts in inline math).
    latex_content = re.sub(r"\$\{\s*\}\^\{\d+\}\$", '', latex_content)
    latex_content = re.sub(r"\$\^\{\d+\}\$", '', latex_content)

    # Remove section titles without contents (a heading directly followed by another heading).
    latex_content = re.sub(r"\\(sub)?section\*\{.*?\}\n*?(?=\\(sub)?section\*\{)", '', latex_content)

    # Remove interventions between words, such as title and author name, which are sometimes perceived as sections by Mathpix.
    latex_content = re.sub(r"\n*?\\(sub)?section\*\{.*?\}\n*?(?=[a-z])", ' ', latex_content)

    return latex_content


# Let the cleaning code iterate over the literature list.
for file_name in literature_list[FIRST_INDEX_TO_PROCESS:]:

    # Load the tex file having the file name in the list. The files were converted from PDF files, using Mathpix.
    # The encoding is explicit so that the result does not depend on the platform default (e.g. cp1252 on Windows).
    # NOTE(review): assumes the Mathpix exports are UTF-8 -- confirm.
    with open(f'{file_name}.tex', 'r', encoding='utf-8') as file:
        latex_content = file.read()

    latex_content = clean_latex(latex_content, file_name)

    # Save the processed contents as a temporary TeX file.
    processed_tex_file = f'processed_{file_name}.tex'
    new_zip_file_path = f'processed_{file_name}.zip'
    with open(processed_tex_file, 'w', encoding='utf-8') as file:
        file.write(latex_content)

    try:
        # Zip the file to be able to open it in Overleaf.
        with zipfile.ZipFile(new_zip_file_path, 'w') as zip_ref:
            zip_ref.write(processed_tex_file)
    finally:
        # Remove the temporary .tex file, even when zipping fails.
        os.remove(processed_tex_file)

    print(f"Processing complete. Modified file saved in '{new_zip_file_path}'.")


No title found in the LaTeX content.
Processing complete. Modified file saved in 'processed_Folk psychology and cognitive architecture.zip'.
No title found in the LaTeX content.
Processing complete. Modified file saved in 'processed_Is connectionism commonsense.zip'.
CONNECTIONISM AND THE COMMITMENTS OF FOLK PSYCHOLOGY 
\\section*{CONNECTIONISM AND THE COMMITMENTS OF FOLK PSYCHOLOGY }

Processing complete. Modified file saved in 'processed_Connectionism and the commitments of folk psychology.zip'.
Connectionism and the fate of folk psychology: a reply to Ramsey, Stich and Garon 
\\section*{Connectionism and the fate of folk psychology: a reply to Ramsey, Stich and Garon }

Processing complete. Modified file saved in 'processed_Connectionism and the fate of folk psychology A reply to Ramsey, Stich and Garon.zip'.
No title found in the LaTeX content.
Processing complete. Modified file saved in 'processed_Connectionism, eliminativism, and the semantic view of theories.zip'.
DISCUSSION 
\\