In [None]:
import os
import time

import arxiv

# --- Configuration -----------------------------------------------------------
search_query = 'cat:math.CT'   # arXiv query string: everything in category math.CT
max_results = 10000
batch_size = max_results       # a single Search object covers the whole run
download_dir = "./papers"      # one "<short id>.tar.gz" per paper ends up here
request_delay_seconds = 3      # pause between source downloads

client = arxiv.Client()

# The download target must exist before the first file is written;
# without this every download fails on a fresh checkout.
os.makedirs(download_dir, exist_ok=True)

search = arxiv.Search(
    query=search_query,
    max_results=batch_size,
    sort_by=arxiv.SortCriterion.SubmittedDate
)

results = client.results(search)
for result in results:
    if result is None:
        continue

    # Bind the id before the try block so the error message below can never
    # hit an unbound name or silently report the previous paper's id.
    arxiv_id = "<unknown id>"
    try:
        arxiv_id = result.get_short_id()
        result.download_source(dirpath=download_dir, filename=f"{arxiv_id}.tar.gz")
    except Exception as e:
        # Best effort: report the failure and move on to the next paper.
        print(f"Failed to download source for {arxiv_id}: {e}")
    finally:
        # Respect arxiv TOS and don't spam requests -- also after a failure,
        # otherwise a run of errors turns into a burst of back-to-back requests.
        time.sleep(request_delay_seconds)


In [96]:
import os
import tarfile
import re

# Archive suffixes handled by extract_filtered_tex_files (longest first).
_ARCHIVE_SUFFIXES = ('.tar.gz', '.tgz', '.tar')

# \documentclass{amsart} or \documentclass{article}, with an optional
# [options] group between the command and the class name.
_DOCUMENTCLASS_PATTERN = re.compile(
    r'\\documentclass\s*(?:\[[^\]]*\])?\s*\{\s*(?:amsart|article)\s*\}'
)


def _decode_tex(raw_bytes):
    """Decode TeX source bytes as UTF-8, falling back to latin-1."""
    try:
        return raw_bytes.decode('utf-8')
    except UnicodeDecodeError:
        # latin-1 maps every byte to a code point, so this cannot fail.
        return raw_bytes.decode('latin-1')


def _first_significant_line(content):
    """Return the first stripped line that is neither blank nor a % comment, or None."""
    for line in content.splitlines():
        line = line.strip()
        if line and not line.startswith('%'):
            return line
    return None


def _flat_output_name(archive_name, member_name):
    """Build a flat output file name: archive stem followed by the member path joined with '_'."""
    stem = archive_name
    for suffix in _ARCHIVE_SUFFIXES:
        if archive_name.endswith(suffix):
            stem = archive_name[:-len(suffix)]
            break
    # Drop empty, '.' and '..' components so a member can neither point into a
    # directory that does not exist nor escape the output directory.
    parts = [
        part
        for part in member_name.replace('\\', '/').split('/')
        if part not in ('', '.', '..')
    ]
    return stem + '_'.join(parts)


def extract_filtered_tex_files(input_dir='papers', output_dir='tex_files'):
    r"""
    Extracts .tex files from all tar archives (.tar.gz, .tgz, .tar) in the
    input directory, and saves only those whose first non-blank, non-comment
    line is \documentclass{amsart} or \documentclass{article}; an [options]
    group before the class name is allowed but not required.

    Each saved file is written as UTF-8 directly into output_dir and is named
    "<archive name without suffix><member path with '/' replaced by '_'>",
    e.g. member "main.tex" of "2308.08519v1.tar.gz" becomes
    "2308.08519v1main.tex". Progress and errors are reported with print();
    an unreadable archive or member is skipped rather than aborting the run.

    Parameters:
    - input_dir: Directory containing the tar archives.
    - output_dir: Directory where the filtered .tex files will be saved
      (created if it does not exist).

    Returns:
    - None.
    """
    os.makedirs(output_dir, exist_ok=True)

    for filename in os.listdir(input_dir):
        if not filename.endswith(_ARCHIVE_SUFFIXES):
            continue

        file_path = os.path.join(input_dir, filename)
        print(f"Processing archive: {filename}")

        try:
            # 'r:*' lets tarfile detect the compression transparently.
            with tarfile.open(file_path, 'r:*') as tar:
                for member in tar.getmembers():
                    if not (member.isfile() and member.name.endswith('.tex')):
                        continue

                    try:
                        member_file = tar.extractfile(member)
                        if member_file is None:
                            continue
                        content = _decode_tex(member_file.read())
                    except Exception as e:
                        print(f"Error reading {member.name}: {e}")
                        continue

                    # Only the first meaningful line decides whether the file
                    # is a main document of one of the accepted classes.
                    first_line = _first_significant_line(content)
                    if first_line is None or not _DOCUMENTCLASS_PATTERN.match(first_line):
                        print(f"Skipped {member.name}: Does not match documentclass patterns")
                        continue

                    output_file_path = os.path.join(
                        output_dir, _flat_output_name(filename, member.name)
                    )
                    try:
                        with open(output_file_path, 'w', encoding='utf-8') as f_out:
                            f_out.write(content)
                    except OSError as e:
                        # A single unwritable file must not abort the archive.
                        print(f"Error writing {output_file_path}: {e}")
                        continue

                    print(f"Extracted: {output_file_path}")
        except Exception as e:
            print(f"Error processing {filename}: {e}")




In [97]:
# Filter the downloaded archives in 'papers' and write the matching .tex files to 'extracted'.
extract_filtered_tex_files(input_dir='papers', output_dir='extracted')
# NOTE(review): delete_files is not defined in the visible cells; the cleanup below stays disabled.
#delete_files('papers')


Processing archive: 2308.08519v1.tar.gz
Extracted: extracted/2308.08519v1Auslander-Iyama_correspondence_for_exact_dg_categories.tex
Processing archive: 2409.05062v1.tar.gz
Error processing 2409.05062v1.tar.gz: file could not be opened successfully
Processing archive: 2310.19613v1.tar.gz
Error processing 2310.19613v1.tar.gz: file could not be opened successfully
Processing archive: 2309.02116v1.tar.gz
Error processing 2309.02116v1.tar.gz: file could not be opened successfully
Processing archive: 2207.12983v1.tar.gz
Error processing 2207.12983v1.tar.gz: file could not be opened successfully
Processing archive: 2208.11077v2.tar.gz
Skipped journalnames.tex: Does not match documentclass patterns
Skipped output.tex: Does not match documentclass patterns
Skipped uci.tex: Does not match documentclass patterns
Processing archive: 2209.15606v2.tar.gz
Skipped Frobenius-monoidal-functors-from-coHopf-adjunctions.tex: Does not match documentclass patterns
Skipped macros.tex: Does not match documentc