In [3]:
import arxiv
import pandas as pd
import random

# Define a broad search query (no category or date restrictions)
query = 'all'  # Search all papers
total_results = 1000

# Construct the default API client.
client = arxiv.Client()

search = arxiv.Search(query, max_results= total_results)
results = client.results(search)

# Collect paper metadata
papers = []
for result in results:
    papers.append({
        'id': result.entry_id.split('/')[-1],  # Extract ArXiv ID
        'title': result.title,
        'abstract': result.summary,
        'published': result.published,
        'pdf_url': result.pdf_url
    })

# Convert to a DataFrame
papers_df = pd.DataFrame(papers)

In [8]:
import requests
import pypdfium2 as pdfium
from io import BytesIO
from concurrent.futures import ProcessPoolExecutor, as_completed


# Randomly sample papers
random_papers_df = papers_df.sample(n=2, random_state=0)
print(random_papers_df.info())


# Download the PDF
def download_and_extract(pdf_url):
    response = requests.get(pdf_url)
    pdf_file = BytesIO(response.content)
    
    # Extract text using PyPDFium2
    text = ''
    pdf = pdfium.PdfDocument(pdf_file)
    for i in range(len(pdf)):
        page = pdf[i]
        text += page.get_textpage().get_text_bounded()
    
    return text

delay = 15
def limit_download(pdf_url):
    time.sleep(delaty)
    return download_and_extract(pdf_url)
    
# Add full text to the DataFrame
random_papers_df['full_text'] = random_papers_df['pdf_url'].apply(download_and_extract)

# The below code downloads the data as fast as possible in parallel.
# This violates the terms of the arxiv's service and will resultyour ip being temorarily restricted
"""
def download_and_extract_text(pdf_url):
    try:
        # Download the PDF
        response = requests.get(pdf_url)
        pdf_file = BytesIO(response.content)

        # Extract text using PyPDFium2
        text = ''
        pdf = pdfium.PdfDocument(pdf_file)
        for i in range(len(pdf)):
            page = pdf[i]
            text += page.get_textpage().get_text_bounded()

        return text
    
    except Exception as e:
        print(f"Error processing {pdf_url}: {e}")
        return None


# Use ThreadPoolExecutor for Parallel Processing
def process_papers_parallel(papers_df, max_workers=10):
    results = {}
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # Submit tasks
        future_to_id = {
            executor.submit(download_and_extract_text, row['pdf_url']): row['id']
            for _, row in papers_df.iterrows()
        }
        
        # Collect results as they complete
        for future in as_completed(future_to_id):
            paper_id = future_to_id[future]
            try:
                results[paper_id] = future.result()
            except Exception as e:
                print(f"Error processing {paper_id}: {e}")
                results[paper_id] = None
    return results


import time
start = time.time()


# Process papers in parallel
text_results = process_papers_parallel(random_papers_df, max_workers=20)  # Adjust `max_workers` as needed

# Add full text to the DataFrame
random_papers_df['full_text'] = random_papers_df['id'].map(text_results)


end = time.time()
print(end - start)
"""



# Display the first few rows with full text
print(random_papers_df[['id', 'abstract', 'full_text']].head())

<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, 993 to 859
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype              
---  ------     --------------  -----              
 0   id         2 non-null      object             
 1   title      2 non-null      object             
 2   abstract   2 non-null      object             
 3   published  2 non-null      datetime64[ns, UTC]
 4   pdf_url    2 non-null      object             
dtypes: datetime64[ns, UTC](1), object(4)
memory usage: 96.0+ bytes
None
               id                                           abstract  \
993  2006.02335v2  Partition identities are often statements asse...   
859   0807.4753v1  For all p > 1, we demonstrate the existence of...   

                                             full_text  
993  arXiv:2006.02335v2 [math.NT] 15 Sep 2020\r\nBE...  
859  arXiv:0807.4753v1 [quant-ph] 30 Jul 2008\r\nCo...  


In [16]:
import re

def remove_abstract(text):
    # Match common abstract headings (case insensitive)
    abstract_patterns = [
        r'\babstract\b.*?\n',  # Matches "Abstract" as a heading
        r'\babstract\b:?'       # Matches "Abstract:" or "abstract"
    ]

    # Compile regex with case insensitivity
    pattern = re.compile('|'.join(abstract_patterns), re.IGNORECASE)

    # Remove abstract heading
    text = pattern.sub('', text, count=1)  

    # Remove the actual abstract text until the next section heading (non-greedy)
    text = re.sub(r'(?s)^.*?(?=\n\s*(?:\d*\s*Introduction|\bI\b\.|I\.))', '', text, count=1)

    return text.strip()

In [18]:
random_papers_df["full_text"] = random_papers_df["full_text"].apply(remove_abstract)
# Printing data to look at possible issues 
for index, row in random_papers_df.iterrows():
    print(f"Paper ID: {row['id']}")  # Print the paper ID (if available)
    print(f"Full Text:\n{row['full_text']}\n")  # Print the full text
    print("-" * 80)  # Separator for readability

Paper ID: 2006.02335v2
Full Text:
arXiv:2006.02335v2 [math.NT] 15 Sep 2020
BECK-TYPE IDENTITIES FOR EULER PAIRS OF ORDER r
CRISTINA BALLANTINE AND AMANDA WELCH
of partitions of n subject to condition X is equinumerous to the set PY of
partitions of n subject to condition Y . A Beck-type identity is a companion
identity to |PX| = |PY | asserting that the difference b(n) between the number
of parts in all partitions in PX and the number of parts in all partitions in PY
equals a c|PX′ | and also c|PY ′ |, where c is some constant related to the original
identity, and X′, respectively Y
′
, is a condition on partitions that is a very
slight relaxation of condition X, respectively Y . A second Beck-type identity
involves the difference b
′
(n) between the total number of different parts in all
partitions in PX and the total number of different parts in all partitions in PY .
We extend these results to Beck-type identities accompanying all identities
given by Euler pairs of order r (for any 

In [49]:
random_papers_df.to_csv("results.csv")