In [2]:
! pip install pymupdf


Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m50.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.5


In [1]:
import os
import json
from src.arxiv_data import fetch_all_years  # Your data acquisition functions
from src.arxiv_data_Latex import get_latex_source_text
from src.arxiv_data_Latex import enrich_json_with_latex


from src.text_processing import advanced_text_processing  # Your advanced text processing function

# Ensure the "data" directory exists before any dataset file is written to it
# (exist_ok=True makes this a no-op when the directory is already there).
os.makedirs("data", exist_ok=True)

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Fetching the Dataset

In [2]:
def create_dataset(start_year=2025, end_year=2025, month=3, output_dir="data"):
    """
    Fetch arXiv papers for one calendar month across a range of years.

    Thin wrapper around fetch_all_years. Called with no arguments it requests
    March 2025 with output_dir="data", exactly like the previous hard-coded
    version of this function.

    Args:
        start_year: First year of the range (forwarded to fetch_all_years).
        end_year: Last year of the range (forwarded to fetch_all_years).
        month: Calendar month number to fetch for each year.
        output_dir: Directory handed to fetch_all_years for its output files.

    Returns:
        Whatever fetch_all_years returns, unchanged.
    """
    data = fetch_all_years(
        start_year=start_year,
        end_year=end_year,
        month=month,
        output_dir=output_dir,
    )
    # With the default output_dir this prints the same message as before.
    print(f"✅ LaTeX dataset saved (one file per year in /{output_dir})")
    return data


In [3]:
# Run the fetch with create_dataset's defaults and keep the result as dataset1.
dataset1 = create_dataset()


📅 Fetching March 2025...
🔄 Fetching March 2025 papers in parallel...
    Thread fetching: submittedDate:[20250301 TO 20250307]
    Thread fetching: submittedDate:[20250308 TO 20250314]
    Thread fetching: submittedDate:[20250315 TO 20250321]
    Thread fetching: submittedDate:[20250322 TO 20250328]
    ✅ Done: 4488 papers from 20250322 to 20250328
    Thread fetching: submittedDate:[20250329 TO 20250331]
    ✅ Done: 4674 papers from 20250315 to 20250321
    ✅ Done: 4807 papers from 20250301 to 20250307
    ✅ Done: 744 papers from 20250329 to 20250331
    ✅ Done: 4998 papers from 20250308 to 20250314
✅ Total for March 2025: 19711 papers
✅ Saved 19711 papers to data/arxiv_march_2025.json
✅ LaTeX dataset saved (one file per year in /data)


# Downloading the PDF Content

In [9]:
def create_dataset_pdf(start_year=1995, end_year=1995, month=3, output_dir="data"):
    """
    Fetch arXiv papers for one calendar month across a range of years,
    asking fetch_all_years to download each paper's PDF as well.

    Thin wrapper around fetch_all_years with download_pdf=True. Called with
    no arguments it requests March 1995 with output_dir="data", exactly like
    the previous hard-coded version of this function.

    Args:
        start_year: First year of the range (forwarded to fetch_all_years).
        end_year: Last year of the range (forwarded to fetch_all_years).
        month: Calendar month number to fetch for each year.
        output_dir: Directory handed to fetch_all_years for its output files.

    Returns:
        Whatever fetch_all_years returns, unchanged.
    """
    data = fetch_all_years(
        start_year=start_year,
        end_year=end_year,
        month=month,
        output_dir=output_dir,
        download_pdf=True,
    )
    # With the default output_dir this prints the same message as before.
    print(f"✅ LaTeX dataset saved (one file per year in /{output_dir})")
    return data


In [10]:
# Run the PDF-enabled fetch with create_dataset_pdf's defaults and keep the result.
pdf_dataset = create_dataset_pdf()


📅 Fetching March 1995...
🔄 Fetching March 1995 papers in parallel...
    Thread fetching: submittedDate:[19950301 TO 19950307]
    Thread fetching: submittedDate:[19950308 TO 19950314]
      Downloading PDF for: On the Informativeness of the DNA Promoter Sequences Domain ...
⚠️ Not a PDF (Content-Type): text/html — http://arxiv.org/pdf/cs/9503101v1
      Downloading PDF for: Cost-Sensitive Classification: Empirical Evaluation of a Hyb...
      Downloading PDF for: Principal pairs for oscillatory second order linear differen...
⚠️ Not a PDF (Content-Type): text/html — http://arxiv.org/pdf/math/9503223v1
      Downloading PDF for: Imaging the Small-Scale Circumstellar Gas Around T~Tauri Sta...
⚠️ Not a PDF (Content-Type): text/html — http://arxiv.org/pdf/cs/9503102v1
      Downloading PDF for: Less nonstationary ideals...
⚠️ Not a PDF (Content-Type): text/html — http://arxiv.org/pdf/math/9503203v1
⚠️ Not a PDF (Content-Type): text/html — http://arxiv.org/pdf/astro-ph/9503034v1
      Dow

In [6]:
import gzip
import json

# Load the gzip-compressed enriched JSON dump as text and parse it.
with gzip.open("data/enriched/enriched_arxiv_202504.json.gz", "rt", encoding="utf-8") as f:
    data = json.load(f)

# Keep only the entries whose "from_pdf" flag is truthy.
pdf_fallbacks = [p for p in data if p.get("from_pdf")]
print(f"Found {len(pdf_fallbacks)} papers with PDF fallback")

# Show title and content of the SAME paper. Previously the title came from
# entry 10 and the content from entry 880, so the two did not match.
sample_index = 10
if sample_index < len(pdf_fallbacks):
    sample = pdf_fallbacks[sample_index]
    print(sample["title"])
    print("\n--- PDF Content ---\n")
    # `or ""` keeps the slice safe when the field is missing or null.
    print((sample.get("latex_source") or "")[:2000])  # Show first 2000 characters
else:
    print(f"No PDF-fallback paper at index {sample_index}")


Found 899 papers with PDF fallback
A fast partial parse of natural language sentences using a connectionist method

--- PDF Content ---




In [None]:

def create_dataset_2():
    """
    Dataset 2: fetch arXiv metadata for March of 1991 through 2025 with PDF
    download enabled, and dump the result to a single JSON file under 'data/'.

    Returns:
        The object returned by fetch_all_years, unchanged.
    """
    target_path = os.path.join("data", "arxiv_march_data_with_pdf.json")

    fetched = fetch_all_years(
        start_year=1991,
        end_year=2025,
        month=3,
        download_pdf=True,
    )

    with open(target_path, "w", encoding="utf-8") as out_file:
        json.dump(fetched, out_file, indent=2)

    print(f"Dataset 2 saved as {target_path}")
    return fetched

def create_dataset_3():
    """
    Dataset 3: read 'data/arxiv_march_data.json', run every paper's summary
    through advanced_text_processing, and write the augmented data to
    'data/arxiv_march_data_with_processed_summary.json'.

    The input is iterated as a mapping whose values are lists of paper dicts;
    each paper gains a "processed_summary" key (a missing summary is treated
    as an empty string).

    Returns:
        The loaded mapping, with the papers updated in place.
    """
    source_path = os.path.join("data", "arxiv_march_data.json")
    with open(source_path, "r", encoding="utf-8") as src:
        papers_by_year = json.load(src)

    for yearly_papers in papers_by_year.values():
        for entry in yearly_papers:
            entry["processed_summary"] = advanced_text_processing(entry.get("summary", ""))

    dest_path = os.path.join("data", "arxiv_march_data_with_processed_summary.json")
    with open(dest_path, "w", encoding="utf-8") as dst:
        json.dump(papers_by_year, dst, indent=2)

    print(f"Dataset 3 saved as {dest_path}")
    return papers_by_year



In [None]:
# NOTE(review): no cell shown in this notebook defines create_dataset_1 (the
# fetch function defined earlier is named create_dataset), so this presumably
# raises NameError on a fresh kernel — TODO confirm which function was intended.
dataset1 = create_dataset_1()

In [None]:
# Fetches with download_pdf=True and writes data/arxiv_march_data_with_pdf.json,
# which is the default input file of create_dataset_4_incremental.
dataset2 = create_dataset_2()

In [None]:
# Requires data/arxiv_march_data.json to exist already: create_dataset_3 reads
# it and writes data/arxiv_march_data_with_processed_summary.json.
dataset3 = create_dataset_3()

In [None]:
# Ensure the "data" directory exists (exist_ok=True makes this safe to re-run
# even when the directory was already created).
os.makedirs("data", exist_ok=True)

def _dump_json_atomically(payload, destination):
    """Write payload as indented JSON to a sibling temp file, then swap it into place."""
    temp_path = f"{destination}.tmp"
    with open(temp_path, "w", encoding="utf-8") as f_out:
        json.dump(payload, f_out, indent=2)
    # os.replace swaps the finished file in, so an interruption during the
    # write above cannot truncate a checkpoint that was already saved.
    os.replace(temp_path, destination)


def create_dataset_4_incremental(input_filename="data/arxiv_march_data_with_pdf.json",
                                 output_filename="data/arxiv_march_data_with_processed_summary_and_pdf.json"):
    """
    Process Dataset 2 year by year, checkpointing the output file after each year.

    For every paper the "summary" field is passed through
    advanced_text_processing and stored under "processed_summary"; when
    "pdf_content" is non-empty it is processed too and stored under
    "processed_pdf".

    The whole input file is loaded with json.load up front, so this does not
    lower peak memory. The per-year write only guarantees that the years
    completed so far are on disk if a later year fails.

    Args:
        input_filename: Path of the JSON file to read; its top level is
            iterated as a mapping from year key to a list of paper dicts.
        output_filename: Path of the JSON file to (re)write after each year.

    Returns:
        dict mapping each year key to its list of paper dicts (the same dict
        objects that were loaded, updated in place).
    """
    with open(input_filename, "r", encoding="utf-8") as f_in:
        data = json.load(f_in)

    output_data = {}  # processed papers accumulated year by year

    for year, papers in data.items():
        processed_papers = []
        for paper in papers:
            # `or ""` also covers a summary that is present but null.
            summary_text = paper.get("summary") or ""
            paper["processed_summary"] = advanced_text_processing(summary_text)

            # PDF text is optional; skip the (potentially expensive) call when absent.
            pdf_text = paper.get("pdf_content", "")
            if pdf_text:
                paper["processed_pdf"] = advanced_text_processing(pdf_text)

            processed_papers.append(paper)

        output_data[year] = processed_papers

        # Checkpoint everything processed so far.
        _dump_json_atomically(output_data, output_filename)
        print(f"Processed and saved year {year}.")

    if not output_data:
        # No years in the input: still write the (empty) result so the
        # "saved" message below is true.
        _dump_json_atomically(output_data, output_filename)

    print(f"Dataset 4 saved as {output_filename}")
    return output_data


In [None]:
# Bind the result so the notebook does not echo the entire processed dataset
# as this cell's output (a bare call as the last expression would display it).
dataset4 = create_dataset_4_incremental()