In [1]:
import os
import json
from src.arxiv_data import fetch_all_years  # Your data acquisition functions
from src.text_processing import advanced_text_processing  # Your advanced text processing function

# Ensure the "data" directory exists; exist_ok=True makes this a no-op when it is already there
os.makedirs("data", exist_ok=True)

def _fetch_and_save(filename, label, start_year, end_year, month, download_pdf):
    """
    Shared implementation for create_dataset_1 and create_dataset_2.

    Calls fetch_all_years with the given arguments, writes the result as
    indented JSON to data/<filename>, prints a confirmation line prefixed
    with `label`, and returns the fetched data unchanged.
    """
    # Create the target directory up front so a missing directory fails (or is
    # fixed) before the potentially long fetch, not after it.
    os.makedirs("data", exist_ok=True)
    data = fetch_all_years(
        start_year=start_year,
        end_year=end_year,
        month=month,
        download_pdf=download_pdf,
    )
    filepath = os.path.join("data", filename)
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    print(f"{label} saved as {filepath}")
    return data


def create_dataset_1(start_year=1991, end_year=2025, month=3):
    """
    Dataset 1: Fetch arXiv metadata for a given range without downloading PDFs.

    Args:
        start_year: First year passed to fetch_all_years (default 1991).
        end_year: Last year passed to fetch_all_years (default 2025).
        month: Month passed to fetch_all_years (default 3, i.e. March).

    Returns:
        Whatever fetch_all_years returns; the same object is also written to
        data/arxiv_march_data.json. The file name is fixed even when `month`
        is changed, because create_dataset_3 reads that exact path.
    """
    return _fetch_and_save(
        "arxiv_march_data.json", "Dataset 1",
        start_year, end_year, month, download_pdf=False,
    )


def create_dataset_2(start_year=1991, end_year=2025, month=3):
    """
    Dataset 2: Fetch arXiv metadata for a given range with PDFs downloaded.

    Args:
        start_year: First year passed to fetch_all_years (default 1991).
        end_year: Last year passed to fetch_all_years (default 2025).
        month: Month passed to fetch_all_years (default 3, i.e. March).

    Returns:
        Whatever fetch_all_years returns; the same object is also written to
        data/arxiv_march_data_with_pdf.json. The file name is fixed even when
        `month` is changed, because create_dataset_4_incremental reads that
        path by default.
    """
    return _fetch_and_save(
        "arxiv_march_data_with_pdf.json", "Dataset 2",
        start_year, end_year, month, download_pdf=True,
    )

def create_dataset_3(input_filename=os.path.join("data", "arxiv_march_data.json"),
                     output_filename=os.path.join("data", "arxiv_march_data_with_processed_summary.json")):
    """
    Dataset 3: Load Dataset 1, process each paper's summary using advanced_text_processing,
    and save the processed result.

    Args:
        input_filename: JSON file to read; defaults to the file written by
            create_dataset_1. Expected to hold a dict mapping each year to a
            list of paper dicts (this is how the loop below walks it).
        output_filename: JSON file to write the augmented data to.

    Returns:
        The loaded dict, with a "processed_summary" entry added to every paper.

    Raises:
        FileNotFoundError: if input_filename does not exist (e.g. Dataset 1
            has not been created yet).
    """
    with open(input_filename, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Only the per-year paper lists are needed; the year keys are left untouched.
    for papers in data.values():
        for paper in papers:
            # `or ""` also covers a summary stored as JSON null, which
            # .get("summary", "") would hand to the processor as None.
            summary_text = paper.get("summary") or ""
            paper["processed_summary"] = advanced_text_processing(summary_text)

    # Make the function usable with any output path, not only inside "data".
    out_dir = os.path.dirname(output_filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_filename, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    print(f"Dataset 3 saved as {output_filename}")
    return data



[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Build Dataset 1 (metadata only, download_pdf=False) and keep the returned data in memory
dataset1 = create_dataset_1()

In [None]:
# Build Dataset 2: same year/month range as Dataset 1, but fetched with download_pdf=True
dataset2 = create_dataset_2()

In [None]:
# Build Dataset 3 from the Dataset 1 JSON file on disk, so create_dataset_1() must have been run first
dataset3 = create_dataset_3()

In [None]:
# Ensure the "data" directory exists (repeats the call from the first cell; harmless thanks to exist_ok=True)
os.makedirs("data", exist_ok=True)

def _write_json_atomic(obj, path):
    """
    Write `obj` as indented JSON to `path` without ever leaving a half-written file there.

    The data goes to a sibling "<path>.tmp" file first and is then moved over
    `path` with os.replace, so an interruption during the dump leaves the
    previous contents of `path` intact.
    """
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f_out:
        json.dump(obj, f_out, indent=2)
    os.replace(tmp_path, path)


def create_dataset_4_incremental(input_filename="data/arxiv_march_data_with_pdf.json",
                                 output_filename="data/arxiv_march_data_with_processed_summary_and_pdf.json"):
    """
    Processes Dataset 2 year by year, checkpointing the output after each year.

    For each paper, the summary and (if non-empty) the PDF content are run
    through advanced_text_processing and stored under "processed_summary" and
    "processed_pdf".

    Note: the whole input file is loaded with json.load and all processed years
    are kept in `output_data`, so this does not lower peak memory. What the
    per-year writes provide is a valid, up-to-date output file after every
    year, so finished years survive a later failure.

    Args:
        input_filename: JSON file to read; expected to hold a dict mapping each
            year to a list of paper dicts.
        output_filename: JSON file that receives the processed data. It is
            rewritten in full after every year.

    Returns:
        Dict mapping each year to its list of processed paper dicts.
    """
    # Load the top-level dictionary (one key per year)
    with open(input_filename, "r", encoding="utf-8") as f_in:
        data = json.load(f_in)

    output_data = {}  # processed years accumulated so far

    for year, papers in data.items():
        for paper in papers:
            # `or ""` also covers a summary stored as JSON null.
            summary_text = paper.get("summary") or ""
            paper["processed_summary"] = advanced_text_processing(summary_text)

            # PDF content is optional; skip missing, null or empty values.
            pdf_text = paper.get("pdf_content")
            if pdf_text:
                paper["processed_pdf"] = advanced_text_processing(pdf_text)

        # Papers were updated in place, so the year's list is already the result.
        output_data[year] = papers

        # Checkpoint: replace the output file with everything processed so far.
        _write_json_atomic(output_data, output_filename)
        print(f"Processed and saved year {year}.")

    if not output_data:
        # No years in the input: still produce the file so the message below is true.
        _write_json_atomic(output_data, output_filename)

    print(f"Dataset 4 saved as {output_filename}")
    return output_data


In [None]:
# Assign the result (as the dataset1..dataset3 cells do) so the notebook does not
# render the entire returned dataset, PDF text included, as this cell's output.
dataset4 = create_dataset_4_incremental()