In [None]:
import json
from datetime import datetime

# Only papers whose metadata was last updated on or after this date are kept
date_cutoff = datetime.strptime("2024-04-01", "%Y-%m-%d")

# Path to the JSON metadata file (read line by line, one JSON document per line)
metadata_path = "arxiv-metadata-oai-snapshot.json"

# Collect the IDs of every paper updated on or after the cutoff
paper_ids = []
with open(metadata_path, encoding="utf8") as f:
    for entry in f:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file)
        if not entry.strip():
            continue
        data = json.loads(entry)
        try:
            update_date = datetime.strptime(data.get("update_date") or "", "%Y-%m-%d")
        except (TypeError, ValueError):
            # A missing or malformed date cannot be compared with the cutoff;
            # skip this record instead of aborting the whole scan.
            continue
        paper_id = data.get("id")
        # A record without an ID would only yield an unusable download URL
        if paper_id and update_date >= date_cutoff:
            paper_ids.append(paper_id)


In [None]:
import os
import requests

# Every selected paper ID maps to "<base_url><id>.pdf"
base_url = "https://arxiv.org/pdf/"
pdf_urls = ["{}{}.pdf".format(base_url, selected_id) for selected_id in paper_ids]

# Create the folder that will receive the PDFs (no error if it already exists)
download_directory = "./downloaded_pdfs/"
os.makedirs(download_directory, exist_ok=True)

# Function to download a single PDF
def download_pdf(url, directory, timeout=30):
    """Download one PDF and save it inside ``directory``.

    The saved file is named after the last path segment of ``url``. The
    outcome is reported with ``print``; nothing is returned. Request errors
    (connection failure, timeout, ...) are reported rather than raised so
    that a batch loop can carry on with the next URL.

    Args:
        url: Full URL of the PDF to fetch.
        directory: Directory in which to store the file; it must already exist.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block indefinitely.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to download {url}: {exc}")
        return

    if response.status_code != 200:
        print(f"Failed to download {url}")
        return

    # NOTE(review): only the last URL segment is kept, so an identifier that
    # itself contains a slash loses the part before the slash in the saved
    # file name -- confirm this is acceptable for later steps that map file
    # names back to paper IDs.
    filename = os.path.join(directory, url.split('/')[-1])
    with open(filename, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded {filename}")

# Fetch each PDF in turn; download_pdf prints the per-file outcome
for pdf_url in pdf_urls:
    download_pdf(url=pdf_url, directory=download_directory)

print("Download complete.")


In [None]:
from PyPDF2 import PdfReader

# Input folder (downloaded PDFs) and output folder (plain-text conversions)
pdf_directory, txt_directory = "./downloaded_pdfs/", "./converted_txt/"

# Make sure the output folder exists before anything is written to it
os.makedirs(txt_directory, exist_ok=True)

# Function to convert a single PDF to TXT
def convert_pdf_to_txt(pdf_path, txt_path):
    """Extract the text of a PDF and write it to a UTF-8 text file.

    Page texts are concatenated in page order with no separator. Any error is
    reported with ``print`` instead of being raised, so one unreadable PDF
    does not stop a batch conversion. Nothing is returned.

    Args:
        pdf_path: Path of the PDF file to read.
        txt_path: Path of the text file to create (overwritten if present).
    """
    try:
        with open(pdf_path, 'rb') as pdffileobj:
            pdfreader = PdfReader(pdffileobj)
            # `or ""` treats a page that yields no text (None or empty) as an
            # empty string so it cannot break the concatenation; join builds
            # the result in one pass instead of repeated `+=` copies.
            text = "".join(page.extract_text() or "" for page in pdfreader.pages)

        # errors='replace': extracted text may contain characters that cannot
        # be encoded as UTF-8 (e.g. lone surrogates). Substituting them keeps
        # the rest of the paper instead of failing the whole conversion.
        with open(txt_path, 'w', encoding='utf-8', errors='replace') as txtfile:
            txtfile.write(text)

        print(f"Converted {pdf_path} to {txt_path}")
    except Exception as e:
        # Deliberately broad: conversion is best-effort across many PDFs
        print(f"Failed to convert {pdf_path}: {e}")

# Convert every "*.pdf" in the download folder to a same-named "*.txt"
for pdf_filename in os.listdir(pdf_directory):
    if not pdf_filename.endswith('.pdf'):
        continue  # ignore anything that is not a PDF
    base_name = os.path.splitext(pdf_filename)[0]
    convert_pdf_to_txt(
        os.path.join(pdf_directory, pdf_filename),
        os.path.join(txt_directory, base_name + '.txt'),
    )


In [None]:
import pandas as pd

# Location of the line-delimited JSON metadata snapshot
metadata_path = "arxiv-metadata-oai-snapshot.json"

# Index every metadata record by its paper ID ("" when a record has no "id");
# a later record with the same ID replaces an earlier one.
metadata_dict = {}
with open(metadata_path, encoding="utf8") as f:
    for record in map(json.loads, f):
        metadata_dict[record.get("id", "")] = record

# Function to chunk text
def chunk_text(text, chunk_size=1000):
    """Split ``text`` into consecutive slices of at most ``chunk_size`` items.

    Slices are taken back to back with no overlap; only the last one may be
    shorter. An empty ``text`` gives an empty list.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        stop = start + chunk_size
        pieces.append(text[start:stop])
    return pieces

# Folder holding the converted TXT files
txt_directory = "./converted_txt/"

# One dict per chunk: chunk_id, chunk text, then the paper's metadata fields
chunks_data = []

for txt_filename in os.listdir(txt_directory):
    if not txt_filename.endswith('.txt'):
        continue

    # The file name without its extension is used as the paper ID
    paper_id = os.path.splitext(txt_filename)[0]
    txt_path = os.path.join(txt_directory, txt_filename)

    with open(txt_path, 'r', encoding='utf-8') as txtfile:
        text = txtfile.read()
    chunks = chunk_text(text)

    # Papers without a metadata record are reported and left out
    if paper_id not in metadata_dict:
        print(f"Metadata not found for paper ID: {paper_id}")
        continue
    metadata = metadata_dict[paper_id]

    # Metadata is unpacked last, so its fields win on any key clash
    chunks_data.extend(
        {"chunk_id": f"{paper_id}_{position}", "chunk": piece, **metadata}
        for position, piece in enumerate(chunks)
    )

# Build the table and write it out without the index column
df = pd.DataFrame(chunks_data)
csv_output_path = "chunks_metadata.csv"
df.to_csv(csv_output_path, index=False)

print("CSV file created successfully.")
