# Notebook 01: Data Collection from ArXiv

###   Scrape research papers and create a dataset for RAG system

In [None]:
import arxiv
import pandas as pd
from datetime import datetime
import time
from tqdm import tqdm
import os

In [2]:
# ================================
# 1. CONFIGURATION
# ================================


SEARCH_QUERIES = [
    "machine learning",
    "deep learning",
    "natural language processing",
    "computer vision",
    "reinforcement learning"]

MAX_RESULTS_PER_QUERY = 1000  
CATEGORIES = ['cs.LG', 'cs.AI', 'cs.CL', 'cs.CV']  # Relevant CS categories
OUTPUT_PATH = '../data/arxiv_papers.csv'

print(f"Starting data collection at {datetime.now()}")
print(f"Target: {MAX_RESULTS_PER_QUERY * len(SEARCH_QUERIES)} papers")

Starting data collection at 2025-09-29 14:46:42.030279
Target: 5000 papers


In [None]:
# ================================
# 2. DATA COLLECTION FUNCTION - CORRECTED
# ================================

def fetch_arxiv_papers(query, max_results=1000, max_retries=3):
    """
    Fetch papers from ArXiv API with retry logic.
    
    
    Args:
        query: Search query string
        max_results: Maximum number of papers to fetch
        max_retries: Number of times to retry on a temporary failure
    
    Returns:
        List of paper dictionaries
    """
    
    papers = []
    
    for attempt in range(max_retries):
        try:
            # Recreate the Search object for each attempt in case of connection issues
            search = arxiv.Search(
                query=query,
                max_results=max_results,
                sort_by=arxiv.SortCriterion.Relevance,
                sort_order=arxiv.SortOrder.Descending
            )
            
            # The search.results() generator will raise the error when it runs out of pages.
            for result in search.results():
                paper = {
                    'arxiv_id': result.entry_id.split('/')[-1],
                    'title': result.title,
                    'abstract': result.summary.replace('\n', ' '),
                    'authors': ', '.join([author.name for author in result.authors]),
                    'published_date': result.published.strftime('%Y-%m-%d'),
                    'categories': ', '.join(result.categories),
                    'pdf_url': result.pdf_url,
                    'query': query
                }
                papers.append(paper)
            
            # If the loop finishes without an error, we successfully fetched all max_results papers.
            return papers

      
        except arxiv.UnexpectedEmptyPageError as e:
            # Handle the specific case where max_results was requested,but the API returned fewer papers (ran out of results).
            print(f"   ℹ️ Query '{query}' ran out of results. Fetched {len(papers)} papers.")
            return papers # Return the papers collected so far.
        
        except Exception as e:
            # Handle genuine API/network/parsing errors 
            print(f"   ⚠️ Attempt {attempt + 1} failed for query '{query}': {e}")
            if attempt < max_retries - 1:
                # Wait longer between retries for a genuine error
                sleep_time = 5 * (attempt + 1)
                print(f"   ⏳ Retrying in {sleep_time} seconds...")
                time.sleep(sleep_time)
            else:
                print(f"   ❌ Max retries reached for query '{query}'. Skipping.")
                return [] # Return empty list on final failure
    
    return papers

In [4]:
# ================================
# 3. COLLECT DATA
# ================================

all_papers = []

for query in tqdm(SEARCH_QUERIES, desc="Fetching papers"):
    print(f"\n🔍 Searching for: '{query}'")
    papers = fetch_arxiv_papers(query, MAX_RESULTS_PER_QUERY)
    all_papers.extend(papers)
    print(f"   ✓ Found {len(papers)} papers")
    time.sleep(1)  # Be nice to the API

print(f"\n📊 Total papers collected: {len(all_papers)}")

Fetching papers:   0%|          | 0/5 [00:00<?, ?it/s]


🔍 Searching for: 'machine learning'


  for result in search.results():


   ℹ️ Query 'machine learning' ran out of results. Fetched 500 papers.
   ✓ Found 500 papers


Fetching papers:  20%|██        | 1/5 [00:26<01:46, 26.62s/it]


🔍 Searching for: 'deep learning'
   ℹ️ Query 'deep learning' ran out of results. Fetched 500 papers.
   ✓ Found 500 papers


Fetching papers:  40%|████      | 2/5 [00:53<01:19, 26.57s/it]


🔍 Searching for: 'natural language processing'
   ℹ️ Query 'natural language processing' ran out of results. Fetched 100 papers.
   ✓ Found 100 papers


Fetching papers:  60%|██████    | 3/5 [01:06<00:41, 20.67s/it]


🔍 Searching for: 'computer vision'
   ℹ️ Query 'computer vision' ran out of results. Fetched 200 papers.
   ✓ Found 200 papers


Fetching papers:  80%|████████  | 4/5 [01:23<00:19, 19.10s/it]


🔍 Searching for: 'reinforcement learning'
   ✓ Found 1000 papers


Fetching papers: 100%|██████████| 5/5 [01:52<00:00, 22.58s/it]


📊 Total papers collected: 2300





In [5]:
# ================================
# 4. CREATE DATAFRAME
# ================================

df = pd.DataFrame(all_papers)

# Remove duplicates (same paper might appear in multiple queries)
print(f"\nBefore deduplication: {len(df)} papers")
df = df.drop_duplicates(subset=['arxiv_id'])
print(f"After deduplication: {len(df)} papers")


Before deduplication: 2300 papers
After deduplication: 2228 papers


In [6]:
# ================================
# 5. BASIC DATA QUALITY CHECKS
# ================================

print("\n" + "="*60)
print("DATA QUALITY ASSESSMENT")
print("="*60)

# Check for missing values
print("\nMissing values:")
print(df.isnull().sum())

# Abstract length statistics
df['abstract_length'] = df['abstract'].str.len()
print("\nAbstract length statistics:")
print(df['abstract_length'].describe())

# Papers per category
print("\nTop 10 categories:")
all_categories = df['categories'].str.split(', ').explode()
print(all_categories.value_counts().head(10))

# Papers over time
df['year'] = pd.to_datetime(df['published_date']).dt.year
print("\nPapers by year:")
print(df['year'].value_counts().sort_index(ascending=False).head())


DATA QUALITY ASSESSMENT

Missing values:
arxiv_id          0
title             0
abstract          0
authors           0
published_date    0
categories        0
pdf_url           0
query             0
dtype: int64

Abstract length statistics:
count    2228.000000
mean      981.642729
std       390.430930
min         3.000000
25%       680.000000
50%      1000.500000
75%      1252.250000
max      1919.000000
Name: abstract_length, dtype: float64

Top 10 categories:
categories
cs.LG      1614
cs.AI       765
stat.ML     660
cs.CV       319
cs.RO       172
cs.CL       154
cs.SY        89
cs.NE        84
eess.SY      82
cs.CR        59
Name: count, dtype: int64

Papers by year:
year
2025    146
2024    231
2023    243
2022    224
2021    228
Name: count, dtype: int64


In [7]:
# ================================
# 6. SAVE DATA
# ================================

os.makedirs('../data', exist_ok=True)
df.to_csv(OUTPUT_PATH, index=False)
print(f"\n✅ Data saved to {OUTPUT_PATH}")


✅ Data saved to ../data/arxiv_papers.csv


In [8]:

# ================================
# 7. SAMPLE PREVIEW
# ================================

print("\n" + "="*60)
print("SAMPLE PAPERS")
print("="*60)

for idx, row in df.head(3).iterrows():
    print(f"\n📄 Paper {idx + 1}:")
    print(f"   Title: {row['title'][:80]}...")
    print(f"   Authors: {row['authors'][:60]}...")
    print(f"   Categories: {row['categories']}")
    print(f"   Abstract: {row['abstract'][:150]}...")

print("\n" + "="*60)
print("DATA COLLECTION COMPLETE!")
print(f"Dataset shape: {df.shape}")
print(f"Ready for EDA in notebook 02")
print("="*60)


SAMPLE PAPERS

📄 Paper 1:
   Title: Lecture Notes: Optimization for Machine Learning...
   Authors: Elad Hazan...
   Categories: cs.LG, stat.ML
   Abstract: Lecture notes on optimization for machine learning, derived from a course at Princeton University and tutorials given in MLSS, Buenos Aires, as well a...

📄 Paper 2:
   Title: An Optimal Control View of Adversarial Machine Learning...
   Authors: Xiaojin Zhu...
   Categories: cs.LG, stat.ML
   Abstract: I describe an optimal control view of adversarial machine learning, where the dynamical system is the machine learner, the input are adversarial actio...

📄 Paper 3:
   Title: Minimax deviation strategies for machine learning and recognition with short lea...
   Authors: Michail Schlesinger, Evgeniy Vodolazskiy...
   Categories: cs.LG
   Abstract: The article is devoted to the problem of small learning samples in machine learning. The flaws of maximum likelihood learning and minimax learning are...

DATA COLLECTION COMPLETE!
Datase