In [1]:
from scientific_rag.application.data_loader import DataLoader

## Load Sample Papers

In [2]:
# Create a loader restricted to the "arxiv" split, then request a ten-paper
# sample from the train partition for the inspection cells that follow.
loader = DataLoader(split="arxiv")
papers = loader.load_papers(
    data_split="train",
    sample_size=10,
)

[32m2025-12-12 15:15:22.992[0m | [1mINFO    [0m | [36mscientific_rag.application.data_loader[0m:[36mload_papers[0m:[36m48[0m - [1mLoading arxiv papers from armanc/scientific_papers (train split)[0m


Downloading data:   0%|          | 0.00/3.62G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/880M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/203037 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6436 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6440 [00:00<?, ? examples/s]

[32m2025-12-12 16:05:43.165[0m | [1mINFO    [0m | [36mscientific_rag.application.data_loader[0m:[36mload_papers[0m:[36m59[0m - [1mSampling 10 papers[0m
Loading papers: 100%|██████████| 10/10 [00:00<00:00, 200.36it/s]
[32m2025-12-12 16:05:43.232[0m | [1mINFO    [0m | [36mscientific_rag.application.data_loader[0m:[36mload_papers[0m:[36m77[0m - [1mLoaded 10 papers[0m


In [3]:
# Report how many papers came back before touching the first element, so the
# count is still shown even if the list turns out to be empty.
print(f"Loaded {len(papers)} papers")

# Identify the first paper and where it came from.
first_paper = papers[0]
print(f"First paper ID: {first_paper.paper_id}")
print(f"Source: {first_paper.source}")

Loaded 10 papers
First paper ID: arxiv_0
Source: arxiv


## Inspect Paper Structure

In [4]:
sample_paper = papers[0]

# One row per preview: (heading, attribute to read, max characters shown,
# trailing marker). The marker is passed to print() as a second argument, so a
# space separates it from the excerpt; it is printed whether or not the text
# was actually shortened.
previews = (
    ("Abstract:", "abstract", 200, "...\n"),
    ("Section Names:", "section_names", 200, "...\n"),
    ("Article (first 300 chars):", "article", 300, "..."),
)

for heading, attribute, limit, marker in previews:
    print(heading)
    print(getattr(sample_paper, attribute)[:limit], marker)

Abstract:
 additive models play an important role in semiparametric statistics . 
 this paper gives learning rates for regularized kernel based methods for additive models . 
 these learning rates compare favou ...

Section Names:
introduction
main results on learning rates
comparison of learning rates ...

Article (first 300 chars):
additive models @xcite provide an important family of models for semiparametric regression or classification . some reasons for the success of additive models are their increased flexibility when compared to linear or generalized linear models and their increased interpretability when compared to fu ...


## Statistics

In [5]:
import numpy as np


# Character counts (len of each text field) for every loaded paper.
abstract_lengths = [len(paper.abstract) for paper in papers]
article_lengths = [len(paper.article) for paper in papers]

# np.std is called with its default ddof=0, i.e. the population standard
# deviation of the sampled papers.
abstract_mean, abstract_std = np.mean(abstract_lengths), np.std(abstract_lengths)
print(f"Abstract length - mean: {abstract_mean:.0f}, std: {abstract_std:.0f}")

article_mean, article_std = np.mean(article_lengths), np.std(article_lengths)
print(f"Article length - mean: {article_mean:.0f}, std: {article_std:.0f}")

Abstract length - mean: 2515, std: 4761
Article length - mean: 25667, std: 11867


## Test Both Sources

In [6]:
# A loader built with default arguments, used to pull five papers per source
# from the train partition in a single call.
loader_both = DataLoader()
papers_both = loader_both.load_both_sources(
    data_split="train",
    sample_size_per_source=5,
)

# Tally the papers by the string value of their source.
arxiv_count = len([paper for paper in papers_both if paper.source.value == "arxiv"])
pubmed_count = len([paper for paper in papers_both if paper.source.value == "pubmed"])

print(f"Total papers: {len(papers_both)}")
print(f"ArXiv: {arxiv_count}, PubMed: {pubmed_count}")

[32m2025-12-12 16:05:43.479[0m | [1mINFO    [0m | [36mscientific_rag.application.data_loader[0m:[36mload_papers[0m:[36m48[0m - [1mLoading arxiv papers from armanc/scientific_papers (train split)[0m
[32m2025-12-12 16:05:45.910[0m | [1mINFO    [0m | [36mscientific_rag.application.data_loader[0m:[36mload_papers[0m:[36m59[0m - [1mSampling 5 papers[0m
Loading papers: 100%|██████████| 5/5 [00:00<00:00, 12343.45it/s]
[32m2025-12-12 16:05:45.914[0m | [1mINFO    [0m | [36mscientific_rag.application.data_loader[0m:[36mload_papers[0m:[36m77[0m - [1mLoaded 5 papers[0m
[32m2025-12-12 16:05:45.915[0m | [1mINFO    [0m | [36mscientific_rag.application.data_loader[0m:[36mload_papers[0m:[36m48[0m - [1mLoading pubmed papers from armanc/scientific_papers (train split)[0m


Generating train split:   0%|          | 0/119924 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6633 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6658 [00:00<?, ? examples/s]

[32m2025-12-12 16:06:04.070[0m | [1mINFO    [0m | [36mscientific_rag.application.data_loader[0m:[36mload_papers[0m:[36m59[0m - [1mSampling 5 papers[0m
Loading papers: 100%|██████████| 5/5 [00:00<00:00, 6983.52it/s]
[32m2025-12-12 16:06:04.074[0m | [1mINFO    [0m | [36mscientific_rag.application.data_loader[0m:[36mload_papers[0m:[36m77[0m - [1mLoaded 5 papers[0m
[32m2025-12-12 16:06:04.074[0m | [1mINFO    [0m | [36mscientific_rag.application.data_loader[0m:[36mload_both_sources[0m:[36m110[0m - [1mLoaded 10 papers from both sources[0m


Total papers: 10
ArXiv: 5, PubMed: 5
