# Download Sample Data

This notebook downloads sample PDF files needed for the RAG training modules.

In [0]:
%run ./Setup

In [0]:
# Download sample PDF files for RAG training
# These are research papers from ArXiv that will be used in the training

import urllib.request
import os

# Sample Generative AI papers hosted on ArXiv, written as a
# download-URL -> target-file-name mapping and flattened into
# (url, filename) pairs in declaration order.
arxiv_papers = list(
    {
        "https://arxiv.org/pdf/2309.07930.pdf": "Generative_AI_Survey.pdf",
        "https://arxiv.org/pdf/2303.08774.pdf": "GPT_4_Technical_Report.pdf",
        "https://arxiv.org/pdf/2302.13971.pdf": "LLaMA_Open_Foundation.pdf",
    }.items()
)

# Create arxiv-articles directory in volume if it doesn't exist
articles_dir = f"{DA.paths.arxiv}/arxiv-articles"
dbutils.fs.mkdirs(articles_dir)

# Upper bound (seconds) for each blocking network operation of a download,
# so a stalled connection cannot hang the notebook indefinitely.
DOWNLOAD_TIMEOUT_SECONDS = 60

# File names that could not be fetched; used for an honest final summary.
failed_downloads = []

print(f"Downloading sample PDF files to {articles_dir}...")

# Download each PDF. Every file is handled best-effort: a failure is recorded
# and reported, but it does not prevent the remaining files from being tried.
for url, filename in arxiv_papers:
    file_path = f"{articles_dir}/{filename}"
    # Staging location on the driver's local disk; copied into the volume below.
    local_path = f"/tmp/{filename}"
    try:
        # Compare the exact entry name. A suffix match on the full path would
        # also accept unrelated files such as "Old_<filename>".
        if any(finfo.name == filename for finfo in dbutils.fs.ls(articles_dir)):
            print(f"✓ {filename} already exists")
            continue

        print(f"Downloading {filename}...")
        with urllib.request.urlopen(url, timeout=DOWNLOAD_TIMEOUT_SECONDS) as response:
            payload = response.read()

        # Reject HTML error / rate-limit pages that are served with HTTP 200;
        # a real PDF carries the "%PDF" marker at (or very near) the start.
        if b"%PDF" not in payload[:1024]:
            raise ValueError("response is not a PDF document")

        with open(local_path, "wb") as local_file:
            local_file.write(payload)

        dbutils.fs.cp(f"file:{local_path}", file_path)
        print(f"✓ Downloaded {filename}")
    except Exception as e:
        failed_downloads.append(filename)
        print(f"⚠ Error downloading {filename}: {e}")
    finally:
        # Never leave the (possibly partial) staging file behind on the driver.
        if os.path.exists(local_path):
            os.remove(local_path)

# Only claim success when every file is actually in place.
if failed_downloads:
    print(f"\n⚠ Sample data setup finished with errors. Missing files: {', '.join(failed_downloads)}")
else:
    print("\n✓ Sample data setup complete!")
print(f"Files are available at: {articles_dir}")