# PILArNet Dataset

This notebook demonstrates how to download and explore the PILArNet dataset using the `panda` package's Hugging Face integration.


## 1. Downloading the Dataset

The PILArNet dataset can be downloaded directly from HuggingFace. There are several ways to do this:


In [1]:
import panda

# Quick path: name the split and let panda fetch it. The returned location is
# kept in `data_root` so later cells can point a dataset at the same files.
data_root = panda.download_pilarnet(
    split="test",
)
print(f"Data downloaded to: {data_root}")

ImportError: cannot import name 'download_pilarnet' from 'panda.data' (/sdf/group/neutrino/youngsam/representations/panda/panda/data.py)

### Alternative: Auto-download via PILArNetH5Dataset

The dataset class can also auto-download data when initialized without a `data_root`:


In [None]:
# No `data_root` is passed here, so locating/downloading the "test" split is
# left to the dataset class itself.
dataset = panda.PILArNetH5Dataset(
    split="test",
    energy_threshold=0.13,
)

Or, if you keep all your datasets in a single location, you can pass that directory in explicitly as `data_root`:

In [None]:
# Build the dataset from an explicit `data_root` (here, the path returned by
# the download cell) rather than letting the class pick the location.
dataset = panda.PILArNetH5Dataset(
    data_root=data_root,
    split="test",
    energy_threshold=0.13,
)

### Advanced: Using PILArNetHFInterface

For more control, use the interface directly:


In [None]:
from panda import PILArNetHFInterface

# Cap on how many file names are echoed below; any remaining files are only counted.
MAX_FILES_SHOWN = 10

# Work with the Hugging Face repository directly instead of going through the dataset class.
interface = PILArNetHFInterface(
    repo_id="deeplearnphysics/pilarnet-m",
)

# list the h5 files in the repository
files = interface.list_files("**/*.h5")
print(f"Found {len(files)} h5 files:")
for f in files[:MAX_FILES_SHOWN]:
    print(f"  {f}")
if len(files) > MAX_FILES_SHOWN:
    # summarise whatever was cut off by the preview limit
    print(f"  ... and {len(files) - MAX_FILES_SHOWN} more")

## 2. Exploring the Dataset


In [None]:
import numpy as np

# One-shot summary of the loaded dataset: event count, storage location and
# number of backing H5 files (one line each).
print(
    f"Dataset size: {len(dataset)} events",
    f"Data root: {dataset.data_root}",
    f"H5 files: {len(dataset.h5_files)}",
    sep="\n",
)


In [None]:
# Fixed seed so that "Restart Kernel -> Run All" always inspects the same
# event (and therefore reproduces the same figures further down).
# Set EVENT_SEED to None to draw a fresh event on every run.
EVENT_SEED = 0
rng = np.random.default_rng(EVENT_SEED)

# get a single event: index drawn uniformly from [0, len(dataset)),
# converted to a plain Python int before indexing the dataset
idx = int(rng.integers(0, len(dataset)))
data = dataset[idx]

print(f"Event {idx} keys:")
for k, v in data.items():
    if hasattr(v, 'shape'):
        # array-like entries: report shape and dtype
        print(f"  {k}: {v.shape} ({v.dtype})")
    else:
        # everything else: report only the Python type name
        print(f"  {k}: {type(v).__name__}")


## 3. Dataset Labels

PILArNet includes several label types:

**Semantic (motif) classes:**
- 0: Shower
- 1: Track
- 2: Michel
- 3: Delta
- 4: Low energy deposit

**Particle ID (PID) classes:**
- 0: Photon
- 1: Electron
- 2: Muon
- 3: Pion
- 4: Proton
- 5: None (Low energy deposit)


In [None]:
# Display names for the label indices; position i names class index i.
MOTIF_CLASSES = ['Shower', 'Track', 'Michel', 'Delta', 'LED']
PID_CLASSES = ['Photon', 'Electron', 'Muon', 'Pion', 'Proton', 'None']


def print_label_distribution(title, labels, class_names):
    """Print how many points carry each class index, with percentages.

    Args:
        title: Heading printed before the per-class lines.
        labels: Flat NumPy array of class indices, one entry per point.
        class_names: Display names; position ``i`` names class index ``i``.
    """
    total = labels.size
    print(title)
    for i, name in enumerate(class_names):
        count = int((labels == i).sum())
        # an empty event has no meaningful percentage: report 0.0 rather than divide by zero
        pct = 100 * count / total if total else 0.0
        print(f"  {name}: {count} ({pct:.1f}%)")


# count label distribution
# reshape(-1) instead of squeeze(): squeeze() would collapse a one-point event
# to a 0-d array, on which len()/masking no longer behave like a list of points.
motif_labels = data['segment_motif'].reshape(-1).numpy()
pid_labels = data['segment_particle'].reshape(-1).numpy()

print_label_distribution("Motif distribution:", motif_labels, MOTIF_CLASSES)
print_label_distribution("\nPID distribution:", pid_labels, PID_CLASSES)


## 4. Visualization


In [None]:
import plotly.graph_objects as go
import plotly.express as px

# Pull the per-point quantities of the selected event out as NumPy arrays.
# `coords` is kept 2-D: the plotting cells index it as coords[:, 0..2].
coords = data['coord'].numpy()

# The remaining quantities are flattened to 1-D so they can be used as
# per-point colours and boolean masks. reshape(-1) is used instead of
# squeeze() because squeeze() would turn a one-point event into a 0-d array
# and break the masking in the cells below.
energy = data['energy'].reshape(-1).numpy()
motif = data['segment_motif'].reshape(-1).numpy()
pid = data['segment_particle'].reshape(-1).numpy()
instance = data['instance_particle'].reshape(-1).numpy()


### Energy Visualization


In [None]:
# Colour every point of the event by its energy value.
energy_marker = {
    "size": 2,
    "color": energy,
    "colorscale": 'Viridis',
    "colorbar": {"title": 'Energy'},
    "opacity": 0.8,
}
energy_trace = go.Scatter3d(
    x=coords[:, 0],
    y=coords[:, 1],
    z=coords[:, 2],
    mode='markers',
    marker=energy_marker,
)

fig = go.Figure(data=[energy_trace])
fig.update_layout(
    title=f'Event {idx}: Energy Deposition',
    scene={"xaxis_title": 'X', "yaxis_title": 'Y', "zaxis_title": 'Z'},
    width=800,
    height=600,
)
fig.show()


### Semantic Labels (Motif)


In [None]:
# One hex colour per motif class, looked up by class index.
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']

fig = go.Figure()
for i, name in enumerate(MOTIF_CLASSES):
    mask = motif == i
    if not mask.any():
        # class absent from this event: no trace, so no empty legend entry
        continue
    class_trace = go.Scatter3d(
        x=coords[mask, 0],
        y=coords[mask, 1],
        z=coords[mask, 2],
        mode='markers',
        marker={"size": 2, "color": colors[i], "opacity": 0.8},
        name=name,
    )
    fig.add_trace(class_trace)

fig.update_layout(
    title=f'Event {idx}: Semantic Labels (Motif)',
    scene={"xaxis_title": 'X', "yaxis_title": 'Y', "zaxis_title": 'Z'},
    width=800,
    height=600,
)
fig.show()


### Particle ID Labels

In [None]:
# One hex colour per PID class, looked up by class index.
colors = ['#e41a1c', '#377eb8', '#4daf4a', '#984ea3', '#ff7f00', '#999999']

fig = go.Figure()
for i, name in enumerate(PID_CLASSES):
    mask = pid == i
    if not mask.any():
        # no points with this PID in the event: nothing to draw
        continue
    pid_trace = go.Scatter3d(
        x=coords[mask, 0],
        y=coords[mask, 1],
        z=coords[mask, 2],
        mode='markers',
        marker={"size": 2, "color": colors[i], "opacity": 0.8},
        name=name,
    )
    fig.add_trace(pid_trace)

fig.update_layout(
    title=f'Event {idx}: Particle ID Labels',
    scene={"xaxis_title": 'X', "yaxis_title": 'Y', "zaxis_title": 'Z'},
    width=800,
    height=600,
)
fig.show()


### Particle Instances

In [None]:
# Distinct instance ids present in the event; each gets its own trace.
unique_instances = np.unique(instance)
n_instances = len(unique_instances)
colorscale = px.colors.qualitative.Dark24
n_colors = len(colorscale)

fig = go.Figure()
for i, inst_id in enumerate(unique_instances):
    mask = instance == inst_id
    # name the instance after the PID of its first point
    inst_pid = pid[mask][0]
    inst_name = PID_CLASSES[inst_pid]
    # palette is finite: wrap around when there are more instances than colours
    color = colorscale[i % n_colors]
    instance_trace = go.Scatter3d(
        x=coords[mask, 0],
        y=coords[mask, 1],
        z=coords[mask, 2],
        mode='markers',
        marker={"size": 2, "color": color, "opacity": 0.8},
        name=f'{inst_name} (inst {inst_id})',
    )
    fig.add_trace(instance_trace)

fig.update_layout(
    title=f'Event {idx}: Particle Instances ({n_instances} particles)',
    scene={"xaxis_title": 'X', "yaxis_title": 'Y', "zaxis_title": 'Z'},
    width=800,
    height=600,
)
fig.show()


## 5. Using with DataLoader


In [None]:
from torch.utils.data import DataLoader

# Loader settings gathered in one place: shuffled batches of four events,
# assembled by panda's collate function, loaded in the main process.
loader_kwargs = {
    "batch_size": 4,
    "shuffle": True,
    "collate_fn": panda.utils.collate_fn,
    "num_workers": 0,
}
dataloader = DataLoader(dataset, **loader_kwargs)

# Pull one batch and list the shape of every entry that has one.
batch = next(iter(dataloader))
print("Batch keys:")
for k, v in batch.items():
    if not hasattr(v, 'shape'):
        # entries without a shape are skipped silently
        continue
    print(f"  {k}: {v.shape}")
