<div style="display: flex; justify-content: center; margin-bottom: 20px;">
  <img src="../docs/_static/seispolarity_logo_title.svg">
</div>

---

## Dataset API Usage Example - PNW

Import necessary libraries

In [11]:
# Import necessary libraries for data processing and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import h5py
from pathlib import Path
import sys

# Append the parent of the current working directory to the import path so the
# `seispolarity` import below can resolve to a local checkout.
# NOTE(review): presumably the parent is the repository root when the notebook
# is run from its own folder — confirm.
sys.path.append(str(Path.cwd().parent))
from seispolarity.data import PNW

## PNW Dataset Automatic Download and Processing

Set the output directory and create the PNW processor with automatic download enabled:

- `auto_download=True`: automatically download missing data (if the CSV and HDF5 files do not exist)
- `use_hf=False`: use ModelScope (`True` uses Hugging Face)
- `force_download=False`: do not force re-download (`True` will overwrite existing files)
- `component_order="ENZ"`: original component order in the HDF5 file (E-N-Z)
- `component="Z"`: extract the Z component (vertical component)
- `sampling_rate=100`: target sampling rate of 100 Hz

In [None]:
# Set the output directory and create the PNW processor.
#
# Processor options used below:
#   auto_download=True     automatically download missing data
#   use_hf=False           use ModelScope instead of Hugging Face
#   force_download         see FORCE_DOWNLOAD
#   component_order="ENZ"  original component order in the HDF5 file
#   component="Z"          extract the vertical component
#   sampling_rate=100      target sampling rate in Hz

# Keep this False so re-running the notebook reuses files that are already on
# disk; set it to True only when the existing files must be re-downloaded and
# overwritten.
FORCE_DOWNLOAD = False

output_dir = Path('datasets/PNW')
output_dir.mkdir(parents=True, exist_ok=True)

csv_path = output_dir / 'PNW.csv'
hdf5_path = output_dir / 'PNW.hdf5'

print('Creating PNW processor...')
processor = PNW(
    csv_path=str(csv_path),
    hdf5_path=str(hdf5_path),
    output_polarity=str(output_dir),
    component_order='ENZ',
    component='Z',
    sampling_rate=100,
    auto_download=True,
    use_hf=False,
    force_download=FORCE_DOWNLOAD
)

print('Starting processing...')
processor.process()

## View Processed PNW Data

Read the processed HDF5 file and view the basic dataset information and the polarity distribution.

In [None]:
# Read the processed HDF5 file and inspect the dataset.
# X: waveform data, Y: polarity labels, p_pick: P-wave arrival points
with h5py.File(processor.output_polarity, 'r') as f:
    # `[:]` copies each dataset into memory so it stays usable after the file closes.
    X = f['X'][:]
    Y = f['Y'][:]
    p_pick = f['p_pick'][:]

print(f"Dataset shape: {X.shape}")
print(f"Label shape: {Y.shape}")
print(f"P-pick shape: {p_pick.shape}")
print("\nPolarity distribution:")

# Label codes used in this notebook: 0 = positive, 1 = negative, 2 = undecidable.
# A dict lookup (instead of list indexing) keeps unexpected codes such as
# negative or non-integer values from being mapped to a wrong name or raising;
# they are printed verbatim instead.
label_names = {0: 'positive', 1: 'negative', 2: 'undecidable'}
unique, counts = np.unique(Y, return_counts=True)
for u, c in zip(unique, counts):
    print(f"  {label_names.get(u, str(u))}: {c}")

## Balanced Sampling for PNW Dataset

For the PNW dataset, the **min-based** balanced sampling strategy is recommended to ensure equal proportions of positive, negative, and undecidable samples.

**Min-Based Strategy**:

- Count the samples in each polarity class (positive, negative, undecidable)
- Determine the minimum count among all classes
- Sample equally from each class up to the minimum count
- Final distribution: positive = 1/3, negative = 1/3, undecidable = 1/3

In [None]:
# Create a balanced dataloader for the PNW dataset using the min-based strategy.
# WaveformDataset is imported here so this cell runs on a fresh kernel without
# depending on a cell further down the notebook.
from seispolarity.data import WaveformDataset
from seispolarity.generate import BalancedPolarityGenerator

# Load the processed PNW data with all three labels (positive, negative,
# undecidable). The path is taken directly from the processor so the cell does
# not rely on a variable that is only defined later.
pnw_datasets_full = WaveformDataset(
    path=processor.output_polarity,
    name="PNW_Full",
    preload=True,
    allowed_labels=[0, 1, 2],  # Include all three labels
    data_key="X",
    label_key="Y",
    p_pick_position=None,
    pick_key="p_pick",
    crop_left=200,
    crop_right=200
)

# Create balanced generator using min-based strategy
balanced_generator = BalancedPolarityGenerator(
    pnw_datasets_full,
    strategy="min_based"  # Recommended for PNW dataset
)

# Get balanced dataloader
balanced_loader = balanced_generator.get_dataloader(
    batch_size=256,
    num_workers=4,
    shuffle=True
)

print(f"Original dataset size: {len(pnw_datasets_full)}")
print(f"Label distribution: {pnw_datasets_full.label_distribution}")
print(f"Balanced dataset size: {len(balanced_generator)}")
print("Balanced dataloader created successfully!")

## Load PNW Data Using WaveformDataset

Load the processed PNW data with the `WaveformDataset` class, which provides convenient access to individual samples.

In [None]:
# Load PNW data using the WaveformDataset class.
# preload=True: load all data into memory
# allowed_labels=[0,1]: only use positive and negative labels
# crop_left=200, crop_right=200: extract 400 samples centered at P-wave arrival
from seispolarity.data import WaveformDataset

# Keep the file path and the dataset object under separate names so that
# re-running this cell never passes a dataset object where a path is expected.
pnw_hdf5_path = processor.output_polarity

# PNW Datasets
pnw_datasets = WaveformDataset(
    path=pnw_hdf5_path,
    name="PNW",
    preload=True,
    allowed_labels=[0,1],
    data_key="X",
    label_key="Y",
    p_pick_position=None,
    pick_key="p_pick",  # Use p_pick as P-wave arrival point
    crop_left=200,   # e.g. for a pick at sample 300: 300 - 200 = 100
    crop_right=200   # e.g. for a pick at sample 300: 300 + 200 = 500 (total 400 sampling points)
)

# View data
print(f"Dataset size: {len(pnw_datasets)}")
print("\nAccess first sample:")
waveform, metadata = pnw_datasets[0]
print(f"Waveform data shape: {waveform.shape}")


