In [23]:
# Import required packages (will fail if not available)
import importlib.metadata

import xarray as xr
import pysnptools

# Report the installed version of each package. The pysnptools version is looked
# up from its installed distribution metadata, because printing type(pysnptools)
# only ever shows "<class 'module'>" and says nothing about the version in use.
print(f"✓ xarray {xr.__version__}")
print(f"✓ pysnptools {importlib.metadata.version('pysnptools')}")

✓ xarray 2025.7.1
✓ pysnptools <class 'module'>


In [36]:
import numpy as np
from pysnptools.snpreader import Bed
from pysnptools.util import example_file

# Ask PySnpTools for its sample BED dataset and remember the returned path.
bed_file = example_file("tests/datasets/all_chr.maf0.001.N300.*")

# Open a reader on that file (count_A1=True is handed to Bed as-is).
snp_reader = Bed(bed_file, count_A1=True)

# Report the reader's dimensions.
reader_shape = snp_reader.shape
print(f"Shape: {reader_shape} (individuals × SNPs)")

Shape: (300, 1015) (individuals × SNPs)


In [40]:
# Load the complete genotype matrix from disk into memory and show it.
snp_data = snp_reader.read()
print(snp_data.val)

# Alternatively, select every second individual and SNPs (variants) 20 through 29.
every_other_individual = slice(None, None, 2)
snp_window = slice(20, 30)
subset = snp_reader[every_other_individual, snp_window]
print(f"Subset shape: {subset.shape} (individuals × SNPs)")

# Show the first 5 individual (sample) ids, the first 5 SNP (variant) ids,
# and each distinct chromosome. Finally, read every value on chromosome 5.
chromosome_of_snp = snp_reader.pos[:, 0]  # first column of pos holds the chromosome
print(snp_reader.iid[:5])
print(snp_reader.sid[:5])
print(np.unique(chromosome_of_snp))
val3 = snp_reader[:, chromosome_of_snp == 5].read().val
print(val3.shape)

[[0. 0. 1. ... 1. 1. 0.]
 [0. 0. 0. ... 1. 1. 0.]
 [0. 0. 0. ... 1. 1. 0.]
 ...
 [0. 0. 0. ... 0. 2. 0.]
 [0. 0. 0. ... 1. 1. 0.]
 [0. 0. 0. ... 0. 2. 0.]]
Subset shape: (150, 10) (individuals × SNPs)
[['POP1' '0']
 ['POP1' '12']
 ['POP1' '44']
 ['POP1' '58']
 ['POP1' '65']]
['1_12' '1_34' '1_10' '1_35' '1_28']
[ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17. 18.
 19. 20. 21. 22. 23.]
(300, 43)


In [25]:
# Convert PySnpTools BED data to xarray Dataset with full genomic metadata
import numpy as np

# Read the genotype matrix from disk exactly once. The ids and positions used
# below come from the reader's own metadata attributes (iid, sid, pos), so the
# genotype data does not have to be re-read just to obtain them.
genotype_values = snp_reader.read().val
sample_ids = snp_reader.iid    # one row per individual: column 0 = family id, column 1 = individual id
snp_positions = snp_reader.pos  # one row per SNP: chromosome, genetic position, physical position

# Create xarray Dataset using numeric indexes (avoids MultiIndex complexity and ensures Zarr compatibility)
xarray_form = xr.Dataset(
    {
        "genotypes": (["individual", "snp"], genotype_values)
    },
    coords={
        "individual": range(snp_reader.iid_count),
        "fid": (["individual"], sample_ids[:, 0]),  # Family IDs
        "iid": (["individual"], sample_ids[:, 1]),  # Individual IDs

        "snp": range(snp_reader.sid_count),
        "sid": (["snp"], snp_reader.sid),                                             # SNP IDs
        "chromosome": (["snp"], np.nan_to_num(snp_positions[:, 0], nan=0).astype(int)),  # Chromosome (NaN→0)
        "cm_position": (["snp"], snp_positions[:, 1]),                                # Genetic position
        "bp_position": (["snp"], np.nan_to_num(snp_positions[:, 2], nan=0).astype(int)), # Physical position (NaN→0)
    },
    attrs={
        "description": "Genotype data from PySnpTools BED file",
        "encoding": "0=homozygous ref, 1=heterozygous, 2=homozygous alt, NaN=missing",
        "source": bed_file,
    }
)

display(xarray_form)

In [26]:
# Bring in zarr explicitly, then persist the Dataset in Zarr format
import warnings
import zarr

print(f"✓ zarr {zarr.__version__}")
# UserWarning messages coming from zarr (the unicode warnings) can be ignored
warnings.filterwarnings(action="ignore", category=UserWarning, module="zarr")

# Write the Dataset out as a Zarr store for efficient storage and access
zarr_path = "all_chr.maf0.001.N300.zarr"
xarray_form.to_zarr(store=zarr_path, mode="w")

# Check the result by lazily opening the store that was just written
zarr_form = xr.open_zarr(zarr_path)
display(zarr_form)


✓ zarr 3.0.10


In [27]:
# Pull the full genotype matrix out of the Zarr-backed Dataset
zarr_form["genotypes"].values  # the cell's last expression, so the array is displayed

array([[0., 0., 1., ..., 1., 1., 0.],
       [0., 0., 0., ..., 1., 1., 0.],
       [0., 0., 0., ..., 1., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 2., 0.],
       [0., 0., 0., ..., 1., 1., 0.],
       [0., 0., 0., ..., 0., 2., 0.]], shape=(300, 1015))

In [28]:
# Take every second individual and SNPs (variants) 20 through 29,
# selecting by dimension name rather than by axis position.
val2 = zarr_form.genotypes.isel(
    individual=slice(None, None, 2),
    snp=slice(20, 30),
).values
val2.shape

(150, 10)

In [29]:
# Show the first 5 individual (sample) ids, the first 5 SNP (variant) ids,
# and each distinct chromosome. Finally, show every genotype value on chromosome 5.
first_individual_ids = zarr_form["iid"][:5].values
first_snp_ids = zarr_form["sid"][:5].values
print(first_individual_ids)
print(first_snp_ids)
print(np.unique(zarr_form["chromosome"]))
on_chromosome_5 = zarr_form["chromosome"] == 5  # boolean mask along the snp dimension
print(zarr_form["genotypes"][:, on_chromosome_5].values)

['0' '12' '44' '58' '65']
['1_12' '1_34' '1_10' '1_35' '1_28']
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
[[0. 0. 0. ... 0. 1. 0.]
 [1. 0. 0. ... 0. 1. 0.]
 [1. 0. 0. ... 1. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [30]:
# Next step: look at the sgkit getting-started guide: https://sgkit-dev.github.io/sgkit/latest/getting_started.html#getting-started