In [1]:
# Import required packages (will fail if not available)
import xarray as xr
import pysnptools

print(f"✓ xarray {xr.__version__}")
print(f"✓ pysnptools {type(pysnptools)}")

✓ xarray 2025.7.1
✓ pysnptools <class 'module'>


In [2]:
# Get sample BED file using PySnpTools
from pysnptools.snpreader import Bed
from pysnptools.util import example_file

bed_file = example_file("tests/datasets/all_chr.maf0.001.N300.*")
snp_reader = Bed(bed_file, count_A1=True)
print(f"Shape: {snp_reader.shape} (individuals × SNPs)")

Shape: (300, 1015) (individuals × SNPs)


In [9]:
# Convert PySnpTools BED data to xarray Dataset with full genomic metadata
import numpy as np

# Create xarray Dataset using numeric indexes (avoids MultiIndex complexity and ensures Zarr compatibility)
xarray_form = xr.Dataset(
    {
        "genotypes": (["individual", "snp"], snp_reader.read().val)
    },
    coords={
        "individual": range(snp_reader.iid_count),
        "fid": (["individual"], [fid for fid, _iid in snp_reader.read().iid]),  # Family IDs
        "iid": (["individual"], [iid for _fid, iid in snp_reader.read().iid]),  # Individual IDs
        
        "snp": range(snp_reader.sid_count),
        "sid": (["snp"], snp_reader.sid),                                                 # SNP IDs
        "chromosome": (["snp"], np.nan_to_num(snp_reader.pos[:, 0], nan=0).astype(int)),  # Chromosome (NaN→0)
        "cm_position": (["snp"], snp_reader.pos[:, 1]),                                   # Genetic position
        "bp_position": (["snp"], np.nan_to_num(snp_reader.pos[:, 2], nan=0).astype(int)), # Physical position (NaN→0)
    },
    attrs={
        "description": "Genotype data from PySnpTools BED file",
        "encoding": "0=homozygous ref, 1=heterozygous, 2=homozygous alt, NaN=missing",
        "source": bed_file,
    }
)

display(xarray_form)

In [10]:
# Import zarr explicitly and save to Zarr format
import zarr
print(f"✓ zarr {zarr.__version__}")

# Save to Zarr format for efficient storage and access
zarr_path = "all_chr.maf0.001.N300.zarr"
xarray_form.to_zarr(zarr_path, mode='w')

# Verify by loading back lazily
zarr_form = xr.open_zarr(zarr_path)
zarr_form


✓ zarr 3.0.10


  return cls(**configuration_parsed)
  meta = AsyncArray._create_metadata_v3(
  return cls(**configuration_parsed)
  meta = AsyncArray._create_metadata_v3(
  meta = AsyncArray._create_metadata_v3(
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)


In [None]:
print(f"Dataset loaded with data variables: {list(lazy_genotypes.data_vars.keys())}")
print(f"Dataset shape: {lazy_genotypes.sizes}")
print(f"Coordinates preserved: {list(lazy_genotypes.coords.keys())}")
print(f"Attributes preserved: {bool(lazy_genotypes.attrs)}")

# Show that it works the same way
print(f"\nFirst individual from disk: ({lazy_genotypes.fid.values[0]}, {lazy_genotypes.iid.values[0]})")
print(f"First SNP from disk: {lazy_genotypes.sid.values[0]}")
print(f"First genotype from disk: {lazy_genotypes.genotypes.values[0, 0]}")

# Show efficient filtering examples
print(f"\nFiltering examples:")
print("# By chromosome: lazy_genotypes.where(lazy_genotypes.chromosome == 1, drop=True)")
print("# By family: lazy_genotypes.where(lazy_genotypes.fid == 'POP1', drop=True)")
print("# By SNP ID: lazy_genotypes.where(lazy_genotypes.sid.isin(['1_12', '1_34']), drop=True)")