In [1]:
# Import required packages (will fail if not available)
import xarray as xr
import pysnptools

print(f"✓ xarray {xr.__version__}")
print(f"✓ pysnptools {type(pysnptools)}")

✓ xarray 2025.7.1
✓ pysnptools <class 'module'>


In [2]:
# Load sample BED file using PySnpTools
from pysnptools.snpreader import Bed
from pysnptools.util import example_file

bed_file = example_file("tests/datasets/all_chr.maf0.001.N300.*")
snp_reader = Bed(bed_file, count_A1=True)
print(f"Shape: {snp_reader.shape} (individuals × SNPs)")

Shape: (300, 1015) (individuals × SNPs)


In [3]:
# Read data and convert to xarray with metadata
import numpy as np
import pandas as pd

# Read the actual genotype data
snp_data = snp_reader.read()
print(f"Data loaded: {snp_data.val.shape}")

# Extract all metadata
# Individual metadata
fid = [fid for fid, _iid in snp_data.iid]
iid = [iid for _fid, iid in snp_data.iid]

# SNP metadata (SNP IDs and positions)
sid = snp_data.sid
# Handle NaN values by converting to 0 before casting to int
chromosome = np.where(np.isnan(snp_data.pos[:, 0]), 0, snp_data.pos[:, 0]).astype(int)
cm_position = snp_data.pos[:, 1]
bp_position = np.where(np.isnan(snp_data.pos[:, 2]), 0, snp_data.pos[:, 2]).astype(int)

# Create xarray DataArray with full metadata
genotypes = xr.DataArray(
    snp_data.val,
    dims=["individual", "snp"],
    coords={
        "individual": pd.MultiIndex.from_tuples([tuple(row) for row in snp_data.iid], names=["fid", "iid"]),
        "snp": sid,
        "chromosome": (["snp"], chromosome),
        "cm_position": (["snp"], cm_position),
        "bp_position": (["snp"], bp_position),
    },
    attrs={
        "description": "Genotype data from PySnpTools BED file",
        "encoding": "0=homozygous ref, 1=heterozygous, 2=homozygous alt, NaN=missing",
        "source": bed_file,
    }
)

print("xarray DataArray created:")
display(genotypes)

Data loaded: (300, 1015)
xarray DataArray created:


In [4]:
# Import zarr explicitly and save to Zarr format
import zarr
print(f"✓ zarr {zarr.__version__}")

# Save to Zarr format for efficient storage and access
zarr_path = "all_chr.maf0.001.N300.zarr"

print(f"Saving to {zarr_path}...")
genotypes.to_zarr(zarr_path, mode='w')
print(f"✓ Saved successfully to {zarr_path}")

# Verify by loading back lazily
print("\nVerifying saved data:")
lazy_genotypes = xr.open_zarr(zarr_path)
print(f"Lazy loaded shape: {lazy_genotypes.shape}")
print(f"Coordinates preserved: {list(lazy_genotypes.coords.keys())}")
print(f"Attributes preserved: {bool(lazy_genotypes.attrs)}")

# Show that it works the same way
print(f"\nFirst individual from disk: {lazy_genotypes.individual.values[0]}")
print(f"First SNP from disk: {lazy_genotypes.snp.values[0]}")
print(f"First genotype from disk: {lazy_genotypes.values[0, 0]}")

✓ zarr 3.0.10
Saving to all_chr.maf0.001.N300.zarr...


NotImplementedError: variable 'individual' is a MultiIndex, which cannot yet be serialized. Instead, either use reset_index() to convert MultiIndex levels into coordinate variables instead or use https://cf-xarray.readthedocs.io/en/latest/coding.html.