This notebook loads some genomic data from a single user from a 23andMe file and stores it on the Nillion platform.

The objective is to benchmark the storage capacity and performance of the Nillion platform for large datasets.

Let's install the necessary libraries.

In [1]:
!pip install numpy
!pip install nada_numpy

Collecting numpy
  Downloading numpy-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
Successfully installed numpy-2.1.0
Collecting nada_numpy
  Downloading nada_numpy-0.4.0-py3-none-any.whl (23 kB)
Collecting numpy<2.0.0,>=1.26.4
  Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: numpy, nada_numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.1.0
    Uninstalling numpy-2.1.0:
      Successfully uninstalled numpy-2.1.0
Successfully installed nada_numpy-0.4.0 numpy-1.26.4


Let's load the genomic data from the sample 23andMe file.

In [2]:
import os
import numpy as np
import re

def read_and_process_23andme(file_path, num_lines=None):
    # Check if the file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file {file_path} does not exist.")

    # Genotype to integer mapping
    genotype_to_int = {
        "AA": 0, "AC": 1, "AG": 2, "AT": 3,
        "CC": 4, "CG": 5, "CT": 6,
        "GG": 7, "GT": 8,
        "TT": 9
    }

    results = []

    with open(file_path, 'r', encoding='utf-8') as file:
        lines_read = 0
        for line in file:
            if num_lines is not None and lines_read >= num_lines:
                break

            if line.startswith('#'):
                continue

            parts = line.strip().split('\t')
            if len(parts) < 4:
                continue

            rsid, chromosome, position, genotype = parts

            # Extract only the numbers from rsid
            rsid_int = int(re.sub(r'\D', '', rsid))

            genotype_int = genotype_to_int.get(genotype, -1)  # Use -1 for unrecognized genotypes

            # Convert chromosome to int if possible, otherwise use -1
            try:
                chrom_int = int(chromosome)
            except ValueError:
                chrom_int = -1  # Use -1 for non-numeric chromosomes (e.g., 'X', 'Y', 'MT')

            results.append([rsid_int, chrom_int, int(position), genotype_int])

            lines_read += 1

    return np.array(results, dtype=int)

In [3]:
gene_data = read_and_process_23andme('testdata/hu278AF5_20210124151934.txt', 1000000)
gene_data[:5, :]

array([[548049170,         1,     69869,         9],
       [  9283150,         1,    565508,         0],
       [116587930,         1,    727841,         7],
       [  3131972,         1,    752721,         2],
       [ 12184325,         1,    754105,         4]])

In [4]:
gene_data.shape

(631955, 4)

In [5]:
gene_data[gene_data[:, 0] == 1815739]

array([[ 1815739,       11, 66328095,        6]])

In [10]:
target_snp_1 = 1815739
target_genotype_1 = 6

boolean_mask = gene_data[:, 0] == target_snp_1
actual_genotype_1 = np.dot(boolean_mask, gene_data[:, 3])
actual_genotype_1

6

In [11]:

is_successful = actual_genotype_1 == target_genotype_1
print(is_successful)

True


Let's do some Nillon setup.

In [47]:
import os
import py_nillion_client as nillion
from py_nillion_client import NodeKey, UserKey
from dotenv import load_dotenv
from nillion_python_helpers import get_quote_and_pay, create_nillion_client, create_payments_config
from cosmpy.aerial.client import LedgerClient
from cosmpy.aerial.wallet import LocalWallet
from cosmpy.crypto.keypairs import PrivateKey
import uuid

home_dir = os.path.expanduser("~")
env_path = os.path.join(home_dir, ".config", "nillion", "nillion-devnet.env")

print(f"Loading environment variables from {env_path}")

load_dotenv(env_path)
for key, value in os.environ.items():
    if key.startswith("NILLION_"):
        print(f"{key}: {value}")

cluster_id = os.getenv('NILLION_CLUSTER_ID')
chain_id = os.getenv('NILLION_NILCHAIN_CHAIN_ID')
grpc_endpoint = os.getenv('NILLION_NILCHAIN_GRPC')

monadic_seed = "monadic_seed"
monadic_userkey = UserKey.from_seed(monadic_seed)
monadic_client = create_nillion_client(monadic_userkey, NodeKey.from_seed(uuid.uuid4().hex))
monadic_party_id = monadic_client.party_id
monadic_user_id = monadic_client.user_id

payments_config = create_payments_config(chain_id, grpc_endpoint)
payments_client = LedgerClient(payments_config)
payments_wallet = LocalWallet(
    PrivateKey(bytes.fromhex(os.getenv("NILLION_NILCHAIN_PRIVATE_KEY_0"))),
    prefix="nillion",
)

Loading environment variables from /Users/vishakh/.config/nillion/nillion-devnet.env
NILLION_CLUSTER_ID: 9e68173f-9c23-4acc-ba81-4f079b639964
NILLION_BOOTNODE_MULTIADDRESS: /ip4/127.0.0.1/tcp/37939/p2p/12D3KooWMvw1hEqm7EWSDEyqTb6pNetUVkepahKY6hixuAuMZfJS
NILLION_BOOTNODE_WEBSOCKET: /ip4/127.0.0.1/tcp/54936/ws/p2p/12D3KooWMvw1hEqm7EWSDEyqTb6pNetUVkepahKY6hixuAuMZfJS
NILLION_NILCHAIN_CHAIN_ID: nillion-chain-devnet
NILLION_NILCHAIN_JSON_RPC: http://127.0.0.1:48102
NILLION_NILCHAIN_REST_API: http://localhost:26650
NILLION_NILCHAIN_GRPC: localhost:26649
NILLION_NILCHAIN_PRIVATE_KEY_0: 9a975f567428d054f2bf3092812e6c42f901ce07d9711bc77ee2cd81101f42c5
NILLION_NILCHAIN_PRIVATE_KEY_1: 1e491133b9408b39572a29f91644873decea554224b20e2b0b923aeb860a1c18
NILLION_NILCHAIN_PRIVATE_KEY_2: 980488572f235316cdb330191f8bafe4e635efbe88b3a40f5bee9bd21047c059
NILLION_NILCHAIN_PRIVATE_KEY_3: 612bb5173dc60d9e91404fcc0d1f1847fb4459a7d5160d63d84e91aacbf2ab2f
NILLION_NILCHAIN_PRIVATE_KEY_4: 04f5a984eeea9dce4e5e907da

Adapting a utility function from https://github.com/NillionNetwork/nada-numpy/blob/8dbf08c0ee840cab8b6103ef1d7dc5dcf438d442/examples/common/utils.py#L158. 

In [48]:
from typing import Any
import nada_numpy.client as na_client

async def store_secret_array(
        client: nillion.NillionClient,
        payments_wallet: LocalWallet,
        payments_client: LedgerClient,
        cluster_id: str,
        secret_array: np.ndarray,
        secret_name: str,
        nada_type: Any,
        ttl_days: int = 1,
        permissions: nillion.Permissions = None,
):

    # Create a secret
    stored_secret = nillion.NadaValues(
        na_client.array(secret_array, secret_name, nada_type)
    )

    # Get cost quote, then pay for operation to store the secret
    receipt_store = await get_quote_and_pay(
        client,
        nillion.Operation.store_values(stored_secret, ttl_days=ttl_days),
        payments_wallet,
        payments_client,
        cluster_id,
    )

    # Store a secret, passing in the receipt that shows proof of payment
    store_id = await client.store_values(
        cluster_id, stored_secret, permissions, receipt_store
    )
    return store_id

And now..the moment of truth!

In [49]:
permissions = nillion.Permissions.default_for_user(monadic_user_id)

store_id = await store_secret_array(
    monadic_client,
    payments_wallet,
    payments_client,
    cluster_id,
    gene_data,
    "gene_data",
    nillion.SecretInteger,
    1,
    permissions,
)

Getting quote for operation...


RuntimeError: requesting price quote: price quote query failed: price quote query failed: payload size (11171406) exceeds maximum (6291456)

Here are the results so far:

* 1 record - 2s 545ms
* 10 records - 2s 212ms
* 100 records - 3s 5ms
* 1000 records - 3s 104ms

We now need to break the 10,000 record barrier!

Now let's do it using a Nada program.