In [1]:
import pandas as pd
import numpy as np
import h5py

In [2]:
# Create a small HDF5 test file holding a random integer matrix.
# Draw a 5x20 matrix whose entries are 0, 1 or 2 (upper bound 3 is exclusive).
data = np.random.randint(0, 3, (5, 20))
print(data)

# Write the matrix as dataset 'genomics_data'. The context manager closes the
# file even if the write raises, so no handle is left dangling.
with h5py.File('data/test.h5', 'w') as f:
    f.create_dataset('genomics_data', data=data)

[[0 1 1 0 2 0 0 2 1 1 0 2 1 2 2 1 1 0 1 1]
 [1 0 2 2 2 1 2 1 2 0 1 2 1 0 0 2 1 2 2 1]
 [1 1 1 2 0 2 2 2 2 0 0 2 1 2 0 0 1 2 2 0]
 [0 2 2 2 1 1 0 1 0 0 1 0 1 1 2 0 2 1 2 2]
 [1 0 0 0 2 1 1 0 1 1 1 2 2 0 1 1 2 1 1 1]]


In [3]:
# Read the whole dataset into memory and release the file handle right away.
# Leaving the file open for reading can make re-running the cell that
# recreates 'data/test.h5' in 'w' mode fail, because the path is still in use.
with h5py.File('data/test.h5', 'r') as f:
    data = f['genomics_data'][:]

In [9]:
# Depth-stack a single row: the 1-D row is promoted to 3-D, giving (1, 20, 1).
np.dstack([data[0,:]]).shape

(1, 20, 1)

In [11]:
# Promote a single row to 2-D: a leading axis is added, giving (1, 20).
np.atleast_2d(data[0,:]).shape

(1, 20)

In [12]:
# Number of dimensions of one row slice (a single row is 1-D).
len(data[0,:].shape)

1

In [18]:
# Append a trailing length-1 axis to a row: (20,) becomes (20, 1).
np.expand_dims(data[1,:], axis=-1).shape

(20, 1)

In [32]:
# test models
import src.models as models

import torch
import pandas as pd

from dataclasses import make_dataclass
import time

# Load the tab-separated table stored at gene_bim_file.
# NOTE(review): the recorded run printed a warning pointing at this read_csv
# call — presumably a pandas DtypeWarning; confirm and pin dtypes if so.
gene_bim_file = "/cluster/project/beltrao/gankin/vnn/data/ukb_gene.bim"
gene_bim_df = pd.read_csv(gene_bim_file, sep="\t")

# Map every distinct value of the "snp" column to a consecutive integer index
# (0, 1, 2, ...) in order of first appearance. unique() is evaluated once
# rather than twice, and enumerate replaces the manual zip/range pairing.
snp_id_map = {snp: ind for ind, snp in enumerate(gene_bim_df["snp"].unique())}

# Model / training configuration. The dict is turned into an object with
# attribute access (args.epoch, args.lr, ...) right below.
argument_dict = {
    "onto": "ontology.txt",
    "train": "labels.csv",
    "label_col": "bc_reported",  # added for UKB; alternative: "has_cancer"
    "epoch": 150,
    "lr": 0.001,
    "wd": 0.001,
    "alpha": 0.3,
    "batchsize": 40480,  # alternative value: 33840
    "modeldir": "/model_test/",
    "cuda": 0,
    "gene2id": "all_genes.csv",
    "cell2id": "../../data/sample_train/sample2ind.txt",  # unused
    "genotype_hiddens": 4,
    "feature_dim": 1,
    # all genes; alternative: "../../data/sample_train/bin_features.npz"
    "mutations": "features.npy",
    "cn_deletions": "cell2cndeletion.txt",  # unused
    "cn_amplifications": "cell2cnamplification.txt",  # unused
    "optimize": 1,
    "zscore_method": "auc",
    "std": "/model_test/std.txt",
    "patience": 30,
    "delta": 0.001,
    "min_dropout_layer": 2,
    "dropout_fraction": 0.0,
    "lr_step_size": 120,
}

# One dataclass field per config key, annotated with the runtime type of its
# value; the dataclass is then instantiated from the very same dict.
_arg_fields = [(name, type(value)) for name, value in argument_dict.items()]
args = make_dataclass("DataclassFromDir", _arg_fields)(**argument_dict)

  gene_bim_df = pd.read_csv(gene_bim_file, sep="\t")


In [33]:
from src.graphs import GeneOntology

# Instantiate the ontology from the SNP index map and the ontology file below,
# with "snp" passed as the child node kind.
snp_onto_file = "/cluster/project/beltrao/gankin/vnn/snp_vnn/data/NEST_UKB_snp_onto.txt"

print("Creating gene ontology")
snp_ont = GeneOntology(snp_id_map, snp_onto_file, child_node="snp")

Creating gene ontology
There are 1 roots: NEST
There are 18074 terms
There are 1 connected components


In [42]:
def test_forward_speed(model = None, args = None, snp_ont = None):
    # Create a model
    if model is None:
        print("Setting up model")
        model = models.GenoVNN(args, snp_ont)
    # create random data

    # Create random gene input data
    x = torch.randn(16, 429371, 1)  # For example, a batch of 16 samples

    print("Starting speed test")

    # Run both the original and refactored versions
    # measure time
    start = time.time()
    aux_out_original, hidden_original = model.forward(x)
    end = time.time()
    print(f"Original forward time: {end - start}")

    start = time.time()
    aux_out_refactored, hidden_refactored = model.forward_refactored(x)
    end = time.time()
    print(f"Refactored forward time: {end - start}")

    # Check that the outputs are equal
    for term in aux_out_original.keys():
        assert torch.allclose(
            aux_out_original[term], aux_out_refactored[term]
        ), f"Mismatch in aux_out for {term}"
    for term in hidden_original.keys():
        assert torch.allclose(
            hidden_original[term][0], hidden_refactored[term][0]
        ), f"Mismatch in hidden state for {term}"

    #return [aux_out_original, hidden_original, aux_out_refactored, hidden_refactored]

In [43]:
from importlib import reload

In [48]:
# Re-execute src.models so edits to the module are picked up without
# restarting the kernel. Objects created before the reload keep the old code.
reload(models)

<module 'src.models' from '/cluster/project/beltrao/gankin/vnn/snp_vnn/src/models.py'>

In [49]:
# Run the comparison; no model is passed, so the function builds a GenoVNN
# from args and snp_ont before timing.
test_forward_speed(args=args, snp_ont=snp_ont)

Setting up model
computing masks
Constructing first NN layer
Constructing NN graph
Starting speed test
Original forward time: 7.054757595062256
Refactored forward time: 4.069234848022461


In [29]:
# NOTE(review): `result` is not assigned in any visible cell, so this fails on
# Restart & Run All. Presumably it holds [aux_out_original, hidden_original,
# aux_out_refactored, hidden_refactored] from an earlier test_forward_speed
# run, which would make index 0 the original aux output — confirm.
result[0]['NEST']

tensor([[ 0.8208],
        [-0.3880],
        [-0.2677],
        [-0.1230],
        [-0.3625],
        [-0.8709],
        [-0.7866],
        [-0.8860],
        [-0.5675],
        [ 0.4832],
        [-0.0118],
        [ 0.8688],
        [ 0.5905],
        [ 0.1373],
        [-0.7200],
        [-0.0082]], grad_fn=<TanhBackward0>)

In [31]:
# NOTE(review): `result` is not assigned in any visible cell; presumably index
# 2 is the refactored aux output from an earlier test_forward_speed run.
# The recorded values differ from those shown for result[0]['NEST'], so this
# output likely predates the current implementation — confirm or re-run.
result[2]['NEST']

tensor([[ 0.5677],
        [-0.7363],
        [-0.1503],
        [-0.2473],
        [-0.5651],
        [-0.6245],
        [-0.9000],
        [-0.8622],
        [-0.0805],
        [ 0.0681],
        [ 0.0509],
        [ 0.7747],
        [ 0.5926],
        [-0.0012],
        [ 0.5538],
        [-0.2507]], grad_fn=<TanhBackward0>)