In [1]:
import hydra
from omegaconf import DictConfig, OmegaConf
from hydra.core.hydra_config import HydraConfig
from dataset.data_builder import training_data
from models.sklearn_factory import train_model
from evaluator.r2_evaluator import evaluate_model
from sklearn.model_selection import train_test_split
from utils.mlflow_utils import log_params_from_omegaconf_dict
import numpy as np
import hashlib
import mlflow

In [2]:

# Data-loading configuration for this notebook.
# `dataset.params` is unpacked as keyword arguments into `training_data(...)`
# and `dataset.gwas_p_value` is handed to `.gwas_filtered(...)` in the
# data-loading cell.
cfg = OmegaConf.create(
    {
        "dataset": {
            "params": {
                # Genotype inputs (presumably a PLINK bed/bim/fam triple that
                # shares one file prefix -- TODO confirm against data_builder).
                "geno_bed": "/scratch/capstone/ratgenes_pruned/ratgenes_pruned_0.95.bed",
                "geno_bim": "/scratch/capstone/ratgenes_pruned/ratgenes_pruned_0.95.bim",
                "geno_fam": "/scratch/capstone/ratgenes_pruned/ratgenes_pruned_0.95.fam",
                # Phenotype table (CSV).
                "phenotypes": "/projects/ps-palmer/bbjohnson/rattaca/nida_poster/mass.csv",
                # NOTE(review): the empty string presumably means "no trait
                # subset" -- confirm how training_data interprets it.
                "select_traits": '',
                "filter_unknowns": False
            },
            # Threshold forwarded to gwas_filtered().
            "gwas_p_value": 0.5
        }
    }
)

In [3]:
# Load the genotype and phenotype frames from the configured inputs.
# gwas_filtered() receives the configured p-value (presumably used as a SNP
# significance cut-off -- TODO confirm against dataset.data_builder).
X_geno, Y_pheno = (
    training_data(**cfg.dataset.params)
    .gwas_filtered(cfg.dataset.gwas_p_value)
)

# Fixed-seed 80/20 hold-out split used by the quick experiments below.
X_train, X_val, y_train, y_val = train_test_split(
    X_geno,
    Y_pheno,
    test_size=0.20,
    random_state=42,
)

    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]


Building genotypes matrix ...
Done building genotypes matrix



In [4]:
from torch.utils.data import Dataset, DataLoader
class GenomicsSet(Dataset):
    """Map-style dataset that pairs each genotype row with its phenotype row.

    Both frames are cast to ``float32`` once at construction time. A sample is
    selected positionally from the genotype frame, and the matching phenotype
    row is then looked up by that genotype row's index label, so the two
    frames do not need to be stored in the same order.
    """

    def __init__(self, geno_df, pheno_df):
        # Cast up front so every sample is emitted as float32.
        self.geno_df = geno_df.astype("float32")
        self.pheno_df = pheno_df.astype("float32")

    def __len__(self):
        # One sample per genotype row.
        return self.geno_df.shape[0]

    def __getitem__(self, idx):
        """Return ``(genotype_values, phenotype_values)`` as NumPy arrays."""
        geno_row = self.geno_df.iloc[idx]
        # Align by index label rather than by position.
        pheno_row = self.pheno_df.loc[geno_row.name]

        features = geno_row.to_numpy()
        target = pheno_row.to_numpy()
        return features, target
    
# Training-split dataset, built from the hold-out split made earlier.
dataset = GenomicsSet(geno_df=X_train, pheno_df=y_train)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import lightning as pl
class GenomicsDataModule(pl.LightningDataModule):
    """Lightning data module that splits genotype/phenotype frames into train/val sets.

    Args:
        X_geno: genotype frame; each row becomes one sample of ``GenomicsSet``.
        Y_pheno: phenotype frame; ``GenomicsSet`` looks its rows up by the
            genotype row's index label.
        val_size: fraction passed to ``train_test_split`` as ``test_size``.
        batch_size: batch size used by both data loaders.
        num_workers: number of worker processes used by both data loaders.
    """

    def __init__(self, X_geno, Y_pheno, val_size=0.2, batch_size=32, num_workers=16):
        super().__init__()
        self.batch_size = batch_size
        self.val_size = val_size
        self.num_workers = num_workers
        self.X = X_geno
        self.y = Y_pheno

    def setup(self, stage: str):
        """Create the train/validation datasets.

        ``stage`` is accepted for Lightning compatibility but not used: the
        same fixed-seed split is rebuilt for every stage, so repeated calls
        yield identical partitions.
        """
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            self.X, self.y, test_size=self.val_size, random_state=42
        )
        self.train_set = GenomicsSet(self.X_train, self.y_train)
        self.val_set = GenomicsSet(self.X_val, self.y_val)

    def train_dataloader(self):
        """Training loader; reshuffled every epoch so SGD does not see a fixed batch order."""
        return DataLoader(
            self.train_set,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
        )

    def val_dataloader(self):
        """Validation loader; kept in a fixed order so metrics are comparable across epochs."""
        return DataLoader(
            self.val_set,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
        )
    
# Smoke test: build the loaders once and inspect the first training batch.
datamodule = GenomicsDataModule(X_geno, Y_pheno)
# "fit" is the Lightning stage name for training; "train" is not a stage the
# Trainer ever passes to setup().
datamodule.setup(stage="fit")
for x, y in datamodule.train_dataloader():
    print(x.shape, y.shape)
    print(x.dtype)
    break

torch.Size([32, 214018]) torch.Size([32, 1])
torch.float32


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
import math
from torch import optim

class FF_block(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> LayerNorm -> Linear.

    Args:
        d_in: number of input features.
        d_hidden: width of the hidden layer.
        d_out: number of outputs.
    """

    def __init__(self, d_in=214018, d_hidden=32, d_out=1):
        super().__init__()
        # Layers are created in the same order as they are applied.
        self.lin1 = nn.Linear(in_features=d_in, out_features=d_hidden)
        self.norm = nn.LayerNorm(d_hidden)
        self.lin2 = nn.Linear(in_features=d_hidden, out_features=d_out)

    def forward(self, x: Tensor) -> Tensor:
        """Project to the hidden width, activate, normalise, then project to the output."""
        hidden = F.relu(self.lin1(x))
        return self.lin2(self.norm(hidden))

class GenomicsTrainer(pl.LightningModule):
    """LightningModule that fits a regression network with MSE loss and plain SGD.

    Args:
        model: the network *class* (not an instance); it is instantiated here
            as ``model(**config.model)``.
        config: must provide ``config.model`` (constructor keyword arguments
            for ``model``) and ``config.hparams.lr`` (SGD learning rate).
    """

    def __init__(self,
            model: "type[nn.Module]",
            config: DictConfig) -> None:
        super().__init__()
        self.config = config
        self.model = model(**config.model)
        self.loss_fn = nn.MSELoss()

    def forward(self, x: Tensor, **kwargs) -> Tensor:
        """Run the wrapped network."""
        return self.model(x, **kwargs)

    def _shared_step(self, batch: "tuple[Tensor, Tensor]") -> Tensor:
        """Compute the MSE loss for one ``(inputs, targets)`` batch."""
        x, y = batch
        return self.loss_fn(self(x), y)

    def training_step(self, batch: "tuple[Tensor, Tensor]", batch_idx: int) -> Tensor:
        """Return the training loss for ``batch`` and log it as ``train_loss``."""
        loss = self._shared_step(batch)
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch: "tuple[Tensor, Tensor]", batch_idx: int) -> Tensor:
        """Return the validation loss for ``batch`` and log it as ``val_loss``."""
        loss = self._shared_step(batch)
        # Validation metrics are aggregated per epoch; sync_dist reduces them
        # across processes so a multi-GPU run reports one consistent value.
        self.log("val_loss", loss, prog_bar=True, sync_dist=True)
        return loss

    def configure_optimizers(self):
        """Plain SGD over all parameters, with the learning rate read from the config."""
        return optim.SGD(self.parameters(), lr=self.config.hparams.lr)

# Seed the RNGs so weight initialisation is repeatable; uses the same seed
# value as the random_state of the data split.
pl.seed_everything(42, workers=True)

# Extend (rather than replace) the notebook config so `cfg.dataset` is still
# available if the data-loading cell is re-run after this one.
cfg = OmegaConf.merge(
    cfg,
    {
        "model": {
            # Input width follows the genotype matrix, so it stays correct
            # when the GWAS threshold (and hence the column count) changes.
            "d_in": int(X_geno.shape[1]),
            "d_hidden": 32,
            "d_out": 1
        },
        "hparams": {
            "lr": 0.001,
            # Same limit Lightning falls back to when max_epochs is unset;
            # stated explicitly so the run length is visible and tunable.
            "max_epochs": 1000
        }
    },
)

model = GenomicsTrainer(model=FF_block, config=cfg)

trainer = pl.Trainer(max_epochs=cfg.hparams.max_epochs)
trainer.fit(model, GenomicsDataModule(X_geno, Y_pheno, batch_size=32, num_workers=1))

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
  rank_zero_warn(
Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/2
----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 2 processes
----------------------------------------------------------------------------------------------------

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name    | Type     | Params
-------------------------------------
0 | model   | FF_block | 6.8 M 
1 | loss_fn | MSELoss  | 0     
-------------------------------------
6.8 M     Trainable params
0         Non-trainable params
6.8 M     Total params
27.395    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:00<00:00,  4.78it/s]



                                                                           

  rank_zero_warn(


Epoch 0: 100%|██████████| 70/70 [00:08<00:00,  7.78it/s, v_num=14, train_loss=0.945]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|          | 0/18 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/18 [00:00<?, ?it/s][A
Validation DataLoader 0:   6%|▌         | 1/18 [00:00<00:00, 161.57it/s][A
Validation DataLoader 0:  11%|█         | 2/18 [00:00<00:00, 22.06it/s] [A
Validation DataLoader 0:  17%|█▋        | 3/18 [00:00<00:00, 17.01it/s][A
Validation DataLoader 0:  22%|██▏       | 4/18 [00:00<00:00, 15.23it/s][A
Validation DataLoader 0:  28%|██▊       | 5/18 [00:00<00:00, 14.37it/s][A
Validation DataLoader 0:  33%|███▎      | 6/18 [00:00<00:00, 13.82it/s][A
Validation DataLoader 0:  39%|███▉      | 7/18 [00:00<00:00, 13.44it/s][A
Validation DataLoader 0:  44%|████▍     | 8/18 [00:00<00:00, 13.21it/s][A
Validation DataLoader 0:  50%|█████     | 9/18 [00:00<00:00, 13.01it/s][A
Validation DataLoader 0:  56%|█████▌    | 10/18 [00:00<00:00, 12.85it/s][A
Validat