# Fully Connected Genomic Model (No filtering) 


In [48]:
import numpy  as np
import pandas as pd

import torch
from   torch import nn
import torch.nn.functional as F
from   torch.utils.data import Dataset
from   torch.utils.data import DataLoader

from EnvDL.dlfn import BigDataset, plDNN_general

import lightning.pytorch as pl
from   lightning.pytorch.loggers import CSVLogger

In [49]:
from dataG2F.core import get_data
from dataG2F.qol  import ensure_dir_path_exists

In [50]:

# Run settings.
# `params_run` is the single source of truth for this run. The previous
# literal listed both keys twice (200 then 20 epochs, 48 then 256 batch size);
# in a dict literal the last duplicate silently wins, so only the effective
# values are kept here.
params_run = {
    'max_epoch': 20,    # earlier value in this cell: 200
    'batch_size': 256,  # earlier value in this cell: 48
}

# Module-level aliases read by the DataLoader and Trainer cells.
max_epoch  = params_run['max_epoch']
batch_size = params_run['batch_size']

In [51]:
# Data settings: which response to model, whether/how to convert it to a
# residual, and which parents to hold out.
params_data = dict(
    y_var='Yield_Mg_ha',
    # one of: None, Env, Geno (stored as a string)
    y_resid='None',
    # one of: None, naive_mean, filter_mean, ... (stored as a string)
    y_resid_strat='None',
    # Testers from 2020-2021. 'PHP02' and 'PHK76' are currently disabled.
    holdout_parents=['PHZ51'],
)

In [52]:
# `params` is (re)defined later in this file. It is created here as an empty
# dict to guarantee that other params dicts can always be merged into it.
params = {}

In [53]:
# def get_setting(settings_dict, key):
#     if key in settings_dict.keys():
#         return settings_dict[key]
#     else:
#         return None
    
# get_setting(settings_data, 'holdout_parents')

In [54]:
# Name of the response column, taken from the data settings.
y_var = params_data['y_var']

In [None]:
# Directory for this notebook's artifacts. Its last non-empty path component
# is reused as `save_prefix` (passed to the CSV logger as the run name).
cache_path = '../nbs_artifacts/aim_1a_G_None_FCN/'
save_prefix = list(filter(None, cache_path.split('/')))[-1]
#TODO append info on the residual to the save prefix name?

ensure_dir_path_exists(dir_path = cache_path)

In [55]:
# Index of the GPU to make current; only 0 and 1 are acted on.
use_gpu_num = 0

device = "cuda" if torch.cuda.is_available() else "cpu"
# Select a CUDA device only when CUDA is actually usable. Calling
# torch.cuda.set_device on a machine without CUDA raises instead of letting
# the run fall back to the CPU.
if device == "cuda" and use_gpu_num in [0, 1]:
    torch.cuda.set_device(use_gpu_num)
print(f"Using {device} device")

Using cuda device


## Load Data

In [56]:
# Fetch the three inputs used below, in this order, from the data package.
obs_geno_lookup, phno, acgt = (
    get_data(name) for name in ('obs_geno_lookup', 'phno', 'ACGT')
)

In [57]:
# flatten data: keep 4926 rows and collapse every remaining axis into one.
# NOTE(review): 4926 is hard-coded. Presumably it is the size of acgt's first
# axis (one row per genotype?) — confirm, or derive it from acgt.shape[0].
acgt = acgt.reshape(4926, -1)

In [58]:
# make holdout sets
# create a mask for parent genotype
def mask_parent(df_FM, holdout='PHZ51'):
    """Flag the rows of `df_FM` in which `holdout` is one of the two parents.

    Parameters
    ----------
    df_FM : pandas.DataFrame
        Must have string columns ``F`` and ``M`` holding the parent names.
    holdout : str
        Parent name to look for; matching is case-insensitive.

    Returns
    -------
    pandas.Series
        Boolean mask aligned to ``df_FM.index``; True where either ``F`` or
        ``M`` equals `holdout`.
    """
    holdout = holdout.upper()
    # Use the frame that was passed in. The previous body read the global
    # `tmp` and silently ignored `df_FM`.
    mask_F = df_FM['F'].str.upper() == holdout
    mask_M = df_FM['M'].str.upper() == holdout
    return mask_F | mask_M


# Parents (from the data settings) whose hybrids make up the test set.
holdout_parents = params_data['holdout_parents']

# Split each 'F/M' hybrid name into its two parents. `.copy()` makes the
# column assignment below act on an independent frame.
tmp = phno.loc[:, ['Env', 'Year', 'Hybrid']].copy()
tmp[['F', 'M']] = tmp['Hybrid'].str.split('/', n=1, expand=True)

# One boolean column (labelled 0, 1, ...) per holdout parent.
mask = pd.concat([
    mask_parent(df_FM=tmp, 
                holdout=e) for e in holdout_parents], axis=1)

# A row is held out when it involves ANY of the listed parents. Indexing only
# column 0, as this cell did before, ignored every parent after the first.
is_holdout = mask.any(axis=1).to_numpy()

test_idx  = mask.index[is_holdout]
train_idx = mask.index[~is_holdout]

In [59]:
# convert y to residual if needed
# When 'y_resid' is the string 'None' nothing happens. Otherwise the branch
# is chosen by 'y_resid_strat' alone; the particular 'y_resid' value (Env,
# Geno) is not inspected here. Both strategies subtract a per-Env_Idx mean of
# y_var from y_var and rebind `phno` to the adjusted frame.

if params_data['y_resid'] == 'None':
    pass
else:
    if params_data['y_resid_strat'] == 'naive_mean':
        # use only data in the training set (especially since testers will be more likely to be found across envs)
        # get environmental means, subtract from observed value
        tmp = phno.loc[train_idx, ]
        env_mean = tmp.groupby(['Env_Idx']
                     ).agg(Env_Mean = (y_var, 'mean')
                     ).reset_index()
        # NOTE(review): merge() uses an inner join by default and returns a
        # fresh integer index. Rows whose Env_Idx has no training mean would
        # be dropped, and train_idx/test_idx (taken from the pre-merge index)
        # may then no longer line up with `phno` — confirm.
        tmp = phno.merge(env_mean)
        tmp.loc[:, y_var] = tmp.loc[:, y_var] - tmp.loc[:, 'Env_Mean']
        phno = tmp.drop(columns='Env_Mean')

    if params_data['y_resid_strat'] == 'filter_mean':
        # for adjusting to environment we could use _all_ observations but ideally we will use the same set of genotypes across all observations
        # Unlike 'naive_mean', this branch does not restrict itself to
        # train_idx: the means are computed from `phno` as a whole.
        def minimum_hybrids_for_env(tmp = phno.loc[:, ['Env', 'Year', 'Hybrid']],
                                    year = 2014):
            """Pick the (Env, Hybrid) pairs used for the means of one year.

            Rows of `tmp` for `year` are counted per (Env, Hybrid). After
            that count the ``Year`` column holds the count, not a calendar
            year. The largest count threshold that still keeps every Env is
            chosen, a one-line summary is printed, and the pairs whose count
            reaches the threshold are returned as a frame with columns
            ``Env`` and ``Hybrid``.

            Note: the default for `tmp` is evaluated once, when this ``def``
            statement runs.
            """
            # Within each year what hybrids are most common?
            tmp = tmp.loc[(tmp.Year == year), ].groupby(['Env', 'Hybrid']).count().reset_index().sort_values('Year')

            all_envs = set(tmp.Env)
            # if we filter on the number of sites a hybrid is planted at, what is the largest number of sites we can ask for before we lose a location?
            # site counts for sets which contain all envs
            i = max([i for i in list(set(tmp.Year)) if len(set(tmp.loc[(tmp.Year >= i), 'Env'])) == len(all_envs)])

            before = len(set(tmp.loc[:, 'Hybrid']))
            after  = len(set(tmp.loc[(tmp.Year >= i), 'Hybrid']))
            print(f'Reducing {year} hybrids from {before} to {after} ({round(100*after/before)}%).')
            tmp = tmp.loc[(tmp.Year >= i), ['Env', 'Hybrid']].reset_index(drop=True)
            return tmp


        # NOTE(review): this `tmp` is overwritten further down before it is
        # read; the call below passes its own slice of `phno`.
        tmp = phno.loc[:, ['Env', 'Year', 'Hybrid']]
        # one filtered (Env, Hybrid) table per year present in phno
        filter_hybrids = [minimum_hybrids_for_env(tmp = phno.loc[:, ['Env', 'Year', 'Hybrid']], year = i) 
                          for i in list(set(phno.Year)) ]
        # pull the observations for the retained pairs back in from phno
        env_mean = pd.concat(filter_hybrids).merge(phno, how = 'left')

        env_mean = env_mean.groupby(['Env_Idx']
                          ).agg(Env_Mean = (y_var, 'mean')
                          ).reset_index()

        # NOTE(review): same caveat as in the 'naive_mean' branch — the
        # default inner merge resets the index and can drop rows; confirm
        # train_idx/test_idx still match `phno` afterwards.
        tmp = phno.merge(env_mean)
        tmp.loc[:, y_var] = tmp.loc[:, y_var] - tmp.loc[:, 'Env_Mean']
        phno = tmp.drop(columns='Env_Mean')
        

In [60]:
# Standardize y (center and scale); missing values are not allowed.
y = phno.loc[:, y_var]
assert y.isna().sum() == 0

# The mean and standard deviation come from the training rows only, to
# prevent information leaking from the held-out rows into the scaling.
y_train = y[train_idx]
y_c, y_s = y_train.mean(), y_train.std()

y = (y - y_c) / y_s

In [61]:
# Tensors used by both datasets are built once and shared. Converting `acgt`
# and `y` separately for each loader could hold two float32 copies of the
# genotype matrix in memory.
# NOTE(review): sharing assumes BigDataset does not modify these tensors in
# place — confirm against EnvDL.dlfn.
_lookup_geno = torch.from_numpy(obs_geno_lookup)
_y_tensor    = torch.from_numpy(y.to_numpy()).to(torch.float32)[:, None]
_G_tensor    = torch.from_numpy(acgt).to(torch.float32)


def _make_dataloader(obs_idx, shuffle):
    """Build a DataLoader over the observations listed in `obs_idx`.

    Parameters
    ----------
    obs_idx : index-like
        Observation labels for this split (train_idx or test_idx).
    shuffle : bool
        Whether the loader reshuffles the observations every epoch.

    Returns
    -------
    torch.utils.data.DataLoader
        Batches of size `batch_size` drawn from a BigDataset.
    """
    return DataLoader(
        BigDataset(
            lookups_are_filtered = False,
            lookup_obs  = torch.from_numpy(np.array(obs_idx)),
            lookup_geno = _lookup_geno,
            y =           _y_tensor,
            G =           _G_tensor,
            G_type = 'raw',
            # NOTE(review): hard-coded, independent of `use_gpu_num`/`device`.
            send_batch_to_gpu = 'cuda:0'
        ),
        batch_size = batch_size,
        shuffle = shuffle
    )


training_dataloader = _make_dataloader(train_idx, shuffle = True)

# Validation batches are not shuffled: order does not help evaluation, and
# Lightning warns when a validation loader has shuffling enabled.
validation_dataloader = _make_dataloader(test_idx, shuffle = False)

## Define Parameters of model

In [62]:
# Convenience wrapper standing in for R's seq() / x:y notation.
def linrange(start, stop):
    """Return the integers from `start` to `stop`, both ends included.

    Counts downwards when `start` is greater than `stop`. The result is a
    NumPy integer array.
    """
    n_points = abs(start - stop) + 1
    return np.linspace(start, stop, n_points).astype(int)

In [63]:
[2**i for i in linrange(11, 4)]

[2048, 1024, 512, 256, 128, 64, 32, 16]

In [64]:
# Hidden widths for a network that narrows step by step to one output:
# (4 x 125891) inputs -> 1024 -> 128 -> 12 -> 1
layer_sizes = [1024, 128, 12]
layer_drops = [0.1] * len(layer_sizes)

num_layers = len(layer_sizes)

# Flat parameter dict: the input width of the first layer, then the output
# width and dropout probability of every layer, keyed "<what>_<k>_of_<N>".
params = {
    'num_layers': num_layers,
    f"in_1_of_{num_layers}": 4 * 125891,
}

for i, (size, drop) in enumerate(zip(layer_sizes, layer_drops)):
    params[f"out_{i + 1}_of_{num_layers}"] = size
    params[f"drop_{i + 1}_of_{num_layers}"] = drop
        

In [65]:
params

{'num_layers': 3,
 'in_1_of_3': 503564,
 'out_1_of_3': 1024,
 'drop_1_of_3': 0.1,
 'out_2_of_3': 128,
 'drop_2_of_3': 0.1,
 'out_3_of_3': 12,
 'drop_3_of_3': 0.1}

In [66]:
[e.shape for e in next(iter(training_dataloader))]

[torch.Size([256, 1]), torch.Size([256, 503564])]

In [1]:
from EnvDL.dlfn import Linear_res_block

In [68]:
# Only the input width of the first layer has to be supplied; every later
# layer takes the previous layer's output width as its input width.
class NeuralNetwork(nn.Module):
    """Fully connected network assembled from a flat parameter dict.

    `parameterization` must provide ``num_layers`` and, for a network of N
    layers, ``in_1_of_N`` together with ``out_k_of_N`` and ``drop_k_of_N``
    for k = 1..N.

    Modules, in order: ``nn.Flatten``, N ``Linear_res_block`` modules, and a
    final ``nn.Linear`` mapping the last width to a single output.
    """

    def __init__(self, parameterization):
        super().__init__()
        n_layers = parameterization['num_layers']

        modules = []
        for layer in range(1, n_layers + 1):
            if layer == 1:
                # the input is flattened once, ahead of the first block
                modules.append(nn.Flatten())
                in_key = f"in_{layer}_of_{n_layers}"
            else:
                in_key = f"out_{layer - 1}_of_{n_layers}"
            out_key = f"out_{layer}_of_{n_layers}"
            drop_key = f"drop_{layer}_of_{n_layers}"

            modules.append(
                Linear_res_block(
                    in_size  = parameterization[in_key],
                    out_size = parameterization[out_key],
                    drop_pr  = parameterization[drop_key]))

            if layer == n_layers:
                # prediction head on top of the last block
                modules.append(nn.Linear(parameterization[out_key], 1))

        self.x_network = nn.ModuleList(modules)

    def forward(self, x):
        """Pass `x` through every module in order and return the prediction."""
        # The input to the final layer is no longer captured on the way
        # through, because it was never returned.
        for mod in self.x_network:
            x = mod(x)
        return x

# model = NeuralNetwork(
#     parameterization = parameterization).to(device)

# model(next(iter(training_dataloader))[0][0:5])

In [69]:
# Build the network from `params` and move it to the selected device.
model = NeuralNetwork(parameterization = params).to(device)

In [72]:
# model(next(iter(training_dataloader))[1])[0:5]

tensor([[-0.1307],
        [-0.1577],
        [-0.1549],
        [-0.1290],
        [-0.1295]], device='cuda:0', grad_fn=<SliceBackward0>)

In [75]:
# Nest the data and run settings inside `params` so that a single dict holds
# every setting when the hyperparameters are logged.
params.update(params_data=params_data, params_run=params_run)


# params_data
# params_run

In [77]:

# Wrap the network for training (presumably a LightningModule from EnvDL,
# since it is handed to trainer.fit below — confirm).
DNN = plDNN_general(model)  

# NOTE(review): `optimizer` is not read again in the visible part of this
# file, and Lightning normally calls configure_optimizers() itself during
# fit — confirm whether this call is still needed.
optimizer = DNN.configure_optimizers()

# CSV logs are written under "nifa_tb", named after `save_prefix`; the
# combined settings dict is recorded alongside them.
logger = CSVLogger("nifa_tb", name=save_prefix)
logger.log_hyperparams(params=params)

trainer = pl.Trainer(max_epochs=max_epoch, logger=logger)

# Train on the training loader and validate on the held-out loader.
trainer.fit(model=DNN, train_dataloaders=training_dataloader, val_dataloaders=validation_dataloader)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type          | Params
---------------------------------------
0 | mod  | NeuralNetwork | 517 M 
---------------------------------------
517 M     Trainable params
0         Non-trainable params
517 M     Total params
2,071.663 Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

/home/kickd/miniconda3/envs/fastai/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...
