In [1]:
import torch
import torch.utils.data as data
from dataset import SNPmarkersDataset
import json
from utils import train_DL_model
import numpy as np
import random
from torch.utils.data import Dataset
from sklearn.feature_selection import mutual_info_regression
from Models.GPTransformer import GPTransformer

In [31]:
# Hyperparameters for the GPTransformer training run configured in the cells below.
BATCH_SIZE = 8  # batch size of both the training and the validation DataLoader
LEARNING_RATE = 1e-5  # learning rate handed to torch.optim.Adam
DROPOUT = 0  # NOTE(review): not referenced by any cell visible in this notebook — confirm whether the model should receive it
N_EMBEDDING = 8  # passed to GPTransformer as embedding_size (after int())
N_HEADS = 2  # passed to GPTransformer as n_heads
N_LAYERS = 2  # passed to GPTransformer as n_blocks
HIDDEN_NODES = 256  # passed to GPTransformer as n_hidden
N_EPOCHS = 5  # epoch count given to train_DL_model

In [3]:
# Key of the phenotype used as regression target: it indexes `dataset.phenotypes`
# when the data is loaded and is forwarded to train_DL_model as `phenotype`.
selected_phenotypes = "ep_res"

In [6]:
# Feature selection: score every SNP by its mutual information with the target
# on each split, average the scores over the splits and keep the SNPs whose
# average exceeds MI_THRESHOLD.
# NOTE(review): the average includes the validation and test splits, so the
# selected features are informed by held-out data — confirm this is intended.
MI_THRESHOLD = 0.02     # minimum average mutual information for a SNP to be kept
MI_RANDOM_STATE = 2307  # fixed seed so mutual_info_regression is repeatable
modes = ["local_train", "validation", "test"]

# The accumulator is created from the first score vector, so its length always
# matches the number of SNP columns actually returned by the dataset (no
# hard-coded feature count to keep in sync with the data).
mi = None
splits = {}
for mode in modes:
    dataset = SNPmarkersDataset(mode = mode, skip_check=True)
    # NOTE(review): this assigns to `set_phenotypes` instead of calling it; it only
    # has an effect if SNPmarkersDataset exposes it as a property setter — confirm.
    dataset.set_phenotypes = selected_phenotypes

    X = dataset.get_all_SNP()
    y = dataset.phenotypes[selected_phenotypes]

    # Keep the split so that later cells do not have to fetch the same values twice.
    splits[mode] = (X, y)

    scores = mutual_info_regression(X, y, n_jobs=-1, discrete_features=True, random_state=MI_RANDOM_STATE)
    mi = scores if mi is None else mi + scores

X_train, y_train = splits["local_train"]
X_val, y_val = splits["validation"]

# Divide by the number of modes to obtain the average mutual information
mi /= len(modes)
indexes = np.where(mi > MI_THRESHOLD)[0]
print(f"Nb of selected features: {len(indexes)}")


Nb of selected features: 2390


In [34]:
class SNPResidualDataset(Dataset):
    """Dataset pairing each entry of ``X`` with the entry of ``y`` at the same index."""

    def __init__(self, X, y):
        """Keep references to the two containers as given (no copy, no conversion)."""
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of samples, taken from the length of ``X``."""
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        features = self.X[index]
        target = self.y[index]
        return features, target

def convert_categorical_to_frequency(data, path = "gptranformer_embedding_data.json", feature_ids = None):
    """Replace every genotype code by a frequency derived from per-marker ``p``/``q`` values.

    For each marker column the JSON file at ``path`` must hold an entry with the
    keys ``"p"`` and ``"q"``. A genotype code ``g`` in that column is replaced by
    element ``g`` of ``[p**2, 2*p*q, q**2]`` (presumably the Hardy-Weinberg
    genotype frequencies, with ``p``/``q`` the allele frequencies — confirm
    against the script that wrote the JSON file).

    Args:
        data: Iterable of samples; each sample is an iterable of integer
            genotype codes, used as indices into the three-element list above.
        path: JSON file mapping a marker key (as a string) to ``{"p": ..., "q": ...}``.
        feature_ids: Optional sequence giving, for each column position of
            ``data``, the key under which that marker is stored in the JSON
            file. With the default ``None`` the column position itself is the
            key. Pass the original marker indices here when ``data`` holds only
            a subset of the columns and the file is keyed by original index.

    Returns:
        ``np.ndarray`` of dtype ``float32`` with one row per sample.

    Raises:
        KeyError: A marker key (or its ``"p"``/``"q"`` field) is missing from the file.
        IndexError: A genotype code falls outside the three-element list, or
            ``feature_ids`` is shorter than a sample.
    """
    with open(path, "r") as f:
        freq_data = json.load(f)

    # column position -> [p**2, 2*p*q, q**2]; filled on first use so that every
    # marker is read from the JSON data once instead of six times per cell.
    genotype_freqs = {}

    def frequencies_for(position):
        freqs = genotype_freqs.get(position)
        if freqs is None:
            key = position if feature_ids is None else feature_ids[position]
            entry = freq_data[str(key)]
            p, q = entry["p"], entry["q"]
            freqs = [p**2, 2*p*q, q**2]
            genotype_freqs[position] = freqs
        return freqs

    results = [
        [frequencies_for(position)[genotype] for position, genotype in enumerate(sample)]
        for sample in data
    ]
    return np.array(results, dtype=np.float32)


# Keep only the selected SNP columns, turn the genotype codes into frequency
# features and wrap every split in a SNPResidualDataset.
# NOTE(review): convert_categorical_to_frequency keys its lookup on the column
# position inside the array it receives (0..len(indexes)-1 here), not on the
# values stored in `indexes`; if the JSON file is keyed by the original SNP
# index the frequencies would be taken from the wrong markers — confirm.
train_features = convert_categorical_to_frequency(X_train[indexes].to_numpy())
train_targets = y_train.to_numpy(dtype=np.float32)
train_dataset = SNPResidualDataset(train_features, train_targets)

validation_features = convert_categorical_to_frequency(X_val[indexes].to_numpy())
validation_targets = y_val.to_numpy(dtype=np.float32)
validation_dataset = SNPResidualDataset(validation_features, validation_targets)

In [36]:
# Worker initialisation hook for the DataLoaders, following the PyTorch
# reproducibility notes (https://pytorch.org/docs/stable/notes/randomness.html#pytorch)
def seed_worker(worker_id):
    """Seed NumPy's and Python's global RNGs from torch's current initial seed.

    ``worker_id`` is part of the ``worker_init_fn`` call signature but is not
    used: the seed comes from ``torch.initial_seed()`` reduced modulo 2**32.
    """
    reduced_seed = torch.initial_seed() % (1 << 32)
    for seed_rng in (np.random.seed, random.seed):
        seed_rng(reduced_seed)

# Seed used for the global torch RNG and for the DataLoader generators.
SEED = 2307

# Seed the global RNG before the model is built so that parameter initialisation
# is repeatable between runs (assuming GPTransformer draws its initial weights
# from torch's default RNG — confirm).
torch.manual_seed(SEED)

# `worker_init_fn` on its own does not make data loading reproducible: the
# shuffling order and the base seed handed to the workers come from the
# DataLoader's generator, so each loader gets an explicitly seeded one.
train_generator = torch.Generator().manual_seed(SEED)
validation_generator = torch.Generator().manual_seed(SEED)

train_dataloader = data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
    worker_init_fn=seed_worker,
    generator=train_generator,
)
validation_dataloader = data.DataLoader(
    validation_dataset,
    batch_size=BATCH_SIZE,
    num_workers=4,
    worker_init_fn=seed_worker,
    generator=validation_generator,
)

# NOTE(review): DROPOUT is defined in the configuration cell but is not passed
# here — confirm whether GPTransformer accepts a dropout argument.
model = GPTransformer(
    n_features = len(indexes),
    embedding_size=int(N_EMBEDDING),
    n_hidden=HIDDEN_NODES,
    n_heads=N_HEADS,
    n_blocks=N_LAYERS
)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Mean absolute error between prediction and target.
criterion = torch.nn.L1Loss()

In [35]:
# Start training through the project helper `train_DL_model` with the model,
# optimizer, loaders and loss built in the previous cells.
# log_wandb=False presumably turns Weights & Biases logging off — confirm in utils.
# NOTE(review): early_stop_n_epoch has the same value as N_EPOCHS (5); if it is a
# patience expressed in epochs, early stopping may never trigger within this
# run — confirm against utils.train_DL_model.
train_DL_model(
    model,
    optimizer,
    train_dataloader,
    validation_dataloader,
    N_EPOCHS,
    criterion,
    phenotype=selected_phenotypes,
    log_wandb=False,
    early_stop_n_epoch=5,
)

Device used: cpu
Model architecture : 
 GPTransformer(
  (embedding): Linear(in_features=2390, out_features=19120, bias=True)
  (transformer): Sequential(
    (0): TransformerBlock(
      (multihead): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=8, out_features=8, bias=True)
      )
      (norm1): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=8, out_features=256, bias=True)
      (relu): ReLU()
      (fc2): Linear(in_features=256, out_features=8, bias=True)
      (norm2): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
    )
    (1): TransformerBlock(
      (multihead): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=8, out_features=8, bias=True)
      )
      (norm1): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=8, out_features=256, bias=True)
      (relu): ReLU()
      (fc2): Linear(in_features=256, out_features=8, bias=True)
    

Train feature batch shape: torch.Size([8, 2390])
Train labels batch shape: torch.Size([8])
Validation feature batch shape: torch.Size([8, 2390])
Validation labels batch shape: torch.Size([8])


KeyboardInterrupt: 