In [1]:
import torch
from torch import nn
import torch.utils.data as data
from dataset import SNPmarkersDataset
import torch.nn.functional as F
import wandb
import numpy as np
from scipy.stats import pearsonr
from utils import format_batch

In [2]:
class MLP(torch.nn.Module):
    def __init__(self, nlayers: int = 1, hidden_nodes: list[int] = [], dropout: float = 0):
        super(MLP, self).__init__()
        
        if dropout < 0 or dropout >= 1:
            raise AttributeError("The dropout must be between 0 and 1")

        if nlayers < 1:
            raise AttributeError("The number of layers must be greater or equal than one !")
        
        if len(hidden_nodes) != nlayers - 1:
            raise AttributeError(f"Not enough hidden_nodes given, expected a list of length {nlayers - 1} but got one of {len(hidden_nodes)}")

        hidden_nodes.insert(0, 36304)
        hidden_nodes.append(4)

        self.model = nn.Sequential(*[LinearBlock(hidden_nodes[i], hidden_nodes[i + 1], dropout=dropout) for i in range(nlayers - 1)])
        self.dropout = nn.Dropout(dropout)
        self.output_layer = nn.Linear(hidden_nodes[-2], hidden_nodes[-1])

    def forward(self, x):
        return self.output_layer(self.dropout(self.model(x)))

class LinearBlock(torch.nn.Module):
    def __init__(self, input_size, output_size, dropout = 0):
        super(LinearBlock, self).__init__()
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(in_features=input_size, out_features=output_size)
    
    def forward(self,x):
        return F.relu(self.fc(x))

In [3]:
BATCH_SIZE = 4
LEARNING_RATE = 1e-3
DROPOUT = 0.25
N_LAYERS = 2
HIDDEN_NODES = [1024]
N_EPOCHS = 1
SCHEDULER_STEP_SIZE = 20
SCHEDULER_REDUCE_RATIO = 0.5

In [4]:
selected_phenotypes = ["pheno_1", "pheno_2", "pheno_3", "pheno_4"]
train_dataset = SNPmarkersDataset(mode = "local_train", skip_check= True)
train_dataset.set_phenotypes = selected_phenotypes
train_dataset = data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers = 4)

validation_dataset = SNPmarkersDataset(mode = "validation", skip_check= True)
validation_dataset.set_phenotypes = selected_phenotypes
validation_dataset = data.DataLoader(validation_dataset, batch_size=BATCH_SIZE, num_workers = 4)

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.cuda.empty_cache()

model = MLP(nlayers=N_LAYERS, hidden_nodes= HIDDEN_NODES, dropout= DROPOUT)
print(f"Model architecture : \n {model}")
print(f"Numbers of parameters: {sum(p.numel() for p in model.parameters())}")

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = SCHEDULER_STEP_SIZE, gamma = SCHEDULER_REDUCE_RATIO)
criteron = torch.nn.L1Loss()
model.to(device)
for epoch in range(N_EPOCHS):
    train_loss = []
    model.train()
    for x,y in train_dataset:
        x,y = x.to(device), format_batch(y).to(device)
        optimizer.zero_grad()
        output = model(x)
        loss = criteron(output, y)
        train_loss.append(loss.cpu().detach())
        loss.backward()
        optimizer.step()

    print(f"Finished training for epoch {epoch}. Train loss: {np.array(train_loss).mean()}")

    val_loss = []
    predicted = []
    target = []
    model.eval()
    for x,y in validation_dataset:
        x,y = x.to(device), format_batch(y).to(device)
        optimizer.zero_grad()
        output = model(x)
        loss = criteron(output, y)
        val_loss.append(loss.cpu().detach())
        if len(predicted) == 0:
            predicted = output.cpu().detach()
            target = y.cpu().detach()
        else:
            predicted = np.concatenate((predicted, output.cpu().detach()), axis = 0)
            target = np.concatenate((target, y.cpu().detach()), axis = 0)
        loss.backward()
        optimizer.step()
    
    scheduler.step()
    print(f"Validation step for epoch {epoch} finished! Validation loss: {np.array(val_loss).mean()}")

Model architecture : 
 MLP(
  (model): Sequential(
    (0): LinearBlock(
      (dropout): Dropout(p=0.25, inplace=False)
      (fc): Linear(in_features=36304, out_features=1024, bias=True)
    )
  )
  (dropout): Dropout(p=0.25, inplace=False)
  (output_layer): Linear(in_features=1024, out_features=4, bias=True)
)
Numbers of parameters: 37180420
Finished training for epoch 0. Train loss: 3.4511655488277992
Validation step for epoch 0 finished! Validation loss: 1.7893592690664328
