In [1]:
#imports
import pytorch_lightning as pl
import h5py
import os
import numpy as np
from torch import nn
from torch.nn import functional as F
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import library as lib

# Threads and Device

In [2]:
# Seed the Python, NumPy and PyTorch random generators (workers=True also
# derives seeds for DataLoader worker processes) so that the stochastic parts
# of the run, such as the shuffle=True training batches, repeat after a
# kernel restart. 42 matches the random_state used for the dataset splits.
pl.seed_everything(42, workers=True)

# Run-wide settings.
# NOTE(review): neither constant is referenced by the cells visible in this
# notebook; presumably they are meant for num_workers / accelerator - confirm.
THREADS = 24
DEVICE  = 'cuda'

# Load, Format, Split into train/validation/test datasets

In [3]:
# Build two parallel arrays from the HDF5 file: one sanitised sequence header
# per dataset key, and the embedding stored under that key.
embedding = []
headers = []
# The context manager closes the HDF5 handle as soon as loading is finished;
# `[:]` reads each dataset fully into memory, so nothing below needs the
# file to stay open.
with h5py.File("./KLK_esm2.h5", 'r') as esm2:
    keys = list(esm2.keys())
    for key in keys:
        embedding.append(esm2[key][:])
        # Header clean-up: "isoform=" becomes "isoform_", any remaining "="
        # is dropped, and ":" becomes "_".
        headers.append(
            key.replace("isoform=", "isoform_").replace("=", "").replace(":", "_")
        )

headers = np.array(headers)
embedding = np.array(embedding)

# First hold out 10% of all samples as the test set ...
headers_first, headers_test, embedding_first, embedding_test = train_test_split(headers, embedding, test_size=0.1, random_state=42)
# ... then take 20% of the remainder as the validation set. Both splits use a
# fixed random_state so the partition is the same on every run.
headers_train, headers_validate, embedding_train, embedding_validate = train_test_split(headers_first, embedding_first, test_size =0.2, random_state=42)

# Report split sizes; headers and embeddings must stay the same length.
print("total:", headers.shape)
print("training+validation:", headers_first.shape)
print("test:", headers_test.shape)
print("training:", headers_train.shape)
print("validation:", headers_validate.shape)
print(len(headers_train), len(embedding_train))
print(len(headers_validate), len(embedding_validate))

total: (437,)
training+validation: (393,)
test: (44,)
training: (314,)
validation: (79,)
314 314
79 79


# Dataloaders

In [4]:
# Wrap the training and validation splits in datasets via lib.makedataset.
# NOTE(review): the structure of the items these datasets yield is defined in
# `library`, which is not visible here - confirm it matches what the model's
# step methods unpack.
train_dataset = lib.makedataset(headers_train, embedding_train)
val_dataset = lib.makedataset(headers_validate, embedding_validate)

# Batches of 64 with a single worker process each; only the training loader
# reshuffles its samples, the validation loader keeps a fixed order.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=1,
)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=64,
    shuffle=False,
    num_workers=1,
)

# Training parameters

In [5]:
# Layer widths, all derived from `dim`: the encoder narrows
# dim -> dim/2 -> dim/8 -> dim/16 and the decoder mirrors that sequence.
# NOTE(review): dim is presumably the width of one embedding vector - confirm.
dim = 1024
a = dim // 16
b = dim // 8
c = dim // 2
latent_dim = a
encoder_layers = [dim, c, b, a]
decoder_layers = encoder_layers[::-1]

# Build the VAE from the layer specification above.
vae = lib.VariationalAutoencoder(encoder_layers, latent_dim, decoder_layers)

# Let Lightning pick the accelerator; train for at most 200 epochs.
trainer = pl.Trainer(max_epochs=200, accelerator="auto")

# Fit on the training loader, validating on the validation loader.
# NOTE(review): the recorded run of this cell stops during the sanity check
# with "ValueError: too many values to unpack (expected 2)". The cause is not
# visible in this notebook; presumably the batches produced from
# lib.makedataset do not match what the model's step methods unpack - confirm
# in `library`.
trainer.fit(vae, train_dataloader, val_dataloader)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 600 K 
1 | latent  | Sequential | 4.3 K 
2 | fc21    | Linear     | 4.2 K 
3 | fc22    | Linear     | 4.2 K 
4 | fc3     | Sequential | 4.3 K 
5 | fc4     | Sequential | 4.3 K 
6 | relu    | ReLU       | 0     
7 | decoder | Sequential | 603 K 
---------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.897     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


ValueError: too many values to unpack (expected 2)

In [None]:
# Run a validation pass of `vae` over `val_dataloader` using the existing
# `trainer`. All three names are created in earlier cells, so those cells
# must have run successfully first.
trainer.validate(vae, val_dataloader)

In [None]:
#Plot



