In [1]:
#imports
import pytorch_lightning as pl
import h5py
import os
import pandas as pd
import numpy as np
from torch import nn
from torch.nn import functional as F
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import library as lib

# Set seed

In [2]:
# Seed the global RNGs once, before any data splitting or model construction;
# workers=True asks Lightning to seed dataloader worker processes as well.
pl.seed_everything(seed=42, workers=True)

Global seed set to 42


42

# Load, Format, Split into train/validation/test datasets

In [3]:
# Load the stored embeddings into two parallel arrays:
# headers[i] is the cleaned-up key of embedding[i].
embedding = []
headers = []
# Open the file in a `with` block so the HDF5 handle is released as soon as
# the data has been read, even if reading one of the datasets raises.
with h5py.File("./KLK_esm2.h5", 'r') as esm2:
    # The top-level keys double as the sequence headers.
    keys = list(esm2.keys())
    for key in keys:
        # `[:]` reads the whole dataset into an in-memory array, so the data
        # stays usable after the file is closed.
        embedding.append(esm2[key][:])
        # Clean the header: "isoform=" -> "isoform_", drop any remaining "=",
        # and replace ":" with "_".
        headers.append(key.replace("isoform=", "isoform_").replace("=", "").replace(":", "_"))

headers = np.array(headers)
embedding = np.array(embedding)

# First hold out 10% of the sequences as the test set ...
headers_first, headers_test, embedding_first, embedding_test = train_test_split(
    headers, embedding, test_size=0.1, random_state=42
)
# ... then split 20% of the remainder off as the validation set.
headers_train, headers_validate, embedding_train, embedding_validate = train_test_split(
    headers_first, embedding_first, test_size=0.2, random_state=42
)

# Report the size of every partition.
for label, part in (
    ("total:", headers),
    ("training+validation:", headers_first),
    ("test:", headers_test),
    ("training:", headers_train),
    ("validation:", headers_validate),
):
    print(label, part.shape)

# Headers and embeddings must stay paired, so show both lengths side by side.
for names, vectors in (
    (headers_train, embedding_train),
    (headers_validate, embedding_validate),
):
    print(len(names), len(vectors))

#training_df = pd.DataFrame({'header': headers_train, 'embedding' : embedding_train})
#validate_df = pd.DataFrame({'header': headers_validate, 'embedding' : embedding_validate})

total: (437,)
training+validation: (393,)
test: (44,)
training: (314,)
validation: (79,)
314 314
79 79


# Dataloaders

In [4]:
# Wrap the training and validation splits with the project's dataset helper.
train_dataset = lib.makedataset(headers_train, embedding_train)
val_dataset = lib.makedataset(headers_validate, embedding_validate)

# Settings shared by both loaders.
# NOTE(review): drop_last=True also applies to the validation loader, so a
# final partial batch is skipped there (79 samples at batch size 8 leaves 7
# unevaluated) -- confirm this is intended before changing it.
loader_options = dict(batch_size=8, num_workers=1, drop_last=True)

# Only the training loader shuffles; validation keeps a fixed order.
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_dataloader = DataLoader(val_dataset, shuffle=False, **loader_options)

#for i in train_dataloader:
    #print(i[0][0])
    #print(i[1][0])
    

# Training parameters

In [5]:
# Layer widths handed to the VAE: encoder_layers, latent_dim, decoder_layers.
# `dim` is the widest layer (first encoder width and last decoder width);
# presumably it matches the embedding width -- confirm against the data.
dim = 2560
# The narrower widths are fixed fractions of `dim`: 1/16, 1/8 and 1/2.
a, b, c = (int(dim / divisor) for divisor in (16, 8, 2))
encoder_layers = [dim, c, b, a]
latent_dim = a
# The decoder mirrors the encoder, narrowest layer first.
decoder_layers = encoder_layers[::-1]

# Build the VAE from the encoder widths, latent size and decoder widths.
vae = lib.VariationalAutoencoder(
    encoder_layers,
    latent_dim,
    decoder_layers,
)

# Trainer settings: Lightning picks the accelerator; stop after 200 epochs.
trainer_options = {"accelerator": "auto", "max_epochs": 200}
trainer = pl.Trainer(**trainer_options)

# Fit on the training loader, validating on the validation loader.
# NOTE(review): the saved output of this cell ends with
# "TypeError: cross_entropy_loss(): argument 'input' (position 1) must be
# Tensor, not tuple". No loss is called in this cell, so the offending call is
# presumably inside `library` -- confirm and fix it there.
trainer.fit(vae, train_dataloader, val_dataloader)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 3.7 M 
1 | latent  | Sequential | 26.1 K
2 | fc21    | Linear     | 25.8 K
3 | fc22    | Linear     | 25.8 K
4 | fc3     | Sequential | 26.1 K
5 | fc4     | Sequential | 26.1 K
6 | relu    | ReLU       | 0     
7 | decoder | Sequential | 3.8 M 
---------------------------------------
7.6 M     Trainable params
0         Non-trainable params
7.6 M     Total params
30.491    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


tensor([[-8.5181e-02, -1.6222e-03, -6.6730e-03,  ..., -3.8031e-02,
         -2.0113e-01, -2.7280e-02],
        [-2.8856e-02, -2.4776e-02, -6.5874e-02,  ..., -5.3546e-05,
         -1.3909e-01, -4.8657e-02],
        [-1.8246e-02,  9.5635e-03,  1.1470e-02,  ..., -1.2474e-03,
         -1.2617e-01,  1.1466e-02],
        ...,
        [-4.2723e-02,  6.8552e-03,  1.6257e-02,  ..., -1.4596e-02,
         -1.6469e-01, -1.8185e-02],
        [-4.1624e-02, -8.9031e-03, -4.2955e-02,  ..., -2.2822e-02,
         -1.5990e-01, -1.8496e-03],
        [-2.1711e-02, -2.7434e-03, -4.2189e-02,  ..., -4.5097e-02,
         -1.5113e-01, -4.1616e-02]], device='cuda:0')
tensor([[0.0762, 0.0031, 0.0000,  ..., 0.0402, 0.0321, 0.0204],
        [0.0756, 0.0003, 0.0000,  ..., 0.0421, 0.0285, 0.0230],
        [0.0790, 0.0057, 0.0000,  ..., 0.0478, 0.0344, 0.0221],
        ...,
        [0.0742, 0.0061, 0.0000,  ..., 0.0394, 0.0229, 0.0231],
        [0.0751, 0.0036, 0.0000,  ..., 0.0420, 0.0297, 0.0207],
        [0.0772, 0

TypeError: cross_entropy_loss(): argument 'input' (position 1) must be Tensor, not tuple

In [None]:
# Run a validation pass of `vae` over the validation dataloader.
# Uses the `trainer` and `vae` objects created in the training cell, so that
# cell must have completed first; this cell has no recorded execution.
trainer.validate(vae, val_dataloader)

In [None]:
#Plot



