In [9]:
!ln -s /Users/adel.schmucklermann/Desktop/FoPra/script/library.py library.py

ln: library.py: File exists


In [10]:
import pytorch_lightning as pl
import h5py
import os
import torch
import numpy as np
from torch import nn
from torch.nn import functional as F
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import library as lib
import pandas as pd

In [2]:
pl.seed_everything(42, workers=True) 

Global seed set to 42


42

# Data Pre-Processing

In [3]:
#make numpy file with (seq header, embedding)
esm2 = h5py.File("/Users/adel.schmucklermann/Desktop/FoPra/KLK/KLK_esm2.h5", 'r')
list(esm2.keys())
# List all groups
keys = list(esm2.keys())
#print(keys) 
# Get the data
embedding = []
headers = []
for key in keys:
    #data
    emb = esm2[key][:]
    embedding.append(emb)
    #headers
    key = key.replace("isoform=", "isoform_").replace("=", "").replace(":", "_")
    if key == "GZMA_Canis_lupus":
        print("header is now changed")
        key = "GZMA_Canis_lupus_1"
    headers.append(key)

headers = np.array(headers)
embedding = np.array(embedding)

#split train test
headers_first, headers_test, embedding_first, embedding_test = train_test_split(headers, embedding, test_size=0.1, random_state=42)
#split train validation
headers_train, headers_validate, embedding_train, embedding_validate = train_test_split(headers_first, embedding_first, test_size =0.2, random_state=42)
print("total:", headers.shape)
print("training+validation:", headers_first.shape)
print("test:", headers_test.shape)
print("training:", headers_train.shape)
print("validation:", headers_validate.shape)
print(len(headers_train), len(embedding_train))
print(len(headers_validate), len(embedding_validate))

#training_df = pd.DataFrame({'header': headers_train, 'embedding' : embedding_train})
#validate_df = pd.DataFrame({'header': headers_validate, 'embedding' : embedding_validate})

header is now changed
total: (437,)
training+validation: (393,)
test: (44,)
training: (314,)
validation: (79,)
314 314
79 79


# Dataloaders

In [4]:
train_dataset = lib.makedataset(headers_train, embedding_train)
val_dataset = lib.makedataset(headers_validate, embedding_validate)

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=0, drop_last=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=0, drop_last=True)
    

# Training VAE

In [5]:
#if prott5 used: dtype=float16 -> error with .linear()

#encoder_layers, latent_dim, decoder_layers
dim = 2560
a = int(dim * 1/16)
b = int(dim * 1/8)
c = int(dim * 1/2)
encoder_layers = [dim, c, b, a]
latent_dim = a
decoder_layers = [a, b, c, dim]

# Initialize the VAE model
vae = lib.VariationalAutoencoder(encoder_layers, latent_dim, decoder_layers)

trainer = pl.Trainer(accelerator="auto", max_epochs=10, check_val_every_n_epoch=1)
#trainer = pl.Trainer(accelerator="gpu", devices = 1, max_epochs=500, check_val_every_n_epoch=1)

# Train the VAE model
trainer.fit(vae, train_dataloader, val_dataloader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /Users/adel.schmucklermann/Desktop/FoPra/script/lightning_logs

  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 3.7 M 
1 | latent  | Sequential | 26.1 K
2 | fc21    | Linear     | 25.8 K
3 | fc22    | Linear     | 25.8 K
4 | fc3     | Sequential | 26.1 K
5 | fc4     | Sequential | 26.1 K
6 | relu    | ReLU       | 0     
7 | decoder | Sequential | 3.8 M 
---------------------------------------
7.6 M     Trainable params
0         Non-trainable params
7.6 M     Total params
30.491    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

2023-04-06 17:43:49.067964: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


# Validation

In [6]:
trainer.validate(vae, val_dataloader)

Validation: 0it [00:00, ?it/s]

[{'validation_loss': 11.575919151306152}]

# Encode

In [7]:
#encode = vae.encoder(torch.Tensor(embedding))
mu, logvar = vae.encode(torch.Tensor(embedding))
en = vae.reparameterize(mu, logvar)
encode = vae.decode(en)
print(encode)

#encode = encode #[1]


tensor([[ 0.5870,  0.8261, -0.0090,  ...,  0.3287, -1.4833,  0.4336],
        [ 0.5968,  0.3627,  0.3526,  ...,  0.7116, -0.3656,  0.3228],
        [-0.1853, -0.1816, -0.1976,  ..., -0.5695,  0.1777, -0.7030],
        ...,
        [-1.0772,  1.2970, -0.3692,  ...,  0.4776,  0.6877,  0.5839],
        [-0.5280,  0.3657, -1.4455,  ...,  1.1855, -0.2221, -0.5348],
        [-0.0786, -0.8517, -0.8846,  ..., -0.9674,  0.0628, -0.5905]],
       grad_fn=<NativeBatchNormBackward0>)


# Neighbor_joining and create tree

In [12]:
# Neighbor_joining and create tree

from sklearn.manifold import trustworthiness

#write output flag
writer = False

#epochs
epoch = 100

#cosine, euclidean, manhattan, ts_ss, jensenshannon
metric_list = ["cosine","euclidean", "manhattan", "ts_ss"]

for metric in metric_list:
    out_newick = "/Users/adel.schmucklermann/Desktop/FoPra/KLK/KLK_esm2_decoder_"+str(epoch)+"_"+metric+".newick"

    embedding_out = encode.detach() #torch.stack(encode).detach() 
    print(embedding_out)

    #distance metric
    dist = lib.distance_metric(embedding_out)
    distmat = dist.get_metric(embedding_out, metric) 
    
    #neighbor joining to tree
    nj = lib.neighbor_joining(distmat,headers)
    newick = nj.get_newick(distmat,headers)
    
    #trustworthiness
    _distmat  = cophenetic_distmat(newick, names=headers)
    _trustworthiness = trustworthiness(distmat, _distmat, n_neighbors=10, metric='precomputed')
    print( _trustworthiness)
    
    
    #silhouette = silhouette_score(_distmat, headers, metric='precomputed')
    #silhouette = lib.silhouette.get_silhouette(distmat, headers)
    #print("silhouette: ", silhouette)
    

    if writer:
        with open(out_newick, 'w') as w:
            w.write(newick)
            w.close()

tensor([[ 0.5870,  0.8261, -0.0090,  ...,  0.3287, -1.4833,  0.4336],
        [ 0.5968,  0.3627,  0.3526,  ...,  0.7116, -0.3656,  0.3228],
        [-0.1853, -0.1816, -0.1976,  ..., -0.5695,  0.1777, -0.7030],
        ...,
        [-1.0772,  1.2970, -0.3692,  ...,  0.4776,  0.6877,  0.5839],
        [-0.5280,  0.3657, -1.4455,  ...,  1.1855, -0.2221, -0.5348],
        [-0.0786, -0.8517, -0.8846,  ..., -0.9674,  0.0628, -0.5905]])


TypeError: get_metric() takes 2 positional arguments but 3 were given

# UPGMA and create tree

In [None]:
#write output flag
writer = False
from sklearn.metrics import silhouette_score
from itertools import groupby
import string

#epochs
epoch = 100

#cosine, euclidean, manhattan, ts_ss, jensenshannon
metric_list = ["cosine","euclidean", "manhattan", "ts_ss"]

for metric in metric_list:
    out_newick = "/Users/adel.schmucklermann/Desktop/FoPra/KLK/KLK_esm2_decoder_"+str(epoch)+"_"+metric+".newick"

    embedding_out = encode.detach() #torch.stack(encode).detach() 
    print(embedding_out)
    
    #distance metric
    dist = lib.distance_metric(embedding_out)
    distmat = dist.get_metric(embedding_out, metric)
    print(distmat)
    
    #upgma to tree
    upgma = lib.upgma(distmat,headers)
    newick = upgma.get_newick(distmat,headers)
   
    #trustworthiness
    _distmat  = cophenetic_distmat(newick, names=headers)
    _trustworthiness = trustworthiness(distmat, _distmat, n_neighbors=10, metric='precomputed')
    print( _trustworthiness)
    
    
    if writer:
        with open(out_newick, 'w') as w:
            w.write(newick)
            w.close()

# Plot Loss