In [12]:
import pytorch_lightning as pl
import h5py
import os
import torch
import csv
import re
import numpy as np
from torch import nn
from torch.nn import functional as F
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import library as lib
import pandas as pd
from matplotlib.pylab import plt
from sklearn.manifold import trustworthiness

In [13]:
# Fix the global random seed so repeated runs are comparable; Lightning echoes
# the applied seed ("Global seed set to 42").
# NOTE(review): workers=True presumably extends the seeding to DataLoader
# worker processes — confirm against the Lightning documentation.
pl.seed_everything(42, workers=True) 

Global seed set to 42


42

# Parameters

In [22]:
# --- Paths ---
work_dir = "./"
inputs = "./inputs/"
out_path = "./out_dir/reduced/reference/"
exclude = "./inputs/exclude.txt"

# --- Run switches ---
writer = True   # checked before the Newick trees are written to disk
gpu = True      # selects the GPU trainer configuration

# Passed to the trainer as max_epochs and embedded in the output file names.
epoch = 1

# --- Layer widths ---
# dim is the first encoder width; the remaining widths are fixed fractions of it.
dim = 2560
dim1, dim2, dim3 = dim // 4, dim // 16, dim // 32

In [23]:
# Read the exclusion list: the first tab-separated field of every line is a
# header that has to be left out of the dataset.
# newline="" is the open() mode the csv module documents for reader input.
with open(exclude, "r", newline="") as ex:
    reader = csv.reader(ex, delimiter='\t')
    # csv.reader yields [] for blank lines; skip them instead of failing with
    # an IndexError on row[0] (e.g. a trailing empty line in the file).
    column1 = [row[0] for row in reader if row]
#print(column1)

# Data Pre-Processing

In [24]:
# Build two parallel arrays from the ESM-2 HDF5 file: sequence headers and
# their embeddings, skipping every header found in the exclusion list.
embedding = []
headers = []

# Open read-only inside a context manager so the HDF5 handle is released as
# soon as all entries have been read.
with h5py.File(inputs+"KLK_esm2_reduced.h5", 'r') as esm2:
    # Top-level names in the file; each one doubles as a sequence header.
    keys = list(esm2.keys())
    for key in keys:
        # [:] reads the full entry into memory, so the data stays usable after
        # the file is closed (assumes every key is a dataset — TODO confirm).
        emb = esm2[key][:]
        # Clean up malformed headers.
        key = key.replace("isoform=", "isoform_").replace("=", "").replace(":", "_")
        if key == "GZMA_Canis_lupus":
            print("header is now changed")
            key = "GZMA_Canis_lupus_1"
        # Keep only entries that are not on the exclusion list.
        if key not in column1:
            headers.append(key)
            embedding.append(emb)

headers = np.array(headers)
embedding = np.array(embedding)





# Two-stage split with a fixed random_state: first hold out 10% as the test
# set, then take 20% of the remainder as the validation set.
headers_first, headers_test, embedding_first, embedding_test = train_test_split(
    headers, embedding, test_size=0.1, random_state=42
)
headers_train, headers_validate, embedding_train, embedding_validate = train_test_split(
    headers_first, embedding_first, test_size=0.2, random_state=42
)

# Report the size of every partition.
for label, subset in (
    ("total:", headers),
    ("training+validation:", headers_first),
    ("test:", headers_test),
    ("training:", headers_train),
    ("validation:", headers_validate),
):
    print(label, subset.shape)

# Headers and embeddings must stay aligned within each partition.
for names, vectors in (
    (headers_train, embedding_train),
    (headers_validate, embedding_validate),
):
    print(len(names), len(vectors))

#training_df = pd.DataFrame({'header': headers_train, 'embedding' : embedding_train})
#validate_df = pd.DataFrame({'header': headers_validate, 'embedding' : embedding_validate})

header is now changed
total: (213,)
training+validation: (191,)
test: (22,)
training: (152,)
validation: (39,)
152 152
39 39


# Dataloaders

In [25]:
# Wrap the training and validation partitions with the project's dataset helper.
train_dataset = lib.makedataset(headers_train, embedding_train)
val_dataset = lib.makedataset(headers_validate, embedding_validate)

# Training: reshuffle every epoch and drop the trailing partial batch.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=0, drop_last=True)
# Validation: keep the partial batch. The validation split holds 39 headers
# (see the split output), which is fewer than one batch of 64; with
# drop_last=True the loader would yield zero batches and no validation step
# would ever run. (Assumes makedataset yields one item per header — confirm.)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=0, drop_last=False)
    

# Training VAE

In [26]:
# Original note: if ProtT5 embeddings are used (dtype=float16), .linear() errors.

# Layer widths: the encoder narrows dim -> dim3, the decoder mirrors it back.
encoder_layers = [dim, dim1, dim2, dim3]
latent_dim = dim3
decoder_layers = encoder_layers[::-1]

# Initialize the VAE model
vae = lib.VariationalAutoencoder(encoder_layers, latent_dim, decoder_layers)

# Options shared by both trainer variants; only the accelerator choice differs.
trainer_options = {
    "max_epochs": epoch,
    "log_every_n_steps": 4,
    "check_val_every_n_epoch": 1,
}
if gpu:
    trainer_options.update(accelerator="gpu", devices=1)
else:
    trainer_options["accelerator"] = "auto"
trainer = pl.Trainer(**trainer_options)

# Training is switched off in this notebook state; uncomment to fit the model.
#trainer.fit(vae, train_dataloader, val_dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


# Fix Formatting

In [8]:
def find_largest_number(root_dir):
    """Return the highest N among the ``version_N`` folders directly in ``root_dir``.

    Only immediate sub-directories are inspected and the name must be exactly
    ``version_<digits>``: the result is used to build
    ``<root_dir>/version_<N>/metrics.csv``, so digits found in nested or
    unrelated folder names must not be picked up.

    Args:
        root_dir: Directory that contains the per-run ``version_N`` folders.

    Returns:
        The largest version number found, or 0 when ``root_dir`` does not
        exist or holds no matching folder.
    """
    # The first os.walk entry lists root_dir's own children; a missing
    # directory produces no entry at all, hence the empty default.
    _, dir_names, _ = next(os.walk(root_dir), (root_dir, [], []))
    matches = (re.fullmatch(r'version_(\d+)', dir_name) for dir_name in dir_names)
    versions = [int(match.group(1)) for match in matches if match]
    return max(versions, default=0)

largest_num = find_largest_number("./lightning_logs")

# Folder of the most recent run; the raw and the merged metrics both live here.
version_dir = 'lightning_logs/version_' + str(largest_num)

#Fix formatting
with open(version_dir + '/metrics.csv', 'r') as file:
    rows = list(csv.reader(file))

combined_rows = []
header_row = ['train_loss','reconstruction_loss','kl_loss','epoch','step','validation_loss']

combined_rows.append(header_row)
# Rows after the CSV header are consumed in pairs: fields 0-2 come from the
# first row of a pair and fields 3-5 from the second, labelled as in header_row.
# (Presumably the logger writes training and validation values on alternating
# rows — verify against an actual metrics.csv.)
# The values are deliberately NOT bound to notebook-level names such as
# `epoch` or `writer`: those are configuration values read by other cells
# (trainer max_epochs, output file names, the `if writer:` checks) and must
# not be overwritten here.
for i in range(1, len(rows) - 1, 2):
    train_row, val_row = rows[i], rows[i+1]
    combined_rows.append([
        train_row[0], train_row[1], train_row[2],
        val_row[3], val_row[4], val_row[5],
    ])

with open(version_dir + '/new_metrics.csv', 'w', newline = "") as file:
    csv.writer(file).writerows(combined_rows)

FileNotFoundError: [Errno 2] No such file or directory: 'lightning_logs/version_97/metrics.csv'

# Plot Loss

In [None]:
# Load the merged metrics table written by the formatting step.
df = pd.read_csv('lightning_logs/version_' + str(largest_num) + '/new_metrics.csv')

# Second view restricted to the rows whose epoch is at least epoch / 2.
df1 = df[df['epoch'] >= int(epoch)/2]

# Plot 0 uses every row, plot 1 only the filtered view; both share one layout.
for frame in (df, df1):
    for column in ('train_loss', 'reconstruction_loss', 'validation_loss'):
        plt.plot('epoch', column, data=frame)
    plt.title('Epoch vs Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend(['loss', 'reconstruction loss', 'validation_loss'], loc='upper right')
    plt.show()

# Encode

In [27]:
# Pass every embedding through the VAE encoder. This is inference only, so it
# runs under no_grad: no autograd graph is recorded for the whole dataset and
# the result carries no grad_fn (the later .detach() calls still work).
# NOTE(review): the module is left in whatever train/eval mode it is in; if
# lib's VAE contains dropout or batch-norm layers, vae.eval() should probably
# be called first — confirm against lib.
with torch.no_grad():
    encode = vae.encoder(torch.Tensor(embedding))
# Alternative path kept for reference (sample from the latent, then decode):
#mu, logvar = vae.encode(torch.Tensor(embedding))
#en = vae.reparameterize(mu, logvar)
#encode = vae.decode(en)
print(encode)

#encode = encode #[1]


tensor([[0.0000, 0.2255, 0.6357,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 1.9195, 0.0000, 0.4062],
        [0.0000, 0.0000, 0.0000,  ..., 1.3156, 0.0000, 1.3926],
        ...,
        [0.0000, 0.3991, 0.0000,  ..., 0.3835, 0.1256, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.5220,  ..., 1.3675, 0.4419, 1.0097]],
       grad_fn=<ReluBackward0>)


# Neighbor_joining and create tree

In [28]:
# Neighbor joining: build one tree per distance metric from the encoded
# embeddings, score it with trustworthiness and optionally save it as Newick.

#cosine, euclidean, manhattan, ts_ss, jensenshannon
metric_list = ["cosine","euclidean", "manhattan", "ts_ss"]

# Loop-invariant: the encoded matrix does not depend on the metric.
embedding_out = encode.detach() #torch.stack(encode).detach()

if writer:
    # Make sure the output folder exists before the first tree is written;
    # open(..., 'w') does not create missing directories.
    os.makedirs(out_path, exist_ok=True)

for metric in metric_list:
    out_newick = out_path+"KLK_reduced_reference_esm2_nj_"+str(epoch)+"_"+metric+".newick"

    #distance metric
    dist = lib.distance_metric(embedding_out)
    distmat = dist.get_metric(embedding_out, metric)

    #neighbor joining to tree
    nj = lib.neighbor_joining(distmat,headers)
    newick = nj.get_newick(distmat,headers)

    #trustworthiness
    _distmat  = lib.cophenetic_distmat(newick, names=headers)
    # NOTE(review): scikit-learn documents metric='precomputed' as applying to
    # the first argument only; the second argument is described as an
    # (n_samples, n_components) embedding. Passing the cophenetic distance
    # matrix there presumably treats its rows as coordinates — confirm this
    # is the intended comparison.
    _trustworthiness = trustworthiness(distmat, _distmat, n_neighbors=10, metric='precomputed')
    print("trustworthiness: ", _trustworthiness)

    #silhouette = silhouette_score(_distmat, headers, metric='precomputed')
    #silhouette = lib.silhouette.get_silhouette(distmat, headers)
    #print("silhouette: ", silhouette)

    if writer:
        # The with-block closes the file on exit; no explicit close() needed.
        with open(out_newick, 'w') as w:
            w.write(newick)

trustworthiness:  0.9067807690021988
trustworthiness:  0.8005633802816902
trustworthiness:  0.8229844892137637
trustworthiness:  0.79615855470375


# UPGMA and create tree

In [29]:
#write output flag
from sklearn.metrics import silhouette_score
from itertools import groupby
import string


# UPGMA: build one tree per distance metric from the encoded embeddings,
# score it with trustworthiness and optionally save it as Newick.

#cosine, euclidean, manhattan, ts_ss, jensenshannon
metric_list = ["cosine","euclidean", "manhattan", "ts_ss"]

# Loop-invariant: the encoded matrix does not depend on the metric.
embedding_out = encode.detach() #torch.stack(encode).detach()

if writer:
    # Make sure the output folder exists before the first tree is written;
    # open(..., 'w') does not create missing directories.
    os.makedirs(out_path, exist_ok=True)

for metric in metric_list:
    out_newick = out_path+"KLK_reduced_reference_esm2_upgma_"+str(epoch)+"_"+metric+".newick"

    #distance metric
    dist = lib.distance_metric(embedding_out)
    distmat = dist.get_metric(embedding_out, metric)

    #upgma to tree (the result is written to the .newick file as text below)
    upgma = lib.upgma(distmat,headers)

    #trustworthiness
    _distmat  = lib.cophenetic_distmat(upgma, names=headers)
    # NOTE(review): scikit-learn documents metric='precomputed' as applying to
    # the first argument only; the second argument is described as an
    # (n_samples, n_components) embedding. Passing the cophenetic distance
    # matrix there presumably treats its rows as coordinates — confirm this
    # is the intended comparison.
    _trustworthiness = trustworthiness(distmat, _distmat, n_neighbors=10, metric='precomputed')
    print("trustworthiness: ", _trustworthiness)

    if writer:
        # The with-block closes the file on exit; no explicit close() needed.
        with open(out_newick, 'w') as w:
            w.write(upgma)

trustworthiness:  0.8998752005705117
trustworthiness:  0.8097961609318358
trustworthiness:  0.8442954775063886
trustworthiness:  0.8023557378023415
