# Import libraries and dependencies

In [1]:
import pandas as pd

from torch.utils.data import DataLoader, random_split

from utility.custom_datasets import ProtT5Dataset, OneHotDataset
from models.simple_model.model import ProtENN2_style
from utility.helpers import extract_unique_pfams, one_hot_encode_pfams, write_fasta_file

# Preparing Dataset

In [2]:
pk_data = pd.read_pickle("dataset/dataset.pkl")
pk_data.reset_index(drop=False, inplace=True, names=["protein_id"])
pk_data.head()

Unnamed: 0,protein_id,pfam_tensor,sequence
0,A1AL94,"[None, None, None, None, None, None, None, Non...",MSLTTLVTQQAPDFTAEAVMADNSFASITLSSLKGKFVLLLFYPLD...
1,A0A0F7V1V9,"[None, None, None, None, None, None, None, Non...",MAACLRAARLSLRQMEGLIEPSVRGRSSPLSMVRLLPSSSVSSSSP...
2,Q4UGC8,"[None, None, None, None, None, None, None, Non...",MKLTGISLISSLSYIRNTLPLKNTLTAFHTLNTRNNLKSVNRITSV...
3,A4H879,"[None, None, None, None, None, None, None, Non...",MSCGDAKMNEPAPPFEEMALMPNGSFKKINLASYKGKWVVLFFYPL...
4,A0A0M5IXP6,"[None, None, None, None, None, None, None, Non...",MNPSANESGVCSPVQIGDAAPNFQARTTLGEMSLSDYRGRWVLLFS...


In [3]:
# creating test dataset
pfams = pk_data[:4000].copy()

Extracting all unique pfams from dataset

In [4]:
all_pfams = extract_unique_pfams(pfams["pfam_tensor"])
len(all_pfams)

116

One hot encoding pfams

In [5]:
pfams["pfam_onehot"] = one_hot_encode_pfams(pfams["pfam_tensor"],list(all_pfams))

In [6]:
pfams.head()

Unnamed: 0,protein_id,pfam_tensor,sequence,pfam_onehot
0,A1AL94,"[None, None, None, None, None, None, None, Non...",MSLTTLVTQQAPDFTAEAVMADNSFASITLSSLKGKFVLLLFYPLD...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,A0A0F7V1V9,"[None, None, None, None, None, None, None, Non...",MAACLRAARLSLRQMEGLIEPSVRGRSSPLSMVRLLPSSSVSSSSP...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,Q4UGC8,"[None, None, None, None, None, None, None, Non...",MKLTGISLISSLSYIRNTLPLKNTLTAFHTLNTRNNLKSVNRITSV...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,A4H879,"[None, None, None, None, None, None, None, Non...",MSCGDAKMNEPAPPFEEMALMPNGSFKKINLASYKGKWVVLFFYPL...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,A0A0M5IXP6,"[None, None, None, None, None, None, None, Non...",MNPSANESGVCSPVQIGDAAPNFQARTTLGEMSLSDYRGRWVLLFS...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [7]:
write_fasta_file(pfams, "test_proteins.fasta")

In [8]:
# create dataset from ProtT5 embeddings (h5 file) and one-hot encoded pfams
data = ProtT5Dataset(embeddings_h5_file_path="test_embeddings.h5", labels=pfams["pfam_onehot"])

# Creating Dataloaders

In [9]:
# set dataset proportions
train_size = int(0.7 * len(data))
val_size = int(0.1 * len(data))
test_size = len(data) - train_size - val_size

train_set, val_set, test_set = random_split(data, [train_size, val_size, test_size])

train_dataloader = DataLoader(train_set, batch_size=64, shuffle=True)
val_loader = DataLoader(val_set, batch_size=32, shuffle=False)
test_loader = DataLoader(test_set, batch_size=32, shuffle=False)

In [10]:
train_dataloader.dataset[1]

(tensor([], size=(0, 1985, 1024)),
 tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]))

ProtT5 embedding data is empty?

In [11]:
import h5py

# Open the HDF5 file in read mode
with h5py.File("test_embeddings.h5", "r") as f:
    # List all top-level groups/datasets
    print("Keys:", list(f.keys()))
    
    # Example: Access the first dataset
    first_key = list(f.keys())[1000]
    data = f[first_key][:]
    print(f"Shape of '{first_key}':", data.shape)
    print(f"First 5 values of '{first_key}':\n", data[:5])

Keys: ['A0A010Q5Q7', 'A0A017SPG0', 'A0A021VWH8', 'A0A026W2W9', 'A0A026WD97', 'A0A060SJ70', 'A0A061AHU5', 'A0A063BYH9', 'A0A066VTR8', 'A0A066XCV1', 'A0A066XUV6', 'A0A067L3Z2', 'A0A067M347', 'A0A067TGL3', 'A0A068RK35', 'A0A072PCS8', 'A0A072V2S7', 'A0A074SC29', 'A0A074VQR9', 'A0A074WYF8', 'A0A074YJQ8', 'A0A075IE34', 'A0A075KHI6', 'A0A078KV17', 'A0A081FXP6', 'A0A084A1A1', 'A0A084GDP7', 'A0A084QCJ0', 'A0A086T1D4', 'A0A086T2G6', 'A0A086T6D7', 'A0A086TAV7', 'A0A090CFW8', 'A0A090D5Y6', 'A0A090V3T8', 'A0A093GKZ5', 'A0A094A5V3', 'A0A094C8K5', 'A0A094H0P1', 'A0A094HQF1', 'A0A098G8B4', 'A0A0A1SX47', 'A0A0A1TDD6', 'A0A0A1TE21', 'A0A0A1TFK2', 'A0A0A1TIJ9', 'A0A0A1TRT9', 'A0A0A2L9J4', 'A0A0A2VDX8', 'A0A0B1PA03', 'A0A0B7N8Y7', 'A0A0B8N738', 'A0A0C1XLB1', 'A0A0C3CAN6', 'A0A0C3DEX1', 'A0A0C3HDB9', 'A0A0C3NME7', 'A0A0C3PBB4', 'A0A0C3QPN8', 'A0A0C4DNT6', 'A0A0C4DQN5', 'A0A0C4DTY0', 'A0A0C4E7R5', 'A0A0C4EA23', 'A0A0C4EAQ6', 'A0A0C4ENE2', 'A0A0C4EUX1', 'A0A0C4EWH6', 'A0A0C4F3A9', 'A0A0C4F573', 'A0A0C4F9E7',