# PESTO output analysis

In [1]:
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import pandas as pd
import utils
import glob
import re
from datasets import AASequenceDataset
import torch
from torch.utils.data import Dataset, DataLoader

For the ease of analysis we will work with a sample of the data

In [None]:
prot_files = pd.DataFrame({'file' : glob.glob('data/states/*')})
prot_files['UniProt ID'] = prot_files['file'].apply(lambda x : re.search(r'AF-(\w+)-F\d+-model_v1.npy', x).group(1))
prot_files

Unnamed: 0,file,UniProt ID
0,data/states/AF-A0A060CYX6-F1-model_v1.npy,A0A060CYX6
1,data/states/AF-A0A021WW64-F1-model_v1.npy,A0A021WW64
2,data/states/AF-A0A060CYS6-F1-model_v1.npy,A0A060CYS6
3,data/states/AF-A0A023GRW3-F1-model_v1.npy,A0A023GRW3
4,data/states/AF-A0A060CYZ5-F1-model_v1.npy,A0A060CYZ5
...,...,...
340038,data/states/AF-X5M5Y4-F1-model_v1.npy,X5M5Y4
340039,data/states/AF-X5M8U1-F1-model_v1.npy,X5M8U1
340040,data/states/AF-X5ML99-F1-model_v1.npy,X5ML99
340041,data/states/AF-X6R8D5-F1-model_v1.npy,X6R8D5


In [48]:
data = pd.read_csv('data/merged_data_v2.tsv', sep='\t')
data

Unnamed: 0.1,Unnamed: 0,EPSD ID,UniProt ID,AA,Position,phos_site_cnt,seq,seq_len,file
0,0,EP0000001,O00087,"['T', 'S', 'S', 'S']","[138, 139, 206, 207]",4,[12 10 2 15 19 9 11 1 15 0 10 4 1 13 11 13 16 ...,511,data/states/AF-O00087-F1-model_v1.npy
1,1,EP0000002,O00091,"['S', 'S', 'S', 'S']","[163, 485, 498, 53]",4,[12 15 15 0 2 10 15 9 11 6 2 7 0 12 19 19 13 7...,500,data/states/AF-O00091-F1-model_v1.npy
2,2,EP0000004,O00103,"['S', 'S']","[17, 18]",2,[12 3 15 3 12 5 2 5 2 14 8 16 2 15 11 2 15 15 ...,176,data/states/AF-O00103-F1-model_v1.npy
3,3,EP0000005,O00110,"['S', 'Y', 'T', 'T', 'S', 'S', 'S']","[117, 29, 36, 49, 54, 69, 86]",7,[12 14 1 0 13 10 19 1 15 1 1 14 5 14 14 2 17 7...,214,data/states/AF-O00110-F1-model_v1.npy
4,4,EP0000006,O00115,"['Y', 'S', 'Y', 'S', 'S']","[20, 355, 358, 41, 70]",5,[12 9 14 10 10 10 0 0 10 10 4 19 14 0 7 0 10 1...,360,data/states/AF-O00115-F1-model_v1.npy
...,...,...,...,...,...,...,...,...,...
103119,103119,EP0209260,X5LPS1,"['T', 'S']","[19, 21]",2,[12 7 3 1 15 19 3 6 14 0 19 7 4 15 3 3 0 6 16 ...,801,data/states/AF-X5LPS1-F1-model_v1.npy
103120,103120,EP0209262,X5LV34,['S'],[166],1,[12 5 16 7 1 14 14 14 13 13 11 5 12 8 18 14 8 ...,861,data/states/AF-X5LV34-F1-model_v1.npy
103121,103121,EP0209265,X5M5N0,"['T', 'S', 'S', 'S', 'S', 'S', 'T', 'T', 'S', ...","[1069, 1076, 1091, 1094, 1598, 1599, 1603, 160...",11,[12 14 3 15 9 16 2 7 7 1 14 14 0 14 14 15 15 1...,1850,data/states/AF-X5M5N0-F1-model_v1.npy
103122,103122,EP0209266,X5M5W2,"['T', 'S', 'S', 'S', 'S', 'T', 'S', 'S', 'T', ...","[1028, 1029, 1030, 1031, 1037, 1039, 1065, 106...",19,[12 6 3 11 16 15 11 6 11 6 11 15 0 3 11 1 9 3 ...,2318,data/states/AF-X5M5W2-F1-model_v1.npy


In [47]:
data.merge(prot_files, how='inner', on='UniProt ID')

Unnamed: 0.1,Unnamed: 0,EPSD ID,UniProt ID,AA,Position,phos_site_cnt,seq,seq_len,file
0,0,EP0000001,O00087,"['T', 'S', 'S', 'S']","[138, 139, 206, 207]",4,[12 10 2 15 19 9 11 1 15 0 10 4 1 13 11 13 16 ...,511,data/states/AF-O00087-F1-model_v1.npy
1,1,EP0000002,O00091,"['S', 'S', 'S', 'S']","[163, 485, 498, 53]",4,[12 15 15 0 2 10 15 9 11 6 2 7 0 12 19 19 13 7...,500,data/states/AF-O00091-F1-model_v1.npy
2,2,EP0000004,O00103,"['S', 'S']","[17, 18]",2,[12 3 15 3 12 5 2 5 2 14 8 16 2 15 11 2 15 15 ...,176,data/states/AF-O00103-F1-model_v1.npy
3,3,EP0000005,O00110,"['S', 'Y', 'T', 'T', 'S', 'S', 'S']","[117, 29, 36, 49, 54, 69, 86]",7,[12 14 1 0 13 10 19 1 15 1 1 14 5 14 14 2 17 7...,214,data/states/AF-O00110-F1-model_v1.npy
4,4,EP0000006,O00115,"['Y', 'S', 'Y', 'S', 'S']","[20, 355, 358, 41, 70]",5,[12 9 14 10 10 10 0 0 10 10 4 19 14 0 7 0 10 1...,360,data/states/AF-O00115-F1-model_v1.npy
...,...,...,...,...,...,...,...,...,...
103119,103686,EP0209260,X5LPS1,"['T', 'S']","[19, 21]",2,[12 7 3 1 15 19 3 6 14 0 19 7 4 15 3 3 0 6 16 ...,801,data/states/AF-X5LPS1-F1-model_v1.npy
103120,103687,EP0209262,X5LV34,['S'],[166],1,[12 5 16 7 1 14 14 14 13 13 11 5 12 8 18 14 8 ...,861,data/states/AF-X5LV34-F1-model_v1.npy
103121,103688,EP0209265,X5M5N0,"['T', 'S', 'S', 'S', 'S', 'S', 'T', 'T', 'S', ...","[1069, 1076, 1091, 1094, 1598, 1599, 1603, 160...",11,[12 14 3 15 9 16 2 7 7 1 14 14 0 14 14 15 15 1...,1850,data/states/AF-X5M5N0-F1-model_v1.npy
103122,103689,EP0209266,X5M5W2,"['T', 'S', 'S', 'S', 'S', 'T', 'S', 'S', 'T', ...","[1028, 1029, 1030, 1031, 1037, 1039, 1065, 106...",19,[12 6 3 11 16 15 11 6 11 6 11 15 0 3 11 1 9 3 ...,2318,data/states/AF-X5M5W2-F1-model_v1.npy


In [4]:
name_pattern = r'AF-(\w+)-F1-model_v1.npy'
prot_ids = []
pesto_vals = []
for file_name in prot_sample_files:
    prot_ids.append(
        re.search(name_pattern, prot_sample_files[342]).group(1)
    )
    with open(file_name, 'rb') as f:
        pesto_vals.append(np.load(f))
pesto_data = pd.DataFrame({'ID':prot_ids, 'embeddings':pesto_vals})

In [5]:
pesto_data

Unnamed: 0,ID,embeddings
0,Q22798,"[[6.8824115, 4.9083676, 1.5853906, -4.5155897,..."
1,Q22798,"[[-7.9706397, -0.9218383, 1.5963042, 0.2413132..."
2,Q22798,"[[-11.019621, -1.6533804, 2.5373554, -3.363891..."
3,Q22798,"[[-3.0326939, 1.112207, 0.22040679, -5.603626,..."
4,Q22798,"[[-4.865187, 0.8347398, 2.4084833, 2.311211, 2..."
...,...,...
9995,Q22798,"[[-4.4327655, 4.069865, 1.0039551, -3.5745363,..."
9996,Q22798,"[[6.6074142, 3.1016734, 0.89802915, -4.3556876..."
9997,Q22798,"[[-0.969046, 4.3470454, 0.7636617, -3.2490418,..."
9998,Q22798,"[[1.7111797, 6.0833898, 1.925912, -7.098635, -..."


In [25]:
PESTO_DATA = 'data/states/'
sequence_dataset = AASequenceDataset('./data/merged_data_train_v2.tsv', onehot_input=True, multihot_output=True, include_pesto_file=True)
def pesto_collate(x):
    x = list(zip(*x))
    embeddings = []
    protids, _, _, _ = x
    for pid in protids:
        with open(, 'rb') as f:
            embeddings.append(np.load(f))
    return embeddings, x[1], x[2], x[3]
    
dataloader = DataLoader(sequence_dataset, batch_size=30,
                        shuffle=True, num_workers=0, collate_fn=pesto_collate)

In [29]:
for d in dataloader:
    print(d[0][0].shape)

(262, 64)
(535, 64)
(860, 64)
(486, 64)
(947, 64)


FileNotFoundError: [Errno 2] No such file or directory: 'data/states/AF-Q80TZ9-F1-model_v1.npy'