### Inspection of the embedding space in the presence of query and reference data

In [1]:
# general libs
import numpy as np
import pandas as pd
import glob
import pickle
import torch

from random import sample

import re

# torch libs
import torch
import torch.nn as nn
import torch.nn.functional as F

# latent space libs
import umap
import seaborn as sns
import matplotlib.pyplot as plt
import random
from sklearn.preprocessing import StandardScaler

# plotting libs
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx
from pandas import read_csv

# Categorical colour palette of 28 hex codes: 24 base colours followed by
# four that were appended afterwards.
zeileis_28 = [
    "#023fa5", "#7d87b9", "#bec1d4", "#d6bcc0",
    "#bb7784", "#8e063b", "#4a6fe3", "#8595e1",
    "#b5bbe3", "#e6afb9", "#e07b91", "#d33f6a",
    "#11c638", "#8dd593", "#c6dec7", "#ead3c6",
    "#f0b98d", "#ef9708", "#0fcfc0", "#9cded6",
    "#d5eae7", "#f3e1eb", "#f6c4e1", "#f79cd4",
    # these last ones were added:
    "#7f7f7f", "#c7c7c7", "#1CE6FF", "#336600",
]

In [None]:
# Load the query evidence table and prepare it for concatenation with the
# reference data.
# NOTE(review): absolute, site-specific path -- consider a configurable data directory.
query_evidence = pd.read_csv('/stornext/General/data/academic/lab_davis/prot/benchmarking/PXD014777/HeLa_10_replicates_2hr/evidence.txt',
                             sep='\t', engine='python', header=0)
pd.set_option('display.max_columns', None)


## data preprocessing
# keep rows with a positive intensity and drop singly charged features
query_evidence = query_evidence.loc[(query_evidence['Intensity'] > 0) &
                                    (query_evidence['Charge'] != 1)]

# keep the most intense row per (modified sequence, charge, raw file)
query_evidence = query_evidence.loc[query_evidence.groupby(['Modified sequence', 'Charge', 'Raw file'])['Intensity'].idxmax()]


# remove contaminants: rows whose leading razor protein matches CON__ or REV__.
# The column is cast to str first so that missing protein names (NaN) do not
# raise; they simply do not match the pattern and are kept.
is_contaminant = query_evidence['Leading razor protein'].astype(str).str.contains('CON__|REV__', regex=True)
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the filtered table.
query_evidence = query_evidence.loc[~is_contaminant].copy()


query_evidence['study'] = 'query'
query_evidence['Species'] = 'HeLa'
# PrecursorID must follow the reference convention "<Modified sequence>_<Charge>"
# (e.g. '_VISNPLLARK__2'); with an empty separator query and reference IDs
# could never match.
query_evidence['PrecursorID'] = query_evidence['Modified sequence'].astype(str).str.cat(query_evidence.Charge.astype(str), sep='_')


query_evidence


In [2]:
# Load the reference (training) evidence table from a pickle in the working directory.
# NOTE(review): unpickling executes arbitrary code -- only load files from a trusted source.
evidence_train = pd.read_pickle('PXD019086_PXD010012_combined_evidence_train_90Kto20Ksplit_5query_1shot_allPeptidesTxtFeatures_modSeqSpecies.pkl')

In [3]:
# Display a preview of the reference evidence table.
evidence_train

Unnamed: 0,Modified sequence,Sequence,Charge,Mass,m/z,Retention time,CCS,Gene names,Length,Raw file,...,Mass error [ppm],CCS length,Retention length,Ion mobility index,Ion mobility length,Number of isotopic peaks,Species,study,experiment_type,PrecursorID
1853774,_VISNPLLARK_,VISNPLLARK,2,1109.69208,555.853316,49.447,372.288031,RPS24B;RPS24A,10,20190304_tims03_FlMe_SA_200ng_Yeast_Lysc_IRT_F...,...,1.50620,6.307007,0.27169,615,16,3,Yeast,PXD019086,200ng_Yeast_Lysc_IRT_Fraction,_VISNPLLARK__2
1853775,_VISNPLLARK_,VISNPLLARK,2,1109.69208,555.853316,43.470,376.070427,RPS24B;RPS24A,10,20190304_tims03_FlMe_SA_200ng_Yeast_Lysc_IRT_F...,...,0.91928,89.609762,1.80470,606,214,5,Yeast,PXD019086,200ng_Yeast_Lysc_IRT_Fraction,_VISNPLLARK__2
1853776,_VISNPLLARK_,VISNPLLARK,2,1109.69208,555.853316,43.769,374.809613,RPS24B;RPS24A,10,20190304_tims03_FlMe_SA_200ng_Yeast_Lysc_IRT_F...,...,0.20340,42.836264,1.08670,609,103,5,Yeast,PXD019086,200ng_Yeast_Lysc_IRT_Fraction,_VISNPLLARK__2
1853781,_VISNPLLARK_,VISNPLLARK,2,1109.69208,555.853316,43.610,382.372619,RPS24B;RPS24A,10,20190304_tims03_FlMe_SA_200ng_Yeast_Lysc_IRT_F...,...,-0.16818,17.638467,0.65977,591,43,4,Yeast,PXD019086,200ng_Yeast_Lysc_IRT_Fraction,_VISNPLLARK__2
1853782,_VISNPLLARK_,VISNPLLARK,2,1109.69208,555.853316,43.374,378.591787,RPS24B;RPS24A,10,20190304_tims03_FlMe_SA_200ng_Yeast_Lysc_IRT_F...,...,0.12008,17.648187,0.62096,600,43,4,Yeast,PXD019086,200ng_Yeast_Lysc_IRT_Fraction,_VISNPLLARK__2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1686997,_YYYVPADFVEYEK_,YYYVPADFVEYEK,2,1684.76609,843.390322,4159.100,,PHKB,13,20180631_TIMS2_12-2_AnBr_SA_200ng_HeLa_50cm_10...,...,3.32690,,31.87200,446,31,5,HeLa,PXD010012,200ng_HeLa_50cm_100ms_120min_Fraction,_YYYVPADFVEYEK__2
1686998,_YYYVPADFVEYEK_,YYYVPADFVEYEK,2,1684.76609,843.390322,4131.700,,PHKB,13,20180631_TIMS2_12-2_AnBr_SA_200ng_HeLa_50cm_10...,...,3.71580,,19.10000,432,28,5,HeLa,PXD010012,200ng_HeLa_50cm_100ms_120min_Fraction,_YYYVPADFVEYEK__2
1686999,_YYYVPADFVEYEK_,YYYVPADFVEYEK,2,1684.76609,843.390322,4126.700,,PHKB,13,20180631_TIMS2_12-2_AnBr_SA_200ng_HeLa_50cm_10...,...,5.06560,,38.03200,437,49,6,HeLa,PXD010012,200ng_HeLa_50cm_100ms_120min_Fraction,_YYYVPADFVEYEK__2
669906,_YYYVPADFVEYEK_,YYYVPADFVEYEK,2,1684.76609,843.390322,92.273,449.532607,,13,20190504_TIMS1_FlMe_SA_HeLa_frac10_B11_1_102,...,5.13220,18.487382,0.55038,426,46,6,HeLa,PXD019086,HeLa_frac10,_YYYVPADFVEYEK__2


In [None]:
# append query to ref, shuffle rows, subsample and run umap

# Stack query rows on top of the reference rows; columns present in only one
# table are kept and filled with NaN for the other.
data = pd.concat([query_evidence, evidence_train], sort=False, ignore_index=True)
# Shuffle with a fixed seed so the row order (and everything indexed by row
# position further down) is the same on every run.
data = data.sample(frac=1, random_state=64).reset_index(drop=True)

In [None]:
# high freq idents
# Count rows per (PrecursorID, study) and show only the most frequent groups.
# The display limit is deliberately left at its default: lifting
# 'display.max_rows' globally would make every later DataFrame display print
# all of its rows.
grouped = data.groupby(['PrecursorID', 'study'])
grouped.size().sort_values(ascending=False).head(50)

In [4]:
# Number of input features; the loaded model and attr_names both use 8.
input_dim = 8

class Encoder(nn.Module):
    """Single-hidden-layer encoder that returns the latent mean as the embedding.

    Args:
        input_dim: number of input features per row.
        hidden_dim: width of the hidden layer.
        latent_dim: dimensionality of the returned embedding.

    The ``var_encoder`` head is kept as an attribute so that the module
    structure stays the same as before, but it is not evaluated in
    ``forward`` because its output was never used.
    """

    def __init__(self, input_dim=8, hidden_dim=64, latent_dim=10):
        super().__init__()
        # feature extractor: one linear layer followed by ReLU
        self.encoder = nn.Sequential(
                    nn.Linear(input_dim, hidden_dim),
                    nn.ReLU()
            )

        self.mean_encoder = nn.Linear(hidden_dim, latent_dim)
        self.var_encoder = nn.Linear(hidden_dim, latent_dim)


    def forward(self, x):
        """Map ``x`` of shape (n, input_dim) to its latent mean of shape (n, latent_dim).

        No sampling takes place: the embedding is deterministic and the call
        does not draw from torch's random number generator.
        """
        hidden = self.encoder(x)
        return self.mean_encoder(hidden)
        

In [5]:
# Load the serialised model onto the CPU and put it in evaluation mode.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a
# trusted source. Presumably the Encoder class must already be defined in this
# session for the load to succeed -- confirm.
model = torch.load('../peptideprotonet/PXD019086_PXD010012_combined_evidence_90Kto20Ksplit_5query_1shot_fullmodel_featuresScaled_allPeptidesTxtFeatures_modSeqSpecies_hidden64_latent10_maxEpoch300_164trainways_xlatent_conditionalEmbedding.pth' ,
                          map_location=torch.device('cpu'))
model.eval()

Encoder(
  (encoder): Sequential(
    (0): Linear(in_features=8, out_features=64, bias=True)
    (1): ReLU()
  )
  (mean_encoder): Linear(in_features=64, out_features=10, bias=True)
  (var_encoder): Linear(in_features=64, out_features=10, bias=True)
)

In [6]:
# Inference device; set to 'cuda' to run on a GPU instead.
device = 'cpu'

# Columns fed to the encoder, in this order.
attr_names = [
    'Charge',
    'Mass',
    'm/z',
    'Retention time',
    'Retention length',
    'Ion mobility index',
    'Ion mobility length',
    'Number of isotopic peaks',
]


In [None]:
# Build the float32 feature matrix directly as a NumPy array (what the scaler
# works on) instead of round-tripping through a torch tensor.
x2 = data[attr_names].to_numpy(dtype=np.float32)
# NOTE(review): the scaler is fitted on this table itself, not on the
# statistics used when the model was trained -- confirm this is intended.
x2 = StandardScaler().fit_transform(x2)
print(x2.shape)
# Inference only: without no_grad() autograd would keep the graph for every
# row, which costs memory and is never used.
with torch.no_grad():
    z_query = model(torch.from_numpy(x2).float().to(device))
print(z_query.shape)

In [None]:
# Detach the latent coordinates from torch and expose them as a NumPy array.
protonet_embedding = z_query.detach().cpu().numpy()
print(protonet_embedding.shape)


In [None]:
# np.random.choice draws from NumPy's global generator, which random.seed()
# does not touch -- seed both so the subsample is reproducible.
random.seed(64)
np.random.seed(64)
# Never ask for more rows than exist (choice(..., replace=False) would raise).
n_subsample = min(200000, protonet_embedding.shape[0])
row_idx = np.random.choice(protonet_embedding.shape[0], n_subsample, replace=False)
z_subsample = protonet_embedding[row_idx]

In [None]:
# Project the subsampled latent vectors to 2-D with UMAP using cosine distance.
# NOTE(review): no random_state is passed, so the layout presumably differs
# between runs -- confirm whether a fixed seed is wanted.
reducer = umap.UMAP(metric = 'cosine')
embedding = reducer.fit_transform(z_subsample)

In [None]:
# One colour per study, plus proxy handles so the legend shows readable dots
# (the scatter points themselves are too small to serve as legend markers).
color_dict = {'PXD019086':'tab:blue', 'PXD010012':'tab:orange', 'query':'tab:green'}
markers = [plt.Line2D([0], [0], color=c, linestyle='', marker='o') for c in color_dict.values()]
labels = list(color_dict.keys())


plt.scatter(embedding[:,0], embedding[:,1],
           c=data.study[row_idx].map(color_dict), s = 0.1)
#plt.gca().set_aspect('equal', 'datalim')
plt.xlabel('UMAP1')
plt.ylabel('UMAP2')
# Single legend call with explicit handles; a bare plt.legend() before it had
# no labelled artists to show and was replaced by this one anyway.
plt.legend(markers, labels, bbox_to_anchor=(1.05, 1), loc='upper left')
plt.title('peptideprotonet latent space')

In [None]:
# color by charge - purple is charge 2
charge_state = data.Charge.astype("category")
charge_state = charge_state.to_numpy()
# charge of each subsampled row, in the same order as the embedding rows
charge_sub = charge_state[row_idx]


plt.scatter(embedding[:,0], embedding[:,1],
           c=charge_sub, s = 0.1)
#plt.gca().set_aspect('equal', 'datalim')
plt.xlabel('UMAP1')
plt.ylabel('UMAP2')
# The colorbar is the key for this plot; no legend is drawn because the
# scatter has no labelled artists.
plt.colorbar()
plt.title('peptideprotonet latent space')

In [None]:
# Colour the embedding by CCS.
# NOTE(review): CCS is missing (NaN) for part of the reference data -- check
# how those points are rendered.
plt.scatter(embedding[:,0], embedding[:,1],
           c=data.CCS[row_idx], s = 0.1)
#plt.gca().set_aspect('equal', 'datalim')
plt.xlabel('UMAP1')
plt.ylabel('UMAP2')
# The colorbar is the key; no legend is drawn because nothing is labelled.
plt.colorbar()
plt.title('peptideprotonet latent space\nCCS')

In [None]:
# Colour the embedding by retention time.
plt.scatter(embedding[:,0], embedding[:,1],
           c=data['Retention time'][row_idx], s = 0.1)
#plt.gca().set_aspect('equal', 'datalim')
plt.xlabel('UMAP1')
plt.ylabel('UMAP2')
# The colorbar is the key; no legend is drawn because nothing is labelled.
plt.colorbar()
plt.title('peptideprotonet latent space\nRT')
# plt.savefig('latent_twoStudy_conditionalModel_byRT.png')

In [None]:
# Show only the subsampled rows that belong to the query study, coloured by charge.
# The mask is computed once, as a positional boolean array aligned with the
# rows of `embedding`.
is_query = data.study[row_idx].isin(['query']).to_numpy()

charge_state = data.Charge.astype("category")
charge_state = charge_state.to_numpy()
charge_sub = charge_state[row_idx][is_query]


plt.scatter(embedding[is_query, 0],
            embedding[is_query, 1],
           c=charge_sub, s = 0.1)
#plt.gca().set_aspect('equal', 'datalim')
plt.xlabel('UMAP1')
plt.ylabel('UMAP2')
# The colorbar is the key; no legend is drawn because nothing is labelled.
plt.colorbar()
plt.title('peptideprotonet latent space')

In [None]:
# Highlight every occurrence of the chosen precursor(s) on top of the full embedding.
# query = ['_IYNSIYIGSQDALIAHYPR__2']
#query = ['_KAEEEHLGILGPQLHADVGDK__3']
query = ['_KPADDQDPIDALSGDLDSCPSTTETSQNTAK__3']


modified_seq = data.PrecursorID
# 1 where the row's PrecursorID is one of the queried precursors, 0 elsewhere
query_pep = modified_seq.isin(query).to_numpy().astype('int')

# restrict the flag to the subsampled rows that were embedded
is_hit = query_pep[row_idx] == 1
xs = embedding[is_hit, 0]
ys = embedding[is_hit, 1]


# faint background: all embedded points
plt.scatter(embedding[:,0], embedding[:,1],
           c='lavender', s = 0.4)


# foreground: the queried precursor, coloured by study
plt.scatter(xs, ys,
           c= data.study[row_idx][is_hit].map(color_dict),
            s = 10.1)
#plt.gca().set_aspect('equal', 'datalim')
# Explicit proxy handles: the scatter carries no labels, so a bare
# plt.legend() would have nothing to show.
study_handles = [plt.Line2D([0], [0], color=c, linestyle='', marker='o') for c in color_dict.values()]
plt.legend(study_handles, list(color_dict.keys()), bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xlabel('UMAP1')
plt.ylabel('UMAP2')
# Title is derived from `query` so it cannot go stale when the peptide is changed.
plt.title('peptideprotonet latent space\n ' + ', '.join(query))
# plt.savefig('latent_twoStudy_conditionalModel_peptide2.png', bbox_inches='tight')

### Working with allPeptides table

#### Embedding query runs only

In [None]:
# Load the query allPeptides table (tab-separated, first row is the header).
# NOTE(review): absolute, site-specific path -- consider a configurable data directory.
query_allpeptides = pd.read_csv('/stornext/General/data/academic/lab_davis/prot/benchmarking/PXD014777/HeLa_10_replicates_2hr/allPeptides.txt', sep='\t', engine='python', header=0)

In [None]:
# Tag every row with the study and species labels used by the reference table.
query_allpeptides = query_allpeptides.assign(study='query', Species='HeLa')

In [None]:
# Preview the first ten rows of the query table.
query_allpeptides.head(10)

In [None]:
# query_allpeptides = query_allpeptides.loc[(query_allpeptides['Intensity'] > 0) & \
#                                 (query_allpeptides['Charge'] != 1) &\
#                                 (query_allpeptides['Isotope correlation'] > 0.5) ]

In [None]:
# (rows, columns) of the query table.
query_allpeptides.shape

In [None]:
# The allPeptides table calls the ion-mobility length column
# 'Ion mobility index length', whereas the evidence tables call it
# 'Ion mobility length'. A separate name is used here so the global
# `attr_names` (needed later for the evidence-style tables) is not overwritten.
allpeptides_attr_names = ['Charge','Mass', 'm/z', 'Retention time','Retention length',
                          'Ion mobility index', 'Ion mobility index length','Number of isotopic peaks']
x = query_allpeptides[allpeptides_attr_names].to_numpy(dtype=np.float32)
# NOTE(review): the scaler is fitted on the query table itself -- confirm this
# matches how features were scaled for training.
x = StandardScaler().fit_transform(x)
print(x.shape)
# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    z = model(torch.from_numpy(x).float().to(device))

In [None]:
# Detach the latent coordinates from torch and expose them as a NumPy array.
protonet_embedding_z = z.detach().cpu().numpy()
print(protonet_embedding_z.shape)

In [None]:
# Wrap the latent coordinates in a DataFrame with columns dim_0 .. dim_9 so
# the metadata can be attached next to them.
latent_dim = 10
dim_columns = [f'dim_{i}' for i in range(latent_dim)]
latent_space = pd.DataFrame(protonet_embedding_z, columns=dim_columns)


In [None]:
# Attach the query metadata columns to the latent coordinates (column-wise, matched on the row index).
# NOTE(review): this rebinds `latent_space` from itself, so running the cell a
# second time appends the metadata columns again.
latent_space = pd.concat([latent_space, query_allpeptides], axis = 1)
latent_space.head(15)

In [None]:
# (rows, columns) of the combined embedding + metadata table.
latent_space.shape

In [None]:
# Write the query embedding and its metadata to a CSV in the working directory.
latent_space.to_csv('peptideprotonet_embedding_space_90Kto20KsplitTrain_epoch300_featuresScaled_HeLa10Reps2hr_noEvidenceTrain.csv')

#### Embedding query runs with reference data

In [10]:
# data = pd.concat([query_allpeptides, evidence_train], sort=False, ignore_index=True)
# Replace the reference table's index with 0..n-1, in place; the column-wise
# concat with the latent coordinates further down matches rows on this index.
evidence_train.reset_index(inplace=True, drop=True)


In [11]:
# From here on `data` is the reference table only (same object as
# evidence_train, not a copy); the query rows are not included.
data = evidence_train
data

Unnamed: 0,Modified sequence,Sequence,Charge,Mass,m/z,Retention time,CCS,Gene names,Length,Raw file,...,Mass error [ppm],CCS length,Retention length,Ion mobility index,Ion mobility length,Number of isotopic peaks,Species,study,experiment_type,PrecursorID
0,_VISNPLLARK_,VISNPLLARK,2,1109.69208,555.853316,49.447,372.288031,RPS24B;RPS24A,10,20190304_tims03_FlMe_SA_200ng_Yeast_Lysc_IRT_F...,...,1.50620,6.307007,0.27169,615,16,3,Yeast,PXD019086,200ng_Yeast_Lysc_IRT_Fraction,_VISNPLLARK__2
1,_VISNPLLARK_,VISNPLLARK,2,1109.69208,555.853316,43.470,376.070427,RPS24B;RPS24A,10,20190304_tims03_FlMe_SA_200ng_Yeast_Lysc_IRT_F...,...,0.91928,89.609762,1.80470,606,214,5,Yeast,PXD019086,200ng_Yeast_Lysc_IRT_Fraction,_VISNPLLARK__2
2,_VISNPLLARK_,VISNPLLARK,2,1109.69208,555.853316,43.769,374.809613,RPS24B;RPS24A,10,20190304_tims03_FlMe_SA_200ng_Yeast_Lysc_IRT_F...,...,0.20340,42.836264,1.08670,609,103,5,Yeast,PXD019086,200ng_Yeast_Lysc_IRT_Fraction,_VISNPLLARK__2
3,_VISNPLLARK_,VISNPLLARK,2,1109.69208,555.853316,43.610,382.372619,RPS24B;RPS24A,10,20190304_tims03_FlMe_SA_200ng_Yeast_Lysc_IRT_F...,...,-0.16818,17.638467,0.65977,591,43,4,Yeast,PXD019086,200ng_Yeast_Lysc_IRT_Fraction,_VISNPLLARK__2
4,_VISNPLLARK_,VISNPLLARK,2,1109.69208,555.853316,43.374,378.591787,RPS24B;RPS24A,10,20190304_tims03_FlMe_SA_200ng_Yeast_Lysc_IRT_F...,...,0.12008,17.648187,0.62096,600,43,4,Yeast,PXD019086,200ng_Yeast_Lysc_IRT_Fraction,_VISNPLLARK__2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2173818,_YYYVPADFVEYEK_,YYYVPADFVEYEK,2,1684.76609,843.390322,4159.100,,PHKB,13,20180631_TIMS2_12-2_AnBr_SA_200ng_HeLa_50cm_10...,...,3.32690,,31.87200,446,31,5,HeLa,PXD010012,200ng_HeLa_50cm_100ms_120min_Fraction,_YYYVPADFVEYEK__2
2173819,_YYYVPADFVEYEK_,YYYVPADFVEYEK,2,1684.76609,843.390322,4131.700,,PHKB,13,20180631_TIMS2_12-2_AnBr_SA_200ng_HeLa_50cm_10...,...,3.71580,,19.10000,432,28,5,HeLa,PXD010012,200ng_HeLa_50cm_100ms_120min_Fraction,_YYYVPADFVEYEK__2
2173820,_YYYVPADFVEYEK_,YYYVPADFVEYEK,2,1684.76609,843.390322,4126.700,,PHKB,13,20180631_TIMS2_12-2_AnBr_SA_200ng_HeLa_50cm_10...,...,5.06560,,38.03200,437,49,6,HeLa,PXD010012,200ng_HeLa_50cm_100ms_120min_Fraction,_YYYVPADFVEYEK__2
2173821,_YYYVPADFVEYEK_,YYYVPADFVEYEK,2,1684.76609,843.390322,92.273,449.532607,,13,20190504_TIMS1_FlMe_SA_HeLa_frac10_B11_1_102,...,5.13220,18.487382,0.55038,426,46,6,HeLa,PXD019086,HeLa_frac10,_YYYVPADFVEYEK__2


In [12]:
# Build the float32 feature matrix directly as a NumPy array instead of
# round-tripping through a torch tensor before scaling.
x = data[attr_names].to_numpy(dtype=np.float32)
x = StandardScaler().fit_transform(x)
print(x.shape)
# Inference only: without no_grad() autograd would keep the graph for every
# row of the table, which costs memory and is never used.
with torch.no_grad():
    z = model(torch.from_numpy(x).float().to(device))
protonet_embedding_z = z.cpu().detach().numpy()

# append all meta-data
latent_dim = 10
latent_space = pd.DataFrame(protonet_embedding_z, columns = ['dim_'+str(i) for i in range(latent_dim)])


(2173823, 8)


In [13]:
# Put the latent coordinates next to the reference metadata (column-wise, matched on the row index).
# NOTE(review): this relies on `data` having a 0..n-1 index like `latent_space` -- confirm.
ref_latent_space = pd.concat([latent_space, data], axis = 1)
ref_latent_space.head(15)

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,...,Mass error [ppm],CCS length,Retention length,Ion mobility index,Ion mobility length,Number of isotopic peaks,Species,study,experiment_type,PrecursorID
0,-86.978577,-79.470383,65.228668,45.599819,-89.245079,20.779688,66.632835,-76.245003,-84.255302,-72.529778,...,1.5062,6.307007,0.27169,615,16,3,Yeast,PXD019086,200ng_Yeast_Lysc_IRT_Fraction,_VISNPLLARK__2
1,-86.206093,-79.878128,64.184944,44.833485,-89.427238,21.805971,67.89727,-75.245369,-82.828827,-71.286896,...,0.91928,89.609762,1.8047,606,214,5,Yeast,PXD019086,200ng_Yeast_Lysc_IRT_Fraction,_VISNPLLARK__2
2,-86.598953,-80.067009,64.785507,44.829628,-89.540253,21.085287,67.789536,-75.920822,-83.157516,-71.844193,...,0.2034,42.836264,1.0867,609,103,5,Yeast,PXD019086,200ng_Yeast_Lysc_IRT_Fraction,_VISNPLLARK__2
3,-87.139153,-79.73098,65.152618,45.008793,-89.64933,20.669962,67.162071,-76.152069,-83.717529,-72.210388,...,-0.16818,17.638467,0.65977,591,43,4,Yeast,PXD019086,200ng_Yeast_Lysc_IRT_Fraction,_VISNPLLARK__2
4,-87.062927,-79.728523,65.228172,45.045128,-89.594131,20.732605,67.196365,-76.151741,-83.736389,-72.26136,...,0.12008,17.648187,0.62096,600,43,4,Yeast,PXD019086,200ng_Yeast_Lysc_IRT_Fraction,_VISNPLLARK__2
5,-86.858612,-79.754982,65.258568,45.294476,-89.274117,20.813955,67.059807,-76.322487,-83.93721,-72.434326,...,-1.8926,13.875392,1.6689,609,34,4,Yeast,PXD019086,200ng_Yeast_Lysc_IRT_Fraction,_VISNPLLARK__2
6,-86.97654,-79.474998,65.227623,45.601723,-89.24395,20.783102,66.634705,-76.248169,-84.25531,-72.529312,...,0.21567,6.308166,0.32989,615,16,3,Yeast,PXD019086,200ng_Yeast_Lysc_IRT_Fraction,_VISNPLLARK__2
7,-86.889931,-79.505653,65.372719,45.516525,-89.312706,20.938604,66.818207,-76.137093,-84.155876,-72.525726,...,-0.71341,8.836321,0.32988,624,22,3,Yeast,PXD019086,200ng_Yeast_Lysc_IRT_Fraction,_VISNPLLARK__2
8,-86.898346,-79.526871,65.40274,45.459351,-89.382401,20.978662,66.892044,-76.091164,-84.098892,-72.494858,...,0.16795,10.099588,0.28759,624,25,3,Yeast,PXD019086,200ng_Yeast_Lysc_IRT_Fraction,_VISNPLLARK__2
9,-86.803391,-79.859009,65.31884,44.916489,-89.377068,20.748774,67.461594,-76.267113,-83.591263,-72.321503,...,-0.21006,22.719842,0.64041,615,55,5,Yeast,PXD019086,200ng_Yeast_Lysc_IRT_Fraction,_VISNPLLARK__2


In [14]:
# Write the reference embedding and its metadata to a CSV in the working directory.
ref_latent_space.to_csv('peptideprotonet_embedding_space_90Kto20KsplitTrain_epoch300_featuresScaled_EvidenceTrain.csv')

In [None]:
#### plotting the embedding

In [None]:
# np.random.choice draws from NumPy's global generator, which random.seed()
# does not touch -- seed both so the subsample is reproducible.
random.seed(614)
np.random.seed(614)
# Never ask for more rows than exist (choice(..., replace=False) would raise).
n_subsample = min(50000, protonet_embedding_z.shape[0])
row_idx = np.random.choice(protonet_embedding_z.shape[0], n_subsample, replace=False)
z_subsample = protonet_embedding_z[row_idx]

In [None]:
# Project the subsampled latent vectors to 2-D with UMAP using cosine distance.
# NOTE(review): no random_state is passed, so the layout presumably differs
# between runs -- confirm whether a fixed seed is wanted.
reducer = umap.UMAP(metric = 'cosine')
embedding = reducer.fit_transform(z_subsample)

In [None]:
# color by charge - purple is charge 2
# The charges must come from `data`, the table whose rows produced
# protonet_embedding_z (and therefore row_idx); taking them from a different
# table would colour points with unrelated rows or index out of range.
charge_state = data.Charge.astype("category")
charge_state = charge_state.to_numpy()
charge_sub = charge_state[row_idx]


plt.scatter(embedding[:,0], embedding[:,1],
           c=charge_sub, s = 0.1)
#plt.gca().set_aspect('equal', 'datalim')
plt.xlabel('UMAP1')
plt.ylabel('UMAP2')
# The colorbar is the key; no legend is drawn because nothing is labelled.
plt.colorbar()
plt.title('peptideprotonet latent space')

### Reload evidence and allPeptide tables and apply the model to them

### Old stuff starts here

In [None]:
# Read both PXD023049 tables with the same parser settings and report their sizes.
read_opts = dict(sep='\t', engine='python', header=0)

evidence = pd.read_csv('PXD023049/txt/evidence.txt', **read_opts)
print(evidence.shape)
allPeptide = pd.read_csv('PXD023049/txt/allPeptides.txt', **read_opts)
print(allPeptide.shape)

# Give the ion-mobility length column the same name the evidence table uses.
allPeptide = allPeptide.rename(columns={'Ion mobility index length': 'Ion mobility length'})

# df_diff = pd.concat([allPeptide[attrs], evidence[attrs]]).drop_duplicates(keep=False)
# print(df_diff.shape)




In [None]:
# evidence.head()

In [None]:
# Round retention time to 1 decimal and retention length to 3 decimals.
# NOTE(review): presumably to make the values comparable with allPeptides; neither
# column is currently used as a merge key below -- confirm whether this is still needed.
evidence = evidence.round({'Retention time': 1,  'Retention length':3})

In [None]:
# Columns used as the join key between evidence and allPeptide rows.
# Not used as keys (disabled): 'Mass', 'm/z', 'Retention time',
# 'Retention length', 'Ion mobility length'.
attrs = ['Raw file', 'Charge', 'Ion mobility index', 'MS/MS scan number',
         'Number of isotopic peaks', 'Intensity']

# Outer join; the 'Exist' column records whether a row came from evidence only
# ('left_only'), allPeptide only ('right_only') or both.
df3 = evidence.merge(allPeptide, on=attrs, how='outer', indicator='Exist')
#df3.head(15)

In [None]:
# How many rows matched in both tables versus only one of them.
df3.Exist.value_counts()

In [None]:
# Display a preview of the merged table.
df3

In [None]:
### Apply the model to all allPeptide txt records except charge 1 idents
# Keep rows with a positive intensity and a charge other than 1.
# .copy() makes the filtered table independent of the original, so the column
# assignments below do not write into a slice (SettingWithCopyWarning).
allPeptide = allPeptide.loc[(allPeptide['Intensity'] > 0) &
                            (allPeptide['Charge'] != 1)].copy()



allPeptide['study'] = 'query'
allPeptide['Species'] = 'Human'

# Columns fed to the encoder, in this order.
attr_names = ['Charge','Mass', 'm/z', 'Retention time','Retention length',
              'Ion mobility index', 'Ion mobility length','Number of isotopic peaks']

In [None]:
# Stack the query rows on top of the reference rows with a fresh 0..n-1 index;
# columns present in only one table are kept.
data = pd.concat([allPeptide, evidence_train], sort=False, ignore_index=True)
data

In [None]:
# Build the float32 feature matrix directly as a NumPy array instead of
# round-tripping through a torch tensor before scaling.
x = data[attr_names].to_numpy(dtype=np.float32)
# NOTE(review): the scaler is fitted on query + reference rows together --
# confirm this matches how features were scaled for training.
x = StandardScaler().fit_transform(x)
print(x.shape)
# Inference only: without no_grad() autograd would keep the graph for every
# row of the table, which costs memory and is never used.
with torch.no_grad():
    z = model(torch.from_numpy(x).float().to(device))
protonet_embedding_z = z.cpu().detach().numpy()

# append all meta-data
latent_dim = 10
latent_space = pd.DataFrame(protonet_embedding_z, columns = ['dim_'+str(i) for i in range(latent_dim)])


In [None]:
# Show every column when a DataFrame is displayed (global pandas option).
pd.set_option('display.max_columns', None)

In [None]:
# Put the latent coordinates next to the metadata of the combined table
# (column-wise, matched on the row index).
ref_latent_space = pd.concat([latent_space, data], axis = 1)
ref_latent_space.head(15)

In [None]:
# Display a preview of the combined embedding + metadata table.
ref_latent_space

In [None]:
# ref_latent_space.to_csv('peptideprotonet_embedding_space_90Kto20KsplitTrain_epoch300_featuresScaled_tcells_cancer_query_icml2021.csv')

In [None]:
# Split off the reference rows (everything whose study is not 'query') and
# pull out their embedding (first 10 columns), precursor labels and run names.
not_query = ~ref_latent_space.study.isin(['query'])
ref_data = ref_latent_space[not_query]
ref_embedding = ref_data.iloc[:, :10]
ref_labels = ref_data['PrecursorID']
ref_run = ref_data['Raw file']

In [None]:
# Query rows of the combined table, outer-joined with the evidence table on `attrs`.
# 'Exist' records whether a row came from evidence only ('left_only'),
# query_data only ('right_only') or both.
# NOTE(review): this reuses the name `df3` from the earlier evidence/allPeptide merge.
query_data = ref_latent_space[ref_latent_space.study.isin(['query'])]
df3 = pd.merge(evidence, query_data, on=attrs , how='outer', indicator='Exist')

In [None]:
# How many rows matched in both tables versus only one of them.
df3.Exist.value_counts()

In [None]:
# df3

In [None]:
# Rows found in both evidence and the query embedding: take their latent
# coordinates, build "<Modified sequence>_<Charge>" labels, and keep the run name.
matched = df3.Exist == 'both'

ident_query_embedding = df3.loc[matched, 'dim_0':'dim_9']

sequence_part = df3.loc[matched, 'Modified sequence_x'].astype(str)
charge_part = df3.Charge[matched].astype(str)
ident_query_labels = sequence_part.str.cat(charge_part, sep='_')

ident_query_run = df3.loc[matched, 'Raw file']

In [None]:
# Convert the labels to a plain list. list() also accepts a list, so running
# this cell a second time is harmless (calling .tolist() on a list would raise).
ident_query_labels = list(ident_query_labels)

In [None]:
# Convert the labels to a plain list. list() also accepts a list, so running
# this cell a second time is harmless (calling .tolist() on a list would raise).
ref_labels = list(ref_labels)

In [None]:
# Print the sizes of the query and reference embeddings and of their label lists.
# these two need to be converted to numpy arrays using .to_numpy()
print(ident_query_embedding.shape)
print(len(ident_query_labels))
print(ref_embedding.shape)
print(len(ref_labels))

In [None]:
# Stack reference rows first, then identified query rows; `labels` follows the same order.
embeddings = pd.concat([ref_embedding, ident_query_embedding]).to_numpy() # numpy array
labels = ref_labels + ident_query_labels # list

In [None]:
# Print the embedding matrix shape next to the number of labels.
print(embeddings.shape, len(labels))

In [None]:
# Stack reference and identified-query rows (embeddings and run names) and
# renumber both 0..n-1 so they line up with the label list.
pdata = pd.concat([ref_embedding, ident_query_embedding]).reset_index(drop=True)
prun = pd.concat([ref_run, ident_query_run]).reset_index(drop=True)

# Preview: embedding columns, then the label, then the run name.
label_frame = pd.DataFrame(labels, columns=['labels'])
pd.concat([pdata, label_frame, prun], axis=1)

In [None]:
# Write embeddings, labels and run names side by side to a CSV in the working directory.
pd.concat([pdata,
          pd.DataFrame(labels, columns = ['labels']),
          prun], axis = 1).to_csv('reference_embeddings_and_metadata_icml2021.csv')

In [None]:
# Latent coordinates of the rows the merge flagged 'right_only', i.e. query
# rows with no matching evidence row.
query_embeddings = df3.loc[df3.Exist == 'right_only','dim_0':'dim_9']
query_embeddings

In [None]:
# Metadata columns for the same 'right_only' rows.
query_metadata = df3.loc[df3.Exist == 'right_only', ['Raw file','Charge','Intensity', 'Isotope correlation']]
query_metadata

In [None]:
# Write the unmatched query embeddings and their metadata side by side to a CSV in the working directory.
pd.concat([query_embeddings, query_metadata], axis = 1).to_csv('query_embeddings_and_metadata_icml2021.csv')

In [None]:
# query_embeddings = query_embeddings.to_numpy()