In [1]:
import h5py
import glob
import os
import sys
from pathlib import Path
import torch
import pandas as pd
import numpy as np
import logging
from deeprankcore.trainer import Trainer
from deeprankcore.dataset import GraphDataset, save_hdf5_keys
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

  from .autonotebook import tqdm as notebook_tqdm


## General metafeatures exploration
Exploration of the general metafeatures (relevant to both CNNs and GNNs): allele, peptide length, cluster, and binary target.

In [2]:
# Configuration: location of the CSV/PDB files used for generating the hdf5 files
project_folder = '/projects/0/einf2380/'
csv_file_name = 'BA_pMHCI_human_quantitative_only_eq.csv'
csv_file_clusters = '/projects/0/einf2380/data/external/processed/I/clusters/BA_pMHCI_human_quantitative_all_hla_gibbs_clusters.csv'
models_folder_name = 'exp_nmers_all_HLA_quantitative'
data = 'pMHCI'
resolution = 'residue' # either 'residue' or 'atomic'

# Clusters table: its 'cluster_set_10' column holds the up-to-date cluster assignment (20/03/2023)
csv_clusters = pd.read_csv(csv_file_clusters)

# Main table: the 'cluster' column shipped with the CSV is outdated, so it is
# discarded and replaced by 'cluster_set_10', matched on 'ID'. The left join
# keeps every entry of the main table, including those without a cluster.
csv_file_path = f'{project_folder}data/external/processed/I/{csv_file_name}'
csv_data = (
    pd.read_csv(csv_file_path)
    .drop(columns=['cluster'])
    .merge(csv_clusters[['ID', 'cluster_set_10']], how='left', on='ID')
    .rename(columns={'cluster_set_10': 'cluster'})
)

In [3]:

# Derive the metafeature columns and keep only the entries for which a PDB
# file exists in the models folder.

# filling nans: the <9mers have not been assigned a cluster due to the nature of the gibbs clustering
csv_data['cluster'] = csv_data['cluster'].fillna(-1)
# binary target column: 1 when measurement_value <= 500, 0 otherwise
# (threshold presumably expressed in nM - TODO confirm); NaN values map to 0
csv_data['binary'] = csv_data.measurement_value.astype(float).le(500).astype(int)
# peptide length column (vectorized string length)
csv_data['peptide_length'] = csv_data.peptide.str.len()
# allele type column: the single character following 'HLA-' (e.g. 'A' for 'HLA-A*01:01');
# alleles that do not match the pattern get NaN. expand=False returns a Series.
csv_data['allele_type'] = csv_data.allele.str.extract(r'HLA-(\w)\*.+', expand=False)
# pdbs
models_folder_path = f'{project_folder}data/{data}/features_input_folder/{models_folder_name}'
pdb_files = glob.glob(os.path.join(models_folder_path, 'pdb', '*.pdb'))
# the ID is the file name up to its first dot; basename keeps this independent
# of the path separator used by the platform
pdb_ids_csv = [os.path.basename(pdb_file).split('.')[0] for pdb_file in pdb_files]
# filter on used pdbs
csv_data_hdf5 = csv_data[csv_data.ID.isin(pdb_ids_csv)]

In [4]:
# Report how many CSV entries have no matching PDB file and break them down
# by allele type and peptide length to help investigating why they are missing.
print(f'Original BA_pMHCI_human_quantitative.csv shape: {csv_data.shape[0]}')
print(f'CSV shape after filtering with PDBs IDs used for generating HDF5 files: {csv_data_hdf5.shape[0]}')
print(f'{csv_data.shape[0] - csv_data_hdf5.shape[0]} PDBs are missing from the pdb/ folder:')
# why?
csv_data_not_hdf5 = csv_data[~csv_data.ID.isin(csv_data_hdf5.ID)]
# dropna=False keeps the rows whose grouping key is NaN (e.g. an allele_type
# that could not be extracted from the allele name). By default groupby drops
# them silently, and the per-group counts then do not add up to the number of
# missing PDBs printed above.
(csv_data_not_hdf5
    .groupby(['allele_type', 'peptide_length'], dropna=False)['ID']
    .count()
    .reset_index(name="count"))

Original BA_pMHCI_human_quantitative.csv shape: 100315
CSV shape after filtering with PDBs IDs used for generating HDF5 files: 100178
137 PDBs are missing from the pdb/ folder:


Unnamed: 0,allele_type,peptide_length,count
0,A,7,1
1,A,8,1
2,A,9,25
3,A,10,21
4,A,12,1
5,B,7,4
6,B,8,1
7,B,9,52
8,B,10,23
9,B,11,2


In [5]:
# Summary of the metafeatures of the entries for which a PDB file exists
inequality_counts = csv_data_hdf5.measurement_inequality.value_counts()
measurement_types = csv_data_hdf5.measurement_type.unique()
measurement_kinds = csv_data_hdf5.measurement_kind.unique()
sorted_clusters = np.sort(csv_data_hdf5.cluster.unique())
allele_type_counts = csv_data_hdf5.allele_type.value_counts()
sorted_lengths = np.sort(csv_data_hdf5.peptide_length.unique())

print(f'Measurement inequalities:\n{inequality_counts}')
print(f'Measurement type: {measurement_types}')
print(f'Measurement kind: {measurement_kinds}')
print(f'Clusters: {sorted_clusters}')
print(f'Alleles types:\n{allele_type_counts}')
print(f'Peptides lengths: {sorted_lengths}')

# Display the entries whose peptide is longer than 15
longer_than_15 = csv_data_hdf5.peptide_length > 15
csv_data_hdf5.loc[longer_than_15, ['ID', 'allele', 'peptide', 'peptide_length']]

Measurement inequalities:
=    100178
Name: measurement_inequality, dtype: int64
Measurement type: ['quantitative']
Measurement kind: ['affinity']
Clusters: [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]
Alleles types:
A    72521
B    26401
C     1219
E       37
Name: allele_type, dtype: int64
Peptides lengths: [ 8  9 10 11 12 13 14 15 17 18 20 21]


Unnamed: 0,ID,allele,peptide,peptide_length
1541,BA-49485,HLA-A*01:01,PLMGGAYIAFPTSCHMFI,18
1598,BA-49759,HLA-A*01:01,QFLKFSLPFPFLYKFLL,17
5248,BA-64367,HLA-A*02:01,FSWGAEGQPRPGFGYGGRASD,21
5544,BA-65081,HLA-A*02:01,GFGYGGRASDYKSAHKGFKG,20
5650,BA-65401,HLA-A*02:01,GKGRGLSLSRFSWGAEGQPR,20
...,...,...,...,...
97094,BA-503344,HLA-B*57:01,PLMGGAYIAFPTSCHMFI,18
97114,BA-503656,HLA-B*57:01,QFLKFSLPFPFLYKFLL,17
98344,BA-522339,HLA-B*58:01,PLMGGAYIAFPTSCHMFI,18
98363,BA-522461,HLA-B*58:01,QFLKFSLPFPFLYKFLL,17


In [6]:
# Keep only the entries whose peptide length is at most 15, printing the
# number of entries before and after the filter.
print(f'All peptides length CSV shape: {len(csv_data_hdf5)}')
at_most_15 = csv_data_hdf5.peptide_length <= 15
csv_data_hdf5 = csv_data_hdf5.loc[at_most_15]
print(f'CSV shape after filtering with peptides <= 15: {len(csv_data_hdf5)}')

All peptides length CSV shape: 100178
CSV shape after filtering with peptides <= 15: 100090


In [9]:
# Distribution of allele types, peptide lengths and binary target:
# number of entries (non-null IDs) per (peptide_length, allele_type, binary) combination
shorter_than_16 = csv_data_hdf5.peptide_length < 16
group_counts = (
    csv_data_hdf5[shorter_than_16]
    .groupby(['peptide_length', 'allele_type', 'binary'])
    .count()
    .reset_index()
)
data_grouped = group_counts[['peptide_length', 'allele_type', 'binary', 'ID']]

fig = go.Figure()

# One bar trace per peptide length; x is a two-level category axis
# (allele type, binary target), and the traces are stacked on top of each other.
for pep_len in np.sort(data_grouped.peptide_length.unique()):
    length_counts = data_grouped[data_grouped.peptide_length == pep_len]
    bar = go.Bar(
        x=[length_counts.allele_type, length_counts.binary],
        y=length_counts['ID'],
        name=str(pep_len),
    )
    fig.add_trace(bar)

# update_layout returns the figure, so as the last expression it also renders it
fig.update_layout(
    title="Data grouped by allele type, binary target value, and peptide length",
    xaxis={"title_text": "Binary target per allele"},
    yaxis={"title_text": "Count"},
    barmode="stack",
    legend_title="Peptide length",
    legend_traceorder="normal",
)

# fig.show()
# fig.write_image("allele_target_pep_length.png")

In [8]:
# Distribution of allele types, clusters and binary target:
# number of entries (non-null IDs) per (cluster, allele_type, binary) combination
shorter_than_16 = csv_data_hdf5.peptide_length < 16
group_counts = (
    csv_data_hdf5[shorter_than_16]
    .groupby(['cluster', 'allele_type', 'binary'])
    .count()
    .reset_index()
)
data_grouped = group_counts[['cluster', 'allele_type', 'binary', 'ID']]

fig = go.Figure()

# One bar trace per cluster; x is a two-level category axis
# (allele type, binary target), and the traces are stacked on top of each other.
for cluster_id in np.sort(data_grouped.cluster.unique()):
    cluster_counts = data_grouped[data_grouped.cluster == cluster_id]
    bar = go.Bar(
        x=[cluster_counts.allele_type, cluster_counts.binary],
        y=cluster_counts['ID'],
        name=str(cluster_id),
    )
    fig.add_trace(bar)

# update_layout returns the figure, so as the last expression it also renders it
fig.update_layout(
    title="Data grouped by allele type, binary target value, and cluster number",
    xaxis={"title_text": "Binary target per allele"},
    yaxis={"title_text": "Count"},
    barmode="stack",
    legend_title="Cluster",
    legend_traceorder="normal",
)

# fig.show()
# fig.write_image("allele_target_cluster.png")