In [None]:
import h5py
import glob
import os
import sys
from pathlib import Path
import torch
import pandas as pd
import numpy as np
import logging
from deeprankcore.trainer import Trainer
from deeprankcore.dataset import GraphDataset, save_hdf5_keys
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

## General metafeatures exploration
Exploration of the general metafeatures (applicable to both CNNs and GNNs): allele, peptide length, cluster, and binary target.

In [None]:
# Load the CSV metadata and the list of PDB models used for generating the HDF5 files,
# derive the metafeature columns, and keep only the CSV rows that have a PDB model.

# --- configuration ---
run_day = '11122022'
project_folder = '/projects/0/einf2380/'
csv_file_name = 'BA_pMHCI_human_quantitative_gibbs_clust.csv'
models_folder_name = 'exp_nmers_all_HLA_quantitative'
data = 'pMHCI'
resolution = 'residue' # either 'residue' or 'atomic'

# --- csv ---
csv_file_path = f'{project_folder}data/external/processed/I/{csv_file_name}'
output_folder = f'{project_folder}data/{data}/features_output_folder/GNN/{resolution}/{run_day}'
csv_data = pd.read_csv(csv_file_path)
# filling nans: the <9mers have not been assigned a cluster due to the nature of the gibbs clustering
csv_data['cluster'] = csv_data['cluster'].fillna(-1)
# binary target column: 1 when measurement_value <= 500, 0 otherwise
# (vectorized comparison instead of a row-wise lambda; missing values compare False -> 0)
csv_data['binary'] = (csv_data['measurement_value'].astype(float) <= 500).astype(int)
# peptide length column (vectorized; a missing peptide yields NaN instead of raising)
csv_data['peptide_length'] = csv_data['peptide'].str.len()
# allele type column: the single word character following 'HLA-' and preceding '*'
csv_data['allele_type'] = csv_data['allele'].str.extract(r'HLA-(\w)\*.+', expand=False)

# --- pdbs ---
models_folder_path = f'{project_folder}data/{data}/features_input_folder/{models_folder_name}'
pdb_files = glob.glob(os.path.join(models_folder_path, 'pdb', '*.pdb'))
# the ID is the file name up to its first dot; os.path.basename avoids relying on '/' as separator
pdb_ids_csv = [os.path.basename(pdb_file).split('.')[0] for pdb_file in pdb_files]

# filter on used pdbs
csv_data_hdf5 = csv_data[csv_data['ID'].isin(pdb_ids_csv)]

In [None]:
# Report how many CSV rows have a matching PDB model and how many do not.
# The file name in the first message comes from `csv_file_name`, so it always
# matches the CSV that was actually loaded.
n_csv_rows = len(csv_data)
n_hdf5_rows = len(csv_data_hdf5)
print(f'Original {csv_file_name} shape: {n_csv_rows}')
print(f'CSV shape after filtering with PDBs IDs used for generating HDF5 files: {n_hdf5_rows}')
print(f'{n_csv_rows - n_hdf5_rows} PDBs are missing from the pdb/ folder:')
# why? break the rows without a PDB model down by allele type and peptide length
csv_data_not_hdf5 = csv_data[~csv_data['ID'].isin(csv_data_hdf5['ID'])]
(csv_data_not_hdf5
    .groupby(['allele_type', 'peptide_length'])['ID']
    .count()
    .reset_index(name="count"))

In [None]:
# Summarise the metadata columns of the CSV rows that have a PDB model.
inequality_counts = csv_data_hdf5['measurement_inequality'].value_counts()
measurement_types = csv_data_hdf5['measurement_type'].unique()
measurement_kinds = csv_data_hdf5['measurement_kind'].unique()
sorted_clusters = np.sort(csv_data_hdf5['cluster'].unique())
allele_type_counts = csv_data_hdf5['allele_type'].value_counts()
sorted_peptide_lengths = np.sort(csv_data_hdf5['peptide_length'].unique())

print(f'Measurement inequalities:\n{inequality_counts}')
print(f'Measurement type: {measurement_types}')
print(f'Measurement kind: {measurement_kinds}')
print(f'Clusters: {sorted_clusters}')
print(f'Alleles types:\n{allele_type_counts}')
print(f'Peptides lengths: {sorted_peptide_lengths}')

# Display the entries whose peptide is longer than 15 residues.
is_long_peptide = csv_data_hdf5['peptide_length'] > 15
csv_data_hdf5.loc[is_long_peptide, ['ID', 'allele', 'peptide', 'peptide_length']]

In [None]:
# Distribution of allele types, peptide lengths and binary target.
# Only entries with peptide_length < 16 are counted; the 'ID' column holds the count
# of entries for each (peptide_length, allele_type, binary) combination.
data_grouped = (
    csv_data_hdf5.loc[csv_data_hdf5['peptide_length'] < 16]
    .groupby(['peptide_length', 'allele_type', 'binary'])['ID']
    .count()
    .reset_index()
)

# One stacked bar trace per peptide length (groupby iterates the keys in sorted order);
# the x axis is a two-level category: allele type, then binary target.
fig = go.Figure(
    data=[
        go.Bar(
            x=[length_df['allele_type'], length_df['binary']],
            y=length_df['ID'],
            name=str(length),
        )
        for length, length_df in data_grouped.groupby('peptide_length')
    ]
)

# update_layout returns the figure, so as the last expression it also renders the plot
fig.update_layout(
    xaxis_title_text="Binary target per allele",
    yaxis_title_text="Count",
    barmode="stack",
    legend_title="Peptide length",
    legend_traceorder="normal",
    title="Data grouped by allele type, binary target value, and peptide length",
)

#fig.show()
#fig.write_image("allele_target_pep_length.png")

In [None]:
# Distribution of allele types, clusters and binary target.
# Only entries with peptide_length < 16 are counted; the 'ID' column holds the count
# of entries for each (cluster, allele_type, binary) combination.
data_grouped = (
    csv_data_hdf5.loc[csv_data_hdf5['peptide_length'] < 16]
    .groupby(['cluster', 'allele_type', 'binary'])['ID']
    .count()
    .reset_index()
)

# One stacked bar trace per cluster (groupby iterates the keys in sorted order);
# the x axis is a two-level category: allele type, then binary target.
fig = go.Figure(
    data=[
        go.Bar(
            x=[cluster_df['allele_type'], cluster_df['binary']],
            y=cluster_df['ID'],
            name=str(cluster_id),
        )
        for cluster_id, cluster_df in data_grouped.groupby('cluster')
    ]
)

# update_layout returns the figure, so as the last expression it also renders the plot
fig.update_layout(
    xaxis_title_text="Binary target per allele",
    yaxis_title_text="Count",
    barmode="stack",
    legend_title="Cluster",
    legend_traceorder="normal",
    title="Data grouped by allele type, binary target value, and cluster number",
)

#fig.show()
#fig.write_image("allele_target_cluster.png")