In [None]:
import glob
import os
import pandas as pd
import numpy as np
import plotly.graph_objects as go

## General metafeatures exploration
Exploration of the general metafeatures, shared by both the CNN and the GNN data: allele, peptide length, peptide and allele clustering, and the binary target.

In [None]:
# Configuration: names and locations of the csv files (and pdb models) that
# were used for generating the hdf5 files.
project_folder = '/projects/0/einf2380/'
csv_file_name = 'BA_pMHCI_human_quantitative_only_eq.csv'
csv_file_cl_peptide = 'BA_pMHCI_human_quantitative_all_hla_gibbs_clusters.csv'
csv_file_cl_allele = 'BA_pMHCI_human_quantitative_only_eq_alleleclusters_pseudoseq.csv'
models_folder_name = 'HLA_quantitative'
data = 'pMHCI'
resolution = 'residue' # either 'residue' or 'atomic'

# csv: the main table sits in the processed folder, the two clustering tables
# in its clusters/ subfolder
processed_folder = os.path.join(project_folder, 'data/external/processed/I')
clusters_folder = os.path.join(processed_folder, 'clusters')
csv_file_path = os.path.join(processed_folder, csv_file_name)
csv_file_cl_peptide_path = os.path.join(clusters_folder, csv_file_cl_peptide)
csv_file_cl_allele_path = os.path.join(clusters_folder, csv_file_cl_allele)

csv_data = pd.read_csv(csv_file_path)
csv_cl_peptide = pd.read_csv(csv_file_cl_peptide_path)
csv_cl_allele = pd.read_csv(csv_file_cl_allele_path)

# assigning the correct clusters (20/03/2023): left-join the peptide clusters
# on ID and expose them under the name cl_peptide
peptide_clusters = csv_cl_peptide[['ID', 'cluster_set_10']]
csv_data = csv_data.merge(peptide_clusters, how='left', on='ID')
csv_data = csv_data.rename(columns={'cluster_set_10': 'cl_peptide'})
# filling nans: the <9mers have not been assigned a cluster due to the nature
# of the gibbs clustering, so they get the placeholder cluster -1
csv_data['cl_peptide'] = csv_data['cl_peptide'].fillna(-1)

# same left-join for the allele clustering, exposed as cl_allele
allele_clusters = csv_cl_allele[['ID', 'allele_clustering']]
csv_data = csv_data.merge(allele_clusters, how='left', on='ID')
csv_data = csv_data.rename(columns={'allele_clustering': 'cl_allele'})

In [None]:
# Derive the metafeature columns explored in the rest of the notebook and keep
# only the data points for which a PDB model file exists.

# creating binary target column: 1 when the measurement value is at or below
# the threshold, 0 otherwise (missing values compare False and become 0)
# NOTE(review): 500 is presumably the usual nM binder cut-off - confirm the
# units of measurement_value.
binding_threshold = 500
csv_data['binary'] = csv_data.measurement_value.astype(float).le(binding_threshold).astype(int)
# creating peptide length column (vectorised; a missing peptide yields NaN
# instead of raising)
csv_data['peptide_length'] = csv_data.peptide.str.len()
# creating allele type column: the single character between 'HLA-' and '*';
# expand=False returns a Series rather than a one-column DataFrame
csv_data['allele_type'] = csv_data.allele.str.extract(r'HLA-(\w)\*.+', expand=False)
# pdbs
models_folder_path = f'{project_folder}data/{data}/features_input_folder/{models_folder_name}'
pdb_files = glob.glob(os.path.join(models_folder_path, 'pdb', '*.pdb'))
# the ID of a model is its file name up to the first dot
pdb_ids_csv = [os.path.basename(pdb_file).split('.')[0] for pdb_file in pdb_files]
# filter on used pdbs
csv_data_hdf5 = csv_data[csv_data.ID.isin(pdb_ids_csv)]

In [None]:
# Row counts before and after restricting the csv to the IDs that have a PDB file
n_rows_csv = len(csv_data)
n_rows_hdf5 = len(csv_data_hdf5)
n_rows_missing = n_rows_csv - n_rows_hdf5
print(f'Original BA_pMHCI_human_quantitative.csv shape: {n_rows_csv}')
print(f'CSV shape after filtering with PDBs IDs used for generating HDF5 files: {n_rows_hdf5}')
print(f'{n_rows_missing} PDBs are missing from the pdb/ folder:')
# why? Break the entries without a PDB file down by allele type and peptide length
has_pdb = csv_data.ID.isin(csv_data_hdf5.ID)
csv_data_not_hdf5 = csv_data[~has_pdb]
missing_per_group = csv_data_not_hdf5.groupby(['allele_type', 'peptide_length'])['ID'].count()
# last expression of the cell: displayed as the cell output
missing_per_group.reset_index(name="count")

In [None]:
# Overview of the metadata of the data points that have a PDB file
inequality_counts = csv_data_hdf5.measurement_inequality.value_counts()
print(f'Measurement inequalities:\n{inequality_counts}')
measurement_types = csv_data_hdf5.measurement_type.unique()
print(f'Measurement type: {measurement_types}')
measurement_kinds = csv_data_hdf5.measurement_kind.unique()
print(f'Measurement kind: {measurement_kinds}')
peptide_clusters_sorted = np.sort(csv_data_hdf5.cl_peptide.unique())
print(f'Clusters: {peptide_clusters_sorted}')
allele_type_counts = csv_data_hdf5.allele_type.value_counts()
print(f'Alleles types:\n{allele_type_counts}')
peptide_lengths_sorted = np.sort(csv_data_hdf5.peptide_length.unique())
print(f'Peptides lengths: {peptide_lengths_sorted}')
# Display the entries whose peptide is longer than 15 residues
is_long_peptide = csv_data_hdf5.peptide_length > 15
csv_data_hdf5.loc[is_long_peptide, ['ID', 'allele', 'peptide', 'peptide_length']]

In [None]:
# Distribution of allele types, peptide lengths and binary target:
# number of IDs for every (peptide length, allele type, binary target) combination
group_columns = ['peptide_length', 'allele_type', 'binary']
data_grouped = csv_data_hdf5.groupby(group_columns)['ID'].count().reset_index()
fig = go.Figure()

# One bar trace per peptide length, in ascending order; the x axis is the
# two-level category (allele type, binary target) and the traces are stacked
peptide_lengths = np.sort(data_grouped.peptide_length.unique())
for length in peptide_lengths:
    plot_df = data_grouped.loc[data_grouped.peptide_length == length]
    bars = go.Bar(x=[plot_df.allele_type, plot_df.binary], y=plot_df['ID'], name=str(length))
    fig.add_trace(bars)

fig.update_layout(
    title="Data grouped by allele type, binary target value, and peptide length",
    xaxis={"title_text": "Binary target per allele"},
    yaxis={"title_text": "Count"},
    barmode="stack",
    legend_title="Peptide length",
    legend_traceorder="normal",
)

# Render inline first, then save a static copy in the working directory
fig.show()
fig.write_image("peptide_length_allele_type_binary.png")

In [None]:
# Distribution of allele types, peptide clusters and binary target:
# number of IDs for every (peptide cluster, allele type, binary target) combination
group_columns = ['cl_peptide', 'allele_type', 'binary']
data_grouped = csv_data_hdf5.groupby(group_columns)['ID'].count().reset_index()

fig = go.Figure()

# One bar trace per peptide cluster, in ascending order; the x axis is the
# two-level category (allele type, binary target) and the traces are stacked
peptide_clusters = np.sort(data_grouped.cl_peptide.unique())
for cluster in peptide_clusters:
    plot_df = data_grouped.loc[data_grouped.cl_peptide == cluster]
    bars = go.Bar(x=[plot_df.allele_type, plot_df.binary], y=plot_df['ID'], name=str(cluster))
    fig.add_trace(bars)

fig.update_layout(
    title="Data grouped by allele type, binary target value, and peptides' clusters' number",
    xaxis={"title_text": "Binary target per allele"},
    yaxis={"title_text": "Count"},
    barmode="stack",
    legend_title="Peptides' clusters",
    legend_traceorder="normal",
)

# Render inline first, then save a static copy in the working directory
fig.show()
fig.write_image("cl_peptide_allele_type_binary.png")

In [None]:
# Distribution of allele types, allele clustering and binary target:
# number of IDs for every (allele cluster, allele type, binary target) combination
group_columns = ['cl_allele', 'allele_type', 'binary']
data_grouped = csv_data_hdf5.groupby(group_columns)['ID'].count().reset_index()

fig = go.Figure()

# Allele cluster 0 is drawn as the 'Training' trace and cluster 1 as the
# 'Testing' trace, added in that order
trace_name_per_cluster = {0: 'Training', 1: 'Testing'}
for cluster, trace_name in trace_name_per_cluster.items():
    plot_df = data_grouped.loc[data_grouped.cl_allele == cluster]
    bars = go.Bar(x=[plot_df.allele_type, plot_df.binary], y=plot_df['ID'], name=trace_name)
    fig.add_trace(bars)

fig.update_layout(
    title="Data grouped by allele type, binary target value, and allele_clustering",
    xaxis={"title_text": "Binary target per allele"},
    yaxis={"title_text": "Count"},
    barmode="stack",
    legend_title="Alleles' clusters",
    legend_traceorder="normal",
)

# Render inline first, then save a static copy in the working directory
fig.show()
fig.write_image("cl_allele_type_binary.png")