In [1]:
import glob
import os
import pandas as pd
import numpy as np
import plotly.graph_objects as go

## General metafeatures exploration
General (both for CNNs and GNNs), in terms of allele, peptide length, peptides' and alleles' clustering, binary target.

In [2]:
# load csv and pdb files used for generating hdf5 files
project_folder = '/projects/0/einf2380/'
csv_file_name = 'BA_pMHCI_human_quantitative_only_eq.csv'
csv_file_cl_peptide1 = 'BA_pMHCI_human_quantitative_all_hla_gibbs_clusters.csv'
csv_file_cl_peptide2 = 'BA_pMHCI_human_quantitative_clustered_peptides_marieke_fixed.csv'
csv_file_cl_allele = 'BA_pMHCI_human_quantitative_only_eq_alleleclusters_pseudoseq.csv'
models_folder_name = 'HLA_quantitative'
data = 'pMHCI'
resolution = 'residue' # either 'residue' or 'atomic'
# csv
csv_file_path = f'{project_folder}data/external/processed/I/{csv_file_name}'
csv_file_cl_peptide1_path = f'{project_folder}data/external/processed/I/clusters/{csv_file_cl_peptide1}'
csv_file_cl_peptide2_path = f'{project_folder}data/external/processed/I/clusters/{csv_file_cl_peptide2}'
csv_file_cl_allele_path = f'{project_folder}data/external/processed/I/clusters/{csv_file_cl_allele}'
csv_data = pd.read_csv(csv_file_path)
csv_cl_peptide1 = pd.read_csv(csv_file_cl_peptide1_path)
csv_cl_peptide2 = pd.read_csv(csv_file_cl_peptide2_path)
csv_cl_allele = pd.read_csv(csv_file_cl_allele_path)
# assigning clusters
csv_data = csv_data.merge(csv_cl_peptide1[['ID', 'cluster_set_10']], how='left', on='ID').rename(columns={'cluster_set_10': 'cl_peptide1'})
# filling nans: the <9mers have not been assigned a cluster due to the nature of the gibbs clustering
csv_data.cl_peptide1 = csv_data.cl_peptide1.fillna(-1)
csv_data = csv_data.merge(csv_cl_peptide2[['ID', 'Marieke_cluster']], how='left', on='ID').rename(columns={'Marieke_cluster': 'cl_peptide2'})
csv_data = csv_data.merge(csv_cl_allele[['ID', 'allele_clustering']], how='left', on='ID').rename(columns={'allele_clustering': 'cl_allele'})
# tests
cl_test_peptide1 = 3
cl_test_peptide2 = 4
cl_test_allele = 1

In [3]:
# creating binary target column
csv_data['binary'] = csv_data.measurement_value.apply(lambda x: int(float(x) <= 500))
# creating peptide length column
csv_data['peptide_length'] = csv_data.peptide.apply(lambda x: len(x))
# creating allele type column
csv_data['allele_type'] = csv_data.allele.str.extract(r'HLA-(\w)\*.+')
# pdbs
models_folder_path = f'{project_folder}data/{data}/features_input_folder/{models_folder_name}'
pdb_files = glob.glob(os.path.join(models_folder_path + '/pdb', '*.pdb'))
pdb_ids_csv = [pdb_file.split('/')[-1].split('.')[0] for pdb_file in pdb_files]
# filter on used pdbs
csv_data_hdf5 = csv_data[csv_data.ID.isin(pdb_ids_csv)]

In [4]:
print(f'Original BA_pMHCI_human_quantitative.csv shape: {csv_data.shape[0]}')
print(f'CSV shape after filtering with PDBs IDs used for generating HDF5 files: {csv_data_hdf5.shape[0]}')
print(f'{csv_data.shape[0] - csv_data_hdf5.shape[0]} PDBs are missing from the pdb/ folder:')
# why?
csv_data_not_hdf5 = csv_data[~csv_data.ID.isin(csv_data_hdf5.ID)]
(csv_data_not_hdf5
    .groupby(['allele_type', 'peptide_length'])['ID']
    .count()
    .reset_index(name="count"))

Original BA_pMHCI_human_quantitative.csv shape: 100270
CSV shape after filtering with PDBs IDs used for generating HDF5 files: 100178
92 PDBs are missing from the pdb/ folder:


Unnamed: 0,allele_type,peptide_length,count
0,A,7,1
1,A,9,2
2,A,10,1
3,B,7,4
4,B,8,1
5,B,9,52
6,B,10,23
7,B,11,2


In [5]:
print(f'Measurement inequalities:\n{csv_data_hdf5.measurement_inequality.value_counts()}')
print(f'Measurement type: {csv_data_hdf5.measurement_type.unique()}')
print(f'Measurement kind: {csv_data_hdf5.measurement_kind.unique()}')
print(f'Gibbs Clusters: {np.sort(csv_data_hdf5.cl_peptide1.unique())}')
print(f'Marieke Clusters: {np.sort(csv_data_hdf5.cl_peptide2.unique())}')
print(f'Alleles types:\n{csv_data_hdf5.allele_type.value_counts()}')
print(f'Peptides lengths: {np.sort(csv_data_hdf5.peptide_length.unique())}')
csv_data_hdf5[csv_data_hdf5.peptide_length > 15][['ID', 'allele', 'peptide', 'peptide_length']]

Measurement inequalities:
=    100178
Name: measurement_inequality, dtype: int64
Measurement type: ['quantitative']
Measurement kind: ['affinity']
Gibbs Clusters: [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]
Marieke Clusters: [0 1 2 3 4]
Alleles types:
A    72521
B    26401
C     1219
E       37
Name: allele_type, dtype: int64
Peptides lengths: [ 8  9 10 11 12 13 14 15 17 18 20 21]


Unnamed: 0,ID,allele,peptide,peptide_length
1541,BA-49485,HLA-A*01:01,PLMGGAYIAFPTSCHMFI,18
1598,BA-49759,HLA-A*01:01,QFLKFSLPFPFLYKFLL,17
5248,BA-64367,HLA-A*02:01,FSWGAEGQPRPGFGYGGRASD,21
5544,BA-65081,HLA-A*02:01,GFGYGGRASDYKSAHKGFKG,20
5650,BA-65401,HLA-A*02:01,GKGRGLSLSRFSWGAEGQPR,20
...,...,...,...,...
97049,BA-503344,HLA-B*57:01,PLMGGAYIAFPTSCHMFI,18
97069,BA-503656,HLA-B*57:01,QFLKFSLPFPFLYKFLL,17
98299,BA-522339,HLA-B*58:01,PLMGGAYIAFPTSCHMFI,18
98318,BA-522461,HLA-B*58:01,QFLKFSLPFPFLYKFLL,17


## Plotting distribution of alleles types, peptides lengths, binary target
### General data exploration, selecting the most frequent peptides

In [7]:
peptide_grouped = csv_data_hdf5.groupby(['peptide_length']).count().reset_index()[['peptide_length', 'ID']]
peptide_to_plot = list(peptide_grouped[peptide_grouped.ID > 1000].peptide_length)
colors = ['#648FFF', '#DC267F', '#FE6100', '#FFB000']
patter_shape = ['', 'x', '+', '']

data_grouped = (csv_data_hdf5.groupby(['peptide_length', 'allele_type', 'binary'])
                    .count()
                    .reset_index()[['peptide_length', 'allele_type', 'binary', 'ID']])
data_grouped = data_grouped[(data_grouped.allele_type != 'E') & (data_grouped.peptide_length.isin(peptide_to_plot))]

fig = go.Figure()
for idx, p in enumerate(np.sort(data_grouped.peptide_length.unique())):
    plot_df = data_grouped[data_grouped.peptide_length == p]
    fig.add_trace(
        go.Bar(
            x=[plot_df.allele_type, plot_df.binary],
            y=plot_df['ID'],
            name=str(p),
            marker_color=colors[idx],
            # marker_pattern_shape=patter_shape[idx]
            )
    )

fig.update_traces(marker_line_color='rgb(0,0,0)',
                  marker_line_width=0.0)

fig.update_layout(
    plot_bgcolor='white',
    width=900,
    height=600, 
    xaxis=dict(title_text="Binary target per allele"),
    yaxis=dict(title_text="Number of pMHC complexes"),
    barmode="stack",
    legend_title = "Peptide length",
    legend_traceorder="normal",
    title = "Data grouped by allele type, binary target value, and peptide length",
)

fig.update_xaxes(
    ticks='outside',
    showline=True,
    linecolor='black',
    gridcolor='lightgrey'
)
fig.update_yaxes(
    ticks='outside',
    showline=True,
    linecolor='black',
    gridcolor='lightgrey'
)

fig.show()
fig.write_image("peptide_length_allele_type_binary.png")

## Plotting distribution of binary target, including alleles, for peptide clustering 1 (Gibbs)
### Highlighting training and testing

In [19]:
csv_data_to_group = csv_data_hdf5.copy()
csv_data_to_group['cl_peptide1'] = csv_data_to_group['cl_peptide1'].apply(lambda x: 0 if x != cl_test_peptide1 else 1)

data_grouped = (csv_data_to_group.groupby(['cl_peptide1', 'allele_type', 'binary'])
                    .count()
                    .reset_index()[['cl_peptide1', 'allele_type', 'binary', 'ID']])
data_grouped = data_grouped[data_grouped.allele_type != 'E']
colors = ['#0072B2', '#CC79A7']
patter_shape = ['x', '+']

fig = go.Figure()

plot_df = data_grouped[data_grouped.cl_peptide1 == 0]
fig.add_trace(
    go.Bar(
        x=[plot_df.allele_type, plot_df.binary],
        y=plot_df['ID'],
        name='Training',
        marker_color=colors[0],
        # marker_pattern_shape=patter_shape[0]
        ),
)

plot_df = data_grouped[data_grouped.cl_peptide1 == 1]
fig.add_trace(
    go.Bar(
        x=[plot_df.allele_type, plot_df.binary],
        y=plot_df['ID'],
        name='Testing',
        marker_color=colors[1],
        # marker_pattern_shape=patter_shape[1]
        ),
)

fig.update_layout(
    plot_bgcolor='white',
    width=800,
    height=700, 
    xaxis=dict(title_text="Binary target per allele"),
    yaxis=dict(title_text="Number of pMHC complexes"),
    barmode="stack",
    legend_traceorder="normal",
    title = "Peptide-clustered configuration 1 (Gibbs)",
)

fig.update_traces(marker_line_color='rgb(0,0,0)',
                  marker_line_width=0.0)

fig.update_xaxes(
    ticks='outside',
    showline=True,
    linecolor='black',
    gridcolor='lightgrey'
)
fig.update_yaxes(
    ticks='outside',
    showline=True,
    linecolor='black',
    gridcolor='lightgrey'
)
fig.show()
fig.write_image("cl_peptide1_allele_type_binary.png")

In [13]:
csv_train = csv_data_hdf5[~(csv_data_hdf5.cl_peptide1 == cl_test_peptide1)]
csv_test = csv_data_hdf5[csv_data_hdf5.cl_peptide1 == cl_test_peptide1]
print(f"Using cluster value {cl_test_peptide1} for the testing set.")

print(f'Training set: {len(csv_train)} samples, {round(100*len(csv_train)/len(csv_data_hdf5))}%')
print(f'\t- Class 0: {len(csv_train[csv_train.binary == 0])} samples, {round(100*len(csv_train[csv_train.binary == 0])/len(csv_train))}%')
print(f'\t- Class 1: {len(csv_train[csv_train.binary == 1])} samples, {round(100*len(csv_train[csv_train.binary == 1])/len(csv_train))}%')
print(f'Testing set: {len(csv_test)} samples, {round(100*len(csv_test)/len(csv_data_hdf5))}%')
print(f'\t- Class 0: {len(csv_test[csv_test.binary == 0])} samples, {round(100*len(csv_test[csv_test.binary == 0])/len(csv_test))}%')
print(f'\t- Class 1: {len(csv_test[csv_test.binary == 1])} samples, {round(100*len(csv_test[csv_test.binary == 1])/len(csv_test))}%')

Using cluster value 3 for the testing set.
Training set: 92556 samples, 92%
	- Class 0: 51756 samples, 56%
	- Class 1: 40800 samples, 44%
Testing set: 7622 samples, 8%
	- Class 0: 4320 samples, 57%
	- Class 1: 3302 samples, 43%


## Plotting distribution of binary target, including alleles, for peptide clustering 2 (Marieke)
### Highlighting training and testing

In [15]:
csv_data_to_group = csv_data_hdf5.copy()
csv_data_to_group['cl_peptide2'] = csv_data_to_group['cl_peptide2'].apply(lambda x: 0 if x != cl_test_peptide1 else 1)

data_grouped = (csv_data_to_group.groupby(['cl_peptide2', 'allele_type', 'binary'])
                    .count()
                    .reset_index()[['cl_peptide2', 'allele_type', 'binary', 'ID']])
data_grouped = data_grouped[data_grouped.allele_type != 'E']
colors = ['#0072B2', '#CC79A7']
patter_shape = ['x', '+']

fig = go.Figure()

plot_df = data_grouped[data_grouped.cl_peptide2 == 0]
fig.add_trace(
    go.Bar(
        x=[plot_df.allele_type, plot_df.binary],
        y=plot_df['ID'],
        name='Training',
        marker_color=colors[0],
        # marker_pattern_shape=patter_shape[0]
        ),
)

plot_df = data_grouped[data_grouped.cl_peptide2 == 1]
fig.add_trace(
    go.Bar(
        x=[plot_df.allele_type, plot_df.binary],
        y=plot_df['ID'],
        name='Testing',
        marker_color=colors[1],
        # marker_pattern_shape=patter_shape[1]
        ),
)

fig.update_layout(
    plot_bgcolor='white',
    width=800,
    height=700, 
    xaxis=dict(title_text="Binary target per allele"),
    yaxis=dict(title_text="Number of pMHC complexes"),
    barmode="stack",
    legend_traceorder="normal",
    title = "(Other) peptides' clustering",
)

fig.update_traces(marker_line_color='rgb(0,0,0)',
                  marker_line_width=0.0)

fig.update_xaxes(
    ticks='outside',
    showline=True,
    linecolor='black',
    gridcolor='lightgrey'
)
fig.update_yaxes(
    ticks='outside',
    showline=True,
    linecolor='black',
    gridcolor='lightgrey'
)
fig.show()
fig.write_image("cl_peptide2_allele_type_binary.png")

In [16]:
csv_train = csv_data_hdf5[~(csv_data_hdf5.cl_peptide2 == cl_test_peptide2)]
csv_test = csv_data_hdf5[csv_data_hdf5.cl_peptide2 == cl_test_peptide2]
print(f"Using cluster value {cl_test_peptide2} for the testing set.")

print(f'Training set: {len(csv_train)} samples, {round(100*len(csv_train)/len(csv_data_hdf5))}%')
print(f'\t- Class 0: {len(csv_train[csv_train.binary == 0])} samples, {round(100*len(csv_train[csv_train.binary == 0])/len(csv_train))}%')
print(f'\t- Class 1: {len(csv_train[csv_train.binary == 1])} samples, {round(100*len(csv_train[csv_train.binary == 1])/len(csv_train))}%')
print(f'Testing set: {len(csv_test)} samples, {round(100*len(csv_test)/len(csv_data_hdf5))}%')
print(f'\t- Class 0: {len(csv_test[csv_test.binary == 0])} samples, {round(100*len(csv_test[csv_test.binary == 0])/len(csv_test))}%')
print(f'\t- Class 1: {len(csv_test[csv_test.binary == 1])} samples, {round(100*len(csv_test[csv_test.binary == 1])/len(csv_test))}%')

Using cluster value 4 for the testing set.
Training set: 82187 samples, 82%
	- Class 0: 44304 samples, 54%
	- Class 1: 37883 samples, 46%
Testing set: 17991 samples, 18%
	- Class 0: 11772 samples, 65%
	- Class 1: 6219 samples, 35%


## Plotting distribution of binary target for allele clustering (Dario's dendrograms)
### Highlighting training and testing

In [20]:
data_grouped = (csv_data_hdf5.groupby(['cl_allele', 'allele_type', 'binary'])
                    .count()
                    .reset_index()[['cl_allele', 'allele_type', 'binary', 'ID']])
data_grouped = data_grouped[data_grouped.allele_type != 'E']
colors = ['#0072B2', '#CC79A7']
patter_shape = ['x', '+']

fig = go.Figure()

plot_df = data_grouped[data_grouped.cl_allele == 0]
fig.add_trace(
    go.Bar(
        x=[plot_df.allele_type, plot_df.binary],
        y=plot_df['ID'],
        name='Training',
        marker_color=colors[0],
        # marker_pattern_shape=patter_shape[0]
        ),
)

plot_df = data_grouped[data_grouped.cl_allele == 1]
fig.add_trace(
    go.Bar(
        x=[plot_df.allele_type, plot_df.binary],
        y=plot_df['ID'],
        name='Testing',
        marker_color=colors[1],
        # marker_pattern_shape=patter_shape[1]
        ),
)

fig.update_layout(
    plot_bgcolor='white',
    width=800,
    height=700, 
    xaxis=dict(title_text="Binary target per allele"),
    yaxis=dict(title_text="Number of pMHC complexes"),
    barmode="stack",
    legend_traceorder="normal",
    title = "Allele-clustered configuration",
)

fig.update_traces(marker_line_color='rgb(0,0,0)',
                  marker_line_width=0.0)

fig.update_xaxes(
    ticks='outside',
    showline=True,
    linecolor='black',
    gridcolor='lightgrey'
)
fig.update_yaxes(
    ticks='outside',
    showline=True,
    linecolor='black',
    gridcolor='lightgrey'
)
fig.show()
fig.write_image("cl_allele_type_binary.png")

In [21]:
csv_train = csv_data_hdf5[~(csv_data_hdf5.cl_allele == cl_test_allele)]
csv_test = csv_data_hdf5[csv_data_hdf5.cl_allele == cl_test_allele]
print(f"Using cluster value {cl_test_allele} for the testing set.")

print(f'Training set: {len(csv_train)} samples, {round(100*len(csv_train)/len(csv_data_hdf5))}%')
print(f'\t- Class 0: {len(csv_train[csv_train.binary == 0])} samples, {round(100*len(csv_train[csv_train.binary == 0])/len(csv_train))}%')
print(f'\t- Class 1: {len(csv_train[csv_train.binary == 1])} samples, {round(100*len(csv_train[csv_train.binary == 1])/len(csv_train))}%')
print(f'Testing set: {len(csv_test)} samples, {round(100*len(csv_test)/len(csv_data_hdf5))}%')
print(f'\t- Class 0: {len(csv_test[csv_test.binary == 0])} samples, {round(100*len(csv_test[csv_test.binary == 0])/len(csv_test))}%')
print(f'\t- Class 1: {len(csv_test[csv_test.binary == 1])} samples, {round(100*len(csv_test[csv_test.binary == 1])/len(csv_test))}%')

Using cluster value 1 for the testing set.
Training set: 89779 samples, 90%
	- Class 0: 49958 samples, 56%
	- Class 1: 39821 samples, 44%
Testing set: 10399 samples, 10%
	- Class 0: 6118 samples, 59%
	- Class 1: 4281 samples, 41%
