In [22]:
import pandas as pd
import numpy as np
import os

# Folder containing the processed experiment CSVs (train/validation and test splits).
experiment_csvs_path = '/projects/0/einf2380/data/external/processed/I/experiments'
# Root folder under which the per-model prediction exports are stored.
outputs_csvs_path = '/projects/0/einf2380/data/pop_paper_data'
# Destination folder for the CSVs written by this notebook.
outfolder = outputs_csvs_path + '/manuscript_csvs'

In [23]:
def write_out_csv(in_df, out_csv, add_columns=None):
    """Write a fixed subset of columns of ``in_df`` to ``out_csv``.

    The columns 'ID', 'allele', 'peptide', 'measurement_value', 'label',
    'anchor_0', 'anchor_1' and 'PHASE' are always written, followed by any
    columns listed in ``add_columns``. In the two anchor columns, missing
    values are written as the string 'N/A' and every other value is written
    as an integer (e.g. 2.0 -> 2).

    Parameters
    ----------
    in_df : pandas.DataFrame
        Source table. It is not modified; the formatting is applied to a copy.
    out_csv : str or path-like
        Destination file, written without the index column.
    add_columns : iterable of str, optional
        Extra column names to append after the default ones.

    Raises
    ------
    KeyError
        If any requested column is absent from ``in_df``.
    ValueError
        If a non-missing anchor value cannot be converted to a number.
    """
    # Placeholder written in place of missing anchor positions.
    nan_string = 'N/A'

    def _format_anchor(value):
        # Values that already carry the placeholder are left untouched so the
        # function gives the same result when fed its own earlier output.
        already_marked = isinstance(value, str) and value == nan_string
        if already_marked or pd.isna(value):
            return nan_string
        # Go through float first so that both 2.0 and '2.0' become 2.
        return int(float(value))

    # Specify columns to keep
    columns_to_keep = ['ID', 'allele', 'peptide', 'measurement_value',
                       'label', 'anchor_0', 'anchor_1', 'PHASE']
    if add_columns is not None:
        columns_to_keep.extend(add_columns)

    # Work on a copy of the selected columns so the caller's frame is unchanged.
    out_df = in_df[columns_to_keep].copy()
    for anchor_col in ('anchor_0', 'anchor_1'):
        out_df[anchor_col] = out_df[anchor_col].map(_format_anchor)

    # Save DataFrame to CSV keeping only the specified columns
    out_df.to_csv(out_csv, index=False)

In [24]:
#%% Write full csv
# CSVs holding the train/validation split and the test split of the dataset.
shuff_train_csv = '/projects/0/einf2380/data/external/processed/I/experiments/BA_pMHCI_human_quantitative_only_eq_pseudoseq_clustered_train_validation.csv'
shuff_test_csv = '/projects/0/einf2380/data/external/processed/I/experiments/BA_pMHCI_human_quantitative_only_eq_pseudoseq_clustered_test.csv'

# Tag every row with the split it was loaded from. Each frame gets its own
# PHASE value, so the column is fully populated after concatenation.
train_df = pd.read_csv(shuff_train_csv).assign(PHASE='Train')
test_df = pd.read_csv(shuff_test_csv).assign(PHASE='Test')

# Stack both splits into one table with a fresh 0..n-1 index.
full_df = pd.concat([train_df, test_df], ignore_index=True)

def extract_numeric_part(id):
    """Return, as an int, the piece of ``id`` that follows its first '-'.

    The string is split on '-' and the second piece is converted, so
    'BA-123' gives 123. Raises IndexError when ``id`` contains no '-' and
    ValueError when the second piece is not an integer literal.
    """
    pieces = id.split('-')
    second_piece = pieces[1]
    return int(second_piece)

# Reorder the rows by the integer embedded in each ID.
id_numbers = full_df['ID'].map(extract_numeric_part)
row_order = id_numbers.argsort()
full_df = full_df.iloc[row_order]

# Binary label: 1.0 where measurement_value is below 500, 0.0 elsewhere.
is_below_threshold = full_df['measurement_value'] < 500
full_df['label'] = np.where(is_below_threshold, 1.0, 0.0)

# Write the complete table, keeping the measurement source as an extra column.
write_out_csv(full_df, f'{outfolder}/full_dataset.csv', add_columns=['measurement_source'])


In [18]:
#%% Collect GNN outputs from hdf5 into a csv
# One (experiment, source hdf5, destination csv) triple per experiment; the
# same conversion is applied to each of them.
gnn_exports = [
    ('shuffled',
     '/projects/0/einf2380/data/pMHCI/trained_models/deeprank2/experiments/exp_100k_std_transf_bs64_naivegnn1_wloss_0_230607/output/output_exporter.hdf5',
     '/projects/0/einf2380/data/pop_paper_data/gnn_outputs/shuffled_gnn_outputs.csv'),
    ('allele-clustered',
     '/projects/0/einf2380/data/pMHCI/trained_models/deeprank2/experiments/exp_100k_std_transf_bs64_naivegnn1_wloss_cl_allele_0_230607/output/output_exporter.hdf5',
     '/projects/0/einf2380/data/pop_paper_data/gnn_outputs/allele_gnn_outputs.csv'),
]

for _experiment, hdf5_path, csv_path in gnn_exports:
    # Only the table stored under the "testing" key is exported.
    dfot = pd.read_hdf(hdf5_path, key="testing")
    # Keep the third ':'-separated field of each entry and reuse it as the ID.
    dfot['entry'] = [x.split(':')[2] for x in dfot['entry']]
    dfot['ID'] = dfot['entry']
    # Keep the element at index 1 of each output
    # (presumably the positive-class score -- TODO confirm).
    dfot['output'] = [x[1] for x in dfot['output']]
    dfot.to_csv(csv_path)

In [19]:
def _sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated as exp(-log(1 + exp(-x)))."""
    return np.exp(-np.logaddexp(0, -x))


def _read_key_output1(csv_path):
    """Read a csv with a header whose IDs are in 'KEY' and scores in 'OUTPUT_1'."""
    raw = pd.read_csv(csv_path)
    return pd.DataFrame({'ID': raw['KEY'], 'output': raw['OUTPUT_1']})


def _read_id_output(csv_path):
    """Read a csv that already provides 'ID' and 'output' columns."""
    return pd.read_csv(csv_path)[['ID', 'output']]


def _read_validation_sigmoid(csv_path):
    """Read a csv of raw 'OUTPUT_0' values and return their sigmoid per 'KEY'.

    Only rows whose 'PHASE' contains 'validation' (case-insensitive) are kept;
    rows with a missing 'PHASE' are dropped.
    """
    raw = pd.read_csv(csv_path)
    is_validation = raw['PHASE'].str.contains('validation', case=False, na=False)
    kept = raw[is_validation]
    # Build a new frame rather than assigning columns on the filtered slice.
    return pd.DataFrame({'ID': kept['KEY'], 'output': _sigmoid(kept['OUTPUT_0'])})


def _read_headerless(csv_path):
    """Read a header-less csv whose three columns are taken as ID, BA, OUTPUT_1."""
    raw = pd.read_csv(csv_path, header=None, names=['ID', 'BA', 'OUTPUT_1'])
    return raw[['ID', 'OUTPUT_1']].rename(columns={'OUTPUT_1': 'output'})


def save_models_csvs(models_csvs, full_df, experiment):
    """Write one csv per model joining its predictions with the dataset rows.

    For each of 'CNN', 'GNN', 'EGNN', '3D-SSL', 'MHCflurry' and 'MLP' (in that
    order) the model's predictions are read from ``models_csvs[title]``,
    right-merged with ``full_df`` on 'ID' (so one row per prediction is kept)
    and written through ``write_out_csv`` to
    ``{outfolder}/{title}_{experiment}_test.csv`` with an extra
    ``{title}_output`` column.

    Parameters
    ----------
    models_csvs : dict
        Maps each of the six model titles to the path of its predictions csv.
    full_df : pandas.DataFrame
        Dataset table; must contain 'ID' and the columns ``write_out_csv`` keeps.
    experiment : str
        Label inserted into the output file names.

    Raises
    ------
    KeyError
        If a model title is missing from ``models_csvs`` or an expected column
        is missing from one of the csvs.
    """
    # NOTE(review): the EGNN and 3D-SSL rows are selected by PHASE containing
    # 'validation' although the files are written as *_test.csv -- confirm that
    # the held-out set is logged under that phase in those exports.
    readers = (
        ('CNN', _read_key_output1),
        ('GNN', _read_id_output),
        ('EGNN', _read_validation_sigmoid),
        ('3D-SSL', _read_validation_sigmoid),
        ('MHCflurry', _read_headerless),
        ('MLP', _read_headerless),
    )

    for title, read_predictions in readers:
        output_column = f'{title}_output'
        predictions = read_predictions(models_csvs[title]).rename(columns={'output': output_column})
        # how='right' keeps exactly the rows the model produced a prediction for.
        merged = pd.merge(full_df, predictions, on='ID', how='right')
        write_out_csv(merged, out_csv=f'{outfolder}/{title}_{experiment}_test.csv', add_columns=[output_column])

In [20]:
#%% Get Shuffled experiment test sets csvs
# Model title -> csv holding that model's predictions for the shuffled experiment.
shuff_models_csvs = dict([
    ('CNN', '/projects/0/einf2380/data/pop_paper_data/cnn_outputs/shuffled_cnn_outputs.csv'),
    ('GNN', '/projects/0/einf2380/data/pop_paper_data/gnn_outputs/shuffled_gnn_outputs.csv'),
    ('EGNN', '/projects/0/einf2380/data/pop_paper_data/egnn_outputs/egnn_supervised_shuffle.csv'),
    ('3D-SSL', '/projects/0/einf2380/data/pop_paper_data/egnn_outputs/egnn_ssl_shuffle.csv'),
    ('MHCflurry', '/projects/0/einf2380/data/pop_paper_data/mhcflurry_outputs/shuffled_mhcflurry_outputs.csv'),
    ('MLP', '/projects/0/einf2380/data/pop_paper_data/mlp_outputs/shuffled_mlp_outputs.csv'),
])

# Write one merged csv per model, with 'shuffled' in the file names.
save_models_csvs(shuff_models_csvs, full_df, 'shuffled')



In [21]:
#%% Get Allele clustered experiment test sets csvs
# Model title -> csv holding that model's predictions for the allele-clustered experiment.
allele_models_csvs = dict([
    ('CNN', '/projects/0/einf2380/data/pop_paper_data/cnn_outputs/allele_cnn_outputs.csv'),
    ('GNN', '/projects/0/einf2380/data/pop_paper_data/gnn_outputs/allele_gnn_outputs.csv'),
    ('EGNN', '/projects/0/einf2380/data/pop_paper_data/egnn_outputs/egnn_supervised_allele.csv'),
    ('3D-SSL', '/projects/0/einf2380/data/pop_paper_data/egnn_outputs/egnn_ssl_allele.csv'),
    ('MHCflurry', '/projects/0/einf2380/data/pop_paper_data/mhcflurry_outputs/allele_mhcflurry_outputs.csv'),
    ('MLP', '/projects/0/einf2380/data/pop_paper_data/mlp_outputs/allele_mlp_outputs.csv'),
])

# Write one merged csv per model, with 'AlleleClustered' in the file names.
save_models_csvs(allele_models_csvs, full_df, 'AlleleClustered')