In [1]:
import pandas as pd
import numpy as np
import os

# Folder holding the experiment (train/validation and test) CSVs
experiment_csvs_path = '/projects/0/einf2380/data/external/processed/I/experiments'
# Root folder of the per-model output CSVs
outputs_csvs_path = '/projects/0/einf2380/data/pop_paper_data'
# Folder the manuscript CSVs are written to
outfolder = '/'.join([outputs_csvs_path, 'manuscript_csvs'])

In [9]:
def write_out_csv(in_df, out_csv, add_columns=None):
    """Write the manuscript columns of `in_df` to `out_csv`.

    The columns ID, allele, peptide, measurement_value, label, anchor_0 and
    anchor_1 are always written, followed by `add_columns` when given. Anchor
    values are written as integers and missing anchors as the string 'N/A'.
    The conversion is done on a copy, so `in_df` is left unmodified.

    Args:
        in_df: DataFrame containing the columns listed above plus `add_columns`.
        out_csv: Path or file-like object handed to `DataFrame.to_csv`.
        add_columns: Optional iterable of extra column names appended to the output.

    Raises:
        KeyError: If a required or requested column is missing from `in_df`.
        ValueError: If an anchor value is neither missing nor convertible to a number.
    """
    # Placeholder written in place of missing anchor values
    nan_string = 'N/A'

    def format_anchor(value):
        # A value that already is the placeholder (e.g. written by an earlier call) is kept
        if isinstance(value, str) and value == nan_string:
            return nan_string
        if pd.isna(value):
            return nan_string
        # Go through float so that values such as 2.0 or '2.0' become the integer 2
        return int(float(value))

    # Specify columns to keep
    columns_to_keep = ['ID', 'allele', 'peptide', 'measurement_value',
            'label', 'anchor_0', 'anchor_1']
    if add_columns is not None:
        columns_to_keep.extend(add_columns)

    # assign() returns a new DataFrame, so the caller's frame keeps its original anchors
    out_df = in_df.assign(**{
        anchor_column: in_df[anchor_column].map(format_anchor)
        for anchor_column in ('anchor_0', 'anchor_1')
    })

    # Save DataFrame to CSV keeping only the specified columns
    out_df[columns_to_keep].to_csv(out_csv, index=False)

In [10]:
#%% Write full csv
# Both splits are read from the experiments folder configured in experiment_csvs_path
allele_train_csv = f'{experiment_csvs_path}/BA_pMHCI_human_quantitative_only_eq_pseudoseq_clustered_train_validation.csv'
allele_test_csv = f'{experiment_csvs_path}/BA_pMHCI_human_quantitative_only_eq_pseudoseq_clustered_test.csv'

# Tag every row with the split it came from before the two frames are stacked
train_df = pd.read_csv(allele_train_csv)
train_df['PHASE']='Train'
print(train_df.columns)

test_df = pd.read_csv(allele_test_csv)
test_df['PHASE']='Test'
print(test_df.columns)

# ignore_index=True gives the combined frame a fresh 0..n-1 index
full_df = pd.concat([train_df, test_df], ignore_index=True)
#full_df = pd.read_csv('/projects/0/einf2380/data/external/processed/I/CrossValidations/full_dataset.csv')

def extract_numeric_part(id):
    """Return, as an int, the field that follows the first '-' in `id`.

    Raises IndexError when `id` contains no '-' and ValueError when the
    field is not an integer literal.
    """
    fields = id.split('-', 2)
    numeric_field = fields[1]
    return int(numeric_field)

# Reorder the rows by the integer found after the first '-' of each ID
id_numbers = full_df['ID'].apply(extract_numeric_part)
row_order = id_numbers.argsort()
full_df = full_df.iloc[row_order]

# label is 1.0 where measurement_value is below 500 and 0.0 everywhere else
below_threshold = full_df['measurement_value'] < 500
full_df['label'] = np.where(below_threshold, 1.0, 0.0)

print('PHASE' in full_df.columns)

# Write the combined dataset, keeping measurement_source next to the default columns
write_out_csv(full_df, f'{outfolder}/full_dataset.csv', add_columns=['measurement_source'])


Index(['ID', 'allele', 'peptide', 'measurement_value',
       'measurement_inequality', 'measurement_type', 'measurement_kind',
       'measurement_source', 'original_allele', 'db2_folder', 'cluster',
       'anchor_0', 'anchor_1', 'cluster_set_10', 'allele_clustering', 'PHASE'],
      dtype='object')
Index(['ID', 'allele', 'peptide', 'measurement_value',
       'measurement_inequality', 'measurement_type', 'measurement_kind',
       'measurement_source', 'original_allele', 'db2_folder', 'cluster',
       'anchor_0', 'anchor_1', 'cluster_set_10', 'allele_clustering', 'PHASE'],
      dtype='object')
True


In [11]:
# #%% Collect GNN outputs from hdf5 into a csv
# #shuffled 
# GNN_shuffled_outputs = ['/projects/0/einf2380/data/pMHCI/trained_models/deeprank2/experiments/exp_100k_std_transf_bs64_naivegnn1_wloss_n_run_1_0_240605',
#                         '/projects/0/einf2380/data/pMHCI/trained_models/deeprank2/experiments/exp_100k_std_transf_bs64_naivegnn1_wloss_n_run_2_0_240605',
#                         '/projects/0/einf2380/data/pMHCI/trained_models/deeprank2/experiments/exp_100k_std_transf_bs64_naivegnn1_wloss_n_run_3_0_240605',
#                         '/projects/0/einf2380/data/pMHCI/trained_models/deeprank2/experiments/exp_100k_std_transf_bs64_naivegnn1_wloss_n_run_4_0_240605',
#                         '/projects/0/einf2380/data/pMHCI/trained_models/deeprank2/experiments/exp_100k_std_transf_bs64_naivegnn1_wloss_n_run_5_0_240605']

# for i, run in enumerate(GNN_shuffled_outputs):
#     dfot = pd.read_hdf( f'{run}/output/output_exporter.hdf5' , key="testing") 
#     dfot['entry']= [x.split(':')[2] for x in dfot['entry']] 
#     dfot['ID'] = dfot['entry'] 
#     dfot['output']= [x[1] for x in dfot['output']] 
#     dfot.to_csv(f'/projects/0/einf2380/data/pop_paper_data/gnn_outputs/shuffled_crossval/shuffled_gnn_outputs_n_run_{i+1}.csv')

# #allele-clustered 
# GNN_allele_outputs = ['/projects/0/einf2380/data/pMHCI/trained_models/deeprank2/experiments/exp_100k_std_transf_bs64_naivegnn1_wloss_cl_allele_run_1_1_240612',
#                       '/projects/0/einf2380/data/pMHCI/trained_models/deeprank2/experiments/exp_100k_std_transf_bs64_naivegnn1_wloss_cl_allele_run_2_1_240612',
#                       '/projects/0/einf2380/data/pMHCI/trained_models/deeprank2/experiments/exp_100k_std_transf_bs64_naivegnn1_wloss_cl_allele_run_3_1_240612',
#                       '/projects/0/einf2380/data/pMHCI/trained_models/deeprank2/experiments/exp_100k_std_transf_bs64_naivegnn1_wloss_cl_allele_run_4_2_240613',
#                       '/projects/0/einf2380/data/pMHCI/trained_models/deeprank2/experiments/exp_100k_std_transf_bs64_naivegnn1_wloss_cl_allele_run_5_2_240613']

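# NOTE(review): this loop originally iterated over GNN_shuffled_outputs, which would write the
# shuffled runs into the allele_crossval CSVs; it must iterate over GNN_allele_outputs.
# for i, run in enumerate(GNN_allele_outputs):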
#     dfot = pd.read_hdf(f'{run}/output/output_exporter.hdf5' , key="testing") 
#     dfot['entry']= [x.split(':')[2] for x in dfot['entry']] 
#     dfot['ID'] = dfot['entry'] 
#     dfot['output']= [x[1] for x in dfot['output']] 
#     dfot.to_csv(f'/projects/0/einf2380/data/pop_paper_data/gnn_outputs/allele_crossval/allele_gnn_outputs_n_run_{i+1}.csv')

PermissionError: [Errno 13] Permission denied: '/projects/0/einf2380/data/pop_paper_data/gnn_outputs/shuffled_crossval/shuffled_gnn_outputs_n_run_1.csv'

In [31]:
def _sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), computed through logaddexp."""
    return np.exp(-np.logaddexp(0, -x))


def _fold_csv_path(first_fold_csv, fold):
    """Derive the CSV path of `fold` from the path of the fold-1 CSV.

    Every '1' character of the file name (the directory part is untouched) is
    replaced by the fold number, so the fold-1 file name must not contain any
    other '1'.
    """
    directory, separator, file_name = first_fold_csv.rpartition('/')
    return directory + separator + file_name.replace('1', str(fold))


def _merge_fold_outputs(full_df, first_fold_csv, title, id_column, output_column,
                        transform=None, read_csv_kwargs=None, n_folds=5):
    """Left-merge the per-fold predictions of one model onto `full_df`.

    Args:
        full_df: DataFrame with an 'ID' column; it is not modified.
        first_fold_csv: Path of the fold-1 CSV; the other folds are derived from it.
        title: Model name used to build the '<title>_fold<i>_output' column names.
        id_column: Column of the prediction CSV matched against full_df['ID'].
        output_column: Column of the prediction CSV holding the model output.
        transform: Optional callable applied to the output column before merging.
        read_csv_kwargs: Optional extra keyword arguments for `pd.read_csv`.
        n_folds: Number of folds to read (1..n_folds).

    Returns:
        (merged_df, output_columns): the merged frame without the rows that have
        no prediction in any fold, and the list of added column names.
    """
    merged_df = full_df
    output_columns = []
    for fold in range(1, n_folds + 1):
        fold_df = pd.read_csv(_fold_csv_path(first_fold_csv, fold), **(read_csv_kwargs or {}))
        values = fold_df[output_column]
        if transform is not None:
            values = transform(values)
        column_name = f'{title}_fold{fold}_output'
        fold_outputs = pd.DataFrame({'ID': fold_df[id_column], column_name: values})
        merged_df = pd.merge(merged_df, fold_outputs, on='ID', how='left')
        output_columns.append(column_name)
    # Keep only the rows for which at least one fold produced a prediction
    merged_df = merged_df.dropna(subset=output_columns, how='all')
    return merged_df, output_columns


def save_models_csvs(models_csvs, full_df, experiment):
    """Merge each model's predictions with `full_df` and write one CSV per model.

    For CNN, GNN, EGNN, 3D-SSL and MLP, `models_csvs[title]` is the path of the
    fold-1 CSV; folds 2-5 are found by replacing '1' with the fold number in the
    file name. The result is written to
    '<outfolder>/<title>_<experiment>_test_crossval.csv'. For MHCflurry a single
    CSV is read and written to '<outfolder>/<title>_<experiment>_test.csv'.

    Models that have no entry in `models_csvs` are skipped with a printed
    message instead of raising KeyError, so the remaining models are still
    written.

    Args:
        models_csvs: Mapping from model title to CSV path.
        full_df: DataFrame with the dataset columns, including 'ID'.
        experiment: Experiment name inserted in the output file names.
    """
    # Per-model layout of the prediction CSVs: which column is the ID, which one
    # holds the output, and how the file has to be read / post-processed.
    crossval_specs = {
        'CNN': dict(id_column='KEY', output_column='OUTPUT_1'),
        'GNN': dict(id_column='ID', output_column='output'),
        # EGNN and 3D-SSL outputs are passed through a sigmoid before being stored
        'EGNN': dict(id_column='Key', output_column='Output', transform=_sigmoid),
        '3D-SSL': dict(id_column='Key', output_column='Log_prob', transform=_sigmoid),
        # MLP files have no header row
        'MLP': dict(id_column='ID', output_column='OUTPUT_1',
                    read_csv_kwargs=dict(header=None, names=['ID', 'BA', 'OUTPUT_1'])),
    }

    for title in ('CNN', 'GNN', 'EGNN', '3D-SSL', 'MHCflurry', 'MLP'):
        if title not in models_csvs:
            print(f'Skipping {title}: no CSV given for experiment {experiment!r}')
            continue

        if title == 'MHCflurry':
            # Single headerless file, no folds
            mhcflurry_dfot = pd.read_csv(models_csvs[title],
                            header=None, names=['ID', 'BA', 'OUTPUT_1'])
            # NOTE(review): how='right' keeps every MHCflurry row, even IDs absent from
            # full_df, unlike the left merge + dropna used for the other models -- confirm intended
            mhcflurry_df = pd.merge(full_df, mhcflurry_dfot[['ID', 'OUTPUT_1']].rename(columns={'OUTPUT_1': f'{title}_output'}), on='ID', how='right')
            write_out_csv(mhcflurry_df, out_csv=f'{outfolder}/{title}_{experiment}_test.csv', add_columns=[f'{title}_output'])
            continue

        model_df, output_columns = _merge_fold_outputs(
            full_df, models_csvs[title], title, **crossval_specs[title])
        write_out_csv(model_df, out_csv=f'{outfolder}/{title}_{experiment}_test_crossval.csv', add_columns=output_columns)

In [32]:
#%% Get Shuffled experiment test sets csvs
# Earlier single-file version of the mapping, kept for reference:
# shuff_models_csvs = {'CNN':'/projects/0/einf2380/data/pop_paper_data/cnn_outputs/shuffled_cnn_outputs.csv',
#                 'GNN':'/projects/0/einf2380/data/pop_paper_data/gnn_outputs/shuffled_gnn_outputs.csv',
#                 'EGNN':'/projects/0/einf2380/data/pop_paper_data/egnn_outputs/egnn_supervised_shuffle.csv',
#                 '3D-SSL':'/projects/0/einf2380/data/pop_paper_data/egnn_outputs/egnn_ssl_shuffle.csv',
#                 'MHCflurry':'/projects/0/einf2380/data/pop_paper_data/mhcflurry_outputs/shuffled_mhcflurry_outputs.csv',
#                 'MLP':'/projects/0/einf2380/data/pop_paper_data/mlp_outputs/shuffled_mlp_outputs.csv',
# }

# Model title -> CSV path; for the cross-validated models this is the fold-1 file.
# NOTE(review): there is no '3D-SSL' entry here, unlike the allele-clustered mapping --
# confirm save_models_csvs copes with a missing model, or add the path.
shuff_models_csvs = {
    'CNN': '/projects/0/einf2380/data/pop_paper_data/cnn_outputs/shuffled_crossval/shuffled_cnn_outputs_fold_1.csv',
    'GNN': '/projects/0/einf2380/data/pop_paper_data/gnn_outputs/shuffled_crossval/shuffled_gnn_outputs_n_run_1.csv',
    'EGNN': '/projects/0/einf2380/data/pop_paper_data/egnn_outputs/shuffled_crossval/Shuffled_1_final_test_test_results.csv',
    'MHCflurry': '/projects/0/einf2380/data/pop_paper_data/mhcflurry_outputs/shuffled_mhcflurry_outputs.csv',
    'MLP': '/projects/0/einf2380/data/pop_paper_data/mlp_outputs/shuffled_crossval/Shuffled_1.csv',
}

save_models_csvs(shuff_models_csvs, full_df, 'shuffled')



In [33]:
# Display the combined train/test dataframe for a visual check
full_df

Unnamed: 0,ID,allele,peptide,measurement_value,measurement_inequality,measurement_type,measurement_kind,measurement_source,original_allele,db2_folder,cluster,anchor_0,anchor_1,cluster_set_10,allele_clustering,PHASE,label
89815,BA-41323,HLA-A*01:01,AADKAAAAAY,45.000000,=,quantitative,affinity,Michel - purified MHC/competitive/radioactivity,HLA-A*01:01,/projects/0/einf2380/data/pMHCI/3d_models/BA_1...,6.0,2,10,0.0,1,Test,1.0
89816,BA-41324,HLA-A*01:01,AADKAAAAY,50.000000,=,quantitative,affinity,Sette - purified MHC/competitive/radioactivity,HLA-A*01:01,/projects/0/einf2380/data/pMHCI/3d_models/BA_1...,3.0,2,9,5.0,1,Test,1.0
89817,BA-41330,HLA-A*01:01,AADSFATSY,76.300000,=,quantitative,affinity,Buus - purified MHC/direct/fluorescence,HLA-A*01:01,/projects/0/einf2380/data/pMHCI/3d_models/BA_1...,2.0,2,9,5.0,1,Test,1.0
89818,BA-41350,HLA-A*01:01,AAHSARPPPY,18800.000000,=,quantitative,affinity,Sette - purified MHC/competitive/radioactivity,HLA-A*01:01,/projects/0/einf2380/data/pMHCI/3d_models/BA_1...,6.0,2,10,2.0,1,Test,0.0
89819,BA-41364,HLA-A*01:01,AASGFTFSSY,4972.665793,=,quantitative,affinity,kim2014,HLA-A*01:01,/projects/0/einf2380/data/pMHCI/3d_models/BA_1...,6.0,2,10,10.0,1,Test,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89810,BA-577885,HLA-E*01:03,VMAPLGPIL,800.000000,=,quantitative,affinity,kim2014,HLA-E*01:03,/projects/0/einf2380/data/pMHCI/3d_models/BA_1...,7.0,2,9,5.0,0,Train,0.0
89811,BA-577886,HLA-E*01:03,VMATRRNVL,16000.000000,=,quantitative,affinity,kim2014,HLA-E*01:03,/projects/0/einf2380/data/pMHCI/3d_models/BA_1...,6.0,2,9,4.0,0,Train,0.0
89812,BA-577887,HLA-E*01:03,VMMSEIAGL,13000.000000,=,quantitative,affinity,kim2014,HLA-E*01:03,/projects/0/einf2380/data/pMHCI/3d_models/BA_1...,6.0,2,9,5.0,0,Train,0.0
89813,BA-577888,HLA-E*01:03,VMTTVLATL,1800.000000,=,quantitative,affinity,kim2014,HLA-E*01:03,/projects/0/einf2380/data/pMHCI/3d_models/BA_1...,6.0,2,9,5.0,0,Train,0.0


In [35]:
#%% Get Allele clustered experiment test sets csvs
# Earlier single-file version of the mapping, kept for reference:
# allele_models_csvs = {'CNN':'/projects/0/einf2380/data/pop_paper_data/cnn_outputs/allele_cnn_outputs.csv',
#                 'GNN':'/projects/0/einf2380/data/pop_paper_data/gnn_outputs/allele_gnn_outputs.csv',
#                 'EGNN':'/projects/0/einf2380/data/pop_paper_data/egnn_outputs/egnn_supervised_allele.csv',
#                 '3D-SSL':'/projects/0/einf2380/data/pop_paper_data/egnn_outputs/egnn_ssl_allele.csv',
#                 'MHCflurry':'/projects/0/einf2380/data/pop_paper_data/mhcflurry_outputs/allele_mhcflurry_outputs.csv',
#                 'MLP':'/projects/0/einf2380/data/pop_paper_data/mlp_outputs/allele_mlp_outputs.csv',
# }

# Model title -> CSV path; for the cross-validated models this is the fold-1 file.
allele_models_csvs = {
    'CNN': '/projects/0/einf2380/data/pop_paper_data/cnn_outputs/allele_crossval/allele_cnn_outputs_fold_1.csv',
    'GNN': '/projects/0/einf2380/data/pop_paper_data/gnn_outputs/allele_crossval/allele_gnn_outputs_n_run_1.csv',
    'EGNN': '/projects/0/einf2380/data/pop_paper_data/egnn_outputs/allele_crossval/AlleleClustered_1_final_test_test_results.csv',
    '3D-SSL': '/projects/0/einf2380/data/pop_paper_data/3DSSL_outputs/xray_only/xray_1_test_results.csv',
    'MHCflurry': '/projects/0/einf2380/data/pop_paper_data/mhcflurry_outputs/allele_mhcflurry_outputs.csv',
    'MLP': '/projects/0/einf2380/data/pop_paper_data/mlp_outputs/allele_crossval/AlleleClustered_1.csv',
}

save_models_csvs(allele_models_csvs, full_df, 'AlleleClustered')