In [None]:
import numpy as np
import pandas as pd

In [None]:
# Split whose predictions are analysed; all input paths below are derived from it.
split = 'chemical_cv4' # Run for each split of interest
train_assay_file = '../predictions/{}/assay_matrix_discrete_train_scaff.csv'.format(split)
test_assay_file = '../predictions/{}/assay_matrix_discrete_test_scaff.csv'.format(split)
aucs_file = '../predictions/{}/2022_01_evaluation_all_data.csv'.format(split)

train_assay_df = pd.read_csv(train_assay_file)
test_assay_df = pd.read_csv(test_assay_file)

# Raw descriptor name (as stored in the evaluation file) -> short modality label.
modalities_dict = {'mobc_es_op':'MO', 'cp_es_op':'CS', 'ge_es_op':'GE', 'late_fusion_cs_ge':'CS+GE', 'late_fusion_cs_ge_mobc':'CS+GE+MO', 'late_fusion_cs_mobc':'CS+MO', 'late_fusion_ge_mobc':'GE+MO'}
# Derived from the mapping so the filter and the relabelling can never drift apart.
modalities = list(modalities_dict)

aucs_df = pd.read_csv(aucs_file).set_index('assay_id')
# Keep only the descriptors of interest. The explicit copy makes the frame
# independent of the unfiltered one, so the relabelling below is a plain
# assignment rather than an in-place edit of a column pulled out of a
# filtered frame (which warns, and is a silent no-op under Copy-on-Write).
aucs_df = aucs_df[aucs_df['descriptor'].isin(modalities)].copy()
aucs_df['descriptor'] = aucs_df['descriptor'].replace(modalities_dict)

# Every column after the first is treated as an assay.
# NOTE(review): assumes the first column is the compound identifier ('smiles') - confirm.
assays = train_assay_df.columns[1:]

In [None]:
# Per-assay statistics of the test matrix, each dict keyed by assay column name:
#   readouts            - number of non-missing measurements
#   readouts_compounds  - smiles of the compounds that have a measurement
#   hits                - sum of the non-missing values
#   hits_compounds      - smiles of the compounds whose value is > 0
#   hit_rate            - hits / readouts, or the string 'NA' without readouts
readouts, readouts_compounds = {}, {}
hits, hits_compounds = {}, {}
hit_rate = {}
for assay in assays:
    assay_values = test_assay_df[assay].to_numpy()
    has_readout = test_assay_df[assay].notna()
    is_hit = test_assay_df[assay] > 0

    readouts[assay] = np.count_nonzero(~np.isnan(assay_values))
    readouts_compounds[assay] = set(test_assay_df.loc[has_readout, 'smiles'])
    hits[assay] = int(np.nansum(assay_values))
    hits_compounds[assay] = set(test_assay_df.loc[is_hit, 'smiles'])
    # Assays without any readout get the 'NA' marker instead of a division by zero.
    hit_rate[assay] = hits[assay] / readouts[assay] if readouts[assay] != 0 else 'NA'

# One single-column frame per statistic, indexed by assay, ready to be merged.
readouts_df = pd.Series(readouts).to_frame("readouts")
hits_df = pd.Series(hits).to_frame("hits")
hit_rate_df = pd.Series(hit_rate).to_frame("hit_rate")

In [None]:
# Attach readouts, hits and hit rate to every row of the evaluation table by
# matching on the assay index (default inner joins), then label the index.
aucs_df_readouts_hits_df = (
    aucs_df
    .merge(readouts_df, left_index=True, right_index=True)
    .merge(hits_df, left_index=True, right_index=True)
    .merge(hit_rate_df, left_index=True, right_index=True)
    .rename_axis('assay_id')
)
aucs_df_readouts_hits_df

In [None]:
def _top_rank_hit_rate(predictions_df, assay, n_hits, measured_smiles, hit_smiles):
    """Return the fraction of true hits found among the top-ranked predictions.

    Only compounds with a readout for ``assay`` are ranked. They are sorted by
    predicted score (highest first, missing scores last) and the first
    ``n_hits`` are kept; the result is the number of true hits among them
    divided by ``n_hits``.

    Parameters
    ----------
    predictions_df : pandas.DataFrame
        Predictions with a 'smiles' column and one score column per assay.
    assay : str
        Name of the assay column to rank by.
    n_hits : int
        Number of true hits for the assay; used both as the cut-off and as the
        normalising denominator. Must be non-zero.
    measured_smiles : set
        Smiles of the compounds that have a readout for the assay.
    hit_smiles : set
        Smiles of the compounds that are true hits for the assay.
    """
    measured = predictions_df[predictions_df['smiles'].isin(measured_smiles)]
    top_ranked = measured.sort_values(by=assay, ascending=False, na_position='last').head(n_hits)
    return len(hit_smiles.intersection(set(top_ranked['smiles']))) / n_hits


top_hit_rate_dict = {'CS':{}, 'GE': {}, 'MO':{}, 'CS+GE':{}, 'GE+MO':{}, 'CS+MO':{}, 'CS+GE+MO': {} }
list_dataframe = []

# Modality label -> prediction file of that modality. Each modality must be
# scored on its OWN predictions; the mapping order fixes the row order of
# `list_dataframe`.
prediction_files = {
    'CS': 'predictions_cp_es_op.csv',
    'MO': 'predictions_mobc_es_op.csv',
    'GE': 'predictions_ge_es_op.csv',
    'GE+MO': 'late_fusion_ge_mobc.csv',
    'CS+MO': 'late_fusion_cs_mobc.csv',
    'CS+GE': 'late_fusion_cs_ge.csv',
    'CS+GE+MO': 'late_fusion_cs_ge_mobc.csv',
}

predictions_dfs = {}
for descriptor, file_name in prediction_files.items():
    predictions_file = '../predictions/{}/predictions/{}'.format(split, file_name)
    predictions_dfs[descriptor] = pd.read_csv(predictions_file)
    for a in assays:
        if hits[a] != 0:
            rate = _top_rank_hit_rate(
                predictions_dfs[descriptor], a, hits[a], readouts_compounds[a], hits_compounds[a]
            )
        else:
            # No true hits for this assay: the rate is undefined.
            rate = 'NA'
        top_hit_rate_dict[descriptor][a] = rate
        list_dataframe.append([a, descriptor, rate])

In [None]:
# Long-format table with one row per [assay, modality, rate] record collected above.
top1_columns = ['assay_id', 'descriptor', 'top_rank_hit_rate']
top1_hit_rate_df = pd.DataFrame(data=list_dataframe, columns=top1_columns)
top1_hit_rate_df

In [None]:
# Move the assay index back into a column, then join on the columns both
# tables have in common (default inner merge).
aucs_with_assay_column = aucs_df_readouts_hits_df.reset_index(level=0)
final_df = aucs_with_assay_column.merge(top1_hit_rate_df)

In [None]:
# Display the merged table for a visual check before it is written to disk.
final_df

In [None]:
# Write the merged table to a CSV whose name embeds the analysed split.
output_file = '../predictions/toprank_{}_hitsnorm.csv'.format(split)
final_df.to_csv(output_file)