In [1]:
import sys
import torch

# Global RNG seed; the same value is handed to the Trainer further down.
seed = 42
torch.manual_seed(seed)
# Optional: uncomment to run everything in double precision.
# torch.set_default_dtype(torch.float64)
# Compact tensor printing: two decimals, no scientific notation.
torch.set_printoptions(sci_mode=False, precision=2)


import pandas as pd
import numpy as np
import ast
import copy

# Load Modules
# Put the parent directory on the import path *before* the `fiora` imports below
# (presumably the notebook lives one level below the package root -- confirm).
sys.path.append("..")
from os.path import expanduser
# Home directory of the current user; used as the root of every data path in this notebook.
home = expanduser("~")
from fiora.MOL.constants import DEFAULT_PPM, PPM, DEFAULT_MODES
from fiora.IO.LibraryLoader import LibraryLoader
from fiora.MOL.FragmentationTree import FragmentationTree 
import fiora.visualization.spectrum_visualizer as sv

from sklearn.metrics import r2_score
import scipy
from rdkit import RDLogger
# Turn off RDKit's 'rdApp.*' log channels so their messages do not flood the cell outputs.
RDLogger.DisableLog('rdApp.*')

# Record the interpreter version alongside the results.
print(f'Working with Python {sys.version}')


Working with Python 3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:26:04) [GCC 10.4.0]


## Data

In [2]:
from typing import Literal

# Label of the library collection used for this run (only printed in this part of the notebook).
lib: Literal["NIST", "MSDIAL", "NIST/MSDIAL"] = "NIST/MSDIAL"
print(f"Preparing {lib} library")

# Debug switch (default: False). When True, later cells truncate the data and
# shorten training, so the reported numbers are not representative.
test_run = False
if test_run:
    print("+++ This is a test run with a small subset of data points. Results are not representative. +++")

Preparing NIST/MSDIAL library


In [3]:
# Maps internal metadata field names (keys) to the DataFrame column names (values)
# from which the corresponding value is read.
metadata_key_map = dict(
    name="Name",
    collision_energy="CE",
    instrument="Instrument_type",
    ionization="Ionization",
    precursor_mz="PrecursorMZ",
    precursor_mode="Precursor_type",
    retention_time="RETENTIONTIME",
    ccs="CCS",
)


#
# Load specified libraries and align metadata
#

def load_training_data(path=None):
    """Load the preprocessed training table through ``LibraryLoader``.

    Args:
        path: CSV file to read. When omitted, falls back to
            ``<home>/data/metabolites/preprocessed/training_min2.csv``, where
            ``home`` is the notebook-level home directory.

    Returns:
        Whatever ``LibraryLoader.load_from_csv`` returns for that file (the
        rest of the notebook treats it as a pandas DataFrame).
    """
    if path is None:
        # Resolved at call time so a changed `home` is picked up.
        path = f"{home}/data/metabolites/preprocessed/training_min2.csv"
    return LibraryLoader().load_from_csv(path)

df = load_training_data()


class _NanToNone(ast.NodeTransformer):
    """Rewrite the bare identifier ``nan`` to the constant ``None``."""

    def visit_Name(self, node):
        if node.id == "nan":
            return ast.copy_location(ast.Constant(value=None), node)
        return node


def _literal_eval_nan_as_none(text):
    """Parse a Python-literal string, reading bare ``nan`` tokens as ``None``.

    A plain ``text.replace('nan', 'None')`` also rewrites the letters "nan"
    inside quoted strings (e.g. a compound name such as "Phenanthrene"). Doing
    the substitution on the syntax tree touches only the bare ``nan`` token
    and leaves every string value intact.

    Args:
        text: String holding the repr of a dict/list literal.

    Returns:
        The evaluated literal with every bare ``nan`` replaced by ``None``.

    Raises:
        ValueError, SyntaxError: as ``ast.literal_eval`` does for malformed input.
    """
    if 'nan' not in text:
        # Fast path: nothing to substitute.
        return ast.literal_eval(text)
    # literal_eval strips leading blanks before parsing; do the same here.
    tree = ast.parse(text.lstrip(" \t"), mode="eval")
    return ast.literal_eval(_NanToNone().visit(tree))


# Restore dictionary values (stored in the CSV as their string representation)
dict_columns = ["peaks", "summary"]
for col in dict_columns:
    df[col] = df[col].apply(_literal_eval_nan_as_none)

df['group_id'] = df['group_id'].astype(int)


In [4]:
# import polars as pl
# pdf = pl.from_pandas(df)

In [5]:
%%capture
from fiora.MOL.Metabolite import Metabolite
from fiora.GNN.AtomFeatureEncoder import AtomFeatureEncoder
from fiora.GNN.BondFeatureEncoder import BondFeatureEncoder
from fiora.GNN.SetupFeatureEncoder import SetupFeatureEncoder


# Upper bounds written into the encoders' normalisation settings below.
CE_upper_limit = 80.0
weight_upper_limit = 800.0


if test_run:
    # Take an independent copy: the column assignments below would otherwise
    # write onto a slice of the full table (pandas SettingWithCopy situation).
    df = df.iloc[:10000,:].copy()
    #df = df.iloc[5000:20000,:]


# Build one Metabolite per SMILES string and create its structure graph.
df["Metabolite"] = df["SMILES"].apply(Metabolite)
df["Metabolite"].apply(lambda x: x.create_molecular_structure_graph())

# Feature encoders for atoms, bonds, the experimental setup, and the retention-time
# branch (the latter uses the setup features without collision energy).
node_encoder = AtomFeatureEncoder(feature_list=["symbol", "num_hydrogen", "ring_type"])
bond_encoder = BondFeatureEncoder(feature_list=["bond_type", "ring_type"])
setup_encoder = SetupFeatureEncoder(feature_list=["collision_energy", "molecular_weight", "precursor_mode", "instrument"])
rt_encoder = SetupFeatureEncoder(feature_list=["molecular_weight", "precursor_mode", "instrument"])

# Override the "max" normalisation entries with the limits defined above.
setup_encoder.normalize_features["collision_energy"]["max"] = CE_upper_limit
setup_encoder.normalize_features["molecular_weight"]["max"] = weight_upper_limit
rt_encoder.normalize_features["molecular_weight"]["max"] = weight_upper_limit

# The calls below are made for their effect on the Metabolite objects; return values are discarded.
df["Metabolite"].apply(lambda x: x.compute_graph_attributes(node_encoder, bond_encoder))
df.apply(lambda x: x["Metabolite"].set_id(x["group_id"]) , axis=1)

#df["summary"] = df.apply(lambda x: {key: x[name] for key, name in metadata_key_map.items()}, axis=1)
df.apply(lambda x: x["Metabolite"].add_metadata(x["summary"], setup_encoder, rt_encoder), axis=1)


In [6]:
%%capture
# Fragment every training metabolite; depth=1 is passed straight to Metabolite.fragment_MOL.
df["Metabolite"].apply(lambda x: x.fragment_MOL(depth=1))
# Match each metabolite's fragments against its measured peaks (m/z and intensity lists from
# the "peaks" dict) using the row's own ppm_peak_tolerance. The return values are discarded,
# so this presumably stores the match on the Metabolite object itself -- confirm in fiora.
df.apply(lambda x: x["Metabolite"].match_fragments_to_peaks(x["peaks"]["mz"], x["peaks"]["intensity"], tolerance=x["ppm_peak_tolerance"]), axis=1)

##### Load CASMI data

In [7]:
# Locations of the two CASMI challenge tables.
casmi_root = f"{home}/data/metabolites"
casmi16_path = f"{casmi_root}/CASMI_2016/casmi16_withCSS.csv"
casmi22_path = f"{casmi_root}/CASMI_2022/casmi22_withCSS.csv"

# Read both tables with identical options (first CSV column becomes the index).
df_cas, df_cas22 = (
    pd.read_csv(csv_path, index_col=[0], low_memory=False)
    for csv_path in (casmi16_path, casmi22_path)
)

# Restore dictionary values: these columns hold Python literals stored as text.
dict_columns = ["peaks", "Candidates"]
for col in dict_columns:
    df_cas[col] = df_cas[col].map(ast.literal_eval)

df_cas22["peaks"] = df_cas22["peaks"].map(ast.literal_eval)

In [8]:
%%capture
from fiora.MOL.collision_energy import NCE_to_eV

#
# CASMI 16
#
# Prepare the CASMI-16 table: build Metabolite objects, attach metadata and match fragments to peaks.

# Retention time converted from seconds to minutes.
df_cas["RETENTIONTIME"] = df_cas["RTINSECONDS"] / 60.0
df_cas["Metabolite"] = df_cas["SMILES"].apply(Metabolite)
df_cas["Metabolite"].apply(lambda x: x.create_molecular_structure_graph())

df_cas["Metabolite"].apply(lambda x: x.compute_graph_attributes(node_encoder, bond_encoder))
# Placeholder collision energy; test_cas16 later recomputes CE for each of the three steps.
df_cas["CE"] = 20.0 # actually stepped 20/35/50
df_cas["Instrument_type"] = "HCD" # CHECK if correct Orbitrap

# Internal metadata field name -> CASMI-16 column name.
metadata_key_map16 = {"collision_energy":  "CE", 
                 "instrument": "Instrument_type",
                 "precursor_mz": "PRECURSOR_MZ",
                 'precursor_mode': "Precursor_type",
                 "retention_time": "RETENTIONTIME"
                 }

# Per-row metadata dict in the layout expected by Metabolite.add_metadata.
df_cas["summary"] = df_cas.apply(lambda x: {key: x[name] for key, name in metadata_key_map16.items()}, axis=1)
# NOTE(review): no rt_encoder is passed here, unlike the training data and CASMI 22 calls -- confirm this is intended.
df_cas.apply(lambda x: x["Metabolite"].add_metadata(x["summary"], setup_encoder), axis=1)

# Fragmentation
df_cas["Metabolite"].apply(lambda x: x.fragment_MOL(depth=1))
# Fixed tolerance of 100 * PPM for every spectrum (the training data uses a per-row tolerance instead).
df_cas.apply(lambda x: x["Metabolite"].match_fragments_to_peaks(x["peaks"]["mz"], x["peaks"]["intensity"], tolerance=100 * PPM), axis=1) # Optional: use mz_cut instead

#
# CASMI 22
#

df_cas22["Metabolite"] = df_cas22["SMILES"].apply(Metabolite)
df_cas22["Metabolite"].apply(lambda x: x.create_molecular_structure_graph())

df_cas22["Metabolite"].apply(lambda x: x.compute_graph_attributes(node_encoder, bond_encoder))
# Collision energy derived per row from the NCE and precursor m/z columns via NCE_to_eV.
df_cas22["CE"] = df_cas22.apply(lambda x: NCE_to_eV(x["NCE"], x["precursor_mz"]), axis=1)

# Internal metadata field name -> CASMI-22 column name.
metadata_key_map22 = {"collision_energy":  "CE", 
                 "instrument": "Instrument_type",
                 "precursor_mz": "precursor_mz",
                 'precursor_mode': "Precursor_type",
                 "retention_time": "ChallengeRT"
                 }

df_cas22["summary"] = df_cas22.apply(lambda x: {key: x[name] for key, name in metadata_key_map22.items()}, axis=1)
df_cas22.apply(lambda x: x["Metabolite"].add_metadata(x["summary"], setup_encoder, rt_encoder), axis=1)

# Fragmentation
df_cas22["Metabolite"].apply(lambda x: x.fragment_MOL(depth=1))
df_cas22.apply(lambda x: x["Metabolite"].match_fragments_to_peaks(x["peaks"]["mz"], x["peaks"]["intensity"], tolerance=100 * PPM), axis=1) # Optional: use mz_cut instead

# Move the index back into a regular column and renumber rows from 0.
df_cas22 = df_cas22.reset_index()

## Model Training and Testing

In [9]:
from fiora.GNN.Trainer import Trainer
import torch_geometric as geom

# GPU index this notebook was run on (see the recorded output below).
preferred_gpu_index = 3

if torch.cuda.is_available():
    # "cuda:3" is not a valid device on hosts with fewer than four GPUs;
    # fall back to the first GPU instead of failing later when tensors are moved.
    gpu_index = preferred_gpu_index if preferred_gpu_index < torch.cuda.device_count() else 0
    dev = f"cuda:{gpu_index}"
else:
    dev = "cpu"

print(f"Running on device: {dev}")




Running on device: cuda:3


##### Load Data

In [10]:
# Number of distinct compound groups in each data split.
groups_per_split = df.groupby("dataset")["group_id"].unique().apply(len)
print(groups_per_split)

# Held-out test rows versus everything used during training (train + validation).
dataset_label = df["dataset"]
df_test = df[dataset_label == "test"]
df_train = df[dataset_label.isin(["train", "validation"])]

dataset
test           1296
train         10366
validation     1296
Name: group_id, dtype: int64


In [11]:
# Convert every training/validation metabolite to its geometric-data form and move it to `dev`.
geo_series = df_train["Metabolite"].apply(lambda metabolite: metabolite.as_geometric_data().to(dev))
geo_data = geo_series.values
print(f"Prepared training/validation with {len(geo_data)} data points")

Prepared training/validation with 88737 data points


##### Prepare Model

In [12]:
# The static feature widths are read off the first prepared graph.
first_graph = geo_data[0]

# Architecture settings handed to GNNCompiler; individual entries are overridden by the grid search.
model_params = dict(
    param_tag='default',
    gnn_type='RGCNConv',
    depth=5,
    hidden_dimension=450,
    dense_layers=2,
    embedding_aggregation='concat',
    embedding_dimension=450,
    input_dropout=0.2,
    latent_dropout=0.1,
    node_feature_layout=node_encoder.feature_numbers,
    edge_feature_layout=bond_encoder.feature_numbers,
    static_feature_dimension=first_graph["static_edge_features"].shape[1],
    static_rt_feature_dimension=first_graph["static_rt_features"].shape[1],
    output_dimension=len(DEFAULT_MODES) * 2,  # per edge
)

# Optimisation settings; a test run trains for 10 epochs instead of 200.
training_params = dict(
    epochs=10 if test_run else 200,
    batch_size=256,  # 128,
    # train_val_split=0.90,
    learning_rate=0.0004,  # 0.001,
    with_RT=True,
    with_CCS=True,
)

In [13]:

# Settings shared by every grid point (mainly spelled out for clarity).
fixed_params = {"gnn_type": "RGCNConv"}
# One configuration per GNN depth from 0 through 8.
grid_params = [{'depth': depth} for depth in range(9)]
for p in grid_params:
    p.update(fixed_params)
# Alternative grids:
#grid_params = [{'gnn_type': "GraphConv"}, {'gnn_type': "RGCNConv"}, {'gnn_type': "GAT"}, {'gnn_type': "TransformerConv"}]
#grid_params = [{'embedding_dimension': 300}, {'embedding_dimension': 400}, {'embedding_dimension': 500}]

##### Begin Training

In [14]:
from fiora.GNN.GNNModules import GNNCompiler
from fiora.MS.SimulationFramework import SimulationFramework
# Shared simulation framework; no model is bound at construction (first argument is None).
fiora = SimulationFramework(
    None,
    dev=dev,
    with_RT=training_params["with_RT"],
    with_CCS=training_params["with_CCS"],
)
# Do not report NumPy "invalid value" floating-point warnings.
np.seterr(invalid='ignore')
# Passed to Trainer.train as val_every_n_epochs.
val_interval = 200


def train_new_model():
    """Build a fresh ``GNNCompiler`` from ``model_params`` and train it.

    Reads the notebook-level ``model_params``, ``training_params``, ``geo_data``,
    ``df_train``, ``seed``, ``dev`` and ``val_interval``. Train and validation
    groups are taken from the pre-assigned ``dataset`` column.

    Returns:
        The trained model.
    """
    network = GNNCompiler(model_params).to(dev)

    def group_ids(split_name):
        # Unique group ids of the rows assigned to the given split.
        return df_train[df_train["dataset"] == split_name]["group_id"].unique()

    trainer = Trainer(
        geo_data,
        y_tag='compiled_probsALL',
        problem_type="regression",
        train_keys=group_ids("train"),
        val_keys=group_ids("validation"),
        split_by_group=True,
        seed=seed,
        device=dev,
    )
    adam = torch.optim.Adam(network.parameters(), lr=training_params["learning_rate"])
    lr_decay = torch.optim.lr_scheduler.ExponentialLR(adam, gamma=0.98)
    mse_loss = torch.nn.MSELoss()

    # Masked validation is switched off here (optional: mask_name="compiled_validation_maskALL").
    trainer.train(
        network,
        adam,
        mse_loss,
        scheduler=lr_decay,
        batch_size=training_params['batch_size'],
        epochs=training_params["epochs"],
        val_every_n_epochs=val_interval,
        with_RT=training_params["with_RT"],
        masked_validation=False,
    )

    return network



def simulate_all(model, DF):
    """Run the notebook-level ``fiora`` framework on ``DF`` with ``model`` and return its result."""
    simulated = fiora.simulate_all(DF, model)
    return simulated

    
def test_model(model, DF):
    """Simulate all rows of ``DF`` with ``model`` and return the ``spectral_sqrt_cosine`` values."""
    return simulate_all(model, DF)["spectral_sqrt_cosine"].values

##### Test CASMI 16 and 22

In [15]:
from fiora.MOL.collision_energy import NCE_to_eV
from fiora.MS.spectral_scores import spectral_cosine, spectral_reflection_cosine, reweighted_dot
from fiora.MS.ms_utility import merge_annotated_spectrum

def test_cas16(model, df_cas=df_cas):
    """Score ``model`` on CASMI-16 using the three stepped collision energies.

    For each NCE step (20, 35, 50) the collision energy is recomputed with
    ``NCE_to_eV``, the metabolite metadata is refreshed, and spectra are
    simulated with a step-specific column suffix. The three simulated spectra
    are then merged and compared against the measured peaks.

    Args:
        model: Trained model handed to ``fiora.simulate_all``.
        df_cas: CASMI-16 table, or a row subset of it. The frame is copied on
            entry, so the caller's table (including boolean-filtered slices)
            is never written to. The ``Metabolite`` objects are shared with
            the copy and still receive the refreshed metadata.

    Returns:
        The values of the ``merged_sqrt_cosine`` column.
    """
    # Private copy: avoids mutating the global table and avoids assigning
    # columns onto a filtered slice passed in by the caller.
    df_cas = df_cas.copy()

    # The spectra were acquired with stepped NCE 20/35/50; simulate each step separately.
    for step, nce in enumerate((20.0, 35.0, 50.0), start=1):
        df_cas["NCE"] = nce
        df_cas["CE"] = df_cas[["NCE", "PRECURSOR_MZ"]].apply(lambda x: NCE_to_eV(x["NCE"], x["PRECURSOR_MZ"]), axis=1)
        df_cas[f"step{step}_CE"] = df_cas["CE"]
        df_cas["summary"] = df_cas.apply(lambda x: {key: x[name] for key, name in metadata_key_map16.items()}, axis=1)
        df_cas.apply(lambda x: x["Metabolite"].add_metadata(x["summary"], setup_encoder, rt_encoder), axis=1)
        # Suffixes are "_20", "_35" and "_50".
        df_cas = fiora.simulate_all(df_cas, model, suffix=f"_{int(nce)}")

    df_cas["avg_CE"] = (df_cas["step1_CE"] + df_cas["step2_CE"] + df_cas["step3_CE"]) / 3

    # Merge the three simulated spectra and score the merged spectrum against the measured one.
    df_cas["merged_peaks"] = df_cas.apply(lambda x: merge_annotated_spectrum(merge_annotated_spectrum(x["sim_peaks_20"], x["sim_peaks_35"]), x["sim_peaks_50"]), axis=1)
    df_cas["merged_cosine"] = df_cas.apply(lambda x: spectral_cosine(x["peaks"], x["merged_peaks"]), axis=1)
    df_cas["merged_sqrt_cosine"] = df_cas.apply(lambda x: spectral_cosine(x["peaks"], x["merged_peaks"], transform=np.sqrt), axis=1)
    df_cas["merged_refl_cosine"] = df_cas.apply(lambda x: spectral_reflection_cosine(x["peaks"], x["merged_peaks"], transform=np.sqrt), axis=1)
    df_cas["merged_steins"] = df_cas.apply(lambda x: reweighted_dot(x["peaks"], x["merged_peaks"]), axis=1)
    df_cas["spectral_sqrt_cosine"] = df_cas["merged_sqrt_cosine"] # just remember it is merged

    # Bookkeeping columns on the local frame; RT/CCS predictions are taken from the NCE 35 step.
    df_cas["coverage"] = df_cas["Metabolite"].apply(lambda x: x.match_stats["coverage"])
    df_cas["RT_pred"] = df_cas["RT_pred_35"]
    df_cas["RT_dif"] = df_cas["RT_dif_35"]
    df_cas["CCS_pred"] = df_cas["CCS_pred_35"]
    df_cas["library"] = "CASMI-16"

    return df_cas["merged_sqrt_cosine"].values

In [16]:
def _by_precursor(frame, precursor_type):
    """Rows of ``frame`` whose ``Precursor_type`` equals ``precursor_type``."""
    return frame[frame["Precursor_type"] == precursor_type]


# One record per grid point: the hyper-parameters, the trained model and its per-spectrum scores.
results = []
for params in grid_params:
    print(f"Testing {params}")
    model_params.update(params)
    current_model = train_new_model()

    # Internal splits.
    val_results = test_model(current_model, df_train[df_train["dataset"] == "validation"])
    test_results = test_model(current_model, df_test)

    # CASMI 16: all spectra, then positive and negative precursors separately.
    casmi16_results = test_cas16(current_model)
    casmi16_p = test_cas16(current_model, _by_precursor(df_cas, "[M+H]+"))
    casmi16_n = test_cas16(current_model, _by_precursor(df_cas, "[M-H]-"))

    # CASMI 22: same breakdown.
    casmi22_results = test_model(current_model, df_cas22)
    casmi22_p = test_model(current_model, _by_precursor(df_cas22, "[M+H]+"))
    casmi22_n = test_model(current_model, _by_precursor(df_cas22, "[M-H]-"))

    record = dict(params)
    record["model"] = copy.deepcopy(current_model)
    record.update({
        "validation": val_results,
        "test": test_results,
        "casmi16": casmi16_results,
        "casmi22": casmi22_results,
        "casmi16+": casmi16_p,
        "casmi16-": casmi16_n,
        "casmi22+": casmi22_p,
        "casmi22-": casmi22_n,
    })
    results.append(record)
    

Testing {'depth': 0, 'gnn_type': 'RGCNConv'}
Using pre-arranged train/validation set
	Validation RMSE: 0.03590353
Finished Training!


  df = pd.concat([df, pd.DataFrame(columns=[x + suffix for x in ["cosine_similarity", "kl_div", "sim_peaks", "spectral_cosine", "spectral_sqrt_cosine", "spectral_sqrt_cosine_wo_prec", "spectral_refl_cosine", "spectral_bias", "spectral_sqrt_bias", "spectral_sqrt_bias_wo_prec", "spectral_refl_bias", "steins_cosine", "steins_bias", "RT_pred", "RT_dif", "CCS_pred"]])])
  vec_other = vec_other / np.linalg.norm(vec_other)
  vec = vec / np.linalg.norm(vec)
  df = pd.concat([df, pd.DataFrame(columns=[x + suffix for x in ["cosine_similarity", "kl_div", "sim_peaks", "spectral_cosine", "spectral_sqrt_cosine", "spectral_sqrt_cosine_wo_prec", "spectral_refl_cosine", "spectral_bias", "spectral_sqrt_bias", "spectral_sqrt_bias_wo_prec", "spectral_refl_bias", "steins_cosine", "steins_bias", "RT_pred", "RT_dif", "CCS_pred"]])])
  vec_other = vec_other / np.linalg.norm(vec_other)
  vec = vec / np.linalg.norm(vec)
  vec = vec / np.linalg.norm(vec)
  vec_other = vec_other / np.linalg.norm(vec_other)
  vec 

Testing {'depth': 1, 'gnn_type': 'RGCNConv'}
Using pre-arranged train/validation set
	Validation RMSE: 0.03110272
Finished Training!


  df = pd.concat([df, pd.DataFrame(columns=[x + suffix for x in ["cosine_similarity", "kl_div", "sim_peaks", "spectral_cosine", "spectral_sqrt_cosine", "spectral_sqrt_cosine_wo_prec", "spectral_refl_cosine", "spectral_bias", "spectral_sqrt_bias", "spectral_sqrt_bias_wo_prec", "spectral_refl_bias", "steins_cosine", "steins_bias", "RT_pred", "RT_dif", "CCS_pred"]])])
  vec_other = vec_other / np.linalg.norm(vec_other)
  vec = vec / np.linalg.norm(vec)
  df = pd.concat([df, pd.DataFrame(columns=[x + suffix for x in ["cosine_similarity", "kl_div", "sim_peaks", "spectral_cosine", "spectral_sqrt_cosine", "spectral_sqrt_cosine_wo_prec", "spectral_refl_cosine", "spectral_bias", "spectral_sqrt_bias", "spectral_sqrt_bias_wo_prec", "spectral_refl_bias", "steins_cosine", "steins_bias", "RT_pred", "RT_dif", "CCS_pred"]])])
  vec_other = vec_other / np.linalg.norm(vec_other)
  vec = vec / np.linalg.norm(vec)
  vec_other = vec_other / np.linalg.norm(vec_other)
  vec = vec / np.linalg.norm(vec)
  vec_

Testing {'depth': 2, 'gnn_type': 'RGCNConv'}
Using pre-arranged train/validation set
	Validation RMSE: 0.02990245
Finished Training!


  df = pd.concat([df, pd.DataFrame(columns=[x + suffix for x in ["cosine_similarity", "kl_div", "sim_peaks", "spectral_cosine", "spectral_sqrt_cosine", "spectral_sqrt_cosine_wo_prec", "spectral_refl_cosine", "spectral_bias", "spectral_sqrt_bias", "spectral_sqrt_bias_wo_prec", "spectral_refl_bias", "steins_cosine", "steins_bias", "RT_pred", "RT_dif", "CCS_pred"]])])
  vec_other = vec_other / np.linalg.norm(vec_other)
  vec = vec / np.linalg.norm(vec)
  df = pd.concat([df, pd.DataFrame(columns=[x + suffix for x in ["cosine_similarity", "kl_div", "sim_peaks", "spectral_cosine", "spectral_sqrt_cosine", "spectral_sqrt_cosine_wo_prec", "spectral_refl_cosine", "spectral_bias", "spectral_sqrt_bias", "spectral_sqrt_bias_wo_prec", "spectral_refl_bias", "steins_cosine", "steins_bias", "RT_pred", "RT_dif", "CCS_pred"]])])
  vec_other = vec_other / np.linalg.norm(vec_other)
  vec = vec / np.linalg.norm(vec)
  vec_other = vec_other / np.linalg.norm(vec_other)
  vec = vec / np.linalg.norm(vec)
  vec_

Testing {'depth': 3, 'gnn_type': 'RGCNConv'}
Using pre-arranged train/validation set
	Validation RMSE: 0.02980240
Finished Training!


  df = pd.concat([df, pd.DataFrame(columns=[x + suffix for x in ["cosine_similarity", "kl_div", "sim_peaks", "spectral_cosine", "spectral_sqrt_cosine", "spectral_sqrt_cosine_wo_prec", "spectral_refl_cosine", "spectral_bias", "spectral_sqrt_bias", "spectral_sqrt_bias_wo_prec", "spectral_refl_bias", "steins_cosine", "steins_bias", "RT_pred", "RT_dif", "CCS_pred"]])])
  vec_other = vec_other / np.linalg.norm(vec_other)
  vec = vec / np.linalg.norm(vec)
  df = pd.concat([df, pd.DataFrame(columns=[x + suffix for x in ["cosine_similarity", "kl_div", "sim_peaks", "spectral_cosine", "spectral_sqrt_cosine", "spectral_sqrt_cosine_wo_prec", "spectral_refl_cosine", "spectral_bias", "spectral_sqrt_bias", "spectral_sqrt_bias_wo_prec", "spectral_refl_bias", "steins_cosine", "steins_bias", "RT_pred", "RT_dif", "CCS_pred"]])])
  vec_other = vec_other / np.linalg.norm(vec_other)
  vec = vec / np.linalg.norm(vec)
  vec_other = vec_other / np.linalg.norm(vec_other)
  vec = vec / np.linalg.norm(vec)
  vec_

Testing {'depth': 4, 'gnn_type': 'RGCNConv'}
Using pre-arranged train/validation set
	Validation RMSE: 0.03000244
Finished Training!


  df = pd.concat([df, pd.DataFrame(columns=[x + suffix for x in ["cosine_similarity", "kl_div", "sim_peaks", "spectral_cosine", "spectral_sqrt_cosine", "spectral_sqrt_cosine_wo_prec", "spectral_refl_cosine", "spectral_bias", "spectral_sqrt_bias", "spectral_sqrt_bias_wo_prec", "spectral_refl_bias", "steins_cosine", "steins_bias", "RT_pred", "RT_dif", "CCS_pred"]])])
  vec_other = vec_other / np.linalg.norm(vec_other)
  vec = vec / np.linalg.norm(vec)
  df = pd.concat([df, pd.DataFrame(columns=[x + suffix for x in ["cosine_similarity", "kl_div", "sim_peaks", "spectral_cosine", "spectral_sqrt_cosine", "spectral_sqrt_cosine_wo_prec", "spectral_refl_cosine", "spectral_bias", "spectral_sqrt_bias", "spectral_sqrt_bias_wo_prec", "spectral_refl_bias", "steins_cosine", "steins_bias", "RT_pred", "RT_dif", "CCS_pred"]])])
  vec_other = vec_other / np.linalg.norm(vec_other)
  vec = vec / np.linalg.norm(vec)
  vec_other = vec_other / np.linalg.norm(vec_other)
  vec = vec / np.linalg.norm(vec)
  vec_

Testing {'depth': 5, 'gnn_type': 'RGCNConv'}
Using pre-arranged train/validation set
Epoch 184/200:  RMSE: 0.0249

In [None]:
LOG = pd.DataFrame(results)
# Score columns are everything recorded after the stored model. Locating them by
# name instead of a fixed offset keeps this correct when a grid varies a number
# of hyper-parameters other than two.
eval_columns = LOG.columns[LOG.columns.get_loc("model") + 1:]

home_path = f"{home}/data/metabolites/benchmarking/"
NAME = model_params["gnn_type"] + "_depth_450d.csv"
for col in eval_columns:
    # .tolist() yields plain Python floats, so the text is always of the form
    # "[0.5, nan, ...]". str(list(ndarray)) would embed NumPy scalar reprs such as
    # "np.float64(0.5)" on NumPy >= 2, which ast.literal_eval cannot read back.
    LOG[col] = LOG[col].apply(lambda x: str(np.asarray(x, dtype=float).tolist()))
LOG.to_csv(home_path + NAME, index=False, sep="\t")

In [None]:
# Read the saved benchmark table back and turn the stringified score lists into
# Python lists again; "nan" entries are read as None.
LOGIC = pd.read_csv(home_path + NAME, sep="\t")
for col in eval_columns:
    LOGIC[col] = LOGIC[col].map(lambda text: ast.literal_eval(text.replace('nan', 'None')))
#LOGIC[eval_columns].apply(lambda x: x.apply(np.median))

In [None]:
def _mean_score(scores):
    """Mean of a list of scores, skipping missing entries.

    The reload step represents undefined scores as ``None``; ``np.mean`` raises
    ``TypeError`` on such lists. Casting to float turns ``None`` into ``nan``,
    which ``np.nanmean`` then leaves out of the average.

    Args:
        scores: Sequence of floats and/or ``None`` (a scalar is also accepted).

    Returns:
        The mean over the defined scores; ``nan`` if none are defined.
    """
    return np.nanmean(np.asarray(scores, dtype=float))


# Collapse each per-spectrum score list to a single mean per grid point.
LOGIC[eval_columns] = LOGIC[eval_columns].apply(lambda x: x.apply(_mean_score))
LOGIC


Unnamed: 0,depth,gnn_type,model,validation,test,casmi16,casmi22,casmi16+,casmi16-,casmi22+,casmi22-
0,0,CGConv,GNNCompiler(\n (node_embedding): FeatureEmbed...,0.581474,0.57161,0.569261,0.259477,0.543177,0.610159,0.252999,0.269566
1,1,CGConv,GNNCompiler(\n (node_embedding): FeatureEmbed...,0.67128,0.657462,0.642165,0.305777,0.613824,0.6866,0.302795,0.310421
2,2,CGConv,GNNCompiler(\n (node_embedding): FeatureEmbed...,0.681205,0.666663,0.655706,0.306965,0.62781,0.699444,0.305578,0.309124
3,3,CGConv,GNNCompiler(\n (node_embedding): FeatureEmbed...,0.680244,0.666641,0.651277,0.311274,0.620167,0.700054,0.303985,0.322627
4,4,CGConv,GNNCompiler(\n (node_embedding): FeatureEmbed...,0.671308,0.657519,0.647004,0.308702,0.616216,0.695276,0.30421,0.315698
5,5,CGConv,GNNCompiler(\n (node_embedding): FeatureEmbed...,0.670654,0.653653,0.648508,0.304933,0.61581,0.699775,0.301935,0.309602
6,6,CGConv,GNNCompiler(\n (node_embedding): FeatureEmbed...,0.666236,0.649947,0.6363,0.30498,0.610344,0.676996,0.30094,0.311273
7,7,CGConv,GNNCompiler(\n (node_embedding): FeatureEmbed...,0.388601,0.38524,0.509884,0.123084,0.468257,0.575151,0.137124,0.101218
8,8,CGConv,GNNCompiler(\n (node_embedding): FeatureEmbed...,0.441292,0.431561,0.381509,0.254741,0.376705,0.389042,0.239728,0.278123
