## 0. Imports

In [1]:
import os
import pandas as pd
import numpy as np
import torch
import glob

from sqlalchemy.sql.functions import rank

from uqdd import DEVICE, FIGS_DIR, MODELS_DIR
# from uqdd.utils import get_config
from uqdd.utils import load_df, create_logger
from uqdd.models.utils_models import load_model, get_model_config
from uqdd.models.baseline import BaselineDNN
from uqdd.models.ensemble import EnsembleDNN
from uqdd.models.mcdropout import mc_predict
from uqdd.models.evidential import EvidentialDNN, ev_predict
from uqdd.models.eoe import EoEDNN
from uqdd.models.emc import emc_predict

# METRICS
from uqdd.models.utils_train import predict, evaluate_predictions, recalibrate_model, get_dataloader
from uqdd.models.utils_metrics import MetricsTable, process_preds, create_df_preds

# # Importing models and their predict functions
# from uqdd.models.utils_train import predict # for ensemble
import matplotlib.pyplot as plt
import ast
# Turn off interactive mode
plt.ioff()

Device: cpu


<contextlib.ExitStack at 0x7f30e1554e50>

In [10]:
# Run selection used by the cells right below, which merge two 'kx' run exports.
# NOTE: section 1 re-assigns most of these names (there with activity_type = 'xc50'),
# so their values depend on which configuration cell was executed last.
data_name = 'papyrus'
activity_type = 'kx'
# activity_type = 'xc50'
type_n_targets = 'all'
project_name = '2024-11-20-kx-all'
project_out_name = f'reassess-{project_name}'
# Relative sub-path shared by the predictions / saved_models / figures trees below.
data_specific_path = f'{data_name}/{activity_type}/{type_n_targets}'
descriptor_protein='ankh-large'
descriptor_chemical='ecfp2048'
# Presumably the feature widths of the two descriptors above -- TODO confirm.
prot_input_dim=1536
chem_input_dim=2048

# PATHS
# NOTE(review): absolute, user-specific locations; the notebook will not run on
# another machine without editing them.
preds_dirpath = f'/users/home/bkhalil/Repos/uqdd/uqdd/data/predictions/{data_specific_path}/'
models_dir = f'/users/home/bkhalil/Repos/uqdd/uqdd/models/saved_models/{data_specific_path}/'
# The two run exports that get concatenated in the next cell.
runs1_path = f'/users/home/bkhalil/Repos/uqdd/uqdd/figures/{data_specific_path}/kx-other.csv'
runs2_path = f'/users/home/bkhalil/Repos/uqdd/uqdd/figures/{data_specific_path}/evidential-kx-before-hpo.csv'
# Destination CSV for the concatenated run table.
run_path= f'/users/home/bkhalil/Repos/uqdd/uqdd/figures/{data_specific_path}/kx-all.csv'

In [6]:

# Function to convert string representation of list to actual list of integers
def convert_to_list(val):
    """Parse a cell holding a Python literal (e.g. ``"[512, 256]"``) into its value.

    Intended as a per-column ``converters`` callback when loading the runs CSV.

    Args:
        val: Raw cell value. Non-string values are passed through untouched.

    Returns:
        The evaluated literal when ``val`` is a string containing a valid Python
        literal; otherwise ``val`` unchanged.
    """
    if not isinstance(val, str):
        return val
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError):
        # Blank cells reach a pandas converter as '' and free text is not a
        # literal either; keep the raw value instead of aborting the whole load.
        return val

# Both exports store their layer specifications as stringified lists, so the
# same per-column converter mapping applies to each of them.
layer_converters = {
    column: convert_to_list
    for column in ('chem_layers', 'prot_layers', 'regressor_layers')
}
runs1_df, runs2_df = (
    load_df(csv_path, converters=dict(layer_converters))
    for csv_path in (runs1_path, runs2_path)
)

# Stack the two run tables under a fresh 0..n-1 index; the merged frame is
# written to disk in a later cell.
runs_df = pd.concat([runs1_df, runs2_df], ignore_index=True)

In [7]:
runs_df.shape

(120, 85)

In [8]:
runs_df.head()

Unnamed: 0,Name,Created,Runtime,Hostname,Notes,State,Tags,ID,User,Group,...,val/loss,val/r2,val/rmse,test/epis_mean,test/epis_var,train/epis_mean,train/epis_var,val/epis_mean,val/epis_var,sweep
0,wobbly-cloud-15,2024-07-22T20:01:21.000Z,8630,wrhel01x5834061,-,finished,"20240722, ST, ankh-large, ecfp2048, ev_lamb=No...",xoqamyq8,,,...,-0.070576,0.755099,0.637134,,,,,,,
1,vague-salad-103,2024-07-30T22:08:22.000Z,297541,wrhel01x5834056,-,finished,"20240731, ST, ankh-large, ecfp2048, ensemble, ...",teijx5mc,,,...,0.458976,0.360001,0.994493,,,,,,,
2,royal-rain-29,2024-07-22T20:11:51.000Z,7335,wrhel01x5834056,-,finished,"20240722, ST, ankh-large, ecfp2048, ev_lamb=No...",o9d3ilk1,,,...,0.451906,0.36599,0.989829,,,,,,,
3,ancient-wildflower-97,2024-08-13T13:16:25.000Z,956080,wrhel01x5834061,-,failed,"20240813, ST, ankh-large, ecfp2048, ensemble, ...",m8kgrazw,,,...,0.839873,-0.023162,1.296771,,,,,,,
4,leafy-cherry-13,2024-07-22T20:00:56.000Z,13082,wrhel01x5834061,-,finished,"20240722, ST, ankh-large, ecfp2048, ev_lamb=No...",hodvavjw,,,...,-0.027205,0.742273,0.653605,,,,,,,


In [11]:
# save
runs_df.to_csv(run_path, index=False)

## 1. Predefined variables and Paths

In [4]:
# Dataset / project selection for the reassessment run.
# NOTE: these names are also assigned in the cell under "0. Imports" (there with
# activity_type = 'kx'); whichever cell ran last wins.
data_name = 'papyrus'
# activity_type = 'kx'
activity_type = 'xc50'
type_n_targets = 'all'
# project_name = '2024-06-25-all-models-100'
project_name = 'evidential-models'
# project_name = '2024-07-22-all-models-100-kx'
# project_out_name = f'reassess-{project_name}-evidential-eps-test'
project_out_name = f'reassess-{project_name}'
# Relative sub-path shared by the predictions / saved_models / figures trees below.
data_specific_path = f'{data_name}/{activity_type}/{type_n_targets}'
# DESCRIPTORS
descriptor_protein='ankh-large'
descriptor_chemical='ecfp2048'
# Presumably the feature widths of the two descriptors above -- TODO confirm.
prot_input_dim=1536
chem_input_dim=2048

# PATHS
# NOTE(review): absolute, user-specific locations; the notebook will not run on
# another machine without editing them.
preds_dirpath = f'/users/home/bkhalil/Repos/uqdd/uqdd/data/predictions/{data_specific_path}/'
models_dir = f'/users/home/bkhalil/Repos/uqdd/uqdd/models/saved_models/{data_specific_path}/'
# runs_path = f'/users/home/bkhalil/Repos/uqdd/uqdd/figures/{data_specific_path}/{project_name}/runs.csv'
runs_path = f'/users/home/bkhalil/Repos/uqdd/uqdd/figures/{data_specific_path}/evidential-runs.csv'

# FIGS OUT PATH
# Ends with '/' on purpose: later cells build the metrics CSV path by plain
# string concatenation onto this value.
figs_out_path = f'/users/home/bkhalil/Repos/uqdd/uqdd/figures/{data_specific_path}/{project_out_name}/'

# check if the runs_path exists
# Fail early, before any output directory is created.
if not os.path.exists(runs_path):
    raise FileNotFoundError(f"File {runs_path} not found")

# create figs_out_path if it does not exist
os.makedirs(figs_out_path, exist_ok=True)

# Notebook-wide logger; pkl_preds_export reads it as a global, and it is passed
# explicitly to reassess_metrics.
logger = create_logger(
    name="reassess",
    file_level="debug",
    stream_level="debug",
)

2024-09-16 18:27:12:DEBUG:reassess:Logger reassess initialized:60857


In [5]:
os.makedirs(figs_out_path, exist_ok=True) 

## 2. Helper functions

In [10]:
from uqdd.models.utils_models import calculate_means


# Function to convert string representation of list to actual list of integers
def convert_to_list(val):
    """Parse a cell holding a Python literal (e.g. ``"[512, 256]"``) into its value.

    Intended as a per-column ``converters`` callback when loading the runs CSV.

    Args:
        val: Raw cell value. Non-string values are passed through untouched.

    Returns:
        The evaluated literal when ``val`` is a string containing a valid Python
        literal; otherwise ``val`` unchanged.
    """
    if not isinstance(val, str):
        return val
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError):
        # Blank cells reach a pandas converter as '' and free text is not a
        # literal either; keep the raw value instead of aborting the whole load.
        return val

# RUNS preprocessing
def preprocess_runs(
        runs_path,
        models_dir=MODELS_DIR,
        data_name='papyrus',
        activity_type='xc50',
        descriptor_protein='ankh-large',
        descriptor_chemical='ecfp2048',
        data_specific_path='papyrus/xc50/all',
        prot_input_dim=1536,
        chem_input_dim=2048,
):
    """Load the runs table and attach the saved checkpoint that belongs to each run.

    Steps:
      1. Load ``runs_path`` with ``load_df``, parsing the three ``*_layers``
         columns through ``convert_to_list``, and rename ``Name`` -> ``run_name``.
      2. Fill a missing ``model_name`` (absent column or NaN cell) with
         ``{data_name}_{activity_type}_{model_type}_{split_type}_{descriptor_protein}_{descriptor_chemical}_{run_name}``.
      3. Look for ``*{model_name}.pt`` inside ``models_dir``. On a hit,
         ``model_name`` becomes the file name without its extension and
         ``model_path`` the matched file; otherwise ``model_path`` is ``''`` and a
         numbered message is printed.
      4. Overwrite the dataset / descriptor columns with the given arguments.

    Args:
        runs_path: Path of the runs CSV.
        models_dir: Directory searched for ``.pt`` checkpoints.
        data_name, activity_type, descriptor_protein, descriptor_chemical,
        data_specific_path, prot_input_dim, chem_input_dim: Constants written to
            the identically named columns of every row.

    Returns:
        pandas.DataFrame: the loaded runs with ``model_name`` / ``model_path``
        resolved and the constant columns set.
    """
    # Load csv runs and model names file
    layer_columns = ('chem_layers', 'prot_layers', 'regressor_layers')
    runs_df = load_df(
        runs_path,
        converters={column: convert_to_list for column in layer_columns},
    )
    runs_df = runs_df.rename(columns={'Name': 'run_name'})

    # DEALING WITH MODEL_NAME IF DOESN'T EXIST
    # An export without the column at all is treated like one where every cell is empty.
    if 'model_name' not in runs_df.columns:
        runs_df['model_name'] = None

    def _default_model_name(row):
        # Naming scheme of the saved checkpoints, minus the leading date prefix.
        return (
            f"{data_name}_{activity_type}_{row['model_type']}_{row['split_type']}"
            f"_{descriptor_protein}_{descriptor_chemical}_{row['run_name']}"
        )

    # Built as a plain list so that an empty table is handled as well; the
    # default name is only computed for rows that actually need it.
    runs_df['model_name'] = [
        _default_model_name(row) if pd.isna(row['model_name']) else row['model_name']
        for _, row in runs_df.iterrows()
    ]

    # Update and match model_name with model saved files
    resolved_names = []
    resolved_paths = []
    missing_count = 0
    for model_name in runs_df['model_name']:
        # Escape the run-derived part so characters such as '[' are matched literally.
        model_file_pattern = os.path.join(models_dir, f"*{glob.escape(str(model_name))}.pt")
        # glob order is arbitrary; sort so the chosen file is stable across runs.
        model_files = sorted(glob.glob(model_file_pattern))
        if model_files:
            model_file_path = model_files[0]
            # Drop only the extension (str.replace would also hit '.pt' inside the name).
            resolved_names.append(os.path.splitext(os.path.basename(model_file_path))[0])
            resolved_paths.append(model_file_path)
        else:
            missing_count += 1
            print(f"{missing_count} Model file(s) not found for {model_name} \n with pattern {model_file_pattern}")
            resolved_names.append(model_name)
            resolved_paths.append('')

    # Assign whole columns at once instead of growing 'model_path' cell by cell.
    runs_df['model_name'] = resolved_names
    runs_df['model_path'] = resolved_paths

    # Ensure rest of variables set correctly across the runs
    runs_df['data_name'] = data_name
    runs_df['activity_type'] = activity_type
    runs_df['descriptor_protein'] = descriptor_protein
    runs_df['descriptor_chemical'] = descriptor_chemical
    runs_df['chem_input_dim'] = chem_input_dim
    runs_df['prot_input_dim'] = prot_input_dim
    runs_df['data_specific_path'] = data_specific_path

    return runs_df

# Get model class and predict function

def get_model_class(model_type: str):
    """Return the network class used to rebuild a checkpoint of ``model_type``.

    The lookup is case-insensitive. 'baseline' and 'mcdropout' share
    ``BaselineDNN``; 'evidential' and 'emc' share ``EvidentialDNN``.

    Raises:
        ValueError: if ``model_type`` is not one of the known types.
    """
    kind = model_type.lower()
    class_by_kind = {
        'baseline': BaselineDNN,
        'mcdropout': BaselineDNN,
        'ensemble': EnsembleDNN,
        'evidential': EvidentialDNN,
        'emc': EvidentialDNN,
        'eoe': EoEDNN,
    }
    if kind not in class_by_kind:
        raise ValueError(f"Model type {model_type} not recognized")
    return class_by_kind[kind]

def get_predict_fn(model_type: str, num_mc_samples=100):
    """Pick the prediction routine (and its extra kwargs) for ``model_type``.

    The lookup is case-insensitive.

    Args:
        model_type: One of 'mcdropout', 'ensemble', 'baseline', 'evidential',
            'eoe' or 'emc'.
        num_mc_samples: Forwarded as ``num_mc_samples`` for the two
            sampling-based types ('mcdropout' and 'emc'); unused otherwise.

    Returns:
        tuple: ``(predict_fn, predict_kwargs)``.

    Raises:
        ValueError: if ``model_type`` is not one of the known types.
    """
    kind = model_type.lower()

    # Sampling-based predictors need the number of stochastic passes.
    if kind == 'mcdropout':
        return mc_predict, {"num_mc_samples": num_mc_samples}
    if kind == 'emc':
        return emc_predict, {"num_mc_samples": num_mc_samples}

    # The remaining predictors take no extra keyword arguments.
    if kind in ('ensemble', 'baseline'):
        return predict, {}
    if kind in ('evidential', 'eoe'):
        return ev_predict, {}

    raise ValueError(f"Model type {model_type} not recognized")


def get_preds(model, dataloaders, model_type, subset='test', num_mc_samples=100):
    """Run the predictor matching ``model_type`` on one split and normalise its output.

    Args:
        model: Loaded model, handed to the predict function as-is.
        dataloaders: Mapping from split name to dataloader.
        model_type: Model family name; matched case-insensitively, consistent
            with ``get_predict_fn``.
        subset: Key of ``dataloaders`` to predict on.
        num_mc_samples: Forwarded to ``get_predict_fn``.

    Returns:
        tuple: ``(preds, labels, alea_vars, epi_vars)``. ``epi_vars`` is ``None``
        for model types whose predictor returns only three values. For 'eoe'
        and 'emc' the predictions and both variances are first reduced with
        ``calculate_means``.
    """
    # Normalise once: get_predict_fn lowercases internally, so a mixed-case name
    # used to select a 4-value predictor here and then fail on the 3-way unpack.
    kind = model_type.lower()

    predict_fn, predict_kwargs = get_predict_fn(model_type, num_mc_samples=num_mc_samples)
    preds_res = predict_fn(model, dataloaders[subset], device=DEVICE, **predict_kwargs)

    if kind in ('evidential', 'eoe', 'emc'):
        preds, labels, alea_vars, epi_vars = preds_res
    else:
        preds, labels, alea_vars = preds_res
        epi_vars = None

    if kind in ('eoe', 'emc'):
        preds, alea_vars, epi_vars = calculate_means(preds, alea_vars, epi_vars)
    return preds, labels, alea_vars, epi_vars


def pkl_preds_export(preds, labels, alea_vars, epi_vars, outpath):
    """Build the per-sample predictions frame and pickle it as ``preds.pkl``.

    Args:
        preds, labels, alea_vars, epi_vars: Forwarded to ``process_preds``
            (with ``None`` as its fifth argument).
        outpath: Existing directory that receives ``preds.pkl``.

    Returns:
        The frame produced by ``create_df_preds``.

    Note:
        Uses the notebook-level ``logger`` rather than a parameter.
    """
    processed = process_preds(preds, labels, alea_vars, epi_vars, None)
    true_values, predicted_values, third_output, errors, fifth_output = processed

    # NOTE(review): the fifth process_preds output is passed in the third slot
    # of create_df_preds and the third output in the fifth slot; the original
    # inline hints ('# y_std', '# y_alea') suggest this order was revisited --
    # confirm against the create_df_preds signature.
    preds_frame = create_df_preds(
        true_values,
        predicted_values,
        fifth_output,
        errors,
        third_output,
        export=False,
        logger=logger,
    )

    pickle_path = os.path.join(outpath, "preds.pkl")
    preds_frame.to_pickle(pickle_path)
    return preds_frame

def reassess_metrics(
        runs_df,
        figs_out_path,
        csv_out_path,
        project_out_name,
        logger,
):
    """Re-predict, re-evaluate and recalibrate every saved model listed in ``runs_df``.

    For each row that has a checkpoint path, the function:
      * creates ``<figs_out_path>/<model_name>/`` (rows whose directory already
        exists are reported and skipped),
      * rebuilds the config and the model, loads the dataloaders,
      * predicts on the 'test' split and pickles the predictions,
      * evaluates the predictions, recalibrates using the 'val' split,
      * appends the metrics via ``uct_logger.csv_log()``.

    Args:
        runs_df: Output of ``preprocess_runs``; needs at least ``model_path``,
            ``model_name``, ``model_type`` and ``activity_type``. Every other
            column is forwarded to ``get_model_config`` as a keyword argument.
        figs_out_path: Root directory for the per-model output folders.
        csv_out_path: Passed to ``evaluate_predictions`` as ``csv_path``.
        project_out_name: Passed to ``evaluate_predictions`` as ``project_name``.
        logger: Logger handed to the dataloader and evaluation helpers.

    Returns:
        None. Results are written to disk.
    """
    # Process the runs in a random order (unseeded, so it differs per call).
    runs_df = runs_df.sample(frac=1).reset_index(drop=True)

    for _, row in runs_df.iterrows():
        model_path = row["model_path"]
        model_name = row["model_name"]

        rowkwargs = row.to_dict()
        # These two are passed to get_model_config explicitly below.
        model_type = rowkwargs.pop("model_type")
        activity_type = rowkwargs.pop("activity_type")

        # Skip runs without a checkpoint. A bare truthiness test is not enough:
        # a missing value read back from CSV is NaN, which is truthy.
        if pd.isna(model_path) or not model_path:
            continue

        model_fig_out_path = os.path.join(figs_out_path, model_name)
        if os.path.exists(model_fig_out_path):
            print(f"Model {model_name} already reassessed")
            continue
        # NOTE(review): the folder is created before the work is done, so a run
        # that crashes below is reported as "already reassessed" next time
        # unless its folder is removed.
        os.makedirs(model_fig_out_path, exist_ok=True)

        config = get_model_config(model_type=model_type, activity_type=activity_type, **rowkwargs)
        num_mc_samples = config.get("num_mc_samples", 100)
        model_class = get_model_class(model_type)
        prefix = "models." if model_type == 'eoe' else ''
        model = load_model(model_class, model_path, prefix_to_state_keys=prefix, config=config).to(DEVICE)

        # Getting DataLoaders
        dataloaders = get_dataloader(config, device=DEVICE, logger=logger)

        # RePredict and Evaluate preds
        preds, labels, alea_vars, epi_vars = get_preds(
            model, dataloaders, model_type, subset="test", num_mc_samples=num_mc_samples
        )
        pkl_preds_export(preds, labels, alea_vars, epi_vars, model_fig_out_path)

        # Calculate the metrics
        # NOTE(review): the two variances are swapped on purpose here -- epi_vars
        # goes into the primary uncertainty slot and alea_vars into `epi_vars`.
        # get_preds returns epi_vars=None for baseline/ensemble/mcdropout, so
        # this presumably only makes sense for the evidential family -- confirm.
        _, _, uct_logger = evaluate_predictions(
            config,
            preds,
            labels,
            epi_vars,
            model_type,
            logger,
            epi_vars=alea_vars,
            wandb_push=False,
            run_name=config["run_name"],
            project_name=project_out_name,  # for the csv file
            figpath=model_fig_out_path,
            export_preds=False,
            verbose=False,
            csv_path=csv_out_path,
        )

        # Recalibrate model
        # Use the same number of MC samples as for the test split, so both
        # sides of the recalibration come from identically configured predictors.
        preds_val, labels_val, alea_vars_val, epi_vars_val = get_preds(
            model, dataloaders, model_type, subset="val", num_mc_samples=num_mc_samples
        )

        # Same variance swap as in evaluate_predictions above.
        recalibrate_model(
            preds_val,
            labels_val,
            epi_vars_val,
            preds,
            labels,
            epi_vars,
            config=config,
            epi_val=alea_vars_val,
            epi_test=alea_vars,
            uct_logger=uct_logger,
            figpath=model_fig_out_path,
        )

        # Log the metrics to the CSV file
        uct_logger.csv_log()

In [7]:
# Load runs_df and preprocess it
# Every argument comes from the configuration cell in section 1, so the
# function's own defaults are not relied upon here.
runs_df = preprocess_runs(
    runs_path,
    models_dir=models_dir,
    data_name=data_name,
    activity_type=activity_type,
    descriptor_protein=descriptor_protein,
    descriptor_chemical=descriptor_chemical,
    data_specific_path=data_specific_path,
    prot_input_dim=prot_input_dim,
    chem_input_dim=chem_input_dim
)

In [8]:
runs_df.head()

Unnamed: 0,run_name,Created,Runtime,Hostname,Notes,GPU Type,GPU Count,State,Tags,ID,...,train/rmse,val/alea_mean,val/alea_var,val/epis_mean,val/epis_var,val/evs,val/loss,val/r2,val/rmse,model_path
0,avid-tree-8,2024-09-12T09:43:30.000Z,2142,wrhel85x0203435,-,,,finished,"20240912, ST, ankh-large, ecfp2048, ev_lamb=0....",bzbjuvpu,...,0.56964,0.108124,0.001743,0.44611,0.013941,0.648331,1.036259,0.648193,0.7102,/users/home/bkhalil/Repos/uqdd/uqdd/models/sav...
1,balmy-cosmos-19,2024-09-12T10:21:21.000Z,971,wrhel85x0902954,-,"[NVIDIA GeForce RTX 3080, NVIDIA GeForce RTX 3...",3.0,finished,"20240912, ST, ankh-large, ecfp2048, ev_lamb=0....",46p9m0o2,...,0.852387,0.460061,0.006864,0.51552,0.003308,0.448427,1.319654,0.447717,0.916857,/users/home/bkhalil/Repos/uqdd/uqdd/models/sav...
2,clear-lake-26,2024-09-12T10:24:51.000Z,1974,wrhel85x0203435,-,,,finished,"20240912, ST, ankh-large, ecfp2048, ev_lamb=0....",v3654xvp,...,1.003982,0.686105,0.014317,0.569359,0.006179,0.065126,1.613954,0.038869,1.17378,/users/home/bkhalil/Repos/uqdd/uqdd/models/sav...
3,deft-morning-1,2024-09-12T09:43:25.000Z,959,wrhel85x0902954,-,"[NVIDIA GeForce RTX 3080, NVIDIA GeForce RTX 3...",3.0,finished,"20240912, ST, ankh-large, ecfp2048, ev_lamb=0....",qd90k881,...,0.561006,0.112018,0.00215,0.429717,0.012069,0.647882,1.050448,0.646402,0.712005,/users/home/bkhalil/Repos/uqdd/uqdd/models/sav...
4,distinctive-flower-21,2024-09-12T10:22:25.000Z,881,wrhel85x1a02514,-,"[NVIDIA GeForce RTX 3090, NVIDIA GeForce RTX 3...",8.0,finished,"20240912, ST, ankh-large, ecfp2048, ev_lamb=0....",rc7b5ixx,...,1.017255,0.785353,0.017982,0.51283,0.003159,0.11623,1.566719,0.102099,1.134513,/users/home/bkhalil/Repos/uqdd/uqdd/models/sav...


In [9]:
runs_df.shape

(30, 86)

In [8]:
# runs_df.shape
# runs_df.columns
# runs_df[runs_df['model_type'] == 'ensemble'].shape
print(runs_df[runs_df['split_type'] == 'time'].shape)
runs_df[runs_df['split_type'] == 'time'].head()


(50, 62)


Unnamed: 0,run_name,Commit,Created,Runtime,GitHub,Hostname,Notes,GPU Type,GPU Count,State,...,val/epis_var,val/evs,val/loss,val/r2,val/rmse,model_path,data_name,activity_type,descriptor_protein,descriptor_chemical
20,vivid-plant-104,1ec3f45a84840b8d4d6cececb684d04c700fad15,2024-08-13T13:39:01.000Z,86619,https://github.com/bolak92/uqdd/tree/1ec3f45a8...,wrhel01x5834056,-,Tesla K80,16,finished,...,,0.098768,0.626355,0.090651,1.141723,/users/home/bkhalil/Repos/uqdd/uqdd/models/sav...,papyrus,xc50,ankh-large,ecfp2048
21,floral-snowball-104,1ec3f45a84840b8d4d6cececb684d04c700fad15,2024-08-13T13:39:01.000Z,84364,https://github.com/bolak92/uqdd/tree/1ec3f45a8...,wrhel01x5834056,-,Tesla K80,16,finished,...,,0.103932,0.619658,0.094918,1.139041,/users/home/bkhalil/Repos/uqdd/uqdd/models/sav...,papyrus,xc50,ankh-large,ecfp2048
22,comic-dragon-106,1ec3f45a84840b8d4d6cececb684d04c700fad15,2024-08-13T13:39:08.000Z,87786,https://github.com/bolak92/uqdd/tree/1ec3f45a8...,wrhel01x5834056,-,Tesla K80,16,finished,...,,0.109212,0.619366,0.098738,1.136635,/users/home/bkhalil/Repos/uqdd/uqdd/models/sav...,papyrus,xc50,ankh-large,ecfp2048
23,ethereal-cherry-107,1ec3f45a84840b8d4d6cececb684d04c700fad15,2024-08-13T13:39:18.000Z,84545,https://github.com/bolak92/uqdd/tree/1ec3f45a8...,wrhel01x5834056,-,Tesla K80,16,finished,...,,0.117048,0.614751,0.103298,1.133755,/users/home/bkhalil/Repos/uqdd/uqdd/models/sav...,papyrus,xc50,ankh-large,ecfp2048
24,balmy-butterfly-108,1ec3f45a84840b8d4d6cececb684d04c700fad15,2024-08-13T13:39:35.000Z,87740,https://github.com/bolak92/uqdd/tree/1ec3f45a8...,wrhel01x5834056,-,Tesla K80,16,finished,...,,0.096367,0.631799,0.078204,1.14951,/users/home/bkhalil/Repos/uqdd/uqdd/models/sav...,papyrus,xc50,ankh-large,ecfp2048


In [9]:
runs_df.columns

Index(['run_name', 'Commit', 'Created', 'Runtime', 'GitHub', 'Hostname',
       'Notes', 'GPU Type', 'GPU Count', 'State', 'Tags', 'MT', 'batch_size',
       'chem_input_dim', 'ckpt_name', 'ckpt_path', 'data_specific_path',
       'dropout', 'ensemble_size', 'lamb', 'loss', 'model', 'model_name',
       'model_type', 'prot_input_dim', 'prot_layers', 'repeats', 'seed',
       'split_type', 'weight_decay', 'wt_resampler', 'model/gnorm',
       'model/pnorm', 'test/alea_mean', 'test/alea_var', 'test/epis_mean',
       'test/epis_var', 'test/evs', 'test/loss', 'test/r2', 'test/rmse',
       'train/alea_mean', 'train/alea_var', 'train/epis_mean',
       'train/epis_var', 'train/evs', 'train/loss', 'train/r2', 'train/rmse',
       'val/alea_mean', 'val/alea_var', 'val/epis_mean', 'val/epis_var',
       'val/evs', 'val/loss', 'val/r2', 'val/rmse', 'model_path', 'data_name',
       'activity_type', 'descriptor_protein', 'descriptor_chemical'],
      dtype='object')

In [10]:
# overwrite runs_df
# runs_df.to_csv(runs_path, index=False)

In [11]:
csv_out_path = figs_out_path + "evidential-metrics.csv"

In [None]:
reassess_metrics(runs_df, figs_out_path, csv_out_path, project_out_name, logger)

Evidential prediction: 100%|██████████| 56/56 [00:00<00:00, 97.62it/s] 


In [12]:
csv_out_path = figs_out_path + "metrics2.csv"
# reassess_metrics(runs_df, figs_out_path, csv_out_path, project_out_name, logger)

## 3. DEV trials

In [8]:
eoe = runs_df[runs_df['model_type'] == 'eoe']
one_eoe = eoe.iloc[0]

In [9]:
one_eoe

run_name                                          misunderstood-pond-138
Commit                          ea29aec992ca6d5ff06a4ff4c5eabcb194327bbf
Created                                         2024-08-20T09:01:51.000Z
Runtime                                                            82782
GitHub                 https://github.com/bolak92/uqdd/tree/ea29aec99...
                                             ...                        
model_path             /users/home/bkhalil/Repos/uqdd/uqdd/models/sav...
data_name                                                        papyrus
activity_type                                                       xc50
descriptor_protein                                            ankh-large
descriptor_chemical                                             ecfp2048
Name: 30, Length: 62, dtype: object

In [10]:
model_path = one_eoe['model_path']
model_name = one_eoe['model_name']

one_eoe_dict = one_eoe.to_dict()
model_type = one_eoe_dict.pop('model_type')
activity_type = one_eoe_dict.pop('activity_type')

print(model_path)

/users/home/bkhalil/Repos/uqdd/uqdd/models/saved_models/papyrus/xc50/all/20240820-papyrus_xc50_eoe_random_ankh-large_ecfp2048_misunderstood-pond-138.pt


In [11]:
model_fig_out_path = os.path.join(figs_out_path, model_name+'-eoe-test')
config = get_model_config(model_type=model_type, activity_type=activity_type, **one_eoe_dict)
model_class = get_model_class(model_type)
print(config)
prefix = "models." if model_type == 'eoe' else ''

model = load_model(model_class, model_path, prefix_to_state_keys=prefix, config=config).to(DEVICE)

{'model_type': 'eoe', 'ensemble_size': 10, 'lamb': 1.0, 'loss': 'evidential_regression', 'aleatoric': True, 'chem_layers': [512], 'prot_layers': [1024, 512, 256], 'regressor_layers': [64], 'dropout': 0.1, 'batch_size': 256, 'epochs': 3000, 'early_stop': 100, 'loss_reduction': 'mean', 'optimizer': 'adagrad', 'lr': 0.0005, 'weight_decay': 0.01, 'lr_scheduler': None, 'max_norm': 50.0, 'activity_type': 'xc50', 'run_name': 'misunderstood-pond-138', 'Commit': 'ea29aec992ca6d5ff06a4ff4c5eabcb194327bbf', 'Created': '2024-08-20T09:01:51.000Z', 'Runtime': 82782, 'GitHub': 'https://github.com/bolak92/uqdd/tree/ea29aec992ca6d5ff06a4ff4c5eabcb194327bbf', 'Hostname': 'wrhel01x5834056', 'Notes': '-', 'GPU Type': 'Tesla K80', 'GPU Count': 16, 'State': 'finished', 'Tags': '20240820, ST, ankh-large, ecfp2048, eoe, ev_lamb=1.0, max_norm=50.0, no_median_scaling, papyrus, random, regression, xc50', 'MT': nan, 'chem_input_dim': 2048, 'ckpt_name': nan, 'ckpt_path': nan, 'data_specific_path': 'papyrus/xc50/al

In [12]:
model

EoEDNN(
  (models): ModuleList(
    (0-9): 10 x EvidentialDNN(
      (chem_feature_extractor): Sequential(
        (0): Linear(in_features=2048, out_features=512, bias=True)
        (1): ReLU()
        (2): Dropout(p=0.1, inplace=False)
      )
      (prot_feature_extractor): Sequential(
        (0): Linear(in_features=1536, out_features=1024, bias=True)
        (1): ReLU()
        (2): Dropout(p=0.1, inplace=False)
        (3): Linear(in_features=1024, out_features=512, bias=True)
        (4): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (5): ReLU()
        (6): Dropout(p=0.1, inplace=False)
        (7): Linear(in_features=512, out_features=256, bias=True)
        (8): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (9): ReLU()
        (10): Dropout(p=0.1, inplace=False)
      )
      (regressor_or_classifier): Sequential(
        (0): Linear(in_features=768, out_features=64, bias=True)
        (1): R

In [23]:
# Getting DataLoaders
# Relies on `config`, `model` and `model_type` left behind by the preceding dev cells.
dataloaders = get_dataloader(config, device=DEVICE, logger=logger)

# RePredict and Evaluate preds
preds, labels, alea_vars, epi_vars = get_preds(model, dataloaders, model_type, subset='test')

Evidential prediction: 100%|██████████| 56/56 [00:00<00:00, 99.38it/s] 


In [24]:
(preds.shape, labels.shape, alea_vars.shape, epi_vars.shape)

(torch.Size([14088, 1]),
 torch.Size([14088, 1]),
 torch.Size([14088, 1]),
 torch.Size([14088, 1]))

In [25]:
y_true, y_pred, y_std, y_err, vars_ = process_preds(preds, labels.squeeze(-1), alea_vars, epi_vars)

In [None]:
# Reassessing metrics and recalibration of the pretrained models
# Scratch/debug variant of reassess_metrics: it handles only the row with index
# label 1 and stops after it. Differences from the function, as written here:
#   * 'activity_type' stays inside rowkwargs instead of being passed explicitly,
#   * load_model is called without prefix_to_state_keys,
#   * alea_vars is the primary uncertainty and epi_vars goes to the epi_* slots,
#   * no csv_path is given and the final csv_log() call is commented out.
for index, row in runs_df.iterrows():
    if index != 1: # Debugging
        continue
    model_path = row['model_path']
    model_name = row['model_name']
    # print(type(model_path))
    rowkwargs = row.to_dict()
    # popping the model_type (it is passed to get_model_config by name below)
    model_type = rowkwargs.pop('model_type')

    # NOTE(review): a NaN model_path is truthy and would pass this check.
    if model_path:
        model_fig_out_path = os.path.join(figs_out_path, model_name)
        config = get_model_config(model_type=model_type, **rowkwargs)
        model_class = get_model_class(model_type)
        model = load_model(model_class, model_path, config=config).to(DEVICE)
        
        # Getting DataLoaders
        dataloaders = get_dataloader(config, device=DEVICE, logger=logger)
        
        # RePredict and Evaluate preds
        preds, labels, alea_vars, epi_vars = get_preds(model, dataloaders, model_type, subset='test')
        # Return value is discarded; the call only checks that processing runs.
        process_preds(preds, labels, alea_vars, epi_vars)
        # predict_fn, predict_kwargs = get_predict_fn(model_type)
        # print(predict_fn)
        
        # preds_res = predict_fn(model, dataloaders["test"], device=DEVICE, **predict_kwargs)

        # Calculate the metrics
        metrics, plots, uct_logger = evaluate_predictions(
            config, 
            preds, 
            labels, 
            alea_vars, 
            model_type, 
            logger, 
            epi_vars=epi_vars,
            wandb_push=False,
            run_name=config['run_name'], 
            project_name=project_out_name, # for the csv file
            figpath=model_fig_out_path, 
            export_preds=False, 
            verbose=False
        )
        
        # Recalibrate model
        # Validation-split predictions feed the recalibration; num_mc_samples is
        # left at get_preds' default for both splits in this variant.
        preds_val, labels_val, alea_vars_val, epi_vars_val = get_preds(model, dataloaders, model_type, subset='val')
        # # Recalibration (Validation Set)
        # preds_val, labels_val, alea_vars_val = predict_fn(model, dataloaders["val"], device=DEVICE, **predict_kwargs)
        # 
        iso_recal_model = recalibrate_model(
            preds_val, 
            labels_val,
            alea_vars_val,
            preds, 
            labels, 
            alea_vars,
            config=config, 
            epi_val=epi_vars_val,
            epi_test=epi_vars,
            uct_logger=uct_logger,
            figpath=model_fig_out_path
        )
        
        # Log the metrics to the CSV file
        # uct_logger.csv_log()
        # Stop after the first processed row; uct_logger stays available to later cells.
        break
        

In [None]:
predict_kwargs

In [None]:
uct_logger.csv_log()

In [None]:
# Load csv runs and model names file
runs_df = load_df(runs_path, converters={'chem_layers': convert_to_list, 'prot_layers': convert_to_list, 'regressor_layers': convert_to_list})
runs_df.shape

In [None]:
# runs_df.columns, runs_df.shape, runs_df.dtypes

In [None]:
runs_df.head()

In [None]:
### Adding model_name
# Scratch version of the model_name fill in preprocess_runs, with the dataset
# and descriptor parts hard-coded and the run name read from the un-renamed
# 'Name' column.
# date-{data_name}_{activity_type}_{split_type}_{descriptor_protein}_{descriptor_chemical}_{data_specific_path}_{run_name}
# 20240625-papyrus_xc50_mcdropout_random_ankh-large_ecfp2048_stoic-firebrand-80
# Fill missing model_name with the required format
# Requires an existing 'model_name' column; only NaN cells are replaced.
runs_df['model_name'] = runs_df.apply(
    lambda row: f"papyrus_xc50_{row['model_type']}_{row['split_type']}_ankh-large_ecfp2048_{row['Name']}"
    if pd.isna(row['model_name']) else row['model_name'], axis=1)

In [None]:
runs_df.head()

In [None]:
# Update model_name
# Scratch version of the checkpoint lookup in preprocess_runs. It stores the
# match in 'model_file_path' (the function uses 'model_path') and leaves that
# cell unset when nothing matches.
for index, row in runs_df.iterrows():
    model_name = row['model_name']
    # Leading '*' lets the pattern match a prefix in front of the name
    # (saved files carry a date prefix, e.g. '20240820-...').
    model_file_pattern = os.path.join(models_dir, f"*{model_name}.pt")
    model_files = glob.glob(model_file_pattern)
    if model_files:
        # First match wins; glob returns matches in arbitrary order.
        model_file_path = model_files[0]
        model_name = os.path.basename(model_file_path).replace('.pt', '')
        # print(model_name)
        runs_df.at[index, 'model_name'] = model_name
        # add model_file_path to the runs_df
        runs_df.at[index, 'model_file_path'] = model_file_path
    else:
        # Show the pattern that produced no match (model_files is empty here).
        print(model_file_pattern)
        print(model_files)

In [None]:
runs_df.head(-5)

In [None]:
# Scratch version of the constant-column step of preprocess_runs, with the
# values hard-coded instead of taken from the configuration cell.
runs_df['data_name'] = 'papyrus'
runs_df['activity_type'] = 'xc50'
runs_df['descriptor_protein'] = 'ankh-large'
runs_df['descriptor_chemical'] = 'ecfp2048'
runs_df['chem_input_dim'] = 2048
runs_df['prot_input_dim'] = 1536
runs_df['data_specific_path'] = 'papyrus/xc50/all'

In [None]:
runs_df.head()

In [None]:
def get_model_class(model_type: str):
    """Return the network class used to rebuild a checkpoint of ``model_type``.

    This cell re-defines the helper from section 2; when the notebook is run
    top to bottom it is the definition that stays bound, so it has to accept
    the same model types ('emc' and 'eoe' included).

    The lookup is case-insensitive.

    Raises:
        ValueError: if ``model_type`` is not one of the known types.
    """
    kind = model_type.lower()
    if kind in ['baseline', 'mcdropout']:
        model_class = BaselineDNN
    elif kind == 'ensemble':
        model_class = EnsembleDNN
    elif kind in ['evidential', 'emc']:
        model_class = EvidentialDNN
    elif kind == 'eoe':
        model_class = EoEDNN
    else:
        raise ValueError(f"Model type {model_type} not recognized")
    return model_class


In [None]:
def get_predict_fn(model_type: str):
    """Return the prediction routine and its extra keyword arguments for a model type.

    Matching is case-insensitive:
      * 'mcdropout'             -> mc_predict with aleatoric=True, num_mc_samples=100
      * 'ensemble' / 'baseline' -> predict, no extra arguments
      * 'evidential'            -> ev_predict, no extra arguments

    Returns
    -------
    tuple
        (predict_fn, predict_kwargs), where predict_kwargs is a dict.

    Raises
    ------
    ValueError
        If model_type is none of the names listed above.
    """
    kind = model_type.lower()
    if kind == 'mcdropout':
        return mc_predict, {"aleatoric": True, "num_mc_samples": 100}
    if kind in ('ensemble', 'baseline'):
        return predict, {}
    if kind == 'evidential':
        return ev_predict, {}
    raise ValueError(f"Model type {model_type} not recognized")

In [None]:
# Logger handed to the data-loading and evaluation helpers below;
# both the file and the stream handler are set to "info".
logger = create_logger(name="reassess", file_level="info", stream_level="info")

In [None]:
# Check which Python type the 'chem_layers' cell holds for one sample row.
type(runs_df.iloc[2]['chem_layers'])

In [None]:
# Show every field of the run at position 2.
runs_df.iloc[2] 

In [None]:
# dtype of the column holding the resolved checkpoint paths.
runs_df.model_file_path.dtype

In [None]:

        # else:
        #     yield None, None

In [None]:
# for x, y in get_model(runs_df):
#     print(x, y)
#     break

In [None]:
# Smoke test on one run: rebuild its config from the row, then load its checkpoint.
row = runs_df.iloc[20]  # FOR TESTING 20 mcdp 1 ensemble 30 evidential

run_name = row['Name']
model_type = row['model_type']
split_type = row['split_type']
model_name = row['model_name']
model_path = row['model_file_path']
chem_input_dim = row['chem_input_dim']
prot_input_dim = row['prot_input_dim']
chem_layers = row['chem_layers']
prot_layers = row['prot_layers']
regressor_layers = row['regressor_layers']
dropout = row['dropout']
lamb = row['lamb']
ensemble_size = row['ensemble_size']
lr = row['lr']

print(model_type)
print(f'{chem_layers=}')
print(f'{prot_layers=}')
print(f'{regressor_layers=}')

# Dataset/descriptor settings come from the notebook configuration cell so this
# config always describes the dataset the checkpoints were resolved for.
config = get_model_config(
    model_type=model_type,
    split_type=split_type,
    data_name=data_name,
    activity_type=activity_type,
    descriptor_protein=descriptor_protein,
    descriptor_chemical=descriptor_chemical,
    data_specific_path=data_specific_path,
    model_name=model_name,
    chem_input_dim=chem_input_dim,
    prot_input_dim=prot_input_dim,
    chem_layers=chem_layers,
    prot_layers=prot_layers,
    regressor_layers=regressor_layers,
    dropout=dropout,
    lr=lr,
    lamb=lamb,
    ensemble_size=ensemble_size,
)

# A run without a checkpoint stores NaN/None here, and NaN is truthy, so a bare
# `if model_path:` would let it through. Fail loudly instead of leaving a
# `model` from an earlier execution in scope for the cells below.
if pd.isna(model_path) or not model_path:
    raise FileNotFoundError(
        f"No saved checkpoint resolved for run {run_name!r} (model_name={model_name!r})"
    )

model_class = get_model_class(model_type)
print(model_class)
model = load_model(model_class, model_path, config=config).to(DEVICE)


In [None]:
# Display the loaded model.
model

In [None]:
# Batch size stored in the model config.
config['batch_size']

In [None]:
# Build the loaders for the current config and report their sizes:
# first the dataset length of each split, then the length of each loader.
dataloaders = get_dataloader(config, device=DEVICE, logger=logger)
print(dataloaders.keys())
splits = ("train", "val", "test")
for split in splits:
    print(len(dataloaders[split].dataset))
for split in splits:
    print(len(dataloaders[split]))
# Cell output: length of the `data` attribute held by the training dataset.
len(dataloaders["train"].dataset.data)


In [None]:
# Pick the prediction routine registered for this model type and run it on the test loader.
predict_fn, predict_kwargs = get_predict_fn(model_type)
print(predict_fn)
# The third return value is kept as alea_vars (presumably aleatoric variances —
# confirm against the predict functions).
preds, labels, alea_vars = predict_fn(model, dataloaders["test"], device=DEVICE, **predict_kwargs)

In [None]:
# Shapes of the three arrays returned by the predict call, in return order.
for arr in (preds, labels, alea_vars):
    print(arr.shape)

In [None]:
# Inspect the third array returned by the predict call.
alea_vars

In [None]:
# Output folder for this model's figures, built from the notebook configuration
# (data_specific_path and project_out_name) rather than a literal
# 'papyrus/xc50/all' / '2024-06-25-...' path, so the figures are filed under the
# dataset and project the configuration cell selects.
figpath = FIGS_DIR / data_specific_path / project_out_name / model_name
print(figpath)
# figpath.mkdir(parents=True, exist_ok=True)

In [None]:

# Calculate the metrics
# Returns the metrics, the plots and the metrics logger reused by the recalibration step.
metrics, plots, uct_logger = evaluate_predictions(
    config,
    preds,
    labels,
    alea_vars,
    model_type,
    logger,
    run_name=run_name,
    project_name="test",
    figpath=figpath,
    export_preds=False,
    verbose=False,
)

In [None]:
# Recalibrate model
# Predict on the validation loader, then unpack the raw outputs with process_preds.
# NOTE(review): the plain `predict` is used here whatever the model type, while the
# test-set predictions go through get_predict_fn(model_type) — confirm this is intended.
preds_val, labels_val, alea_vars_val = predict(model, dataloaders["val"], device=DEVICE)
y_true_val, y_pred_val, y_std_val, y_err_val, y_alea_val = process_preds(
        preds_val, labels_val, alea_vars_val, epi_vars=None
    )

In [None]:
# Recalibrate model
# Validation-set outputs go in first, followed by the test-set outputs from the
# predict cell; the metrics logger and figure folder are passed through.
preds_val, labels_val, alea_vars_val = predict(model, dataloaders["val"], device=DEVICE)
iso_recal_model = recalibrate_model(
    preds_val, labels_val, alea_vars_val,
    preds, labels, alea_vars,
    config=config,
    uct_logger=uct_logger,
    figpath=figpath,
)

In [None]:
# from uqdd.models.utils_metrics import process_preds
# 
# y_true_val, y_pred_val, y_std_val, y_err_val, y_alea_val = process_preds(
#         preds_val, labels_val,  epi_vars=None
#     )

# Final workflow
## 1. Preprocessing runs_df

In [None]:
# Shape of the y_alea_val array returned by process_preds.
y_alea_val.shape

In [None]:
# Iterate over each run in the runs.csv file
# Dry run of the per-run workflow: for the first run that has a checkpoint,
# load the model, build its config and data loaders, print a summary, then stop.
for index, row in runs_df.iterrows():
    run_name = row['Name']
    model_type = row['model_type']
    split_type = row['split_type']
    model_name = row['model_name']
    model_path = row['model_file_path']

    # Runs without a checkpoint hold NaN/None here; NaN is truthy, so test for
    # it explicitly instead of relying on `if model_path:`.
    if pd.isna(model_path) or not model_path:
        continue

    print(model_type)
    model_class = get_model_class(model_type)
    model = load_model(model_class, model_path)
    # Dataset/descriptor settings come from the notebook configuration cell.
    config = get_model_config(
        model_type=model_type,
        split_type=split_type,
        data_name=data_name,
        activity_type=activity_type,
        descriptor_protein=descriptor_protein,
        descriptor_chemical=descriptor_chemical,
        data_specific_path=data_specific_path,
        model_name=model_name,
    )
    # Get dataloaders
    dataloaders = get_dataloader(config, device=DEVICE, logger=logger)
    print(dataloaders.keys())
    print(len(dataloaders["test"].dataset))
    print(f"Model: {model_name}")
    print(f"Model type: {model_type}")
    print(f"Split type: {split_type}")

    # Only the first usable run is exercised in this dry run.
    break

In [None]:
# Load predictions for each model_name
# predictions_path = 'predictions.csv'

filename = 'preds.csv'
# os.path.join gives the same path whether or not preds_dirpath ends with a separator.
predictions_df = load_df(os.path.join(preds_dirpath, filename))


## Apply some fixes to runs.csv file
1. Replace the `_` in `scaffold_cluster` with `-`.
2. Create the relevant `model_name` if it is not present (fixes a bug with ensemble runs).
3. Match against the file names after the date prefix to avoid discrepancies.

In [None]:
# Iterate over each run in the runs.csv file
# For every run whose checkpoint exists: load the model, predict on the test
# split, compute the metrics, recalibrate on the validation split and append
# the metrics to the CSV log.
for index, row in runs_df.iterrows():
    run_name = row['Name']
    model_type = row['model_type']
    split_type = row['split_type']
    model_name = row['model_name']

    model_file_path = os.path.join(models_dir, f"{model_name}.pt")
    if not os.path.exists(model_file_path):
        continue

    # Resolve the class for *this* run; without this line the loop reuses
    # whatever model_class an earlier cell left in the namespace.
    model_class = get_model_class(model_type)
    model = load_model(model_class, model_file_path)

    # Create the model configuration (dataset settings from the config cell)
    config = get_model_config(
        model_type=model_type,
        data_name=data_name,
        activity_type=activity_type,
        split_type=split_type,
        descriptor_protein=descriptor_protein,
        descriptor_chemical=descriptor_chemical,
        data_specific_path=data_specific_path,
        model_name=model_name,
    )

    # Get dataloaders (`logger` is the logger this notebook creates; `LOGGER` is never defined)
    dataloaders = get_dataloader(config, device=DEVICE, logger=logger)

    # Generate predictions with the routine registered for this model type
    predict_fn, predict_kwargs = get_predict_fn(model_type)
    preds, labels, alea_vars = predict_fn(
        model, dataloaders["test"], device=DEVICE, **predict_kwargs
    )

    # Calculate the metrics
    metrics, plots, uct_logger = evaluate_predictions(
        config, preds, labels, alea_vars, model_type, logger, run_name=run_name
    )

    # Recalibrate model
    # NOTE(review): validation predictions use the plain `predict`, as in the
    # single-run cells above — confirm this is right for non-baseline models.
    preds_val, labels_val, alea_vars_val = predict(model, dataloaders["val"], device=DEVICE)
    # Argument layout and single return value follow the single-run
    # recalibration cell of this notebook.
    iso_recal_model = recalibrate_model(
        preds_val, labels_val, alea_vars_val,
        preds, labels, alea_vars,
        config=config,
        uct_logger=uct_logger,
    )

    # Log the metrics to the CSV file
    uct_logger.csv_log()

In [None]:
# Predict, evaluate and recalibrate the model currently held in `model`.
# Uses the names this notebook actually defines (`model`, `model_type`, `config`,
# `logger`) in place of `ensemble_model`, `LOGGER` and `config_`, which no visible
# cell creates.
dataloaders = get_dataloader(config, device=DEVICE, logger=logger)

predict_fn, predict_kwargs = get_predict_fn(model_type)
preds, labels, alea_vars = predict_fn(
    model, dataloaders["test"], device=DEVICE, **predict_kwargs
)
# Then comes the predict metrics part; the label passed on is the run's own
# model_type instead of a fixed "mcdropout".
metrics, plots, uct_logger = evaluate_predictions(
    config, preds, labels, alea_vars, model_type, logger
)

# RECALIBRATION # Get Calibration / Validation Set # This predict is changeable according to function
preds_val, labels_val, alea_vars_val = predict(
    model, dataloaders["val"], device=DEVICE
)
# Argument layout and single return value follow the single-run recalibration
# cell of this notebook.
iso_recal_model = recalibrate_model(
    preds_val, labels_val, alea_vars_val,
    preds, labels, alea_vars,
    config=config,
    uct_logger=uct_logger,
)



In [None]:

# get arrays from df cols
y_true = predictions_df['y_true'].values
y_pred = predictions_df['y_pred'].values
y_err = predictions_df['y_err'].values
y_alea = predictions_df['y_alea'].values
# Older prediction files name this column 'y_eps' instead of 'y_std'.
eps_col = 'y_std' if 'y_std' in predictions_df.columns else 'y_eps'
y_eps = predictions_df[eps_col].values

# TODO: take model_type (and split_type) from the matching row of runs_df
model_type = "ensemble"

# Dataset/descriptor settings come from the notebook configuration cell.
config = get_model_config(
    model_type,
    data_name=data_name,
    activity_type=activity_type,
    split_type="random",
    descriptor_protein=descriptor_protein,
    descriptor_chemical=descriptor_chemical,
    data_specific_path=data_specific_path,
)

uct_metrics_logger = MetricsTable(
    model_type=model_type,
    config=config,
    add_plots_to_table=False,
    # * we can turn on if we want to see them in wandb * #
    # logger=logger,
    project_name=None,  # to change as this will become the output csv file name
    run_name=None,  # that would be the same run name from the runs.csv file
)
task_name = "PCM"

# Inputs shared by the full and the subset evaluation. The aleatoric column is
# what gets passed as y_std; y_alea itself is not forwarded.
metric_inputs = dict(
    y_pred=y_pred,
    y_std=y_alea,
    y_true=y_true,
    y_err=y_err,
    y_eps=y_eps,
)

metrics, plots = uct_metrics_logger(
    **metric_inputs,
    task_name=task_name,
    # TODO: define the fig path. The default was FIGS_DIR / data_specific_path / model_name;
    # it could be changed to mark the reassessment somehow (e.g. a "reassess" prefix on model_name).
    figpath=None,
)

# * calculate metrics for a subset of the datapoints * #
submetrics, subplots = uct_metrics_logger(
    **metric_inputs,
    task_name=task_name + " Subset",
    n_subset=100,
)

# then we log to the csv file which appends after each iteration
uct_metrics_logger.csv_log()
