In [2]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../..')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Imports

In [1]:
import pandas as pd
import numpy as np

#from src import models
#from src import experimental_setup
experimental_setup.path_prefix = '/../'

ModuleNotFoundError: No module named 'src'

# Experimental setup

In [8]:
_benchmarks = {
    'dnn_mordred': {'model': models.DNN_Mordred, 'encoding': 'mordred'},
    'dnn_ecfp': {'model': models.DNN_ECFP, 'encoding': 'ecfp_2048'},
    'rf_mordred': {'model': models.RF, 'encoding': 'mordred'},
    'rf_ecfp': {'model': models.RF, 'encoding': 'ecfp_4096'},
    'rf_nmf_ecfp': {'model': models.RF_NMF_ECFP, 'encoding': 'ecfp_4096'},
    'gp': {'model': models.GP, 'encoding': 'gp'},
    'gcn': {'model': models.GCN, 'encoding': 'smiles'}
}

# benchmarks to train/validate, check _benchmark_dict for options
# run_benchmarks = ['gcn']   # DU - changed  these to get diff models
run_benchmarks = ['dnn_mordred']

# `random` or `stratified`
sampling_type = 'random'

kfold = experimental_setup.CrossValidator(
    splits = 5, # dont change without re-running data preprocessing
    sampling_type = sampling_type,
)

converter = experimental_setup.LD50UnitConverter()

# Train/predict using benchmark models

In [9]:
for identity in run_benchmarks:
    benchmark = _benchmarks[identity]
    
    folds = enumerate(kfold.get_folds(benchmark['encoding']))
    
    for fold_no, (train, test) in folds:           
        x_train, y_train, smiles_train = train
        x_test, y_test, smiles_test = test
        
        y_train = experimental_setup.scaler.fit_transform(y_train)
        
        model = _benchmarks[identity]['model']()
        
        # <Gaussian process has special step for selecting parameters based on
        # RF benchmark models for ECFP and Mordred
        if identity == 'gp':
            fn = 'rf_mordred' + str(fold_no) + '_' + sampling_type
            mordred_rf = _benchmarks['rf_mordred']['model']()
            mordred_rf.load_weights('../../data/benchmark-models/chkpts/%s.chkpt' % fn)
            
            fn = 'rf_ecfp' + str(fold_no) + '_' + sampling_type
            ecfp_rf = _benchmarks['rf_ecfp']['model']()
            ecfp_rf.load_weights('../../data/benchmark-models/chkpts/%s.chkpt' % fn)
            
            model.rf_feature_selectors = (mordred_rf, ecfp_rf)
            model.rf_feature_reduce_to = (10, 200)
        # End of special GP step>
        
        model.fit(x_train, y_train)
        
        #save model
        fn = identity + str(fold_no) + '_' + sampling_type
        model.save_weights('../../data/benchmark-models/chkpts/%s.chkpt' % fn)
        
        y_hat = experimental_setup.scaler.inverse_transform(model.predict(x_test))
        
        results = pd.DataFrame({
            'smiles': smiles_test.flatten(),
            'prediction_neglogld50': y_hat.flatten(),
            'prediction_mgkg': converter.convert_to_mgkg(y_hat, smiles_test),
            'prediction_epa': converter.convert_to_epa(y_hat, smiles_test),
            'actual_neglogld50': y_test.flatten(),
            'actual_mgkg': converter.convert_to_mgkg(y_test, smiles_test),
            'actual_epa': converter.convert_to_epa(y_test, smiles_test),
        })
        
        results.to_csv('../../data/benchmark-models/%s_predictions.csv' % fn)

Epoch 1/1000




[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 853951.5000 - mae: 249.8174 - mse: 853951.5000 
Epoch 2/1000
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.7944 - mae: 0.9589 - mse: 1.7944
Epoch 3/1000
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.0322 - mae: 0.7708 - mse: 1.0322
Epoch 4/1000
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.1392 - mae: 0.7910 - mse: 1.1392
Epoch 5/1000
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 1.2238 - mae: 0.8149 - mse: 1.2238
Epoch 6/1000
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 1.1454 - mae: 0.7927 - mse: 1.1454
Epoch 7/1000
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.1088 - mae: 0.7810 - mse: 1.1088
Epoch 8/1000
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.0

ValueError: The filename must end in `.weights.h5`. Received: filepath=../../data/benchmark-models/chkpts/dnn_mordred0_random.chkpt

# Results comparison

In [18]:
#from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
#import pandas as pd
#import numpy as np

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import pandas as pd
import numpy as np  # Ensure numpy is imported

compare_benchmarks = ['rf_mordred', 'rf_ecfp', 'rf_nmf_ecfp', 'dnn_mordred', 'dnn_ecfp', 'gp', 'gcn'] 

aggregated = pd.DataFrame(columns=['benchmark', 'fold_no', 'r2', 'mae', 'rmse', 'accuracy'])  # Adding fold_no column

for identity in compare_benchmarks:
    benchmark = _benchmarks[identity]
    folds = enumerate(kfold.get_folds(benchmark['encoding']))

    for fold_no, data in folds:
        fn = identity + str(fold_no) + '_' + sampling_type
        validation = pd.read_csv(f'../../data/benchmark-models/{fn}_predictions.csv')
        
        # Adding fold_no and metrics calculations
        aggregated = aggregated.append({
            'benchmark': identity,
            'fold_no': fold_no,
            'r2': r2_score(validation['actual_neglogld50'], validation['prediction_neglogld50']),
            'mae': mean_absolute_error(validation['actual_neglogld50'], validation['prediction_neglogld50']),
            'rmse': mean_squared_error(validation['actual_neglogld50'], validation['prediction_neglogld50'], squared=False),
            'accuracy': np.sum(validation['actual_epa'] == validation['prediction_epa']) / len(validation)
        }, ignore_index=True)

print(aggregated)


'''  
compare_benchmarks = ['rf_mordred','rf_ecfp','rf_nmf_ecfp','dnn_mordred','dnn_ecfp','gp','gcn'] 

aggregated = pd.DataFrame(columns=['benchmark', 'fold_no', 'r2', 'mae', 'rmse', 'accuracy'])  # Adding fold_no column
aggregated = pd.DataFrame()
#aggregated = pd.DataFrame(columns=['benchmark', 'r2', 'mae', 'rmse', 'accuracy'])  #DU added

for identity in compare_benchmarks:
        
    benchmark = _benchmarks[identity]
    folds = enumerate(kfold.get_folds(benchmark['encoding']))
    
    
    for fold_no, data in folds:
        fn = identity + str(fold_no) + '_' + sampling_type
        
        validation = pd.read_csv('../../data/benchmark-models/%s_predictions.csv' % fn)
        
        print(validation)
        aggregated = aggregated.extend()
        pass
        
        aggregated = aggregated.append({
            'benchmark': identity,
            'r2': r2_score(validation['actual_neglogld50'], validation['prediction_neglogld50']),
            'mae': mean_absolute_error(validation['actual_neglogld50'], validation['prediction_neglogld50']),
            'rmse': mean_squared_error(validation['actual_neglogld50'], validation['prediction_neglogld50'], squared=False),
            'accuracy': np.sum(validation['actual_epa'] == validation['prediction_epa']) / len(validation)
        }, ignore_index=True)


'''



AttributeError: 'DataFrame' object has no attribute 'append'

In [None]:
aggregated.pivot_table(index='benchmark', aggfunc=np.mean)