In [1]:
import os
import sys 
import numpy as np
from keras.models import load_model
import gzip, pickle
from torch_geometric.data import DataLoader
import torch

# Put the sibling "pnnlsolpaper" directory on sys.path so modules inside it can
# be imported. os.path.abspath('') resolves to the current working directory, so
# the resulting path depends on where the kernel was started.
# NOTE(review): presumably this is where the smi / mdm / gnn packages imported
# below live — confirm.
solvation_path = os.path.abspath(os.path.join(os.path.abspath(''), "../pnnlsolpaper"))
sys.path.append(solvation_path)

import smi
import mdm
import gnn
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from scipy.stats import spearmanr
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from sklearn.cluster import KMeans
import ipywidgets as widgets
from IPython.display import display
import seaborn as sns


2025-04-02 15:39:26.842299: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Loading models and their data; predictions from individual models

In [2]:
# Load the GNN data splits, rebuild the GNN model from its saved weights, and
# predict on the test split.

# Directory holding the pickled splits. It is relative to the current working
# directory, so the kernel must be started where ./data/ exists.
path = os.path.join('./data/')

# Load data.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load splits that come from a trusted source.
# train_X and val_X are loaded here but not used further in this cell.
with gzip.open(os.path.join(path, "train.pkl.gz"), "rb") as f:
    train_X = pickle.load(f)
with gzip.open(os.path.join(path, "val.pkl.gz"), "rb") as f:
    val_X = pickle.load(f)
with gzip.open(os.path.join(path, "test.pkl.gz"), "rb") as f:
    test_X = pickle.load(f)
bs = gnn.config.bs

# Keep the test order fixed and keep the last partial batch so that there is
# exactly one prediction per test sample, in the original order.
test_loader = DataLoader(test_X, batch_size=bs, shuffle=False, drop_last=False)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = gnn.gnn_model.GNN(n_features = gnn.config.n_features).to(device)
# map_location remaps the stored tensors onto the selected device; without it a
# checkpoint saved on a GPU cannot be loaded on a CPU-only machine.
model.load_state_dict(torch.load(gnn.config.best_model, map_location=device))
# Switch to inference mode before predicting; this is a no-op if
# test_fn_plotting already does it.
model.eval()

# Only the second returned value (the predictions) is used here.
_, gnn_pred = gnn.gnn_utils.test_fn_plotting(test_loader, model, device)

FileNotFoundError: [Errno 2] No such file or directory: './data/train.pkl.gz'

In [None]:
# Load the SMI model and its test features, then predict.
# The model output is flattened to 1-D with ravel().
# NOTE(review): this reads the same "./data/x_test.txt" file as the MDM block
# below. If the two models expect different feature files, one of these paths
# is a copy-paste slip — confirm against the smi / mdm data layout.
smi_x_tes = np.loadtxt("./data/x_test.txt")
smi_model = load_model(smi.config.best_model)
smi_pred = smi_model.predict(smi_x_tes).ravel()

# Load the MDM model and its test features, then predict.
# reshape(-1,) flattens the model output to 1-D, like ravel() above.
mdm_x_test = np.loadtxt("./data/x_test.txt")
mdm_model = load_model(mdm.config.best_model)
mdm_pred = mdm_model.predict(mdm_x_test).reshape(-1,)

# Load the test targets. Per the original author's note, y is identical for all
# models, so a single copy is loaded.
# NOTE(review): the path is relative to the working directory — confirm it
# points at the intended split.
y_test = np.loadtxt("./data/y_test.txt")

## Ensembling and graphing all predictions 

In [None]:
# Weighted ensemble #1: raw per-model weights, normalised so they sum to 1.
# NOTE(review): presumably these weights come from a cross-validated search
# (the plots label this ensemble 'CV with Optuna') — confirm provenance.
w_gnn = 0.5948275422718199
w_mdm = 0.9463097878036206
w_smi = 0.33177489589161224
# Deliberately not named `sum`: that would shadow the builtin sum() for every
# later cell in the kernel session.
w_total = w_gnn + w_mdm + w_smi
nw_gnn = w_gnn / w_total
nw_mdm = w_mdm / w_total
nw_smi = w_smi / w_total
# Convex combination of the three models' predictions.
CV_pred = (nw_gnn * gnn_pred) + (nw_mdm * mdm_pred) + (nw_smi * smi_pred)

In [None]:
# Weighted ensemble #2: raw per-model weights, rescaled below to sum to 1.
# NOTE(review): presumably produced by an Optuna search, going by the variable
# names — confirm provenance.
weight_gnn, weight_mdm, weight_smi = (
    0.6306445985729451,
    0.7775387543020923,
    0.4295662789522257,
)

# Normalisation constant and the three normalised weights.
sum_optuna = weight_gnn + weight_mdm + weight_smi
nognn, nomdm, nosmi = (
    raw_weight / sum_optuna for raw_weight in (weight_gnn, weight_mdm, weight_smi)
)

# Weighted combination of the three models' predictions.
optuna_pred = nognn * gnn_pred + nomdm * mdm_pred + nosmi * smi_pred

In [None]:
# Baseline ensemble: unweighted element-wise mean of the three models' predictions.
avg_pred = (gnn_pred+ mdm_pred + smi_pred)/3

In [None]:

def evaluate_predictions(y_test, **model_predictions):
    """Score, plot and print each model's predictions against ``y_test``.

    For every keyword argument this computes R2, RMSE, the Spearman
    correlation and MAE, shows a predicted-vs-true scatter plot, and prints
    the four metrics.

    Note: a later cell in this notebook defines another function with the
    same name (without plotting); whichever cell ran last is the one in effect.

    Args:
        y_test: True target values.
        **model_predictions: ``name=predictions`` pairs. Each ``predictions``
            is compared element-wise with ``y_test``.

    Returns:
        dict: ``{model_name: {'R2', 'RMSE', 'Spearman', 'MAE'}}``.
    """
    metrics = {}
    for model_name, predictions in model_predictions.items():
        r2 = r2_score(y_true=y_test, y_pred=predictions)
        # RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
        # scikit-learn 1.4 and removed in 1.6, while this form works on every
        # version.
        rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=predictions))
        # spearmanr returns (correlation, p-value); only the correlation is kept.
        sp = spearmanr(predictions, y_test)[0]
        mae = mean_absolute_error(y_true=y_test, y_pred=predictions)

        metrics[model_name] = {
            'R2': r2,
            'RMSE': rmse,
            'Spearman': sp,
            'MAE': mae
        }

        # Plotting
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.scatter(y_test, predictions, alpha=0.5, label=f'{model_name} Predictions')
        ax.set_title(f'Prediction vs True Value - {model_name}')
        ax.set_xlabel('True Values Log(S)')
        ax.set_ylabel('Predicted Values Log(S)')
        # y = x reference line spanning the plotted data, instead of a fixed
        # [-3, 3] segment that may cover only part of it.
        lo = min(np.min(y_test), np.min(predictions))
        hi = max(np.max(y_test), np.max(predictions))
        ax.plot([lo, hi], [lo, hi], 'r--')
        ax.grid(True)
        ax.legend()
        plt.show()
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

        print(f"Metrics for {model_name}:")
        print(f"R2: {r2:.4f}")
        print(f"Spearman: {sp:.4f}")
        print(f"RMSE: {rmse:.4f}")
        print(f"MAE: {mae:.4f}")
        print("\n")

    return metrics


# Score (and plot) the three individual models and the three ensembles; the
# keyword names become the keys of `results`.
results = evaluate_predictions(y_test, smi_pred=smi_pred, mdm_pred=mdm_pred, gnn_pred=gnn_pred , avg_pred=avg_pred, CV_pred=CV_pred  , optuna_pred = optuna_pred)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from scipy.stats import spearmanr

def evaluate_predictions(y_test, **model_predictions):
    """Compute regression metrics for each model's predictions.

    Args:
        y_test: True target values.
        **model_predictions: ``name=predictions`` pairs. Each ``predictions``
            is compared element-wise with ``y_test``.

    Returns:
        dict: ``{model_name: {'R2', 'RMSE', 'Spearman', 'MAE'}}``.
    """
    metrics = {}
    for model_name, predictions in model_predictions.items():
        r2 = r2_score(y_true=y_test, y_pred=predictions)
        # RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
        # scikit-learn 1.4 and removed in 1.6, while this form works on every
        # version.
        rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=predictions))
        # spearmanr returns (correlation, p-value); only the correlation is kept.
        sp = spearmanr(predictions, y_test)[0]
        mae = mean_absolute_error(y_true=y_test, y_pred=predictions)

        metrics[model_name] = {
            'R2': r2,
            'RMSE': rmse,
            'Spearman': sp,
            'MAE': mae
        }

    return metrics

def plot_predictions(y_test, model_predictions, layout, figsize=(18, 6), limits=(-16, 10)):
    """Draw one predicted-vs-experimental scatter panel per model.

    Args:
        y_test: Experimental (true) target values, used as the x-axis.
        model_predictions: Mapping of panel name to predictions, plotted in
            iteration order.
        layout: ``(n_rows, n_cols)`` of the subplot grid. It must provide at
            least ``len(model_predictions)`` panels, otherwise indexing the
            axes raises ``IndexError``.
        figsize: Figure size passed to ``plt.subplots``.
        limits: ``(low, high)`` used for both axes and for the y = x
            reference line. Defaults to the previously hard-coded (-16, 10).
    """
    lo, hi = limits
    # squeeze=False makes plt.subplots always return a 2-D array of Axes, so
    # flatten() also works for a 1x1 layout (which otherwise yields a bare Axes).
    fig, axes = plt.subplots(layout[0], layout[1], figsize=figsize, squeeze=False)
    axes = axes.flatten()
    for i, (model_name, predictions) in enumerate(model_predictions.items()):
        ax = axes[i]
        ax.scatter(y_test, predictions, alpha=0.5, label=f'{model_name} Predictions')
        ax.plot([lo, hi], [lo, hi], 'r--')  # y = x reference line across the full axis range
        ax.set_title(f'Prediction vs Experimental Value - {model_name}')
        ax.set_xlabel('Experimental Values Log(S)')
        ax.set_ylabel('Predicted Values Log(S)')
        ax.set_xlim(lo, hi)
        ax.set_ylim(lo, hi)
        # Equal aspect so the y = x line is drawn at 45 degrees.
        ax.set_aspect('equal', adjustable='box')
        ax.grid(True)
        ax.legend()
    # Hide grid cells that received no model so they do not show as empty axes.
    for unused_ax in axes[len(model_predictions):]:
        unused_ax.set_visible(False)
    plt.tight_layout()
    plt.show()

# Compute metrics for the three individual models and the three ensembles
results = evaluate_predictions(y_test, smi_pred=smi_pred, mdm_pred=mdm_pred, gnn_pred=gnn_pred, avg_pred=avg_pred, CV_pred=CV_pred, optuna_pred=optuna_pred)

# Plot the individual models (3-column, 1-row layout)
plot_predictions(y_test, {'SMI': smi_pred, 'MDM': mdm_pred, 'GNN': gnn_pred}, layout=(1, 3))

# Plot the three ensembles (3-column, 1-row layout)
plot_predictions(y_test, {'Simple Average': avg_pred, 'CV with Optuna': CV_pred, 'Optuna': optuna_pred}, layout=(1, 3))
