In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as numpy
sns.set_context("poster")
sns.set_style("whitegrid")
sns.set_palette("Set2")

import pandas as pd
import os, json, sys, glob, pickle

from atomsci.ddm.pipeline import parameter_parser as parse
from atomsci.ddm.pipeline import perf_data
from atomsci.ddm.pipeline import model_pipeline as mp
import atomsci.ddm.utils.curate_data as curate_data
import atomsci.ddm.utils.struct_utils as struct_utils
from atomsci.ddm.pipeline import perf_plots as pp

In [4]:
from atomsci.ddm.pipeline import predict_from_model as pfm

# Load acrylate dataset
test_acrylate = pd.read_csv('acrylate_df.csv')

# Column settings shared by every prediction run in this cell.
# NOTE(review): response_col repeats the same column name twice, separated by
# a comma; kept exactly as in the original run -- confirm this is intentional.
response_col = "DFT_HOMO_LUMO_GAP,DFT_HOMO_LUMO_GAP"
smiles_col = "rdkit_smiles"

# Every saved model lives in the same directory and follows the same naming
# pattern; only the UUID differs between them.
ACRYLATE_MODEL_DIR = '/home/jupyter/Models'
ACRYLATE_MODEL_TEMPLATE = 'select_qmugs_data_curated (2)_model_{model_uuid}.tar.gz'

# Models to evaluate, in evaluation order (dict insertion order is preserved).
# Keys are the suffixes used for the per-model result variables defined below.
ACRYLATE_MODEL_UUIDS = {
    # Random split, graphconv, NN for acrylate
    'randsplit': 'd45eab0f-3f53-4036-a3e5-0a0217a52e23',
    # New Random split, graphconv, NN for acrylate with ECFP
    'randsplit_new': '1271d6b8-ee81-474e-acc2-dd73fbc3f4ae',
    # Fingerprint split, graphconv, NN for acrylate
    'finger': '4e942a08-e9d6-4ac9-bfb4-277e0518ee2d',
    # New Fingerprint split, graphconv, NN for acrylate
    'finger_new': '285356d8-0b65-4925-a8f6-ccb605d5136b',
    # Scaffold split, graphconv, NN for acrylate
    'scaffold': 'de56ee5b-6ad7-457a-884f-055bbe7257ad',
    # New Scaffold split, graphconv, NN for acrylate
    'scaffold_new': '5e947e95-3ae9-47df-a2ac-9d29066acc3c',
}


def _predict_acrylate_gap(model_path, input_df, smiles_col, response_col):
    """Run one saved model on ``input_df`` and pull out actual/predicted values.

    Args:
        model_path: Path to the model tarball handed to
            ``pfm.predict_from_model_file``.
        input_df: DataFrame of compounds to predict on. It is passed through
            unchanged (no copy is made here).
        smiles_col: Name of the SMILES column in ``input_df``.
        response_col: Response column specification forwarded to the predictor.

    Returns:
        Tuple ``(results_df, actual, predicted)`` where ``results_df`` is the
        object returned by ``predict_from_model_file`` and ``actual`` /
        ``predicted`` are the ``.values`` of its ``'DFT_HOMO_LUMO_GAP'`` and
        ``'DFT_HOMO_LUMO_GAP_pred'`` columns.
    """
    results_df = pfm.predict_from_model_file(model_path=model_path,
                                             input_df=input_df,
                                             smiles_col=smiles_col,
                                             response_col=response_col)
    actual = results_df['DFT_HOMO_LUMO_GAP'].values
    predicted = results_df['DFT_HOMO_LUMO_GAP_pred'].values
    return results_df, actual, predicted


def _squared_correlation(actual, predicted):
    """Return ``(corr_matrix, r, r ** 2)`` for two equally long 1-D arrays.

    ``corr_matrix`` is the 2x2 output of ``numpy.corrcoef`` and ``r`` is its
    off-diagonal element, i.e. the Pearson correlation between the inputs.

    NOTE: ``r ** 2`` is the squared Pearson correlation. It is insensitive to
    a constant offset or scale error in the predictions, so it can differ
    from the coefficient of determination ``1 - SS_res / SS_tot``.
    """
    corr_matrix = numpy.corrcoef(actual, predicted)
    corr = corr_matrix[0, 1]
    return corr_matrix, corr, corr ** 2


# Score every model. The loop variables are deliberately module-level names
# (mfile, input_df, results_df, dft_acrylate_actual, dft_acrylate_predict) so
# that after the loop they hold the values for the last model, exactly as the
# original copy-pasted version left them.
acrylate_scores = {}
for split_key, model_uuid in ACRYLATE_MODEL_UUIDS.items():
    mfile = ACRYLATE_MODEL_DIR + '/' + ACRYLATE_MODEL_TEMPLATE.format(model_uuid=model_uuid)
    # The same DataFrame object is reused for every model, as before.
    input_df = test_acrylate
    results_df, dft_acrylate_actual, dft_acrylate_predict = _predict_acrylate_gap(
        mfile, input_df, smiles_col, response_col)
    acrylate_scores[split_key] = _squared_correlation(dft_acrylate_actual,
                                                      dft_acrylate_predict)

# Expose the per-model results under their established names so that cells
# which reference them keep working.
(corr_matrix_acrylate_R_sq_randsplit,
 corr_acrylate_R_sq_randsplit,
 dft_acrylate_R_sq_randsplit) = acrylate_scores['randsplit']

(corr_matrix_acrylate_R_sq_randsplit_new,
 corr_acrylate_R_sq_randsplit_new,
 dft_acrylate_R_sq_randsplit_new) = acrylate_scores['randsplit_new']

(corr_matrix_acrylate_R_sq_finger,
 corr_acrylate_R_sq_finger,
 dft_acrylate_R_sq_finger) = acrylate_scores['finger']

(corr_matrix_acrylate_R_sq_finger_new,
 corr_acrylate_R_sq_finger_new,
 dft_acrylate_R_sq_finger_new) = acrylate_scores['finger_new']

(corr_matrix_acrylate_R_sq_scaffold,
 corr_acrylate_R_sq_scaffold,
 dft_acrylate_R_sq_scaffold) = acrylate_scores['scaffold']

(corr_matrix_acrylate_R_sq_scaffold_new,
 corr_acrylate_R_sq_scaffold_new,
 dft_acrylate_R_sq_scaffold_new) = acrylate_scores['scaffold_new']

from IPython.display import HTML, display

# Results for acrylate: one row per evaluated model, in evaluation order.
acrylate_result = {
    "Model": ["random split", "random split with ECFP",
              "fingerprint", "fingerprint with ECFP",
              "scaffold", "scaffold with ECFP"],
    "Result": [dft_acrylate_R_sq_randsplit, dft_acrylate_R_sq_randsplit_new,
               dft_acrylate_R_sq_finger, dft_acrylate_R_sq_finger_new,
               dft_acrylate_R_sq_scaffold, dft_acrylate_R_sq_scaffold_new]
}

result_df_acrylate = pd.DataFrame(acrylate_result)
result_df_acrylate.columns = ['Model', 'Tested R squared value']

# Adjusting the index to start at 1
result_df_acrylate.index = range(1, len(result_df_acrylate) + 1)

# CSS rules applied to the rendered table.
acrylate_table_styles = [
    # Styles for the table headers
    {'selector': 'th',
     'props': [('background-color', 'lightgrey'),
               ('color', 'black'),
               ('font-family', 'Arial'),
               ('text-align', 'center')]},

    # Styles for the 'Model' column
    {'selector': '.col0',
     'props': [('text-align', 'left')]},

    # Styles for the 'Tested R squared value' column
    {'selector': '.col1',
     'props': [('text-align', 'center')]},

    # Styles for the caption
    {'selector': 'caption',
     'props': [('caption-side', 'top'),
               ('font-size', '1.2em'),
               ('font-weight', 'bold'),
               ('color', 'black'),
               ('font-family', 'Arial'),
               ('text-align', 'center')]}
]

acrylate_styler = (
    result_df_acrylate.style
    .set_table_styles(acrylate_table_styles)
    .set_caption("Acrylate")
)

# Convert the styled table to an HTML string.
# Styler.render() was deprecated in pandas 1.4 and removed in pandas 2.0;
# Styler.to_html() is its replacement. Fall back to render() only on older
# pandas versions that do not provide to_html().
if hasattr(acrylate_styler, 'to_html'):
    result_df_acrylate_pre = acrylate_styler.to_html()
else:
    result_df_acrylate_pre = acrylate_styler.render()

# Persist the plain (unstyled) table; the 1-based display index is not written.
result_df_acrylate.to_csv('result_df_acrylate.csv', index=False)

display(HTML(result_df_acrylate_pre))


Standardizing SMILES strings for 6760 compounds.


2023-12-05 00:43:09,555 /home/jupyter/Models/select_qmugs_data_curated (2)_model_d45eab0f-3f53-4036-a3e5-0a0217a52e23.tar.gz, 1.5.1
2023-12-05 00:43:09,562 Version compatible check: /home/jupyter/Models/select_qmugs_data_curated (2)_model_d45eab0f-3f53-4036-a3e5-0a0217a52e23.tar.gz version = "1.5", AMPL version = "1.5"
2023-12-05 00:43:20,195 ['ampl_version', 'time_generated', 'best_epoch', 'time_built', 'dataset_hash', 'dataset_metadata', 'training_metrics'] are not part of the accepted list of parameters and will be ignored
2023-12-05 00:43:20,818 Featurization = DynamicFeaturization with graphconv features


num_model_tasks is deprecated and its value is ignored.
Standardizing SMILES strings for 6760 compounds.


2023-12-05 00:54:22,669 /home/jupyter/Models/select_qmugs_data_curated (2)_model_1271d6b8-ee81-474e-acc2-dd73fbc3f4ae.tar.gz, 1.5.1
2023-12-05 00:54:22,673 Version compatible check: /home/jupyter/Models/select_qmugs_data_curated (2)_model_1271d6b8-ee81-474e-acc2-dd73fbc3f4ae.tar.gz version = "1.5", AMPL version = "1.5"


num_model_tasks is deprecated and its value is ignored.
['/var/tmp/tmp5gx8ak88/best_model/checkpoint1.pt']
/var/tmp/tmp5gx8ak88/best_model/checkpoint1.pt
Standardizing SMILES strings for 6760 compounds.


2023-12-05 00:59:10,734 /home/jupyter/Models/select_qmugs_data_curated (2)_model_4e942a08-e9d6-4ac9-bfb4-277e0518ee2d.tar.gz, 1.5.1
2023-12-05 00:59:10,741 Version compatible check: /home/jupyter/Models/select_qmugs_data_curated (2)_model_4e942a08-e9d6-4ac9-bfb4-277e0518ee2d.tar.gz version = "1.5", AMPL version = "1.5"


num_model_tasks is deprecated and its value is ignored.
Standardizing SMILES strings for 6760 compounds.


2023-12-05 01:01:16,403 /home/jupyter/Models/select_qmugs_data_curated (2)_model_285356d8-0b65-4925-a8f6-ccb605d5136b.tar.gz, 1.5.1
2023-12-05 01:01:16,410 Version compatible check: /home/jupyter/Models/select_qmugs_data_curated (2)_model_285356d8-0b65-4925-a8f6-ccb605d5136b.tar.gz version = "1.5", AMPL version = "1.5"


num_model_tasks is deprecated and its value is ignored.
['/var/tmp/tmpty177sd9/best_model/checkpoint1.pt']
/var/tmp/tmpty177sd9/best_model/checkpoint1.pt
Standardizing SMILES strings for 6760 compounds.


2023-12-05 01:02:12,295 /home/jupyter/Models/select_qmugs_data_curated (2)_model_de56ee5b-6ad7-457a-884f-055bbe7257ad.tar.gz, 1.5.1
2023-12-05 01:02:12,299 Version compatible check: /home/jupyter/Models/select_qmugs_data_curated (2)_model_de56ee5b-6ad7-457a-884f-055bbe7257ad.tar.gz version = "1.5", AMPL version = "1.5"


num_model_tasks is deprecated and its value is ignored.
Standardizing SMILES strings for 6760 compounds.


2023-12-05 01:03:54,463 /home/jupyter/Models/select_qmugs_data_curated (2)_model_5e947e95-3ae9-47df-a2ac-9d29066acc3c.tar.gz, 1.5.1
2023-12-05 01:03:54,468 Version compatible check: /home/jupyter/Models/select_qmugs_data_curated (2)_model_5e947e95-3ae9-47df-a2ac-9d29066acc3c.tar.gz version = "1.5", AMPL version = "1.5"


num_model_tasks is deprecated and its value is ignored.
['/var/tmp/tmpyy1xvz8y/best_model/checkpoint1.pt']
/var/tmp/tmpyy1xvz8y/best_model/checkpoint1.pt


Unnamed: 0,Model,Tested R squared value
1,random split,0.943133
2,random split with ECFP,0.828233
3,fingerprint,0.879586
4,fingerprint with ECFP,0.534854
5,scaffold,0.92315
6,scaffold with ECFP,0.803102
