In [11]:
# Remove entries for model MultilinearRegression and descriptor RDKit_PhysChem for all seeds,
# so that re-running the training cell does not insert duplicate rows.
import psycopg2  # local import: this cell sits above the notebook's import cell

query = """
DELETE FROM cs_mdfps_schema.model_descriptor_results
WHERE descriptor=%s AND model=%s
"""
# A dedicated connection is used (and closed) here so that an open `conn`
# created by another cell is neither replaced nor leaked by this cleanup.
cleanup_conn = psycopg2.connect("dbname=cs_mdfps user=cschiebroek host=lebanon")
try:
    # `with cleanup_conn` commits on success and rolls back if the DELETE raises;
    # `with ... cursor()` closes the cursor. Neither closes the connection itself.
    with cleanup_conn, cleanup_conn.cursor() as cur:
        cur.execute(query, ('RDKit_PhysChem', 'MultilinearRegression'))
finally:
    cleanup_conn.close()


In [12]:
import psycopg2
import pandas as pd
import logging
import warnings
from utils.data_preprocessing import prepare_data, preprocess_data, get_features
from descriptors.rdkit_physchem_decriptors import calculate_RDKit_PhysChem_descriptors
from descriptors.mdfp import extract_mdfp_features
from models.linear_regression import MultilinearRegressionModel


# Silence FutureWarning noise and emit timestamped INFO-level log lines.
warnings.filterwarnings("ignore", category=FutureWarning)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Registry of available models: model name -> model class.
model_classes = {'MultilinearRegression': MultilinearRegressionModel}

# Registry of descriptor calculators: descriptor name -> function.
descriptor_functions = dict(
    RDKit_PhysChem=calculate_RDKit_PhysChem_descriptors,
    MDFP=extract_mdfp_features,
)

# Descriptor sets and models evaluated by the training loop.
descriptors_to_use = [
    'RDKit_PhysChem',
    'MDFP',
    'Counts',
    "NumHeavyAtoms",
    "Counts_simple",
]
models_to_evaluate = ['MultilinearRegression']

# Open the database connection and load the working dataframe through it.
conn = psycopg2.connect("dbname=cs_mdfps user=cschiebroek host=lebanon")
df = prepare_data(conn, descriptors_to_use)
def store_results_in_db(conn, descriptor, model_name, seed, molregno, y_true, y_pred):
    """Insert one result row into cs_mdfps_schema.model_descriptor_results.

    Args:
        conn: Open DB-API connection (psycopg2 in this notebook).
        descriptor: Descriptor name stored in the ``descriptor`` column.
        model_name: Model name stored in the ``model`` column.
        seed: Split/seed identifier stored in the ``seed`` column.
        molregno: Value bound to the ``molregno`` column.
        y_true: Value bound to the ``y_true`` column.
        y_pred: Value bound to the ``y_pred`` column.
            NOTE(review): the caller in this notebook passes whole lists for
            the last three arguments, so the columns are presumably array
            typed -- confirm against the table definition.

    Raises:
        Re-raises any exception from ``execute``/``commit`` after rolling the
        transaction back, so the shared connection stays usable.
    """
    query = """
    INSERT INTO cs_mdfps_schema.model_descriptor_results (descriptor, model, seed, molregno, y_true, y_pred)
    VALUES (%s, %s, %s, %s, %s, %s)
    """
    cur = conn.cursor()
    try:
        cur.execute(query, (descriptor, model_name, seed, molregno, y_true, y_pred))
        conn.commit()
    except Exception:
        # Without a rollback the connection stays in a failed transaction and
        # every later statement on it would error out as well.
        conn.rollback()
        raise
    finally:
        # Always release the cursor, including on the error path.
        cur.close()
# Number of train/validation splits to run; the split index is also used as the stored seed.
n_splits = 10
# Descriptors whose per-split predictions are written to the results table.
descriptors_to_store = {'RDKit_PhysChem'}
# Flag forwarded unchanged to get_features for every descriptor.
scale = False

# One list of per-split prediction arrays for every (descriptor, model) pair.
predictions = {(descriptor, model_name): [] for descriptor in descriptors_to_use for model_name in models_to_evaluate}
y_list, molregno_list = [], []

for i in range(n_splits):
    logging.info(f"Training and evaluating split {i+1}... \n \n \n")
    _, val_molregnos, train_y, val_y, df_train, df_val = preprocess_data(df, i)

    # Ground truth and identifiers depend only on the split, so fix them here
    # instead of relying on variables leaking out of the inner loops.
    y_true = val_y.tolist()
    molregno = val_molregnos

    for descriptor in descriptors_to_use:
        logging.info(f"Extracting features for descriptor: {descriptor}")
        train_X, val_X = get_features(df_train=df_train, df_val=df_val, descriptor_name=descriptor, scale=scale)

        for model_name in models_to_evaluate:
            logging.info(f"Training and evaluating model: {model_name} with descriptor: {descriptor}")

            model_class = model_classes[model_name]
            model = model_class()
            model.train(train_X, train_y)
            y_pred = model.predict(val_X)

            # Validate BEFORE persisting so a length mismatch can never reach the database.
            assert len(y_pred) == len(y_true)

            if descriptor in descriptors_to_store:
                store_results_in_db(conn, descriptor, model_name, i, val_molregnos, y_true, y_pred.tolist())

            predictions[(descriptor, model_name)].append(y_pred)

    y_list.append(y_true)
    molregno_list.append(molregno)

# Flatten the results into parallel lists, one entry per (descriptor, model) pair.
# Every entry shares the same y_list / molregno_list objects because the
# validation sets do not depend on the descriptor or the model.
combined_titles = []
combined_preds = []
combined_reals = []
combined_molregnos = []

for descriptor in descriptors_to_use:
    for model_name in models_to_evaluate:
        combined_titles.append(f"{model_name} ({descriptor})")
        combined_preds.append(predictions[(descriptor, model_name)])
        combined_reals.append(y_list)
        combined_molregnos.append(molregno_list)


2024-11-12 16:19:27,781 - INFO - Loading data from the database...


2024-11-12 16:19:28,133 - INFO - Calculating descriptors...
2024-11-12 16:19:28,133 - INFO - Fetching RDKit descriptors from the database...
2024-11-12 16:19:29,029 - INFO - Merging RDKit descriptors with the main dataframe...
2024-11-12 16:19:29,441 - INFO - Index(['molregno', 'conf_id', 'vp_log10_pa', 'mdfp', 'molblock',
       'md_experiment_uuid', 'confgen_uuid', 'Ipc', 'qed', 'Chi0',
       ...
       'water_intra_ene_median', 'water_total_ene_mean', 'water_total_ene_std',
       'water_total_ene_median', 'water_rgyr_mean', 'water_rgyr_std',
       'water_rgyr_median', 'water_sasa_mean', 'water_sasa_std',
       'water_sasa_median'],
      dtype='object', length=257)
2024-11-12 16:19:29,443 - INFO - Data loaded and descriptors calculated, dropping NaNs...
2024-11-12 16:19:29,446 - INFO - Dropped 0 rows due to missing values.
2024-11-12 16:19:29,448 - INFO - Training and evaluating split 1... 
 
 

2024-11-12 16:19:29,449 - INFO - Splitting data into training and testing sets...
20

In [2]:
from utils.visualization import density_plots

# Plotting call covering every descriptor/model combination; currently disabled (commented out).
# density_plots(reals_list=combined_reals, predictions_list=combined_preds, molregnos_list=combined_molregnos,
#                 print_stats=True, bounds=None, title=combined_titles,
#                 name="Simple", dims=(3,2), thresholds=1)

In [3]:
# Display the panel titles; their positions match combined_preds / combined_reals / combined_molregnos.
combined_titles

['MultilinearRegression (RDKit_PhysChem)',
 'MultilinearRegression (MDFP)',
 'MultilinearRegression (Counts)',
 'MultilinearRegression (NumHeavyAtoms)',
 'MultilinearRegression (Counts_simple)']

In [4]:


# Select and reorder the panels to plot: first NumHeavyAtoms, then Counts_simple,
# then RDKit_PhysChem. The MDFP and Counts entries are not selected.
# Each pair is (descriptor name used in combined_titles, label shown in the plot title).
panel_labels = [
    ("NumHeavyAtoms", "NumHeavyAtoms"),
    ("Counts_simple", "Counts"),
    ("RDKit_PhysChem", "All"),
]
source_model_name = "MultilinearRegression"
display_model_name = "Ordinary Least Squares"

# Look the panels up by title instead of hard-coded positions, so reordering
# descriptors_to_use cannot silently select the wrong results. A missing
# title raises ValueError here rather than plotting mismatched data.
panel_positions = [
    combined_titles.index(f"{source_model_name} ({descriptor_key})")
    for descriptor_key, _ in panel_labels
]

# Titles use the display names: model shown as Ordinary Least Squares,
# Counts_simple shown as Counts, RDKit_PhysChem shown as All.
combined_titles_no_mdfp = [f"{display_model_name} ({label})" for _, label in panel_labels]
combined_reals_no_mdfp = [combined_reals[pos] for pos in panel_positions]
combined_preds_no_mdfp = [combined_preds[pos] for pos in panel_positions]
combined_molregnos_no_mdfp = [combined_molregnos[pos] for pos in panel_positions]


# density_plots(reals_list=combined_reals_no_mdfp, predictions_list=combined_preds_no_mdfp, molregnos_list=combined_molregnos_no_mdfp,
#                 print_stats=True, bounds=None, title=combined_titles_no_mdfp,
#                 name="Simple", dims=(1,3), thresholds=1)


In [5]:
# Pairwise correlation matrix over the float64/int64 columns of df.
numeric_dtypes = ['float64', 'int64']
df_numeric = df.select_dtypes(include=numeric_dtypes)
corr = df_numeric.corr()
# corr.style.background_gradient(cmap='coolwarm')


In [10]:
# Restrict to the columns whose correlation with vp_log10_pa is larger than 0.3 in magnitude,
# then list them from strongest to weakest.
corr_vp = corr["vp_log10_pa"]
corr_vp = corr_vp[corr_vp.abs() > 0.3]
absolute_corr_vp = corr_vp.abs().sort_values(ascending=False)
# NOTE(review): the two strongest entries are skipped. The first is presumably
# vp_log10_pa itself (self-correlation of 1.0) -- confirm what the second one is.
for factor in absolute_corr_vp.index[2:]:
    print(f"Correlation between vp_log10_pa and {factor}: {corr_vp[factor]}")



Correlation between vp_log10_pa and MolMR: -0.848489494985087
Correlation between vp_log10_pa and LabuteASA: -0.8355685873902197
Correlation between vp_log10_pa and Chi1: -0.8306929717690472
Correlation between vp_log10_pa and NumHeavyAtoms: -0.8100781236883946
Correlation between vp_log10_pa and HeavyAtomCount: -0.8100781236883946
Correlation between vp_log10_pa and Chi0v: -0.805276382784739
Correlation between vp_log10_pa and AvgIpc: -0.7905094889563636
Correlation between vp_log10_pa and BertzCT: -0.7804293005952412
Correlation between vp_log10_pa and Chi1v: -0.7777188689461293
Correlation between vp_log10_pa and water_sasa_mean: -0.7737424695193129
Correlation between vp_log10_pa and water_sasa_median: -0.771180628248295
Correlation between vp_log10_pa and Chi0: -0.7678843156220447
Correlation between vp_log10_pa and water_rgyr_mean: -0.7542551042445687
Correlation between vp_log10_pa and ExactMolWt: -0.7491918666478078
Correlation between vp_log10_pa and NumValenceElectrons: -0.74