In [1]:
import math
from collections import Counter

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error,r2_score,mean_absolute_error
from sklearn.model_selection import train_test_split, ParameterGrid, cross_val_score, cross_val_predict, LeaveOneOut
from sklearn.model_selection import cross_validate
from tqdm import tqdm

%matplotlib inline

# Seed passed to every train_test_split and RandomForestRegressor in this notebook.
random_seed = 42

# Prepare data

In [2]:
# Load the three columns used in this notebook and parse each SMILES into an RDKit Mol.
df = pd.read_csv('../data/crc.csv', sep=';')[['name', 'SMILES', 'boiling_point']]
df['Mol'] = df.SMILES.apply(Chem.MolFromSmiles)

# MolFromSmiles returns None for SMILES it cannot parse; fail here with the offending
# rows instead of letting the fingerprint cells crash later on a None molecule.
unparsed_smiles = df.loc[df['Mol'].isna(), 'SMILES']
if not unparsed_smiles.empty:
    raise ValueError(
        f"Could not parse {len(unparsed_smiles)} SMILES string(s), e.g. {unparsed_smiles.tolist()[:10]}"
    )

In [3]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,name,SMILES,boiling_point,Mol
0,Butane,CCCC,-0.5,<rdkit.Chem.rdchem.Mol object at 0x00000186A43...
1,Dodecane,CCCCCCCCCCCC,216.3,<rdkit.Chem.rdchem.Mol object at 0x00000186A43...
2,Propane,CCC,-42.1,<rdkit.Chem.rdchem.Mol object at 0x00000186A43...
3,Ethane,CC,-88.6,<rdkit.Chem.rdchem.Mol object at 0x00000186A43...
4,5-Butyldocosane,CCCCCCCCCCCCCCCCCC(CCCC)CCCC,244.0,<rdkit.Chem.rdchem.Mol object at 0x00000186A43...


In [4]:
def _morgan_generator(radius, fp_size):
    """Return an RDKit Morgan fingerprint generator with the given radius and fpSize."""
    return rdFingerprintGenerator.GetMorganGenerator(fpSize=fp_size, radius=radius)


# Radius sweep at a fixed fpSize of 1024.
MFPGEN_0_1024 = _morgan_generator(0, 1024)
MFPGEN_1_1024 = _morgan_generator(1, 1024)
MFPGEN_2_1024 = _morgan_generator(2, 1024)
MFPGEN_3_1024 = _morgan_generator(3, 1024)

# fpSize sweep at a fixed radius of 0.
MFPGEN_0_256 = _morgan_generator(0, 256)
MFPGEN_0_512 = _morgan_generator(0, 512)
MFPGEN_0_2048 = _morgan_generator(0, 2048)
MFPGEN_0_4096 = _morgan_generator(0, 4096)

In [5]:
# Binary fingerprint, computed only for the radius-2 / 1024 generator.
df['FP_bin_2_1024'] = df.Mol.apply(MFPGEN_2_1024.GetFingerprintAsNumPy)

# Count fingerprints: one column per generator. The dict order is the column order.
_count_fp_generators = {
    'FP_count_2_1024': MFPGEN_2_1024,
    'FP_count_0_1024': MFPGEN_0_1024,
    'FP_count_1_1024': MFPGEN_1_1024,
    'FP_count_3_1024': MFPGEN_3_1024,
    'FP_count_0_256': MFPGEN_0_256,
    'FP_count_0_512': MFPGEN_0_512,
    'FP_count_0_2048': MFPGEN_0_2048,
    'FP_count_0_4096': MFPGEN_0_4096,
}
for _fp_column, _fp_generator in _count_fp_generators.items():
    df[_fp_column] = df.Mol.apply(_fp_generator.GetCountFingerprintAsNumPy)

In [6]:
# Check that the fingerprint columns were added.
df.head(n=2)

Unnamed: 0,name,SMILES,boiling_point,Mol,FP_bin_2_1024,FP_count_2_1024,FP_count_0_1024,FP_count_1_1024,FP_count_3_1024,FP_count_0_256,FP_count_0_512,FP_count_0_2048,FP_count_0_4096
0,Butane,CCCC,-0.5,<rdkit.Chem.rdchem.Mol object at 0x00000186A43...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Dodecane,CCCCCCCCCCCC,216.3,<rdkit.Chem.rdchem.Mol object at 0x00000186A43...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [10]:
# Same preview as above, one more row.
df.head(n=3)

Unnamed: 0,name,SMILES,boiling_point,Mol,FP_bin_2_1024,FP_count_2_1024,FP_count_0_1024,FP_count_1_1024,FP_count_3_1024,FP_count_0_256,FP_count_0_512,FP_count_0_2048,FP_count_0_4096
0,Butane,CCCC,-0.5,<rdkit.Chem.rdchem.Mol object at 0x00000186A43...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Dodecane,CCCCCCCCCCCC,216.3,<rdkit.Chem.rdchem.Mol object at 0x00000186A43...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Propane,CCC,-42.1,<rdkit.Chem.rdchem.Mol object at 0x00000186A43...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [11]:
# Per molecule, how many positions of the radius-1 / 1024 count fingerprint are nonzero,
# and the largest such number over the whole table.
nonzero_counts = df['FP_count_1_1024'].map(np.count_nonzero)
max_nonzero = nonzero_counts.max()

print(f"Max non zero counts: {max_nonzero}")

Max non zero counts: 12


# Random forest on binary fingerprints (counts not included)

In [12]:
# Feature matrix: one row per molecule, stacked from the per-row binary fingerprint arrays.
X = np.stack(df['FP_bin_2_1024'].to_numpy())
# Regression target.
y = df['boiling_point'].to_numpy()

In [13]:
# Hold out 20% of the molecules; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_seed,
)

In [14]:
# Forest with default hyperparameters (only the seed is set), fitted on the binary fingerprints.
# fit() returns the estimator itself, so the bare name below displays the same object.
rf_bin = RandomForestRegressor(random_state=random_seed).fit(X_train, y_train)
rf_bin

In [15]:
def get_metrics(y_true_train, y_pred_train, y_true_val, y_pred_val):
    """Compute RMSE, MAE and R2 on the training and validation splits.

    Returns a 6-tuple ordered as
    (RMSE train, RMSE val, MAE train, MAE val, R2 train, R2 val).
    """
    splits = ((y_true_train, y_pred_train), (y_true_val, y_pred_val))
    # Outer loop over metrics, inner loop over splits, giving the order documented above.
    return tuple(
        metric(y_true, y_pred)
        for metric in (root_mean_squared_error, mean_absolute_error, r2_score)
        for y_true, y_pred in splits
    )
def print_metrics(y_true_train, y_pred_train, y_true_val, y_pred_val):
    """Print RMSE, MAE and R2 for the training and validation splits, one line each."""
    report = (
        ("RMSE train: ", root_mean_squared_error, y_true_train, y_pred_train),
        ("RMSE validation: ", root_mean_squared_error, y_true_val, y_pred_val),
        ("MAE train: ", mean_absolute_error, y_true_train, y_pred_train),
        ("MAE validation: ", mean_absolute_error, y_true_val, y_pred_val),
        ("R2 score train: ", r2_score, y_true_train, y_pred_train),
        ("R2 score validation: ", r2_score, y_true_val, y_pred_val),
    )
    print("Metrics:")
    # Each score is computed right before its line is printed.
    for label, metric, y_true, y_pred in report:
        print(label, metric(y_true, y_pred))

In [16]:
# Predict once per split and reuse the arrays for both the printed report and the
# stored tuple (previously each predict() was run twice).
pred_train = rf_bin.predict(X_train)
pred_test = rf_bin.predict(X_test)
print_metrics(y_train, pred_train, y_test, pred_test)
metrics_bin_2_1024 = get_metrics(y_train, pred_train, y_test, pred_test)

Metrics:
RMSE train:  65.51773039658003
RMSE validation:  82.86325939543806
MAE train:  35.137963403880065
MAE validation:  53.78188946208115
R2 score train:  0.729134958166471
R2 score validation:  0.3435791460468919


# Random forest on count fingerprints (counts included)

### r=2 fpSize=1024

In [17]:
# Count fingerprints (radius 2, fpSize 1024) with the same 80/20 seeded split as the binary model.
X = np.stack(df['FP_count_2_1024'].to_numpy())
y = df['boiling_point'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_seed
)

In [18]:
# Forest with default hyperparameters on the count fingerprints.
# fit() returns the estimator, so the bare name displays the same object.
rf_count = RandomForestRegressor(random_state=random_seed).fit(X_train, y_train)
rf_count

In [19]:
# Predict once per split and reuse the arrays for both the printed report and the
# stored tuple (previously each predict() was run twice).
pred_train = rf_count.predict(X_train)
pred_test = rf_count.predict(X_test)
print_metrics(y_train, pred_train, y_test, pred_test)
metrics_counts_2_1024 = get_metrics(y_train, pred_train, y_test, pred_test)

Metrics:
RMSE train:  13.397715633561482
RMSE validation:  28.043722370414542
MAE train:  7.487166666666659
MAE validation:  20.99853333333333
R2 score train:  0.9886734725926636
R2 score validation:  0.9248152666922039


### r=0 fpSize=1024

In [20]:
# Count fingerprints, radius 0, fpSize 1024: seeded 80/20 split, default forest.
X = np.stack(df['FP_count_0_1024'].values)
y = df['boiling_point'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)
rf_count = RandomForestRegressor(random_state=random_seed).fit(X_train, y_train)

# Predict once per split; the report and the stored tuple share these arrays
# (previously each predict() was run twice).
pred_train = rf_count.predict(X_train)
pred_test = rf_count.predict(X_test)
print_metrics(y_train, pred_train, y_test, pred_test)
metrics_counts_0_1024 = get_metrics(y_train, pred_train, y_test, pred_test)

Metrics:
RMSE train:  12.672255043594687
RMSE validation:  18.37180392186347
MAE train:  7.029889640272277
MAE validation:  13.055789266071622
R2 score train:  0.9898668824928819
R2 score validation:  0.9677327503766221


### r=1 fpSize=1024

In [21]:
# Count fingerprints, radius 1, fpSize 1024: seeded 80/20 split, default forest.
X = np.stack(df['FP_count_1_1024'].values)
y = df['boiling_point'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)
rf_count = RandomForestRegressor(random_state=random_seed).fit(X_train, y_train)

# Predict once per split; the report and the stored tuple share these arrays
# (previously each predict() was run twice).
pred_train = rf_count.predict(X_train)
pred_test = rf_count.predict(X_test)
print_metrics(y_train, pred_train, y_test, pred_test)
metrics_counts_1_1024 = get_metrics(y_train, pred_train, y_test, pred_test)

Metrics:
RMSE train:  12.518265421182143
RMSE validation:  26.3341246321968
MAE train:  6.843362345679002
MAE validation:  19.816966666666666
R2 score train:  0.9901116556917684
R2 score validation:  0.9337026577180043


### r=3 fpSize=1024

In [22]:
# Count fingerprints, radius 3, fpSize 1024: seeded 80/20 split, default forest.
X = np.stack(df['FP_count_3_1024'].values)
y = df['boiling_point'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)
rf_count = RandomForestRegressor(random_state=random_seed).fit(X_train, y_train)

# Predict once per split; the report and the stored tuple share these arrays
# (previously each predict() was run twice).
pred_train = rf_count.predict(X_train)
pred_test = rf_count.predict(X_test)
print_metrics(y_train, pred_train, y_test, pred_test)
metrics_counts_3_1024 = get_metrics(y_train, pred_train, y_test, pred_test)

Metrics:
RMSE train:  13.321490570332696
RMSE validation:  28.883975502080986
MAE train:  7.44134166666666
MAE validation:  20.943085185185183
R2 score train:  0.9888019884192324
R2 score validation:  0.920242362453985


### r=0 fpSize=256

In [23]:
# Count fingerprints, radius 0, fpSize 256: seeded 80/20 split, default forest.
X = np.stack(df['FP_count_0_256'].values)
y = df['boiling_point'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)
rf_count = RandomForestRegressor(random_state=random_seed).fit(X_train, y_train)

# Predict once per split; the report and the stored tuple share these arrays
# (previously each predict() was run twice).
pred_train = rf_count.predict(X_train)
pred_test = rf_count.predict(X_test)
print_metrics(y_train, pred_train, y_test, pred_test)
metrics_counts_0_256 = get_metrics(y_train, pred_train, y_test, pred_test)

Metrics:
RMSE train:  12.681084416834661
RMSE validation:  18.272976602753566
MAE train:  7.062813628756561
MAE validation:  12.932472295085695
R2 score train:  0.9898527571071531
R2 score validation:  0.9680789666559968


### r=0 fpSize=512

In [24]:
# Count fingerprints, radius 0, fpSize 512: seeded 80/20 split, default forest.
X = np.stack(df['FP_count_0_512'].values)
y = df['boiling_point'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)
rf_count = RandomForestRegressor(random_state=random_seed).fit(X_train, y_train)

# Predict once per split; the report and the stored tuple share these arrays
# (previously each predict() was run twice).
pred_train = rf_count.predict(X_train)
pred_test = rf_count.predict(X_test)
print_metrics(y_train, pred_train, y_test, pred_test)
metrics_counts_0_512 = get_metrics(y_train, pred_train, y_test, pred_test)

Metrics:
RMSE train:  12.622311354666733
RMSE validation:  18.003366029612998
MAE train:  7.021157306351052
MAE validation:  12.698444821627175
R2 score train:  0.9899465980572112
R2 score validation:  0.969013982048426


### r=0 fpSize=2048

In [25]:
# Count fingerprints, radius 0, fpSize 2048: seeded 80/20 split, default forest.
X = np.stack(df['FP_count_0_2048'].values)
y = df['boiling_point'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)
rf_count = RandomForestRegressor(random_state=random_seed).fit(X_train, y_train)

# Predict once per split; the report and the stored tuple share these arrays
# (previously each predict() was run twice).
pred_train = rf_count.predict(X_train)
pred_test = rf_count.predict(X_test)
print_metrics(y_train, pred_train, y_test, pred_test)
metrics_counts_0_2048 = get_metrics(y_train, pred_train, y_test, pred_test)

Metrics:
RMSE train:  12.690748334559917
RMSE validation:  18.393827831366693
MAE train:  7.051404986781251
MAE validation:  13.010531142822321
R2 score train:  0.9898372853253246
R2 score validation:  0.9676553407821364


### r=0 fpSize=4096

In [26]:
# Count fingerprints, radius 0, fpSize 4096: seeded 80/20 split, default forest.
X = np.stack(df['FP_count_0_4096'].values)
y = df['boiling_point'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)
rf_count = RandomForestRegressor(random_state=random_seed).fit(X_train, y_train)

# Predict once per split; the report and the stored tuple share these arrays
# (previously each predict() was run twice).
pred_train = rf_count.predict(X_train)
pred_test = rf_count.predict(X_test)
print_metrics(y_train, pred_train, y_test, pred_test)
metrics_counts_0_4096 = get_metrics(y_train, pred_train, y_test, pred_test)

Metrics:
RMSE train:  12.641899845796601
RMSE validation:  18.159504623881872
MAE train:  6.99382540859195
MAE validation:  12.601388021129196
R2 score train:  0.9899153702135528
R2 score validation:  0.9684741837532381


# Comparison

In [27]:
# Table columns, in the same order as the tuple returned by get_metrics:
# (RMSE train, RMSE val, MAE train, MAE val, R2 train, R2 val).
_metric_columns = ['RMSE train', 'RMSE test', 'MAE train', 'MAE test', 'R2 train', 'R2 test']

# One (label, six metrics) entry per model, in display order.
# The VE1 and log(VE1) rows are literal numbers that are not computed in this notebook.
# NOTE(review): presumably copied from a separate VE1 analysis; confirm the source.
_model_rows = [
    ('VE1', (24.25060072083824, 17.245949894319452,
             16.445572429282834, 13.197447754485966,
             0.965730268026368, 0.9679490172542323)),
    ('log(VE1)', (15.979987328026494, 16.266153840585194,
                  10.36957440697571, 11.176508823638748,
                  0.9851194642807888, 0.9714873973881354)),
    ('Binary r=2 fpSize=1024', metrics_bin_2_1024),
    ('Counts r=1 fpSize=1024', metrics_counts_1_1024),
    ('Counts r=2 fpSize=1024', metrics_counts_2_1024),
    ('Counts r=3 fpSize=1024', metrics_counts_3_1024),
    ('Counts r=0 fpSize=256', metrics_counts_0_256),
    ('Counts r=0 fpSize=512', metrics_counts_0_512),
    ('Counts r=0 fpSize=1024', metrics_counts_0_1024),
    ('Counts r=0 fpSize=2048', metrics_counts_0_2048),
    ('Counts r=0 fpSize=4096', metrics_counts_0_4096),
]

# Pivot the rows into the column-oriented dict that DataFrame expects.
data = {'Model': [row_label for row_label, row_values in _model_rows]}
for _position, _column in enumerate(_metric_columns):
    data[_column] = [row_values[_position] for row_label, row_values in _model_rows]

df_table = pd.DataFrame(data)

def highlight_best_worst(s):
    """Return per-cell CSS that highlights the best value of one metric column.

    Meant to be passed to ``DataFrame.style.apply`` (column-wise). "Best" is the
    minimum for columns whose name contains ``RMSE`` or ``MAE`` and the maximum for
    columns whose name contains ``R2``. Despite the function name, only the best
    value is marked; nothing is done for the worst one.

    Parameters
    ----------
    s : pandas.Series
        One column of the table; ``s.name`` selects which direction is "best".

    Returns
    -------
    list of str
        One CSS string per element of ``s`` (empty string means no styling).
        A column whose name matches none of the metrics gets no highlighting.
    """
    column_name = str(s.name)
    if 'RMSE' in column_name or 'MAE' in column_name:
        best_value = s.min()  # error metrics: lower is better
    elif 'R2' in column_name:
        best_value = s.max()  # R2: higher is better
    else:
        # Previously no branch bound the mask for such a column, which raised
        # UnboundLocalError; leave unknown columns unstyled instead.
        return [''] * len(s)

    return ['background-color: #228B22; color: white' if is_best else '' for is_best in s == best_value]

# Apply the highlighter to every metric column, i.e. everything except the model label.
_highlight_columns = [column for column in df_table.columns if column != 'Model']
styled_df = df_table.style.apply(highlight_best_worst, subset=_highlight_columns)
styled_df


Unnamed: 0,Model,RMSE train,RMSE test,MAE train,MAE test,R2 train,R2 test
0,VE1,24.250601,17.24595,16.445572,13.197448,0.96573,0.967949
1,log(VE1),15.979987,16.266154,10.369574,11.176509,0.985119,0.971487
2,Binary r=2 fpSize=1024,65.51773,82.863259,35.137963,53.781889,0.729135,0.343579
3,Counts r=1 fpSize=1024,12.518265,26.334125,6.843362,19.816967,0.990112,0.933703
4,Counts r=2 fpSize=1024,13.397716,28.043722,7.487167,20.998533,0.988673,0.924815
5,Counts r=3 fpSize=1024,13.321491,28.883976,7.441342,20.943085,0.988802,0.920242
6,Counts r=0 fpSize=256,12.681084,18.272977,7.062814,12.932472,0.989853,0.968079
7,Counts r=0 fpSize=512,12.622311,18.003366,7.021157,12.698445,0.989947,0.969014
8,Counts r=0 fpSize=1024,12.672255,18.371804,7.02989,13.055789,0.989867,0.967733
9,Counts r=0 fpSize=2048,12.690748,18.393828,7.051405,13.010531,0.989837,0.967655


# Rerun with cross validation

In [28]:
# Grid for the cross-validated sweep: Morgan radii 0-3 and fpSize 256-4096 (powers of two).
r_list = list(range(4))
fpSize_list = [2 ** exponent for exponent in range(8, 13)]

In [30]:
# 10-fold cross-validated MAE / RMSE / R2 of a 20-tree random forest on Morgan count
# fingerprints, for every (radius, fpSize) combination in the grid.
# Result: metrics_dict[(r, fpSize)] -> mean and std of each score over the folds.
metrics_dict = {}

# The target does not depend on the fingerprint settings, so build it once.
y = df['boiling_point'].values

# All three scorers are evaluated on the same fitted fold models in a single
# cross_validate pass, instead of refitting the forests once per metric.
cv_scoring = ('neg_mean_absolute_error', 'neg_root_mean_squared_error', 'r2')

for r in r_list:
    for fpSize in fpSize_list:
        print('r = ', r, ' ', end='')
        print('fpSize = ', fpSize)

        MFPGEN = rdFingerprintGenerator.GetMorganGenerator(fpSize=fpSize,radius=r)
        X = np.stack(df.Mol.apply(MFPGEN.GetCountFingerprintAsNumPy).values)

        model = RandomForestRegressor(n_estimators=20, random_state=random_seed)
        cv_results = cross_validate(model, X, y, cv=10, scoring=cv_scoring)

        # The "neg_*" scorers return negated errors; flip the sign back.
        mae_scores = -cv_results['test_neg_mean_absolute_error']
        rmse_scores = -cv_results['test_neg_root_mean_squared_error']
        r2_scores = cv_results['test_r2']

        metrics_dict[(r, fpSize)] = {'rmse': rmse_scores.mean(), 'rmse_std': rmse_scores.std(),
                                'mae': mae_scores.mean(), 'mae_std': mae_scores.std(),
                                'r2': r2_scores.mean(), 'r2_std': r2_scores.std()}
        

        

r =  0  fpSize =  256
r =  0  fpSize =  512
r =  0  fpSize =  1024
r =  0  fpSize =  2048
r =  0  fpSize =  4096
r =  1  fpSize =  256
r =  1  fpSize =  512
r =  1  fpSize =  1024
r =  1  fpSize =  2048
r =  1  fpSize =  4096
r =  2  fpSize =  256
r =  2  fpSize =  512
r =  2  fpSize =  1024
r =  2  fpSize =  2048
r =  2  fpSize =  4096
r =  3  fpSize =  256
r =  3  fpSize =  512
r =  3  fpSize =  1024
r =  3  fpSize =  2048
r =  3  fpSize =  4096


In [31]:
# One row per (r, fpSize) key of metrics_dict, with the key split into two leading columns.
_cv_summary = pd.DataFrame.from_dict(metrics_dict, orient='index')
_cv_summary.index = pd.MultiIndex.from_tuples(_cv_summary.index, names=['r', 'fpSize'])
df_metrics = _cv_summary.reset_index()

# Save the summary table without the row index.
df_metrics.to_csv("ecfp_ve1_comparison_20.csv", index=False)

# ECFP identifiers

In [9]:
# Per molecule, how many positions of the radius-0 / 256 count fingerprint are nonzero,
# and the largest such number over the whole table.
nonzero_counts = df['FP_count_0_256'].map(np.count_nonzero)
max_nonzero = nonzero_counts.max()

print(f"Max non zero counts: {max_nonzero}")

Max non zero counts: 4


## Comparison with counts model

In [42]:
def count_carbon_types(mol):
    """Count the carbon atoms of ``mol`` by their degree.

    Returns a 4-element list: the number of atoms with symbol 'C' whose
    ``GetDegree()`` is 1, 2, 3 and 4, in that order. Atoms with another symbol,
    and carbons with a degree outside 1-4, are not counted.
    """
    tallies = [0, 0, 0, 0]
    for atom in mol.GetAtoms():
        if atom.GetSymbol() != 'C':
            continue
        degree = atom.GetDegree()
        if 1 <= degree <= 4:
            tallies[degree - 1] += 1
    return tallies

In [43]:
# One row per molecule: columns 0-3 hold the carbon counts for degrees 1-4,
# followed by the regression target.
df_counts = pd.DataFrame(df['Mol'].map(count_carbon_types).tolist(), index=df.index)
df_counts['boiling_point'] = df['boiling_point']
df_counts.head(n=2)

Unnamed: 0,0,1,2,3,boiling_point
0,2,2,0,0,-0.5
1,2,10,0,0,216.3


In [55]:
# Features: the four carbon-degree counts; target: boiling point.
X = df_counts.drop(columns=['boiling_point'])
y = df_counts['boiling_point']

# NOTE(review): this forest uses 100 trees while the fingerprint sweep above uses 20,
# so the two sets of CV scores are not strictly like-for-like; confirm this is intended.
rf = RandomForestRegressor(n_estimators=100, random_state=random_seed)

# One 10-fold pass evaluates all three scorers on the same fitted fold models,
# instead of refitting the forests once per metric. RMSE comes straight from the
# 'neg_root_mean_squared_error' scorer, matching the fingerprint sweep.
cv_results = cross_validate(
    rf, X, y, cv=10,
    scoring=('neg_mean_absolute_error', 'neg_root_mean_squared_error', 'r2'),
)

# The "neg_*" scorers return negated errors; flip the sign back.
mae_scores = -cv_results['test_neg_mean_absolute_error']
rmse_scores = -cv_results['test_neg_root_mean_squared_error']
r2_scores = cv_results['test_r2']

In [56]:
# Fold-averaged scores of the carbon-count model, rounded to two decimals.
_summary_lines = (
    f"Mean MAE: {np.mean(mae_scores):.2f}",
    f"Mean RMSE: {np.mean(rmse_scores):.2f}",
    f"Mean R2: {np.mean(r2_scores):.2f}",
)
for _line in _summary_lines:
    print(_line)

Mean MAE: 19.50
Mean RMSE: 28.09
Mean R2: 0.84
