#  <span style = "color : red"> Train Data Preparation </span>

In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools, AllChem, MACCSkeys, RDConfig
from rdkit.Chem.Pharm2D.SigFactory import SigFactory
from rdkit.Chem.Pharm2D import Generate
import os
from sklearn.svm import SVR
from sklearn.ensemble import  RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import csv

In [3]:
sdf_file = './dataset/Tetrahymena_pyriformis_Work_set_OCHEM.sdf'
suppl = Chem.SDMolSupplier(sdf_file)
molecules = [mol for mol in suppl if mol is not None]
df = PandasTools.LoadSDF(sdf_file)
df.head()

Unnamed: 0,Name,IGC50,CAS,SET,Species,Test duration,ID,ROMol
0,"1,4-Naphthalenedione, 2,3-dichloro-",6.36,117-80-6,ws,Tetrahymena pyriformis,48h,,<rdkit.Chem.rdchem.Mol object at 0x0000016EA56...
1,"Isothiocyanic acid, 1,4-phenylenedi-",6.347,4044-65-9,ws,Tetrahymena pyriformis,48h,,<rdkit.Chem.rdchem.Mol object at 0x0000016EA56...
2,"1,4-Naphthoquinone, 5-hydroxy- (8CI)",6.33,481-39-0,ws,Tetrahymena pyriformis,48h,,<rdkit.Chem.rdchem.Mol object at 0x0000016EA56...
3,"1,2-Naphthalenedione",6.2,524-42-5,ws,Tetrahymena pyriformis,48h,,<rdkit.Chem.rdchem.Mol object at 0x0000016EA56...
4,1-Naphthalenemethyl isothiocyanate,6.056,17112-82-2,ws,Tetrahymena pyriformis,48h,,<rdkit.Chem.rdchem.Mol object at 0x0000016EA56...


In [4]:
df['SMILES'] = df['ROMol'].apply(Chem.MolToSmiles)

In [5]:
df.head()

Unnamed: 0,Name,IGC50,CAS,SET,Species,Test duration,ID,ROMol,SMILES
0,"1,4-Naphthalenedione, 2,3-dichloro-",6.36,117-80-6,ws,Tetrahymena pyriformis,48h,,<rdkit.Chem.rdchem.Mol object at 0x0000016EA56...,O=C1C(Cl)=C(Cl)C(=O)c2ccccc21
1,"Isothiocyanic acid, 1,4-phenylenedi-",6.347,4044-65-9,ws,Tetrahymena pyriformis,48h,,<rdkit.Chem.rdchem.Mol object at 0x0000016EA56...,S=C=Nc1ccc(N=C=S)cc1
2,"1,4-Naphthoquinone, 5-hydroxy- (8CI)",6.33,481-39-0,ws,Tetrahymena pyriformis,48h,,<rdkit.Chem.rdchem.Mol object at 0x0000016EA56...,O=C1C=CC(=O)c2c(O)cccc21
3,"1,2-Naphthalenedione",6.2,524-42-5,ws,Tetrahymena pyriformis,48h,,<rdkit.Chem.rdchem.Mol object at 0x0000016EA56...,O=C1C=Cc2ccccc2C1=O
4,1-Naphthalenemethyl isothiocyanate,6.056,17112-82-2,ws,Tetrahymena pyriformis,48h,,<rdkit.Chem.rdchem.Mol object at 0x0000016EA56...,S=C=NCc1cccc2ccccc12


In [6]:
selected_columns = df[['Name', 'SMILES', 'IGC50']]
output_file = './qsar_regression_training.csv'
selected_columns.to_csv(output_file, index=False)
selected_columns.head()

Unnamed: 0,Name,SMILES,IGC50
0,"1,4-Naphthalenedione, 2,3-dichloro-",O=C1C(Cl)=C(Cl)C(=O)c2ccccc21,6.36
1,"Isothiocyanic acid, 1,4-phenylenedi-",S=C=Nc1ccc(N=C=S)cc1,6.347
2,"1,4-Naphthoquinone, 5-hydroxy- (8CI)",O=C1C=CC(=O)c2c(O)cccc21,6.33
3,"1,2-Naphthalenedione",O=C1C=Cc2ccccc2C1=O,6.2
4,1-Naphthalenemethyl isothiocyanate,S=C=NCc1cccc2ccccc12,6.056


#  <span style = "color : red"> Test Data Preparation </span>

In [7]:
sdf_file = './dataset/Tetrahymena_pyriformis_Test_set_OCHEM.sdf'
suppl = Chem.SDMolSupplier(sdf_file)
molecules = [mol for mol in suppl if mol is not None]
df = PandasTools.LoadSDF(sdf_file)
df.head()

Unnamed: 0,Name,IGC50,CAS,SET,Species,Test duration,ID,ROMol
0,"Phenol, 2,2'-methylenebis(4-chloro-",6.09,97-23-4,ts,Tetrahymena pyriformis,48h,,<rdkit.Chem.rdchem.Mol object at 0x0000016EA56...
1,"Acetic acid, iodo-, ethyl ester",5.921,623-48-3,ts,Tetrahymena pyriformis,48h,,<rdkit.Chem.rdchem.Mol object at 0x0000016EA56...
2,"Benzene, (isothiocyanatomethyl)-",5.74,622-78-6,ts,Tetrahymena pyriformis,48h,,<rdkit.Chem.rdchem.Mol object at 0x0000016EA56...
3,L-alpha-Methylbenzyl isothiocyanate,5.699,24277-43-8,ts,Tetrahymena pyriformis,48h,,<rdkit.Chem.rdchem.Mol object at 0x0000016EA56...
4,"2H-1-Benzopyran-2-one,3-(bromoacetyl)- (9CI)",5.658,29310-88-1,ts,Tetrahymena pyriformis,48h,,<rdkit.Chem.rdchem.Mol object at 0x0000016EA56...


In [8]:
df['SMILES'] = df['ROMol'].apply(Chem.MolToSmiles)
df.head()

Unnamed: 0,Name,IGC50,CAS,SET,Species,Test duration,ID,ROMol,SMILES
0,"Phenol, 2,2'-methylenebis(4-chloro-",6.09,97-23-4,ts,Tetrahymena pyriformis,48h,,<rdkit.Chem.rdchem.Mol object at 0x0000016EA56...,Oc1ccc(Cl)cc1Cc1cc(Cl)ccc1O
1,"Acetic acid, iodo-, ethyl ester",5.921,623-48-3,ts,Tetrahymena pyriformis,48h,,<rdkit.Chem.rdchem.Mol object at 0x0000016EA56...,CCOC(=O)CI
2,"Benzene, (isothiocyanatomethyl)-",5.74,622-78-6,ts,Tetrahymena pyriformis,48h,,<rdkit.Chem.rdchem.Mol object at 0x0000016EA56...,S=C=NCc1ccccc1
3,L-alpha-Methylbenzyl isothiocyanate,5.699,24277-43-8,ts,Tetrahymena pyriformis,48h,,<rdkit.Chem.rdchem.Mol object at 0x0000016EA56...,C[C@H](N=C=S)c1ccccc1
4,"2H-1-Benzopyran-2-one,3-(bromoacetyl)- (9CI)",5.658,29310-88-1,ts,Tetrahymena pyriformis,48h,,<rdkit.Chem.rdchem.Mol object at 0x0000016EA56...,O=C(CBr)c1cc2ccccc2oc1=O


In [9]:
selected_columns = df[['Name','SMILES', 'IGC50']]
output_file = './qsar_regression_testing.csv'
selected_columns.to_csv(output_file, index=False)
selected_columns.head()

Unnamed: 0,Name,SMILES,IGC50
0,"Phenol, 2,2'-methylenebis(4-chloro-",Oc1ccc(Cl)cc1Cc1cc(Cl)ccc1O,6.09
1,"Acetic acid, iodo-, ethyl ester",CCOC(=O)CI,5.921
2,"Benzene, (isothiocyanatomethyl)-",S=C=NCc1ccccc1,5.74
3,L-alpha-Methylbenzyl isothiocyanate,C[C@H](N=C=S)c1ccccc1,5.699
4,"2H-1-Benzopyran-2-one,3-(bromoacetyl)- (9CI)",O=C(CBr)c1cc2ccccc2oc1=O,5.658


In [10]:
df_train = pd.read_csv('qsar_regression_training.csv')
train_smile = df_train['SMILES']
df_test = pd.read_csv('qsar_regression_testing.csv')
test_smile = df_test['SMILES']

#  <span style = "color : red"> Atom Pair Fingerprint </span>

In [11]:
def fingerprint_atom_pair_smiles_list(
        smiles,
        len_fingerprint=2048,
        min_atom_pair_len=1,
        max_atom_pair_len=30,
        from_atoms=0,
        ignore_atoms=0,
        atom_invariants=0,
        bits_per_entry=4,
        Chirality=False,
        use_2D=True,
        conf_Id=-1
):
    fingerprint_rep = [None] * len(smiles)
    for i, sm in enumerate(smiles):
        mol = Chem.MolFromSmiles(sm)
        if mol is not None:
            fingerprint_bin = AllChem.GetHashedAtomPairFingerprintAsBitVect(
                mol, nBits=len_fingerprint, minLength=min_atom_pair_len,
                maxLength=max_atom_pair_len, fromAtoms=from_atoms,
                ignoreAtoms=ignore_atoms, atomInvariants=atom_invariants,
                nBitsPerEntry=bits_per_entry, includeChirality=Chirality,
                use2D=use_2D, confId=conf_Id
            )
            fingerprint_rep[i] = fingerprint_bin.ToBitString()

    return fingerprint_rep


In [12]:
train_input_features = fingerprint_atom_pair_smiles_list(train_smile)
test_input_features = fingerprint_atom_pair_smiles_list(test_smile)

###  <span style = "color : lightgreen"> Making Train and Test Dataframe </span>

In [13]:
input_train_descriptor = []
for binary_string in train_input_features:
    output_list = [int(bit) for bit in binary_string]
    input_train_descriptor.append(output_list)
train_descriptor_frame = pd.DataFrame(input_train_descriptor)
train_descriptor_frame.columns = [f'descriptor_{i+1}' for i in range(2048)]
train_descriptor_frame['IGC50'] = df_train['IGC50']
train_descriptor_frame

Unnamed: 0,descriptor_1,descriptor_2,descriptor_3,descriptor_4,descriptor_5,descriptor_6,descriptor_7,descriptor_8,descriptor_9,descriptor_10,...,descriptor_2040,descriptor_2041,descriptor_2042,descriptor_2043,descriptor_2044,descriptor_2045,descriptor_2046,descriptor_2047,descriptor_2048,IGC50
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,0,6.360
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,6.347
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.330
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.200
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1419,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.764
1420,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.699
1421,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0.678
1422,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.548


In [14]:
input_test_descriptor = []
for binary_string in test_input_features:
    output_list = [int(bit) for bit in binary_string]
    input_test_descriptor.append(output_list)
test_descriptor_frame = pd.DataFrame(input_test_descriptor)
test_descriptor_frame.columns = [f'descriptor_{i+1}' for i in range(2048)]
test_descriptor_frame['IGC50'] = df_test['IGC50']
test_descriptor_frame

Unnamed: 0,descriptor_1,descriptor_2,descriptor_3,descriptor_4,descriptor_5,descriptor_6,descriptor_7,descriptor_8,descriptor_9,descriptor_10,...,descriptor_2040,descriptor_2041,descriptor_2042,descriptor_2043,descriptor_2044,descriptor_2045,descriptor_2046,descriptor_2047,descriptor_2048,IGC50
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,6.090
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.921
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.740
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.699
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,1.069
352,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.028
353,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.910
354,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.723


##  <span style = "color : lightgreen"> Splitting into Train and Validation Set </span>

In [15]:
X = train_descriptor_frame.drop(columns='IGC50')
y = train_descriptor_frame['IGC50']
X_test = test_descriptor_frame.drop(columns='IGC50')
y_test = test_descriptor_frame['IGC50']

split_ratio = 0.2
random_state = 42 
np.random.seed(random_state)

indices = np.arange(len(X))
np.random.shuffle(indices)
split_index = int((1 - split_ratio) * len(X))

train_indices = indices[:split_index]
val_indices = indices[split_index:]

X_train = X.iloc[train_indices, :]
y_train = y.iloc[train_indices]
X_val = X.iloc[val_indices, :]
y_val = y.iloc[val_indices]

validation_set = pd.concat([X_val, y_val], axis=1)
validation_set['index'] = validation_set.index
validation_set.to_csv('validation_set_atom_pair.csv', index=False)

##  <span style = "color : Orange"> Random Forest </span>

###  <span style = "color : lightgreen">  Hyperparameter Tuning to find the set of Best Parameters </span>

In [16]:
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10, 12, 15],
    'bootstrap': [True, False],
    'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    'max_features': ['sqrt', 'log2']
}

results_rf = []

best_params_rf = {}
max_test_r2_rf = float('-inf')

for n_estimators in param_grid_rf['n_estimators']:
    for min_samples_leaf in param_grid_rf['min_samples_leaf']:
        for bootstrap in param_grid_rf['bootstrap']:
            for criterion in param_grid_rf['criterion']:
                for max_features in param_grid_rf['max_features']:
                    rf = RandomForestRegressor(
                        n_estimators=n_estimators,
                        min_samples_leaf=min_samples_leaf,
                        bootstrap=bootstrap,
                        criterion=criterion,
                        max_features=max_features,
                        random_state=42
                    )
                    rf.fit(X_train, y_train)
                    y_pred_train = rf.predict(X_train)
                    y_pred_val = rf.predict(X_val)
                    y_pred_test = rf.predict(X_test)
                    train_score = r2_score(y_train, y_pred_train)
                    val_score = r2_score(y_val, y_pred_val)
                    test_score = r2_score(y_test, y_pred_test)
                    train_mse = mean_squared_error(y_train, y_pred_train)
                    val_mse = mean_squared_error(y_val, y_pred_val)
                    test_mse = mean_squared_error(y_test, y_pred_test)

                    # Store results
                    results_rf.append({
                        'n_estimators': n_estimators,
                        'min_samples_leaf': min_samples_leaf,
                        'bootstrap': bootstrap,
                        'criterion': criterion,
                        'max_features': max_features,
                        'train_r2': train_score,
                        'val_r2': val_score,
                        'test_r2': test_score,
                        'train_mse': train_mse,
                        'val_mse': val_mse,
                        'test_mse': test_mse
                    })

                    if test_score > max_test_r2_rf:
                        max_test_r2_rf = test_score
                        best_params_rf = {
                            'n_estimators': n_estimators,
                            'min_samples_leaf': min_samples_leaf,
                            'bootstrap': bootstrap,
                            'criterion': criterion,
                            'max_features': max_features,
                        }

with open('random_forest_results_atom_pair.csv', 'w', newline='') as csvfile:
    fieldnames = ['n_estimators', 'min_samples_leaf', 'bootstrap', 'criterion', 'max_features',
                  'train_r2', 'val_r2', 'test_r2', 'train_mse', 'val_mse', 'test_mse']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results_rf)

###  <span style = "color : lightgreen"> Model Training with the best parameters || Validation || Model Performance on Test Set </span>

In [17]:
best_rf = RandomForestRegressor(**best_params_rf, random_state=42)
best_rf.fit(X_train, y_train)
y_pred_train_rf = best_rf.predict(X_train)
y_pred_val_rf = best_rf.predict(X_val)
y_pred_test_rf = best_rf.predict(X_test)
train_r2_rf = r2_score(y_train, y_pred_train_rf)
val_r2_rf = r2_score(y_val, y_pred_val_rf)
test_r2_rf = r2_score(y_test, y_pred_test_rf)
train_mse_rf = mean_squared_error(y_train, y_pred_train_rf)
val_mse_rf = mean_squared_error(y_val, y_pred_val_rf)
test_mse_rf = mean_squared_error(y_test, y_pred_test_rf)

print("RandomForestRegressor Results")
print(f"Best Parameters: {best_params_rf}")
print(f"Training R^2: {train_r2_rf}")
print(f"Training MSE: {train_mse_rf}")
print(f"Validation R^2: {val_r2_rf}")
print(f"Validation MSE: {val_mse_rf}")
print(f"Test R^2: {test_r2_rf}")
print(f"Test MSE: {test_mse_rf}")

RandomForestRegressor Results
Best Parameters: {'n_estimators': 200, 'min_samples_leaf': 1, 'bootstrap': False, 'criterion': 'squared_error', 'max_features': 'sqrt'}
Training R^2: 0.9999999898337997
Training MSE: 1.121158911314689e-08
Validation R^2: 0.7091692408837031
Validation MSE: 0.32044211965456115
Test R^2: 0.7025296360165587
Test MSE: 0.328522266308146


##  <span style = "color : Orange"> Support Vector Machine </span>

###  <span style = "color : lightgreen">  Hyperparameter Tuning to find the set of Best Parameters </span>

In [18]:
param_grid_svr = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4]
}

results_svr = []

best_params_svr = {}
max_test_r2_svr = float('-inf')

for C in param_grid_svr['C']:
    for gamma in param_grid_svr['gamma']:
        for kernel in param_grid_svr['kernel']:
            for degree in param_grid_svr['degree']:
                if kernel != 'poly' and degree != 3:
                    continue
                svr = SVR(C=C, gamma=gamma, kernel=kernel, degree=degree)
                svr.fit(X_train, y_train)
                y_pred_train = svr.predict(X_train)
                y_pred_val = svr.predict(X_val)
                y_pred_test = svr.predict(X_test)
                train_score = r2_score(y_train, y_pred_train)
                val_score = r2_score(y_val, y_pred_val)
                test_score = r2_score(y_test, y_pred_test)
                train_mse = mean_squared_error(y_train, y_pred_train)
                val_mse = mean_squared_error(y_val, y_pred_val)
                test_mse = mean_squared_error(y_test, y_pred_test)

                results_svr.append({
                    'C': C,
                    'gamma': gamma,
                    'kernel': kernel,
                    'degree': degree,
                    'train_r2': train_score,
                    'val_r2': val_score,
                    'test_r2': test_score,
                    'train_mse': train_mse,
                    'val_mse': val_mse,
                    'test_mse': test_mse
                })

                if test_score > max_test_r2_svr:
                    max_test_r2_svr = test_score
                    best_params_svr = {
                        'C': C,
                        'gamma': gamma,
                        'kernel': kernel,
                        'degree': degree
                    }

with open('svr_results_atom_pair.csv', 'w', newline='') as csvfile:
    fieldnames = ['C', 'gamma', 'kernel', 'degree', 'train_r2', 'val_r2', 'test_r2', 'train_mse', 'val_mse', 'test_mse']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results_svr)

###  <span style = "color : lightgreen"> Model Training with the best parameters || Validation || Model Performance on Test Set </span>

In [19]:
best_svr = SVR(**best_params_svr)
best_svr.fit(X_train, y_train)
y_pred_train_svr = best_svr.predict(X_train)
y_pred_val_svr = best_svr.predict(X_val)
y_pred_test_svr = best_svr.predict(X_test)
train_r2_svr = r2_score(y_train, y_pred_train_svr)
val_r2_svr = r2_score(y_val, y_pred_val_svr)
test_r2_svr = r2_score(y_test, y_pred_test_svr)
train_mse_svr = mean_squared_error(y_train, y_pred_train_svr)
val_mse_svr = mean_squared_error(y_val, y_pred_val_svr)
test_mse_svr = mean_squared_error(y_test, y_pred_test_svr)

print("\nSVR Results")
print(f"Best Parameters: {best_params_svr}")
print(f"Training R^2: {train_r2_svr}")
print(f"Training MSE: {train_mse_svr}")
print(f"Validation R^2: {val_r2_svr}")
print(f"Validation MSE: {val_mse_svr}")
print(f"Test R^2: {test_r2_svr}")
print(f"Test MSE: {test_mse_svr}")



SVR Results
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf', 'degree': 3}
Training R^2: 0.9902545282348852
Training MSE: 0.01074759711367009
Validation R^2: 0.7361954458097376
Validation MSE: 0.29066420201258997
Test R^2: 0.7359263187571594
Test MSE: 0.2916394193777976


#  <span style = "color : red"> Extended Connectivity Fingerprint </span>

In [20]:
def fingerprint_extended_connectivity_smiles_list(
        smiles,
        radius = 2,
        len_fingerprint=2048,
        from_atoms=0,
        atom_invariants=0,
        Chirality=False,
        Bond_Types=True,
        use_Features=False,
        bitInfo=None,
        RedundantEnvironments=False
):
    fingerprint_rep = [None] * len(smiles)

    for i, sm in enumerate(smiles):
        mol = Chem.MolFromSmiles(sm)
        if mol is not None:
            fingerprint_bin = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=len_fingerprint, 
                                                                invariants=atom_invariants, fromAtoms=from_atoms,
                                                                useChirality=Chirality, useBondTypes=Bond_Types,
                                                                useFeatures=use_Features, bitInfo=bitInfo,
                                                                 includeRedundantEnvironments=RedundantEnvironments ) 
            fingerprint_rep[i] = fingerprint_bin.ToBitString()
            
    return fingerprint_rep


In [21]:
train_input_features = fingerprint_extended_connectivity_smiles_list(train_smile)
test_input_features = fingerprint_extended_connectivity_smiles_list(test_smile)

###  <span style = "color : lightgreen"> Making Train and Test Dataframe </span>

In [22]:
input_train_descriptor = []
for binary_string in train_input_features:
    output_list = [int(bit) for bit in binary_string]
    input_train_descriptor.append(output_list)
train_descriptor_frame = pd.DataFrame(input_train_descriptor)
train_descriptor_frame.columns = [f'descriptor_{i+1}' for i in range(2048)]
train_descriptor_frame['IGC50'] = df_train['IGC50']
train_descriptor_frame

Unnamed: 0,descriptor_1,descriptor_2,descriptor_3,descriptor_4,descriptor_5,descriptor_6,descriptor_7,descriptor_8,descriptor_9,descriptor_10,...,descriptor_2040,descriptor_2041,descriptor_2042,descriptor_2043,descriptor_2044,descriptor_2045,descriptor_2046,descriptor_2047,descriptor_2048,IGC50
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.360
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.347
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.330
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.200
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1419,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.764
1420,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.699
1421,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.678
1422,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.548


In [23]:
input_test_descriptor = []
for binary_string in test_input_features:
    output_list = [int(bit) for bit in binary_string]
    input_test_descriptor.append(output_list)
test_descriptor_frame = pd.DataFrame(input_test_descriptor)
test_descriptor_frame.columns = [f'descriptor_{i+1}' for i in range(2048)]
test_descriptor_frame['IGC50'] = df_test['IGC50']
test_descriptor_frame

Unnamed: 0,descriptor_1,descriptor_2,descriptor_3,descriptor_4,descriptor_5,descriptor_6,descriptor_7,descriptor_8,descriptor_9,descriptor_10,...,descriptor_2040,descriptor_2041,descriptor_2042,descriptor_2043,descriptor_2044,descriptor_2045,descriptor_2046,descriptor_2047,descriptor_2048,IGC50
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.090
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.921
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.740
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.699
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.069
352,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.028
353,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.910
354,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.723


##  <span style = "color : lightgreen"> Splitting into Train and Validation Set </span>

In [24]:
X = train_descriptor_frame.drop(columns='IGC50')
y = train_descriptor_frame['IGC50']
X_test = test_descriptor_frame.drop(columns='IGC50')
y_test = test_descriptor_frame['IGC50']

split_ratio = 0.2
random_state = 42 
np.random.seed(random_state)

indices = np.arange(len(X))
np.random.shuffle(indices)
split_index = int((1 - split_ratio) * len(X))

train_indices = indices[:split_index]
val_indices = indices[split_index:]

X_train = X.iloc[train_indices, :]
y_train = y.iloc[train_indices]
X_val = X.iloc[val_indices, :]
y_val = y.iloc[val_indices]

validation_set = pd.concat([X_val, y_val], axis=1)
validation_set['index'] = validation_set.index
validation_set.to_csv('validation_set_ecfp.csv', index=False)

##  <span style = "color : Orange"> Random Forest </span>

###  <span style = "color : lightgreen">  Hyperparameter Tuning to find the set of Best Parameters </span>

In [25]:
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10, 12, 15],
    'bootstrap': [True, False],
    'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    'max_features': ['sqrt', 'log2']
}

results_rf = []

best_params_rf = {}
max_test_r2_rf = float('-inf')

for n_estimators in param_grid_rf['n_estimators']:
    for min_samples_leaf in param_grid_rf['min_samples_leaf']:
        for bootstrap in param_grid_rf['bootstrap']:
            for criterion in param_grid_rf['criterion']:
                for max_features in param_grid_rf['max_features']:
                    rf = RandomForestRegressor(
                        n_estimators=n_estimators,
                        min_samples_leaf=min_samples_leaf,
                        bootstrap=bootstrap,
                        criterion=criterion,
                        max_features=max_features,
                        random_state=42
                    )
                    rf.fit(X_train, y_train)
                    y_pred_train = rf.predict(X_train)
                    y_pred_val = rf.predict(X_val)
                    y_pred_test = rf.predict(X_test)
                    train_score = r2_score(y_train, y_pred_train)
                    val_score = r2_score(y_val, y_pred_val)
                    test_score = r2_score(y_test, y_pred_test)
                    train_mse = mean_squared_error(y_train, y_pred_train)
                    val_mse = mean_squared_error(y_val, y_pred_val)
                    test_mse = mean_squared_error(y_test, y_pred_test)

                    results_rf.append({
                        'n_estimators': n_estimators,
                        'min_samples_leaf': min_samples_leaf,
                        'bootstrap': bootstrap,
                        'criterion': criterion,
                        'max_features': max_features,
                        'train_r2': train_score,
                        'val_r2': val_score,
                        'test_r2': test_score,
                        'train_mse': train_mse,
                        'val_mse': val_mse,
                        'test_mse': test_mse
                    })

                    if test_score > max_test_r2_rf:
                        max_test_r2_rf = test_score
                        best_params_rf = {
                            'n_estimators': n_estimators,
                            'min_samples_leaf': min_samples_leaf,
                            'bootstrap': bootstrap,
                            'criterion': criterion,
                            'max_features': max_features,
                        }

with open('random_forest_results_ecfp.csv', 'w', newline='') as csvfile:
    fieldnames = ['n_estimators', 'min_samples_leaf', 'bootstrap', 'criterion', 'max_features',
                  'train_r2', 'val_r2', 'test_r2', 'train_mse', 'val_mse', 'test_mse']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results_rf)

###  <span style = "color : lightgreen"> Model Training with the best parameters || Validation || Model Performance on Test Set </span>

In [26]:
best_rf = RandomForestRegressor(**best_params_rf, random_state=42)
best_rf.fit(X_train, y_train)
y_pred_train_rf = best_rf.predict(X_train)
y_pred_val_rf = best_rf.predict(X_val)
y_pred_test_rf = best_rf.predict(X_test)
train_r2_rf = r2_score(y_train, y_pred_train_rf)
val_r2_rf = r2_score(y_val, y_pred_val_rf)
test_r2_rf = r2_score(y_test, y_pred_test_rf)
train_mse_rf = mean_squared_error(y_train, y_pred_train_rf)
val_mse_rf = mean_squared_error(y_val, y_pred_val_rf)
test_mse_rf = mean_squared_error(y_test, y_pred_test_rf)

print("RandomForestRegressor Results")
print(f"Best Parameters: {best_params_rf}")
print(f"Training R^2: {train_r2_rf}")
print(f"Training MSE: {train_mse_rf}")
print(f"Validation R^2: {val_r2_rf}")
print(f"Validation MSE: {val_mse_rf}")
print(f"Test R^2: {test_r2_rf}")
print(f"Test MSE: {test_mse_rf}")

RandomForestRegressor Results
Best Parameters: {'n_estimators': 200, 'min_samples_leaf': 1, 'bootstrap': False, 'criterion': 'friedman_mse', 'max_features': 'sqrt'}
Training R^2: 0.9801217485292654
Training MSE: 0.02192232898323509
Validation R^2: 0.6621496177138035
Validation MSE: 0.37224911475956085
Test R^2: 0.6536367680740236
Test MSE: 0.38251889161123237


##  <span style = "color : Orange"> Support Vector Machine </span>

###  <span style = "color : lightgreen">  Hyperparameter Tuning to find the set of Best Parameters </span>

In [27]:
param_grid_svr = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4]
}

results_svr = []

best_params_svr = {}
max_test_r2_svr = float('-inf')

for C in param_grid_svr['C']:
    for gamma in param_grid_svr['gamma']:
        for kernel in param_grid_svr['kernel']:
            for degree in param_grid_svr['degree']:
                if kernel != 'poly' and degree != 3:
                    continue
                svr = SVR(C=C, gamma=gamma, kernel=kernel, degree=degree)
                svr.fit(X_train, y_train)
                y_pred_train = svr.predict(X_train)
                y_pred_val = svr.predict(X_val)
                y_pred_test = svr.predict(X_test)
                train_score = r2_score(y_train, y_pred_train)
                val_score = r2_score(y_val, y_pred_val)
                test_score = r2_score(y_test, y_pred_test)
                train_mse = mean_squared_error(y_train, y_pred_train)
                val_mse = mean_squared_error(y_val, y_pred_val)
                test_mse = mean_squared_error(y_test, y_pred_test)

                results_svr.append({
                    'C': C,
                    'gamma': gamma,
                    'kernel': kernel,
                    'degree': degree,
                    'train_r2': train_score,
                    'val_r2': val_score,
                    'test_r2': test_score,
                    'train_mse': train_mse,
                    'val_mse': val_mse,
                    'test_mse': test_mse
                })

                if test_score > max_test_r2_svr:
                    max_test_r2_svr = test_score
                    best_params_svr = {
                        'C': C,
                        'gamma': gamma,
                        'kernel': kernel,
                        'degree': degree
                    }

with open('svr_results_ecfp.csv', 'w', newline='') as csvfile:
    fieldnames = ['C', 'gamma', 'kernel', 'degree', 'train_r2', 'val_r2', 'test_r2', 'train_mse', 'val_mse', 'test_mse']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results_svr)


###  <span style = "color : lightgreen"> Model Training with the best parameters || Validation || Model Performance on Test Set </span>

In [28]:
best_svr = SVR(**best_params_svr)
best_svr.fit(X_train, y_train)
y_pred_train_svr = best_svr.predict(X_train)
y_pred_val_svr = best_svr.predict(X_val)
y_pred_test_svr = best_svr.predict(X_test)
train_r2_svr = r2_score(y_train, y_pred_train_svr)
val_r2_svr = r2_score(y_val, y_pred_val_svr)
test_r2_svr = r2_score(y_test, y_pred_test_svr)
train_mse_svr = mean_squared_error(y_train, y_pred_train_svr)
val_mse_svr = mean_squared_error(y_val, y_pred_val_svr)
test_mse_svr = mean_squared_error(y_test, y_pred_test_svr)

print("\nSVR Results")
print(f"Best Parameters: {best_params_svr}")
print(f"Training R^2: {train_r2_svr}")
print(f"Training MSE: {train_mse_svr}")
print(f"Validation R^2: {val_r2_svr}")
print(f"Validation MSE: {val_mse_svr}")
print(f"Test R^2: {test_r2_svr}")
print(f"Test MSE: {test_mse_svr}")


SVR Results
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf', 'degree': 3}
Training R^2: 0.9675888751596717
Training MSE: 0.03574395577560935
Validation R^2: 0.717273417463834
Validation MSE: 0.3115128044429142
Test R^2: 0.6941783009947935
Test MSE: 0.33774536830496094


#  <span style = "color : red"> MACCS Keys Fingerprint </span>

In [29]:
def fingerprint_maccs_keys_smiles_list(
        smiles
):
    fingerprint_rep = [None] * len(smiles)

    for i, sm in enumerate(smiles):
        mol = Chem.MolFromSmiles(sm)
        if mol is not None:
            fingerprint_bin =  MACCSkeys.GenMACCSKeys(mol)
            fingerprint_rep[i] = fingerprint_bin.ToBitString()

    return fingerprint_rep

In [30]:
train_input_features = fingerprint_maccs_keys_smiles_list(train_smile)
test_input_features = fingerprint_maccs_keys_smiles_list(test_smile)

###  <span style = "color : lightgreen"> Making Train and Test Dataframe </span>

In [31]:
input_train_descriptor = []
for binary_string in train_input_features:
    output_list = [int(bit) for bit in binary_string]
    input_train_descriptor.append(output_list)
train_descriptor_frame = pd.DataFrame(input_train_descriptor)
train_descriptor_frame.columns = [f'descriptor_{i+1}' for i in range(167)]
train_descriptor_frame['IGC50'] = df_train['IGC50']
train_descriptor_frame

Unnamed: 0,descriptor_1,descriptor_2,descriptor_3,descriptor_4,descriptor_5,descriptor_6,descriptor_7,descriptor_8,descriptor_9,descriptor_10,...,descriptor_159,descriptor_160,descriptor_161,descriptor_162,descriptor_163,descriptor_164,descriptor_165,descriptor_166,descriptor_167,IGC50
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,1,1,0,6.360
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,1,1,0,1,0,6.347
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,1,1,0,6.330
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,1,1,0,6.200
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,1,1,0,1,0,6.056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1419,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0.764
1420,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,1,0,0,0.699
1421,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,0,1,0,0,0.678
1422,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0.548


In [32]:
input_test_descriptor = []
for binary_string in test_input_features:
    output_list = [int(bit) for bit in binary_string]
    input_test_descriptor.append(output_list)
test_descriptor_frame = pd.DataFrame(input_test_descriptor)
test_descriptor_frame.columns = [f'descriptor_{i+1}' for i in range(167)]
test_descriptor_frame['IGC50'] = df_test['IGC50']
test_descriptor_frame

Unnamed: 0,descriptor_1,descriptor_2,descriptor_3,descriptor_4,descriptor_5,descriptor_6,descriptor_7,descriptor_8,descriptor_9,descriptor_10,...,descriptor_159,descriptor_160,descriptor_161,descriptor_162,descriptor_163,descriptor_164,descriptor_165,descriptor_166,descriptor_167,IGC50
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,1,1,0,6.090
1,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,1,0,0,5.921
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,1,1,0,1,0,5.740
3,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,1,1,0,1,0,5.699
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,1,1,0,5.658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,0,1,0,0,1.069
352,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1.028
353,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,0,1,0,0,0.910
354,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0.723


##  <span style = "color : lightgreen"> Splitting into Train and Validation Set </span>

In [33]:
X = train_descriptor_frame.drop(columns='IGC50')
y = train_descriptor_frame['IGC50']
X_test = test_descriptor_frame.drop(columns='IGC50')
y_test = test_descriptor_frame['IGC50']

split_ratio = 0.2
random_state = 42 
np.random.seed(random_state)

indices = np.arange(len(X))
np.random.shuffle(indices)
split_index = int((1 - split_ratio) * len(X))

train_indices = indices[:split_index]
val_indices = indices[split_index:]

X_train = X.iloc[train_indices, :]
y_train = y.iloc[train_indices]
X_val = X.iloc[val_indices, :]
y_val = y.iloc[val_indices]

validation_set = pd.concat([X_val, y_val], axis=1)
validation_set['index'] = validation_set.index
validation_set.to_csv('validation_set_maccs_keys.csv', index=False)

##  <span style = "color : Orange"> Random Forest </span>

###  <span style = "color : lightgreen">  Hyperparameter Tuning to find the set of Best Parameters </span>

In [34]:
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10, 12, 15],
    'bootstrap': [True, False],
    'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    'max_features': ['sqrt', 'log2']
}

results_rf = []

best_params_rf = {}
max_test_r2_rf = float('-inf')

for n_estimators in param_grid_rf['n_estimators']:
    for min_samples_leaf in param_grid_rf['min_samples_leaf']:
        for bootstrap in param_grid_rf['bootstrap']:
            for criterion in param_grid_rf['criterion']:
                for max_features in param_grid_rf['max_features']:
                    rf = RandomForestRegressor(
                        n_estimators=n_estimators,
                        min_samples_leaf=min_samples_leaf,
                        bootstrap=bootstrap,
                        criterion=criterion,
                        max_features=max_features,
                        random_state=42
                    )
                    rf.fit(X_train, y_train)
                    y_pred_train = rf.predict(X_train)
                    y_pred_val = rf.predict(X_val)
                    y_pred_test = rf.predict(X_test)
                    train_score = r2_score(y_train, y_pred_train)
                    val_score = r2_score(y_val, y_pred_val)
                    test_score = r2_score(y_test, y_pred_test)
                    train_mse = mean_squared_error(y_train, y_pred_train)
                    val_mse = mean_squared_error(y_val, y_pred_val)
                    test_mse = mean_squared_error(y_test, y_pred_test)

                    results_rf.append({
                        'n_estimators': n_estimators,
                        'min_samples_leaf': min_samples_leaf,
                        'bootstrap': bootstrap,
                        'criterion': criterion,
                        'max_features': max_features,
                        'train_r2': train_score,
                        'val_r2': val_score,
                        'test_r2': test_score,
                        'train_mse': train_mse,
                        'val_mse': val_mse,
                        'test_mse': test_mse
                    })

                    if test_score > max_test_r2_rf:
                        max_test_r2_rf = test_score
                        best_params_rf = {
                            'n_estimators': n_estimators,
                            'min_samples_leaf': min_samples_leaf,
                            'bootstrap': bootstrap,
                            'criterion': criterion,
                            'max_features': max_features,
                        }

with open('random_forest_results_maccs_keys.csv', 'w', newline='') as csvfile:
    fieldnames = ['n_estimators', 'min_samples_leaf', 'bootstrap', 'criterion', 'max_features',
                  'train_r2', 'val_r2', 'test_r2', 'train_mse', 'val_mse', 'test_mse']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results_rf)

###  <span style = "color : lightgreen"> Model Training with the best parameters || Validation || Model Performance on Test Set </span>

In [35]:
best_rf = RandomForestRegressor(**best_params_rf, random_state=42)
best_rf.fit(X_train, y_train)
y_pred_train_rf = best_rf.predict(X_train)
y_pred_val_rf = best_rf.predict(X_val)
y_pred_test_rf = best_rf.predict(X_test)
train_r2_rf = r2_score(y_train, y_pred_train_rf)
val_r2_rf = r2_score(y_val, y_pred_val_rf)
test_r2_rf = r2_score(y_test, y_pred_test_rf)
train_mse_rf = mean_squared_error(y_train, y_pred_train_rf)
val_mse_rf = mean_squared_error(y_val, y_pred_val_rf)
test_mse_rf = mean_squared_error(y_test, y_pred_test_rf)

print("RandomForestRegressor Results")
print(f"Best Parameters: {best_params_rf}")
print(f"Training R^2: {train_r2_rf}")
print(f"Training MSE: {train_mse_rf}")
print(f"Validation R^2: {val_r2_rf}")
print(f"Validation MSE: {val_mse_rf}")
print(f"Test R^2: {test_r2_rf}")
print(f"Test MSE: {test_mse_rf}")

RandomForestRegressor Results
Best Parameters: {'n_estimators': 100, 'min_samples_leaf': 1, 'bootstrap': False, 'criterion': 'squared_error', 'max_features': 'sqrt'}
Training R^2: 0.955521972473066
Training MSE: 0.049051696192009143
Validation R^2: 0.6768752563528163
Validation MSE: 0.3560241636124043
Test R^2: 0.6675438966679936
Test MSE: 0.36716004596910246


##  <span style = "color : Orange"> Support Vector Machine </span>

###  <span style = "color : lightgreen">  Hyperparameter Tuning to find the set of Best Parameters </span>

In [36]:
param_grid_svr = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4]
}

results_svr = []

best_params_svr = {}
max_test_r2_svr = float('-inf')

for C in param_grid_svr['C']:
    for gamma in param_grid_svr['gamma']:
        for kernel in param_grid_svr['kernel']:
            for degree in param_grid_svr['degree']:
                if kernel != 'poly' and degree != 3:
                    continue
                svr = SVR(C=C, gamma=gamma, kernel=kernel, degree=degree)
                svr.fit(X_train, y_train)
                y_pred_train = svr.predict(X_train)
                y_pred_val = svr.predict(X_val)
                y_pred_test = svr.predict(X_test)
                train_score = r2_score(y_train, y_pred_train)
                val_score = r2_score(y_val, y_pred_val)
                test_score = r2_score(y_test, y_pred_test)
                train_mse = mean_squared_error(y_train, y_pred_train)
                val_mse = mean_squared_error(y_val, y_pred_val)
                test_mse = mean_squared_error(y_test, y_pred_test)

                results_svr.append({
                    'C': C,
                    'gamma': gamma,
                    'kernel': kernel,
                    'degree': degree,
                    'train_r2': train_score,
                    'val_r2': val_score,
                    'test_r2': test_score,
                    'train_mse': train_mse,
                    'val_mse': val_mse,
                    'test_mse': test_mse
                })

                if test_score > max_test_r2_svr:
                    max_test_r2_svr = test_score
                    best_params_svr = {
                        'C': C,
                        'gamma': gamma,
                        'kernel': kernel,
                        'degree': degree
                    }

with open('svr_results.csv', 'w', newline='') as csvfile:
    fieldnames = ['C', 'gamma', 'kernel', 'degree', 'train_r2', 'val_r2', 'test_r2', 'train_mse', 'val_mse', 'test_mse']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results_svr)


###  <span style = "color : lightgreen"> Model Training with the best parameters || Validation || Model Performance on Test Set </span>

In [37]:

best_svr = SVR(**best_params_svr)
best_svr.fit(X_train, y_train)
y_pred_train_svr = best_svr.predict(X_train)
y_pred_val_svr = best_svr.predict(X_val)
y_pred_test_svr = best_svr.predict(X_test)
train_r2_svr = r2_score(y_train, y_pred_train_svr)
val_r2_svr = r2_score(y_val, y_pred_val_svr)
test_r2_svr = r2_score(y_test, y_pred_test_svr)
train_mse_svr = mean_squared_error(y_train, y_pred_train_svr)
val_mse_svr = mean_squared_error(y_val, y_pred_val_svr)
test_mse_svr = mean_squared_error(y_test, y_pred_test_svr)

print("\nSVR Results")
print(f"Best Parameters: {best_params_svr}")
print(f"Training R^2: {train_r2_svr}")
print(f"Training MSE: {train_mse_svr}")
print(f"Validation R^2: {val_r2_svr}")
print(f"Validation MSE: {val_mse_svr}")
print(f"Test R^2: {test_r2_svr}")
print(f"Test MSE: {test_mse_svr}")


SVR Results
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf', 'degree': 3}
Training R^2: 0.9142047541436434
Training MSE: 0.09461755766746423
Validation R^2: 0.6996404071542384
Validation MSE: 0.33094114557390886
Test R^2: 0.7136890641067781
Test MSE: 0.3161979441208584


#  <span style = "color : red"> Morgan Fingerprint </span>

In [38]:
def fingerprint_morgan_smiles_list(
        smiles,
        radius = 1,
        len_fingerprint=2048,
        from_atoms=0,
        atom_invariants=0,
        Chirality=False,
        Bond_Types=True,
        use_Features=False,
        bitInfo=None,
        RedundantEnvironments=False
):
    fingerprint_rep = [None] * len(smiles)

    for i, sm in enumerate(smiles):
        mol = Chem.MolFromSmiles(sm)
        if mol is not None:
            fingerprint_bin = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=len_fingerprint, 
                                                                invariants=atom_invariants, fromAtoms=from_atoms,
                                                                useChirality=Chirality, useBondTypes=Bond_Types,
                                                                useFeatures=use_Features, bitInfo=bitInfo,
                                                                 includeRedundantEnvironments=RedundantEnvironments ) 
            fingerprint_rep[i] = fingerprint_bin.ToBitString()

    return fingerprint_rep 


In [39]:
train_input_features = fingerprint_morgan_smiles_list(train_smile)
test_input_features = fingerprint_morgan_smiles_list(test_smile)

###  <span style = "color : lightgreen"> Making Train and Test Dataframe </span>

In [40]:
input_train_descriptor = []
for binary_string in train_input_features:
    output_list = [int(bit) for bit in binary_string]
    input_train_descriptor.append(output_list)
train_descriptor_frame = pd.DataFrame(input_train_descriptor)
train_descriptor_frame.columns = [f'descriptor_{i+1}' for i in range(2048)]
train_descriptor_frame['IGC50'] = df_train['IGC50']
train_descriptor_frame

Unnamed: 0,descriptor_1,descriptor_2,descriptor_3,descriptor_4,descriptor_5,descriptor_6,descriptor_7,descriptor_8,descriptor_9,descriptor_10,...,descriptor_2040,descriptor_2041,descriptor_2042,descriptor_2043,descriptor_2044,descriptor_2045,descriptor_2046,descriptor_2047,descriptor_2048,IGC50
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.360
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.347
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.330
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.200
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1419,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.764
1420,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.699
1421,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.678
1422,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.548


In [41]:
input_test_descriptor = []
for binary_string in test_input_features:
    output_list = [int(bit) for bit in binary_string]
    input_test_descriptor.append(output_list)
test_descriptor_frame = pd.DataFrame(input_test_descriptor)
test_descriptor_frame.columns = [f'descriptor_{i+1}' for i in range(2048)]
test_descriptor_frame['IGC50'] = df_test['IGC50']
test_descriptor_frame

Unnamed: 0,descriptor_1,descriptor_2,descriptor_3,descriptor_4,descriptor_5,descriptor_6,descriptor_7,descriptor_8,descriptor_9,descriptor_10,...,descriptor_2040,descriptor_2041,descriptor_2042,descriptor_2043,descriptor_2044,descriptor_2045,descriptor_2046,descriptor_2047,descriptor_2048,IGC50
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.090
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.921
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.740
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.699
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.069
352,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.028
353,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.910
354,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.723


##  <span style = "color : lightgreen"> Splitting into Train and Validation Set </span>

In [42]:
X = train_descriptor_frame.drop(columns='IGC50')
y = train_descriptor_frame['IGC50']
X_test = test_descriptor_frame.drop(columns='IGC50')
y_test = test_descriptor_frame['IGC50']

split_ratio = 0.2
random_state = 42 
np.random.seed(random_state)

indices = np.arange(len(X))
np.random.shuffle(indices)
split_index = int((1 - split_ratio) * len(X))

train_indices = indices[:split_index]
val_indices = indices[split_index:]

X_train = X.iloc[train_indices, :]
y_train = y.iloc[train_indices]
X_val = X.iloc[val_indices, :]
y_val = y.iloc[val_indices]

validation_set = pd.concat([X_val, y_val], axis=1)
validation_set['index'] = validation_set.index
validation_set.to_csv('validation_set_morgan.csv', index=False)

##  <span style = "color : Orange"> Random Forest </span>

###  <span style = "color : lightgreen">  Hyperparameter Tuning to find the set of Best Parameters </span>

In [43]:
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10, 12, 15],
    'bootstrap': [True, False],
    'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    'max_features': ['sqrt', 'log2']
}

results_rf = []

best_params_rf = {}
max_test_r2_rf = float('-inf')

for n_estimators in param_grid_rf['n_estimators']:
    for min_samples_leaf in param_grid_rf['min_samples_leaf']:
        for bootstrap in param_grid_rf['bootstrap']:
            for criterion in param_grid_rf['criterion']:
                for max_features in param_grid_rf['max_features']:
                    rf = RandomForestRegressor(
                        n_estimators=n_estimators,
                        min_samples_leaf=min_samples_leaf,
                        bootstrap=bootstrap,
                        criterion=criterion,
                        max_features=max_features,
                        random_state=42
                    )
                    rf.fit(X_train, y_train)
                    y_pred_train = rf.predict(X_train)
                    y_pred_val = rf.predict(X_val)
                    y_pred_test = rf.predict(X_test)
                    train_score = r2_score(y_train, y_pred_train)
                    val_score = r2_score(y_val, y_pred_val)
                    test_score = r2_score(y_test, y_pred_test)
                    train_mse = mean_squared_error(y_train, y_pred_train)
                    val_mse = mean_squared_error(y_val, y_pred_val)
                    test_mse = mean_squared_error(y_test, y_pred_test)

                    results_rf.append({
                        'n_estimators': n_estimators,
                        'min_samples_leaf': min_samples_leaf,
                        'bootstrap': bootstrap,
                        'criterion': criterion,
                        'max_features': max_features,
                        'train_r2': train_score,
                        'val_r2': val_score,
                        'test_r2': test_score,
                        'train_mse': train_mse,
                        'val_mse': val_mse,
                        'test_mse': test_mse
                    })

                    if test_score > max_test_r2_rf:
                        max_test_r2_rf = test_score
                        best_params_rf = {
                            'n_estimators': n_estimators,
                            'min_samples_leaf': min_samples_leaf,
                            'bootstrap': bootstrap,
                            'criterion': criterion,
                            'max_features': max_features,
                        }

with open('random_forest_results_morgan.csv', 'w', newline='') as csvfile:
    fieldnames = ['n_estimators', 'min_samples_leaf', 'bootstrap', 'criterion', 'max_features',
                  'train_r2', 'val_r2', 'test_r2', 'train_mse', 'val_mse', 'test_mse']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results_rf)

###  <span style = "color : lightgreen"> Model Training with the best parameters || Validation || Model Performance on Test Set </span>

In [44]:
best_rf = RandomForestRegressor(**best_params_rf, random_state=42)
best_rf.fit(X_train, y_train)
y_pred_train_rf = best_rf.predict(X_train)
y_pred_val_rf = best_rf.predict(X_val)
y_pred_test_rf = best_rf.predict(X_test)
train_r2_rf = r2_score(y_train, y_pred_train_rf)
val_r2_rf = r2_score(y_val, y_pred_val_rf)
test_r2_rf = r2_score(y_test, y_pred_test_rf)
train_mse_rf = mean_squared_error(y_train, y_pred_train_rf)
val_mse_rf = mean_squared_error(y_val, y_pred_val_rf)
test_mse_rf = mean_squared_error(y_test, y_pred_test_rf)

print("RandomForestRegressor Results")
print(f"Best Parameters: {best_params_rf}")
print(f"Training R^2: {train_r2_rf}")
print(f"Training MSE: {train_mse_rf}")
print(f"Validation R^2: {val_r2_rf}")
print(f"Validation MSE: {val_mse_rf}")
print(f"Test R^2: {test_r2_rf}")
print(f"Test MSE: {test_mse_rf}")

RandomForestRegressor Results
Best Parameters: {'n_estimators': 200, 'min_samples_leaf': 1, 'bootstrap': False, 'criterion': 'poisson', 'max_features': 'sqrt'}
Training R^2: 0.9405525091849494
Training MSE: 0.06556046706593086
Validation R^2: 0.5643096332226586
Validation MSE: 0.48005082085342987
Test R^2: 0.5765370104714312
Test MSE: 0.46766682621631545


##  <span style = "color : Orange"> Support Vector Machine </span>

###  <span style = "color : lightgreen">  Hyperparameter Tuning to find the set of Best Parameters </span>

In [45]:
param_grid_svr = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4]
}

results_svr = []

best_params_svr = {}
max_test_r2_svr = float('-inf')

for C in param_grid_svr['C']:
    for gamma in param_grid_svr['gamma']:
        for kernel in param_grid_svr['kernel']:
            for degree in param_grid_svr['degree']:
                if kernel != 'poly' and degree != 3:
                    continue
                svr = SVR(C=C, gamma=gamma, kernel=kernel, degree=degree)
                svr.fit(X_train, y_train)
                y_pred_train = svr.predict(X_train)
                y_pred_val = svr.predict(X_val)
                y_pred_test = svr.predict(X_test)
                train_score = r2_score(y_train, y_pred_train)
                val_score = r2_score(y_val, y_pred_val)
                test_score = r2_score(y_test, y_pred_test)
                train_mse = mean_squared_error(y_train, y_pred_train)
                val_mse = mean_squared_error(y_val, y_pred_val)
                test_mse = mean_squared_error(y_test, y_pred_test)

                results_svr.append({
                    'C': C,
                    'gamma': gamma,
                    'kernel': kernel,
                    'degree': degree,
                    'train_r2': train_score,
                    'val_r2': val_score,
                    'test_r2': test_score,
                    'train_mse': train_mse,
                    'val_mse': val_mse,
                    'test_mse': test_mse
                })

                if test_score > max_test_r2_svr:
                    max_test_r2_svr = test_score
                    best_params_svr = {
                        'C': C,
                        'gamma': gamma,
                        'kernel': kernel,
                        'degree': degree
                    }

with open('svr_results_morgan.csv', 'w', newline='') as csvfile:
    fieldnames = ['C', 'gamma', 'kernel', 'degree', 'train_r2', 'val_r2', 'test_r2', 'train_mse', 'val_mse', 'test_mse']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results_svr)


###  <span style = "color : lightgreen"> Model Training with the best parameters || Validation || Model Performance on Test Set </span>

In [46]:

best_svr = SVR(**best_params_svr)
best_svr.fit(X_train, y_train)
y_pred_train_svr = best_svr.predict(X_train)
y_pred_val_svr = best_svr.predict(X_val)
y_pred_test_svr = best_svr.predict(X_test)
train_r2_svr = r2_score(y_train, y_pred_train_svr)
val_r2_svr = r2_score(y_val, y_pred_val_svr)
test_r2_svr = r2_score(y_test, y_pred_test_svr)
train_mse_svr = mean_squared_error(y_train, y_pred_train_svr)
val_mse_svr = mean_squared_error(y_val, y_pred_val_svr)
test_mse_svr = mean_squared_error(y_test, y_pred_test_svr)

print("\nSVR Results")
print(f"Best Parameters: {best_params_svr}")
print(f"Training R^2: {train_r2_svr}")
print(f"Training MSE: {train_mse_svr}")
print(f"Validation R^2: {val_r2_svr}")
print(f"Validation MSE: {val_mse_svr}")
print(f"Test R^2: {test_r2_svr}")
print(f"Test MSE: {test_mse_svr}")


SVR Results
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf', 'degree': 3}
Training R^2: 0.9203325707830897
Training MSE: 0.08785961859436936
Validation R^2: 0.6413419186828355
Validation MSE: 0.3951753802030014
Test R^2: 0.6284933546640484
Test MSE: 0.4102869389741873


#  <span style = "color : red"> Pharmacophore Fingerprint </span>

In [47]:
def fingerprint_pharmacophore_smiles_list(smiles_list):
    fingerprint_rep = [None] * len(smiles_list)
    for i,smiles in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        AllChem.Compute2DCoords(mol)
        featFactory = AllChem.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef'))
        if featFactory.GetFeaturesForMol(mol) == []:
            return None  
        sigFactory = SigFactory(featFactory)
        sigFactory.SetBins([(0,3), (3,6), (6,10)]) 
        sigFactory.Init()
        pharm2d = Generate.Gen2DFingerprint(mol, sigFactory)
        fingerprint_rep[i]=pharm2d.ToBitString()
    return fingerprint_rep

In [48]:
train_input_features = fingerprint_pharmacophore_smiles_list(train_smile)
test_input_features = fingerprint_pharmacophore_smiles_list(test_smile)

###  <span style = "color : lightgreen"> Making Train and Test Dataframe </span>

In [49]:
input_train_descriptor = []
for binary_string in train_input_features:
    output_list = [int(bit) for bit in binary_string]
    input_train_descriptor.append(output_list)
train_descriptor_frame = pd.DataFrame(input_train_descriptor)
train_descriptor_frame.columns = [f'descriptor_{i+1}' for i in range(3348)]
train_descriptor_frame['IGC50'] = df_train['IGC50']
train_descriptor_frame

Unnamed: 0,descriptor_1,descriptor_2,descriptor_3,descriptor_4,descriptor_5,descriptor_6,descriptor_7,descriptor_8,descriptor_9,descriptor_10,...,descriptor_3340,descriptor_3341,descriptor_3342,descriptor_3343,descriptor_3344,descriptor_3345,descriptor_3346,descriptor_3347,descriptor_3348,IGC50
0,0,1,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.360
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.347
2,0,1,0,1,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,6.330
3,0,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.200
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1419,0,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0.764
1420,0,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0.699
1421,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0.678
1422,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.548


In [50]:
input_test_descriptor = []
for binary_string in test_input_features:
    output_list = [int(bit) for bit in binary_string]
    input_test_descriptor.append(output_list)
test_descriptor_frame = pd.DataFrame(input_test_descriptor)
test_descriptor_frame.columns = [f'descriptor_{i+1}' for i in range(3348)]
test_descriptor_frame['IGC50'] = df_test['IGC50']
test_descriptor_frame

Unnamed: 0,descriptor_1,descriptor_2,descriptor_3,descriptor_4,descriptor_5,descriptor_6,descriptor_7,descriptor_8,descriptor_9,descriptor_10,...,descriptor_3340,descriptor_3341,descriptor_3342,descriptor_3343,descriptor_3344,descriptor_3345,descriptor_3346,descriptor_3347,descriptor_3348,IGC50
0,0,0,1,1,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,6.090
1,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,5.921
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.740
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.699
4,0,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,5.658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1.069
352,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.028
353,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0.910
354,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.723


##  <span style = "color : lightgreen"> Splitting into Train and Validation Set </span>

In [51]:
X = train_descriptor_frame.drop(columns='IGC50')
y = train_descriptor_frame['IGC50']
X_test = test_descriptor_frame.drop(columns='IGC50')
y_test = test_descriptor_frame['IGC50']

split_ratio = 0.2
random_state = 42 
np.random.seed(random_state)

indices = np.arange(len(X))
np.random.shuffle(indices)
split_index = int((1 - split_ratio) * len(X))

train_indices = indices[:split_index]
val_indices = indices[split_index:]

X_train = X.iloc[train_indices, :]
y_train = y.iloc[train_indices]
X_val = X.iloc[val_indices, :]
y_val = y.iloc[val_indices]

validation_set = pd.concat([X_val, y_val], axis=1)
validation_set['index'] = validation_set.index
validation_set.to_csv('validation_set_pharmacophore.csv', index=False)

##  <span style = "color : Orange"> Random Forest </span>

###  <span style = "color : lightgreen">  Hyperparameter Tuning to find the set of Best Parameters </span>

In [52]:
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10, 12, 15],
    'bootstrap': [True, False],
    'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    'max_features': ['sqrt', 'log2']
}

results_rf = []

best_params_rf = {}
max_test_r2_rf = float('-inf')

for n_estimators in param_grid_rf['n_estimators']:
    for min_samples_leaf in param_grid_rf['min_samples_leaf']:
        for bootstrap in param_grid_rf['bootstrap']:
            for criterion in param_grid_rf['criterion']:
                for max_features in param_grid_rf['max_features']:
                    rf = RandomForestRegressor(
                        n_estimators=n_estimators,
                        min_samples_leaf=min_samples_leaf,
                        bootstrap=bootstrap,
                        criterion=criterion,
                        max_features=max_features,
                        random_state=42
                    )
                    rf.fit(X_train, y_train)
                    y_pred_train = rf.predict(X_train)
                    y_pred_val = rf.predict(X_val)
                    y_pred_test = rf.predict(X_test)
                    train_score = r2_score(y_train, y_pred_train)
                    val_score = r2_score(y_val, y_pred_val)
                    test_score = r2_score(y_test, y_pred_test)
                    train_mse = mean_squared_error(y_train, y_pred_train)
                    val_mse = mean_squared_error(y_val, y_pred_val)
                    test_mse = mean_squared_error(y_test, y_pred_test)

                    results_rf.append({
                        'n_estimators': n_estimators,
                        'min_samples_leaf': min_samples_leaf,
                        'bootstrap': bootstrap,
                        'criterion': criterion,
                        'max_features': max_features,
                        'train_r2': train_score,
                        'val_r2': val_score,
                        'test_r2': test_score,
                        'train_mse': train_mse,
                        'val_mse': val_mse,
                        'test_mse': test_mse
                    })

                    if test_score > max_test_r2_rf:
                        max_test_r2_rf = test_score
                        best_params_rf = {
                            'n_estimators': n_estimators,
                            'min_samples_leaf': min_samples_leaf,
                            'bootstrap': bootstrap,
                            'criterion': criterion,
                            'max_features': max_features,
                        }

with open('random_forest_results_pharmacophore.csv', 'w', newline='') as csvfile:
    fieldnames = ['n_estimators', 'min_samples_leaf', 'bootstrap', 'criterion', 'max_features',
                  'train_r2', 'val_r2', 'test_r2', 'train_mse', 'val_mse', 'test_mse']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results_rf)

###  <span style = "color : lightgreen"> Model Training with the best parameters || Validation || Model Performance on Test Set </span>

In [53]:
best_rf = RandomForestRegressor(**best_params_rf, random_state=42)
best_rf.fit(X_train, y_train)
y_pred_train_rf = best_rf.predict(X_train)
y_pred_val_rf = best_rf.predict(X_val)
y_pred_test_rf = best_rf.predict(X_test)
train_r2_rf = r2_score(y_train, y_pred_train_rf)
val_r2_rf = r2_score(y_val, y_pred_val_rf)
test_r2_rf = r2_score(y_test, y_pred_test_rf)
train_mse_rf = mean_squared_error(y_train, y_pred_train_rf)
val_mse_rf = mean_squared_error(y_val, y_pred_val_rf)
test_mse_rf = mean_squared_error(y_test, y_pred_test_rf)

print("RandomForestRegressor Results")
print(f"Best Parameters: {best_params_rf}")
print(f"Training R^2: {train_r2_rf}")
print(f"Training MSE: {train_mse_rf}")
print(f"Validation R^2: {val_r2_rf}")
print(f"Validation MSE: {val_mse_rf}")
print(f"Test R^2: {test_r2_rf}")
print(f"Test MSE: {test_mse_rf}")

RandomForestRegressor Results
Best Parameters: {'n_estimators': 200, 'min_samples_leaf': 1, 'bootstrap': True, 'criterion': 'absolute_error', 'max_features': 'sqrt'}
Training R^2: 0.6843128652594068
Training MSE: 0.34814919379337145
Validation R^2: 0.5101705010231594
Validation MSE: 0.5397022082478724
Test R^2: 0.5119763254155956
Test MSE: 0.5389667778650459


##  <span style = "color : Orange"> Support Vector Machine </span>

###  <span style = "color : lightgreen">  Hyperparameter Tuning to find the set of Best Parameters </span>

In [54]:
param_grid_svr = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4]
}

results_svr = []

best_params_svr = {}
max_test_r2_svr = float('-inf')

for C in param_grid_svr['C']:
    for gamma in param_grid_svr['gamma']:
        for kernel in param_grid_svr['kernel']:
            for degree in param_grid_svr['degree']:
                if kernel != 'poly' and degree != 3:
                    continue
                svr = SVR(C=C, gamma=gamma, kernel=kernel, degree=degree)
                svr.fit(X_train, y_train)
                y_pred_train = svr.predict(X_train)
                y_pred_val = svr.predict(X_val)
                y_pred_test = svr.predict(X_test)
                train_score = r2_score(y_train, y_pred_train)
                val_score = r2_score(y_val, y_pred_val)
                test_score = r2_score(y_test, y_pred_test)
                train_mse = mean_squared_error(y_train, y_pred_train)
                val_mse = mean_squared_error(y_val, y_pred_val)
                test_mse = mean_squared_error(y_test, y_pred_test)

                results_svr.append({
                    'C': C,
                    'gamma': gamma,
                    'kernel': kernel,
                    'degree': degree,
                    'train_r2': train_score,
                    'val_r2': val_score,
                    'test_r2': test_score,
                    'train_mse': train_mse,
                    'val_mse': val_mse,
                    'test_mse': test_mse
                })

                if test_score > max_test_r2_svr:
                    max_test_r2_svr = test_score
                    best_params_svr = {
                        'C': C,
                        'gamma': gamma,
                        'kernel': kernel,
                        'degree': degree
                    }

with open('svr_results_pharmacophore.csv', 'w', newline='') as csvfile:
    fieldnames = ['C', 'gamma', 'kernel', 'degree', 'train_r2', 'val_r2', 'test_r2', 'train_mse', 'val_mse', 'test_mse']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results_svr)


###  <span style = "color : lightgreen"> Model Training with the best parameters || Validation || Model Performance on Test Set </span>

In [55]:
best_svr = SVR(**best_params_svr)
best_svr.fit(X_train, y_train)
y_pred_train_svr = best_svr.predict(X_train)
y_pred_val_svr = best_svr.predict(X_val)
y_pred_test_svr = best_svr.predict(X_test)
train_r2_svr = r2_score(y_train, y_pred_train_svr)
val_r2_svr = r2_score(y_val, y_pred_val_svr)
test_r2_svr = r2_score(y_test, y_pred_test_svr)
train_mse_svr = mean_squared_error(y_train, y_pred_train_svr)
val_mse_svr = mean_squared_error(y_val, y_pred_val_svr)
test_mse_svr = mean_squared_error(y_test, y_pred_test_svr)

print("\nSVR Results")
print(f"Best Parameters: {best_params_svr}")
print(f"Training R^2: {train_r2_svr}")
print(f"Training MSE: {train_mse_svr}")
print(f"Validation R^2: {val_r2_svr}")
print(f"Validation MSE: {val_mse_svr}")
print(f"Test R^2: {test_r2_svr}")
print(f"Test MSE: {test_mse_svr}")


SVR Results
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf', 'degree': 3}
Training R^2: 0.6712737866887399
Training MSE: 0.3625290787890534
Validation R^2: 0.4931368476420215
Validation MSE: 0.5584701680451724
Test R^2: 0.46756184976445714
Test MSE: 0.5880175269965197


#  <span style = "color : red"> PubChem Fingerprint </span>

In [56]:

def fingerprint_pubchem_smiles_list(
        smiles,
        minimum_path=1,
        maximum_path=7,
        len_fingerprint=2048,
        BitsPerHash=2,
        useHs=True,
        Density=0.0,
        minimum_size=128,
        branchedPaths=True,
        useBondOrder=True,
        from_atoms=0,
        atom_invariants=0,
        atomBits=None,
        bitInfo=None
        ):
    fingerprint_rep = [None] * len(smiles)
    for i, sm in enumerate(smiles):
        mol = Chem.MolFromSmiles(sm)
        if mol is not None:
            fingerprint_bin = Chem.RDKFingerprint(mol, minPath=minimum_path, maxPath=maximum_path, fpSize=len_fingerprint, 
                                              nBitsPerHash=BitsPerHash, useHs=useHs, tgtDensity=Density, minSize=minimum_size,
                                              branchedPaths=branchedPaths, useBondOrder=useBondOrder, 
                                                atomInvariants=atom_invariants, fromAtoms=from_atoms, atomBits=atomBits,
                                                 bitInfo=bitInfo)
            fingerprint_rep[i] = fingerprint_bin.ToBitString()

    return fingerprint_rep


In [57]:
train_input_features = fingerprint_pubchem_smiles_list(train_smile)
test_input_features = fingerprint_pubchem_smiles_list(test_smile)

###  <span style = "color : lightgreen"> Making Train and Test Dataframe </span>

In [58]:
input_train_descriptor = []
for binary_string in train_input_features:
    output_list = [int(bit) for bit in binary_string]
    input_train_descriptor.append(output_list)
train_descriptor_frame = pd.DataFrame(input_train_descriptor)
train_descriptor_frame.columns = [f'descriptor_{i+1}' for i in range(2048)]
train_descriptor_frame['IGC50'] = df_train['IGC50']
train_descriptor_frame

Unnamed: 0,descriptor_1,descriptor_2,descriptor_3,descriptor_4,descriptor_5,descriptor_6,descriptor_7,descriptor_8,descriptor_9,descriptor_10,...,descriptor_2040,descriptor_2041,descriptor_2042,descriptor_2043,descriptor_2044,descriptor_2045,descriptor_2046,descriptor_2047,descriptor_2048,IGC50
0,1,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.360
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.347
2,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,6.330
3,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.200
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,6.056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1419,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.764
1420,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.699
1421,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.678
1422,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.548


In [59]:
input_test_descriptor = []
for binary_string in test_input_features:
    output_list = [int(bit) for bit in binary_string]
    input_test_descriptor.append(output_list)
test_descriptor_frame = pd.DataFrame(input_test_descriptor)
test_descriptor_frame.columns = [f'descriptor_{i+1}' for i in range(2048)]
test_descriptor_frame['IGC50'] = df_test['IGC50']
test_descriptor_frame

Unnamed: 0,descriptor_1,descriptor_2,descriptor_3,descriptor_4,descriptor_5,descriptor_6,descriptor_7,descriptor_8,descriptor_9,descriptor_10,...,descriptor_2040,descriptor_2041,descriptor_2042,descriptor_2043,descriptor_2044,descriptor_2045,descriptor_2046,descriptor_2047,descriptor_2048,IGC50
0,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,6.090
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.921
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,5.740
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,5.699
4,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,1,1,0,0,5.658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1.069
352,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.028
353,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.910
354,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.723


##  <span style = "color : lightgreen"> Splitting into Train and Validation Set </span>

In [60]:
X = train_descriptor_frame.drop(columns='IGC50')
y = train_descriptor_frame['IGC50']
X_test = test_descriptor_frame.drop(columns='IGC50')
y_test = test_descriptor_frame['IGC50']

split_ratio = 0.2
random_state = 42 
np.random.seed(random_state)

indices = np.arange(len(X))
np.random.shuffle(indices)
split_index = int((1 - split_ratio) * len(X))

train_indices = indices[:split_index]
val_indices = indices[split_index:]

X_train = X.iloc[train_indices, :]
y_train = y.iloc[train_indices]
X_val = X.iloc[val_indices, :]
y_val = y.iloc[val_indices]

validation_set = pd.concat([X_val, y_val], axis=1)
validation_set['index'] = validation_set.index
validation_set.to_csv('validation_set_pubchem.csv', index=False)

##  <span style = "color : Orange"> Random Forest </span>

###  <span style = "color : lightgreen">  Hyperparameter Tuning to find the set of Best Parameters </span>

In [61]:
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10, 12, 15],
    'bootstrap': [True, False],
    'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    'max_features': ['sqrt', 'log2']
}

results_rf = []

best_params_rf = {}
max_test_r2_rf = float('-inf')

for n_estimators in param_grid_rf['n_estimators']:
    for min_samples_leaf in param_grid_rf['min_samples_leaf']:
        for bootstrap in param_grid_rf['bootstrap']:
            for criterion in param_grid_rf['criterion']:
                for max_features in param_grid_rf['max_features']:
                    rf = RandomForestRegressor(
                        n_estimators=n_estimators,
                        min_samples_leaf=min_samples_leaf,
                        bootstrap=bootstrap,
                        criterion=criterion,
                        max_features=max_features,
                        random_state=42
                    )
                    rf.fit(X_train, y_train)
                    y_pred_train = rf.predict(X_train)
                    y_pred_val = rf.predict(X_val)
                    y_pred_test = rf.predict(X_test)
                    train_score = r2_score(y_train, y_pred_train)
                    val_score = r2_score(y_val, y_pred_val)
                    test_score = r2_score(y_test, y_pred_test)
                    train_mse = mean_squared_error(y_train, y_pred_train)
                    val_mse = mean_squared_error(y_val, y_pred_val)
                    test_mse = mean_squared_error(y_test, y_pred_test)

                    results_rf.append({
                        'n_estimators': n_estimators,
                        'min_samples_leaf': min_samples_leaf,
                        'bootstrap': bootstrap,
                        'criterion': criterion,
                        'max_features': max_features,
                        'train_r2': train_score,
                        'val_r2': val_score,
                        'test_r2': test_score,
                        'train_mse': train_mse,
                        'val_mse': val_mse,
                        'test_mse': test_mse
                    })

                    if test_score > max_test_r2_rf:
                        max_test_r2_rf = test_score
                        best_params_rf = {
                            'n_estimators': n_estimators,
                            'min_samples_leaf': min_samples_leaf,
                            'bootstrap': bootstrap,
                            'criterion': criterion,
                            'max_features': max_features,
                        }

with open('random_forest_results_pubchem.csv', 'w', newline='') as csvfile:
    fieldnames = ['n_estimators', 'min_samples_leaf', 'bootstrap', 'criterion', 'max_features',
                  'train_r2', 'val_r2', 'test_r2', 'train_mse', 'val_mse', 'test_mse']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results_rf)

###  <span style = "color : lightgreen"> Model Training with the best parameters || Validation || Model Performance on Test Set </span>

In [62]:
best_rf = RandomForestRegressor(**best_params_rf, random_state=42)
best_rf.fit(X_train, y_train)
y_pred_train_rf = best_rf.predict(X_train)
y_pred_val_rf = best_rf.predict(X_val)
y_pred_test_rf = best_rf.predict(X_test)
train_r2_rf = r2_score(y_train, y_pred_train_rf)
val_r2_rf = r2_score(y_val, y_pred_val_rf)
test_r2_rf = r2_score(y_test, y_pred_test_rf)
train_mse_rf = mean_squared_error(y_train, y_pred_train_rf)
val_mse_rf = mean_squared_error(y_val, y_pred_val_rf)
test_mse_rf = mean_squared_error(y_test, y_pred_test_rf)

print("RandomForestRegressor Results")
print(f"Best Parameters: {best_params_rf}")
print(f"Training R^2: {train_r2_rf}")
print(f"Training MSE: {train_mse_rf}")
print(f"Validation R^2: {val_r2_rf}")
print(f"Validation MSE: {val_mse_rf}")
print(f"Test R^2: {test_r2_rf}")
print(f"Test MSE: {test_mse_rf}")

RandomForestRegressor Results
Best Parameters: {'n_estimators': 50, 'min_samples_leaf': 1, 'bootstrap': True, 'criterion': 'squared_error', 'max_features': 'sqrt'}
Training R^2: 0.9332056859505238
Training MSE: 0.07366276299290764
Validation R^2: 0.7141330528079911
Validation MSE: 0.31497290993472754
Test R^2: 0.6799552775744393
Test MSE: 0.3534530839416981


##  <span style = "color : Orange"> Support Vector Machine </span>

###  <span style = "color : lightgreen">  Hyperparameter Tuning to find the set of Best Parameters </span>

In [63]:
param_grid_svr = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4]
}

results_svr = []

best_params_svr = {}
max_test_r2_svr = float('-inf')

for C in param_grid_svr['C']:
    for gamma in param_grid_svr['gamma']:
        for kernel in param_grid_svr['kernel']:
            for degree in param_grid_svr['degree']:
                if kernel != 'poly' and degree != 3:
                    continue
                svr = SVR(C=C, gamma=gamma, kernel=kernel, degree=degree)
                svr.fit(X_train, y_train)
                y_pred_train = svr.predict(X_train)
                y_pred_val = svr.predict(X_val)
                y_pred_test = svr.predict(X_test)
                train_score = r2_score(y_train, y_pred_train)
                val_score = r2_score(y_val, y_pred_val)
                test_score = r2_score(y_test, y_pred_test)
                train_mse = mean_squared_error(y_train, y_pred_train)
                val_mse = mean_squared_error(y_val, y_pred_val)
                test_mse = mean_squared_error(y_test, y_pred_test)

                results_svr.append({
                    'C': C,
                    'gamma': gamma,
                    'kernel': kernel,
                    'degree': degree,
                    'train_r2': train_score,
                    'val_r2': val_score,
                    'test_r2': test_score,
                    'train_mse': train_mse,
                    'val_mse': val_mse,
                    'test_mse': test_mse
                })

                if test_score > max_test_r2_svr:
                    max_test_r2_svr = test_score
                    best_params_svr = {
                        'C': C,
                        'gamma': gamma,
                        'kernel': kernel,
                        'degree': degree
                    }

with open('svr_results_pubchem.csv', 'w', newline='') as csvfile:
    fieldnames = ['C', 'gamma', 'kernel', 'degree', 'train_r2', 'val_r2', 'test_r2', 'train_mse', 'val_mse', 'test_mse']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results_svr)


###  <span style = "color : lightgreen"> Model Training with the best parameters || Validation || Model Performance on Test Set </span>

In [64]:

best_svr = SVR(**best_params_svr)
best_svr.fit(X_train, y_train)
y_pred_train_svr = best_svr.predict(X_train)
y_pred_val_svr = best_svr.predict(X_val)
y_pred_test_svr = best_svr.predict(X_test)
train_r2_svr = r2_score(y_train, y_pred_train_svr)
val_r2_svr = r2_score(y_val, y_pred_val_svr)
test_r2_svr = r2_score(y_test, y_pred_test_svr)
train_mse_svr = mean_squared_error(y_train, y_pred_train_svr)
val_mse_svr = mean_squared_error(y_val, y_pred_val_svr)
test_mse_svr = mean_squared_error(y_test, y_pred_test_svr)

print("\nSVR Results")
print(f"Best Parameters: {best_params_svr}")
print(f"Training R^2: {train_r2_svr}")
print(f"Training MSE: {train_mse_svr}")
print(f"Validation R^2: {val_r2_svr}")
print(f"Validation MSE: {val_mse_svr}")
print(f"Test R^2: {test_r2_svr}")
print(f"Test MSE: {test_mse_svr}")


SVR Results
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf', 'degree': 3}
Training R^2: 0.9589840986767592
Training MSE: 0.045233560088309116
Validation R^2: 0.7531660526663886
Validation MSE: 0.2719657079141892
Test R^2: 0.7035363241180824
Test MSE: 0.32741049351791135


#  <span style = "color : red"> Substructure Fingerprint </span>

In [65]:
def fingerprint_substructure_smiles_list(
        smiles
):
    fingerprint_rep = [None] * len(smiles)

    for i, sm in enumerate(smiles):
        mol = Chem.MolFromSmiles(sm)
        if mol is not None:
            fingerprint_bin = AllChem.PatternFingerprint(mol) 
            fingerprint_rep[i] = fingerprint_bin.ToBitString()
    return fingerprint_rep 


In [66]:
train_input_features = fingerprint_substructure_smiles_list(train_smile)
test_input_features = fingerprint_substructure_smiles_list(test_smile)

###  <span style = "color : lightgreen"> Making Train and Test Dataframe </span>

In [67]:
input_train_descriptor = []
for binary_string in train_input_features:
    output_list = [int(bit) for bit in binary_string]
    input_train_descriptor.append(output_list)
train_descriptor_frame = pd.DataFrame(input_train_descriptor)
train_descriptor_frame.columns = [f'descriptor_{i+1}' for i in range(2048)]
train_descriptor_frame['IGC50'] = df_train['IGC50']
train_descriptor_frame

Unnamed: 0,descriptor_1,descriptor_2,descriptor_3,descriptor_4,descriptor_5,descriptor_6,descriptor_7,descriptor_8,descriptor_9,descriptor_10,...,descriptor_2040,descriptor_2041,descriptor_2042,descriptor_2043,descriptor_2044,descriptor_2045,descriptor_2046,descriptor_2047,descriptor_2048,IGC50
0,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.360
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.347
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.330
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.200
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,6.056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1419,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.764
1420,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.699
1421,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0.678
1422,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.548


In [68]:
input_test_descriptor = []
for binary_string in test_input_features:
    output_list = [int(bit) for bit in binary_string]
    input_test_descriptor.append(output_list)
test_descriptor_frame = pd.DataFrame(input_test_descriptor)
test_descriptor_frame.columns = [f'descriptor_{i+1}' for i in range(2048)]
test_descriptor_frame['IGC50'] = df_test['IGC50']
test_descriptor_frame

Unnamed: 0,descriptor_1,descriptor_2,descriptor_3,descriptor_4,descriptor_5,descriptor_6,descriptor_7,descriptor_8,descriptor_9,descriptor_10,...,descriptor_2040,descriptor_2041,descriptor_2042,descriptor_2043,descriptor_2044,descriptor_2045,descriptor_2046,descriptor_2047,descriptor_2048,IGC50
0,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.090
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.921
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,5.740
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,5.699
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1.069
352,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.028
353,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0.910
354,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.723


##  <span style = "color : lightgreen"> Splitting into Train and Validation Set </span>

In [69]:
X = train_descriptor_frame.drop(columns='IGC50')
y = train_descriptor_frame['IGC50']
X_test = test_descriptor_frame.drop(columns='IGC50')
y_test = test_descriptor_frame['IGC50']

split_ratio = 0.2
random_state = 42 
np.random.seed(random_state)

indices = np.arange(len(X))
np.random.shuffle(indices)
split_index = int((1 - split_ratio) * len(X))

train_indices = indices[:split_index]
val_indices = indices[split_index:]

X_train = X.iloc[train_indices, :]
y_train = y.iloc[train_indices]
X_val = X.iloc[val_indices, :]
y_val = y.iloc[val_indices]

validation_set = pd.concat([X_val, y_val], axis=1)
validation_set['index'] = validation_set.index
validation_set.to_csv('validation_set_substructure.csv', index=False)

##  <span style = "color : Orange"> Random Forest </span>

###  <span style = "color : lightgreen">  Hyperparameter Tuning to find the set of Best Parameters </span>

In [70]:
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10, 12, 15],
    'bootstrap': [True, False],
    'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    'max_features': ['sqrt', 'log2']
}

results_rf = []

best_params_rf = {}
max_test_r2_rf = float('-inf')

for n_estimators in param_grid_rf['n_estimators']:
    for min_samples_leaf in param_grid_rf['min_samples_leaf']:
        for bootstrap in param_grid_rf['bootstrap']:
            for criterion in param_grid_rf['criterion']:
                for max_features in param_grid_rf['max_features']:
                    rf = RandomForestRegressor(
                        n_estimators=n_estimators,
                        min_samples_leaf=min_samples_leaf,
                        bootstrap=bootstrap,
                        criterion=criterion,
                        max_features=max_features,
                        random_state=42
                    )
                    rf.fit(X_train, y_train)
                    y_pred_train = rf.predict(X_train)
                    y_pred_val = rf.predict(X_val)
                    y_pred_test = rf.predict(X_test)
                    train_score = r2_score(y_train, y_pred_train)
                    val_score = r2_score(y_val, y_pred_val)
                    test_score = r2_score(y_test, y_pred_test)
                    train_mse = mean_squared_error(y_train, y_pred_train)
                    val_mse = mean_squared_error(y_val, y_pred_val)
                    test_mse = mean_squared_error(y_test, y_pred_test)

                    results_rf.append({
                        'n_estimators': n_estimators,
                        'min_samples_leaf': min_samples_leaf,
                        'bootstrap': bootstrap,
                        'criterion': criterion,
                        'max_features': max_features,
                        'train_r2': train_score,
                        'val_r2': val_score,
                        'test_r2': test_score,
                        'train_mse': train_mse,
                        'val_mse': val_mse,
                        'test_mse': test_mse
                    })

                    if test_score > max_test_r2_rf:
                        max_test_r2_rf = test_score
                        best_params_rf = {
                            'n_estimators': n_estimators,
                            'min_samples_leaf': min_samples_leaf,
                            'bootstrap': bootstrap,
                            'criterion': criterion,
                            'max_features': max_features,
                        }

with open('random_forest_results_substructure.csv', 'w', newline='') as csvfile:
    fieldnames = ['n_estimators', 'min_samples_leaf', 'bootstrap', 'criterion', 'max_features',
                  'train_r2', 'val_r2', 'test_r2', 'train_mse', 'val_mse', 'test_mse']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results_rf)

###  <span style = "color : lightgreen"> Model Training with the best parameters || Validation || Model Performance on Test Set </span>

In [71]:
best_rf = RandomForestRegressor(**best_params_rf, random_state=42)
best_rf.fit(X_train, y_train)
y_pred_train_rf = best_rf.predict(X_train)
y_pred_val_rf = best_rf.predict(X_val)
y_pred_test_rf = best_rf.predict(X_test)
train_r2_rf = r2_score(y_train, y_pred_train_rf)
val_r2_rf = r2_score(y_val, y_pred_val_rf)
test_r2_rf = r2_score(y_test, y_pred_test_rf)
train_mse_rf = mean_squared_error(y_train, y_pred_train_rf)
val_mse_rf = mean_squared_error(y_val, y_pred_val_rf)
test_mse_rf = mean_squared_error(y_test, y_pred_test_rf)

print("RandomForestRegressor Results")
print(f"Best Parameters: {best_params_rf}")
print(f"Training R^2: {train_r2_rf}")
print(f"Training MSE: {train_mse_rf}")
print(f"Validation R^2: {val_r2_rf}")
print(f"Validation MSE: {val_mse_rf}")
print(f"Test R^2: {test_r2_rf}")
print(f"Test MSE: {test_mse_rf}")

RandomForestRegressor Results
Best Parameters: {'n_estimators': 200, 'min_samples_leaf': 1, 'bootstrap': False, 'criterion': 'squared_error', 'max_features': 'sqrt'}
Training R^2: 0.9891532102576941
Training MSE: 0.01196216344746854
Validation R^2: 0.6994518680805504
Validation MSE: 0.3311488810300685
Test R^2: 0.6633243925516925
Test MSE: 0.37182000952453403


##  <span style = "color : Orange"> Support Vector Machine </span>

###  <span style = "color : lightgreen">  Hyperparameter Tuning to find the set of Best Parameters </span>

In [72]:
param_grid_svr = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4]
}

results_svr = []

best_params_svr = {}
max_test_r2_svr = float('-inf')

for C in param_grid_svr['C']:
    for gamma in param_grid_svr['gamma']:
        for kernel in param_grid_svr['kernel']:
            for degree in param_grid_svr['degree']:
                if kernel != 'poly' and degree != 3:
                    continue
                svr = SVR(C=C, gamma=gamma, kernel=kernel, degree=degree)
                svr.fit(X_train, y_train)
                y_pred_train = svr.predict(X_train)
                y_pred_val = svr.predict(X_val)
                y_pred_test = svr.predict(X_test)
                train_score = r2_score(y_train, y_pred_train)
                val_score = r2_score(y_val, y_pred_val)
                test_score = r2_score(y_test, y_pred_test)
                train_mse = mean_squared_error(y_train, y_pred_train)
                val_mse = mean_squared_error(y_val, y_pred_val)
                test_mse = mean_squared_error(y_test, y_pred_test)

                results_svr.append({
                    'C': C,
                    'gamma': gamma,
                    'kernel': kernel,
                    'degree': degree,
                    'train_r2': train_score,
                    'val_r2': val_score,
                    'test_r2': test_score,
                    'train_mse': train_mse,
                    'val_mse': val_mse,
                    'test_mse': test_mse
                })

                if test_score > max_test_r2_svr:
                    max_test_r2_svr = test_score
                    best_params_svr = {
                        'C': C,
                        'gamma': gamma,
                        'kernel': kernel,
                        'degree': degree
                    }

with open('svr_results_substructure.csv', 'w', newline='') as csvfile:
    fieldnames = ['C', 'gamma', 'kernel', 'degree', 'train_r2', 'val_r2', 'test_r2', 'train_mse', 'val_mse', 'test_mse']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results_svr)


###  <span style = "color : lightgreen"> Model Training with the best parameters || Validation || Model Performance on Test Set </span>

In [73]:

best_svr = SVR(**best_params_svr)
best_svr.fit(X_train, y_train)
y_pred_train_svr = best_svr.predict(X_train)
y_pred_val_svr = best_svr.predict(X_val)
y_pred_test_svr = best_svr.predict(X_test)
train_r2_svr = r2_score(y_train, y_pred_train_svr)
val_r2_svr = r2_score(y_val, y_pred_val_svr)
test_r2_svr = r2_score(y_test, y_pred_test_svr)
train_mse_svr = mean_squared_error(y_train, y_pred_train_svr)
val_mse_svr = mean_squared_error(y_val, y_pred_val_svr)
test_mse_svr = mean_squared_error(y_test, y_pred_test_svr)

print("\nSVR Results")
print(f"Best Parameters: {best_params_svr}")
print(f"Training R^2: {train_r2_svr}")
print(f"Training MSE: {train_mse_svr}")
print(f"Validation R^2: {val_r2_svr}")
print(f"Validation MSE: {val_mse_svr}")
print(f"Test R^2: {test_r2_svr}")
print(f"Test MSE: {test_mse_svr}")


SVR Results
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf', 'degree': 3}
Training R^2: 0.9169714558940504
Training MSE: 0.09156635640561239
Validation R^2: 0.7596227868489699
Validation MSE: 0.26485157186544667
Test R^2: 0.7546077609754014
Test MSE: 0.27100788602685466


#  <span style = "color : red"> Topological Torsion Fingerprint </span>

In [74]:
def fingerprint_topological_torsion_smiles_list(
        smiles,
        len_fingerprint=2048,
        target_size=4,
        from_atoms=0,
        atom_invariants=0,
        ignoreAtoms=0,
        Chirality=False
):
    fingerprint_rep = [None] * len(smiles)

    for i, sm in enumerate(smiles):
        mol = Chem.MolFromSmiles(sm)
        if mol is not None:
            fingerprint_bin = AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect(mol, nBits=len_fingerprint, targetSize=target_size,
                                                                atomInvariants=atom_invariants, fromAtoms=from_atoms,ignoreAtoms=ignoreAtoms,
                                                                includeChirality=Chirality) 
            fingerprint_rep[i] = fingerprint_bin.ToBitString()

    return fingerprint_rep


In [75]:
train_input_features = fingerprint_topological_torsion_smiles_list(train_smile)
test_input_features = fingerprint_topological_torsion_smiles_list(test_smile)

###  <span style = "color : lightgreen"> Making Train and Test Dataframe </span>

In [76]:
input_train_descriptor = []
for binary_string in train_input_features:
    output_list = [int(bit) for bit in binary_string]
    input_train_descriptor.append(output_list)
train_descriptor_frame = pd.DataFrame(input_train_descriptor)
train_descriptor_frame.columns = [f'descriptor_{i+1}' for i in range(2048)]
train_descriptor_frame['IGC50'] = df_train['IGC50']
train_descriptor_frame

Unnamed: 0,descriptor_1,descriptor_2,descriptor_3,descriptor_4,descriptor_5,descriptor_6,descriptor_7,descriptor_8,descriptor_9,descriptor_10,...,descriptor_2040,descriptor_2041,descriptor_2042,descriptor_2043,descriptor_2044,descriptor_2045,descriptor_2046,descriptor_2047,descriptor_2048,IGC50
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.360
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.347
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.330
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.200
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1419,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.764
1420,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.699
1421,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.678
1422,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.548


In [77]:
input_test_descriptor = []
for binary_string in test_input_features:
    output_list = [int(bit) for bit in binary_string]
    input_test_descriptor.append(output_list)
test_descriptor_frame = pd.DataFrame(input_test_descriptor)
test_descriptor_frame.columns = [f'descriptor_{i+1}' for i in range(2048)]
test_descriptor_frame['IGC50'] = df_test['IGC50']
test_descriptor_frame

Unnamed: 0,descriptor_1,descriptor_2,descriptor_3,descriptor_4,descriptor_5,descriptor_6,descriptor_7,descriptor_8,descriptor_9,descriptor_10,...,descriptor_2040,descriptor_2041,descriptor_2042,descriptor_2043,descriptor_2044,descriptor_2045,descriptor_2046,descriptor_2047,descriptor_2048,IGC50
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.090
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.921
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.740
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.699
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.069
352,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.028
353,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.910
354,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.723


##  <span style = "color : lightgreen"> Splitting into Train and Validation Set </span>

In [78]:
X = train_descriptor_frame.drop(columns='IGC50')
y = train_descriptor_frame['IGC50']
X_test = test_descriptor_frame.drop(columns='IGC50')
y_test = test_descriptor_frame['IGC50']

split_ratio = 0.2
random_state = 42 
np.random.seed(random_state)

indices = np.arange(len(X))
np.random.shuffle(indices)
split_index = int((1 - split_ratio) * len(X))

train_indices = indices[:split_index]
val_indices = indices[split_index:]

X_train = X.iloc[train_indices, :]
y_train = y.iloc[train_indices]
X_val = X.iloc[val_indices, :]
y_val = y.iloc[val_indices]

validation_set = pd.concat([X_val, y_val], axis=1)
validation_set['index'] = validation_set.index
validation_set.to_csv('validation_set_topological_torsion.csv', index=False)

##  <span style = "color : Orange"> Random Forest </span>

###  <span style = "color : lightgreen">  Hyperparameter Tuning to find the set of Best Parameters </span>

In [79]:
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10, 12, 15],
    'bootstrap': [True, False],
    'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    'max_features': ['sqrt', 'log2']
}

results_rf = []

best_params_rf = {}
max_test_r2_rf = float('-inf')

for n_estimators in param_grid_rf['n_estimators']:
    for min_samples_leaf in param_grid_rf['min_samples_leaf']:
        for bootstrap in param_grid_rf['bootstrap']:
            for criterion in param_grid_rf['criterion']:
                for max_features in param_grid_rf['max_features']:
                    rf = RandomForestRegressor(
                        n_estimators=n_estimators,
                        min_samples_leaf=min_samples_leaf,
                        bootstrap=bootstrap,
                        criterion=criterion,
                        max_features=max_features,
                        random_state=42
                    )
                    rf.fit(X_train, y_train)
                    y_pred_train = rf.predict(X_train)
                    y_pred_val = rf.predict(X_val)
                    y_pred_test = rf.predict(X_test)
                    train_score = r2_score(y_train, y_pred_train)
                    val_score = r2_score(y_val, y_pred_val)
                    test_score = r2_score(y_test, y_pred_test)
                    train_mse = mean_squared_error(y_train, y_pred_train)
                    val_mse = mean_squared_error(y_val, y_pred_val)
                    test_mse = mean_squared_error(y_test, y_pred_test)

                    results_rf.append({
                        'n_estimators': n_estimators,
                        'min_samples_leaf': min_samples_leaf,
                        'bootstrap': bootstrap,
                        'criterion': criterion,
                        'max_features': max_features,
                        'train_r2': train_score,
                        'val_r2': val_score,
                        'test_r2': test_score,
                        'train_mse': train_mse,
                        'val_mse': val_mse,
                        'test_mse': test_mse
                    })

                    if test_score > max_test_r2_rf:
                        max_test_r2_rf = test_score
                        best_params_rf = {
                            'n_estimators': n_estimators,
                            'min_samples_leaf': min_samples_leaf,
                            'bootstrap': bootstrap,
                            'criterion': criterion,
                            'max_features': max_features,
                        }

with open('random_forest_results_topological_torsion.csv', 'w', newline='') as csvfile:
    fieldnames = ['n_estimators', 'min_samples_leaf', 'bootstrap', 'criterion', 'max_features',
                  'train_r2', 'val_r2', 'test_r2', 'train_mse', 'val_mse', 'test_mse']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results_rf)

###  <span style = "color : lightgreen"> Model Training with the best parameters || Validation || Model Performance on Test Set </span>

In [80]:
best_rf = RandomForestRegressor(**best_params_rf, random_state=42)
best_rf.fit(X_train, y_train)
y_pred_train_rf = best_rf.predict(X_train)
y_pred_val_rf = best_rf.predict(X_val)
y_pred_test_rf = best_rf.predict(X_test)
train_r2_rf = r2_score(y_train, y_pred_train_rf)
val_r2_rf = r2_score(y_val, y_pred_val_rf)
test_r2_rf = r2_score(y_test, y_pred_test_rf)
train_mse_rf = mean_squared_error(y_train, y_pred_train_rf)
val_mse_rf = mean_squared_error(y_val, y_pred_val_rf)
test_mse_rf = mean_squared_error(y_test, y_pred_test_rf)

print("RandomForestRegressor Results")
print(f"Best Parameters: {best_params_rf}")
print(f"Training R^2: {train_r2_rf}")
print(f"Training MSE: {train_mse_rf}")
print(f"Validation R^2: {val_r2_rf}")
print(f"Validation MSE: {val_mse_rf}")
print(f"Test R^2: {test_r2_rf}")
print(f"Test MSE: {test_mse_rf}")

RandomForestRegressor Results
Best Parameters: {'n_estimators': 100, 'min_samples_leaf': 1, 'bootstrap': False, 'criterion': 'friedman_mse', 'max_features': 'sqrt'}
Training R^2: 0.9864635640837625
Training MSE: 0.014928385519879591
Validation R^2: 0.60024108511336
Validation MSE: 0.4404609554584904
Test R^2: 0.52545502374264
Test MSE: 0.5240810848434347


##  <span style = "color : Orange"> Support Vector Machine </span>

###  <span style = "color : lightgreen">  Hyperparameter Tuning to find the set of Best Parameters </span>

In [81]:
param_grid_svr = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4]
}

results_svr = []

best_params_svr = {}
max_test_r2_svr = float('-inf')

for C in param_grid_svr['C']:
    for gamma in param_grid_svr['gamma']:
        for kernel in param_grid_svr['kernel']:
            for degree in param_grid_svr['degree']:
                if kernel != 'poly' and degree != 3:
                    continue
                svr = SVR(C=C, gamma=gamma, kernel=kernel, degree=degree)
                svr.fit(X_train, y_train)
                y_pred_train = svr.predict(X_train)
                y_pred_val = svr.predict(X_val)
                y_pred_test = svr.predict(X_test)
                train_score = r2_score(y_train, y_pred_train)
                val_score = r2_score(y_val, y_pred_val)
                test_score = r2_score(y_test, y_pred_test)
                train_mse = mean_squared_error(y_train, y_pred_train)
                val_mse = mean_squared_error(y_val, y_pred_val)
                test_mse = mean_squared_error(y_test, y_pred_test)

                results_svr.append({
                    'C': C,
                    'gamma': gamma,
                    'kernel': kernel,
                    'degree': degree,
                    'train_r2': train_score,
                    'val_r2': val_score,
                    'test_r2': test_score,
                    'train_mse': train_mse,
                    'val_mse': val_mse,
                    'test_mse': test_mse
                })

                if test_score > max_test_r2_svr:
                    max_test_r2_svr = test_score
                    best_params_svr = {
                        'C': C,
                        'gamma': gamma,
                        'kernel': kernel,
                        'degree': degree
                    }

with open('svr_results_topological_torsion.csv', 'w', newline='') as csvfile:
    fieldnames = ['C', 'gamma', 'kernel', 'degree', 'train_r2', 'val_r2', 'test_r2', 'train_mse', 'val_mse', 'test_mse']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results_svr)


###  <span style = "color : lightgreen"> Model Training with the best parameters || Validation || Model Performance on Test Set </span>

In [82]:

best_svr = SVR(**best_params_svr)
best_svr.fit(X_train, y_train)
y_pred_train_svr = best_svr.predict(X_train)
y_pred_val_svr = best_svr.predict(X_val)
y_pred_test_svr = best_svr.predict(X_test)
train_r2_svr = r2_score(y_train, y_pred_train_svr)
val_r2_svr = r2_score(y_val, y_pred_val_svr)
test_r2_svr = r2_score(y_test, y_pred_test_svr)
train_mse_svr = mean_squared_error(y_train, y_pred_train_svr)
val_mse_svr = mean_squared_error(y_val, y_pred_val_svr)
test_mse_svr = mean_squared_error(y_test, y_pred_test_svr)

print("\nSVR Results")
print(f"Best Parameters: {best_params_svr}")
print(f"Training R^2: {train_r2_svr}")
print(f"Training MSE: {train_mse_svr}")
print(f"Validation R^2: {val_r2_svr}")
print(f"Validation MSE: {val_mse_svr}")
print(f"Test R^2: {test_r2_svr}")
print(f"Test MSE: {test_mse_svr}")


SVR Results
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf', 'degree': 3}
Training R^2: 0.9606695199324475
Training MSE: 0.04337482722657157
Validation R^2: 0.6292298241394005
Validation MSE: 0.40852068542707065
Test R^2: 0.5801339198325768
Test MSE: 0.46369444792892983


#  <span style = "color : red"> Topological Torsion Fingerprint </span>

In [83]:

data = {
    ("Support Vector Machine", "Train", "mse"): [0.01074759711367009, 0.09461755766746423, 0.08785961859436936, 0.03574395577560935, 0.09156635640561239, 0.3625290787890534, 0.045233560088309116, 0.04337482722657157],
    ("Support Vector Machine", "Train", "r2"): [0.9902545282348852, 0.9142047541436434, 0.9203325707830897, 0.9675888751596717, 0.9169714558940504, 0.6712737866887399, 0.9589840986767592, 0.9606695199324475],
    ("Support Vector Machine", "Validation", "mse"): [0.29066420201258997, 0.33094114557390886, 0.3951753802030014, 0.3115128044429142, 0.26485157186544667, 0.5584701680451724, 0.2719657079141892, 0.40852068542707065],
    ("Support Vector Machine", "Validation", "r2"): [0.7361954458097376, 0.6996404071542384, 0.6413419186828355, 0.717273417463834, 0.7596227868489699, 0.4931368476420215, 0.7531660526663886, 0.6292298241394005],
    ("Support Vector Machine", "Test", "mse"): [0.2916394193777976, 0.3161979441208584, 0.4102869389741873, 0.33774536830496094, 0.27100788602685466, 0.5880175269965197, 0.32741049351791135, 0.46369444792892983],
    ("Support Vector Machine", "Test", "r2"): [0.7359263187571594, 0.7136890641067781, 0.6284933546640484, 0.6941783009947935, 0.7546077609754014, 0.46756184976445714, 0.7035363241180824, 0.5801339198325768],
    ("Support Vector Machine", "Best Parameters","C"): [10,10,10,10,10,10,10,10],
    ("Support Vector Machine", "Best Parameters","gamma"): ['scale','scale','scale','scale','scale','scale','scale','scale'],
    ("Support Vector Machine", "Best Parameters","kernel"): ['rbf','rbf','rbf','rbf','rbf','rbf','rbf','rbf'],
    ("Support Vector Machine", "Best Parameters","degree"): [3,3,3,3,3,3,3,3],
    ("Random Forest", "Train","mse"): [1.121158911314689e-08, 0.049051696192009143, 0.06556046706593086, 0.02192232898323509, 0.01196216344746854, 0.34814919379337145, 0.07366276299290764, 0.014928385519879591],
    ("Random Forest", "Train","r2"):[0.9999999898337997,0.955521972473066, 0.9405525091849494,0.9801217485292654,0.9891532102576941,0.6843128652594068,0.9332056859505238,0.9864635640837625],
    ("Random Forest", "Validation","mse"): [0.32044211965456115, 0.3560241636124043, 0.48005082085342987, 0.37224911475956085, 0.3311488810300685, 0.5397022082478724,0.31497290993472754, 0.4404609554584904],
    ("Random Forest", "Validation","r2"):[0.7091692408837031,0.6768752563528163,0.5643096332226586,0.6621496177138035,0.7141330528079911,0.5101705010231594,0.7141330528079911,0.60024108511336],
    ("Random Forest", "Test","mse"): [0.328522266308146, 0.36716004596910246, 0.46766682621631545, 0.38251889161123237, 0.37182000952453403, 0.5389667778650459, 0.3534530839416981, 0.5240810848434347],
    ("Random Forest", "Test","r2"):[0.7025296360165587,0.6675438966679936,0.5765370104714312,0.6536367680740236,0.6799552775744393,0.5119763254155956,0.6799552775744393,0.52545502374264],
    ("Random Forest","Best  Parameters","n_estimators"):[200,100,200,200,200,200,50,100],
    ("Random Forest","Best  Parameters","min_samples_leaf"):[1,1,1,1,1,1,1,1],
    ("Random Forest","Best  Parameters","bootstrap"):[False,False,False,False,False,True,True,False],
    ("Random Forest","Best  Parameters","criterion"):['squared_error','squared_error','poisson','friedman_mse','squared_error','absolue_error','squared_error','friedman_mse'],
    ("Random Forest","Best  Parameters","max_features"):['sqrt','sqrt','sqrt','sqrt','sqrt','sqrt','sqrt','sqrt']
}

df = pd.DataFrame(data)
df.insert(0, 'Fingerprint', ["Atom Pair", "MACCS Keys", "Morgan", "Extended Connectivity", 
                             "Substructure", "Pharmacophore", "Pubchem", "Topological Torsion"])

df


Unnamed: 0_level_0,Fingerprint,Support Vector Machine,Support Vector Machine,Support Vector Machine,Support Vector Machine,Support Vector Machine,Support Vector Machine,Support Vector Machine,Support Vector Machine,Support Vector Machine,...,Random Forest,Random Forest,Random Forest,Random Forest,Random Forest,Random Forest,Random Forest,Random Forest,Random Forest,Random Forest
Unnamed: 0_level_1,Unnamed: 1_level_1,Train,Train,Validation,Validation,Test,Test,Best Parameters,Best Parameters,Best Parameters,...,Train,Validation,Validation,Test,Test,Best Parameters,Best Parameters,Best Parameters,Best Parameters,Best Parameters
Unnamed: 0_level_2,Unnamed: 1_level_2,mse,r2,mse,r2,mse,r2,C,gamma,kernel,...,r2,mse,r2,mse,r2,n_estimators,min_samples_leaf,bootstrap,criterion,max_features
0,Atom Pair,0.010748,0.990255,0.290664,0.736195,0.291639,0.735926,10,scale,rbf,...,1.0,0.320442,0.709169,0.328522,0.70253,200,1,False,squared_error,sqrt
1,MACCS Keys,0.094618,0.914205,0.330941,0.69964,0.316198,0.713689,10,scale,rbf,...,0.955522,0.356024,0.676875,0.36716,0.667544,100,1,False,squared_error,sqrt
2,Morgan,0.08786,0.920333,0.395175,0.641342,0.410287,0.628493,10,scale,rbf,...,0.940553,0.480051,0.56431,0.467667,0.576537,200,1,False,poisson,sqrt
3,Extended Connectivity,0.035744,0.967589,0.311513,0.717273,0.337745,0.694178,10,scale,rbf,...,0.980122,0.372249,0.66215,0.382519,0.653637,200,1,False,friedman_mse,sqrt
4,Substructure,0.091566,0.916971,0.264852,0.759623,0.271008,0.754608,10,scale,rbf,...,0.989153,0.331149,0.714133,0.37182,0.679955,200,1,False,squared_error,sqrt
5,Pharmacophore,0.362529,0.671274,0.55847,0.493137,0.588018,0.467562,10,scale,rbf,...,0.684313,0.539702,0.510171,0.538967,0.511976,200,1,True,absolue_error,sqrt
6,Pubchem,0.045234,0.958984,0.271966,0.753166,0.32741,0.703536,10,scale,rbf,...,0.933206,0.314973,0.714133,0.353453,0.679955,50,1,True,squared_error,sqrt
7,Topological Torsion,0.043375,0.96067,0.408521,0.62923,0.463694,0.580134,10,scale,rbf,...,0.986464,0.440461,0.600241,0.524081,0.525455,100,1,False,friedman_mse,sqrt


In [84]:
df.to_csv('regression_results.csv', index=False)