In [1]:
import re
from typing import List, Union, Tuple, Dict, Any, Optional

import pandas as pd
import numpy as np
import rdkit
import joblib
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
# import tensorflow as tf
import selfies as sf

from utils import DataSet, grid_search, plot_results, GetMolDescriptors, GetDescriptorsForModelFit

# Remove the row-truncation limit so pandas objects are printed in full.
pd.options.display.max_rows = None

In [None]:
# Build the dataset for the 'LogExtCoeff' target.
target = 'LogExtCoeff'
lec_data = DataSet(target)

# Report the shapes of the target and the feature table, then call .info()
# on the features for a column-level overview.
print(
    f'LEC y shape: {lec_data.y.shape}',
    f'Features shape: {lec_data.X.shape}',
    sep='\n',
)
lec_data.X.info()

In [None]:
# Count nulls per feature column and print only the columns that have any.
missing_values = lec_data.X.isna().sum()
print(missing_values.loc[missing_values.gt(0)])

In [None]:
# Create a new dataset with the missing values dropped.
# Reuse `target` (set when the dataset was first loaded) instead of re-typing
# the column name, so both loads are guaranteed to refer to the same target.
lec_data = DataSet(target, fill_na="drop")

In [None]:
# Ask the dataset to drop the "Ipc" feature before the model search below.
# NOTE(review): the reason for excluding Ipc is not recorded in this notebook —
# presumably its values are problematic for fitting; confirm and document.
lec_data.drop_features(["Ipc"])

In [None]:
# Example grid search over a RandomForestRegressor. The "grid" holds a single
# candidate per hyper-parameter, so exactly one configuration is evaluated.
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
params = dict(n_estimators=[200], max_depth=[20])

grid, y_pred, r2 = grid_search(
    rf,
    params,
    lec_data,
    scaler=None,
    model_name="LEC_RF",
    verbose=3,
)

In [2]:
# Fluorene (Belfield) test data: read both parquet tables and rename their
# measurement columns to 'LogExtCoeff' and 'LambdaMaxAbs' respectively.
belfield_LEC = pd.read_parquet('data/belfield_LEC.parquet').rename(
    columns={'LEC': 'LogExtCoeff'}
)
belfield_LMA = pd.read_parquet('data/belfield_LMA.parquet').rename(
    columns={'LMA2 (nm) assume hexane': 'LambdaMaxAbs'}
)
belfield_LEC.head()


Unnamed: 0,smiles,LogExtCoeff
23,C=1C=CC(=CC1)N(C=2C=CC=CC2)C3=CC=C(C=CC=4C=CC=...,5.0
26,C=1C=CC(=CC1)N(C=2C=CC=CC2)C3=CC=C(C=CC=4C=CC5...,5.0
28,C=1C=CC(=CC1)N(C=2C=CC=CC2)C3=CC=C(C=CC4=CC=C5...,5.477121
29,O=N(=O)C1=CC=C2C3=CC=C(C=CC4=CC=C5C6=CC=C(C=C6...,4.582745
30,N1=C(SC2=CC=CC=C12)C3=CC=C4C5=CC=C(C=CC6=CC=C7...,5.012837


In [3]:
# Generate descriptor columns for the molecules in the 'smiles' column of the
# Belfield LEC table.
# NOTE(review): 'used_features' is passed as a plain string — presumably the
# name of a stored feature list resolved inside utils; confirm.
belfield_LEC_descriptors = GetDescriptorsForModelFit.generate_descriptors_for_model(
    belfield_LEC,
    'smiles',
    'used_features',
)

In [4]:
# Preview the first rows of the generated descriptor table.
belfield_LEC_descriptors.head()

Unnamed: 0,smiles,LogExtCoeff,mol,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,C=1C=CC(=CC1)N(C=2C=CC=CC2)C3=CC=C(C=CC=4C=CC=...,5.0,<rdkit.Chem.rdchem.Mol object at 0x15716c7b0>,2.435503,2.435503,0.029431,-0.029431,0.114456,761.025,712.641,...,0,0,0,0,0,0,0,0,0,0
1,C=1C=CC(=CC1)N(C=2C=CC=CC2)C3=CC=C(C=CC=4C=CC5...,5.0,<rdkit.Chem.rdchem.Mol object at 0x15716c350>,2.471102,2.471102,0.165766,-0.165766,0.084177,1030.372,966.868,...,0,0,0,0,0,0,0,0,0,0
2,C=1C=CC(=CC1)N(C=2C=CC=CC2)C3=CC=C(C=CC4=CC=C5...,5.477121,<rdkit.Chem.rdchem.Mol object at 0x15716c5f0>,2.605472,2.605472,0.301465,-0.433935,0.035209,2577.472,2414.176,...,0,0,0,0,0,0,0,0,0,0
3,O=N(=O)C1=CC=C2C3=CC=C(C=CC4=CC=C5C6=CC=C(C=C6...,4.582745,<rdkit.Chem.rdchem.Mol object at 0x15716c820>,12.529215,12.529215,0.096659,-0.245659,0.016606,1129.756,1020.892,...,0,0,0,0,0,0,0,0,24,0
4,N1=C(SC2=CC=CC=C12)C3=CC=C4C5=CC=C(C=CC6=CC=C7...,5.012837,<rdkit.Chem.rdchem.Mol object at 0x15716c660>,5.256437,5.256437,0.036659,-0.067293,0.028001,1217.934,1105.038,...,0,0,0,0,0,1,0,0,24,0


In [None]:
# Load the persisted model (presumably written by the grid search that was run
# with model_name="LEC_RF").
# NOTE(review): joblib.load unpickles arbitrary objects — only load model files
# that come from a trusted source.
model = joblib.load("models/LEC_RF.joblib")

# Test its performance on the Belfield set, using the descriptor table
# generated from belfield_LEC.
# Prefer the feature names recorded on the fitted model so the columns match
# training in both membership and order. If the model does not expose them,
# fall back to every column that is not the SMILES string, the RDKit mol
# object, or the target.
# NOTE(review): the fallback cannot know about features removed before
# training (e.g. "Ipc") — confirm the column set if that path is taken.
belfield_target = "LogExtCoeff"
trained_features = getattr(model, "feature_names_in_", None)
if trained_features is not None:
    X_belfield = belfield_LEC_descriptors.loc[:, list(trained_features)]
else:
    X_belfield = belfield_LEC_descriptors.drop(
        columns=["smiles", "mol", belfield_target], errors="ignore"
    )
y_belfield = belfield_LEC_descriptors[belfield_target]

y_pred_ = model.predict(X_belfield)

# Score the Belfield predictions themselves (not the grid-search `y_pred`), and
# keep the result under its own name so the grid-search `r2` is not overwritten.
# Cast to float to be sure it's not a numpy float.
r2_belfield = float(r2_score(y_belfield, y_pred_))
print(f"Test R2 score: {r2_belfield:.3f}")


In [None]:
# NOTE(review): this is the same DataSet(target) call used for the first
# dataset load; nothing visible here passes the Belfield frames in, so confirm
# that `belfield_lec_data` really holds the Belfield molecules.
target = 'LogExtCoeff'
belfield_lec_data = DataSet(target)

# Report the shapes of the target and the feature table, then call .info()
# on the features for a column-level overview.
print(
    f'LEC y shape: {belfield_lec_data.y.shape}',
    f'Features shape: {belfield_lec_data.X.shape}',
    sep='\n',
)
belfield_lec_data.X.info()

Let's assume that we have a dataframe with SMILES strings in one of its columns. The function must take as input the dataframe and the label of the SMILES column, along with the list of descriptors that should be used. Let's go to the code that generates the descriptors. In the following example, we work with the `belfield_LEC` dataframe.