In [21]:
import sklearn.datasets
import pandas as pd
import numpy as np
import rdkit
from rdkit import Chem, DataStructs
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, rdMolDescriptors, Descriptors, rdMMPA, QED, RDConfig, Draw, PropertyMol
from rdkit.Chem.Lipinski import RotatableBondSmarts
from rdkit.Chem.FilterCatalog import *
from sklearn.model_selection import train_test_split
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
import sklearn
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score
import pickle 


In [22]:
def morgan_fp(mol, radius=3, nbits=2048, use_features=False):
    """Return the Morgan fingerprint of `mol` as an RDKit bit vector.

    `mol` is first normalised with `to_mol`, so it may be either a SMILES
    string or an already-built molecule object.  `radius`, `nbits` and
    `use_features` are forwarded to
    `rdMolDescriptors.GetMorganFingerprintAsBitVect` (as the radius, `nBits`
    and `useFeatures` arguments respectively).
    """
    # NOTE(review): `to_mol` returns None for SMILES it cannot parse or
    # sanitize; that None is handed to RDKit unchanged here.
    return rdMolDescriptors.GetMorganFingerprintAsBitVect(
        to_mol(mol),
        radius,
        nBits=nbits,
        useFeatures=use_features,
    )


def fp_to_array(fp):
    """Unpack an RDKit `ExplicitBitVec` into a flat numpy array of 0/1 values.

    The bit vector is serialised to its binary text form, viewed as a buffer
    of `uint8` bytes, and expanded to one array element per bit.
    """
    raw_bytes = DataStructs.BitVectToBinaryText(fp)
    packed = np.frombuffer(raw_bytes, dtype=np.uint8)
    # bitorder='little' emits the least-significant bit of each byte first.
    return np.unpackbits(packed, bitorder='little')


def to_mol(smile_or_mol):
    """Coerce a SMILES string to an RDKit molecule; pass other inputs through.

    Parameters
    ----------
    smile_or_mol : str or object
        A SMILES string (plain `str`, `np.str_`, or any other `str`
        subclass), or any non-string object, which is returned untouched on
        the assumption that it already is a molecule.

    Returns
    -------
    object or None
        The parsed and sanitized molecule for string input, `None` when the
        string cannot be parsed or fails sanitization, or the input itself
        when it is not a string.
    """
    # `np.str_` subclasses `str`, so one isinstance check covers both of the
    # types the original exact-type comparison accepted.
    if not isinstance(smile_or_mol, str):
        return smile_or_mol

    mol = Chem.MolFromSmiles(smile_or_mol)
    if mol is None:
        return None

    try:
        Chem.SanitizeMol(mol)
    except Exception:
        # Deliberate best-effort: an unsanitizable molecule is reported as
        # None.  `Exception` (rather than a bare `except:`) lets
        # KeyboardInterrupt / SystemExit propagate.
        return None
    return mol

In [23]:
# Load the dataset from a CSV file given by a relative path, i.e. resolved
# against the current working directory.
data = pd.read_csv('ExperimentalHeatOfCombustionValuesLog.csv')

In [32]:
# Display the loaded frame as the cell output; later cells read its
# 'SMILES' and 'Y' columns.
data

Unnamed: 0,SMILES,Y
0,c1ccc(cc1)CCc1ccccc1,0.000000
1,Cc1cccc(C)c1,0.000000
2,Cc1ccc(cc1)C(C)C,0.301030
3,c1ccccc1,0.301030
4,CO,0.477121
...,...,...
316,CCCCCCCCCCCCCCCCCCCC,2.041393
317,CCCCCCCCCCCCCCCCCCC,2.041393
318,CCCCCCCCCCCCCCCCCC(=O)OCCC(CC)CCC,2.064458
319,CCCCCCCC/C=C\CCCCCCCC(=O)OCCCCCCCC,2.117271


In [24]:
# Extract the SMILES strings (X) and the target values (Y) as plain Python lists.
X, Y = (data[column].tolist() for column in ('SMILES', 'Y'))

In [25]:
# Fingerprint every SMILES string, then unpack each bit vector into a numpy row.
molecules = [morgan_fp(smiles) for smiles in X]
X_data = [fp_to_array(bit_vector) for bit_vector in molecules]



In [26]:
# LightGBM regressor with the "regression" objective; no other
# hyperparameters are passed, so everything else stays at the library defaults.
model = lgb.LGBMRegressor(objective="regression")

In [27]:

# Stack the per-molecule feature rows and the targets into numpy arrays.
X_ndarray = np.array(X_data)
y_ndarray = np.array(data['Y'].tolist())

# Hold out 30% of the samples for evaluation; the fixed seed keeps the split
# reproducible across runs.
(
    X_ndarray_train,
    X_ndarray_test,
    y_ndarray_train,
    y_ndarray_test,
) = train_test_split(X_ndarray, y_ndarray, test_size=0.30, random_state=42)



In [28]:
# Fit on the training split.
# The earlier form of this call passed a bare `1000` as a third positional
# argument.  `LGBMRegressor.fit` binds that position to `sample_weight`, not
# to a number of boosting rounds, so the stray value is dropped.
# If 1000 boosting rounds were intended, pass `n_estimators=1000` to the
# `LGBMRegressor` constructor instead.
model.fit(X_ndarray_train, y_ndarray_train)

In [29]:
# Predict on the held-out split and report the coefficient of determination.
y_pred = model.predict(X_ndarray_test)
print("Accuracy r^2:", r2_score(y_ndarray_test, y_pred))


Accuracy r^2: 0.8116850989834227


In [18]:
# Serialise the fitted model with pickle to a file in the current working directory.
# NOTE(review): the file name says "LBGM_Hemloytic", while the data loaded in
# this notebook comes from ExperimentalHeatOfCombustionValuesLog.csv -- confirm
# the name is intended before relying on it.
# NOTE(review): only unpickle this file from a trusted location; loading a
# pickle executes arbitrary code.
pkl_filename = "LBGM_Hemloytic_81%.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)

In [37]:
# Build a one-row feature batch for a single SMILES string.
test = 'Cc1cccc(C)c1'
l = fp_to_array(morgan_fp(test))
temp = [l]

In [38]:
# Predict the target for the one-row batch in `temp`; the result is an
# array with one prediction per row.
model.predict(temp)

array([0.63374734])