## Prepare

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import rdkit
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import PandasTools
from rdkit.Chem import AllChem, DataStructs
from rdkit.ML.Descriptors import MoleculeDescriptors
from mordred import Calculator, descriptors

import re
import phik
from umap import UMAP

from IPython.display import clear_output

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, cross_val_score
from sklearn.preprocessing import QuantileTransformer, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.inspection import permutation_importance
from sklearn.svm import SVR, LinearSVR
from sklearn.kernel_ridge import KernelRidge
from catboost import CatBoostRegressor

import shap
from boruta import BorutaPy
import os
import phik

from joblib import dump, load

In [3]:
# Load the saved per-target selections that are later passed to get_clust_df()
# as `good_col` (one each for the fpps, mcf7 and pc3 models).
# NOTE(security): joblib.load unpickles the file and can therefore execute
# arbitrary code -- only load artefacts that come from a trusted source.
fpps_good = load('../model_features/fpps_good.joblib') 
mcf7_good = load('../model_features/mcf7_good.joblib') 
pc3_good = load('../model_features/pc3_good.joblib') 

def get_descr_value(molecule, descr_func):
    """Evaluate ``descr_func`` on ``molecule`` and return the result.

    Parameters
    ----------
    molecule : object handed unchanged to ``descr_func``.
    descr_func : callable taking ``molecule`` as its only argument.

    Returns
    -------
    A NumPy integer array when ``descr_func`` yields an RDKit
    ``ExplicitBitVect``; otherwise whatever ``descr_func`` returned, untouched.
    """
    value = descr_func(molecule)

    # Anything that is not a bit vector is passed straight through.
    if not isinstance(value, rdkit.DataStructs.cDataStructs.ExplicitBitVect):
        return value

    # One-element placeholder that ConvertToNumpyArray fills with the bits
    # (presumably resizing it to the fingerprint length -- see RDKit docs).
    bits = np.zeros((1,), dtype=int)
    DataStructs.ConvertToNumpyArray(value, bits)
    return bits

def All_Mordred_descriptors(data):
    """Compute every Mordred descriptor for an iterable of SMILES strings.

    Parameters
    ----------
    data : iterable of SMILES strings.

    Returns
    -------
    The frame produced by ``Calculator.pandas`` for the parsed molecules.
    The calculator is created with ``ignore_3D=False``, i.e. 3D descriptors
    are requested as well.
    """
    mordred_calc = Calculator(descriptors, ignore_3D=False)
    parsed = list(map(Chem.MolFromSmiles, data))
    return mordred_calc.pandas(parsed)

def RDkit_descriptors(smiles):
    """Compute all descriptors listed in ``Descriptors._descList``.

    Parameters
    ----------
    smiles : iterable of SMILES strings.

    Returns
    -------
    (values, names) where ``values`` is a list holding one
    ``CalcDescriptors`` result per molecule and ``names`` is what the
    calculator reports via ``GetDescriptorNames()``.
    """
    parsed = [Chem.MolFromSmiles(smi) for smi in smiles]

    wanted = [entry[0] for entry in Descriptors._descList]
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(wanted)
    desc_names = calc.GetDescriptorNames()

    # Explicit hydrogens are added to each molecule before it is evaluated.
    Mol_descriptors = [calc.CalcDescriptors(Chem.AddHs(mol)) for mol in parsed]
    return Mol_descriptors, desc_names

def get_clust_df(df, good_col):
    """Build the feature table used by the regressors for the molecules in ``df``.

    For every SMILES string in ``df['smiles']`` the following are computed and
    appended column-wise to a copy of ``df``:

    * ``mor_0`` .. ``mor_1023``  - Morgan fingerprint bits (radius 3, 1024 bits)
    * ``mac_0`` .. ``mac_166``   - MACCS key bits
    * ``rdkit_0`` .. ``rdkit_k`` - RDKit descriptors restricted to ``good_col``
    * Mordred descriptors, minus object/bool columns and columns with NaN

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'smiles'`` column. Its index may be arbitrary.
    good_col :
        Column selector applied to the RDKit descriptor frame. That frame is
        built without column names, so its labels are the integer positions
        of the descriptors.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the feature columns, with all column names cast to ``str``
        and the same index as ``df``.
    """
    mols = [Chem.MolFromSmiles(smile) for smile in df['smiles']]
    col = [f'mor_{i}' for i in range(1024)]
    col.extend(f'mac_{i}' for i in range(167))

    # mor+mac
    fps = []
    for mol in mols:
        mac = get_descr_value(mol, AllChem.GetMACCSKeysFingerprint)
        mor = np.array(AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, useFeatures=False, nBits=1024))
        fps.append(np.concatenate([mor, mac]))
    # Every computed frame is given df's own index: pd.concat(axis=1) aligns
    # rows by index label, so a default 0..n-1 index would attach features to
    # the wrong molecules (or create NaN rows) whenever df has been sorted,
    # filtered or otherwise carries a non-default index.
    fps_df = pd.DataFrame(columns=col, data=np.array(fps), index=df.index)

    # mordred
    mordred_descriptors = All_Mordred_descriptors(df['smiles'])
    mordred_descriptors = mordred_descriptors.drop(columns=mordred_descriptors.columns[mordred_descriptors.dtypes == object])
    mordred_descriptors = mordred_descriptors.drop(columns=mordred_descriptors.columns[mordred_descriptors.dtypes == bool])
    mordred_descriptors = mordred_descriptors.dropna(axis=1)
    mordred_descriptors.index = df.index

    # rdkit (descriptor names are not needed: columns are renamed positionally)
    Mol_descriptors, _ = RDkit_descriptors(df['smiles'])
    Mol_descriptors = pd.DataFrame(Mol_descriptors, index=df.index)[good_col]
    Mol_descriptors.columns = [f'rdkit_{i}' for i in range(len(Mol_descriptors.columns))]

    df_clust = pd.concat(
        [df.copy(), fps_df, Mol_descriptors, mordred_descriptors],
        axis=1)
    df_clust.columns = [str(name) for name in df_clust.columns]
    return df_clust

In [4]:
# Re-create the `np.float` alias (removed from NumPy in 1.24) as float64.
# NOTE(review): presumably needed because one of the imported libraries still
# references `np.float` -- confirm which one and drop this once it is fixed.
np.float = np.float64

## Predict

In [15]:
# Read the molecules to score and rename the columns positionally.
# Assumes the sheet has exactly four columns in this order -- TODO confirm.
# The 'fpps', 'mcf7' and 'pc3' columns are overwritten with model predictions
# in the cells below.
df = pd.read_excel('../data/init_data/BF_2.xlsx')
df.columns = ['smiles', 'fpps', 'mcf7', 'pc3']

In [16]:
# Build the feature table using the fpps RDKit-descriptor selection.
df_clust = get_clust_df(df[['smiles']], fpps_good)

# NOTE(security): joblib.load unpickles -- only load trusted model files.
reg = load('../models/fpps_reg.joblib')
# Feed the model exactly the columns it was fitted on, in the fitted order.
pre = reg.predict(df_clust[reg.feature_names_in_])
df['fpps'] = pre

100%|██████████| 30/30 [00:02<00:00, 13.73it/s]


In [17]:
# Build the feature table using the mcf7 RDKit-descriptor selection.
df_clust = get_clust_df(df[['smiles']], mcf7_good)
# Keep every column except 'smiles' (the model input is narrowed further to
# reg.feature_names_in_ below).
df_clust = df_clust[df_clust.drop(columns=['smiles']).columns]

# NOTE(security): joblib.load unpickles -- only load trusted model files.
reg = load('../models/mcf7_reg.joblib')
# Feed the model exactly the columns it was fitted on, in the fitted order.
pre = reg.predict(df_clust[reg.feature_names_in_])
df['mcf7'] = pre

100%|██████████| 30/30 [00:02<00:00, 14.13it/s]
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [18]:
# Build the feature table using the pc3 RDKit-descriptor selection.
df_clust = get_clust_df(df[['smiles']], pc3_good)

# NOTE(security): joblib.load unpickles -- only load trusted model files.
reg = load('../models/pc3_reg.joblib')
# Feed the model exactly the columns it was fitted on, in the fitted order.
pre = reg.predict(df_clust[reg.feature_names_in_])
df['pc3'] = pre

100%|██████████| 30/30 [00:02<00:00, 14.06it/s]


In [19]:
# Order the molecules by predicted fpps value, ascending (the sort_values default).
# Note: this rebinds `df`, so its index is no longer in 0..n-1 order for any
# cell that is (re-)run after this one.
df = df.sort_values('fpps')
df

Unnamed: 0,smiles,fpps,mcf7,pc3
13,OC(P(O)(O)=O)(P(O)(O)=O)CN1C=C(CSC2=NC3=C(C=CC...,7.062269,11.490914,9.850511
15,OC(P(O)(O)=O)(P(O)(O)=O)CN1C=C(COC2=NC3=C(C=CC...,7.070434,11.337252,9.89573
17,OC(P(O)(O)=O)(P(O)(O)=O)CN1C=C(CNC2=NC3=C(C=CC...,7.092228,11.136473,9.712812
9,OC(P(O)(O)=O)(P(O)(O)=O)CN1C=C(COC2=NC3=C(C=CC...,7.114997,10.773749,9.355158
26,O=C1CN=C(SCC(P(O)(O)=O)(O)P(O)(C/C=C(C)/C)=O)N1C,7.143977,9.716478,8.814597
21,OC(P(O)(O)=O)(P(O)(O)=O)CN1C=C(COC2=NC3=C(C=CC...,7.177248,10.870735,9.425416
11,OC(P(O)(O)=O)(P(O)(O)=O)CN1C=C(CNC2=NC3=C(C=CC...,7.18696,10.400032,8.995617
7,OC(P(O)(O)=O)(P(O)(O)=O)CN1C=C(CSC2=NC3=C(C=CC...,7.205645,10.248968,8.790536
19,OC(P(O)(O)=O)(P(O)(O)=O)CN1C=C(CSC2=NC3=C(C=CC...,7.21259,11.147018,9.686363
23,OC(P(O)(O)=O)(P(O)(O)=O)CN1C=C(CNC2=NC3=C(C=CC...,7.252548,10.796581,9.057979


In [20]:
# Parse each SMILES into an RDKit molecule and store it in a 'mol' column,
# which is the column handed to the exporter as `molCol`.
mols = [Chem.MolFromSmiles(i) for i in df['smiles']]
df['mol'] = mols
# Write the predictions table to xlsx; size=(100,100) is presumably the
# per-molecule image size -- see RDKit PandasTools docs.
PandasTools.SaveXlsxFromFrame(df, '../result_data/predicted.xlsx', molCol='mol', size=(100,100))