In [1]:
%pip install rdkit

Defaulting to user installation because normal site-packages is not writeable
Collecting rdkit
  Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 33.1/33.1 MB 12.3 MB/s eta 0:00:00
Collecting Pillow
  Downloading pillow-10.4.0-cp310-cp310-manylinux_2_28_x86_64.whl (4.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.5/4.5 MB 9.6 MB/s eta 0:00:00
Installing collected packages: Pillow, rdkit
Successfully installed Pillow-10.4.0 rdkit-2024.3.5
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd

In [9]:
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

In [10]:
# Load the raw bioactivity table from a CSV in the working directory.
# Later cells read its `canonical_smiles` and `standard_value` columns.
prep_df = pd.read_csv('UBQT_bind_raw.csv')
# Bare last expression: render the frame as the cell output.
prep_df

Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity_class,standard_value
0,CHEMBL29097,CC(C)c1ccc2c(c1)c(SC(C)(C)C)c(CC(C)(C)C(=O)O)n...,inactive,41839.8
1,CHEMBL274438,CCCC(=O)NCCc1c2n(c3ccc(OC)cc13)CCCc1ccccc1-2,intermediate,5910.0
2,CHEMBL15192,CC1(C)CCC2=C(O1)c1ccccc1C(=O)C2=O,intermediate,8348.1
3,CHEMBL60718,Cc1ccc(S(=O)(=O)N[C@@H](Cc2ccccc2)C(=O)CCl)cc1,inactive,46945.0
4,CHEMBL267014,Clc1ccc(N2CCN(Cc3c[nH]c4ncccc34)CC2)cc1,inactive,41839.8
...,...,...,...,...
22240,CHEMBL4777381,Nc1nc2cc(C3CCNCC3)ccn2c1-c1ccc(O)cc1,inactive,100000.0
22241,CHEMBL5172138,O=C1c2ccccc2C(=O)c2oc(-c3cccnc3)nc21,active,990.0
22242,CHEMBL5196686,CN1CC[C@@H](Oc2ccc(Cl)c(-c3nc4c(o3)C(=O)c3cccc...,intermediate,1900.0
22243,CHEMBL5173239,CN1CC[C@@H](Oc2ccc(Cl)c(-c3nc4c(o3)C(=O)c3cccc...,intermediate,2100.0


In [11]:
def lipinski(smiles, verbose=False):
    """Compute four Lipinski-style descriptors for each SMILES string.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings, one per molecule (for example a DataFrame column).
    verbose : bool, default False
        When True, print every entry that could not be parsed.

    Returns
    -------
    pandas.DataFrame
        One row per input entry, in input order, on a fresh 0..n-1 index,
        with float columns "MW", "LogP", "NumHDonors" and "NumHAcceptors".
        An entry that is not a string, or that RDKit cannot parse, yields a
        row of NaN (and a warning) so the result stays row-aligned with the
        input. Empty input yields an empty frame with the four columns.
    """
    import warnings

    columnNames = ["MW", "LogP", "NumHDonors", "NumHAcceptors"]

    rows = []
    unparsed = []
    for position, elem in enumerate(smiles):
        # MolFromSmiles signals an invalid SMILES by returning None, not by
        # raising; non-string entries (e.g. missing values) are treated alike.
        mol = Chem.MolFromSmiles(elem) if isinstance(elem, str) else None

        if mol is None:
            unparsed.append(position)
            if verbose:
                print(f"lipinski: could not parse entry {position}: {elem!r}")
            rows.append([np.nan] * len(columnNames))
            continue

        rows.append([Descriptors.MolWt(mol),
                     Descriptors.MolLogP(mol),
                     Lipinski.NumHDonors(mol),
                     Lipinski.NumHAcceptors(mol)])

    if unparsed:
        warnings.warn(
            f"lipinski: {len(unparsed)} of {len(rows)} entries could not be "
            "parsed and were filled with NaN"
        )

    # Build the array once (no per-row stacking). reshape(-1, 4) keeps it
    # two-dimensional even for zero or one molecule, which the DataFrame
    # constructor needs in order to match the four column names.
    baseData = np.array(rows, dtype=float).reshape(-1, len(columnNames))
    descriptors = pd.DataFrame(data=baseData, columns=columnNames)

    return descriptors

In [12]:
# Descriptor table (MW, LogP, NumHDonors, NumHAcceptors) for every SMILES in prep_df.
df_lipinski = lipinski(prep_df['canonical_smiles'])

In [13]:
# Bare expression: render the descriptor table as the cell output.
df_lipinski

Unnamed: 0,MW,LogP,NumHDonors,NumHAcceptors
0,472.094,8.01030,1.0,3.0
1,376.500,4.72190,1.0,3.0
2,242.274,2.75210,0.0,3.0
3,351.855,2.69252,1.0,3.0
4,326.831,3.53850,1.0,3.0
...,...,...,...,...
22240,308.385,2.75610,3.0,5.0
22241,276.251,2.51200,0.0,5.0
22242,408.841,3.85330,0.0,6.0
22243,426.831,3.99240,0.0,6.0


In [17]:
# Place the descriptor columns next to the raw table; concat along the
# column axis matches rows by index label.
df_combined = pd.concat(objs=[prep_df, df_lipinski], axis='columns')
df_combined

Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity_class,standard_value,MW,LogP,NumHDonors,NumHAcceptors
0,CHEMBL29097,CC(C)c1ccc2c(c1)c(SC(C)(C)C)c(CC(C)(C)C(=O)O)n...,inactive,41839.8,472.094,8.01030,1.0,3.0
1,CHEMBL274438,CCCC(=O)NCCc1c2n(c3ccc(OC)cc13)CCCc1ccccc1-2,intermediate,5910.0,376.500,4.72190,1.0,3.0
2,CHEMBL15192,CC1(C)CCC2=C(O1)c1ccccc1C(=O)C2=O,intermediate,8348.1,242.274,2.75210,0.0,3.0
3,CHEMBL60718,Cc1ccc(S(=O)(=O)N[C@@H](Cc2ccccc2)C(=O)CCl)cc1,inactive,46945.0,351.855,2.69252,1.0,3.0
4,CHEMBL267014,Clc1ccc(N2CCN(Cc3c[nH]c4ncccc34)CC2)cc1,inactive,41839.8,326.831,3.53850,1.0,3.0
...,...,...,...,...,...,...,...,...
22240,CHEMBL4777381,Nc1nc2cc(C3CCNCC3)ccn2c1-c1ccc(O)cc1,inactive,100000.0,308.385,2.75610,3.0,5.0
22241,CHEMBL5172138,O=C1c2ccccc2C(=O)c2oc(-c3cccnc3)nc21,active,990.0,276.251,2.51200,0.0,5.0
22242,CHEMBL5196686,CN1CC[C@@H](Oc2ccc(Cl)c(-c3nc4c(o3)C(=O)c3cccc...,intermediate,1900.0,408.841,3.85330,0.0,6.0
22243,CHEMBL5173239,CN1CC[C@@H](Oc2ccc(Cl)c(-c3nc4c(o3)C(=O)c3cccc...,intermediate,2100.0,426.831,3.99240,0.0,6.0


In [18]:
def pIC50(input):
    """Replace the 'standard_value_norm' column with a 'pIC50' column.

    Each value is scaled by 10**-9 (nM to M) and converted with -log10.

    Parameters
    ----------
    input : pandas.DataFrame
        Frame containing a numeric 'standard_value_norm' column. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'standard_value_norm' and with 'pIC50' (appended
        last unless a column of that name already existed). Zero or negative
        inputs are not guarded against and give inf / NaN from the logarithm.
    """
    # A positional float array mirrors the original element-by-element loop,
    # independent of the frame's index labels.
    nanomolar = input['standard_value_norm'].to_numpy(dtype=float, na_value=np.nan)
    molar = nanomolar * (10**-9)  # Converts nM to M

    # drop() returns a new frame and assign() another, so the caller's
    # DataFrame is left untouched.
    x = input.drop('standard_value_norm', axis=1).assign(pIC50=-np.log10(molar))

    return x

In [19]:
# Summary statistics of the raw activity values.
df_combined['standard_value'].describe()

count    2.224500e+04
mean     2.864069e+04
std      6.712246e+04
min      3.000000e-01
25%      5.011900e+03
50%      2.511890e+04
75%      3.981070e+04
max      4.466836e+06
Name: standard_value, dtype: float64

In [20]:
def norm_value(input, cap=100000000):
    """Replace 'standard_value' with a capped 'standard_value_norm' column.

    Parameters
    ----------
    input : pandas.DataFrame
        Frame containing a numeric 'standard_value' column. It is not
        modified.
    cap : number, default 100000000
        Upper bound, in the same units as 'standard_value'. Values greater
        than ``cap`` are replaced by ``cap``; everything else (including
        missing values) is kept as is.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'standard_value' and with 'standard_value_norm'
        (appended last unless a column of that name already existed).
    """
    # clip(upper=cap) is the vectorised form of "if value > cap: value = cap".
    # to_numpy() makes the assignment positional, as with the original list.
    capped = input['standard_value'].clip(upper=cap).to_numpy()

    # drop() returns a new frame and assign() another, so the caller's
    # DataFrame is left untouched.
    x = input.drop('standard_value', axis=1).assign(standard_value_norm=capped)

    return x

In [22]:
# Cap the activity values: norm_value swaps 'standard_value' for
# 'standard_value_norm' in the returned frame.
df_norm = df_combined.pipe(norm_value)
df_norm

Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity_class,MW,LogP,NumHDonors,NumHAcceptors,standard_value_norm
0,CHEMBL29097,CC(C)c1ccc2c(c1)c(SC(C)(C)C)c(CC(C)(C)C(=O)O)n...,inactive,472.094,8.01030,1.0,3.0,41839.8
1,CHEMBL274438,CCCC(=O)NCCc1c2n(c3ccc(OC)cc13)CCCc1ccccc1-2,intermediate,376.500,4.72190,1.0,3.0,5910.0
2,CHEMBL15192,CC1(C)CCC2=C(O1)c1ccccc1C(=O)C2=O,intermediate,242.274,2.75210,0.0,3.0,8348.1
3,CHEMBL60718,Cc1ccc(S(=O)(=O)N[C@@H](Cc2ccccc2)C(=O)CCl)cc1,inactive,351.855,2.69252,1.0,3.0,46945.0
4,CHEMBL267014,Clc1ccc(N2CCN(Cc3c[nH]c4ncccc34)CC2)cc1,inactive,326.831,3.53850,1.0,3.0,41839.8
...,...,...,...,...,...,...,...,...
22240,CHEMBL4777381,Nc1nc2cc(C3CCNCC3)ccn2c1-c1ccc(O)cc1,inactive,308.385,2.75610,3.0,5.0,100000.0
22241,CHEMBL5172138,O=C1c2ccccc2C(=O)c2oc(-c3cccnc3)nc21,active,276.251,2.51200,0.0,5.0,990.0
22242,CHEMBL5196686,CN1CC[C@@H](Oc2ccc(Cl)c(-c3nc4c(o3)C(=O)c3cccc...,intermediate,408.841,3.85330,0.0,6.0,1900.0
22243,CHEMBL5173239,CN1CC[C@@H](Oc2ccc(Cl)c(-c3nc4c(o3)C(=O)c3cccc...,intermediate,426.831,3.99240,0.0,6.0,2100.0
