In [None]:
! pip install chembl_webresource_client rdkit

Collecting chembl_webresource_client
  Downloading chembl_webresource_client-0.10.8-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.2/55.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rdkit
  Downloading rdkit-2023.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.7/29.7 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
Collecting requests-cache~=0.7.0 (from chembl_webresource_client)
  Downloading requests_cache-0.7.5-py3-none-any.whl (39 kB)
Collecting attrs<22.0,>=21.2 (from requests-cache~=0.7.0->chembl_webresource_client)
  Downloading attrs-21.4.0-py2.py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting url-normalize<2.0,>=1.4 (from requests-cache~=0.7.0->chembl_webresource_client)
  Downloading url_normalize-1.4.3-py2.py3-none-any.

## **Load bioactivity data**

In [None]:
import pandas as pd

In [None]:
# Colab-only step: mount Google Drive under /content/gdrive/ so the dataset path read below resolves.
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [None]:
# Pre-processed JAK3 bioactivity table stored on the mounted shared drive.
bioactivity_csv = '/content/gdrive/Shareddrives/1:1 Awani Gadre/Dataset/JAK3_processed_PIC50.csv'
df = pd.read_csv(bioactivity_csv)

In [None]:
# (rows, columns) of the table as loaded
print(df.shape)

(4154, 2)


In [None]:
# Column names, dtypes and non-null counts of the loaded table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4154 entries, 0 to 4153
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   canonical_smiles  4154 non-null   object 
 1   pIC50             4154 non-null   float64
dtypes: float64(1), object(1)
memory usage: 65.0+ KB


In [None]:
# Number of rows with a missing SMILES string; the descriptor functions below parse this column.
df['canonical_smiles'].isnull().sum()

0

## **Calculate Lipinski descriptors**
Christopher Lipinski, a scientist at Pfizer, came up with a set of rules of thumb for evaluating the **druglikeness** of compounds. Druglikeness is judged here by Absorption, Distribution, Metabolism and Excretion (ADME), also known as the pharmacokinetic profile. Lipinski analyzed all orally active FDA-approved drugs to formulate what is now known as the **Rule-of-Five** or **Lipinski's Rule**.

Lipinski's Rule states the following:
* Molecular weight < 500 Dalton
* Octanol-water partition coefficient (LogP) ≤ 5
* Hydrogen bond donors ≤ 5
* Hydrogen bond acceptors ≤ 10

### **Calculate descriptors**

In [None]:
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski
import numpy as np

def extract_lipinski_descriptors(raw):
    """
    Add the four Lipinski Rule-of-Five descriptors to one row of the dataset.

    Lipinski's Rule states the following:
        * Molecular weight < 500 Dalton
        * Octanol-water partition coefficient (LogP) <= 5
        * Hydrogen bond donors <= 5
        * Hydrogen bond acceptors <= 10

    Meant for row-wise use, e.g. ``df.apply(extract_lipinski_descriptors, axis=1)``.

    @param raw
    row (pandas Series or any mutable mapping) holding the SMILES string
    under the key "canonical_smiles"

    @return
    the same row object, modified in place, with "MolWt", "MolLogP",
    "NumHDonors" and "NumHAcceptors" added; all four are NaN when the
    SMILES is missing (not a string) or cannot be parsed
    """
    # output column -> RDKit function computing it
    descriptor_functions = {
        "MolWt": Descriptors.MolWt,
        "MolLogP": Descriptors.MolLogP,
        "NumHDonors": Lipinski.NumHDonors,
        "NumHAcceptors": Lipinski.NumHAcceptors,
    }

    #get the canonical smiles formula
    smile = raw["canonical_smiles"]
    # MolFromSmiles returns None for a SMILES it cannot parse, and rejects
    # non-string input (e.g. NaN from an empty CSV cell); both cases are
    # treated as "no molecule" instead of crashing the whole apply().
    processed_smile = Chem.MolFromSmiles(smile) if isinstance(smile, str) else None

    #extract the lipinski features
    for descriptor_name, descriptor_function in descriptor_functions.items():
        if processed_smile is None:
            raw[descriptor_name] = np.nan
        else:
            raw[descriptor_name] = descriptor_function(processed_smile)

    return raw

In [None]:
def extract_lipinski_and_other_descriptors(raw):
    """
    Compute every descriptor listed in ``Descriptors._descList`` for one row.

    The SMILES string stored under "canonical_smiles" is parsed once and each
    (name, function) pair of the descriptor list is evaluated on the molecule.
    Meant for row-wise use with ``df.apply(..., axis=1)``.

    Relies on ``Chem``, ``Descriptors`` and ``np`` imported in the cell above,
    instead of re-importing them on every row.

    @param raw
    row (pandas Series or any mutable mapping) holding the SMILES string
    under the key "canonical_smiles"

    @return
    the same row object, modified in place, with one new entry per descriptor
    name; a descriptor that raises is stored as NaN, and every descriptor is
    NaN when the SMILES is missing (not a string) or cannot be parsed
    """
    #get the canonical smiles formula
    smile = raw["canonical_smiles"]
    # MolFromSmiles returns None for an unparsable SMILES and rejects
    # non-string input such as NaN; treat both as "no molecule".
    processed_smile = Chem.MolFromSmiles(smile) if isinstance(smile, str) else None

    if processed_smile is None:
        # Report the bad row once rather than letting every single descriptor
        # fail and print its own error message.
        print(f"Could not parse SMILES {smile!r}; all descriptors set to NaN")
        for descriptor_name, _ in Descriptors._descList:
            raw[descriptor_name] = np.nan
        return raw

    #loop through descriptor functions of rdkit descriptors
    for descriptor_name, descriptor_function in Descriptors._descList:
        try:
            raw[descriptor_name] = descriptor_function(processed_smile)
        except Exception as error:
            # Deliberately best-effort: one failing descriptor must not abort
            # the row, but say which descriptor and molecule were affected.
            print(f"{descriptor_name} failed for {smile!r}: {error}")
            raw[descriptor_name] = np.nan

    return raw

# Featurise the whole dataset: the extractor is called once per row (axis=1)
df = df.apply(extract_lipinski_and_other_descriptors, axis=1)

In [None]:
# Preview the first rows of the descriptor-augmented table.
df.head()

Unnamed: 0,canonical_smiles,pIC50,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,O=C1NCc2c(-c3ccc(F)cc3F)cc(C3CCNCC3)cc2N1c1c(C...,5.0,14.911933,14.911933,0.199792,-0.653809,0.4277,488.365,467.197,487.102974,...,0,0,0,0,0,0,0,0,0,1
1,CC(C)(C)c1nc2c3ccc(F)cc3c3c(=O)[nH]ccc3c2[nH]1,8.30103,13.750318,13.750318,0.153872,-0.367608,0.482368,309.344,293.216,309.12774,...,0,0,0,0,0,0,0,0,0,0
2,Cc1cccc(Cl)c1NC(=O)c1cnc(NC(=O)C2CC2)s1,7.356547,12.251818,12.251818,0.024763,-0.291092,0.894383,335.816,321.704,335.049525,...,0,0,0,0,0,1,0,0,0,0
3,O=C1Nc2ccc(NC(=O)c3ccccc3)cc2/C1=C/c1ccc[nH]1,6.060481,12.303131,12.303131,0.156356,-0.187287,0.640131,329.359,314.239,329.116427,...,0,0,0,0,0,0,0,0,0,0
4,O=C1Nc2ccc(C(=O)O)cc2/C1=C/c1ccc[nH]1,6.67162,11.897902,11.897902,0.159848,-1.012618,0.718379,254.245,244.165,254.069142,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Dimensions after the descriptor columns were added.
df.shape

(4154, 211)

In [None]:
# Remove the SMILES text column; `columns=` already names the axis, so no `axis` argument is needed.
df1 = df.drop(columns=['canonical_smiles'])

In [None]:
# Dimensions after removing the SMILES column.
df1.shape

(4154, 210)

In [None]:
# Print every column name of df1, one per line.
# Iterate over df1.columns directly so the loop variable no longer shadows the collection it walks.
for column_name in df1.columns:
    print(column_name)

pIC50
MaxAbsEStateIndex
MaxEStateIndex
MinAbsEStateIndex
MinEStateIndex
qed
MolWt
HeavyAtomMolWt
ExactMolWt
NumValenceElectrons
NumRadicalElectrons
MaxPartialCharge
MinPartialCharge
MaxAbsPartialCharge
MinAbsPartialCharge
FpDensityMorgan1
FpDensityMorgan2
FpDensityMorgan3
BCUT2D_MWHI
BCUT2D_MWLOW
BCUT2D_CHGHI
BCUT2D_CHGLO
BCUT2D_LOGPHI
BCUT2D_LOGPLOW
BCUT2D_MRHI
BCUT2D_MRLOW
AvgIpc
BalabanJ
BertzCT
Chi0
Chi0n
Chi0v
Chi1
Chi1n
Chi1v
Chi2n
Chi2v
Chi3n
Chi3v
Chi4n
Chi4v
HallKierAlpha
Ipc
Kappa1
Kappa2
Kappa3
LabuteASA
PEOE_VSA1
PEOE_VSA10
PEOE_VSA11
PEOE_VSA12
PEOE_VSA13
PEOE_VSA14
PEOE_VSA2
PEOE_VSA3
PEOE_VSA4
PEOE_VSA5
PEOE_VSA6
PEOE_VSA7
PEOE_VSA8
PEOE_VSA9
SMR_VSA1
SMR_VSA10
SMR_VSA2
SMR_VSA3
SMR_VSA4
SMR_VSA5
SMR_VSA6
SMR_VSA7
SMR_VSA8
SMR_VSA9
SlogP_VSA1
SlogP_VSA10
SlogP_VSA11
SlogP_VSA12
SlogP_VSA2
SlogP_VSA3
SlogP_VSA4
SlogP_VSA5
SlogP_VSA6
SlogP_VSA7
SlogP_VSA8
SlogP_VSA9
TPSA
EState_VSA1
EState_VSA10
EState_VSA11
EState_VSA2
EState_VSA3
EState_VSA4
EState_VSA5
EState_VSA6
EStat

In [None]:
# Missing-value count for each column (pandas abbreviates the printed Series when it is long)
nan_counts = df1.isna().sum()
print(nan_counts)

pIC50                0
MaxAbsEStateIndex    0
MaxEStateIndex       0
MinAbsEStateIndex    0
MinEStateIndex       0
                    ..
fr_thiazole          0
fr_thiocyan          0
fr_thiophene         0
fr_unbrch_alkane     0
fr_urea              0
Length: 210, dtype: int64


In [None]:
# Same per-column missing-value count as the previous cell, written with isnull(); the recorded outputs match.
df1.isnull().sum()

pIC50                0
MaxAbsEStateIndex    0
MaxEStateIndex       0
MinAbsEStateIndex    0
MinEStateIndex       0
                    ..
fr_thiazole          0
fr_thiocyan          0
fr_thiophene         0
fr_unbrch_alkane     0
fr_urea              0
Length: 210, dtype: int64

In [None]:
# Keep only the names of columns holding at least one NaN
has_nan = df1.isna().any()
columns_with_nan = df1.columns[has_nan]
print(columns_with_nan)

Index(['BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO',
       'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW'],
      dtype='object')


In [None]:
# Report how many values are missing in each affected column.
# Count on df1, the frame columns_with_nan was derived from, rather than the earlier df.
for col in columns_with_nan:
    nan_sum = df1[col].isna().sum()
    print(f"Column '{col}': Sum of NaN values = {nan_sum}")

Column 'BCUT2D_MWHI': Sum of NaN values = 7
Column 'BCUT2D_MWLOW': Sum of NaN values = 7
Column 'BCUT2D_CHGHI': Sum of NaN values = 7
Column 'BCUT2D_CHGLO': Sum of NaN values = 7
Column 'BCUT2D_LOGPHI': Sum of NaN values = 7
Column 'BCUT2D_LOGPLOW': Sum of NaN values = 7
Column 'BCUT2D_MRHI': Sum of NaN values = 7
Column 'BCUT2D_MRLOW': Sum of NaN values = 7


In [None]:
# Discard every row with at least one missing descriptor; the original row labels are kept, not renumbered.
df2 = df1.dropna(axis=0, how='any')

In [None]:
# Dimensions after dropping rows with missing values.
df2.shape

(4147, 210)

### Generate Morgan fingerprints


In [None]:
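# NOTE(review): disabled draft, never executed. Before re-enabling, fix the
# `fingerprint_array = ...` statement: `indices` is not defined anywhere in this
# function, so that line would raise NameError.
# def smiles_to_morgan_fingerprints_array(dataframe, smiles_column, radius=2, nBits=2048):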
#     """
#     Convert SMILES notations in a DataFrame to a numpy array of Morgan fingerprints.

#     Parameters:
#         dataframe (pd.DataFrame): DataFrame containing SMILES notations.
#         smiles_column (str): Name of the column containing SMILES.
#         radius (int): Radius parameter for Morgan fingerprint calculation.
#         nBits (int): Number of bits for the fingerprint.

#     Returns:
#         np.ndarray: Numpy array containing Morgan fingerprints.
#     """

#     from rdkit import Chem
#     from rdkit.Chem import AllChem
#     import numpy as np

#     fingerprints = []
#     for smiles in dataframe[smiles_column]:
#         mol = Chem.MolFromSmiles(smiles)
#         fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)
#         fingerprints.append(fingerprint)

#     fingerprint_array = np.array(fingerprints)[indices.astype(int)]
#     return fingerprint_array



In [None]:
# Repeat of the earlier shape check on df2; nothing has modified it since.
df2.shape

(4147, 210)

In [None]:
# Column labels of the cleaned table (pIC50 followed by the descriptor columns).
df2.columns

Index(['pIC50', 'MaxAbsEStateIndex', 'MaxEStateIndex', 'MinAbsEStateIndex',
       'MinEStateIndex', 'qed', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt',
       'NumValenceElectrons',
       ...
       'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene',
       'fr_tetrazole', 'fr_thiazole', 'fr_thiocyan', 'fr_thiophene',
       'fr_unbrch_alkane', 'fr_urea'],
      dtype='object', length=210)

In [None]:
# Index range, dtype breakdown and memory footprint of the cleaned table.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4147 entries, 0 to 4153
Columns: 210 entries, pIC50 to fr_urea
dtypes: float64(106), int64(104)
memory usage: 6.7 MB


In [None]:
# Per-column summary statistics of the cleaned descriptor table,
# keyed by the header each statistic gets in the output file.
statistics = {
    label: getattr(df2, method)()
    for label, method in [('Min', 'min'), ('Max', 'max'), ('Std', 'std'), ('Avg', 'mean')]
}

# One row per descriptor column, one column per statistic
statistics_df = pd.DataFrame(statistics)

# Written to the current working directory; the index (column names) is labelled 'Column'
statistics_df.to_csv('column_statistics.csv', index_label='Column')

In [None]:
# Descriptor columns excluded from the final table.
# NOTE(review): the selection criterion is not recorded in this notebook;
# presumably it was read off column_statistics.csv. Confirm and document it.
columns_to_drop = [
    'NumRadicalElectrons',
    'SMR_VSA8',
    'SlogP_VSA9',
    'fr_barbitur',
    'fr_benzodiazepine',
    'fr_diazo',
    'fr_dihydropyridine',
    'fr_isocyan',
    'fr_isothiocyan',
    'fr_nitroso',
    'fr_prisulfonamd',
    'fr_quatN',
    'fr_thiocyan',
    'BalabanJ',
]
# `columns=` already names the axis, so no `axis` argument is needed
df_final = df2.drop(columns=columns_to_drop)

In [None]:
# How many columns are being removed
print(len(columns_to_drop))

14


In [None]:
# Dimensions of the final table after removing the columns listed in columns_to_drop.
df_final.shape

(4147, 196)

In [None]:
# Persist the final descriptor table on the shared drive; index=False leaves the row labels out of the file.
descriptor_csv = '/content/gdrive/Shareddrives/1:1 Awani Gadre/Dataset/JAK3_descriptor_pIC50.csv'
df_final.to_csv(descriptor_csv, index=False)

---