In [45]:
import os
import pandas as pd
pd.options.mode.chained_assignment = None 
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, Lipinski
from rdkit.Chem.SaltRemover import SaltRemover
import numpy as np

In [46]:
raw_data_directory = '/Users/Avi/Dissertation/Data/Non_Curated/Raw'

preprocessed_data_directory = '/Users/Avi/Dissertation/Data/Non_Curated/Preprocessed'
os.makedirs(preprocessed_data_directory, exist_ok=True)

#### Selecting Same Targets as Curated Approach

In [47]:
target_CHEMBL4078 = pd.read_csv(os.path.join(raw_data_directory, 'target_CHEMBL4078.csv.gz'))
target_CHEMBL279 = pd.read_csv(os.path.join(raw_data_directory, 'target_CHEMBL279.csv.gz'))
target_CHEMBL5763 = pd.read_csv(os.path.join(raw_data_directory, 'target_CHEMBL5763.csv.gz'))
target_CHEMBL240 = pd.read_csv(os.path.join(raw_data_directory, 'target_CHEMBL240.csv.gz'))
target_CHEMBL4005 = pd.read_csv(os.path.join(raw_data_directory, 'target_CHEMBL4005.csv.gz'))

In [48]:
print(f"Target CHEMBL4005: {target_CHEMBL4005['compound_chembl_id'].nunique()} unique compounds")
print(f"Target CHEMBL279: {target_CHEMBL279['compound_chembl_id'].nunique()} unique compounds")
print(f"Target CHEMBL5763: {target_CHEMBL5763['compound_chembl_id'].nunique()} unique compounds")
print(f"Target CHEMBL240: {target_CHEMBL240['compound_chembl_id'].nunique()} unique compounds")
print(f"Target CHEMBL4005: {target_CHEMBL4005['compound_chembl_id'].nunique()} unique compounds")

Target CHEMBL4005: 5734 unique compounds
Target CHEMBL279: 7373 unique compounds
Target CHEMBL5763: 3907 unique compounds
Target CHEMBL240: 8748 unique compounds
Target CHEMBL4005: 5734 unique compounds


#### Dropping Duplicate Compound Entries

In [49]:
target_CHEMBL4078 = target_CHEMBL4078.drop_duplicates(subset='compound_chembl_id')
target_CHEMBL279 = target_CHEMBL279.drop_duplicates(subset='compound_chembl_id')
target_CHEMBL5763 = target_CHEMBL5763.drop_duplicates(subset='compound_chembl_id')
target_CHEMBL240 = target_CHEMBL240.drop_duplicates(subset='compound_chembl_id')
target_CHEMBL4005 = target_CHEMBL4005.drop_duplicates(subset='compound_chembl_id')

#### Creating Activity Classes

In [50]:
threshold = 6.5

target_CHEMBL4078['activity'] = target_CHEMBL4078['pchembl_value'].apply(lambda x: 1 if x >= threshold else 0)
target_CHEMBL279['activity'] = target_CHEMBL279['pchembl_value'].apply(lambda x: 1 if x >= threshold else 0)
target_CHEMBL5763['activity'] = target_CHEMBL5763['pchembl_value'].apply(lambda x: 1 if x >= threshold else 0)
target_CHEMBL240['activity'] = target_CHEMBL240['pchembl_value'].apply(lambda x: 1 if x >= threshold else 0)
target_CHEMBL4005['activity'] = target_CHEMBL4005['pchembl_value'].apply(lambda x: 1 if x >= threshold else 0)

#### Cleaning Canonical SMILES

In [51]:
def clean_smiles(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        remover = SaltRemover()
        mol = remover.StripMol(mol)
        if mol is not None:
            clean_smiles = Chem.MolToSmiles(mol, canonical=True)
            return clean_smiles
    return None

In [52]:
target_CHEMBL4078['canonical_smiles'] = target_CHEMBL4078['canonical_smiles'].apply(clean_smiles)
target_CHEMBL279['canonical_smiles'] = target_CHEMBL279['canonical_smiles'].apply(clean_smiles)
target_CHEMBL5763['canonical_smiles'] = target_CHEMBL5763['canonical_smiles'].apply(clean_smiles)
target_CHEMBL240['canonical_smiles'] = target_CHEMBL240['canonical_smiles'].apply(clean_smiles)
target_CHEMBL4005['canonical_smiles'] = target_CHEMBL4005['canonical_smiles'].apply(clean_smiles)

#### Calculating Lipinski's Descriptors

In [53]:
def lipinski(df, smiles_column='canonical_smiles', verbose=False):
    moldata = []
    for elem in df[smiles_column]:
        mol = Chem.MolFromSmiles(elem)
        moldata.append(mol)
       
    baseData = []
    for mol in moldata:
        desc_MolWt = Descriptors.MolWt(mol)
        desc_MolLogP = Descriptors.MolLogP(mol)
        desc_NumHDonors = Lipinski.NumHDonors(mol)
        desc_NumHAcceptors = Lipinski.NumHAcceptors(mol)
        row = [desc_MolWt, desc_MolLogP, desc_NumHDonors, desc_NumHAcceptors]
        baseData.append(row)
    
    columnNames = ["MW", "LogP", "NumHDonors", "NumHAcceptors"]
    descriptors = pd.DataFrame(data=baseData, columns=columnNames, index=df.index)
    
    # Append the new columns to the original DataFrame
    df = pd.concat([df, descriptors], axis=1)
    
    return df

In [54]:
target_CHEMBL4078_combined = lipinski(target_CHEMBL4078)
target_CHEMBL279_combined = lipinski(target_CHEMBL279)
target_CHEMBL5763_combined = lipinski(target_CHEMBL5763)
target_CHEMBL240_combined = lipinski(target_CHEMBL240)
target_CHEMBL4005_combined = lipinski(target_CHEMBL4005)

#### Filtering Data According to Lipinski's Rule of Five

In [55]:
def apply_lipinski(df):
    df['MW_violation'] = df['MW'] > 500
    df['LogP_violation'] = df['LogP'] > 5
    df['NumHDonors_violation'] = df['NumHDonors'] > 5
    df['NumHAcceptors_violation'] = df['NumHAcceptors'] > 10

    df['NumViolations'] = (df['MW_violation'].astype(int) +
                           df['LogP_violation'].astype(int) +
                           df['NumHDonors_violation'].astype(int) +
                           df['NumHAcceptors_violation'].astype(int))

    df_preprocessed = df[df['NumViolations'] < 2]
    df_preprocessed = df_preprocessed[['assay_chembl_id', 'compound_chembl_id', 'pchembl_value', 'activity', 'canonical_smiles']]
    return df_preprocessed

target_CHEMBL4078_preprocessed = apply_lipinski(target_CHEMBL4078_combined)
target_CHEMBL279_preprocessed = apply_lipinski(target_CHEMBL279_combined)
target_CHEMBL5763_preprocessed = apply_lipinski(target_CHEMBL5763_combined)
target_CHEMBL240_preprocessed = apply_lipinski(target_CHEMBL240_combined)
target_CHEMBL4005_preprocessed = apply_lipinski(target_CHEMBL4005_combined)

In [56]:
print(f"Target CHEMBL4078: {target_CHEMBL4078_preprocessed['compound_chembl_id'].nunique()} unique compounds after applying Lipinski's Rule of Five")
print(f"Target CHEMBL279: {target_CHEMBL279_preprocessed['compound_chembl_id'].nunique()} unique compounds after applying Lipinski's Rule of Five")
print(f"Target CHEMBL5763: {target_CHEMBL5763_preprocessed['compound_chembl_id'].nunique()} unique compounds after applying Lipinski's Rule of Five")
print(f"Target CHEMBL240: {target_CHEMBL240_preprocessed['compound_chembl_id'].nunique()} unique compounds after applying Lipinski's Rule of Five")
print(f"Target CHEMBL4005: {target_CHEMBL4005_preprocessed['compound_chembl_id'].nunique()} unique compounds after applying Lipinski's Rule of Five")

Target CHEMBL4078: 4042 unique compounds after applying Lipinski's Rule of Five
Target CHEMBL279: 5990 unique compounds after applying Lipinski's Rule of Five
Target CHEMBL5763: 3173 unique compounds after applying Lipinski's Rule of Five
Target CHEMBL240: 7826 unique compounds after applying Lipinski's Rule of Five
Target CHEMBL4005: 5322 unique compounds after applying Lipinski's Rule of Five


In [57]:
target_CHEMBL4078_preprocessed.to_csv(os.path.join(preprocessed_data_directory, 'Target_CHEMBL4078_Non_Curated_Preprocessed.csv'))
target_CHEMBL279_preprocessed.to_csv(os.path.join(preprocessed_data_directory, 'Target_CHEMBL279_Non_Curated_Preprocessed.csv'))
target_CHEMBL5763_preprocessed.to_csv(os.path.join(preprocessed_data_directory, 'Target_CHEMBL5763_Non_Curated_Preprocessed.csv'))
target_CHEMBL240_preprocessed.to_csv(os.path.join(preprocessed_data_directory, 'Target_CHEMBL240_Non_Curated_Preprocessed.csv'))
target_CHEMBL4005_preprocessed.to_csv(os.path.join(preprocessed_data_directory, 'Target_CHEMBL4005_Non_Curated_Preprocessed.csv'))