In [1]:
!pip install rdkit --quiet
!pip install scikit-learn
print("Packages downloaded successfully")

Packages downloaded successfully


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem
from rdkit.Chem import AllChem
import numpy as np
from rdkit.DataStructs import ConvertToNumpyArray

In [3]:
compounds = pd.read_csv("~/datasets/compound_bioactivity_data.csv")
df = compounds[["smiles", "pIC50"]]
df.head()

Unnamed: 0,smiles,pIC50
0,COc1ccc(CCN2CCC(C(=O)NO)(S(=O)(=O)c3ccc(OC)cc3...,7.468521
1,COc1ccc(S(=O)(=O)C2(C(=O)NO)CCN(Cc3cccc(OC)c3)...,8.045757
2,CCCCOc1ccc(S(=O)(=O)C2(C(=O)NO)CCN(Cc3ccc(Cl)c...,7.69897
3,O=C(NO)C1(S(=O)(=O)c2ccc(OCc3ccccc3)cc2)CCN(Cc...,8.522879
4,COc1ccc(S(=O)(=O)N(Cc2cccnc2)[C@@H](C(=O)NO)C(...,8.045757


In [4]:
df.loc[:, 'activity'] = (df['pIC50'] >= 5).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'activity'] = (df['pIC50'] >= 5).astype(int)


In [5]:
df

Unnamed: 0,smiles,pIC50,activity
0,COc1ccc(CCN2CCC(C(=O)NO)(S(=O)(=O)c3ccc(OC)cc3...,7.468521,1
1,COc1ccc(S(=O)(=O)C2(C(=O)NO)CCN(Cc3cccc(OC)c3)...,8.045757,1
2,CCCCOc1ccc(S(=O)(=O)C2(C(=O)NO)CCN(Cc3ccc(Cl)c...,7.698970,1
3,O=C(NO)C1(S(=O)(=O)c2ccc(OCc3ccccc3)cc2)CCN(Cc...,8.522879,1
4,COc1ccc(S(=O)(=O)N(Cc2cccnc2)[C@@H](C(=O)NO)C(...,8.045757,1
...,...,...,...
2043,CCCCCCN([C@@H](C(=O)O)C(C)C)S(=O)(=O)c1ccc(-c2...,6.482804,1
2044,CCCCCCCCN([C@@H](C(=O)O)C(C)C)S(=O)(=O)c1ccc(-...,5.397940,1
2045,Cc1ccc(-c2noc(-c3ccc(S(=O)(=O)N(Cc4ccccc4)[C@@...,7.022276,1
2046,Cc1ccc(-c2noc(-c3ccc(S(=O)(=O)N(CCCCCCCCCCCCN=...,6.030118,1


In [6]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

def calculate_morgan_fingerprint(smiles, radius=2, nBits=1024):
    """
    Calculate Morgan fingerprints for a given SMILES string.

    Parameters:
    - smiles (str): SMILES representation of a molecule
    - radius (int): Radius for Morgan fingerprint calculation (default=2)
    - nBits (int): Number of bits in the fingerprint (default=1024)

    Returns:
    - np.ndarray: Array of fingerprint bits
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
        return np.array(fp)
    else:
        return np.zeros(nBits, dtype=np.float32)

def calculate_ap_fingerprint(smiles, nBits=2048):
    """
    Calculate Atom Pair fingerprints for a given SMILES string.

    Parameters:
    - smiles (str): SMILES representation of a molecule
    - nBits (int): Number of bits in the fingerprint (default=2048)

    Returns:
    - np.ndarray: Array of Atom Pair fingerprint bits
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        fp = AllChem.GetHashedAtomPairFingerprintAsBitVect(mol, nBits=nBits)
        return np.array(fp)
    else:
        return np.zeros(nBits, dtype=np.float32)

def calculate_rdk5_fingerprint(smiles, nBits=2048):
    """
    Calculate RDKit 5-bit hashed fingerprints for a given SMILES string.

    Parameters:
    - smiles (str): SMILES representation of a molecule
    - nBits (int): Number of bits for RDKit 5-bit fingerprints (default=2048)

    Returns:
    - np.ndarray: Array of RDKit 5-bit fingerprint bits
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        fp = Chem.RDKFingerprint(mol)
        arr = np.zeros((nBits,), dtype=np.float32)
        ConvertToNumpyArray(fp, arr)
        return arr
    else:
        return np.zeros(nBits, dtype=np.float32)

def get_morgan_fingerprints(df, smiles_column='smiles', radius=2, nBits=1024):
    """
    Generate a DataFrame with Morgan fingerprints for the SMILES strings in the given DataFrame.

    Parameters:
    - df (pd.DataFrame): The DataFrame to which fingerprints will be added
    - smiles_column (str): The column containing SMILES strings (default='smiles')
    - radius (int): Radius for Morgan fingerprint calculation (default=2)
    - nBits (int): Number of bits for Morgan fingerprints (default=1024)

    Returns:
    - pd.DataFrame: DataFrame with Morgan fingerprint columns
    """
    morgan_fingerprints = df[smiles_column].apply(lambda x: calculate_morgan_fingerprint(x, radius, nBits))
    morgan_df = pd.DataFrame(morgan_fingerprints.tolist(), columns=[f'morgan_fp_{i}' for i in range(nBits)])
    return morgan_df

def get_ap_fingerprints(df, smiles_column='smiles', nBits=2048):
    """
    Generate a DataFrame with Atom Pair fingerprints for the SMILES strings in the given DataFrame.

    Parameters:
    - df (pd.DataFrame): The DataFrame to which fingerprints will be added
    - smiles_column (str): The column containing SMILES strings (default='smiles')
    - nBits (int): Number of bits for Atom Pair fingerprints (default=2048)

    Returns:
    - pd.DataFrame: DataFrame with Atom Pair fingerprint columns
    """
    ap_fingerprints = df[smiles_column].apply(lambda x: calculate_ap_fingerprint(x, nBits))
    ap_df = pd.DataFrame(ap_fingerprints.tolist(), columns=[f'ap_fp_{i}' for i in range(nBits)])
    return ap_df

def get_rdk5_fingerprints(df, smiles_column='smiles', nBits=2048):
    """
    Generate a DataFrame with RDKit 5-bit fingerprints for the SMILES strings in the given DataFrame.

    Parameters:
    - df (pd.DataFrame): The DataFrame to which fingerprints will be added
    - smiles_column (str): The column containing SMILES strings (default='smiles')
    - nBits (int): Number of bits for RDKit 5-bit fingerprints (default=2048)

    Returns:
    - pd.DataFrame: DataFrame with RDKit 5-bit fingerprint columns
    """
    rdk5_fingerprints = df[smiles_column].apply(lambda x: calculate_rdk5_fingerprint(x, nBits))
    rdk5_df = pd.DataFrame(rdk5_fingerprints.tolist(), columns=[f'rdk5_fp_{i}' for i in range(nBits)])
    return rdk5_df

# Example usage:
# Assuming train_df and test_df are your original DataFrames with SMILES column

morgan_fp_df = get_morgan_fingerprints(df)
ap_fp_df = get_ap_fingerprints(df)
rdk5_fp_df = get_rdk5_fingerprints(df)



In [10]:
morgan_fp_df.head()

Unnamed: 0,morgan_fp_0,morgan_fp_1,morgan_fp_2,morgan_fp_3,morgan_fp_4,morgan_fp_5,morgan_fp_6,morgan_fp_7,morgan_fp_8,morgan_fp_9,...,morgan_fp_1014,morgan_fp_1015,morgan_fp_1016,morgan_fp_1017,morgan_fp_1018,morgan_fp_1019,morgan_fp_1020,morgan_fp_1021,morgan_fp_1022,morgan_fp_1023
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
ap_fp_df.head()

Unnamed: 0,ap_fp_0,ap_fp_1,ap_fp_2,ap_fp_3,ap_fp_4,ap_fp_5,ap_fp_6,ap_fp_7,ap_fp_8,ap_fp_9,...,ap_fp_2038,ap_fp_2039,ap_fp_2040,ap_fp_2041,ap_fp_2042,ap_fp_2043,ap_fp_2044,ap_fp_2045,ap_fp_2046,ap_fp_2047
0,1,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,1,0,0,0
3,1,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
rdk5_fp_df.head()

Unnamed: 0,rdk5_fp_0,rdk5_fp_1,rdk5_fp_2,rdk5_fp_3,rdk5_fp_4,rdk5_fp_5,rdk5_fp_6,rdk5_fp_7,rdk5_fp_8,rdk5_fp_9,...,rdk5_fp_2038,rdk5_fp_2039,rdk5_fp_2040,rdk5_fp_2041,rdk5_fp_2042,rdk5_fp_2043,rdk5_fp_2044,rdk5_fp_2045,rdk5_fp_2046,rdk5_fp_2047
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
4,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [13]:
# Remove the 'smiles' column from the original dataframe
df_without_smiles = df.drop(columns=['smiles'])

# Concatenate each fingerprint DataFrame with the original data (excluding 'smiles')
morgan_df = pd.concat([df_without_smiles, morgan_fp_df], axis=1)
ap_df = pd.concat([df_without_smiles, ap_fp_df], axis=1)
rdk5_df = pd.concat([df_without_smiles, rdk5_fp_df], axis=1)

In [14]:
morgan_df

Unnamed: 0,pIC50,activity,morgan_fp_0,morgan_fp_1,morgan_fp_2,morgan_fp_3,morgan_fp_4,morgan_fp_5,morgan_fp_6,morgan_fp_7,...,morgan_fp_1014,morgan_fp_1015,morgan_fp_1016,morgan_fp_1017,morgan_fp_1018,morgan_fp_1019,morgan_fp_1020,morgan_fp_1021,morgan_fp_1022,morgan_fp_1023
0,7.468521,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8.045757,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,7.698970,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8.522879,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,8.045757,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2043,6.482804,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2044,5.397940,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2045,7.022276,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2046,6.030118,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [15]:
ap_df

Unnamed: 0,pIC50,activity,ap_fp_0,ap_fp_1,ap_fp_2,ap_fp_3,ap_fp_4,ap_fp_5,ap_fp_6,ap_fp_7,...,ap_fp_2038,ap_fp_2039,ap_fp_2040,ap_fp_2041,ap_fp_2042,ap_fp_2043,ap_fp_2044,ap_fp_2045,ap_fp_2046,ap_fp_2047
0,7.468521,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,8.045757,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,7.698970,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,1,0,0,0
3,8.522879,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,8.045757,1,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2043,6.482804,1,1,1,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2044,5.397940,1,1,1,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2045,7.022276,1,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2046,6.030118,1,1,1,1,0,1,1,1,0,...,0,0,0,0,0,0,1,0,0,0


In [16]:
rdk5_df

Unnamed: 0,pIC50,activity,rdk5_fp_0,rdk5_fp_1,rdk5_fp_2,rdk5_fp_3,rdk5_fp_4,rdk5_fp_5,rdk5_fp_6,rdk5_fp_7,...,rdk5_fp_2038,rdk5_fp_2039,rdk5_fp_2040,rdk5_fp_2041,rdk5_fp_2042,rdk5_fp_2043,rdk5_fp_2044,rdk5_fp_2045,rdk5_fp_2046,rdk5_fp_2047
0,7.468521,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
1,8.045757,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
2,7.698970,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
3,8.522879,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
4,8.045757,1,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2043,6.482804,1,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
2044,5.397940,1,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
2045,7.022276,1,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
2046,6.030118,1,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0


In [17]:
morgan_df.to_csv("~/datasets/morgan_dataset.csv", index=False)
ap_df.to_csv("~/datasets/ap_dataset.csv", index=False)
rdk5_df.to_csv("~/datasets/rdk5_dataset.csv", index=False)