Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Augmentation of fingerprints/SMILES #52

Closed
ansariyusuf opened this issue May 19, 2022 · 1 comment
Closed

Augmentation of fingerprints/SMILES #52

ansariyusuf opened this issue May 19, 2022 · 1 comment

Comments

@ansariyusuf
Copy link

Hi,

I read the paper, and it describes the FP-break and FP-concat techniques for augmenting fingerprints. Is there an example that demonstrates how I can use this package for FP-break/FP-concat?

@CoopLo
Copy link
Collaborator

CoopLo commented Jun 2, 2022

Hello @ansariyusuf. @yuyangw provided these scripts used for FP-break and FP-concat. You should be able to call read_smiles to get the SMILES strings and labels for a given path, target, and task, then use the create_dataset function from either script. Please let me know if you have any further questions.

FP-Break:

import os
import csv
import math
import yaml
import time
import random
import pandas as pd
import networkx as nx
import numpy as np
import xgboost as xgb
from copy import deepcopy

import rdkit
from rdkit import Chem, DataStructs
from rdkit.Chem.rdchem import HybridizationType
from rdkit.Chem.rdchem import BondType as BT
from rdkit.Chem import AllChem
from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles
from rdkit.Chem.BRICS import BRICSDecompose, BreakBRICSBonds, FindBRICSBonds
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')       


from sklearn.metrics import roc_auc_score, mean_absolute_error, accuracy_score
from sklearn.model_selection import GridSearchCV


def _generate_scaffold(smiles, include_chirality=False):
    """Return the Murcko scaffold of *smiles* as a SMILES string."""
    molecule = Chem.MolFromSmiles(smiles)
    return MurckoScaffoldSmiles(mol=molecule, includeChirality=include_chirality)


def generate_scaffolds(smiles_data, log_every_n=1000):
    """Group dataset indices by Murcko scaffold.

    Returns a list of index lists, one per distinct scaffold, ordered from
    the largest scaffold set to the smallest (ties broken by first index).
    """
    total = len(smiles_data)
    print(total)

    print("About to generate scaffolds")
    scaffold_to_indices = {}
    for index, smiles in enumerate(smiles_data):
        if index % log_every_n == 0:
            print("Generating scaffold %d/%d" % (index, total))
        key = _generate_scaffold(smiles)
        scaffold_to_indices.setdefault(key, []).append(index)

    # Largest scaffold sets first; indices within each set are ascending
    # (enumerate already yields them in order, so no per-set sort is needed
    # beyond the defensive one below).
    ordered_sets = sorted(
        (sorted(indices) for indices in scaffold_to_indices.values()),
        key=lambda s: (len(s), s[0]),
        reverse=True,
    )
    return ordered_sets


def scaffold_split(smiles_data, valid_size, test_size, seed=None, log_every_n=1000):
    """Split dataset indices into train/valid/test sets by Murcko scaffold.

    Whole scaffold sets are assigned greedily (largest first) to train until
    the train cutoff is reached, then to valid, then to test, so molecules
    sharing a scaffold never end up in different splits.

    :param smiles_data: sequence of SMILES strings
    :param valid_size: fraction of the data for the validation split
    :param test_size: fraction of the data for the test split
    :param seed: unused; kept for interface compatibility
    :param log_every_n: unused here; kept for interface compatibility
    :return: (train_inds, valid_inds, test_inds) lists of integer indices
    """
    train_size = 1.0 - valid_size - test_size
    scaffold_sets = generate_scaffolds(smiles_data)

    train_cutoff = train_size * len(smiles_data)
    valid_cutoff = (train_size + valid_size) * len(smiles_data)
    # BUG FIX: the original annotated these as ``List[int]`` without importing
    # ``typing.List``, which raises NameError when the function runs.
    train_inds = []
    valid_inds = []
    test_inds = []

    print("About to sort in scaffold sets")
    for scaffold_set in scaffold_sets:
        if len(train_inds) + len(scaffold_set) > train_cutoff:
            if len(train_inds) + len(valid_inds) + len(scaffold_set) > valid_cutoff:
                test_inds += scaffold_set
            else:
                valid_inds += scaffold_set
        else:
            train_inds += scaffold_set
    return train_inds, valid_inds, test_inds


def read_smiles(data_path, target, task):
    """Read SMILES strings and labels for *target* from a CSV file.

    Rows with an unparseable SMILES or an empty label are skipped.

    :param data_path: path to a CSV file with a ``smiles`` column
    :param target: name of the label column to read
    :param task: 'classification' (labels cast to int) or 'regression'
        (labels cast to float)
    :raises ValueError: if *task* is neither of the two supported values
    :return: (smiles_data, labels) parallel lists
    """
    smiles_data, labels = [], []
    with open(data_path) as csv_file:
        csv_reader = csv.DictReader(csv_file, delimiter=',')
        # BUG FIX: DictReader already consumes the header row, so every row
        # yielded here is data; the original ``if i != 0`` guard silently
        # dropped the first data row of the file.
        for row in csv_reader:
            smiles = row['smiles']
            label = row[target]
            mol = Chem.MolFromSmiles(smiles)
            if mol is not None and label != '':
                smiles_data.append(smiles)
                if task == 'classification':
                    labels.append(int(label))
                elif task == 'regression':
                    labels.append(float(label))
                else:
                    # BUG FIX: the original constructed the ValueError without
                    # raising it, so invalid tasks were silently ignored.
                    raise ValueError('task must be either regression or classification')
    print(len(smiles_data))
    return smiles_data, labels


def break_mol(mol):
    """Decompose *mol* into BRICS fragments, keeping non-leaf nodes."""
    fragments = BRICSDecompose(mol, returnMols=True, keepNonLeafNodes=True)
    return list(fragments)


def get_fp(mol, fp_type='RDK'):
    """Compute a fingerprint bit vector for *mol*.

    :param fp_type: 'RDK' for an RDKit topological fingerprint, 'Morgan' for
        a radius-3, 2048-bit Morgan fingerprint
    :raises ValueError: for any other *fp_type*
    """
    if fp_type == 'Morgan':
        return AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=2048)
    if fp_type == 'RDK':
        return Chem.RDKFingerprint(mol)
    raise ValueError('Undefined fingerprint!')


def create_dataset(smiles, labels, fp_type, aug_time=4):
    """Build an FP-Break augmented fingerprint dataset.

    Each molecule contributes its own fingerprint plus, when ``aug_time > 1``,
    the fingerprints of its BRICS fragments whose similarity to the parent
    fingerprint exceeds 0.6. Labels are duplicated alongside each
    augmented fingerprint.

    :return: (features, targets) as numpy arrays
    """
    features, targets = [], []
    for smile, label in zip(smiles, labels):
        molecule = Chem.AddHs(Chem.MolFromSmiles(smile))

        parent_fp = get_fp(molecule, fp_type)
        features.append(np.array(parent_fp))
        targets.append(label)

        if aug_time > 1:
            for fragment in break_mol(molecule):
                fragment_fp = get_fp(fragment, fp_type)
                # Keep only fragments still similar to the parent molecule.
                if DataStructs.FingerprintSimilarity(parent_fp, fragment_fp) > 0.6:
                    features.append(np.array(fragment_fp))
                    targets.append(label)

    return np.array(features), np.array(targets)

FP-Concat:

import os
import csv
import math
import yaml
import time
import random
import pandas as pd
import networkx as nx
import numpy as np
import xgboost as xgb
from copy import deepcopy

import rdkit
from rdkit import Chem, DataStructs
from rdkit.Chem.rdchem import HybridizationType
from rdkit.Chem.rdchem import BondType as BT
from rdkit.Chem import AllChem
from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles
from rdkit.Chem.BRICS import BRICSDecompose, BreakBRICSBonds, FindBRICSBonds
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')       


from sklearn.metrics import roc_auc_score, mean_squared_error, accuracy_score
from sklearn.model_selection import GridSearchCV


def _generate_scaffold(smiles, include_chirality=False):
    """Return the Murcko scaffold SMILES for the given molecule SMILES."""
    parsed = Chem.MolFromSmiles(smiles)
    return MurckoScaffoldSmiles(mol=parsed, includeChirality=include_chirality)


def generate_scaffolds(smiles_data, log_every_n=1000):
    """Bucket dataset indices by Murcko scaffold.

    :param smiles_data: sequence of SMILES strings
    :param log_every_n: progress is printed every this many molecules
    :return: list of index lists, largest scaffold set first
    """
    count = len(smiles_data)
    print(count)

    print("About to generate scaffolds")
    buckets = {}
    for idx, smi in enumerate(smiles_data):
        if idx % log_every_n == 0:
            print("Generating scaffold %d/%d" % (idx, count))
        buckets.setdefault(_generate_scaffold(smi), []).append(idx)

    # Order sets from largest to smallest, tie-breaking on the first index.
    sets = [sorted(members) for members in buckets.values()]
    sets.sort(key=lambda members: (len(members), members[0]), reverse=True)
    return sets


def scaffold_split(smiles_data, valid_size, test_size, seed=None, log_every_n=1000):
    """Split dataset indices into train/valid/test sets by Murcko scaffold.

    Scaffold sets are assigned whole (largest first) to the train split until
    its cutoff is reached, then to valid, then to test, keeping molecules
    with the same scaffold in the same split.

    :param smiles_data: sequence of SMILES strings
    :param valid_size: fraction of data for validation
    :param test_size: fraction of data for testing
    :param seed: unused; kept for interface compatibility
    :param log_every_n: unused here; kept for interface compatibility
    :return: (train_inds, valid_inds, test_inds) lists of integer indices
    """
    train_size = 1.0 - valid_size - test_size
    scaffold_sets = generate_scaffolds(smiles_data)

    train_cutoff = train_size * len(smiles_data)
    valid_cutoff = (train_size + valid_size) * len(smiles_data)
    # BUG FIX: the original used ``List[int]`` annotations without importing
    # ``typing.List``, raising NameError the first time this function ran.
    train_inds = []
    valid_inds = []
    test_inds = []

    print("About to sort in scaffold sets")
    for scaffold_set in scaffold_sets:
        if len(train_inds) + len(scaffold_set) > train_cutoff:
            if len(train_inds) + len(valid_inds) + len(scaffold_set) > valid_cutoff:
                test_inds += scaffold_set
            else:
                valid_inds += scaffold_set
        else:
            train_inds += scaffold_set
    return train_inds, valid_inds, test_inds


def read_smiles(data_path, target, task):
    """Read SMILES strings and labels for *target* from a CSV file.

    Rows whose SMILES cannot be parsed or whose label is empty are skipped.

    :param data_path: path to a CSV file with a ``smiles`` column
    :param target: name of the label column to read
    :param task: 'classification' (labels cast to int) or 'regression'
        (labels cast to float)
    :raises ValueError: if *task* is neither supported value
    :return: (smiles_data, labels) parallel lists
    """
    smiles_data, labels = [], []
    with open(data_path) as csv_file:
        csv_reader = csv.DictReader(csv_file, delimiter=',')
        # BUG FIX: DictReader already consumes the header, so the original
        # ``if i != 0`` guard silently discarded the first data row.
        for row in csv_reader:
            smiles = row['smiles']
            label = row[target]
            mol = Chem.MolFromSmiles(smiles)
            if mol is not None and label != '':
                smiles_data.append(smiles)
                if task == 'classification':
                    labels.append(int(label))
                elif task == 'regression':
                    labels.append(float(label))
                else:
                    # BUG FIX: the original built the ValueError but never
                    # raised it, so an invalid ``task`` passed silently.
                    raise ValueError('task must be either regression or classification')
    print(len(smiles_data))
    return smiles_data, labels


def break_mol(mol):
    """Decompose *mol* into BRICS fragments using a single decomposition pass."""
    return list(BRICSDecompose(mol, returnMols=True, singlePass=True))


def get_fp(mol, fp_type='RDK'):
    """Compute a fingerprint bit vector for *mol*.

    :param fp_type: 'RDK' for an RDKit topological fingerprint, 'Morgan' for
        a radius-3, 2048-bit Morgan fingerprint
    :raises ValueError: for any other *fp_type*
    """
    if fp_type == 'Morgan':
        return AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=2048)
    if fp_type == 'RDK':
        return Chem.RDKFingerprint(mol)
    raise ValueError('Undefined fingerprint!')


def create_dataset(smiles, labels, fp_type, aug_time=4):
    """Build an FP-Concat augmented fingerprint dataset.

    Each sample is the concatenation of three fingerprints. The base sample
    concatenates the parent fingerprint with itself three times; each of the
    ``aug_time - 1`` augmented samples concatenates three fingerprints drawn
    with replacement from the pool of the parent fingerprint plus any
    BRICS-fragment fingerprints whose similarity to the parent exceeds 0.6.

    :return: (aug_dataset, aug_labels) as numpy arrays
    """
    aug_dataset, aug_labels = [], []
    counter = 0
    for s, l in zip(smiles, labels):
        mol = Chem.MolFromSmiles(s)
        mol = Chem.AddHs(mol)

        fp = get_fp(mol, fp_type)
        fps = [fp]

        if aug_time > 1:
            res = break_mol(mol)
            for r in res:
                fp_aug = get_fp(r, fp_type)
                sim = DataStructs.FingerprintSimilarity(fp, fp_aug)
                if sim > 0.6:
                    # BUG FIX: the original appended ``fp`` (the parent
                    # fingerprint) here, so the sampling pool never contained
                    # any fragment fingerprints and every augmented sample
                    # degenerated to three copies of the parent.
                    fps.append(fp_aug)

        # Base (unaugmented) sample: the parent fingerprint tripled.
        aug_fps = [fp] * 3
        aug_fps = np.array(aug_fps).reshape(-1)
        aug_dataset.append(aug_fps)
        aug_labels.append(l)

        for _ in range(aug_time - 1):
            aug_fps = random.choices(fps, k=3)
            aug_fps = np.array(aug_fps).reshape(-1)
            aug_dataset.append(aug_fps)
            aug_labels.append(l)

        # Lightweight progress logging every 100 molecules.
        if counter % 100 == 0:
            print(counter)

        counter += 1

    return np.array(aug_dataset), np.array(aug_labels)

@CoopLo CoopLo closed this as completed Jun 22, 2022
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants