In [18]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
from pubchempy import get_cids, get_properties
import os
import pickle
import lmdb
from rdkit import Chem
from tqdm import tqdm
from rdkit.Chem import AllChem
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')  
import warnings
warnings.filterwarnings(action='ignore')
from multiprocessing import Pool, cpu_count
import random
import sys
from sklearn.cluster import KMeans
from rdkit.Chem import rdMolTransforms
from rdkit.Chem.rdMolAlign import AlignMolConformers
import re
import json
import copy
import time
import threading
from rdkit.Chem import Crippen
from rdkit.Chem.Descriptors import MolWt, MolLogP, HeavyAtomMolWt
from joblib import Parallel, delayed



In [19]:
# Value passed to get_cids as the `Threshold` keyword of the similarity search.
threshold = 50
# Similarity search against PubChem (through pubchempy) for the query SMILES.
# NOTE(review): despite its name, `sim_smiles2` is later handed to get_properties
# with namespace='cid', so it presumably holds CIDs rather than SMILES — confirm.
# NOTE(review): `as_dataframe` does not look like a get_cids option; it is probably
# just forwarded as an extra request parameter — verify against pubchempy.
sim_smiles2 = get_cids("CCCCCCC(C)CCCCCCCCCOS(=O)(=O)O",'smiles', searchtype='similarity', Threshold = threshold, as_dataframe=True)

In [20]:
# Number of entries returned by the similarity search.
len(sim_smiles2)

688709

In [21]:
import time
from pandas import DataFrame

def process_in_batches(data, batch_size):
    """Lazily split ``data`` into consecutive slices of ``batch_size`` items.

    Slices are produced in order with plain slicing (``data[start:stop]``),
    so the last one is shorter whenever ``len(data)`` is not a multiple of
    ``batch_size``. Nothing is yielded for empty input.
    """
    offsets = range(0, len(data), batch_size)
    for start in offsets:
        stop = start + batch_size
        yield data[start:stop]

# `sim_smiles2` holds the identifiers returned by the similarity search. They are
# submitted to PubChem with namespace='cid', in fixed-size batches so that no
# single request carries the whole list.
batch_size = 20000
results = []

for batch in process_in_batches(sim_smiles2, batch_size):
    batch_result = get_properties(['CanonicalSMILES'], batch, namespace='cid', searchtype=None, as_dataframe=True)
    results.append(batch_result)
    time.sleep(2)  # Pause 2 seconds between requests (presumably to throttle calls to PubChem)

# Combine all batch results into a single DataFrame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported equivalent. pd.concat raises on an empty list, so fall back to an
# empty frame when no batch was fetched (matching what append used to produce).
combined_results = pd.concat(results, ignore_index=True) if results else DataFrame()


In [22]:
# Persist the fetched property table to disk; the row index is not written.
combined_results.to_csv('50.csv', index=False)

In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from rdkit.DataStructs import FingerprintSimilarity
from multiprocessing import Pool

def calculate_similarity_for_smiles(target_fp, smiles, similarity_threshold):
    """Return ``smiles`` if its MACCS fingerprint is similar enough to ``target_fp``.

    Parameters
    ----------
    target_fp
        MACCS-keys fingerprint of the reference molecule.
    smiles
        Candidate SMILES string. Non-string values (for example the NaN that
        pandas produces for an empty CSV cell) are treated as invalid.
    similarity_threshold
        Minimum similarity, inclusive, required to keep the candidate.

    Returns
    -------
    str or None
        The unchanged ``smiles`` when the similarity is at least
        ``similarity_threshold``; ``None`` when the input is not a string,
        cannot be parsed by RDKit, or falls below the threshold.
    """
    # Reject non-string input up front: handing NaN/None to MolFromSmiles raises
    # instead of returning None, which would abort an entire parallel run.
    if not isinstance(smiles, str):
        return None

    # Convert the SMILES to an RDKit molecule; None signals an unparsable SMILES.
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Compute the MACCS fingerprint for the molecule
    fp = MACCSkeys.GenMACCSKeys(mol)

    # Calculate the Tanimoto similarity
    similarity = FingerprintSimilarity(target_fp, fp)

    # Return the SMILES string if the similarity is above the threshold
    return smiles if similarity >= similarity_threshold else None

def calculate_similarity_parallel(csv_file, target_smiles, similarity_threshold=0.7, num_workers=32,
                                  output_file='SearchSpace.csv'):
    """Filter the SMILES in ``csv_file`` by MACCS similarity to ``target_smiles``.

    Parameters
    ----------
    csv_file
        Path to a CSV file that has a ``CanonicalSMILES`` column.
    target_smiles
        SMILES of the reference molecule.
    similarity_threshold
        Minimum similarity, inclusive, for a candidate to be kept.
    num_workers
        Number of worker processes in the multiprocessing pool.
    output_file
        Path of the CSV written with the retained SMILES in a single
        ``SimilarSMILES`` column. Defaults to ``'SearchSpace.csv'``.

    Returns
    -------
    int
        Number of SMILES that met the threshold.

    Raises
    ------
    ValueError
        If ``target_smiles`` cannot be parsed by RDKit.
    """
    # Read the CSV file
    df = pd.read_csv(csv_file)

    # Convert the target molecule to a RDKit molecule and compute its MACCS fingerprint
    target_mol = Chem.MolFromSmiles(target_smiles)
    if target_mol is None:
        raise ValueError("Invalid target SMILES string")
    target_fp = MACCSkeys.GenMACCSKeys(target_mol)

    # Empty CSV cells are read back as NaN; drop them so the workers only ever
    # receive real SMILES strings.
    candidates = df['CanonicalSMILES'].dropna()
    tasks = [(target_fp, smiles, similarity_threshold) for smiles in candidates]

    # Use multiprocessing pool to parallelize the computation
    with Pool(num_workers) as pool:
        results = pool.starmap(calculate_similarity_for_smiles, tasks)

    # Filter out None and collect valid SMILES strings
    similar_smiles = [smiles for smiles in results if smiles is not None]

    # Create a DataFrame with similar molecules
    similar_df = pd.DataFrame(similar_smiles, columns=['SimilarSMILES'])

    # Save the DataFrame to CSV
    similar_df.to_csv(output_file, index=False)

    return len(similar_smiles)

# Example usage: filter the saved PubChem results down to the molecules whose
# MACCS similarity to the target meets the function's default threshold.
csv_file = '50.csv'
# NOTE(review): this SMILES has one fewer carbon before the branch than the
# string used for the PubChem similarity query earlier in the notebook
# ("CCCCCCC(C)CCCCCCCCCOS(=O)(=O)O") — confirm which molecule is intended.
target_smiles = 'CCCCCC(C)CCCCCCCCCOS(=O)(=O)O'
number_of_similar_molecules = calculate_similarity_parallel(csv_file, target_smiles)
print(f"Number of similar molecules: {number_of_similar_molecules}")


[03:09:21] Explicit valence for atom # 4 Cl, 3, is greater than permitted
[03:09:21] Explicit valence for atom # 38 Br, 3, is greater than permitted
[03:09:21] Explicit valence for atom # 34 Cl, 3, is greater than permitted
[03:09:21] Explicit valence for atom # 37 Cl, 3, is greater than permitted
[03:09:22] Explicit valence for atom # 9 Cl, 3, is greater than permitted
[03:09:22] Explicit valence for atom # 8 S, 8, is greater than permitted
[03:09:22] Explicit valence for atom # 2 Cl, 3, is greater than permitted
[03:09:22] Explicit valence for atom # 22 Cl, 3, is greater than permitted
[03:09:22] Explicit valence for atom # 13 Br, 3, is greater than permitted
[03:09:22] Explicit valence for atom # 1 Br, 3, is greater than permitted
[03:09:22] Explicit valence for atom # 20 Br, 3, is greater than permitted
[03:09:22] Explicit valence for atom # 5 Cl, 3, is greater than permitted
[03:09:22] Explicit valence for atom # 2 Cl, 3, is greater than permitted
[03:09:23] Explicit valence for a

Number of similar molecules: 29861
