# In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from rdkit.DataStructs import FingerprintSimilarity
from multiprocessing import Pool

def calculate_similarity_for_smiles(target_fp, smiles, similarity_threshold):
    """Return ``smiles`` if it is similar enough to the target fingerprint.

    Args:
        target_fp: MACCS fingerprint of the reference molecule, as produced
            by ``MACCSkeys.GenMACCSKeys``.
        smiles: Candidate SMILES string. Non-string values (for example the
            float NaN pandas uses for an empty CSV cell, or ``None``) are
            treated like an invalid SMILES.
        similarity_threshold: Minimum similarity, inclusive, required for
            the candidate to be kept.

    Returns:
        The unchanged ``smiles`` string when its similarity to ``target_fp``
        is at least ``similarity_threshold``; otherwise ``None``. ``None`` is
        also returned when ``smiles`` is not a string or cannot be parsed.
    """
    # Guard before touching RDKit: MolFromSmiles expects a string and raises
    # on other types instead of returning None, which would otherwise abort
    # the whole batch because of a single missing value.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)

    # RDKit signals an unparsable SMILES by returning None.
    if mol is None:
        return None

    candidate_fp = MACCSkeys.GenMACCSKeys(mol)

    # Tanimoto similarity between the target and candidate fingerprints.
    similarity = FingerprintSimilarity(target_fp, candidate_fp)

    return smiles if similarity >= similarity_threshold else None

def calculate_similarity_parallel(csv_file, target_smiles, similarity_threshold=0.7, num_workers=32):
    """Find the molecules in a CSV file that are similar to a target molecule.

    Each entry of the ``SMILES`` column is scored against the MACCS
    fingerprint of ``target_smiles`` in a pool of worker processes.

    Args:
        csv_file: Path (or anything ``pandas.read_csv`` accepts) of a CSV
            file containing a ``SMILES`` column.
        target_smiles: SMILES string of the reference molecule.
        similarity_threshold: Minimum similarity, inclusive, for a candidate
            to be kept.
        num_workers: Number of worker processes in the pool.

    Returns:
        A DataFrame with a single ``SMILES`` column holding the candidates
        that passed the threshold, in their original file order.

    Raises:
        ValueError: If ``target_smiles`` cannot be parsed by RDKit.
        KeyError: If the CSV file has no ``SMILES`` column.
    """
    df = pd.read_csv(csv_file)

    # Fingerprint the target once; every worker task compares against it.
    target_mol = Chem.MolFromSmiles(target_smiles)
    if target_mol is None:
        raise ValueError("Invalid target SMILES string")
    target_fp = MACCSkeys.GenMACCSKeys(target_mol)

    # Empty cells are read as NaN; drop them so a missing value in one row
    # cannot abort the whole run inside a worker.
    candidates = df['SMILES'].dropna()

    tasks = [(target_fp, smiles, similarity_threshold) for smiles in candidates]

    # Do not start worker processes when there is nothing to score.
    results = []
    if tasks:
        with Pool(num_workers) as pool:
            results = pool.starmap(calculate_similarity_for_smiles, tasks)

    # Workers return None for candidates that are invalid or below threshold.
    similar_smiles = [smiles for smiles in results if smiles is not None]

    return pd.DataFrame(similar_smiles, columns=['SMILES'])

# Example usage
#
# The __main__ guard is required because the functions above start a
# multiprocessing pool: with the "spawn" start method each worker re-imports
# this module, and unguarded top-level code would launch the search again.
if __name__ == "__main__":
    csv_file = 'SearchSpace.csv'
    target_smiles = 'CCCCCC(C)CCCCCCCCCOS(=O)(=O)O'
    similar_df80 = calculate_similarity_parallel(csv_file, target_smiles, similarity_threshold=0.8)
    similar_df81 = calculate_similarity_parallel(csv_file, target_smiles, similarity_threshold=0.81)

    print(len(similar_df80))
    print(len(similar_df81))

    # Keep the molecules that pass the 0.8 threshold but not the 0.81 one.
    filtered_df80 = similar_df80[~similar_df80['SMILES'].isin(similar_df81['SMILES'])]

    # Select up to 5 random rows; capping at the number of available rows
    # avoids the ValueError sample() raises when asked for more than exist.
    sample_size = min(5, len(filtered_df80))
    random_rows = filtered_df80.sample(n=sample_size, random_state=1)

    for smile in random_rows['SMILES']:
        print(smile)