In [110]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem

In [111]:
# Path to the original data.pkl file
data_path = "D:\Knowledge\Hiwi\python-chebai\data\chebi_v231\ChEBI50\processed\data.pkl"
# data_path1="data\chebi_v231\ChEBI50\processed\data.pkl"

In [112]:
df = pd.read_pickle(
    open(data_path, "rb"
    )
)
df[:5]

Unnamed: 0,id,name,SMILES,1722,2440,2468,2571,2580,2634,3098,...,176910,177333,183508,183509,189832,189840,192499,194321,197504,229684
0,33429,monoatomic monoanion,[*-],False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,30151,aluminide(1-),[Al-],False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,16042,halide anion,[*-],False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,17051,fluoride,[F-],False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,28741,sodium fluoride,[F-].[Na+],False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [113]:
df.shape

(185007, 1514)

In [114]:
result = df[df["SMILES"] == "CC1=C(NC(=C1C(=O)NN=CC2=CC=C(C=C2)N(C)C)C)C(=O)NN=CC3=CC=C(C=C3)N(C)C"]
result

Unnamed: 0,id,name,SMILES,1722,2440,2468,2571,2580,2634,3098,...,176910,177333,183508,183509,189832,189840,192499,194321,197504,229684
16992,112763,"N2,N4-bis[[4-(dimethylamino)phenyl]methylidene...",CC1=C(NC(=C1C(=O)NN=CC2=CC=C(C=C2)N(C)C)C)C(=O...,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [115]:
# Create a new empty DataFrame for storing new variations
new_df = pd.DataFrame(columns=df.columns)


In [116]:
# Function to generate SMILES variations using different configurations
def generate_smiles_variations1(smiles, num_variations=5):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return []  # Return an empty list if conversion fails

    variations = set()

    # Loop through all combinations of doRandom and rootedAtAtom values
    for do_random in [True, False]:
        for rooted_at_atom in [5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5]:
            try:
                # Generate SMILES with the given configuration
                variant = Chem.MolToSmiles(mol, doRandom=do_random, rootedAtAtom=rooted_at_atom)
                if variant != smiles:  # Avoid duplicates with the original SMILES
                    variations.add(variant)
                    # print("len-variations:", len(variations))

                # Check the number of variations after adding
                if len(variations) >= num_variations:
                    return list(variations)  # Return immediately when enough variations are found

            except Exception as e:
                # Skip invalid configurations
                continue

    return list(variations)

In [117]:
import random
from rdkit import Chem
from tqdm import tqdm

# Function to generate SMILES variations using different configurations
def generate_smiles_variations(smiles, num_variations=5):
    """
    Generates a list of SMILES variations based on different configurations.

    Parameters:
    smiles (str): The input SMILES string.
    num_variations (int): The number of SMILES variations to generate.
    canonical (bool): Whether to generate canonical SMILES.

    Returns:
    list: A list of unique SMILES variations.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return []  # Return an empty list if conversion fails

    variations = set()

    # List of rootedAtAtom values to pick from randomly
    rooted_at_atoms = [5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5]
    random.shuffle(rooted_at_atoms)  # Randomize the order of rootedAtAtom values

    # Flag to track if we've already computed a SMILES with doRandom=False and a negative rootedAtAtom
    already_computed_negative_rooted = False
    # Initialize tqdm progress bar for SMILES variation generation
    with tqdm(total=num_variations, desc="Generating SMILES Variations", unit="variant", leave=False) as pbar:
    # Loop through all combinations of doRandom and rootedAtAtom values
        for do_random in [True, False]:
            for rooted_at_atom in rooted_at_atoms:
                try:
                    # Skip redundant computations
                    if not do_random and rooted_at_atom < 0:
                        if already_computed_negative_rooted:
                            continue
                        already_computed_negative_rooted = True

                    # Generate SMILES with the given configuration
                    variant = Chem.MolToSmiles(
                        mol, 
                        doRandom=do_random, 
                        rootedAtAtom=rooted_at_atom, 
                        canonical=False
                    )

                    # Print the configuration and the generated SMILES string
                    # print(f"Config: doRandom={do_random}, rootedAtAtom={rooted_at_atom}, canonical={False} -> SMILES: {variant}")
                    
                    # Avoid duplicates with the original SMILES
                    if variant != smiles:
                        variations.add(variant)
                        pbar.update(1)  # Update tqdm progress bar with each new variant

                    # Check the number of variations after adding
                    if len(variations) >= num_variations:
                        pbar.close()  # Close the progress bar when done
                        return list(variations)  # Return immediately when enough variations are found

                except Exception as e:
                    # Skip invalid configurations
                    continue
    pbar.close()  # Close the progress bar if not already closed
    return list(variations)




In [118]:
# Example usage
smile1="OC(=O)C(C(N)C(O)=O)C"
smile2="[Al](O[Si](O[Si](O[Al]=O)=O)=O)=O.O.O"
smile3="[Cl-].[H][N+]([H])([H])[H]"
smile4="[Ca++].OC[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(O)C([O-])=O.OC[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(O)C([O-])=O"
smile5="C(CC[N+]1(C)CCCC1)(O)(C2CCCCC2)C3=CC=CC=C3.[Cl-]"
variations = generate_smiles_variations(smile5, num_variations=5)
print(variations)

                                                                        

['C(CC[N+]1(C)CCCC1)(C1CCCCC1)(c1ccccc1)O.[Cl-]', 'C(O)(C1CCCCC1)(c1ccccc1)CC[N+]1(CCCC1)C.[Cl-]', 'C(c1ccccc1)(C1CCCCC1)(CC[N+]1(C)CCCC1)O.[Cl-]', '[N+]1(C)(CCCC1)CCC(C1CCCCC1)(O)c1ccccc1.[Cl-]', 'C(O)(C1CCCCC1)(c1ccccc1)CC[N+]1(C)CCCC1.[Cl-]']




In [119]:
# Set to keep track of already seen SMILES to avoid duplicates
seen_smiles = set(df['SMILES'])

In [120]:
test_df=df[-5::]

In [121]:
test_df

Unnamed: 0,id,name,SMILES,1722,2440,2468,2571,2580,2634,3098,...,176910,177333,183508,183509,189832,189840,192499,194321,197504,229684
185011,229518,2-Amino-3-methylsuccinic acid,OC(=O)C(C(N)C(O)=O)C,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
185012,83380,dinocap-4,C\C=C\C(=O)Oc1c(cc([*])cc1[N+]([O-])=O)[N+]([O...,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
185013,140503,kaolin,[Al](O[Si](O[Si](O[Al]=O)=O)=O)=O.O.O,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
185014,81948,tralkoxydim,CCO\N=C(CC)\C1=C(O)CC(CC1=O)c1c(C)cc(C)cc1C,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
185015,140499,kaolinite,[OH-].[OH-].[OH-].[OH-].O=[Si]([O-])O[Si](=O)[...,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [122]:
test_df.nunique()

id        5
name      5
SMILES    5
1722      1
2440      1
         ..
189840    1
192499    1
194321    1
197504    1
229684    1
Length: 1514, dtype: int64

In [123]:
from tqdm import tqdm

In [124]:
# Process each row in the original DataFrame
for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Processing Rows", unit="row"):
    original_smiles = row['SMILES']
    
    # Generate new SMILES variations
    variations = generate_smiles_variations(original_smiles)
    
    # Filter out variations that are already seen
    variations = [var for var in variations if var not in seen_smiles]
    
    for var in variations:
        # Create a new row with the new SMILES and the rest of the features and labels unchanged
        new_row = row.copy()
        new_row['SMILES'] = var
        new_df = pd.concat([new_df, pd.DataFrame([new_row])], ignore_index=True)
        
        # Add the new SMILES to the seen set to avoid duplicates
        seen_smiles.add(var)

Processing Rows:  60%|██████    | 3/5 [00:00<00:00,  6.54row/s][17:59:42] 

****
Range Error
idx
Violation occurred on line 209 in file C:\rdkit\build\temp.win-amd64-cpython-311\Release\rdkit\Code\GraphMol\ROMol.cpp
Failed Expression: 4 < 1
****

[17:59:42] 

****
Range Error
idx
Violation occurred on line 209 in file C:\rdkit\build\temp.win-amd64-cpython-311\Release\rdkit\Code\GraphMol\ROMol.cpp
Failed Expression: 5 < 1
****

[17:59:42] 

****
Range Error
idx
Violation occurred on line 209 in file C:\rdkit\build\temp.win-amd64-cpython-311\Release\rdkit\Code\GraphMol\ROMol.cpp
Failed Expression: 1 < 1
****

[17:59:42] 

****
Range Error
idx
Violation occurred on line 209 in file C:\rdkit\build\temp.win-amd64-cpython-311\Release\rdkit\Code\GraphMol\ROMol.cpp
Failed Expression: 3 < 1
****

[17:59:42] 

****
Range Error
idx
Violation occurred on line 209 in file C:\rdkit\build\temp.win-amd64-cpython-311\Release\rdkit\Code\GraphMol\ROMol.cpp
Failed Expression: 2 < 1
****

Processing Rows: 

In [125]:
new_df.nunique()

id         5
name       5
SMILES    25
1722       1
2440       1
          ..
189840     1
192499     1
194321     1
197504     1
229684     1
Length: 1514, dtype: int64

In [126]:
# Append the new DataFrame (new_df) to the original DataFrame (df)
df_combined = pd.concat([test_df, new_df], ignore_index=True)

In [127]:
df_combined.shape

(30, 1514)

In [128]:
new_data_path="augmented_data.pkl"

In [129]:
pd.to_pickle(df_combined, open(new_data_path, "wb"))

In [130]:
data_df= pd.read_pickle(
    open("augmented_data.pkl", "rb"
    )
)

In [131]:
data_df

Unnamed: 0,id,name,SMILES,1722,2440,2468,2571,2580,2634,3098,...,176910,177333,183508,183509,189832,189840,192499,194321,197504,229684
0,229518,2-Amino-3-methylsuccinic acid,OC(=O)C(C(N)C(O)=O)C,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,83380,dinocap-4,C\C=C\C(=O)Oc1c(cc([*])cc1[N+]([O-])=O)[N+]([O...,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,140503,kaolin,[Al](O[Si](O[Si](O[Al]=O)=O)=O)=O.O.O,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,81948,tralkoxydim,CCO\N=C(CC)\C1=C(O)CC(CC1=O)c1c(C)cc(C)cc1C,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,140499,kaolinite,[OH-].[OH-].[OH-].[OH-].O=[Si]([O-])O[Si](=O)[...,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,229518,2-Amino-3-methylsuccinic acid,C(O)(C(C(N)C(O)=O)C)=O,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,229518,2-Amino-3-methylsuccinic acid,O=C(C(C(C(=O)O)N)C)O,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,229518,2-Amino-3-methylsuccinic acid,OC(=O)C(C(N)C(=O)O)C,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,229518,2-Amino-3-methylsuccinic acid,NC(C(C)C(=O)O)C(=O)O,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,229518,2-Amino-3-methylsuccinic acid,OC(=O)C(C)C(C(=O)O)N,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [132]:
data_df.shape

(30, 1514)

In [133]:
data_df.nunique()

id         5
name       5
SMILES    30
1722       1
2440       1
          ..
189840     1
192499     1
194321     1
197504     1
229684     1
Length: 1514, dtype: int64

In [134]:
def find_smiles_variations(smiles):
    original_smiles = smiles
    mol = Chem.MolFromSmiles(original_smiles)
    smiles_variations=Chem.MolToSmiles(mol,doRandom=True,rootedAtAtom=2,canonical=False)
    return smiles_variations


In [135]:
smile_variations= find_smiles_variations("[Cl-].[H][N+]([H])([H])[H]")

[17:59:42] 

****
Pre-condition Violation
rootedAtomAtom must be less than the number of atoms
Violation occurred on line 534 in file C:\rdkit\build\temp.win-amd64-cpython-311\Release\rdkit\Code\GraphMol\SmilesParse\SmilesWrite.cpp
Failed Expression: params.rootedAtAtom < 0 || static_cast<unsigned int>(params.rootedAtAtom) < mol.getNumAtoms()
****



RuntimeError: Pre-condition Violation
	rootedAtomAtom must be less than the number of atoms
	Violation occurred on line 534 in file Code\GraphMol\SmilesParse\SmilesWrite.cpp
	Failed Expression: params.rootedAtAtom < 0 || static_cast<unsigned int>(params.rootedAtAtom) < mol.getNumAtoms()
	RDKIT: 2024.03.5
	BOOST: 1_85


In [None]:
smile_variations

'C(C(O)=O)(C(C)C(O)=O)N'