In [1]:
!pip install --user rdkit

Collecting rdkit
  Using cached rdkit-2025.9.5-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Using cached rdkit-2025.9.5-cp311-cp311-manylinux_2_28_x86_64.whl (36.7 MB)
Installing collected packages: rdkit
Successfully installed rdkit-2025.9.5


In [2]:
!pip install --user pandas



In [3]:
!pip install --user torch



In [4]:
!pip install matplotlib



In [5]:
import sys
!{sys.executable} -m pip install --user rdkit

Collecting rdkit
  Using cached rdkit-2025.9.5-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Using cached rdkit-2025.9.5-cp311-cp311-manylinux_2_28_x86_64.whl (36.7 MB)
Installing collected packages: rdkit
Successfully installed rdkit-2025.9.5


In [12]:
import site
print(site.getusersitepackages())

/u/ahernandez9/.local/lib/python3.11/site-packages


In [13]:
import sys, site
sys.path.append(site.getusersitepackages())

In [14]:
from rdkit import Chem 

In [15]:
# import rdkit

In [16]:
import rdkit 
from rdkit import Chem 
from rdkit.Chem import AllChem

In [47]:
import os
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem

exp_file = "200mol_updated.csv"
smiles_col = "smiles"
main_folder = "turbomole_inputs"

ANGSTROM_TO_BOHR = 1.889726125

os.makedirs(main_folder, exist_ok=True)

df = pd.read_csv(exp_file)

for idx, SMILES in enumerate(df[smiles_col]):
    try:
        mol = Chem.MolFromSmiles(SMILES)
        if mol is None:
            print(f"Skipping invalid SMILES at index {idx}")
            continue

        mol = Chem.AddHs(mol)

        params = AllChem.ETKDGv3()
        params.randomSeed = 42

        conf_ids = AllChem.EmbedMultipleConfs(
            mol,
            numConfs=5,
            randomSeed=42,
            useExpTorsionAnglePrefs=True,
            useBasicKnowledge=True,
            maxAttempts=1000
        )

        if not conf_ids:
            print(f"Embedding failed for molecule {idx}")
            continue

        min_energy = None
        best_conf = None

        for conf_id in conf_ids:
            ff = AllChem.UFFGetMoleculeForceField(mol, confId=conf_id)
            if ff is None:
                continue
            ff.Initialize()
            ff.Minimize(maxIts=2000)
            energy = ff.CalcEnergy()

            if min_energy is None or energy < min_energy:
                min_energy = energy
                best_conf = conf_id

        if best_conf is None:
            print(f"Optimization failed for molecule {idx}")
            continue

        mol_dir = os.path.join(main_folder, f"mol_{idx}")
        os.makedirs(mol_dir, exist_ok=True)

        mol.SetProp("_Name", f"mol_{idx}")

        conf = mol.GetConformer(best_conf)

        xyz_path = os.path.join(mol_dir, "structure.xyz")

        with open(xyz_path, 'w') as f:
            f.write(f"{mol.GetNumAtoms()}\n")
            f.write(f"mol_{idx}\n")
            for atom in mol.GetAtoms():
                pos = conf.GetAtomPosition(atom.GetIdx())
                f.write(
                    f"{atom.GetSymbol()} "
                    f"{pos.x:.6f} {pos.y:.6f} {pos.z:.6f}\n"
                )

        coord_path = os.path.join(mol_dir, "coord")

        with open(coord_path, "w") as f:
            f.write("$coord\n")
            for atom in mol.GetAtoms():
                pos = conf.GetAtomPosition(atom.GetIdx())

                x = pos.x * ANGSTROM_TO_BOHR
                y = pos.y * ANGSTROM_TO_BOHR
                z = pos.z * ANGSTROM_TO_BOHR

                symbol = atom.GetSymbol().lower()

                f.write(
                    f"  {x: .8f}  {y: .8f}  {z: .8f}  {symbol}\n"
                )

            f.write("$end\n")

        print(f"Prepared Turbomole input for mol_{idx}")

    except Exception as e:
        print(f"Error at index {idx}: {e}")

        
        

Prepared Turbomole input for mol_0
Prepared Turbomole input for mol_1
Prepared Turbomole input for mol_2
Prepared Turbomole input for mol_3
Prepared Turbomole input for mol_4
Prepared Turbomole input for mol_5
Prepared Turbomole input for mol_6
Prepared Turbomole input for mol_7
Prepared Turbomole input for mol_8
Prepared Turbomole input for mol_9
Prepared Turbomole input for mol_10
Prepared Turbomole input for mol_11
Prepared Turbomole input for mol_12
Prepared Turbomole input for mol_13
Prepared Turbomole input for mol_14
Prepared Turbomole input for mol_15
Prepared Turbomole input for mol_16
Skipping invalid SMILES at index 17
Prepared Turbomole input for mol_18


[22:20:37] SMILES Parse Error: syntax error while parsing: Nc1ccc(cc1N+=O)N+=O
[22:20:37] SMILES Parse Error: check for mistakes around position 12:
[22:20:37] Nc1ccc(cc1N+=O)N+=O
[22:20:37] ~~~~~~~~~~~^
[22:20:37] SMILES Parse Error: extra open parentheses while parsing: Nc1ccc(cc1N+=O)N+=O
[22:20:37] SMILES Parse Error: check for mistakes around position 7:
[22:20:37] Nc1ccc(cc1N+=O)N+=O
[22:20:37] ~~~~~~^
[22:20:37] SMILES Parse Error: Failed parsing SMILES 'Nc1ccc(cc1N+=O)N+=O' for input: 'Nc1ccc(cc1N+=O)N+=O'


Prepared Turbomole input for mol_19
Prepared Turbomole input for mol_20
Prepared Turbomole input for mol_21
Prepared Turbomole input for mol_22
Prepared Turbomole input for mol_23
Prepared Turbomole input for mol_24
Prepared Turbomole input for mol_25
Prepared Turbomole input for mol_26
Prepared Turbomole input for mol_27
Prepared Turbomole input for mol_28
Prepared Turbomole input for mol_29
Prepared Turbomole input for mol_30
Prepared Turbomole input for mol_31
Prepared Turbomole input for mol_32
Prepared Turbomole input for mol_33
Prepared Turbomole input for mol_34
Prepared Turbomole input for mol_35
Skipping invalid SMILES at index 36


[22:20:38] SMILES Parse Error: syntax error while parsing: CCCN+=O
[22:20:38] SMILES Parse Error: check for mistakes around position 5:
[22:20:38] CCCN+=O
[22:20:38] ~~~~^
[22:20:38] SMILES Parse Error: Failed parsing SMILES 'CCCN+=O' for input: 'CCCN+=O'


Prepared Turbomole input for mol_37
Prepared Turbomole input for mol_38
Prepared Turbomole input for mol_39
Prepared Turbomole input for mol_40
Prepared Turbomole input for mol_41
Prepared Turbomole input for mol_42
Prepared Turbomole input for mol_43
Prepared Turbomole input for mol_44
Prepared Turbomole input for mol_45
Prepared Turbomole input for mol_46
Prepared Turbomole input for mol_47
Prepared Turbomole input for mol_48
Prepared Turbomole input for mol_49
Prepared Turbomole input for mol_50
Prepared Turbomole input for mol_51
Prepared Turbomole input for mol_52
Prepared Turbomole input for mol_53
Prepared Turbomole input for mol_54
Prepared Turbomole input for mol_55
Prepared Turbomole input for mol_56
Prepared Turbomole input for mol_57
Prepared Turbomole input for mol_58
Prepared Turbomole input for mol_59
Prepared Turbomole input for mol_60
Prepared Turbomole input for mol_61
Prepared Turbomole input for mol_62
Prepared Turbomole input for mol_63
Prepared Turbomole input for

[22:20:41] UFFTYPER: Unrecognized charge state for atom: 0
[22:20:41] UFFTYPER: Unrecognized charge state for atom: 0
[22:20:41] UFFTYPER: Unrecognized atom type: Co3+3 (0)
[22:20:41] UFFTYPER: Unrecognized atom type: Co3+3 (0)
[22:20:41] UFFTYPER: Unrecognized charge state for atom: 5
[22:20:41] UFFTYPER: Unrecognized charge state for atom: 6
[22:20:41] UFFTYPER: Unrecognized atom type: Co3+3 (7)
[22:20:41] UFFTYPER: Unrecognized atom type: Co3+3 (8)
[22:20:41] UFFTYPER: Unrecognized charge state for atom: 5
[22:20:41] UFFTYPER: Unrecognized charge state for atom: 6
[22:20:41] UFFTYPER: Unrecognized atom type: Co3+3 (7)
[22:20:41] UFFTYPER: Unrecognized atom type: Co3+3 (8)
[22:20:41] UFFTYPER: Unrecognized charge state for atom: 5
[22:20:41] UFFTYPER: Unrecognized charge state for atom: 6
[22:20:41] UFFTYPER: Unrecognized atom type: Co3+3 (7)
[22:20:41] UFFTYPER: Unrecognized atom type: Co3+3 (8)
[22:20:41] UFFTYPER: Unrecognized charge state for atom: 5
[22:20:41] UFFTYPER: Unrecogn

Prepared Turbomole input for mol_66
Prepared Turbomole input for mol_67
Prepared Turbomole input for mol_68
Prepared Turbomole input for mol_69
Prepared Turbomole input for mol_70
Prepared Turbomole input for mol_71
Prepared Turbomole input for mol_72
Prepared Turbomole input for mol_73
Prepared Turbomole input for mol_74
Prepared Turbomole input for mol_75
Prepared Turbomole input for mol_76
Prepared Turbomole input for mol_77
Prepared Turbomole input for mol_78
Prepared Turbomole input for mol_79
Prepared Turbomole input for mol_80
Prepared Turbomole input for mol_81
Prepared Turbomole input for mol_82
Prepared Turbomole input for mol_83
Prepared Turbomole input for mol_84
Prepared Turbomole input for mol_85
Prepared Turbomole input for mol_86
Prepared Turbomole input for mol_87
Skipping invalid SMILES at index 88
Prepared Turbomole input for mol_89
Prepared Turbomole input for mol_90


[22:20:42] UFFTYPER: Unrecognized atom type: Se2+2 (6)
[22:20:42] UFFTYPER: Unrecognized atom type: Se2+2 (6)
[22:20:42] UFFTYPER: Unrecognized atom type: Se2+2 (6)
[22:20:42] UFFTYPER: Unrecognized atom type: Se2+2 (6)
[22:20:42] UFFTYPER: Unrecognized atom type: Se2+2 (6)
[22:20:42] UFFTYPER: Unrecognized atom type: Se2+2 (6)
[22:20:42] SMILES Parse Error: syntax error while parsing: [O-]N+C1=C(C#N)C(=CC=C1)N+=O
[22:20:42] SMILES Parse Error: check for mistakes around position 6:
[22:20:42] [O-]N+C1=C(C#N)C(=CC=C1)N+=O
[22:20:42] ~~~~~^
[22:20:42] SMILES Parse Error: Failed parsing SMILES '[O-]N+C1=C(C#N)C(=CC=C1)N+=O' for input: '[O-]N+C1=C(C#N)C(=CC=C1)N+=O'


Prepared Turbomole input for mol_91
Prepared Turbomole input for mol_92
Prepared Turbomole input for mol_93
Prepared Turbomole input for mol_94
Prepared Turbomole input for mol_95
Prepared Turbomole input for mol_96
Prepared Turbomole input for mol_97
Prepared Turbomole input for mol_98
Prepared Turbomole input for mol_99
Prepared Turbomole input for mol_100
Prepared Turbomole input for mol_101
Prepared Turbomole input for mol_102
Prepared Turbomole input for mol_103
Prepared Turbomole input for mol_104
Prepared Turbomole input for mol_105
Prepared Turbomole input for mol_106
Prepared Turbomole input for mol_107
Skipping invalid SMILES at index 108


[22:20:43] Can't kekulize mol.  Unkekulized atoms: 6 7 8 10 11


Prepared Turbomole input for mol_109
Prepared Turbomole input for mol_110
Prepared Turbomole input for mol_111
Prepared Turbomole input for mol_112
Prepared Turbomole input for mol_113
Prepared Turbomole input for mol_114
Prepared Turbomole input for mol_115
Prepared Turbomole input for mol_116
Prepared Turbomole input for mol_117
Prepared Turbomole input for mol_118
Prepared Turbomole input for mol_119


[22:20:45] UFFTYPER: Unrecognized charge state for atom: 0
[22:20:45] UFFTYPER: Unrecognized atom type: Ce+3 (0)
[22:20:45] UFFTYPER: Unrecognized charge state for atom: 29
[22:20:45] UFFTYPER: Unrecognized atom type: Ce+3 (29)
[22:20:45] UFFTYPER: Unrecognized charge state for atom: 29
[22:20:45] UFFTYPER: Unrecognized atom type: Ce+3 (29)
[22:20:45] UFFTYPER: Unrecognized charge state for atom: 29
[22:20:45] UFFTYPER: Unrecognized atom type: Ce+3 (29)
[22:20:45] UFFTYPER: Unrecognized charge state for atom: 29
[22:20:45] UFFTYPER: Unrecognized atom type: Ce+3 (29)
[22:20:45] UFFTYPER: Unrecognized charge state for atom: 29
[22:20:45] UFFTYPER: Unrecognized atom type: Ce+3 (29)


Prepared Turbomole input for mol_120
Prepared Turbomole input for mol_121
Prepared Turbomole input for mol_122
Prepared Turbomole input for mol_123
Prepared Turbomole input for mol_124
Prepared Turbomole input for mol_125
Prepared Turbomole input for mol_126
Prepared Turbomole input for mol_127
Skipping invalid SMILES at index 128
Prepared Turbomole input for mol_129
Prepared Turbomole input for mol_130


[22:20:47] SMILES Parse Error: extra open parentheses while parsing: C1=NC2=C(N1CC
[22:20:47] SMILES Parse Error: check for mistakes around position 9:
[22:20:47] C1=NC2=C(N1CC
[22:20:47] ~~~~~~~~^
[22:20:47] SMILES Parse Error: Failed parsing SMILES 'C1=NC2=C(N1CC' for input: 'C1=NC2=C(N1CC'


In [None]:
# def smiles_to_xyz(smiles_string):
#     mol = Chem.MolFromSmiles(smiles_string)
#     if mol is None:
#         raise ValueError("Invalid SMILES string at {a}: {smiles_string}")
        
#     # addition of hydrogens
#     mol = Chem.AddHs(mol)
    
#     # Generate 3d coordinates of the molecule
#     success = AllChem.EmbedMolecule(mol, AllChem.ETKDG())
#     if success !=0:
#         raise ValueError("3D embedding failed")
        
#     AllChem.UFFOptimizeMolecule(mol)
    
#     # xyz coordinates extraction
#     conf = mol.GetConformer()
#     xyz_lines = []
#     xyz_lines.append(str(mol.GetNumAtoms()))
#     xyz_lines.append("Generated from SMILES: " + smiles_string)
    
    
#     for atom in mol.GetAtoms():
#         pos = conf.GetAtomPosition(atom.GetIdx())
#         xyz_lines.append(f"{atom.GetSymbol()} {pos.x:.4f} {pos.y:.4f} {pos.z:.4f}")
    
    
#     return "\n".join(xyz_lines)


# # convert this to string
# xyz_string = smiles_to_xyz(ssl)

# # Loop this for all 932 molecules and save it into a XYZ file 

# folder = "XYZcoordinates"

# # file_path = os.path.join(folder, output_file_xyzcoordinates)

# for n, ssl in enumerate(smiles_string_list):
#     try:
#         xyz_string = smiles_to_xyz(ssl)
#         file_name = f"mol_{n+1:04d}.xyz"
#         file_path = os.path.join(folder, file_name)
        
#         with open(file_path, "w") as f:
#             f.write(xyz_string)
            
#         print(f"[{n+1:04d}] Saved {file_name}")
        
#     except Exception as e:
#         print(f"[{n+1:04d}] Failed {ssl}")
#         print(f" Error: {e}")