In [1]:
import torch
import json
import os

from rdkit import Chem
from rdkit.Chem import AllChem

# Pick the compute device: use the GPU through CUDA when PyTorch reports one
# as available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [2]:
# Source SMILES file; only read when the database is created from scratch
# (otherwise, it is not being used)
data_path = '../MP/MOSES_dataset.txt'

# Define folder in which all data will be stored
data_folder = 'data/MOSES_dataset'

# Define name for storing dataset basic description
dataset_parameters_name = f'{data_folder}/dataset_parameters.json'

# Define basic dataset parameters for tracking data
dataset_parameters = {
    'input_folder': data_path,
    'output_folder': data_folder
}

# Create the output folder together with any missing parent folder ('data/');
# exist_ok makes the cell safe to re-run when the folder is already there
os.makedirs(data_folder, exist_ok=True)

# Dump the dataset description to a JSON file
with open(dataset_parameters_name, 'w') as json_file:
    json.dump(dataset_parameters, json_file)

# Generation of POSCAR files from SMILES

In [3]:
def smiles_to_poscar(smiles, path_to_folder):
    """Convert a SMILES string into a POSCAR file.

    The molecule is parsed with RDKit, explicit hydrogens are added, a 3D
    conformer is embedded and relaxed with MMFF, and the Cartesian coordinates
    are written to ``<path_to_folder>/POSCAR`` inside a fixed 100 x 100 x 100
    cubic cell.

    Atoms are written in RDKit atom order: the species line holds one symbol
    per atom and the counts line holds a matching ``1`` per atom, so the n-th
    coordinate row always belongs to the n-th symbol.

    Args:
        smiles         (str): SMILES string codifying a molecule.
        path_to_folder (str): Path to the folder where to save the generated POSCAR.

    Raises:
        ValueError: If the SMILES string cannot be parsed, yields no atoms, or
            no 3D conformer can be embedded for it.
    """

    mol = Chem.MolFromSmiles(smiles)
    if mol is None or mol.GetNumAtoms() == 0:
        # MolFromSmiles signals a parse failure by returning None
        raise ValueError(f'Could not parse a molecule from SMILES: {smiles!r}')
    mol = Chem.AddHs(mol)  # Add hydrogens for a more accurate representation

    # Generate 3D coordinates. EmbedMolecule returns -1 when it fails; retry
    # once from random starting coordinates before giving up, instead of
    # silently writing a placeholder geometry.
    if AllChem.EmbedMolecule(mol) < 0:
        if AllChem.EmbedMolecule(mol, useRandomCoords=True) < 0:
            raise ValueError(f'Could not embed a 3D conformer for SMILES: {smiles!r}')

    # Relax the embedded geometry; the return code is not checked, so the
    # embedded coordinates are kept as they are if the optimization does not run
    AllChem.MMFFOptimizeMolecule(mol)

    # Extract atomic coordinates
    coords = mol.GetConformer().GetPositions()

    # Get atomic symbols
    symbols = [atom.GetSymbol() for atom in mol.GetAtoms()]

    # Write POSCAR format
    poscar_lines = [
        "SMILES to POSCAR",
        "1.0",  # Scale factor
        "    100.0    0.000    0.000",
        "    0.000    100.0    0.000",
        "    0.000    0.000    100.0",
        " ".join(symbols),
        # One atom per species entry (NOT the length of the symbol string,
        # which would give 2 for two-letter elements such as Cl or Br)
        " ".join("1" for _ in symbols),
        "Cartesian",
    ]
    poscar_lines.extend(" ".join(str(c) for c in coord) for coord in coords)

    # Save POSCAR to file
    with open(f'{path_to_folder}/POSCAR', 'w') as f:
        f.write("\n".join(poscar_lines) + "\n")

In [None]:
# Read all molecules within the database (one SMILES string per line)
with open(data_path, 'r') as file:
    smiles_database = file.readlines()

# idx counts the molecules actually written and names their folders
idx = 0
for line in smiles_database:
    # strip() drops the line terminator (\n or \r\n) without eating a real
    # character when the last line of the file has no trailing newline
    smiles_mol = line.strip()
    if not smiles_mol:
        # Blank lines do not describe a molecule
        continue
    print(idx, smiles_mol)

    # Each molecule gets a folder named after its index in the database
    mol_folder = f'{data_folder}/{idx}'

    # Generate a subfolder, so the structure is the same as in crystals;
    # makedirs creates mol_folder as well and tolerates existing folders
    mol_subfolder = f'{mol_folder}/mol'
    os.makedirs(mol_subfolder, exist_ok=True)

    # Save SMILES as POSCAR
    smiles_to_poscar(smiles_mol, path_to_folder=mol_subfolder)

    # Save corresponding SMILES string
    with open(f'{mol_subfolder}/mol.txt', 'w') as f:
        f.write(smiles_mol)
    idx += 1

# Total number of molecules processed
print(idx)

0 CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1
1 CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1
2 CC1C2CCC(C2)C1CN(CCO)C(=O)c1ccc(Cl)cc1
3 Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO
4 Cn1cnc2c1c(=O)n(CC(O)CO)c(=O)n2C
5 CC1Oc2ccc(Cl)cc2N(CC(O)CO)C1=O
6 O=C(C1CCCCC1)N1CC(=O)N2CCCc3ccccc3C2C1
7 CCOC(=O)c1cncn1C1CCCc2ccccc21
8 COc1ccccc1OC(=O)c1ccccc1OC(C)=O
9 COc1ccccc1OC(=O)Oc1ccccc1OC
10 O=C1Nc2ccc(Cl)cc2C(c2ccccc2Cl)=NC1O
11 CN1C(=O)C(O)N=C(c2ccccc2Cl)c2cc(Cl)ccc21
12 CCC(=O)c1ccc(OCC(O)CO)c(OC)c1
13 Cc1nc2c([nH]1)c(=O)n(C)c(=O)n2CC1CC=CCC1
14 COc1cc2c(cc1O)N=CC1CCC(O)N1C2=O
15 COc1c(C)cnc(CS(=O)c2nc3ccccc3[nH]2)c1C
16 COc1cc(C)c(Cc2cnc(N)nc2N)cc1OC
17 O=C1Nc2ccc(Cl)cc2C(c2ccccc2)=NC1O
18 CC1CC(OC(=O)CN2CCCC2=O)CC(C)(C)C1
19 COc1ccc(OC)c(Cc2cnc3nc(N)nc(N)c3c2C)c1
20 O=C(C1CCCCC1)N1CC(=O)N2CCc3ccccc3C2C1
21 COC(=O)c1c[nH]c2cc(OC(C)C)c(OC(C)C)cc2c1=O
22 CCC1NC(=O)c2cc(S(N)(=O)=O)c(Cl)cc2N1
23 CN1C(=O)C(O)N=C(c2ccccc2)c2cc(Cl)ccc21
24 CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1cncn1
25 COc1cc(C(=O)N2CCOCC2)cc(OC)c1OC
26 CC

In [ ]:
# Leftover placeholder cell: only prints an empty line
print()

In [5]:
# Leftover placeholder cell: only prints an empty line
print()



In [6]:
# Leftover placeholder cell: only prints an empty line
print()

