In [2]:
from pylab import *
import pandas as pd
import re
import json
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import rdDetermineBonds

In [3]:
# Initialize variables
molecules = []
current_molecule = None

# Read the XYZ archive
with open('all_QM9.xyz', 'r') as xyz_file:
    for line in xyz_file:
        line = line.strip()
        if line.isdigit():
            # A new molecule is starting
            if current_molecule:
                molecules.append(current_molecule)
            current_molecule = {'atoms': []}
        elif line and current_molecule:
            # Parse lines with atomic information
            parts = line.split()
            atom_info = {
                'element': parts[0],
                'coordinates': [float(parts[1]),float(parts[2]),float(parts[3])],
                "GW_charged":  (parts[4]),
                "dKS_charged": (parts[5]),
                "dKS_neutral": (parts[6])
            }
            current_molecule['atoms'].append(atom_info)
# Add the last molecule
if current_molecule:
    molecules.append(current_molecule)

# Save the extracted data as a JSON file
with open('output.json', 'w') as json_file:
    json.dump(molecules, json_file, indent=2)

print("Data extracted and saved as 'output.json'")


Data extracted and saved as 'output.json'


In [4]:

 
# Opening JSON file
f = open('output.json')
 
# returns JSON object as 
# a dictionary
data = json.load(f)


In [5]:
db_smiles = []
for i in range(0,len(data)):
    data_2= data[i]["atoms"]
    # print(data_2)
    # Define the file path where you want to save the XYZ file
    xyz_file_path = 'output.xyz'
    
    with open(xyz_file_path, 'w') as xyz_file:
        # Write the number of atoms
        xyz_file.write(str(len(data_2)-1) + '\n')
        
        # Write a comment line (can be left empty)
        # xyz_file.write('This is a comment line\n')
        
        # Write atom information for each entry in the data list
        for entry in data_2:
            # print(entry)
            element = entry['element']
            coordinates = entry['coordinates']
            
            # Write the element and coordinates
            xyz_file.write(f"{element} {' '.join(map(str, coordinates))}\n")
        
    raw_mol = Chem.MolFromXYZFile('output.xyz')
    mol = Chem.Mol(raw_mol)
    
    rdDetermineBonds.DetermineBonds(mol,charge=0)
    smiles = Chem.MolToSmiles(mol, allHsExplicit=True,canonical=True)
    
    m = Chem.MolFromSmiles(smiles)
    smiles_r = Chem.MolToSmiles(m, canonical=True)
    db_smiles.append(smiles_r)


In [6]:
len(data)

2089

In [7]:
# Define a list of SMILES for labeling
db_smiles = db_smiles

# Initialize variables
molecules = []
current_molecule = None

# Read the XYZ archive
i=0
with open('all_QM9.xyz', 'r') as xyz_file:
    for line in xyz_file:
        line = line.strip()
        if line.isdigit():
            # A new molecule is starting
            i+=1
            if current_molecule:
                molecules.append(current_molecule)
            current_molecule = {'atoms': [], 'label': db_smiles[i-1]}  # Include 'label' key
        elif line and current_molecule:
            # Parse lines with atomic information
            parts = line.split()
            atom_info = {
                'element': parts[0],
                'coordinates': [float(parts[1]), float(parts[2]), float(parts[3])],
                "GW_charged": parts[4],
                "dKS_charged": parts[5],
                "dKS_neutral": parts[6]
            }
            current_molecule['atoms'].append(atom_info)

# Add the last molecule
if current_molecule:
    molecules.append(current_molecule)

# Save the extracted data as a JSON file
with open('output.json', 'w') as json_file:
    json.dump(molecules, json_file, indent=2)

print("Data extracted and saved as 'output.json'")

Data extracted and saved as 'output.json'


In [11]:
# Load the JSON data into a dictionary
with open('output.json', 'r') as json_file:
    data = json.load(json_file)

# Create a dictionary where SMILES codes are keys
smiles_data = {entry['label']: entry for entry in data}

# Input a SMILES code to look up information
input_smiles = "C[C@@H](CO)c1ccco1"  # Replace with the SMILES code you want to look up

if input_smiles in smiles_data:
    molecule_info = smiles_data[input_smiles]
    print("Molecule Information:")
    print(f"SMILES: {input_smiles}")
    print("Coordinates:")
    for atom in molecule_info['atoms']:
        print(f"{atom['element']}: {atom['coordinates']}")
    # You can access other properties like 'GW_charged', 'dKS_charged', and 'dKS_neutral' as well.
else:
    print(f"SMILES code '{input_smiles}' not found in the dataset.")

Molecule Information:
SMILES: C[C@@H](CO)c1ccco1
Coordinates:
Lattice="14.90608069: [0.0, 0.0, 0.0]
C: [7.46490198, 8.17632178, 8.8730463]
C: [7.46857499, 6.63858743, 8.84806035]
C: [6.04554372, 6.06003798, 8.86425289]
O: [5.41834643, 6.4547918, 10.08959729]
C: [8.24410827, 6.0841779, 7.69881192]
C: [9.33338503, 5.26161322, 7.60942939]
C: [9.61098423, 5.08675432, 6.21517742]
C: [8.67086013, 5.81351132, 5.54802639]
O: [7.82779982, 6.42877457, 6.43259124]
H: [8.4892235, 8.56545865, 8.93486985]
H: [7.00304507, 8.58070314, 7.96201308]
H: [6.89759286, 8.54032656, 9.73772115]
H: [7.96849225, 6.27137893, 9.75873449]
H: [5.48287931, 6.43747986, 7.99160224]
H: [6.10204527, 4.95993896, 8.78586847]
H: [4.5, 6.14898421, 10.07161899]
H: [9.875159, 4.83301175, 8.44556007]
H: [10.40608069, 4.5, 5.76898044]
H: [8.46733704, 5.99521155, 4.5]


In [34]:
molecule -> 
Fragments (MF)-> smiles_fragments 

list = [smiles_fragments,2,4,5]

smiles_data["CCO"]["atoms"][1]

List_2 = 

{'element': 'C',
 'coordinates': [5.51553057, 7.41829008, 5.69850831],
 'GW_charged': '290.64880000',
 'dKS_charged': '293.32100000',
 'dKS_neutral': '286.16000000'}

In [None]:
# smiles_data is dic where you can input the smiles, and you'll obtain all the information of the molecule. 


In [7]:
from rdkit.Chem import AllChem
fpgen = AllChem.GetMorganGenerator(radius=2)
m1 = Chem.MolFromSmiles('Cc1ccccc1')
fp1 = fpgen.GetSparseCountFingerprint(m1)
fp1

AttributeError: module 'rdkit.Chem.AllChem' has no attribute 'GetMorganGenerator'

# Mounir program

In [None]:
#Function that decomposes and returns constitutive fragments around every single atoms of mol, wizth a cutoff of radius
def getSubmolRadN(mol, radius):
    atoms=mol.GetAtoms()
    submols=[]
    for atom in atoms:
        env=Chem.FindAtomEnvironmentOfRadiusN(mol, radius, atom.GetIdx())
        amap={}
        submol=Chem.PathToSubmol(mol, env, atomMap=amap)
        subsmi=Chem.MolToSmiles(submol, rootedAtAtom=amap[atom.GetIdx()], canonical=False)
        submols.append(Chem.MolFromSmiles(subsmi, sanitize=False))
    return submols

#get the bits of the fingerprint
MFPRadius = 3
biAll = {}
fpAll = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius = MFPRadius, bitInfo = biAll)
# show 10 of the set bits:
listAll = list(fpAll.GetOnBits())
biRestricted = {}
fpRestricted = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius = MFPRadius - 1 , bitInfo = biRestricted)
listRestricted = list(fpRestricted.GetOnBits())
listReduced = np.setdiff1d(listAll, listRestricted)
print(listReduced)

#Draw all bits
tpls = [(mol,x,biAll) for x in fpAll.GetOnBits()]
Draw.DrawMorganBits(tpls[:100],molsPerRow=4,legends=[str(x) for x in fpAll.GetOnBits()][:100])
10:12
#Draw reduced bits, i.e., only the ones with have a radiuus == set Radius
#WARNING: the number of bits might be smaller than the number of atoms. This is usefull only to get the binding energy of a given bit.
tpls = [(mol,x,biAll) for x in listReduced]
Draw.DrawMorganBits(tpls[:100],molsPerRow=4,legends=[str(x) for x in listReduced][:100])