In [1]:
import sys
import os
from glob import glob
import pandas as pd
import numpy as np


# SHAP
import shap

# RDKit
from rdkit.Chem.rdmolfiles import MolFromXYZFile, MolToXYZFile
from rdkit import Chem
from rdkit.Chem import rdDetermineBonds, MACCSkeys, AllChem

# DScribe
from dscribe.descriptors import ACSF, CoulombMatrix, MBTR, SOAP, CoulombMatrix
from dscribe.kernels import REMatchKernel
from ase.io import read
from sklearn.preprocessing import normalize


In [2]:
def xyz_to_cleanmol(path):
    '''
    Build a cleaned RDKit mol object from an xyz file

    params
    ------
    path: str
        Path to the xyz structure file

    returns
    -------
    mol: rdkit.Chem.rdchem.Mol
        Mol with connectivity determined, sanitized, stereochemistry
        assigned and kekulized

    raises
    ------
    ValueError
        If RDKit could not build a molecule from the file at `path`
    '''

    mol=MolFromXYZFile(path)
    if mol is None:
        # Fail early with the offending path instead of an opaque RDKit
        # argument error on the next call.
        raise ValueError(f'RDKit could not read an XYZ structure from {path!r}')

    # The mol read from xyz has no bonds yet; let RDKit determine connectivity
    rdDetermineBonds.DetermineConnectivity(mol)
    Chem.SanitizeMol(mol)
    Chem.AssignStereochemistry(mol,cleanIt=True)
    Chem.Kekulize(mol)

    return mol

In [3]:
# Marker text searched for in the .output files -> key under which the
# energy (last whitespace-separated token of the line) is stored.
ENERGY_MARKERS={
    '::    Total SCF':'SCF',
    '::    Total MBPT2':'MP2',
    '::    RASSCF':'CASSCF',
    '::    CASPT2':'CASPT2',
}

# data[<molecule dir>][<radius dir>] -> {'SCF':..., 'MP2':..., 'CASSCF':...,
#                                        'CASPT2':..., 'structures':{...}}
data={}
# Loop dirs and skip anything whose name contains '.ipynb'
for dirs in os.listdir():
    if not os.path.isdir(dirs) or '.ipynb' in dirs:
        continue
    data[dirs]={}
    # Loop over radii: files are expected at <dirs>/<radius>/<name>.output
    for file in glob(os.path.join(dirs,'*','*.output')):
        # normpath + os.sep keeps the split correct on any platform
        radius=os.path.normpath(file).split(os.sep)[1]
        # setdefault: a second .output in the same radius dir must not
        # wipe the energies already collected for that radius
        energies=data[dirs].setdefault(radius,{})
        # Find energies!
        with open(file,'r') as f:
            for line in f:
                # Markers are checked independently; a later matching line
                # overwrites an earlier value for the same key.
                for marker,label in ENERGY_MARKERS.items():
                    if marker in line:
                        energies[label]=float(line.split()[-1])
    # Loop over xyz
    for file in glob(os.path.join(dirs,'*','*.xyz')):
        radius=os.path.normpath(file).split(os.sep)[1]
        # setdefault: tolerate a radius dir that has an .xyz but no .output
        data[dirs].setdefault(radius,{})['structures']={'ASE':read(file),'mol':xyz_to_cleanmol(file)}

In [4]:
# Each non-empty line of diatomics.txt is split on '_' into its parts
# (e.g. 'C_N' -> ['C', 'N']).
with open("diatomics.txt") as f:
    # Strip surrounding whitespace (including '\r' from Windows line endings)
    # and skip blank lines so no empty or padded symbol ends up in the list.
    diatomics=[entry.strip().split('_') for entry in f if entry.strip()]

In [5]:
# Unique symbols appearing anywhere in the diatomics list
all_species = {symbol for pair in diatomics for symbol in pair}

In [6]:
# Fingerprint generators, reused for every structure
fpgen = AllChem.GetRDKitFPGenerator()
Morgangen = AllChem.GetMorganGenerator(radius=2)

# <fingerprint>[molecule][radius] -> fingerprint as a plain list
Morgan, RDKit, MACCS = {}, {}, {}
for name, by_radius in data.items():
    morgan_bits = Morgan[name] = {}
    rdkit_bits = RDKit[name] = {}
    maccs_bits = MACCS[name] = {}
    for radius, entry in by_radius.items():
        structure = entry['structures']['mol']

        maccs_bits[radius] = list(MACCSkeys.GenMACCSKeys(structure))
        rdkit_bits[radius] = list(fpgen.GetFingerprint(structure))
        morgan_bits[radius] = list(Morgangen.GetFingerprint(structure))

In [7]:
# Initialize dscribe feature parameters

# SOAP descriptor: species, r_cut, n_max, l_max and sigma, non-periodic
soap = SOAP(
    species=all_species,
    r_cut=3,
    n_max=4,
    l_max=3,
    sigma=1.5,
    periodic=False,
)

# REMatch kernel applied to the SOAP output later on.
# NOTE: the name `re` is the same as the stdlib regex module's name; it is
# kept here because later cells refer to it.
re = REMatchKernel(
    metric="rbf",
    gamma=2,
    alpha=1.2,
    threshold=1e-8,
    normalize_kernel=False,
)

# Coulomb matrix descriptor for systems of at most two atoms
cm = CoulombMatrix(
    n_atoms_max=2,
    permutation='eigenspectrum',
    seed=42,
)

In [19]:
# Flatten the nested data dict in a single, shared iteration order:
# ordering[i] is the (molecule, radius) label of flattened_ASE[i] / flattened_mol[i]
ordering = [
    (name, radius)
    for name, by_radius in data.items()
    for radius in by_radius
]
flattened_ASE = [
    entry['structures']['ASE']
    for by_radius in data.values()
    for entry in by_radius.values()
]
flattened_mol = [
    entry['structures']['mol']
    for by_radius in data.values()
    for entry in by_radius.values()
]

In [20]:
# RDKit fingerprints, one list per structure in flattened_mol
MACCS_mol = [list(bits) for bits in map(MACCSkeys.GenMACCSKeys, flattened_mol)]
RDKit_mol = [list(bits) for bits in map(fpgen.GetFingerprint, flattened_mol)]
Morgan_mol = [list(bits) for bits in map(Morgangen.GetFingerprint, flattened_mol)]

# DScribe descriptors, computed from the ASE structures
cm_mol = cm.create(flattened_ASE)
# Each structure's SOAP output is passed through sklearn's normalize before
# being handed to the REMatch kernel.
soap_mol = re.create(list(map(normalize, soap.create(flattened_ASE))))

In [21]:
# One row per structure: MACCS values in integer-labelled columns, followed
# by the (molecule, radius) labels taken from `ordering`.
MACCS_df = pd.DataFrame(data=MACCS_mol)
MACCS_df[['molecule', 'radius']] = ordering
# Display only: set_index returns a new frame, MACCS_df itself is unchanged.
MACCS_df.set_index(keys=['molecule', 'radius'])

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,...,157,158,159,160,161,162,163,164,165,166
molecule,radius,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
HH,1.60,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
HH,2.60,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
HH,2.00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
HH,1.80,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
HH,1.40,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CN,0.60,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
CN,1.20,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
CN,2.70,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
CN,1.70,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0


In [22]:
# One row per structure: RDKit fingerprint values in integer-labelled columns,
# followed by the (molecule, radius) labels taken from `ordering`.
RDKit_df = pd.DataFrame(data=RDKit_mol)
RDKit_df[['molecule', 'radius']] = ordering
# Display only: set_index returns a new frame, RDKit_df itself is unchanged.
RDKit_df.set_index(keys=['molecule', 'radius'])


Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
molecule,radius,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
HH,1.60,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HH,2.60,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HH,2.00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HH,1.80,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HH,1.40,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CN,0.60,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
CN,1.20,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
CN,2.70,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CN,1.70,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [23]:
# One row per structure: Morgan fingerprint values in integer-labelled columns,
# followed by the (molecule, radius) labels taken from `ordering`.
Morgan_df = pd.DataFrame(data=Morgan_mol)
Morgan_df[['molecule', 'radius']] = ordering
# Display only: set_index returns a new frame, Morgan_df itself is unchanged.
Morgan_df.set_index(keys=['molecule', 'radius'])


Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
molecule,radius,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
HH,1.60,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HH,2.60,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HH,2.00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HH,1.80,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HH,1.40,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CN,0.60,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CN,1.20,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CN,2.70,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CN,1.70,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# soap_mol is the output of the REMatch kernel (re.create) on the normalized
# SOAP output, so this frame holds kernel values rather than raw SOAP vectors.
soap_df = pd.DataFrame(data=soap_mol)
soap_df[['molecule', 'radius']] = ordering
# Display only: set_index returns a new frame, soap_df itself is unchanged.
soap_df.set_index(keys=['molecule', 'radius'])

In [18]:
# Coulomb matrix descriptor output (created with permutation='eigenspectrum',
# n_atoms_max=2), one row per structure, plus the (molecule, radius) labels.
cm_df = pd.DataFrame(data=cm_mol)
cm_df[['molecule', 'radius']] = ordering
# Display only: set_index returns a new frame, cm_df itself is unchanged.
cm_df.set_index(keys=['molecule', 'radius'])


Unnamed: 0_level_0,Unnamed: 1_level_0,0,1
molecule,radius,Unnamed: 2_level_1,Unnamed: 3_level_1
HH,1.60,1.125000,-0.125000
HH,2.60,0.884615,0.115385
HH,2.00,1.000000,0.000000
HH,1.80,1.055556,-0.055556
HH,1.40,1.214286,-0.214286
...,...,...,...
CN,0.60,115.592926,-25.376113
CN,1.20,81.067654,9.149159
CN,2.70,62.716438,27.500375
CN,1.70,71.155442,19.061370


In [25]:
# Per molecule, per radius: every stored entry except the structure objects
[
    [
        {label: value for label, value in entry.items() if label != 'structures'}
        for entry in by_radius.values()
    ]
    for by_radius in data.values()
]

[[{'SCF': -0.958931377,
   'MP2': -0.9978667965,
   'CASSCF': -1.03680157,
   'CASPT2': -1.03680157},
  {'SCF': -0.7990181138,
   'MP2': -0.9130986769,
   'CASSCF': -0.99089419,
   'CASPT2': -0.99089419},
  {'SCF': -0.8801690216,
   'MP2': -0.9433206807,
   'CASSCF': -1.00558834,
   'CASPT2': -1.00558834},
  {'SCF': -0.9170375046,
   'MP2': -0.9670728021,
   'CASSCF': -1.01802052,
   'CASPT2': -1.01802052},
  {'SCF': -1.0049768138,
   'MP2': -1.034749916,
   'CASSCF': -1.06262132,
   'CASPT2': -1.06262132},
  {'SCF': -0.8344002606,
   'MP2': -0.9209682267,
   'CASSCF': -0.99536979,
   'CASPT2': -0.99536979},
  {'SCF': -1.0756382578,
   'MP2': -1.0949492134,
   'CASSCF': -1.11012419,
   'CASPT2': -1.11012419},
  {'SCF': -1.1152074927,
   'MP2': -1.1255107467,
   'CASSCF': -1.13123494,
   'CASPT2': -1.13123494},
  {'SCF': -0.897956437,
   'MP2': -0.9542961499,
   'CASSCF': -1.01110147,
   'CASPT2': -1.01110147},
  {'SCF': -1.0288932616,
   'MP2': -1.05476641,
   'CASSCF': -1.07782647,
  