In [None]:
# Extract All Features
from MLPredictDeltaG import *
import os
import numpy as np
import pandas as pd

struc_dir = '/path/to/SplitHostGuest' # split_struc_dir in 1.5
calculation_dir = '/path/to/CalculateMoldenFile' # calculation_dir in 1.5

# A prefix is the first two underscore-separated fields of an entry name in
# struc_dir, re-joined with '_'. Entries with fewer than two fields are skipped
# instead of raising IndexError (stray files without an underscore). The set
# removes duplicates (several entries can share one prefix) and sorted() gives
# a deterministic processing order.
prefixes = sorted({
    '_'.join(fields[:2])
    for fields in (entry.split('_') for entry in os.listdir(struc_dir))
    if len(fields) >= 2
})

In [None]:
# Label table read from the 'Sheet2' worksheet of the workbook. The feature
# extraction loop below looks rows up by the 'ID' column and reads the 'deltaG'
# and 'log Ka' columns from it.
info_excel = '/path/to/host_guest_features_SI.xlsx'
info_df = pd.read_excel(io=info_excel, sheet_name='Sheet2')

In [None]:
from sugar.molecule import HostMolecule
from rdkit import Chem, rdBase
from tqdm import tqdm

# Turn off RDKit's warning and error log channels so per-molecule messages
# do not flood the output of the extraction loop.
for _rdkit_channel in ('rdApp.warning', 'rdApp.error'):
    rdBase.DisableLog(_rdkit_channel)

# Column-oriented accumulator: one list per output column. Every list must keep
# the same length so that pd.DataFrame(feature_dict) can be built afterwards.
feature_dict = {'ID': [], 'GuestLogP': [], 'GuestTPSA': [],
                'PoreDiameter': [], 'PoreVolume': [], 'GuestCavityRatio': [],
                'HostMPI': [], 'GuestMPI': [], 'HostPositiveSA': [], 'HostNegativeSA': [], 'GuestPositiveSA': [],
                'GuestNegativeSA': [], 'HostPolarSA': [], 'HostNonpolarSA': [], 'GuestPolarSA': [],
                'GuestNonpolarSA': [], 'InteractionFP': [], 'ESPFitness': [], 'HostMACCSKey': [],
                'GuestMACCSKey': [], 'deltaG': [], 'logKa': []}

# For every prefix, compute all features for the host/guest pair. A prefix whose
# processing fails at any step is reported and skipped as a whole; its row is only
# appended once every value has been computed, so the columns never get out of step.
for prefix in tqdm(prefixes):
    try:
        # Labels: first row of info_df whose 'ID' equals this prefix.
        label_rows = info_df.loc[info_df['ID'] == prefix]
        if label_rows.empty:
            raise LookupError('no row with a matching ID in info_df')
        deltaG = label_rows['deltaG'].values[0]
        logka = label_rows['log Ka'].values[0]

        host_mol_file = os.path.join(struc_dir, f'{prefix}_host.mol')
        guest_mol_file = os.path.join(struc_dir, f'{prefix}_guest.mol')
        host_calc_dir = os.path.join(calculation_dir, f'{prefix}_host')
        guest_calc_dir = os.path.join(calculation_dir, f'{prefix}_guest')

        # RDKit Features (explicit hydrogens are kept when reading the .mol files)
        host_rd_mol = Chem.MolFromMolFile(host_mol_file, removeHs=False)
        guest_rd_mol = Chem.MolFromMolFile(guest_mol_file, removeHs=False)
        # MolFromMolFile returns None when it cannot parse a file; fail here with a
        # readable message instead of deep inside a descriptor helper.
        if host_rd_mol is None or guest_rd_mol is None:
            raise ValueError('RDKit could not parse the host or guest .mol file')
        guest_logp, guest_tpsa = get_logp(guest_rd_mol), get_tpsa(guest_rd_mol)
        host_maccskey, guest_maccskey = get_maccs_fp(host_rd_mol), get_maccs_fp(guest_rd_mol)

        # Pore Info (the third value returned by get_pore_info is not used)
        pore_diameter, pore_volume, _ = get_pore_info(os.path.join(host_calc_dir, f'{prefix}_host.xyz'))

        # Wavefunction Info: surface descriptors read from each molecule's fch2esp.log
        host_surf_info = get_surface_info(os.path.join(host_calc_dir, 'fch2esp.log'), decimals=6)
        guest_surf_info = get_surface_info(os.path.join(guest_calc_dir, 'fch2esp.log'), decimals=6)
        guest_cavity_ratio = guest_surf_info['volume'] / pore_volume

        # Interaction FingerPrint
        interaction_fp = get_interaction_fp_plf(host_mol_file, guest_mol_file)

        # ESP Fitness: dot product of the last columns of the two esp_aligned.txt files
        # NOTE(review): presumably both files list the same aligned points in the same
        # order, otherwise np.dot raises on mismatched lengths — confirm upstream.
        esp1 = np.loadtxt(os.path.join(host_calc_dir, 'esp_aligned.txt'), usecols=-1)
        esp2 = np.loadtxt(os.path.join(guest_calc_dir, 'esp_aligned.txt'), usecols=-1)
        esp_fitness_value = np.dot(esp1, esp2)

        # Assemble the complete row first; nothing is written to feature_dict yet.
        row = {
            'ID': prefix,
            'GuestLogP': guest_logp,
            'GuestTPSA': guest_tpsa,
            'PoreDiameter': pore_diameter,
            'PoreVolume': pore_volume,
            'GuestCavityRatio': guest_cavity_ratio,
            'HostMPI': host_surf_info['molecular_polarity_index'],
            'GuestMPI': guest_surf_info['molecular_polarity_index'],
            'HostPositiveSA': host_surf_info['positive_sa'],
            'HostNegativeSA': host_surf_info['negative_sa'],
            'GuestPositiveSA': guest_surf_info['positive_sa'],
            'GuestNegativeSA': guest_surf_info['negative_sa'],
            'HostPolarSA': host_surf_info['polar_sa'],
            'HostNonpolarSA': host_surf_info['nonpolar_sa'],
            'GuestPolarSA': guest_surf_info['polar_sa'],
            'GuestNonpolarSA': guest_surf_info['nonpolar_sa'],
            'InteractionFP': list(interaction_fp),
            'ESPFitness': esp_fitness_value,
            'HostMACCSKey': host_maccskey,
            'GuestMACCSKey': guest_maccskey,
            'deltaG': deltaG,
            'logKa': logka,
        }
    except Exception as e:
        # Best-effort extraction: report the failing prefix and move on to the next one.
        print(f'{prefix} has error: {e}')
    else:
        # Add the finished row to the overall feature table, one value per column.
        for column, value in row.items():
            feature_dict[column].append(value)

In [None]:
# Turn the per-column lists into a table and write it to an Excel workbook
# without the index column.
# NOTE(review): the placeholder path has no file extension; pandas normally picks
# the Excel engine from the extension, so the real path presumably needs to end
# in '.xlsx' — confirm when filling in the path.
df = pd.DataFrame(data=feature_dict)
save_file = os.path.join('/path/to/save/excel')
df.to_excel(excel_writer=save_file, index=False)