# In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor
import time

# Record the wall-clock start so the elapsed time of the whole script can be
# printed after training and evaluation have finished.
start_time = time.time()
# Function to convert SMILES to molecular fingerprint
def smiles_to_ECFP(smiles, n_bits, radius=2):
    """Convert a SMILES string into a Morgan bit-vector fingerprint.

    Args:
        smiles: SMILES string of the molecule. ``None`` or any other
            non-string value (for example the ``NaN`` pandas produces for an
            empty spreadsheet cell) is treated as a missing component.
        n_bits: Length of the returned fingerprint vector.
        radius: Radius passed to the Morgan fingerprint; defaults to 2.

    Returns:
        A 1-D NumPy array of length ``n_bits`` holding the fingerprint bits,
        or all zeros when the component is missing.

    Raises:
        ValueError: If ``smiles`` is a string that RDKit cannot parse.
    """
    # Missing components (None / NaN) are encoded as an all-zero vector
    # rather than being handed to RDKit, which only accepts strings.
    if not isinstance(smiles, str):
        return np.zeros(n_bits)

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with a readable message instead of inside the fingerprint call.
        raise ValueError(f'Invalid SMILES string: {smiles!r}')

    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
    return np.array(fp)
        
# Get fingerprints of a reaction 
def get_fingerprint_features(df, components, n_bits=1024):
    """Build a reaction feature matrix from per-component fingerprints.

    Each column named in ``components`` is converted row by row with
    ``smiles_to_ECFP`` and the resulting per-component matrices are
    concatenated horizontally, in the order the components are given.

    Args:
        df: DataFrame with one row per reaction.
        components: Column names of ``df`` whose values are passed to
            ``smiles_to_ECFP``.
        n_bits: Fingerprint length used for every component; defaults to
            1024.

    Returns:
        A 2-D NumPy array with ``len(df)`` rows and
        ``n_bits * len(components)`` columns.
    """
    components = list(components)
    if not components:
        # np.hstack rejects an empty sequence; return a matrix with no
        # feature columns instead.
        return np.zeros((len(df), 0))

    per_component = []
    for name in components:
        column = df[name]
        fingerprints = np.array(
            [smiles_to_ECFP(smiles, n_bits) for smiles in column]
        )
        # The explicit reshape keeps the result 2-D even when the frame has
        # no rows; for non-empty input it is a no-op.
        per_component.append(fingerprints.reshape(len(column), n_bits))
    return np.hstack(per_component)

# Read the training and test sheets from the Excel workbook.
# Passing a list of sheet names makes read_excel return a dict of dataframes,
# so the workbook is opened and parsed once instead of once per sheet.
sheets = pd.read_excel(
    r'I:\acceptor_HTE.xlsx',
    sheet_name=['substrates_train', 'substrates_test'],
)
df1 = sheets['substrates_train'].drop(columns=['ligands','catalysts'])
df2 = sheets['substrates_test'].drop(columns=['ligands','catalysts'])


def _build_dataset(df, components):
    """Return the fingerprint features of ``df`` plus its 'ee' and 'catalysts_index' columns."""
    data = pd.DataFrame(get_fingerprint_features(df, components))
    data['ee'] = df['ee']
    data['catalysts_index'] = df['catalysts_index']
    return data


# Encode reactions and generate the training dataset and test dataset.
# The component columns are selected by position (columns 3 to 5 of the
# training sheet after 'ligands' and 'catalysts' have been dropped); the same
# column names are then used for the test sheet.
column_list = df1.columns.tolist()
components = column_list[3:6]
train_data = _build_dataset(df1, components)
test_data = _build_dataset(df2, components)

# Fit an AutoGluon model with 'ee' as the label and predict the test set.
predictor = TabularPredictor(label='ee').fit(train_data)
# The label column is removed before predicting; predictions go to Excel.
predictor.predict(test_data.drop(columns=['ee'])).to_excel(r'I:\acceptor_HTE_prediction.xlsx')
test_metrics = predictor.evaluate(test_data, silent=True)
train_metrics = predictor.evaluate(train_data, silent=True)
end_time = time.time()
print('The metric of training dataset:', train_metrics)
print('The metric of test dataset:', test_metrics)
print(f'{end_time-start_time}seconds')