In [None]:
# import data
import pandas as pd
from rdkit import Chem
from mordred import Calculator, descriptors
# Read the three Fiehn retention-time tables from the working directory.
data_pfp, data_csh, data_cshmini = (
    pd.read_csv(csv_name)
    for csv_name in ('Fiehn_PFP.csv', 'Fiehn_CSH.csv', 'Fiehn_CSH_Mini.csv')
)
# By default Jupyter renders only the last expression of a cell, so the
# PFP preview below is evaluated but not shown; the CSH preview is.
data_pfp.head()
data_csh.head()

In [None]:
# Identify isomers
# A SMILES string that occurs on more than one row of the PFP table is treated
# as an isomer candidate. For each such SMILES we report the mean, the sample
# standard deviation and the max-min spread of its retention times.
data_pfp_no_split_index = data_pfp.drop(columns=['split_index'])

# One grouped pass instead of re-filtering the whole frame per unique SMILES.
# sort=False keeps the groups in order of first appearance, which matches the
# order produced by iterating over `unique()`.
rt_by_smiles = data_pfp_no_split_index.groupby('smiles', sort=False)['retention_time']
rt_stats = rt_by_smiles.agg(
    n_rows='size',  # row count per SMILES, including rows with a missing retention time
    retention_time_mean='mean',
    retention_time_std='std',
    rt_max='max',
    rt_min='min',
)

# Keep only SMILES seen more than once. `smiles` is now a plain string column
# (previously each cell held a one-element array), so it can be merged on,
# filtered and written to CSV like any other key column.
du_data = (
    rt_stats.loc[rt_stats['n_rows'] > 1]
    .assign(retention_time_difference=lambda stats: stats['rt_max'] - stats['rt_min'])
    .reset_index()
    [['smiles', 'retention_time_mean', 'retention_time_std', 'retention_time_difference']]
)
du_data

In [None]:
# Create descriptors of the CSH data
# Every 2D mordred descriptor is computed for each CSH molecule.
calc = Calculator(descriptors, ignore_3D=True)
mols = [Chem.MolFromSmiles(smi) for smi in data_csh['smiles']]
df = calc.pandas(mols)

# calc.pandas() returns descriptor columns only, one row per molecule in input
# order. The cells below select rows via df['split_index'] and train on
# df['retention_time'], so both columns are copied over here. The copy is
# positional (to_numpy) so it does not depend on the two frames sharing an index.
# NOTE(review): assumes Fiehn_CSH.csv has the same 'retention_time' and
# 'split_index' columns as the PFP table - confirm against the file.
df['retention_time'] = data_csh['retention_time'].to_numpy()
df['split_index'] = data_csh['split_index'].to_numpy()
df.head()

In [None]:
# Split training dataset and test dataset
# Rows with split_index == 1 form the training set and rows with
# split_index == 2 form the test set. The split_index column is bookkeeping
# only, so it is dropped from both splits to keep it out of the model's
# feature matrix.
is_train_row = df['split_index'] == 1
is_test_row = df['split_index'] == 2
df_train = df.loc[is_train_row].drop(columns=['split_index'])
df_test = df.loc[is_test_row].drop(columns=['split_index'])

# Only the last expression of a cell is rendered by default, so the split
# sizes are printed explicitly instead of relying on a second head() call.
print(f"train rows: {len(df_train)}, test rows: {len(df_test)}")
df_test.head()

In [None]:
# Summary of the retention time variable
from autogluon.tabular import TabularDataset, TabularPredictor
label = 'retention_time'
# Descriptive statistics of the regression target on the training split.
rt_summary = df_train[label].describe()
print("Summary of rt variable: \n", rt_summary)

In [None]:
# Build predict_model
# Fitted models are written to the directory named by save_path.
save_path = 'rt_predict_models_csh'
predictor = TabularPredictor(label=label, path=save_path)
# fit() returns the predictor itself; re-binding keeps the cell free of output.
predictor = predictor.fit(train_data=df_train)

In [None]:
# prepare test set
# Separate the features from the ground-truth label so the model never sees
# the label at prediction time.
test_data_nolab = df_test.drop(labels=[label], axis='columns')
y_test = df_test[label]
test_data_nolab.head()

In [None]:
# performance of predict_model on test set
# Reload the predictor from disk so this cell can run without re-training.
predictor = TabularPredictor.load(save_path)
y_pred = predictor.predict(test_data_nolab)
print("Predictions:  \n", y_pred)
# Score the predictions against the held-out labels, including auxiliary metrics.
perf = predictor.evaluate_predictions(
    y_true=y_test,
    y_pred=y_pred,
    auxiliary_metrics=True,
)

In [None]:
# Performance of different models
# Per-model leaderboard scored on the test split; silent=True suppresses the
# printed copy so only the returned table is rendered.
predictor.leaderboard(data=df_test, silent=True)

In [None]:
# feature importance
# Importance scores are computed on the test split and saved to CSV.
feature_importance = predictor.feature_importance(data=df_test)
feature_importance.to_csv('feature_importance_csh.csv')