In [16]:
import numpy as np
import pandas as pd
import sklearn.ensemble
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import random
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, cross_validate
from autogluon.tabular import TabularDataset, TabularPredictor
import time
%load_ext autoreload
%autoreload 2
import toolsets.feature_engineering as fe
import toolsets.data_prep as data_prep
import toolsets.auto_rt_pred as ap

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
# Load the descriptor CSV and the combined dataset CSV from the project data folder.
descriptors = pd.read_csv("data/multiRT/MultiRT_hilic_descriptors.csv", low_memory=False)
combined_data = pd.read_csv("data/multiRT/Combined dataset.csv")
# Keep only the rows whose 'Column' value is 'HILIC' and renumber them 0..n-1.
# The filter and the index reset are chained so that a new frame is built, instead
# of calling reset_index(inplace=True) on a filtered selection of combined_data.
# NOTE(review): the fresh 0..n-1 index is presumably what lets the later
# pd.concat(..., axis=1) line these rows up with the descriptor rows -- confirm
# that both files list the compounds in the same order.
hilic = combined_data.loc[combined_data['Column'] == 'HILIC'].reset_index(drop=True)

In [18]:

# One seed value shared by every generator seeded in this cell.
SEED = 123_456
# Seed Python's built-in generator and NumPy's legacy global generator alike.
random.seed(SEED)
np.random.seed(SEED)

In [19]:
# Impute the missing values in the descriptor table with the project helper.
# The recorded output reports a "mice" running time of roughly 20 s, so this is
# presumably MICE-style imputation -- confirm in toolsets.feature_engineering.
features_imp = fe.missing_descriptors_imputation(descriptors)

the running time for mice is 20.327151203155516


In [20]:
# Put the imputed descriptor columns side by side with the HILIC rows.
# Column-wise concat aligns the two frames on their index labels, not on position.
# NOTE(review): assumes features_imp carries the same 0..n-1 index and row order
# as hilic -- TODO confirm, otherwise rows are padded with NaN instead of matched.
data = pd.concat(objs=[hilic, features_imp], axis="columns")

In [21]:
# Standardize the column names (per the original author's note; the helper's
# exact renaming rules live in toolsets.data_prep and are not visible here).
# NOTE(review): `data` is overwritten with the helper's result, so re-running
# this cell on its own feeds already-prepared data back into the helper --
# confirm the helper is idempotent, or re-run the concat cell first.
data= data_prep.dataset_prep(data)

In [22]:
# Exclude rows that look mislabeled, using a subset of the features (presumably
# the imputed descriptor columns passed as the third argument -- confirm in
# toolsets.feature_engineering).
# The second argument must name a valid categorical column; if the dataset does
# not have one, create a new categorical column first. This demo uses
# 'Organic_modifier'.
# The recorded run shows two 100-iteration k-fold passes of about 2 minutes each.
data_confirmed = fe.mislable_exclusion(data, 'Organic_modifier', features_imp.columns)

k-fold: 100%|██████████| 100/100 [01:58<00:00,  1.18s/it]
k-fold: 100%|██████████| 100/100 [01:59<00:00,  1.20s/it]


In [23]:
# Convert the listed categorical columns to dummy (indicator) columns.
# NOTE(review): the rows were filtered to Column == 'HILIC' when the data was
# loaded, so the 'Column' dummy is presumably constant -- confirm it is still
# worth encoding.
# NOTE(review): data_confirmed is overwritten here, so re-running this cell on
# its own passes already-encoded data back to the helper.
data_confirmed = fe.make_dummies(data_confirmed,['Buffer', 'Organic_modifier', 'Column'])

In [24]:
# Discard the compound name and SMILES columns, which the original author
# marks as not useful from this point on.
data_confirmed = data_confirmed.drop(columns=['Compound_name', 'SMILES'])

In [25]:
# Split the prepared data into a train set and a test set with the project helper.
# NOTE(review): no seed or split ratio is passed here; any randomness in the
# split presumably relies on the global seeds set earlier in the notebook --
# confirm in toolsets.data_prep.
train, test = data_prep.make_train_test(data_confirmed)

In [26]:
# Fit AutoGluon on the train/test split. The third argument is the name the
# model is saved under (the recorded log shows models saved to "finalized/").
# The call is the last expression of the cell, so the returned TabularPredictor
# is displayed as the cell output but is not bound to a variable.
ap.autogluon_fit_train_test(train,test, "finalized")

Beginning AutoGluon training ...
AutoGluon will save models to "finalized/"
AutoGluon Version:  0.4.1b20220423
Python Version:     3.9.12
Operating System:   Darwin
Train Data Rows:    635
Train Data Columns: 1513
Label Column: retention_time
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (11.357142857142858, 1.0871428571428572, 2.44968, 1.39653)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    4267.31 MB
	Train Data (Original)  Memory Usage: 7.21 MB (0.2% of available memory)
	Inferring data type of each feature based on column values. Set feature_metada

*** Summary of fit() ***
Estimated performance of each model:
                  model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2  -0.374683       0.141819  37.135914                0.000146           0.083173            2       True         11
1         LightGBMLarge  -0.385187       0.004336  23.562688                0.004336          23.562688            1       True         10
2               XGBoost  -0.390255       0.007525   8.780650                0.007525           8.780650            1       True          8
3       RandomForestMSE  -0.394996       0.018778   5.413740                0.018778           5.413740            1       True          5
4         ExtraTreesMSE  -0.397499       0.018449   1.799263                0.018449           1.799263            1       True          7
5              CatBoost  -0.398034       0.012736   6.302033                0.012736           6.302033 

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x28dc1dcd0>