# Importing libraries

In [1]:
import sys
from pathlib import Path
import glob
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse, r2_score as r2
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import RandomForestRegressor
import pickle
from utils import fragment_control, bounding_box, mkdir

# Reading descriptors

In [45]:
# Load every descriptor table found in ../data into a dict keyed by the
# file name without its extension (e.g. 'foo.csv' -> 'foo').
desc_dict = {
    Path(csv_path).stem: pd.read_csv(csv_path)
    for csv_path in glob.glob('../data/*.csv')
}

# Models for the computed rate of reaction (R<sub>QSPR</sub>)

## Reading hyperparameters of the best models

In [47]:
# Column holding the modelled endpoint; it is also the prefix of the
# '<target>_param_<METHOD>' hyperparameter columns read further below.
target_col = 'CompRate_ylog'

# Tables of the selected models per method; each is expected to provide a
# 'desc_name' column naming the descriptor set.
descs_xgb = pd.read_csv('../results/best_hyperparameters/Comprate_best_models_XGB.csv')
descs_rf = pd.read_csv('../results/best_hyperparameters/Comprate_best_models_RF.csv')

In [48]:
# Identifier and target columns are taken from the descriptor table of the
# first RF model.
# NOTE(review): this assumes every descriptor table lists the same compounds
# in the same row order - confirm.
reference_desc = descs_rf.loc[0, 'desc_name']
model_res = desc_dict[reference_desc].loc[:, ['cid', 'std_smiles', 'CompRate_ylog', 'dG_MD_ylog']]

# Add one empty prediction column per (descriptor set, method) pair:
# all XGB columns first, then all RF columns.
for method_suffix, desc_names in (('_XGB', descs_xgb['desc_name']), ('_RF', descs_rf['desc_name'])):
    for desc_label in desc_names:
        model_res[desc_label + method_suffix] = None

## Evaluation of the models in 5-times repeated 5-fold cross-validation (CV)

In [49]:
# Repeated k-fold cross-validation of every selected XGB and RF model.
#
# For each split the models are fitted on the training rows and evaluated on
# the held-out rows. The held-out predictions, the applicability-domain (AD)
# flags and the fold id are collected in `folds_stats` (one frame per fold).

folds_stats = []  # list for saving the stats from CV folds

nspl = 5  # number of folds for CV procedure

nrpts = 5  # number of repeats of CV procedure

kf = RepeatedKFold(n_splits=nspl, n_repeats=nrpts, random_state=0)

# Descriptor sets handled in the XGB loop already get their AD flags there;
# the RF loop only computes AD flags for sets that are not in this collection.
xgb_desc_names = set(descs_xgb['desc_name'].dropna())

for fold_id, (train_index, test_index) in enumerate(kf.split(model_res)):

    y_train = model_res.loc[train_index, target_col]
    y_test = model_res.loc[test_index, target_col]
    df_temp = model_res.copy(deep=True)

    # Tag the held-out rows once per fold. Doing it here (instead of inside the
    # XGB loop) keeps the 'CV_fold' column available even when there are no XGB
    # models to evaluate.
    df_temp.loc[test_index, 'CV_fold'] = fold_id

    for desc_name in descs_xgb['desc_name']:

        # NOTE(review): presumably the first four columns are the identifier /
        # target columns and the rest are descriptors - confirm.
        X = desc_dict[desc_name].iloc[:, 4:]
        X_train, X_test = X.loc[train_index], X.loc[test_index]

        # NOTE(review): eval() executes whatever expression is stored in the
        # CSV; only use hyperparameter files from a trusted source.
        xgb_params = eval(descs_xgb.loc[descs_xgb['desc_name']==desc_name, f'{target_col}_param_XGB'].iloc[0])
        xgb_params.update({'n_jobs': -1})
        xgb_model = XGBRegressor(**xgb_params)
        xgb_model.fit(X_train, y_train)

        # Predict once and reuse the result for both storage and scoring.
        test_pred = xgb_model.predict(X_test)
        df_temp.loc[test_index, f'{desc_name}_XGB'] = test_pred
        # Rows returned by bounding_box are flagged with 1; downstream cells
        # count a flag of 0 as "inside the AD".
        df_temp.loc[bounding_box(X_train, X_test), f'{desc_name}_AD'] = 1

        q2_proper = r2(y_test, test_pred)

        print(f'Method: XGB; Q2: {q2_proper}')

    for desc_name in descs_rf['desc_name']:

        X = desc_dict[desc_name].iloc[:, 4:]
        X_train, X_test = X.loc[train_index], X.loc[test_index]

        # NOTE(review): see the eval() remark in the XGB loop.
        rf_params = eval(descs_rf.loc[descs_rf['desc_name']==desc_name, f'{target_col}_param_RF'].iloc[0])
        rf_params.update({'n_jobs': -1})
        rf_model = RandomForestRegressor(**rf_params)
        rf_model.fit(X_train, y_train)

        test_pred = rf_model.predict(X_test)
        df_temp.loc[test_index, f'{desc_name}_RF'] = test_pred

        q2_proper = r2(y_test, test_pred)

        print(f'Method: RF; Q2: {q2_proper}')

        # AD flags depend only on the descriptor set, so skip sets that were
        # already processed in the XGB loop.
        if desc_name not in xgb_desc_names:
            df_temp.loc[bounding_box(X_train, X_test), f'{desc_name}_AD'] = 1

    # Keep only the held-out rows of this fold.
    folds_stats.append(df_temp.loc[~df_temp['CV_fold'].isna()])
    print(fold_id + 1)  # progress: number of folds finished so far

Method: XGB; Q2: 0.7684492705519271
Method: XGB; Q2: 0.7134233832856183
Method: XGB; Q2: 0.7465933923652661
Method: XGB; Q2: 0.7380599223751063
Method: XGB; Q2: 0.7353462874594761
Method: XGB; Q2: 0.7460653548614854
Method: XGB; Q2: 0.7458821886554599
Method: XGB; Q2: 0.755824045942479
Method: XGB; Q2: 0.7555180850918211
Method: XGB; Q2: 0.7244317213218986
Method: XGB; Q2: 0.7371188916778156
Method: XGB; Q2: 0.7682775499047569
Method: XGB; Q2: 0.7465933923652661
Method: XGB; Q2: 0.7380599223751063
Method: XGB; Q2: 0.7458821886554599
Method: XGB; Q2: 0.755824045942479
Method: XGB; Q2: 0.7244317213218986
Method: XGB; Q2: 0.7682775499047569
Method: RF; Q2: 0.7251103559414466
Method: RF; Q2: 0.7490344715515559
Method: RF; Q2: 0.7459310543977968
Method: RF; Q2: 0.7311034011385729
Method: RF; Q2: 0.7351675835126779
Method: RF; Q2: 0.7368445503838703
Method: RF; Q2: 0.7221354281770525
Method: RF; Q2: 0.7412376593793054
Method: RF; Q2: 0.6988062466032786
Method: RF; Q2: 0.7396424246820329
Meth

Method: XGB; Q2: 0.26026211532693855
Method: XGB; Q2: 0.34293479818655304
Method: XGB; Q2: 0.2681966695710112
Method: RF; Q2: 0.39671882269726044
Method: RF; Q2: 0.3749284661893192
Method: RF; Q2: 0.35532381495144494
Method: RF; Q2: 0.4026008199944058
Method: RF; Q2: 0.4334492341690216
Method: RF; Q2: 0.4292179030751093
Method: RF; Q2: 0.41760523116265935
Method: RF; Q2: 0.4016930568301773
Method: RF; Q2: 0.3829971608004773
Method: RF; Q2: 0.4146596685026046
Method: RF; Q2: 0.4001162342884571
Method: RF; Q2: 0.42866487826530897
Method: RF; Q2: 0.40733831468396964
Method: RF; Q2: 0.4075902256690066
Method: RF; Q2: 0.4342440563722614
Method: RF; Q2: 0.41685771513788794
Method: RF; Q2: 0.39675248550933817
Method: RF; Q2: 0.432122351038586
7
Method: XGB; Q2: 0.6369425107744144
Method: XGB; Q2: 0.6379714194461281
Method: XGB; Q2: 0.659019397373213
Method: XGB; Q2: 0.6900929165542642
Method: XGB; Q2: 0.6820564423200901
Method: XGB; Q2: 0.7131111631494955
Method: XGB; Q2: 0.6583553070141672
M

Method: RF; Q2: 0.6737652168446806
Method: RF; Q2: 0.6665901483394848
Method: RF; Q2: 0.6630033535769655
Method: RF; Q2: 0.6482490478347052
Method: RF; Q2: 0.6286926514688279
Method: RF; Q2: 0.6719145261119334
13
Method: XGB; Q2: 0.3970138480705626
Method: XGB; Q2: 0.38577180168839176
Method: XGB; Q2: 0.4028533565939627
Method: XGB; Q2: 0.4206674220479657
Method: XGB; Q2: 0.4710964071748266
Method: XGB; Q2: 0.43844071515502003
Method: XGB; Q2: 0.43035307192217087
Method: XGB; Q2: 0.4399102787913737
Method: XGB; Q2: 0.4615609258307276
Method: XGB; Q2: 0.45332618300534
Method: XGB; Q2: 0.40742288177888464
Method: XGB; Q2: 0.41920626869495003
Method: XGB; Q2: 0.4028533565939627
Method: XGB; Q2: 0.4206674220479657
Method: XGB; Q2: 0.43035307192217087
Method: XGB; Q2: 0.4399102787913737
Method: XGB; Q2: 0.45332618300534
Method: XGB; Q2: 0.41920626869495003
Method: RF; Q2: 0.6169779965313882
Method: RF; Q2: 0.546153096759936
Method: RF; Q2: 0.5577911347250404
Method: RF; Q2: 0.60025668902525

Method: XGB; Q2: 0.7298701912495522
Method: XGB; Q2: 0.70083372517173
Method: XGB; Q2: 0.7208539013196877
Method: XGB; Q2: 0.7106704476046175
Method: XGB; Q2: 0.7259561389142068
Method: XGB; Q2: 0.7346810666386481
Method: XGB; Q2: 0.693697322901455
Method: XGB; Q2: 0.7298701912495522
Method: XGB; Q2: 0.7208539013196877
Method: RF; Q2: 0.7197780797742797
Method: RF; Q2: 0.6603116950483354
Method: RF; Q2: 0.6724482004541454
Method: RF; Q2: 0.7361354278794511
Method: RF; Q2: 0.7244667272146952
Method: RF; Q2: 0.7322091089823628
Method: RF; Q2: 0.7328840366302932
Method: RF; Q2: 0.6992632806613621
Method: RF; Q2: 0.7379287586393812
Method: RF; Q2: 0.7190321201918051
Method: RF; Q2: 0.7227237270814723
Method: RF; Q2: 0.7306197279161941
Method: RF; Q2: 0.7162059812772384
Method: RF; Q2: 0.7343148960050847
Method: RF; Q2: 0.7239998905613938
Method: RF; Q2: 0.7107442480661774
Method: RF; Q2: 0.7219212122185856
Method: RF; Q2: 0.7195569287067187
20
Method: XGB; Q2: 0.5677887920531561
Method: XG

In [50]:
# Stack the per-fold frames row-wise into one table of held-out predictions.
all_stats = pd.concat(folds_stats, axis=0)

# Columns holding model predictions are recognised by their method tag.
pred_columns = [
    name for name in all_stats.columns
    if any(tag in name for tag in ('_RF', '_XGB'))
]

### Consideration of the applicability domain (AD)

In [51]:
# Consensus applicability domain (AD) over all descriptor sets.
#
# AD flag columns are created in the CV loop as '<descriptor set>_AD'.
# Matching on the suffix (instead of the substring 'AD') avoids picking up
# unrelated columns; 'avg_pred_AD' is an aggregated prediction added later in
# the notebook and must not be treated as a flag when this cell is re-run.
ad_columns = [col for col in all_stats.columns if col.endswith('_AD') and col != 'avg_pred_AD']

# Rows that were never flagged hold NaN; treat them as 0.
all_stats.loc[:, ad_columns] = all_stats.loc[:, ad_columns].fillna(0)

# Minimum number of descriptor sets with flag 0 for the compound to count as
# inside the consensus AD.
min_sets_in_ad = 3

# Number of descriptor sets whose AD flag is 0 for each row.
sets_in_ad = all_stats.loc[:, ad_columns].eq(0).sum(axis=1)

# Computed directly as integers: no None placeholders, hence no object-dtype
# column and no reliance on fillna() downcasting.
all_stats['applicability_domain'] = (sets_in_ad >= min_sets_in_ad).astype(int)  # 1 = inside AD, 0 = outside
all_stats['applicability_domain_conf'] = sets_in_ad  # for how many descriptor sets the flag is 0

In [52]:
# Display the stacked held-out predictions together with the per-descriptor-set
# AD flags and the two consensus AD columns.
all_stats

Unnamed: 0,cid,std_smiles,CompRate_ylog,dG_MD_ylog,IA26AP+OPERA_XGB,IIAB13+OPERA_XGB,IIRA13+OPERA_XGB,IIRA14+OPERA_XGB,IIRA14P+OPERA_XGB,IIRA15P+OPERA_XGB,...,IIRAB26+OPERA_AD,IA36AP+OPERA_AD,IAB26AP+OPERA_AD,IIRA15+OPERA_AD,IIRA16+OPERA_AD,IIRA17+OPERA_AD,IIRAB15+OPERA_AD,IIRAB17+OPERA_AD,applicability_domain,applicability_domain_conf
2,100-37-8,CCN(CC)CCO,-0.684356,-1.806987,-0.536577,-0.552883,-0.523912,-0.538123,-0.543032,-0.541948,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,25
7,7005-47-2,CN(C)C(C)(C)CO,-0.396975,-1.779119,-0.546219,-0.467214,-0.512419,-0.452513,-0.529866,-0.50461,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0,0
8,19059-68-8,CN(C)CC(C)(C)CO,-0.356186,-1.761684,-0.49872,-0.551612,-0.516388,-0.52264,-0.525994,-0.585314,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,25
10,139-87-7,CCN(CCO)CCO,-0.176428,-1.720622,-0.230906,-0.244749,-0.21171,-0.256978,-0.232602,-0.238692,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,25
16,2955-88-6,OCCN1CCCC1,-0.679613,-1.801336,-0.591066,-0.58478,-0.537137,-0.583798,-0.558821,-0.614406,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,59941-12-7,CCCCCC1CCCN1CCO,-0.382947,-1.760751,-0.232163,-0.281058,-0.29106,-0.22265,-0.24732,-0.235897,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1,16
117,64897-89-8,CC(C)CCN(CCO)CCO,-0.366887,-1.760357,-0.345632,-0.348134,-0.334635,-0.37313,-0.385786,-0.331456,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,25
118,40694-17-5,CN(CCO)CCCCCO,-0.379963,-1.769528,-0.600749,-0.603915,-0.635035,-0.542838,-0.57967,-0.603301,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1,16
119,102-81-8,CCCCN(CCO)CCCC,-0.549957,-1.799773,-0.480979,-0.476299,-0.477567,-0.480391,-0.517371,-0.42841,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0,0


In [53]:
# How many compounds are outside AD in CV procedure?
# Each compound occurs once per CV repeat, so the mean of the 0/1 consensus
# flag is the share of repeats in which the compound was inside the AD.
ad_share_per_compound = all_stats.groupby('cid')['applicability_domain'].mean()
ad_share_per_compound.value_counts()

1.0    86
0.0    28
0.6     6
0.8     3
0.4     1
Name: applicability_domain, dtype: int64

In [54]:
# Consensus prediction that takes the applicability domain (AD) into account.
#
# Work on a copy so that the raw per-model predictions in all_stats are kept.
all_stats_temp = all_stats.copy(deep=True)
for ad_col in ad_columns:
    # Prediction columns that belong to the same descriptor set. They are
    # matched by exact name: a substring test could also hit (and blank) the
    # columns of another descriptor set whose name merely contains this one.
    model_cols = {ad_col.replace('_AD', '_RF'), ad_col.replace('_AD', '_XGB')}
    targets = [name for name in all_stats_temp.columns if name in model_cols]
    if targets:
        # A flag of 1 marks the row for this descriptor set; blank its
        # predictions so that they are ignored by the median below.
        all_stats_temp.loc[all_stats_temp[ad_col]==1, targets] = None

# Median over the predictions that were not blanked.
all_stats_temp['avg_pred'] = all_stats_temp.loc[:, pred_columns].median(axis=1)
all_stats.loc[:, 'avg_pred'] = all_stats_temp['avg_pred']
all_stats.loc[:, 'avg_pred_AD'] = all_stats_temp['avg_pred']

# Rows where every prediction was blanked have no AD-aware median; for those,
# 'avg_pred' falls back to the median over all models, while 'avg_pred_AD'
# stays NaN.
no_ad_pred = all_stats['avg_pred_AD'].isna()
all_stats.loc[no_ad_pred, 'avg_pred'] = all_stats.loc[no_ad_pred, pred_columns].median(axis=1)

In [55]:
# Display the table again, now with the consensus prediction columns
# 'avg_pred' and 'avg_pred_AD'.
all_stats

Unnamed: 0,cid,std_smiles,CompRate_ylog,dG_MD_ylog,IA26AP+OPERA_XGB,IIAB13+OPERA_XGB,IIRA13+OPERA_XGB,IIRA14+OPERA_XGB,IIRA14P+OPERA_XGB,IIRA15P+OPERA_XGB,...,IAB26AP+OPERA_AD,IIRA15+OPERA_AD,IIRA16+OPERA_AD,IIRA17+OPERA_AD,IIRAB15+OPERA_AD,IIRAB17+OPERA_AD,applicability_domain,applicability_domain_conf,avg_pred,avg_pred_AD
2,100-37-8,CCN(CC)CCO,-0.684356,-1.806987,-0.536577,-0.552883,-0.523912,-0.538123,-0.543032,-0.541948,...,0.0,0.0,0.0,0.0,0.0,0.0,1,25,-0.548086,-0.548086
7,7005-47-2,CN(C)C(C)(C)CO,-0.396975,-1.779119,-0.546219,-0.467214,-0.512419,-0.452513,-0.529866,-0.50461,...,1.0,1.0,1.0,1.0,1.0,1.0,0,0,-0.526532,
8,19059-68-8,CN(C)CC(C)(C)CO,-0.356186,-1.761684,-0.49872,-0.551612,-0.516388,-0.52264,-0.525994,-0.585314,...,0.0,0.0,0.0,0.0,0.0,0.0,1,25,-0.524433,-0.524433
10,139-87-7,CCN(CCO)CCO,-0.176428,-1.720622,-0.230906,-0.244749,-0.21171,-0.256978,-0.232602,-0.238692,...,0.0,0.0,0.0,0.0,0.0,0.0,1,25,-0.245284,-0.245284
16,2955-88-6,OCCN1CCCC1,-0.679613,-1.801336,-0.591066,-0.58478,-0.537137,-0.583798,-0.558821,-0.614406,...,0.0,0.0,0.0,0.0,0.0,0.0,1,25,-0.574516,-0.574516
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,59941-12-7,CCCCCC1CCCN1CCO,-0.382947,-1.760751,-0.232163,-0.281058,-0.29106,-0.22265,-0.24732,-0.235897,...,0.0,1.0,1.0,1.0,1.0,1.0,1,16,-0.274997,-0.274997
117,64897-89-8,CC(C)CCN(CCO)CCO,-0.366887,-1.760357,-0.345632,-0.348134,-0.334635,-0.37313,-0.385786,-0.331456,...,0.0,0.0,0.0,0.0,0.0,0.0,1,25,-0.334635,-0.334635
118,40694-17-5,CN(CCO)CCCCCO,-0.379963,-1.769528,-0.600749,-0.603915,-0.635035,-0.542838,-0.57967,-0.603301,...,0.0,1.0,1.0,1.0,1.0,1.0,1,16,-0.593111,-0.593111
119,102-81-8,CCCCN(CCO)CCCC,-0.549957,-1.799773,-0.480979,-0.476299,-0.477567,-0.480391,-0.517371,-0.42841,...,1.0,1.0,1.0,1.0,1.0,1.0,0,0,-0.526468,


### Showing stats obtained in 5x5 CV

In [58]:
# Consensus ('avg_pred') statistics, computed separately for every CV repeat.
q2_cv, mae_cv, rmse_cv = [], [], []
all_stats_res = []  # per-repeat consensus predictions, used later for the outlier analysis

for first_fold in range(0, nspl*nrpts, nspl):
    # Rows held out in the nspl consecutive folds that make up this repeat.
    in_repeat = all_stats['CV_fold'].isin(list(range(first_fold, first_fold + nspl)))

    # 'cid' is kept only for the first repeat; the frames are later joined side by side.
    kept_columns = ['cid', 'avg_pred'] if first_fold == 0 else ['avg_pred']
    all_stats_res.append(all_stats.loc[in_repeat, kept_columns])

    observed = all_stats.loc[in_repeat, [f'{target_col}']]
    predicted = all_stats.loc[in_repeat, ['avg_pred']]

    q2_cv.append(r2(observed, predicted))
    rmse_cv.append(mse(observed, predicted)**0.5)
    mae_cv.append(mae(observed, predicted))

In [57]:
# Statistics of every individual model, computed separately for each CV repeat.
#
# The accumulators use cell-local names on purpose: the consensus results
# (q2_cv, mae_cv, rmse_cv, all_stats_res) are read by later cells and must not
# be overwritten by the statistics of the last model in pred_columns.

# Row masks selecting the held-out rows of each repeat (nspl consecutive folds);
# they do not depend on the model, so they are built once.
repeat_masks = [
    all_stats['CV_fold'].isin(list(range(first_fold, first_fold + nspl)))
    for first_fold in range(0, nspl*nrpts, nspl)
]

for col in pred_columns:
    q2_col, mae_col, rmse_col = [], [], []
    for in_repeat in repeat_masks:
        observed = all_stats.loc[in_repeat, [f'{target_col}']]
        predicted = all_stats.loc[in_repeat, [col]]
        q2_col.append(r2(observed, predicted))
        rmse_col.append(mse(observed, predicted)**0.5)
        mae_col.append(mae(observed, predicted))
    print(f'Gas: {target_col}; {col} Q2(5x5CV): {round(np.mean(q2_col), 2)}; Q2_std(5x5CV): {round(np.std(q2_col), 2)}')
    print(f'Gas: {target_col}; {col} RMSE(5x5CV): {round(np.mean(rmse_col), 2)}; RMSE_std(5x5CV): {round(np.std(rmse_col), 3)}')
    print(f'Gas: {target_col}; {col} MAE(5x5CV): {round(np.mean(mae_col), 2)}; MAE_std(5x5CV): {round(np.std(mae_col), 3)}')

Gas: CompRate_ylog; IA26AP+OPERA_XGB Q2(5x5CV): 0.62; Q2_std(5x5CV): 0.02
Gas: CompRate_ylog; IA26AP+OPERA_XGB RMSE(5x5CV): 0.15; RMSE_std(5x5CV): 0.003
Gas: CompRate_ylog; IA26AP+OPERA_XGB MAE(5x5CV): 0.12; MAE_std(5x5CV): 0.002
Gas: CompRate_ylog; IIAB13+OPERA_XGB Q2(5x5CV): 0.62; Q2_std(5x5CV): 0.01
Gas: CompRate_ylog; IIAB13+OPERA_XGB RMSE(5x5CV): 0.16; RMSE_std(5x5CV): 0.003
Gas: CompRate_ylog; IIAB13+OPERA_XGB MAE(5x5CV): 0.13; MAE_std(5x5CV): 0.002
Gas: CompRate_ylog; IIRA13+OPERA_XGB Q2(5x5CV): 0.63; Q2_std(5x5CV): 0.01
Gas: CompRate_ylog; IIRA13+OPERA_XGB RMSE(5x5CV): 0.15; RMSE_std(5x5CV): 0.003
Gas: CompRate_ylog; IIRA13+OPERA_XGB MAE(5x5CV): 0.12; MAE_std(5x5CV): 0.002
Gas: CompRate_ylog; IIRA14+OPERA_XGB Q2(5x5CV): 0.65; Q2_std(5x5CV): 0.02
Gas: CompRate_ylog; IIRA14+OPERA_XGB RMSE(5x5CV): 0.15; RMSE_std(5x5CV): 0.004
Gas: CompRate_ylog; IIRA14+OPERA_XGB MAE(5x5CV): 0.12; MAE_std(5x5CV): 0.004
Gas: CompRate_ylog; IIRA14P+OPERA_XGB Q2(5x5CV): 0.62; Q2_std(5x5CV): 0.02
Gas: 

In [59]:
# Mean and standard deviation of each metric over the CV repeats.
q2_mean, q2_std = np.mean(q2_cv), np.std(q2_cv)
rmse_mean, rmse_std = np.mean(rmse_cv), np.std(rmse_cv)
mae_mean, mae_std = np.mean(mae_cv), np.std(mae_cv)

print(f'Gas: {target_col}; Q2(5x5CV): {round(q2_mean, 3)}; Q2_std(5x5CV): {round(q2_std, 2)}')
print(f'Gas: {target_col}; RMSE(5x5CV): {round(rmse_mean, 2)}; RMSE_std(5x5CV): {round(rmse_std, 3)}')
print(f'Gas: {target_col}; MAE(5x5CV): {round(mae_mean, 2)}; MAE_std(5x5CV): {round(mae_std, 3)}')

Gas: CompRate_ylog; Q2(5x5CV): 0.645; Q2_std(5x5CV): 0.01
Gas: CompRate_ylog; RMSE(5x5CV): 0.15; RMSE_std(5x5CV): 0.003
Gas: CompRate_ylog; MAE(5x5CV): 0.12; MAE_std(5x5CV): 0.002


### Analyzing outliers

In [60]:
# Collapse the per-repeat RMSE values into their mean; this scalar is the
# reference error for the outlier threshold below.
# Note: `rmse_cv` is rebound here from a list to a single number.
rmse_cv = np.mean(rmse_cv)  # the value of rmse from CV for outlier analysis

In [61]:
# Put the per-repeat consensus predictions side by side (one 'avg_pred'
# column per repeat, plus 'cid' from the first repeat).
# Note: `all_stats_res` is rebound here from a list of frames to one frame.
predictions_by_repeat = pd.concat(all_stats_res, axis=1)

# Attach the observed target value of every compound.
observed_values = model_res.loc[:, ['cid', f'{target_col}']]
all_stats_res = pd.merge(predictions_by_repeat, observed_values, on='cid', how='left')

# Selecting ['avg_pred'] returns every column with that name, so the mean
# runs over all repeats.
all_stats_res.loc[:, 'avg_pred_fin'] = all_stats_res.loc[:, ['avg_pred']].mean(axis=1)

# Absolute deviation of the final prediction from the observed value.
signed_error = all_stats_res.loc[:, 'avg_pred_fin'] - all_stats_res.loc[:, f'{target_col}']
all_stats_res['abs_err'] = signed_error.abs()

In [62]:
# Flag compounds whose absolute error is at least twice the mean CV RMSE.
# Rows below the threshold are not written to (NaN when 'outlier' is first created).
all_stats_res.loc[all_stats_res['abs_err']>=2*rmse_cv, 'outlier'] = 1

### Saving the dataframe with stats for the figure

## (re)Fitting and saving the models

In [67]:
# Refit every selected model on the complete data set and pickle it.
y = model_res.loc[:, f'{target_col}']

# Create the output folder up front so that open(..., 'wb') does not fail
# when it is missing (e.g. on a fresh checkout).
Path('../results/models/comprate').mkdir(parents=True, exist_ok=True)

for desc_name in descs_xgb['desc_name']:
    # NOTE(review): presumably the first four columns are the identifier /
    # target columns and the rest are descriptors - confirm.
    X = desc_dict[desc_name].iloc[:, 4:]
    # NOTE(review): eval() executes whatever expression is stored in the CSV;
    # only use hyperparameter files from a trusted source.
    xgb_params = eval(descs_xgb.loc[descs_xgb['desc_name']==desc_name, f'{target_col}_param_XGB'].iloc[0])
    xgb_params.update({'n_jobs': -1})
    xgb_model = XGBRegressor(**xgb_params)
    xgb_model.fit(X, y)
    with open(f'../results/models/comprate/Comprate_{desc_name}_xgb.pkl', 'wb') as mf:
        pickle.dump(xgb_model, mf)

for desc_name in descs_rf['desc_name']:
    X = desc_dict[desc_name].iloc[:, 4:]
    # NOTE(review): see the eval() remark in the XGB loop.
    rf_params = eval(descs_rf.loc[descs_rf['desc_name']==desc_name, f'{target_col}_param_RF'].iloc[0])
    rf_params.update({'n_jobs': -1})
    rf_model = RandomForestRegressor(**rf_params)
    rf_model.fit(X, y)
    with open(f'../results/models/comprate/Comprate_{desc_name}_rf.pkl', 'wb') as mf:
        pickle.dump(rf_model, mf)