# Importing libraries

In [None]:
import sys
from pathlib import Path
import glob
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse, r2_score as r2
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import RandomForestRegressor
import pickle
from utils import fragment_control, bounding_box, mkdir

# Reading descriptors

In [None]:
# Load every descriptor table found in ../data into a dictionary keyed by
# the file name without its extension.
desc_dict = {
    Path(csv_path).stem: pd.read_csv(csv_path)
    for csv_path in glob.glob('../data/*.csv')
}

# Models for the computed rate of reaction (R<sub>QSPR</sub>)

## Reading hyperparameters of the best models

In [None]:
# Name of the target column that the models in this section are fitted to.
target_col = 'dG_MD_ylog'

# Tables listing the selected descriptor sets and their hyperparameters,
# one table per algorithm (random forest first, then XGBoost).
descs_rf, descs_xgb = (
    pd.read_csv(table_path)
    for table_path in (
        '../results/best_hyperparameters/dG_best_models_RF.csv',
        '../results/best_hyperparameters/dG_best_models_XGB.csv',
    )
)

In [None]:
# Base table that collects the predictions: the identifier and target columns
# are taken from the descriptor table of the first RF entry.
# The explicit .copy() makes the frame independent of the table stored in
# desc_dict, so adding columns below can neither modify that table nor trigger
# a chained-assignment warning.
model_res = desc_dict[descs_rf.loc[0, 'desc_name']].loc[:, ['cid', 'std_smiles', 'CompRate_ylog', 'dG_MD_ylog']].copy()

# One placeholder column per (descriptor set, algorithm) pair. NaN is used
# instead of None so the prediction columns are float rather than object
# dtype, which keeps later numeric aggregation (median/mean) well defined.
for desc in descs_xgb['desc_name']:
    model_res[desc + '_XGB'] = np.nan
for desc in descs_rf['desc_name']:
    model_res[desc + '_RF'] = np.nan

## Evaluation of the models in 5 times repeated 5-fold CV

In [None]:
folds_stats = []  # one dataframe with the held-out predictions per CV split

nspl = 5  # number of folds for CV procedure

nrpts = 5  # number of repeats of CV procedure

kf = RepeatedKFold(n_splits=nspl, n_repeats=nrpts, random_state=0)

# Hyperparameters are stored in the CSV tables as the text of a dict; parse
# them once here instead of on every CV split.
# NOTE(review): eval() executes whatever text is stored in those CSV files.
# That is only acceptable while the files are produced by this project; if
# they can come from elsewhere, switch to a safe parser after confirming the
# stored values are plain Python literals.
xgb_param_sets = {}
for desc_name in descs_xgb['desc_name']:
    xgb_params = eval(descs_xgb.loc[descs_xgb['desc_name']==desc_name, f'{target_col}_param_XGB'].iloc[0])
    xgb_params.update({'n_jobs': -1})
    xgb_param_sets[desc_name] = xgb_params

rf_param_sets = {}
for desc_name in descs_rf['desc_name']:
    rf_params = eval(descs_rf.loc[descs_rf['desc_name']==desc_name, f'{target_col}_param_RF'].iloc[0])
    rf_params.update({'n_jobs': -1})
    rf_param_sets[desc_name] = rf_params

# Descriptor sets already handled by the XGB loop; their *_AD column must not
# be computed a second time in the RF loop.
xgb_desc_names = set(descs_xgb['desc_name'].dropna())

for i, (train_index, test_index) in enumerate(kf.split(model_res)):

    # kf.split yields positional indices; they are used as labels with .loc,
    # which relies on model_res and the descriptor tables having the default
    # 0..n-1 index.
    y_train = model_res.loc[train_index, target_col]
    df_temp = model_res.copy(deep=True)

    # Tag the held-out rows with the running split number. This is done once
    # per split, independent of the model loops, so the column exists even
    # when one of the descriptor lists is empty.
    df_temp.loc[test_index, 'CV_fold'] = i

    for desc_name in descs_xgb['desc_name']:

        # Drop the first four columns; presumably these are the identifier
        # and target columns selected for model_res -- confirm against the
        # CSV layout.
        X = desc_dict[desc_name].iloc[:, 4:]

        xgb_model = XGBRegressor(**xgb_param_sets[desc_name])
        xgb_model.fit(X.loc[train_index], y_train)

        df_temp.loc[test_index, f'{desc_name}_XGB'] = xgb_model.predict(X.loc[test_index])
        # bounding_box comes from utils; presumably it returns the labels of
        # the test compounds lying outside the training descriptor range
        # (the later cells treat a value of 1 as "outside the AD") -- confirm.
        df_temp.loc[bounding_box(X.loc[train_index], X.loc[test_index]), f'{desc_name}_AD'] = 1

    for desc_name in descs_rf['desc_name']:

        X = desc_dict[desc_name].iloc[:, 4:]

        rf_model = RandomForestRegressor(**rf_param_sets[desc_name])
        rf_model.fit(X.loc[train_index], y_train)

        df_temp.loc[test_index, f'{desc_name}_RF'] = rf_model.predict(X.loc[test_index])

        if desc_name not in xgb_desc_names:
            df_temp.loc[bounding_box(X.loc[train_index], X.loc[test_index]), f'{desc_name}_AD'] = 1

    # Keep only the rows that were held out in this split.
    folds_stats.append(df_temp.loc[~df_temp['CV_fold'].isna()])
    print(i + 1)  # progress: number of finished splits

In [None]:
# Stack the held-out rows of all CV splits into a single long table.
all_stats = pd.concat(folds_stats)

# Names of the columns holding model predictions (RF or XGB).
pred_columns = [
    name for name in all_stats.columns
    if any(tag in name for tag in ('_RF', '_XGB'))
]

### Consideration of the applicability domain (AD)

In [None]:
# Per-descriptor-set applicability domain (AD) flag columns. Matching on the
# '_AD' suffix and skipping the aggregate 'avg_pred_AD' column (created in a
# later cell) keeps this cell re-runnable and avoids picking up unrelated
# columns that merely contain the letters 'AD'.
ad_columns = [col for col in all_stats.columns if col.endswith('_AD') and col != 'avg_pred_AD']
all_stats.loc[:, ad_columns] = all_stats.loc[:, ad_columns].fillna(0)  # no flag recorded => 0

# Number of descriptor sets for which the compound carries no AD flag (value 0).
n_sets_inside_ad = (all_stats.loc[:, ad_columns]==0).astype(int).sum(axis=1)

# 1 if the compound is unflagged for at least 3 descriptor sets, else 0.
# Computed directly as integers so the column is numeric instead of an
# object column built from None placeholders.
all_stats['applicability_domain'] = (n_sets_inside_ad>=3).astype(int)
all_stats['applicability_domain_conf'] = n_sets_inside_ad

In [None]:
# Per compound (cid): mean of the 0/1 `applicability_domain` flag over all of its
# rows, followed by a tally of how many compounds share each mean value.
all_stats.groupby('cid')['applicability_domain'].mean().value_counts()  # how many compounds are outside AD in CV procedure?

In [None]:
# Work on a scratch copy so the individual predictions in all_stats stay intact
# while predictions are aggregated considering the AD.
all_stats_temp = all_stats.copy(deep=True)

for ad_col in ad_columns:
    # Prediction columns belonging to the same descriptor set as this AD column.
    rf_tag = ad_col.replace('_AD', '_RF')
    xgb_tag = ad_col.replace('_AD', '_XGB')
    flagged_rows = all_stats_temp[ad_col] == 1
    related_preds = [name for name in all_stats_temp.columns if rf_tag in name or xgb_tag in name]
    # Blank the predictions of rows flagged (value 1) for this descriptor set.
    all_stats_temp.loc[flagged_rows, related_preds] = None

# Consensus prediction: row-wise median of the predictions that were not blanked.
all_stats_temp['avg_pred'] = all_stats_temp.loc[:, pred_columns].median(axis=1)
all_stats.loc[:, 'avg_pred'] = all_stats_temp['avg_pred']
all_stats.loc[:, 'avg_pred_AD'] = all_stats_temp['avg_pred']

# Rows where every prediction was blanked have no AD-aware consensus; for those,
# fall back to the median over all individual predictions.
no_ad_consensus = all_stats['avg_pred_AD'].isna()
all_stats.loc[no_ad_consensus, 'avg_pred'] = all_stats.loc[no_ad_consensus, pred_columns].median(axis=1)

In [None]:
# Display the table of held-out predictions with the AD and consensus columns.
all_stats

### Showing stats obtained in 5x5 CV

In [None]:
# Per-repeat statistics of the consensus prediction. Each repeat of the CV
# procedure covers nspl consecutive CV_fold numbers.
q2_cv, mae_cv, rmse_cv, all_stats_res = [], [], [], []

for cv_fold in range(0, nspl*nrpts, nspl):
    # Rows held out during the current repeat.
    in_repeat = all_stats['CV_fold'].isin(list(range(cv_fold, cv_fold+nspl)))
    observed = all_stats.loc[in_repeat, [target_col]]
    predicted = all_stats.loc[in_repeat, ['avg_pred']]

    # Only the first repeat carries the 'cid' column; the following repeats
    # contribute their consensus prediction alone.
    kept_columns = ['cid', 'avg_pred'] if cv_fold == 0 else ['avg_pred']
    all_stats_res.append(all_stats.loc[in_repeat, kept_columns])

    q2_cv.append(r2(observed, predicted))
    rmse_cv.append(mse(observed, predicted)**0.5)
    mae_cv.append(mae(observed, predicted))

In [None]:
# Mean and standard deviation of each metric over the CV repeats.
q2_mean, q2_std = np.mean(q2_cv), np.std(q2_cv)
rmse_mean, rmse_std = np.mean(rmse_cv), np.std(rmse_cv)
mae_mean, mae_std = np.mean(mae_cv), np.std(mae_cv)

print(f'Gas: {target_col}; Q2(5x5CV): {round(q2_mean, 2)}; Q2_std(5x5CV): {round(q2_std, 2)}')
print(f'Gas: {target_col}; RMSE(5x5CV): {round(rmse_mean, 2)}; RMSE_std(5x5CV): {round(rmse_std, 3)}')
print(f'Gas: {target_col}; MAE(5x5CV): {round(mae_mean, 2)}; MAE_std(5x5CV): {round(mae_std, 3)}')

### Analyzing outliers

In [None]:
# Collapse the list of per-repeat RMSE values into their mean.
# NOTE: this rebinds `rmse_cv` from a list to a scalar, so the summary cell
# above reports a zero RMSE spread if it is re-run after this one.
rmse_cv = np.mean(rmse_cv)  # the value of rmse from CV for outlier analysis

In [None]:
# Twice the mean CV RMSE: the absolute-error threshold used to flag outliers.
rmse_cv*2

In [None]:
# Put the per-repeat consensus predictions side by side (aligned on the row
# index), then attach the observed target value of every compound.
all_stats_res = pd.concat(all_stats_res, axis=1)
observed_targets = model_res.loc[:, ['cid', target_col]]
all_stats_res = all_stats_res.merge(observed_targets, on='cid', how='left')

# Every repeat contributed a column named 'avg_pred'; selecting with a list
# returns all of them, so this is the mean prediction over the repeats.
all_stats_res.loc[:, 'avg_pred_fin'] = all_stats_res.loc[:, ['avg_pred']].mean(axis=1)

# Absolute deviation of the final prediction from the observed value.
all_stats_res['abs_err'] = (all_stats_res['avg_pred_fin'] - all_stats_res[target_col]).abs()

In [None]:
# Flag compounds whose absolute error is at least twice the mean CV RMSE.
# Rows that are not flagged are left as NaN in the new 'outlier' column.
all_stats_res.loc[all_stats_res['abs_err']>=2*rmse_cv, 'outlier'] = 1

In [None]:
# Show the ten compounds with the largest absolute error.
all_stats_res.sort_values('abs_err', ascending=False).head(10)

### Saving the dataframe with stats for the figure

In [None]:
# Write the per-compound CV statistics to disk. The output folder is created
# first (no error if it already exists) so the export does not fail on a
# fresh checkout where ../results/model_stats is missing.
stats_path = Path('../results/model_stats/dG_5x5CV_stats.csv')
stats_path.parent.mkdir(parents=True, exist_ok=True)
all_stats_res.to_csv(stats_path, index=False)

## (re)Fitting and saving the models

In [None]:
# Refit every selected model on the complete data set and pickle it.
y = model_res.loc[:, target_col]

# Create the target folder up front so that open(..., 'wb') below cannot fail
# because the directory is missing.
models_dir = Path('../results/models/dg')
models_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): eval() executes whatever text is stored in the hyperparameter
# CSV files. That is only acceptable while those files are produced by this
# project; if they can come from elsewhere, switch to a safe parser after
# confirming the stored values are plain Python literals.
for desc_name in descs_xgb['desc_name']:
    # Drop the first four columns; presumably the identifier and target
    # columns -- confirm against the CSV layout.
    X = desc_dict[desc_name].iloc[:, 4:]
    xgb_params = eval(descs_xgb.loc[descs_xgb['desc_name']==desc_name, f'{target_col}_param_XGB'].iloc[0])
    xgb_params.update({'n_jobs': -1})
    xgb_model = XGBRegressor(**xgb_params)
    xgb_model.fit(X, y)
    with open(models_dir / f'dG_{desc_name}_xgb.pkl', 'wb') as mf:
        pickle.dump(xgb_model, mf)

for desc_name in descs_rf['desc_name']:
    X = desc_dict[desc_name].iloc[:, 4:]
    rf_params = eval(descs_rf.loc[descs_rf['desc_name']==desc_name, f'{target_col}_param_RF'].iloc[0])
    rf_params.update({'n_jobs': -1})
    rf_model = RandomForestRegressor(**rf_params)
    rf_model.fit(X, y)
    with open(models_dir / f'dG_{desc_name}_rf.pkl', 'wb') as mf:
        pickle.dump(rf_model, mf)