# Importing libraries

In [43]:
import pickle

import numpy as np
import pandas as pd

from utils import bounding_box

# Screening

## Loading the best models and the dataset used to store the results

In [44]:
# Read the two best-model tables (Random Forest first, then XGBoost).
# Only their `desc_name` column is used by the screening loops further down.
descs_rf, descs_xgb = (
    pd.read_csv(table_path)
    for table_path in (
        '../results/best_hyperparameters/dG_best_models_RF.csv',
        '../results/best_hyperparameters/dG_best_models_XGB.csv',
    )
)

In [45]:
# Table of screening compounds; the cells below append AD flags and
# per-model predictions to it as new columns.
screen_pred = pd.read_excel(io='../screening/dataset.xlsx')

## Screening with Random Forest models

In [46]:
for desc_name in descs_rf['desc_name']:
    # Training descriptors (used for the applicability-domain check); the
    # first four columns are dropped - presumably identifiers/targets, confirm.
    train_set = pd.read_csv(f'../data/{desc_name}.csv').iloc[:, 4:]
    # Screening descriptors; the first two columns are dropped here.
    screen_set = pd.read_csv(f'../screening/{desc_name}_screen.csv').iloc[:, 2:]

    # Load the fitted Random Forest model for this descriptor set.
    # NOTE(review): pickle.load executes arbitrary code - only open trusted files.
    with open(f'../results/models/dg/dG_{desc_name}_rf.pkl', 'rb') as mf:
        model = pickle.load(mf)

    # Rows returned by bounding_box get the AD flag set to 1; every other row
    # keeps a missing value in the `<desc_name>_AD` column.
    ad_rows = bounding_box(train_set, screen_set)
    screen_pred.loc[ad_rows, f'{desc_name}_AD'] = 1

    # Align the screening features with the training columns, replace missing
    # descriptor values with 0, then store the predictions.
    screen_set = screen_set.loc[:, train_set.columns].fillna(0)
    screen_pred.loc[:, f'{desc_name}_RF'] = model.predict(screen_set)

## Screening with XGBoost models

In [47]:
# Descriptor sets already processed by the Random Forest loop. A set of the
# column *values* is required: `x in series` tests the Series index, not its
# values, so the original guard was always true.
rf_desc_names = set(descs_rf['desc_name'])

for desc_name in descs_xgb['desc_name']:
    train_set = pd.read_csv(f'../data/{desc_name}.csv')  # training set for AD estimation
    train_set = train_set.iloc[:, 4:]  # removing unnecessary columns
    screen_set = pd.read_csv(f'../screening/{desc_name}_screen.csv')  # screening set
    screen_set = screen_set.iloc[:, 2:]  # removing unnecessary columns
    # Load the fitted XGBoost model for this descriptor set.
    # NOTE(review): pickle.load executes arbitrary code - only open trusted files.
    with open(f'../results/models/dg/dG_{desc_name}_xgb.pkl', 'rb') as mf:
        model = pickle.load(mf)
    # The AD flag depends only on the descriptor set, so it is computed here
    # only when the Random Forest loop has not already written it.
    if desc_name not in rf_desc_names:
        screen_pred.loc[bounding_box(train_set, screen_set), f'{desc_name}_AD'] = 1
    # Align features with the training columns, fill missing values with 0,
    # and store the predictions.
    screen_set = screen_set.loc[:, train_set.columns]
    screen_set = screen_set.fillna(0)
    screen_pred.loc[:, f'{desc_name}_XGB'] = model.predict(screen_set)

## Averaging results considering AD

In [48]:
# Consensus prediction per compound.
#
# Columns are selected by exact suffix instead of substring so that the AD
# flag of one descriptor set cannot blank out the predictions of another set
# whose name merely contains it, and so that re-running this cell does not
# treat the output column 'dg_avg_pred_AD' as an AD flag.
pred_suffixes = ('_RF', '_XGB')
cols = [col for col in screen_pred.columns if col.endswith(pred_suffixes)]
ad_cols = [
    col for col in screen_pred.columns
    if col.endswith('_AD') and col != 'dg_avg_pred_AD'
]

# Work on a copy so the raw per-model predictions in screen_pred stay intact.
temp_df = screen_pred.copy(deep=True)
for ad_col in ad_cols:
    base_name = ad_col[:-len('_AD')]
    flagged_pred_cols = [
        f'{base_name}{suffix}'
        for suffix in pred_suffixes
        if f'{base_name}{suffix}' in temp_df.columns
    ]
    if flagged_pred_cols:
        # Predictions in rows whose AD flag equals 1 are excluded from the
        # AD-aware consensus (presumably 1 marks compounds outside the
        # applicability domain - confirm against utils.bounding_box).
        temp_df.loc[temp_df[ad_col] == 1, flagged_pred_cols] = np.nan

# Median over the remaining predictions; NaN when every prediction of a row
# was excluded above.
temp_df['avg_pred'] = temp_df.loc[:, cols].median(axis=1)
screen_pred.loc[:, 'dg_avg_pred'] = temp_df['avg_pred']
screen_pred.loc[:, 'dg_avg_pred_AD'] = temp_df['avg_pred']

# Fallback for rows without any retained prediction: average all predictions.
# NOTE(review): the fallback uses the mean while the AD-aware value uses the
# median - confirm this difference is intentional.
no_ad_consensus = screen_pred['dg_avg_pred_AD'].isna()
screen_pred.loc[no_ad_consensus, 'dg_avg_pred'] = screen_pred.loc[no_ad_consensus, cols].mean(axis=1)

# Antilog of the negated consensus value: 10 ** (-dg_avg_pred).
screen_pred['dg_antilog_avg_pred'] = 10.0 ** (-screen_pred['dg_avg_pred'])

## Saving results

In [49]:
# Persist the screening table (predictions, AD flags, consensus columns)
# as CSV without writing the row index.
screen_pred.to_csv(path_or_buf='../results/screening/dG_pred.csv', index=False)