In [21]:
import pandas as pd
import numpy as np

# Load the sequential-learning (SL) results to analyse. Only ONE results file is
# read: previously two files were read back-to-back into `df`, so the first read
# was wasted I/O whose result was immediately overwritten.
# The trailing "90-100" / "0-10" in the file names presumably denotes the target
# percentile range of the run -- TODO confirm.
# To analyse a different run, swap which line is active.
# df = pd.read_csv('sl_results/10Aug21/1/matbench_expt_gap/matbench_expt_gap-1939-215-90-100.csv')
df = pd.read_csv('sl_results/10Aug21/1/matbench_expt_gap/matbench_expt_gap-1939-215-0-10.csv')

# df = pd.read_csv('/Users/chrisborg/Downloads/matbenchbandgap-90-100-percentile.csv')


In [24]:
import sys

# Make the parent directory importable. Guarded so that re-running this cell
# does not keep appending duplicate '..' entries to sys.path.
if '..' not in sys.path:
    sys.path.append('..')

def calculate_DAF(df, n_targets=1):
    '''
    Calculates and returns Discovery Acceleration Factor (the avg number of SL iterations required to id N compounds in target range)
    Args:
        df (pd.DataFrame): sl_workflow dataframe
        n_targets (int):  adjustable parameter for number of targets researcher wants to find
    Return:
        itt_avg_dict (dict): Dict with avg of n_targets_found
    '''
    
    DAF = {'EV':[], 'EI':[], 'MU':[], 'Random':[]}

    for g in df.groupby(['acquisition_function', 'trial']):

        # af = acquisition function
        af = g[0][0]

        # if desired targets were found
        if n_targets in g[1]['n_targets_found'].values:

            # get the first iteration where n_targets_found == desired number of targets
            iterations_to_n_targets = g[1][g[1]['n_targets_found']==n_targets]['iteration'].values[0]
            # append to dict
            DAF[af].append(iterations_to_n_targets)

    
    # compute average and stdev
    DAF_avg = {key: (round(np.average(val), 1), round(np.std(val), 1)) for key, val in DAF.items()}
    print(DAF_avg)
    # normalize by random search (for a given decile, it should take 10 iterations on average to id a single target)
    n_iter_random = int(n_targets*10)
    
    # to calc norm_std we take the percent error (std/avg) and multiple it by the normalized avg val
    DAF_norm = {}
    for key, val in DAF_avg.items():
        avg = val[0]
        std = val[1]
        norm_avg = n_iter_random/avg
        percent_error = std/avg
        norm_std = norm_avg*percent_error
        DAF_norm[key] = (round(norm_avg,2), round(norm_std,2))

    return DAF_norm

In [25]:
# Report the normalized DAF for finding 1, 3 and 5 targets, in that order.
for target_count in (1, 3, 5):
    print(calculate_DAF(df, n_targets=target_count))

{'EV': (5.3, 7.3), 'EI': (2.7, 2.4), 'MU': (41.2, 23.9), 'Random': (9.8, 8.4)}
{'EV': (1.89, 2.6), 'EI': (3.7, 3.29), 'MU': (0.24, 0.14), 'Random': (1.02, 0.87)}
{'EV': (8.7, 8.6), 'EI': (6.0, 3.5), 'MU': (67.3, 25.2), 'Random': (29.8, 14.0)}
{'EV': (3.45, 3.41), 'EI': (5.0, 2.92), 'MU': (0.45, 0.17), 'Random': (1.01, 0.47)}
{'EV': (12.3, 10.4), 'EI': (8.8, 3.9), 'MU': (86.6, 14.3), 'Random': (49.4, 17.0)}
{'EV': (4.07, 3.44), 'EI': (5.68, 2.52), 'MU': (0.58, 0.1), 'Random': (1.01, 0.35)}
