# Evaluating feature sets

This notebook contains the subprotocol for evaluating feature sets that have already been filtered in previous stages of the overall protocol. The following six outputs are expected for each feature set, computed separately against each of the three caseness variables – 'active caseness', 'possible caseness', and 'no caseness' – giving 18 outputs per feature set in total:
1. A contingency table.
2. A prevalence value per 1,000.
3. A [class balance accuracy value](http://search.proquest.com/docview/1500559170?accountid=37552).
4. An odds ratio value.
5. A positive predictive value.
6. A negative predictive value.

This notebook defines a function that accepts a feature set as an n-by-1 vector and a caseness variable as an n-by-1 vector, and computes the six outputs mentioned above.

## What

## How

## Define the function to compute the evaluation outputs.

In [113]:
# Define the function that computes the evaluation outputs.
# The function automatically saves the contingency table but
# also returns it.
def _contingency_count(table, row, col):
    # Return the count in cell (row, col) of the contingency table,
    # or 0 when that row or column label does not occur in the data.
    try:
        return int(table.loc[row, col])
    except KeyError:
        return 0


def evaloutputs(vec_featureSet,
                vec_caseness,
                savelocation = None):
    """Compute the evaluation outputs for one feature set and one caseness variable.

    Parameters
    ----------
    vec_featureSet : pandas.Series
        Feature-set values; 0 is treated as "feature set absent". Its
        ``name`` is used in the saved file name.
    vec_caseness : pandas.Series
        Caseness values of the same length. Its ``name`` is used in the
        saved file name.
    savelocation : str, optional
        Prefix (normally a directory ending in a path separator) that is
        prepended to the file name. Defaults to ``"Evaluation/"``.

    Returns
    -------
    tuple or None
        ``(contingencyTable, prevalence, cba, oddsRatio, ppv, npv)``, or
        ``None`` (after printing a message) when the two vectors differ in
        length. ``oddsRatio`` is a message string instead of a number when
        either cross-product of the 2x2 table is zero.

    Notes
    -----
    The 2x2 counts are read from the cells labelled 0 and 1 of the
    cross-tabulation; a label that does not occur counts as 0. Ratios with
    a zero denominator are reported as 0.

    The module-level flag ``chk_savelocation`` is set to ``True`` after the
    save location has been announced, so the message is printed only once
    until that flag is reset.
    """
    import os

    global chk_savelocation

    # ## Assess argument validity.

    # Check that both vectors are the same length.
    if len(vec_featureSet) != len(vec_caseness):
        print("\n**",
              "Feature-set and caseness vectors are of different lengths.",
             "**\n")
        return

    # Check and set save location.
    #
    # The flag only suppresses the repeated message; the default itself
    # must be applied on every call, otherwise a later call without a
    # save location would try to build a path from None.
    already_announced = globals().get("chk_savelocation", False)
    if savelocation is None:
        savelocation = "Evaluation/"
        if not already_announced:
            print("\nNo save location provided." +
                  "\n...Defaulting to ~/" + savelocation)
    elif not already_announced:
        print("\nSave location is " + savelocation)
    chk_savelocation = True

    # ## Contingency table.
    # Make contingency table.
    contingencyTable = pandas.crosstab(index = vec_featureSet,
                                       columns = vec_caseness)

    # Save contingency table, creating the target directory if needed.
    filepath = (f"{savelocation}ct__{vec_featureSet.name}__"
                f"{vec_caseness.name}__.csv")
    directory = os.path.dirname(filepath)
    if directory:
        os.makedirs(directory, exist_ok=True)
    # NOTE(review): index=False means the feature-set row labels are not
    # written to the file - confirm that this is intended.
    contingencyTable.to_csv(filepath, index = False)

    # Extract components of contingency table
    # (rows = feature set, columns = caseness).
    tn = _contingency_count(contingencyTable, 0, 0)
    fn = _contingency_count(contingencyTable, 0, 1)
    fp = _contingency_count(contingencyTable, 1, 0)
    tp = _contingency_count(contingencyTable, 1, 1)

    # ## Compute outputs.

    # Prevalence.
    #
    # I use 1 minus the prevalence of zeros because that
    # combines all the possibly-many values that indicate
    # the presence of the feature set.
    #
    # NOTE(review): this is a proportion rounded to two decimals, not a
    # rate per 1,000 - confirm which one downstream code expects.
    n_obs = len(vec_featureSet)
    if n_obs:
        n_absent = int((vec_featureSet == 0).sum())
        prevalence = round(1 - n_absent / n_obs, 2)
    else:
        prevalence = 0

    # Class balance accuracy: mean of the two per-class terms, each term
    # being 0 when its denominator is 0.
    positive_denominator = max(tp + fn, tp + fp)
    negative_denominator = max(tn + fp, tn + fn)
    positive_term = tp / positive_denominator if positive_denominator else 0
    negative_term = tn / negative_denominator if negative_denominator else 0
    cba = round(0.5 * (positive_term + negative_term), 2)

    # Odds ratio.
    if min(tp * tn, fp * fn) == 0:
        # Raw string: same characters as before (backslash-escaped
        # spaces) without relying on invalid escape sequences.
        oddsRatio = r'Not\ a\ number:\ One\ of\ the\ odds\ is\ zero.'
    else:
        oddsRatio = round((tp * tn) / (fp * fn), 2)

    # Positive predictive value.
    ppv = 0 if (tp + fp) == 0 else round(tp / (tp + fp), 2)

    # Negative predictive value.
    npv = 0 if (tn + fn) == 0 else round(tn / (tn + fn), 2)

    return contingencyTable, prevalence, cba, oddsRatio, ppv, npv

## Define function to iterate through the three caseness variables with a given feature set

In [None]:
# Define function to iterate through the three caseness variables with a given feature set.
#
# This function will be called by another, for each feature set.
#

def evaleachcaseness(vec_featureSet,
                     vec_caseness,
                     savelocation = None):
    """Run ``evaloutputs`` for one feature set against each caseness variable.

    The caseness variables are the ``CMHD_dx_and_rx``, ``CMHD_rx_not_dx``
    and ``CMHD_control`` columns of the module-level ``caseness_array``.

    Parameters
    ----------
    vec_featureSet : pandas.Series
        Feature-set values, passed unchanged to ``evaloutputs``.
    vec_caseness
        Not used: the caseness vectors are always taken from
        ``caseness_array``. Kept so that existing calls keep working.
    savelocation : str, optional
        Forwarded to ``evaloutputs``.

    Returns
    -------
    tuple of list
        ``(contingencyTables, prevalence, cba, oddsRatio, ppv, npv)``.
        Each list holds one entry per caseness column, in the order given
        above. If ``evaloutputs`` returns ``None`` for a column, ``None``
        is stored at that position in every list.
    """
    caseness_columns = ("CMHD_dx_and_rx", "CMHD_rx_not_dx", "CMHD_control")

    contingencyTables, prevalence, cba = [], [], []
    oddsRatio, ppv, npv = [], [], []
    collected = (contingencyTables, prevalence, cba, oddsRatio, ppv, npv)

    # Index by column label: iterating over a DataFrame yields the labels
    # themselves, not the column vectors.
    for column in caseness_columns:
        outputs = evaloutputs(vec_featureSet,
                              caseness_array[column],
                              savelocation = savelocation)
        if outputs is None:
            # Keep the lists aligned with the caseness columns.
            outputs = (None,) * len(collected)
        for bucket, value in zip(collected, outputs):
            bucket.append(value)

    return collected