# Evaluating feature sets

This notebook contains the subprotocol for evaluating feature sets that have already been filtered in previous stages of the overall protocol. The following are the expected outputs for each feature set:

1. A prevalence value per 1,000.
2. A [class balance accuracy value](http://search.proquest.com/docview/1500559170?accountid=37552).
3. An odds ratio value.
4. A positive predictive value.
5. A negative predictive value.

We also provide the counts that make up the contingency table so that readers can calculate their own evaluation statistics.


## Imports and helper functions

In [1]:
import os
os.chdir('/home/jupyter/UNSEEN/c-mcinerney-workspace')
%run 'UNSEEN_helper_functions.ipynb'
%store -r

### Retrieve the retriebable parts of the output table from existing files.

In [2]:
order_options = ['Individuals', 'Pairs', 'Triplets']
data = []
for i_order in order_options:
    fs_order = i_order
    data_dir = pathlib.Path(os.getcwd() + '/Mutual information saves/' + fs_order)
    globbed_files = data_dir.glob("*.parquet") #creates a list of all csv files
    for parquet_file in globbed_files:
        read_list = pandas.read_parquet(parquet_file).values.tolist() # Read parquet to list.
        read_list = numpy.array(list((map(list, zip(*read_list))))) # Transpose to make list for feature set and list for mutual information.
        len_read_list = numpy.shape(read_list)[1]
        file_source = os.path.basename(parquet_file)
        fs_source, fs_casenessType, fs_representation, dump = file_source.split("_")
        temp_list = \
            list(
                zip(numpy.repeat(file_source, len_read_list),
                    numpy.repeat(fs_source, len_read_list),
                    numpy.repeat(fs_order, len_read_list),
                    numpy.repeat(fs_casenessType, len_read_list),
                    numpy.repeat(fs_representation, len_read_list),
                    numpy.array(read_list[:][0]),
                    numpy.array(read_list[:][1])
                   )
        ) # A zip colwise concat of lists.
        data.append(temp_list)      
        
flat_metadata = \
    pandas.DataFrame([item for sublist in data for item in sublist],
                     columns = ['file_source',
                                'Source',
                                'Order',
                                'Caseness_type',
                                'Representation',
                                'Feature_set',
                                'Mutual_information']
                    )

### Check that the required fs_* dataframes exist.

In [3]:
%%capture
ls_fs_sources = flat_metadata['Source'].unique().tolist()
for i_fs_source in ls_fs_sources:
    if 'fs_' + i_fs_source not in globals():
        script_to_run = "\"./UNSEEN_create_" + i_fs_source + "_feature_sets.ipynb\""
        print(script_to_run)
        %run $script_to_run

### For each feature set in 'flat_metadata', extract and append evaluation statistics.

In [4]:
ls_output = []
for i_fs in range(len(flat_metadata)):
    # Choose the caseness variable of interest.
    caseness_type = flat_metadata['Caseness_type'][i_fs]
    if caseness_type == 'multinomial':
        vec_caseness = caseness_array['CMHD']
    elif caseness_type == 'definite':
        vec_caseness = caseness_array['CMHD_dx_and_rx']
    elif caseness_type == 'possible':
        vec_caseness = caseness_array['CMHD_rx_not_dx']
    elif caseness_type == 'control':
        vec_caseness = caseness_array['CMHD_control']

    
    # Choose the feature-set components for the feature set of interest.
    fs_components_names = flat_metadata['Feature_set'][i_fs].split("-")
    fs_components = fs_literature[fs_components_names]
    
    # Choose the representation for the feature set of interest.
    representation = flat_metadata['Representation'][i_fs]
    if representation == 'ALL':
        vec_featureSet = fs_components.all(True)
    elif representation == "MULTI":
        vec_featureSet, dump = mutlinomRepresentation(fs_components)
    
    
    # Pass the caseness and the feature-set vectors to evaloutputs().
    ls_output.append(evaloutputs(vec_featureSet,
                                 vec_caseness)
                )
# Flatten the appended list of evaluation statistics.
flat_output = \
    pandas.DataFrame(ls_output,
                     columns = ['prevalence per thousand',
                                'cba',
                                'odds ratio',
                                'ppv',
                                'npv',
                                'tn',
                                'fn',
                                'fp',
                                'tp']
                    )

# Append the evaluation statistics to the metadata about the feature set.
evaluation_dataframe = pandas.concat([flat_metadata, flat_output], axis=1, join='inner')
#display(evaluation_dataframe)

# Rename column names for saving.
evaluation_dataframe.rename = \
                    ['Order',
                     'Caseness_type',
                     'Representation',
                     'Feature_set',
                     'Mutual_information',
                     'FeatureSet_prevalence_per_thousand',
                     'Class_balance_acccuracy',
                     'Odds_ratio',
                     'Positive_predictive_value',
                     'Negative_predictive_value',
                     'True_positive_count',
                     'True_negativecount',
                     'False_positive_count',
                     'False_negative_count']

# Save evaluation outputs.
savelocation = "Evaluation/"
evaluation_dataframe.to_csv(savelocation + datetime.strftime(datetime.now(), '%Y_%m_%d_%H:%M:%S') + "_Evaluation statistics.csv", index = False)
evaluation_dataframe.astype(str).to_parquet(savelocation + datetime.strftime(datetime.now(), '%Y_%m_%d_%H:%M:%S') + "_Evaluation statistics.parquet", index = False)
print("\nEvaluation statistics saved.")


Evaluation statistics saved.


In [23]:
evaluation_dataframe

Unnamed: 0,file_source,Source,Order,Caseness_type,Representation,Feature_set,Mutual_information,prevalence per thousand,cba,odds ratio,ppv,npv,tn,fn,fp,tp
0,literature_definite_MULTI_batch1.parquet,literature,Individuals,definite,MULTI,metabolicSyndrome,0.0010560507846995,< 0.01,0.5,Undefined: One of the odds is zero.,0.0,≈ 1.00,703123,10,73,0
1,literature_definite_ALL_batch1.parquet,literature,Individuals,definite,ALL,metabolicSyndrome,0.0010560507846995,< 0.01,0.5,Undefined: One of the odds is zero.,0.0,≈ 1.00,703123,10,73,0
2,literature_definite_MULTI_batch1.parquet,literature,Pairs,definite,MULTI,homeless-metabolicSyndrome,0.0010560507846995,0.06,0.5,Undefined: One of the odds is zero.,0.0,≈ 1.00,699029,10,4094,0
3,literature_definite_MULTI_batch1.parquet,literature,Pairs,definite,MULTI,poverty-metabolicSyndrome,0.0010560507846995,0.03,0.5,Undefined: One of the odds is zero.,0.0,≈ 1.00,701100,10,2023,0
4,literature_definite_MULTI_batch1.parquet,literature,Pairs,definite,MULTI,sleepDisturbance-metabolicSyndrome,0.0010560507846978,0.42,0.48,9.88,< 0.01,≈ 1.00,673877,7,29246,3
5,literature_definite_MULTI_batch1.parquet,literature,Pairs,definite,MULTI,tinnitus-metabolicSyndrome,0.0010560507846995,0.17,0.49,6.6,< 0.01,≈ 1.00,691484,9,11639,1
6,literature_definite_MULTI_batch1.parquet,literature,Triplets,definite,MULTI,homeless-poverty-metabolicSyndrome,0.0010560507846977,0.09,0.5,Undefined: One of the odds is zero.,0.0,≈ 1.00,697213,10,1816,0
7,literature_definite_MULTI_batch1.parquet,literature,Triplets,definite,MULTI,homeless-sleepDisturbance-metabolicSyndrome,0.0010560507846979,0.47,0.48,9.99,< 0.01,≈ 1.00,670285,7,28744,3
8,literature_definite_MULTI_batch1.parquet,literature,Triplets,definite,MULTI,homeless-tinnitus-metabolicSyndrome,0.0010560507846995,0.22,0.49,6.6,< 0.01,≈ 1.00,687452,9,11577,1
9,literature_definite_MULTI_batch1.parquet,literature,Triplets,definite,MULTI,poverty-sleepDisturbance-metabolicSyndrome,0.0010560507846994,0.44,0.48,9.91,< 0.01,≈ 1.00,672028,7,29072,3
