# \<TITLE\>

The purpose of this notebook is to define the workflow set out in the PID.

My Vertex workbench is broken so I have to simulate the outcome array and the feature-set array produced from the Connected Bradford dataset.

In [1]:
import itertools
import math

import numpy
import pandas
import scipy.stats
import sklearn.metrics
# Number of simulated persons, i.e. rows in the simulated outcome and feature-set arrays.
n_persons = 699620

# Seed numpy's global random number generator so that the simulated arrays, and every
# value derived from them, are identical on each "Restart Kernel -> Run All".
simulation_seed = 42
numpy.random.seed(simulation_seed)

## Protocol set out in PID

### 1. Outcome array.
Simulate the outcome array

In [2]:
# Simulated outcome array: one row per person, with a Bernoulli(0.137) CMHD indicator
# (a single-trial binomial draw per person).
outcome_array = pandas.DataFrame(
    {
        'person_id': range(n_persons),
        'CMHD': numpy.random.binomial(1, 0.137, n_persons),
    }
)

### 2. Calculate the entropy of the outcome.

In [3]:
# Shannon entropy of the outcome, in nats (natural-log base).
# The full-precision value is kept in `entropy_outcome` because later cells use it as the
# threshold for filtering feature sets; rounding is applied only when printing.
entropy_outcome = scipy.stats.entropy(outcome_array['CMHD'].value_counts(), base = math.e)

# Entropy as a percentage of ln(2) nats, the maximum entropy of a binary variable.
entropy_outcome_scaled = round(entropy_outcome / math.log(2) * 100, 1)

print("Outcome entropy = ", round(entropy_outcome, 3), "nats")
print("Outcome scaled entropy = ", entropy_outcome_scaled, "%")

Outcome entropy =  0.399 nats
Outcome scaled entropy =  57.5 %


### 3. Calculate hit rates.

In [4]:
# Both counts are rounded to the nearest 5 before the rate is computed
# (presumably for statistical disclosure control - TODO confirm against the PID).
numerator = round(outcome_array['CMHD'].sum() / 5) * 5
denominator = round(outcome_array['CMHD'].shape[0] / 5) * 5

# Hit rate when predicting "CMHD" for everyone, i.e. the prevalence, as a percentage.
hitRate_all = round((numerator / denominator) * 100, 1)
# Hit rate when predicting "no CMHD" for everyone. Rounded so that floating-point
# subtraction artefacts (e.g. 0.09999999999999432) never reach the printed value.
hitRate_none = round(100 - hitRate_all, 1)

# Odds of not having CMHD relative to having it.
Odds_noYes = hitRate_none / (100 - hitRate_none)

print("Hit rate (all) =", hitRate_all, "%")
print("Hit rate (none) =", hitRate_none, "%")
print("Odds (No CMHD : CMHD) =", round(Odds_noYes, 2), "times more likely not to have CMHD than to have it.")

Hit rate (all) = 13.6 %
Hit rate (none) = 86.4 %
Odds (No CMHD : CMHD) = 6.35 times less likely to have CMHD than to have it.


### 4. Create the feature-set array.
Simulate a small feature-set array containing person_id and tallies of a few clinical codes.

In [5]:
# Parameters of the simulated feature-set array.
n_features = 11                 # number of simulated features, named f1 ... f<n_features>
n_levels_of_feature = 30        # number of binomial trials behind each simulated tally
prob_levels_of_feature = 0.05   # success probability of each trial

# One binomial tally per person for every feature. The comprehension draws the features
# in the order f1, f2, ..., so the sequence of random draws matches writing each column
# out by hand.
featureSet_array = \
    pandas.DataFrame(data =\
                    {
                        'person_id' : range(n_persons),
                        **{
                            f'f{i_feature}' : numpy.random.binomial(n = n_levels_of_feature,
                                                                    p = prob_levels_of_feature,
                                                                    size = n_persons)
                            for i_feature in range(1, n_features + 1)
                        }
                    }
                    )

### 5. Create the Feature Set ID table.
This table is a look-up table of feature-set IDs that shows which features make up the feature set. The table is instantiated on the assumption that feature sets will include no more than five features.

In [6]:
# Build an empty look-up table: one ID column plus up to five member-feature columns.
id_table_columns = ['Feature set ID'] + [f'Feature Set {i_member}' for i_member in range(1, 6)]
featureSet_ID_table = pandas.DataFrame(columns = id_table_columns)

# Seed the table with the individual features: each one is a feature set whose ID is
# its own name and whose only member is itself.
individual_features = featureSet_array.columns[featureSet_array.columns != 'person_id']
featureSet_ID_table['Feature set ID'] = individual_features
featureSet_ID_table['Feature Set 1'] = individual_features

### 6. Calculate the entropy of the feature sets.
Calculate the entropy values but do not save any that are less than the entropy of the outcome. The justification is that the mutual information between the outcome variable and any feature cannot exceed the smaller of the two entropies, i.e. $I(X_{i};CMHD) \leq \min\{H(X_{i}), H(CMHD)\}$. We don’t want any feature set that is worse than no feature set (i.e. having only the outcome prevalence to predict a random outcome value), so we don’t bother with any feature set whose entropy would cap the achievable mutual information below the entropy of the outcome.

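Note: The code below converts all feature sets into binary. The indicator is computed as `feature == 0`, so it is `True` when the value is zero (i.e. when a patient does not have a record of the SNOMED CT code) and `False` otherwise (i.e. when a patient has at least one record of the SNOMED CT code). Swapping the two labels of a binary variable does not change its entropy or its mutual information with the outcome.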

In [7]:
# Entropy (in nats) of every individual feature after reducing it to a binary indicator.
# The rows are collected first and the DataFrame is built once, rather than being grown
# one row at a time.
entropy_records = []
for i_featureSet in featureSet_array.columns[featureSet_array.columns != 'person_id']:
    name_var = i_featureSet
    # True when the person has a zero tally for this feature, False otherwise.
    binary_var = featureSet_array[i_featureSet] == 0
    entropy_records.append(
        (name_var, scipy.stats.entropy(binary_var.value_counts(), base = math.e))
    )
feature_entropy = pandas.DataFrame(entropy_records, columns = ['Feature set', 'Entropy'])

# Drop the entropy records for any feature whose entropy is less than the entropy of the outcome.
# `f_to_drop` holds index *labels*, so the dropped rows are selected with .loc (not .iloc).
f_to_drop = feature_entropy[ feature_entropy['Entropy'] < entropy_outcome ].index
nameDict={"Feature set":"Dropped feature set"}
features_dropped_due_to_low_entropy = feature_entropy.loc[f_to_drop].rename(columns=nameDict)

# Re-number the surviving rows 0..n-1. Later cells append with
# `feature_entropy.loc[len(feature_entropy)] = ...`; if the index kept gaps after the
# drop, that label could already exist and the append would overwrite a surviving row.
feature_entropy = feature_entropy.drop(index = f_to_drop).reset_index(drop = True)
print("Number of feature entropies dropped =", len(f_to_drop))

Number of feature entropies dropped = 0


### 7. Calculate the two-way mutual information of the feature sets and the outcome.
Two-way mutual information will only be calculated for those features whose entropies were greater than or equal to the outcome entropy. The features that were dropped are recorded in the `features_dropped_due_to_low_entropy` pandas.DataFrame.

In [8]:
# Instantiate storage for mutual information.
feature_mutual_information = pandas.DataFrame(columns = ['Feature set', 'Mutual information'])

# Define feature set for which mutual information will be calculated:
# every feature column except those dropped for low entropy.
f_to_calc = \
    set(featureSet_array.columns[featureSet_array.columns != 'person_id']).difference(\
      set(features_dropped_due_to_low_entropy['Dropped feature set']))

# Calculate mutual information and store the values.
if not f_to_calc:
    print("No feature's entropy was greater than or equal to the outcome entropy so no two-way mutual information values will be calculated.")
else:
    print("Some or all features' entropy values were greater than or equal to the outcome entropy so two-way mutual information values will be calculated.")
    # Walk the features in the column order of `featureSet_array`. Iterating the set
    # directly would give a row order that can change between kernel sessions, because
    # string hashing is randomised per Python process.
    for i_featureSet in [name for name in featureSet_array.columns if name in f_to_calc]:
        name_var = i_featureSet
        # True when the person has a zero tally for this feature, False otherwise.
        binary_var = featureSet_array[i_featureSet] == 0
        feature_mutual_information.loc[len(feature_mutual_information)] = \
            name_var, sklearn.metrics.mutual_info_score(binary_var, outcome_array['CMHD'])

Some or all features' entropy values were greater than or equal to the outcome entropy so two-way mutual information values will be calculated.


### 8. Calculate the entropy and two-way mutual information of pair-composite feature sets and the outcome.
The nested FOR LOOPs below also update the Feature Set ID table with the new features.

### 9. Calculate the entropy and two-way mutual information of triplet-composite feature sets and the outcome.
The composite feature sets will each be calculated separately to avoid having all the computation in one call, which risks losing everything if it crashes and places heavy demand on RAM.
The code below is an obvious extension of the nested FOR LOOPs used in step 8.

In [9]:
# Triplet-composite feature sets.
# The composite indicator is symmetric in its members (f1-f2-f3 carries exactly the same
# values as f3-f1-f2), so each unordered triplet is visited once with
# itertools.combinations instead of once for each of its 6 orderings.
# Members are taken in the column order of `featureSet_array` so that the generated IDs
# do not depend on the iteration order of the `f_to_calc` set.
ordered_features = [name for name in featureSet_array.columns if name in f_to_calc]

for i_featureSet, j_featureSet, k_featureSet in itertools.combinations(ordered_features, 3):
    # Create the feature ID for the triplet-composite feature set.
    name_var = "-".join([i_featureSet, j_featureSet, k_featureSet])

    # Update the feature set id table. This happens before the entropy filter, so the
    # table also lists composites that are subsequently skipped.
    featureSet_ID_table.loc[len(featureSet_ID_table),
                            ['Feature set ID', 'Feature Set 1', 'Feature Set 2',
                             'Feature Set 3']] = \
        [name_var, i_featureSet, j_featureSet, k_featureSet]

    # Define the triplet-composite feature set values.
    # ## The composite is True when all three member features are 0, and False otherwise.
    binary_var = \
        (featureSet_array[[i_featureSet, j_featureSet, k_featureSet]] == 0).all(axis = 1)

    # Calculate the entropy for the triplet-composite feature set and skip the composite
    # if its entropy is below the outcome entropy.
    f_ent = scipy.stats.entropy(binary_var.value_counts(), base = math.e)
    if f_ent < entropy_outcome:
        continue
    feature_entropy.loc[len(feature_entropy)] = name_var, f_ent

    # Calculate the mutual information for the triplet-composite feature set.
    feature_mutual_information.loc[len(feature_mutual_information)] = \
        name_var, sklearn.metrics.mutual_info_score(binary_var, outcome_array['CMHD'])

### 10. Calculate the entropy and two-way mutual information of quadruplet-composite feature sets and the outcome.

In [10]:
# Quadruplet-composite feature sets.
# The composite indicator is symmetric in its members, so each unordered quadruplet is
# visited once with itertools.combinations instead of once for each of its 24 orderings.
# Members are taken in the column order of `featureSet_array` so that the generated IDs
# do not depend on the iteration order of the `f_to_calc` set.
ordered_features = [name for name in featureSet_array.columns if name in f_to_calc]

for i_featureSet, j_featureSet, k_featureSet, l_featureSet in \
        itertools.combinations(ordered_features, 4):
    # Create the feature ID for the quadruplet-composite feature set.
    name_var = "-".join([i_featureSet, j_featureSet, k_featureSet, l_featureSet])

    # Update the feature set id table. This happens before the entropy filter, so the
    # table also lists composites that are subsequently skipped.
    featureSet_ID_table.loc[len(featureSet_ID_table),
                            ['Feature set ID', 'Feature Set 1', 'Feature Set 2',
                             'Feature Set 3',  'Feature Set 4']] = \
        [name_var, i_featureSet, j_featureSet, k_featureSet, l_featureSet]

    # Define the quadruplet-composite feature set values.
    # ## The composite is True when all four member features are 0, and False otherwise.
    binary_var = \
        (featureSet_array[[i_featureSet, j_featureSet, k_featureSet, l_featureSet]] == 0
         ).all(axis = 1)

    # Calculate the entropy for the quadruplet-composite feature set and skip the
    # composite if its entropy is below the outcome entropy.
    f_ent = scipy.stats.entropy(binary_var.value_counts(), base = math.e)
    if f_ent < entropy_outcome:
        continue
    feature_entropy.loc[len(feature_entropy)] = name_var, f_ent

    # Calculate the mutual information for the quadruplet-composite feature set.
    feature_mutual_information.loc[len(feature_mutual_information)] = \
        name_var, sklearn.metrics.mutual_info_score(binary_var, outcome_array['CMHD'])

### 11. Calculate the entropy and two-way mutual information of quintuplet-composite feature sets and the outcome.

In [None]:
# Quintuplet-composite feature sets.
# The composite indicator is symmetric in its members, so each unordered quintuplet is
# visited once with itertools.combinations instead of once for each of its 120 orderings.
# Members are taken in the column order of `featureSet_array` so that the generated IDs
# do not depend on the iteration order of the `f_to_calc` set.
ordered_features = [name for name in featureSet_array.columns if name in f_to_calc]

for i_featureSet, j_featureSet, k_featureSet, l_featureSet, m_featureSet in \
        itertools.combinations(ordered_features, 5):
    # Create the feature ID for the quintuplet-composite feature set.
    name_var = "-".join([i_featureSet, j_featureSet, k_featureSet, l_featureSet, m_featureSet])

    # Update the feature set id table. This happens before the entropy filter, so the
    # table also lists composites that are subsequently skipped.
    featureSet_ID_table.loc[len(featureSet_ID_table),
                            ['Feature set ID', 'Feature Set 1', 'Feature Set 2',
                             'Feature Set 3',  'Feature Set 4', 'Feature Set 5']] = \
        [name_var, i_featureSet, j_featureSet, k_featureSet, l_featureSet, m_featureSet]

    # Define the quintuplet-composite feature set values.
    # ## The composite is True when all five member features are 0, and False otherwise.
    binary_var = \
        (featureSet_array[[i_featureSet, j_featureSet, k_featureSet,
                           l_featureSet, m_featureSet]] == 0
         ).all(axis = 1)

    # Calculate the entropy for the quintuplet-composite feature set and skip the
    # composite if its entropy is below the outcome entropy.
    f_ent = scipy.stats.entropy(binary_var.value_counts(), base = math.e)
    if f_ent < entropy_outcome:
        continue
    feature_entropy.loc[len(feature_entropy)] = name_var, f_ent

    # Calculate the mutual information for the quintuplet-composite feature set.
    feature_mutual_information.loc[len(feature_mutual_information)] = \
        name_var, sklearn.metrics.mutual_info_score(binary_var, outcome_array['CMHD'])