#### Define function.

In [43]:
# A function to compute the evalaution outputs. The function automatically saves the
# contingency table and also returns it.
#
# ARGUMENTS
# 1. vec_featureSet:      A column from a pandas.Dataframe containing the feature set
#                         that needs evaluating.
# 2. vec_caseness:        A column from a pandas.Dataframe containing the caseness
#                         variable of interest.
# 3. savelocation:        The folder location where the output should be saved.
#
# RETURNS
# 1. prevalence:          The proportion of patients satisfying the definition of the
#                         feature set.
# 2. cba:                 Class balanced accuracy - the lower bound of the average
#                         sensitivity and average positive predictive value
#                         (a.k.a. precision) for all caseness values.
# 3. oddsRatio:           The ratio of the odds of caseness given the presence of feature
#                         set, to the odds of CMHD given the absence of the feature set.
#                         It can also be thought of as the multiplicative difference
#                         between correct and incorrect classification.
# 4. ppv:                 The proportion of patients satisfying the definition of the
#                         feature set who satisfy the caseness.
# 5. npv:                 The proportion of patients who do not satisfy the definition
#                         of the feature set who do not satisfy the caseness.
# 6. tn:                  The count of true negatives, i.e. the count of patients whose 
#                         feature-set value and caseness value are both zero.
# 7. fn:                  The count of false negatives, i.e. the count of patients whose 
#                         feature-set value is zero but whose caseness value is one.
# 8. fp:                  The count of false positives, i.e. the count of patients whose 
#                         feature-set value is one but whose caseness value is zero.
# 9. tp:                  The count of true positives, i.e. the count of patients whose 
#                         feature-set value and caseness value are both one.
#
def evaloutputs(vec_featureSet,
                vec_caseness):
    # ## Assess argument validty.
    
    if len(vec_featureSet.value_counts()) < 2:
        print(f"**Feature-set {vec_featureSet.name} only has one value.**")
        return vec_featureSet.name, vec_featureSet.dtype, None, None, None, None, None, None, None, None, None, None
    
    # Check that both vectors are the same length.
    if len(vec_featureSet) != len(vec_caseness):
        print("Feature-set and caseness vectors are of different lengths.")
        return vec_featureSet.name, vec_featureSet.dtype, None, None, None, None, None, None, None, None, None, None
    
    # Change the data type to suit the `statsmodel` function.
    if vec_featureSet.dtype == 'int64':
        vec_featureSet = vec_featureSet.astype(int)
    elif vec_featureSet.dtype == 'boolean':
        vec_featureSet = vec_featureSet.astype(bool)
    
    # Calculate the entropy of the caseness variable.
    pk = vec_caseness.value_counts() / len(vec_caseness)
    entropy_caseness = -numpy.sum(pk * numpy.log(pk)) / numpy.log(numpy.e)
    
    # Check what dtype the feature set is because float64-dtype feature sets need
    # to be processed differently to the categorical ones.
    if vec_featureSet.dtype == 'float64' or len(vec_featureSet.value_counts()) > 3:
        tn = tp = fn = fp = None
        # ## Compute outputs.
        #
        # Mutual information
        MI = mutual_info_regression(vec_featureSet.to_numpy().reshape(-1,1), vec_caseness, n_neighbors = 2)[0]
        sMI = MI / entropy_caseness 

        # Prevalence value per 1,000.
        #
        # When the feature set is a float64, we will use the arithmetic
        # mean in place of the prevalence. When the feature set is a 
        # count, we will use the mode in place of the prevalence.
        # I reason that the prevalence gives an expectation of an occurrence variable.
        # Similarly, the mode and mean give the expectations of count and continuous
        # variables.
        if vec_featureSet.dtype == 'float64':
            prevalence = round(numpy.mean(vec_featureSet), 2)
        else:
            prevalence = scipy.stats.mode(vec_featureSet)[0][0]
        
        # Class balance accuracy.
        cba = None
        
        # Odds ratio.
        # ## Create the required dataframe.
        df = pandas.DataFrame({'feature_set' : vec_featureSet.astype(int), 'caseness' : vec_caseness.astype(int)})
        # ## Build regression model.
        log_reg = statsmodels.formula.api.logit("caseness ~ feature_set", data = df).fit(disp=0)
        # ## Extract odds ratio.
        oddsRatio = round(numpy.exp(log_reg.params)[1], 2)
        
        # Positive predictive value.
        ppv = None
        
        # Negative predictive value.
        npv = None
        
    else:
        # ## Contingency table.
        # Make contingency table.
        contingencyTable = \
            pandas.crosstab(
                index = vec_featureSet,
                columns = vec_caseness
        )

        # Extract components of contingency table
        tn = contingencyTable.iloc[0,0]
        fn = contingencyTable.iloc[0,1]
        fp = contingencyTable.iloc[1,0]
        tp = contingencyTable.iloc[1,1]
    
        # ## Compute outputs.
        #
        # Scaled mutual information.
        #
        # Our particular scaled mutual information values are the proportional improvement in certainty about the
        # caseness variable. For example, a f_nMI = 0.05 means that the feature set improves our certainty about
        # whether the person has CMHD by 5%.
        MI = sklearn.metrics.mutual_info_score(vec_featureSet, vec_caseness)
        sMI = MI / entropy_caseness

        # Prevalence value per 1,000.
        #
        # I use 1 minus the prevalence of zeros because that
        # combines all the possibly-many values that indicate
        # the presence of the feature set.
        prevalence = \
            (1 - (sum(vec_featureSet == 0) / len(vec_featureSet))) * 10
        if prevalence < 0.01:
             prevalence = '< 0.01'
        else:
             prevalence = round(prevalence, 2)

        # Class balance accuracy.
        cba = \
            round( 0.5 * \
                  ( (tp / max( (tp + fn), (tp + fp) ) ) + \
                   (tn / max( (tn + fp), (tn +fn) ) ) ), 2)
        if cba < 0.01:
            cba = '< 0.01'

        # Odds ratio.
        if min( (tp * tn) , (fp * fn) ) == 0:
            oddsRatio = 'Undefined'
        else:
            oddsRatio = round( (tp * tn) / (fp * fn), 2)

        # Positive predictive value.
        ppv = 0.00 if (tp + fp) == 0 else tp / (tp + fp)
        if ppv > 0 and ppv < 0.01:
            ppv = '< 0.01'
        elif ppv < 1 and ppv > 0.999:
            ppv = '\u2248 1.00'
        else:
             ppv = round(ppv, 2)

        # Negative predictive value.
        npv = 0.00 if (tn + fn) == 0 else tn / (tn + fn)
        if npv > 0 and npv < 0.01:
            npv = '< 0.01'
        elif npv < 1 and npv > 0.999:
            npv = '\u2248 1.00'
        else:
             npv = round(npv, 2)
    
    
    
    return vec_featureSet.name, vec_featureSet.dtype, round(sMI, 6), prevalence, cba, oddsRatio, ppv, npv, tn, fn, fp, tp

#### Load required data.

In [28]:
%run 'UNSEEN_helper_functions.ipynb'
%store -r

#### Process all feature sets.

In [44]:
evaloutputs(feature_set_array['countAppointmentsPreviousYear'],
            caseness_array.caseness_1isYes.astype(int))

('countAppointmentsPreviousYear',
 Int64Dtype(),
 0.035848,
 0,
 None,
 1.03,
 None,
 None,
 None,
 None,
 None,
 None)

In [45]:
feature_set_array.fillna(0, inplace = True)
ls_output = []
for i_featureSet in feature_set_array.columns[1:]:
    try:
        ls_output.append(
            evaloutputs(feature_set_array[i_featureSet],
                        caseness_array.caseness_1isYes.astype(int))
        )
    except:
        print(i_featureSet)

**Feature-set abandonment_CYP only has one value.**
**Feature-set abandonment_EA only has one value.**
**Feature-set foodInsecurity only has one value.**
**Feature-set foodInsecurity_CYP only has one value.**
**Feature-set foodInsecurity_EA only has one value.**
**Feature-set foodInsecurity_Adult only has one value.**
**Feature-set hoarder_CYP only has one value.**
**Feature-set hoarder_EA only has one value.**
**Feature-set IAPTreferral_CYP only has one value.**
**Feature-set IAPTrevolvingDoor only has one value.**
**Feature-set IAPTrevolvingDoor_CYP only has one value.**
**Feature-set IAPTrevolvingDoor_EA only has one value.**
**Feature-set IAPTrevolvingDoor_Adult only has one value.**
**Feature-set IAPTuse_CYP only has one value.**
**Feature-set IAPTuse_EA only has one value.**
**Feature-set incarcerationImprisonment only has one value.**
**Feature-set incarcerationImprisonment_CYP only has one value.**
**Feature-set incarcerationImprisonment_EA only has one value.**
**Feature-set i

In [46]:
eval_output = \
    pandas.DataFrame(ls_output,
                     columns = ['Feature_set', 'Data_type', 'Scaled_mutual_information',
                                'Prevalence_per_thousand', 'Class_balanced_accuracy',
                                'Odds_ratio', 'ppv', 'npv', 'tn', 'fn', 'fp', 'tp'])
eval_output.sort_values(by=['Scaled_mutual_information'], ascending = False, inplace = True)
pandas.set_option('display.max_rows', 30)
display(eval_output)

Unnamed: 0,Feature_set,Data_type,Scaled_mutual_information,Prevalence_per_thousand,Class_balanced_accuracy,Odds_ratio,ppv,npv,tn,fn,fp,tp
54,countPsychologicalDisorders,Int64,0.166370,1,,1.93,,,,,,
222,antipsychoticsPrescription,object,0.137894,0.27,0.64,27.47,0.43,0.97,196904.0,5439.0,3221.0,2444.0
140,paranoia_Adult,bool,0.054925,0.85,0.53,5.69,0.14,0.97,184979.0,5377.0,15146.0,2506.0
137,paranoia,bool,0.054783,0.92,0.53,5.49,0.14,0.97,183594.0,5275.0,16531.0,2608.0
228,MentalHealthTreatments,bool,0.053215,3.12,0.39,4.03,0.08,0.98,140212.0,2895.0,59913.0,4988.0
...,...,...,...,...,...,...,...,...,...,...,...,...
147,poverty_CYP,boolean,,,,,,,,,,
152,relevantPrescriptions_CYP,boolean,,,,,,,,,,
165,sleepDysfunction_CYP,boolean,,,,,,,,,,
166,sleepDysfunction_EA,boolean,,,,,,,,,,


 ### Process breakdown: simple

In [283]:
feature_set_array.fillna(0, inplace = True)
vec_featureSet = feature_set_array['countUniqueHypnoticsAndAnxiolytics']
vec_caseness = caseness_array.caseness_1isYes.astype(int)
len(vec_featureSet.value_counts())

3

In [284]:
if len(vec_featureSet.value_counts()) < 2:
    print("\n**",
          "Feature-set only has one value.",
         "**\n")

In [285]:
# Check that both vectors are the same length.
if len(vec_featureSet) != len(vec_caseness):
    print("\n**",
          "Feature-set and caseness vectors are of different lengths.",
         "**\n")

In [286]:
# Change the data type to suit the `statsmodel` function.
if vec_featureSet.dtype == 'int64':
    vec_featureSet = vec_featureSet.astype(int)
elif vec_featureSet.dtype == 'boolean':
    vec_featureSet = vec_featureSet.astype(bool)

In [287]:
vec_featureSet.dtype == 'float64' or len(vec_featureSet.value_counts()) > 3

False

In [288]:
# Make contingency table.
contingencyTable = \
    pandas.crosstab(
        index = vec_featureSet,
        columns = vec_caseness
)
contingencyTable

caseness_1isYes,0,1
countUniqueHypnoticsAndAnxiolytics,Unnamed: 1_level_1,Unnamed: 2_level_1
0,198290,7122
1,1694,669
2,141,92


In [295]:
# Scaled mutual information.
#
# Our particular scaled mutual information values are the proportional improvement in certainty about the
# caseness variable. For example, a f_nMI = 0.05 means that the feature set improves our certainty about
# whether the person has CMHD by 5%.
pk = vec_caseness.value_counts() / len(vec_caseness)
entropy_caseness = -numpy.sum(pk * numpy.log(pk)) / numpy.log(numpy.e)
MI = sklearn.metrics.mutual_info_score(vec_featureSet, vec_caseness)
sMI = MI / entropy_caseness       
print(f'sMI = {sMI}')

sMI = 0.03063777735411082


In [401]:
# Prevalence value per 1,000.
#
# I use 1 minus the prevalence of zeros because that
# combines all the possibly-many values that indicate
# the presence of the feature set.
prevalence = \
    (1 - (sum(vec_featureSet == 0) / len(vec_featureSet))) * 10
if prevalence < 0.01:
     prevalence = '< 0.01'
else:
     prevalence = round(prevalence, 2)
print(f'prevalence = {prevalence}')

prevalence = 9.64


In [291]:
# Class balance accuracy.
cba = \
    round( 0.5 * \
          ( (tp / max( (tp + fn), (tp + fp) ) ) + \
           (tn / max( (tn + fp), (tn +fn) ) ) ), 2)
if cba < 0.01:
    cba = '< 0.01'
print(f'cba = {cba}')

cba = 0.02


In [292]:
# Odds ratio.
if min( (tp * tn) , (fp * fn) ) == 0:
    oddsRatio = 'Undefined'
else:
    oddsRatio = round( (tp * tn) / (fp * fn), 2)
print(f'oddsRatio = {oddsRatio}')

oddsRatio = 0.1


In [293]:
# Positive predictive value.
ppv = 0.00 if (tp + fp) == 0 else tp / (tp + fp)
if ppv > 0 and ppv < 0.01:
    ppv = '< 0.01'
elif ppv < 1 and ppv > 0.999:
    ppv = '\u2248 1.00'
else:
     ppv = round(ppv, 2)
print(f'ppv = {ppv}')

ppv = 0.04


In [294]:
# Negative predictive value.
npv = 0.00 if (tn + fn) == 0 else tn / (tn + fn)
if npv > 0 and npv < 0.01:
    npv = '< 0.01'
elif npv < 1 and npv > 0.999:
    npv = '\u2248 1.00'
else:
     npv = round(npv, 2)
print(f'npv = {npv}')

npv = 0.73


 ### Process breakdown: complicatedfeature_set_array

In [15]:
feature_set_array.fillna(0, inplace = True)
vec_featureSet = feature_set_array['countAppointmentsPreviousYear']
vec_caseness = caseness_array.caseness_1isYes.astype(int)
len(vec_featureSet.value_counts())

105

In [16]:
if len(vec_featureSet.value_counts()) < 2:
    print("\n**",
          "Feature-set only has one value.",
         "**\n")

In [17]:
# Check that both vectors are the same length.
if len(vec_featureSet) != len(vec_caseness):
    print("\n**",
          "Feature-set and caseness vectors are of different lengths.",
         "**\n")

In [18]:
# Change the data type to suit the `statsmodel` function.
if vec_featureSet.dtype == 'int64':
    vec_featureSet=vec_featureSet.astype(int)
elif vec_featureSet.dtype == 'boolean':
    vec_featureSet = vec_featureSet.astype(bool)

In [19]:
vec_featureSet.dtype == 'float64' or len(vec_featureSet.value_counts()) > 3

True

In [20]:
pk = vec_caseness.value_counts() / len(vec_caseness)
entropy_caseness = -numpy.sum(pk * numpy.log(pk)) / numpy.log(numpy.e)
# Mutual information
MI = mutual_info_regression(vec_featureSet.to_numpy().reshape(-1,1), vec_caseness, n_neighbors = 2)[0]
sMI = MI / entropy_caseness       
print(f'sMI = {sMI}')

sMI = 0.015111121531988047


In [21]:
# Prevalence value per 1,000.
#
# When the feature set is a float64, we will use the arithmetic
# mean in place of the prevalence. When the feature set is a 
# count, we will use the mode in place of the prevalence.
prevalence = [scipy.stats.mode(vec_featureSet)[0][0], round(numpy.mean(vec_featureSet), 2)]
print(f'Prevalence, in the form of mode and arithmetic mean, = {prevalence}')

Prevalence, in the form of mode and arithmetic mean, = [0, 6.55]


In [22]:
# Class balance accuracy.
cba = None
print('CBA for non-binary feature sets is not meaningful.')

CBA for non-binary feature sets is not meaningful


In [23]:
# Odds ratio.
# ## Create the required dataframe.
df = pandas.DataFrame({'feature_set' : vec_featureSet.astype(int), 'caseness' : vec_caseness.astype(int)})
# ## Build regression model.
log_reg = statsmodels.formula.api.logit("caseness ~ feature_set", data = df).fit()
# ## Extract odds ratio.
oddsRatio = round(numpy.exp(log_reg.params)[1], 2)
print(f'oddsRatio = {oddsRatio}')

Optimization terminated successfully.
         Current function value: 0.159026
         Iterations 7
oddsRatio = 1.03


In [24]:
# Positive predictive value.
ppv = None
print('PPV for non-binary feature sets is not meaningful.')

PPV for non-binary feature sets is not meaningful.


In [25]:
# Negative predictive value.
npv = None
print('NPV for non-binary feature sets is not meaningful.')

NPV for non-binary feature sets is not meaningful.
