# Helper functions

This notebook collates all the functions that help the other notebooks do their thing, without clogging the other notebooks with function definitions.

## Imports

In [1]:
import copy
import csv

import datetime

import EntropyHub

from IPython.display import display, Markdown, Latex
from IPython import get_ipython
import itertools

import math
import matplotlib.pyplot
from multiprocessing.pool import Pool
from multiprocessing import SimpleQueue, Process

import nest_asyncio 
nest_asyncio.apply()
import numpy

import os

import pandas
import pandas_gbq
pandas.set_option('display.max_colwidth', None)
import pathlib
from PIL import Image
import plotly
import pyinform
    
import re
import rpy2.ipython

import scipy.stats
import scipy.special
import sklearn.metrics
import sklearn.neighbors
import sklearn.utils
from sklearn.feature_selection import mutual_info_regression
import statistics
import statsmodels.formula.api

import time
import tqdm

import warnings

## Functions

### `entropy_output()`: Compute and present the entropy

In [2]:
# A function to compute and present the entropy of a column in a pandas.Dataframe.
#
# ARGUMENTS
# 1. my_df_column:            A column from a pandas.Dataframe containing the variable
#                             for which entropy needs calculating. Missing values are
#                             ignored. The caller's column is not modified.
#
# RETURNS
# 1. entropy_caseness:        Entropy in nats.
# 2. entropy_caseness_scaled: Entropy as a percentage (rounded to 1 d.p.) of its
#                             theoretical maximum, log(k), where k is the number of
#                             distinct observed values. NaN when k == 1 because the
#                             theoretical maximum is then zero.
#
# RAISES
# ValueError:                 If the column contains no non-missing values.
#

def entropy_output(my_df_column):
    # Work on a NaN-free copy so that the caller's column is left untouched.
    observed = my_df_column.dropna()
    n_levels = len(observed.unique())
    if n_levels == 0:
        raise ValueError('Cannot compute the entropy of a column with no non-missing values.')

    entropy_caseness = scipy.stats.entropy(observed.value_counts().astype(int), base = math.e)

    # The theoretical maximum entropy for k distinct values is log(k) nats.
    # With a single distinct value that maximum is zero, so the scaled value
    # is undefined; report NaN without performing a 0/0 division.
    if n_levels < 2:
        scaled_exact = math.nan
        entropy_caseness_scaled = math.nan
    else:
        scaled_exact = entropy_caseness / math.log(n_levels) * 100
        entropy_caseness_scaled = round(scaled_exact, 1)

    if entropy_caseness < 0.001:
        print('\t Caseness variable entropy < 0.001 nats')
    else:
        print(f'\t Caseness variable entropy = {round(entropy_caseness, 3)} nats')
    # The "< 0.001 %" message is decided on the percentage itself, not on the
    # entropy in nats, so that the printed statement is always true.
    if n_levels < 2 or scaled_exact < 0.001:
        print(f'\t The caseness variable\'s entropy is < 0.001 % its theoretical maximum\n')
    else:
        print(f'\t The caseness variable\'s entropy is {entropy_caseness_scaled} % of its theoretical maximum\n')
    
    return entropy_caseness, entropy_caseness_scaled

### `hitrate_output()`: Compute and present the hit rate - a.k.a. (mis)classification error - of a caseness variable

In [3]:
# A function to compute and present the hitrate - a.k.a. (mis)classification
# error - of a caseness variable.
#
# ARGUMENTS
# 1. my_caseness_variable:  A column from a pandas.Dataframe containing patients'
#                           caseness values. Missing values are ignored. The
#                           caller's column is not modified.
#
# RETURNS
# 1. hitRate_none:          The hitrate (%) assuming no one demonstrates the caseness.
# 2. hitRate_all:           The hitrate (%) assuming everyone demonstrates the caseness.
#
# RAISES
# ValueError:               If the column contains no non-missing values.
#

def hitrate_output(my_caseness_variable):
    
    # Calculations.
    # ## Work on a NaN-free copy so that the caller's column is left untouched.
    observed = my_caseness_variable.dropna()
    denominator = len(observed)
    if denominator == 0:
        raise ValueError('Cannot compute hit rates for a caseness variable with no non-missing values.')
    numerator = observed.astype(bool).sum()
    hitRate_all = (numerator / denominator) * 100
    hitRate_none = 100 - hitRate_all
    
    # Message to user.
    if hitRate_all < 0.001:
        print(f'\t Hit rate (all) < 0.001 %')
        print(f'\t Hit rate (none) \u2248 100 %')
        print(f'\t Odds (No : Yes) \u2248 infitely-times less likely to demonstrate caseness than to not.')
    else:
        # The odds are only computed here, where hitRate_all is known to be
        # non-zero, to avoid a divide-by-zero when no one shows the caseness.
        Odds_noYes = hitRate_none / hitRate_all
        print(f'\t Hit rate (all) = {round(hitRate_all, 3)} %')
        print(f'\t Hit rate (none) = {round(hitRate_none, 3)} %')
        print(f'\t Odds (No : Yes) = {int(Odds_noYes):,}-times less likely to demonstrate caseness than to not.')
    
    return hitRate_none, hitRate_all

### `evaloutputs()`: Compute the evaluation outputs

In [None]:
def evaloutputs(vec_featureSet,
                vec_caseness):
    '''
     A function to compute the evaluation statistics for submitted features that have more than
     one value.

     ARGUMENTS
     1. vec_featureSet:      A column from a pandas.Dataframe containing the feature set
                             that needs evaluating.
     2. vec_caseness:        A column from a pandas.Dataframe containing the caseness
                             variable of interest.

     RETURNS
     A 14-element tuple of evaluation statistics, in this order. If the two vectors differ
     in length, every element is None.
                             feature_set: A string of the feature set's name, as per the column
                                          heading of `vec_featureSet`.
                             fs_data_type: A datatype object indicating the data type of the
                                           feature set, as per `vec_featureSet`.
                             sMI: The mutual information between `vec_featureSet` and
                                  `vec_caseness`, scaled to the entropy of `vec_caseness`
                                   and rounded to six decimal places.
                                   This particular scaled mutual information is the
                                   proportional improvement in certainty about the caseness
                                   variable. For example, a sMI = 0.05 means that the feature
                                   set improves our certainty about whether the person 
                                   demonstrates the caseness of CMHD by 5%.
                             prevalence: The proportion of patients satisfying the definition
                                         of the feature set, per thousand. This is only
                                         calculated if the feature set is binary.
                             mean: The arithmetic mean of the feature set values. This is only
                                   calculated if the feature set is continuous. It is intended
                                   as an alternative for the prevalence statistic, for 
                                   continuous feature sets. I reason that the prevalence gives
                                   an expectation of an occurrence variable, and the mean gives
                                   an expectation of a continuous variable.
                             mode: The mode of the feature set values. This is only calculated
                                   if the feature set is count data. It is intended as an
                                   alternative for the prevalence statistic, for count feature
                                   sets. I reason that the prevalence gives an expectation of
                                   an occurrence variable, and the mode gives an expectation of
                                   a count variable.
                             cba: Class balanced accuracy - the lower bound of the average
                                  sensitivity and average positive predictive value (a.k.a.
                                  precision) for all caseness values. This is only calculated
                                  if the feature set is binary.
                             oddsRatio: The ratio of the odds of caseness given the presence
                                        of feature set, to the odds of CMHD given the absence
                                        of the feature set. It can also be thought of as the
                                        multiplicative difference between correct and incorrect
                                        classification.
                             ppv: The proportion of patients satisfying the definition of the
                                  feature set who satisfy the caseness. This is only calculated
                                  if the feature set is binary.
                             npv: The proportion of patients who do not satisfy the definition
                                  of the feature set who do not satisfy the caseness. This is
                                  only calculated if the feature set is binary.
                             tn: The count of true negatives, i.e. the count of patients whose 
                                 feature-set value and caseness value are both zero. This is
                                 only calculated if the feature set is binary.
                             fn: The count of false negatives, i.e. the count of patients whose 
                                 feature-set value is zero but whose caseness value is one. This
                                 is only calculated if the feature set is binary.
                             fp: The count of false positives, i.e. the count of patients whose 
                                 feature-set value is one but whose caseness value is zero. This
                                 is only calculated if the feature set is binary.
                             tp: The count of true positives, i.e. the count of patients whose 
                                 feature-set value and caseness value are both one. This is only
                                 calculated if the feature set is binary.
    '''
    ## ## Assess argument validity.
    #if len(vec_featureSet.value_counts()) < 2:
    #    print(f"**Feature-set {vec_featureSet.name} only has one value.**")
    #    return None, None, None, None, None, None, None, None, None, None, None, None, None, None
    
    # Check that both vectors are the same length.
    if len(vec_featureSet) != len(vec_caseness):
        print("Feature-set and caseness vectors are of different lengths.")
        return (None,) * 14
    
    # Change the data type to suit the `statsmodel` function.
    if vec_featureSet.dtype == 'int64':
        vec_featureSet = vec_featureSet.astype(int)
    elif vec_featureSet.dtype == 'boolean':
        vec_featureSet = vec_featureSet.astype(bool)
    
    # Calculate the entropy of the caseness variable (in nats).
    pk = vec_caseness.value_counts() / len(vec_caseness)
    entropy_caseness = -numpy.sum(pk * numpy.log(pk))
    
    # Check what dtype the feature set is because float64-dtype feature sets need
    # to be processed differently to the categorical ones. Feature sets with more
    # than three distinct values are also treated as non-categorical.
    if vec_featureSet.dtype == 'float64' or len(vec_featureSet.value_counts()) > 3:
        # Contingency-table counts are not defined for non-categorical feature sets.
        tn = tp = fn = fp = None
        # ## Compute outputs.
        #
        # Mutual information
        # ## The `mutual_info_regression` function doesn't handle NaNs so I will remove those patients from the function
        # ## arguments. I need to remove the same rows from both vectors.
        #
        # ## Remove rows that have NaNs in vec_featureSet, from both vec_caseness and vec_featureSet.
        vec_caseness = vec_caseness[~vec_featureSet.isna().reindex(vec_caseness.index, fill_value=False)]
        vec_featureSet = vec_featureSet[~vec_featureSet.isna()]
        # ## Remove rows that have inf in vec_featureSet, from both vec_caseness and vec_featureSet.
        vec_caseness = vec_caseness[~numpy.isinf(vec_featureSet).reindex(vec_caseness.index, fill_value=False)]
        vec_featureSet = vec_featureSet[~numpy.isinf(vec_featureSet)]
        # Calculate the mutual information as the mean of 20 runs of the estimator.
        feature_matrix = vec_featureSet.to_numpy().reshape(-1,1)
        MI_runs = [
            mutual_info_regression(feature_matrix, vec_caseness, n_neighbors = 2)[0]
            for _ in range(20)
        ]
        MI = numpy.mean(MI_runs)
        sMI = MI / entropy_caseness 

        # Prevalence value per 1,000.
        prevalence = None
        if vec_featureSet.dtype == 'float64':
            mean = round(numpy.mean(vec_featureSet), 2)
            mode = None
        else:
            mean = None
            # `Series.mode()` returns the modes in sorted order, so the first
            # element is the smallest most-frequent value. This avoids indexing
            # into the result of `scipy.stats.mode`, whose shape differs between
            # SciPy versions.
            mode = vec_featureSet.mode().iloc[0]
        
        # Class balance accuracy.
        cba = None
        
        # Odds ratio.
        # ## Create the required dataframe.
        df = pandas.DataFrame({'feature_set' : vec_featureSet.astype('float64'), 'caseness' : vec_caseness.astype(int)})
        # ## Build regression model.
        log_reg = statsmodels.formula.api.logit("caseness ~ feature_set", data = df).fit(disp=0)
        # ## Extract odds ratio. The coefficient is selected by its label rather
        # ## than by integer position, which is deprecated for label-indexed Series.
        oddsRatio = round(numpy.exp(log_reg.params['feature_set']), 2)
        
        # Positive predictive value.
        ppv = None
        
        # Negative predictive value.
        npv = None
        
    else:
        # ## Contingency table.
        # Make contingency table.
        contingencyTable = \
            pandas.crosstab(
                index = vec_featureSet,
                columns = vec_caseness
        )

        # Extract components of contingency table. Only the first two rows and
        # columns are read.
        tn = contingencyTable.iloc[0,0]
        fn = contingencyTable.iloc[0,1]
        fp = contingencyTable.iloc[1,0]
        tp = contingencyTable.iloc[1,1]
    
        # ## Compute outputs.
        #
        # Scaled mutual information.
        MI = sklearn.metrics.mutual_info_score(vec_featureSet, vec_caseness)
        sMI = MI / entropy_caseness

        # Prevalence value per 1,000.
        #
        # I use 1 minus the prevalence of zeros because that
        # combines all the possibly-many values that indicate
        # the presence of the feature set.
        prevalence = \
            (1 - ((vec_featureSet == 0).sum() / len(vec_featureSet))) * 1000
        if prevalence < 1:
             prevalence = '< 1'
        else:
             prevalence = round(prevalence, 2)
        mean = None
        mode = None
        
        # Class balance accuracy.
        cba = \
            round( 0.5 * \
                  ( (tp / max( (tp + fn), (tp + fp) ) ) + \
                   (tn / max( (tn + fp), (tn +fn) ) ) ), 2)
        if cba < 0.01:
            cba = '< 0.01'

        # Odds ratio. Undefined when either diagonal product is zero.
        if min( (tp * tn) , (fp * fn) ) == 0:
            oddsRatio = 'Undefined'
        else:
            oddsRatio = round( (tp * tn) / (fp * fn), 2)

        # Positive predictive value.
        ppv = 0.00 if (tp + fp) == 0 else tp / (tp + fp)
        if ppv > 0 and ppv < 0.01:
            ppv = '< 0.01'
        elif ppv < 1 and ppv > 0.999:
            ppv = '\u2248 1.00'
        else:
             ppv = round(ppv, 2)

        # Negative predictive value.
        npv = 0.00 if (tn + fn) == 0 else tn / (tn + fn)
        if npv > 0 and npv < 0.01:
            npv = '< 0.01'
        elif npv < 1 and npv > 0.999:
            npv = '\u2248 1.00'
        else:
             npv = round(npv, 2)
    
    
    return vec_featureSet.name, vec_featureSet.dtype, round(sMI, 6), prevalence, mean, mode, cba, oddsRatio, ppv, npv, tn, fn, fp, tp

### `chaoticlifeentropyfs()`: Calculate the entropy-based statistics for a patient's timeline of events.

In [13]:
def chaoticlifeentropyfs(pt_timeline):
    '''
    Calculate the entropy-based statistics for a patient's timeline of events.

    There are two categories of entropy-based feature sets for both appointments and did-not-attends:
    Sequential
    1.	activeInformation
    2.	entropyRate
    Summative
    3.	spectralEntropy
    4.	sampleEntropy
    5.	eoe (entropy of entropy)
    6.	averageEntropy
    7.	bubbleEntropy
    Use the following parameters for all summative entropy statistics other than spectral entropy, which doesn't require them:
    -	obs = three-monthly count, enough to amass a period of use.
    -	window breadth ("embedding dimension") = 4, to indicate a year's worth of appointments.
    -	window shift ("embedding time delay") = 1, to be sensitive to quarterly changes in behaviour.

    ARGUMENTS
    1. pt_timeline:   A sequence of observations; it is converted to a numpy.array.

    RETURNS
    1. ls_entropyBasedFeatureSets:  A list of the seven statistics in the order listed
                                    above. A statistic is None when the timeline is too
                                    short for it (4 or fewer observations for the
                                    sequential statistics, 10 or fewer for the summative
                                    ones) or when its calculation fails.
    '''
    
    # Escalate warnings to exceptions only for the duration of this call, to
    # handle divide-by-zero issues with spectral entropy. The context manager
    # restores the caller's warning filters on exit so that the rest of the
    # session is unaffected.
    with warnings.catch_warnings():
        warnings.filterwarnings("error")
        
        # Set parameters.
        # ## Window breadth ("embedding dimension") = 4, to indicate a year's worth of appointments.
        embeddingDimension = 4
        # ## Window shift ("embedding time delay") = 1, to be sensitive to quarterly changes in behaviour.
        embeddingTimeDelay = 1
        # ## Length of the patient's timeline.
        len_timeline = len(pt_timeline)
        # Convert pt_timeline into a numpy.array.
        pt_timeline = numpy.array(pt_timeline)
        
        # activeInformation
        # ...
        if len_timeline <= embeddingDimension:
            activeInformation = None
        else:
            try:
                activeInformation = \
                    pyinform.activeinfo.active_info(pt_timeline, k = embeddingDimension)
            except Exception:
                activeInformation = None
        
        # entropyRate
        # ...
        if len_timeline <= embeddingDimension:
            entropyRate = None
        else:
            try:
                entropyRate = \
                    pyinform.entropyrate.entropy_rate(pt_timeline, k = embeddingDimension)
            except Exception:
                entropyRate = None
        
        # spectralEntropy
        # ...
        # Only the escalated RuntimeWarning is handled here; any other error
        # propagates to the caller.
        if len_timeline <= 10:
            spectralEntropy = None
        else:
            try:
                spectralEntropy, _ = EntropyHub.SpecEn(pt_timeline)
            except RuntimeWarning:
                spectralEntropy = None
        
        # sampleEntropy
        # ...
        if len_timeline <= 10:
            sampleEntropy = None
        else:
            try:
                sampleEntropy, _, _ = \
                    EntropyHub.SampEn(pt_timeline, m = embeddingDimension, tau = embeddingTimeDelay)
                # Keep only the last element of the returned estimates.
                sampleEntropy = sampleEntropy[-1]
            except Exception:
                sampleEntropy = None

        # eoe and averageEntropy
        # ...
        if len_timeline <= 10:
            eoe = None
            averageEntropy = None
        else:
            try:
                eoe, averageEntropy, _ = \
                    EntropyHub.EnofEn(pt_timeline, tau = embeddingDimension, S = math.floor(len_timeline / 4) )
            except Exception:
                eoe = None
                averageEntropy = None

        # bubbleEntropy
        # ...
        if len_timeline <= 10:
            bubbleEntropy = None
        else:
            try:
                bubbleEntropy, _ = EntropyHub.BubbEn(pt_timeline, m = embeddingDimension, tau = embeddingTimeDelay)
                # Keep only the last element of the returned estimates.
                bubbleEntropy = bubbleEntropy[-1]
            except Exception:
                bubbleEntropy = None
    
    # Package the output.
    ls_entropyBasedFeatureSets = \
        [
        activeInformation
        ,entropyRate
        ,spectralEntropy
        ,sampleEntropy
        ,eoe
        ,averageEntropy
        ,bubbleEntropy
        ]
    
    return ls_entropyBasedFeatureSets