# Helper functions

This notebook collates all the functions that help the other notebooks do their thing, without clogging the other notebooks with function definitions.

## Imports

In [1]:
import nest_asyncio
nest_asyncio.apply()
import numpy
import pandas
pandas.set_option('display.max_colwidth', None)
import datetime
import itertools
import scipy.stats
import sklearn.metrics
from sklearn.feature_selection import mutual_info_regression
import math
import os
import matplotlib.pyplot
import plotly
from google.cloud import bigquery, exceptions
from IPython.display import display, Markdown, Latex
from IPython import get_ipython
from tqdm import tqdm
import pyarrow.parquet
import fastparquet
import pathlib
import time
import re
import rpy2.ipython
from multiprocessing.pool import Pool
import functools
import copy
import pyinform
import EntropyHub
import warnings

## Functions

### `entropy_output()`: Compute and present the entropy

In [2]:
# A function to compute and present the entropy of a column in a pandas.Dataframe.
#
# Missing values are ignored, and the caller's column is left unmodified.
#
# ARGUMENTS
# 1. my_df_column:            A column from a pandas.Dataframe containing the variable
#                             for which entropy needs calculating.
#
# RETURNS
# 1. entropy_caseness:        Entropy in nats.
# 2. entropy_caseness_scaled: Entropy as a percentage (1 d.p.) of the theoretical maximum
#                             for a variable with as many distinct values as were observed.
#                             Reported as 0.0 when fewer than two distinct values exist,
#                             because the theoretical maximum is then zero.
#

def entropy_output(my_df_column):
    # Work on a NaN-free copy; dropping in place would alter the caller's data.
    observed = my_df_column.dropna()
    n_distinct = observed.nunique()

    if n_distinct < 2:
        # A constant (or empty) variable carries no uncertainty. Handling it here
        # avoids dividing by log(1) = 0 when scaling.
        entropy_caseness = 0.0
        entropy_caseness_scaled = 0.0
    else:
        entropy_caseness = scipy.stats.entropy(observed.value_counts(), base = math.e)
        # The maximum entropy of a variable with k distinct values is ln(k).
        entropy_caseness_scaled = round(entropy_caseness / math.log(n_distinct) * 100, 1)

    # Message to user.
    if entropy_caseness < 0.001:
        print('\t Caseness variable entropy < 0.001 nats')
        print('\t The caseness variable\'s entropy is < 0.001 % its theoretical maximum\n')
    else:
        print(f'\t Caseness variable entropy = {round(entropy_caseness, 3)} nats')
        print(f'\t The caseness variable\'s entropy is {entropy_caseness_scaled} % of its theoretical maximum\n')

    return entropy_caseness, entropy_caseness_scaled

### `hitrate_output()`: Compute and present the hit rate - a.k.a. (mis)classification error - of a caseness variable

In [3]:
# A function to compute and present the hitrate - a.k.a. (mis)classification
# error - of a caseness variable.
#
# Missing values are ignored, and the caller's column is left unmodified.
#
# ARGUMENTS
# 1. my_caseness_variable:  A column from a pandas.Dataframe containing patients'
#                           caseness values. Any non-zero value counts as
#                           demonstrating the caseness.
#
# RETURNS
# 1. hitRate_none:          The hitrate (%) assuming no one demonstrates the caseness.
# 2. hitRate_all:           The hitrate (%) assuming everyone demonstrates the caseness.
#
# RAISES
# ValueError:               If there are no non-missing caseness values.
#

def hitrate_output(my_caseness_variable):

    # Work on a NaN-free copy; dropping in place would alter the caller's data.
    observed = my_caseness_variable.dropna()
    denominator = len(observed)
    if denominator == 0:
        raise ValueError('Cannot compute hit rates: the caseness variable has no non-missing values.')

    # Calculations.
    numerator = observed.astype(bool).sum()
    hitRate_all = (numerator / denominator) * 100
    hitRate_none = 100 - hitRate_all

    # Message to user.
    if hitRate_all < 0.001:
        print('\t Hit rate (all) < 0.001 %')
        print('\t Hit rate (none) \u2248 100 %')
        print('\t Odds (No : Yes) \u2248 infitely-times less likely to demonstrate caseness than to not.')
    else:
        # The odds are only computed here, where hitRate_all cannot be zero.
        Odds_noYes = hitRate_none / hitRate_all
        print(f'\t Hit rate (all) = {round(hitRate_all, 3)} %')
        print(f'\t Hit rate (none) = {round(hitRate_none, 3)} %')
        print(f'\t Odds (No : Yes) = {int(Odds_noYes):,}-times less likely to demonstrate caseness than to not.')

    return hitRate_none, hitRate_all

### `boundaryfilter()`: Filter out feature sets that are not within the prevalence bounds

In [4]:
# A function to filter out feature sets that are not within the prevalence bounds.
#
# The function counts the non-zero elements of each feature set and compares the
# count to the minimum and maximum count criteria. The 'person_id' column is never
# treated as a feature set, wherever it sits in the array. Missing values compare
# as non-zero and are therefore counted.
#
# ARGUMENTS
# 1. my_featureSet_array:       An n-by-fs pandas.Dataframe of n patients represented
#                               by rows and fs features represented by columns.
# 2. caseness:                  One of {'dxandrx', 'dxnotrx', 'rxnotdx', 'multi',
#                               'prescriptionVsDefinite'} to indicate which caseness
#                               variable's prevalence bounds to use. The bounds are read
#                               from the matching module-level `*_count_LB` and
#                               `*_count_UB` variables.
# 3. verbose:                   An optional argument to indicate whether the user
#                               wants feedback on how many feature sets were removed.
#
# RETURNS
# A list of:
# 1. filtered_featureSet_array: The inputted feature-set array but with the prevalence
#                               violating feature sets removed.
# 2. fs_removed_lower:          A pandas.Index of the names of feature sets removed
#                               because their prevalence was too low.
# 3. n_fs_removed_lower:        The count of feature sets removed for having low
#                               prevalence.
# 4. fs_removed_upper:          A pandas.Index of the names of feature sets removed
#                               because their prevalence was too high.
# 5. n_fs_removed_upper:        The count of feature sets removed for having high
#                               prevalence.
# If `caseness` is None, a message is printed and None is returned.
#
# RAISES
# ValueError:                   If `caseness` is not one of the recognised values.
#

def boundaryfilter(my_featureSet_array, caseness = None, verbose = True):

    if caseness is None:
        print('No values for `caseness` was provided.')
        return

    # Select filter boundaries based on `caseness` argument. Only the selected pair
    # of module-level bounds is looked up, so the others need not be defined.
    if caseness == 'dxandrx':
        lb = DxAndRxCaseness_count_LB
        ub = DxAndRxCaseness_count_UB
        caseness_title = 'Definite caseness'
    elif caseness == 'dxnotrx':
        lb = DxNotRxCaseness_count_LB
        ub = DxNotRxCaseness_count_UB
        caseness_title = 'Diagnosis-based caseness'
    elif caseness == 'rxnotdx':
        lb = RxNotDxCaseness_count_LB
        ub = RxNotDxCaseness_count_UB
        caseness_title = 'Prescription-based caseness'
    elif caseness == 'multi':
        lb = multinomialCaseness_count_LB
        ub = multinomialCaseness_count_UB
        caseness_title = 'Multinomial caseness'
    elif caseness == 'prescriptionVsDefinite':
        lb = prescriptionVsDefiniteCaseness_count_LB
        ub = prescriptionVsDefiniteCaseness_count_UB
        caseness_title = 'Prescription-based -vs- Definite caseness'
    else:
        # Without this branch the bounds would be unbound further down.
        raise ValueError(
            "Unrecognised `caseness` value " + repr(caseness) + ". Expected one of "
            "{'dxandrx', 'dxnotrx', 'rxnotdx', 'multi', 'prescriptionVsDefinite'}.")

    # Count, once, the patients with a non-zero value for every feature set.
    # Selecting by name means 'person_id' need not be the first column.
    is_feature_column = my_featureSet_array.columns != 'person_id'
    prevalence_counts = (my_featureSet_array.loc[:, is_feature_column] != 0).sum(axis = 0)

    # Identify the feature sets that are too few, and extract the column names.
    fs_removed_lower = prevalence_counts.index[(prevalence_counts < lb).values]
    n_fs_removed_lower = len(fs_removed_lower)

    # Identify the feature sets that are too many, and extract the column names.
    fs_removed_upper = prevalence_counts.index[(prevalence_counts > ub).values]
    n_fs_removed_upper = len(fs_removed_upper)

    # Remove the feature sets that are not within the prevalence bounds.
    filtered_featureSet_array = \
        my_featureSet_array.drop(columns = fs_removed_upper.append(fs_removed_lower))

    # Print message if arg{verbose} is truthy.
    if verbose:
        print("\n Filtering complete for '" + caseness_title + "'...")
        print("\t", len(filtered_featureSet_array.columns)-1, " feature sets remain.")
        print("\t", n_fs_removed_lower + n_fs_removed_upper, " feature sets removed, in total.")
        print("\t", n_fs_removed_lower, " feature sets removed because of low prevalence.")
        print("\t", n_fs_removed_upper, " feature sets removed because of high prevalence.")

    # Return outputs
    return [filtered_featureSet_array, fs_removed_lower, n_fs_removed_lower, fs_removed_upper, n_fs_removed_upper]

### `databasefsboundaryreview()`: Output tables showing the count of patient records in which unique SNOMED-CT codes occur.

In [5]:
# A function to output a table showing how many unique SNOMED-CT codes occur too
# infrequently, within bounds, or too frequently across patient records.
#
# The query is assembled from the module-level `sql_variables` and `sql_base`
# strings and is run with the module-level BigQuery `client`. `sql_base` is expected
# to define `tbl_patients_per_code`, which the appended SQL selects from.
#
# ARGUMENTS
# 1. lower_bound:      The minimum count of patients' records in which a code must
#                      occur to be considered within bounds. Coerced to an integer.
# 2. upper_bound:      The maximum count of patients' records in which a code may
#                      occur to be considered within bounds. Coerced to an integer.
#
# RETURNS
# 1. n_within_bounds:  The count of codes that are within bounds, or 0 if the query
#                      returned no 'within bounds' category.
#
def databasefsboundaryreview(lower_bound, upper_bound):
    # The bounds are spliced into the SQL text, so force them to be plain integers.
    lower_bound = int(lower_bound)
    upper_bound = int(upper_bound)

    more_sql_variables = f"""
    DECLARE lower_bound INT64 DEFAULT {lower_bound};
    DECLARE upper_bound INT64 DEFAULT {upper_bound};
    """

    sql_boundary_table = f"""
    ,tbl_category_boundary AS
    (
    SELECT
      DISTINCT snomedcode
      ,CASE
        WHEN count_patients_with_code < lower_bound THEN "too infrequent (occurs in < {lower_bound:,} patients' records)"
        WHEN count_patients_with_code <= upper_bound THEN "within bounds"
        ELSE "too frequent (occurs in > {upper_bound:,} patients' records)"
      END AS cnt_SNOMED
    FROM tbl_patients_per_code
    ORDER BY cnt_SNOMED
    )

    SELECT
      COUNT(cnt_SNOMED) AS This_many_codes__
      ,cnt_SNOMED AS __occur_this_often
    FROM tbl_category_boundary
    GROUP BY cnt_SNOMED
    ORDER BY This_many_codes__ DESC
    """
    boundary_Table = client.query(sql_variables + more_sql_variables + sql_base + sql_boundary_table).to_dataframe()
    display(boundary_Table)

    # Extract the count as a scalar. A category with no codes is absent from the
    # query result, which means zero codes are within bounds.
    counts_by_category = boundary_Table.set_index('__occur_this_often')['This_many_codes__']
    if 'within bounds' not in counts_by_category.index:
        return 0
    n_within_bounds = int(counts_by_category.loc['within bounds'])

    return n_within_bounds

### `evaloutputs()`: Compute the evaluation outputs

In [6]:
# A function to compute the evaluation outputs for a feature set against a caseness
# variable, based on their 2-by-2 contingency table.
#
# Both vectors must be coded 0/1 (boolean vectors are recoded to 0/1). Cells of the
# contingency table that do not occur in the data are counted as zero.
#
# ARGUMENTS
# 1. vec_featureSet:      A column from a pandas.Dataframe containing the feature set
#                         that needs evaluating.
# 2. vec_caseness:        A column from a pandas.Dataframe containing the caseness
#                         variable of interest.
#
# RETURNS
# None (after printing a message) if the vectors differ in length; otherwise:
# 1. prevalence:          The scaled proportion of patients with a non-zero feature-set
#                         value, rounded to 2 d.p., or the string '< 0.01'.
# 2. cba:                 Class balanced accuracy - the lower bound of the average
#                         sensitivity and average positive predictive value
#                         (a.k.a. precision) for all caseness values. Rounded to
#                         2 d.p., or the string '< 0.01'. A class with no patients
#                         contributes zero.
# 3. oddsRatio:           The ratio of the odds of caseness given the presence of feature
#                         set, to the odds of caseness given the absence of the feature
#                         set. Rounded to 2 d.p., or an explanatory string when either
#                         product of the table's diagonals is zero.
# 4. ppv:                 The proportion of patients satisfying the definition of the
#                         feature set who satisfy the caseness. Rounded to 2 d.p., or
#                         one of the strings '< 0.01' and '\u2248 1.00'.
# 5. npv:                 The proportion of patients who do not satisfy the definition
#                         of the feature set who do not satisfy the caseness. Presented
#                         like ppv.
# 6. tn:                  The count of true negatives, i.e. the count of patients whose
#                         feature-set value and caseness value are both zero.
# 7. fn:                  The count of false negatives, i.e. the count of patients whose
#                         feature-set value is zero but whose caseness value is one.
# 8. fp:                  The count of false positives, i.e. the count of patients whose
#                         feature-set value is one but whose caseness value is zero.
# 9. tp:                  The count of true positives, i.e. the count of patients whose
#                         feature-set value and caseness value are both one.
#
def evaloutputs(vec_featureSet,
                vec_caseness):

    # Present a predictive value in the same way for ppv and npv.
    def present_predictive_value(value):
        if 0 < value < 0.01:
            return '< 0.01'
        if 0.999 < value < 1:
            return '\u2248 1.00'
        return round(value, 2)

    # ## Assess argument validity.

    # Check that both vectors are the same length.
    if len(vec_featureSet) != len(vec_caseness):
        print("\n**",
              "Feature-set and caseness vectors are of different lengths.",
             "**\n")
        return

    # Recode boolean vectors to 0/1 so that the label lookups below apply to them.
    if getattr(vec_featureSet, 'dtype', None) == bool:
        vec_featureSet = vec_featureSet.astype(int)
    if getattr(vec_caseness, 'dtype', None) == bool:
        vec_caseness = vec_caseness.astype(int)

    # ## Contingency table.
    # Make the contingency table and force it to be 2-by-2. Without the reindex,
    # a vector that never takes the value 0 (or 1) would leave a row or column
    # missing and the lookups below would raise a KeyError.
    contingencyTable = \
        pandas.crosstab(
            index = vec_featureSet,
            columns = vec_caseness
        ).reindex(index = [0, 1], columns = [0, 1], fill_value = 0)

    # Extract components of contingency table
    tn = contingencyTable.loc[0, 0]
    fn = contingencyTable.loc[0, 1]
    fp = contingencyTable.loc[1, 0]
    tp = contingencyTable.loc[1, 1]

    # ## Compute outputs.

    # Prevalence.
    #
    # I use 1 minus the prevalence of zeros because that
    # combines all the possibly-many values that indicate
    # the presence of the feature set.
    #
    # NOTE(review): this was labelled "per 1,000" but the proportion is multiplied
    # by 10 - confirm the intended scaling.
    prevalence = \
        (1 - ((vec_featureSet == 0).sum() / len(vec_featureSet))) * 10
    if prevalence < 0.01:
        prevalence = '< 0.01'
    else:
        prevalence = round(prevalence, 2)

    # Class balance accuracy. An empty class contributes zero, mirroring how an
    # undefined ppv or npv is reported as zero below.
    positive_denominator = max(tp + fn, tp + fp)
    negative_denominator = max(tn + fp, tn + fn)
    positive_term = tp / positive_denominator if positive_denominator else 0.0
    negative_term = tn / negative_denominator if negative_denominator else 0.0
    cba = round(0.5 * (positive_term + negative_term), 2)
    if cba < 0.01:
        cba = '< 0.01'

    # Odds ratio.
    if min(tp * tn, fp * fn) == 0:
        oddsRatio = 'Undefined: One of the odds is zero.'
    else:
        oddsRatio = round((tp * tn) / (fp * fn), 2)

    # Positive predictive value.
    ppv = present_predictive_value(0.00 if (tp + fp) == 0 else tp / (tp + fp))

    # Negative predictive value.
    npv = present_predictive_value(0.00 if (tn + fn) == 0 else tn / (tn + fn))

    return prevalence, cba, oddsRatio, ppv, npv, tn, fn, fp, tp

### `evaleachcaseness()`: Iterate through the three caseness variables with a given feature set, and call the evaluation function.

In [7]:
# A function to iterate through the three caseness variables with a given feature set
# and call the evaluation function, evaloutputs.
#
# ARGUMENTS
# 1. vec_featureSet:      A column from a pandas.Dataframe containing the feature set
#                         that needs evaluating.
# 2. array_caseness:      A pandas.Dataframe containing the caseness columns
#                         'CMHD_dx_and_rx', 'CMHD_rx_not_dx' and 'CMHD_control'.
# 3. savelocation:        Currently unused; retained so that existing calls keep
#                         working.
#
# RETURNS
# 1. results:             A dictionary keyed by caseness column name. Each value is a
#                         dictionary of the outputs of evaloutputs ('prevalence', 'cba',
#                         'oddsRatio', 'ppv', 'npv', 'tn', 'fn', 'fp', 'tp'), or None
#                         if evaloutputs returned nothing for that caseness variable.
#

def evaleachcaseness(vec_featureSet,
                     array_caseness,
                     savelocation = None):
    # The names of the outputs, in the order that evaloutputs returns them.
    output_names = ('prevalence', 'cba', 'oddsRatio', 'ppv', 'npv',
                    'tn', 'fn', 'fp', 'tp')

    results = {}
    for caseness_name in ["CMHD_dx_and_rx", "CMHD_rx_not_dx", "CMHD_control"]:
        # Pass the column itself; iterating over a Dataframe only yields its
        # column names.
        outputs = evaloutputs(vec_featureSet, array_caseness[caseness_name])
        if outputs is None:
            results[caseness_name] = None
        else:
            results[caseness_name] = dict(zip(output_names, outputs))

    return results

### `mutlinomRepresentation()`: Compute the multinomial representation of a feature set

In [8]:
# A function to compute the multinomial representation of a feature set.
#
# Every distinct combination of the component features' values is given an integer
# code, numbered from zero in order of first appearance.
#
# ARGUMENTS
# 1. var_vals:      An n-by-fs pandas.Dataframe of n patients and fs
#                   features, containing the feature sets that we want to
#                   compress into a single, multinomial representation.
#
# RETURNS
# 1. featureSet:    An n-long pandas.Series named 'multinom_vals' containing a
#                   multinomial representation of the inputted feature sets, in
#                   the row order of var_vals and with a fresh 0..n-1 index.
# 2. next_iter:     An indicator variable that is checked by processfs().
#                   It is always False.
#
def mutlinomRepresentation(var_vals):
    component_names = list(var_vals.columns.values)

    # Get all combinations of values of the component features
    # and define feature set values for each multinomial combination.
    # The old index is discarded outright so that a named index, or a feature
    # called 'index', cannot interfere.
    feature_combins = var_vals.drop_duplicates().reset_index(drop = True)
    feature_combins['multinom_vals'] = feature_combins.index

    # Define a vector indicating the feature set value.
    myMerge = \
        pandas.merge(
            var_vals,
            feature_combins,
            how = 'left',
            on = component_names
        )

    # Extract multinomial representation as output variable.
    featureSet = myMerge['multinom_vals']
    next_iter = False
    return featureSet, next_iter

### `featuresetmi()`: Calculate two-way mutual information for feature sets of order m.

In [11]:
# A function to calculate the two-way mutual information for feature sets of order m.
#
# ARGUMENTS
# 1. featureSet_array:   An n-by-fs pandas.Dataframe of n patients and fs feature
#                        sets, or an fs-by-1 pandas.Dataframe whose 'snomedcode'
#                        column contains the names of feature sets. If the fs-by-1
#                        dataframe, then it is assumed that the feature sets are
#                        SNOMED-CT codes.
# 2. casenessVector:     A pandas.Dataframe whose last column is the caseness
#                        variable of interest. It is joined to featureSet_array
#                        on 'person_id'.
# 3. m:                  The order of feature set to be tested. 1 = Individuals,
#                        2 = Pairs, 3 = Triplets. Defaults to 1.
# 4. saveloc:            The folder location where the output should be saved.
#                        Defaults to 'Mutual information saves/<order label>'. The
#                        folder is created if it does not exist.
# 5. representation:     A choice of {'all', 'any', 'multi'} where 'all' = the feature set
#                        value is 1 when all components are 1, or 0 otherwise; 'any' =
#                        the feature set value is 1 when any component is 1, or 0 otherwise;
#                        and 'multi' = the feature set values represent every combination
#                        of components' values. Defaults to 'all'. Only 'all' is
#                        available for database feature sets.
# 6. src:                The source of the feature set: {'database', 'clinicial',
#                        'literature', 'interviews', 'PPI', 'combined'}. Required.
# 7. df_ppl_and_codes:   An optional argument that contains a list of all patients
#                        and all SNOMED-CT codes that they have that are within
#                        the prevalence bounds. When supplied, it is published as the
#                        module-level variable that processdatabasefs() reads.
# 8. verbose:            An optional argument to indicate whether the user wants
#                        feedback on how many feature sets were removed and saved.
#
# RETURNS
# n/a. Settings and results are shared with processfs() and processdatabasefs()
# through module-level variables, and any rows held in featureSet_MI at the end are
# written to a parquet file in the save location.
#
# NOTE(review): the worker functions record results by assigning to module-level
# variables. Under multiprocessing each worker process has its own copy of those
# variables, so rows and tallies recorded by workers are not visible to
# featureSet_MI and drop_tally in this process - confirm that results are actually
# being persisted.
#
def featuresetmi(featureSet_array,
                 casenessVector,
                 m = None,
                 saveloc = None,
                 representation = None,
                 src = None,
                 df_ppl_and_codes = None,
                 verbose = False):
    # Module-level variables through which settings and results are shared with
    # the worker functions.
    global order_label
    global savelocation
    global representation_label
    global source
    global caseness_label
    global featureSet_MI
    global batch
    global drop_tally
    global entropy_caseness
    global full_array

    # ## Assess argument validity.

    # Check order of feature set. If not provided,
    # default to m = 1.
    order_labels = {1: "Individuals", 2: "Pairs", 3: "Triplets"}
    if m is None:
        order_int = 1
        print("\nNo value for m provided." +
              "\n...Default value of m = 1 will be used.")
    elif m in (1, 2, 3):
        order_int = m
    else:
        print("\n** Error: Integer value between 1",
              "and 3 not supplied for m.**\n")
        return
    order_label = order_labels[order_int]

    # Check and set save location. A supplied location must be stored too,
    # because the save steps read `savelocation`.
    if saveloc is None:
        savelocation = "Mutual information saves/" + order_label
        print("\nNo save location provided." +
              "\n...Defaulting to ~/" + savelocation)
    else:
        savelocation = saveloc

    # ## Check representation. If not provided,
    # ## default to the 'all' representation.
    if representation is None:
        representation_label = "ALL"
        print("\nNo representation provided." +
              "\n...Defaulting to '" + representation_label + "' representation.")
    elif representation in ("all", "any", "multi"):
        representation_label = representation.upper()
    else:
        print("\n** Error: Representation value from ",
              "{'all', 'any', 'multi'} not provided.**\n")
        return

    # ## Check the source argument is provided.
    if src is None:
        print("\n** Error: No source argument provided.",
              "**\n")
        return
    source = src

    # ## Set save string for particular caseness variable. An unrecognised
    # ## caseness column falls back to its own name, so that the label is
    # ## always set before anything is saved.
    caseness_labels = {'CMHD_dx_and_rx': 'DxAndRx',
                       'CMHD_dx_not_rx': 'DxNotRx',
                       'CMHD_rx_not_dx': 'RxNotDx',
                       'CMHD_multi': 'Multinomial',
                       'CMHD_control': 'Control'}
    caseness_type = casenessVector.columns.values[-1]
    caseness_label = caseness_labels.get(caseness_type, str(caseness_type))

    print("\n\n\n****************************************")
    print("Calculating mutual information values...")

    # Instantiate specific storage for mutual information.
    featureSet_MI = \
        pandas.DataFrame(columns = ['Feature_set', 'Normalised_mutual_information'])

    # Instantiate batch number.
    batch = 0

    # Instantiate tally of feature sets that are dropped due to low entropy.
    drop_tally = 0

    # Define entropy of the particular caseness variable. Only the caseness column
    # is counted: counting whole rows would include 'person_id', making every row
    # unique.
    entropy_caseness = \
        scipy.stats.entropy(casenessVector[caseness_type].value_counts())

    # Publish the patient-code table under the module-level name that
    # processdatabasefs() reads.
    if df_ppl_and_codes is not None:
        globals()['df_ppl_and_codes'] = df_ppl_and_codes

    # Check if the supplied feature set array is an n-by-fs array
    # of feature sets, or an fs-by-1 vector of feature-set names.
    # If it is the fs-by-1 vector, then pass all arguments to the
    # appropriate function; else, continue with the code.
    if featureSet_array.shape[1] == 1:
        # Display messages to user.
        if verbose:
            print("Overriding 'verbose == True' because there are too many feature sets.",
                 "Messages would become unweidly and slow down processing.")
            verbose = False
        # Inform users about limitation with database feature sets. This applies
        # whether or not messages were requested.
        if representation_label != "ALL":
            print("Only representation = 'all' is available for database feature sets.")
            return

        # The IPYNB file has already been run in this notebook but I'm repeating the run based
        # on guidance from this blog:
        # https://medium.com/@grvsinghal/speed-up-your-python-code-using-multiprocessing-on-windows-and-jupyter-or-ipython-2714b49d6fac
        #
        # This is the plain-Python spelling of the `%run` line magic.
        get_ipython().run_line_magic('run', "'UNSEEN_helper_functions.ipynb'")

        # Define a function to portion my iterable.
        #
        # https://stackoverflow.com/questions/51446327/python-3-generator-comprehension-to-generate-chunks-including-last
        def portion_maker(gen, portion_size):
            it = iter(gen)
            while True:
                portion = [*itertools.islice(it, 0, portion_size)]
                if portion:
                    yield portion
                else:
                    break

        # Do the main job of assessing the feature sets. The combinations come from
        # the supplied array and use the validated order, which is defined even
        # when m was not provided.
        if __name__ ==  '__main__':
            gen = itertools.combinations(featureSet_array.snomedcode, order_int)
            portion_size = 7
            n_workers = 4
            for portion in portion_maker(gen, portion_size):
                print(f"This batch is {portion}.")
                # Each portion gets its own pool, which is shut down once the
                # portion is done so that worker processes do not accumulate.
                with Pool(n_workers) as portion_pool:
                    portion_pool.starmap(processdatabasefs, portion)

        # Calculate the total count of feature sets processed.
        count_fs = sum(1 for _ in itertools.combinations(featureSet_array.snomedcode, order_int))
    else:
        # Ensure feature-set and caseness values are matched for person_id.
        full_array = pandas.merge(casenessVector,
                                  featureSet_array,
                                  on = 'person_id',
                                  how = 'left')

        # Define the m-way tuples of feature sets. The workers are handed one
        # tuple each.
        feature_names = featureSet_array.columns[featureSet_array.columns != 'person_id']
        combins = itertools.combinations(feature_names, order_int)
        # Count the tuples without holding them all in memory.
        count_fs = sum(1 for _ in itertools.combinations(feature_names, order_int))

        # The IPYNB file has already been run in this notebook but I'm repeating the run based
        # on guidance from this blog:
        # https://medium.com/@grvsinghal/speed-up-your-python-code-using-multiprocessing-on-windows-and-jupyter-or-ipython-2714b49d6fac
        #
        # This is the plain-Python spelling of the `%run` line magic.
        get_ipython().run_line_magic('run', "'UNSEEN_helper_functions.ipynb'")

        # Do the main job of assessing the feature sets.
        if __name__ ==  '__main__':
            n_workers = 4
            print('Parallel processing of feature sets has begun...')
            mypool = Pool(n_workers)
            try:
                mypool.starmap(processfs, combins)
            finally:
                # Close down the pool to release resources, even if a worker failed.
                # https://superfastpython.com/shutdown-the-multiprocessing-pool-in-python/
                mypool.close()
                mypool.join()
            print('\t...parallel processing of feature sets has ended.')

    # Increment counter.
    batch += 1
    # Final save. The file is written straight to its destination so that the
    # working directory is never changed.
    if len(featureSet_MI) != 0:
        os.makedirs(savelocation, exist_ok = True)
        pyarrow.parquet.write_table(
            pyarrow.Table.from_pandas(featureSet_MI),
            os.path.join(savelocation,
                         source + "_" +
                         caseness_label + "_" +
                         representation_label + "_" +
                         "batch" +
                         str(batch) +
                         ".parquet"))

    # Feedback messages.
    print("...\n")
    print(str(batch), "batch(es) of feature sets processed.")
    print(str(drop_tally), "/",
          str(count_fs),
          "feature sets dropped due to insufficient normalised mutual information.")
    print("****************************************")

### `processdatabasefs()`: Calculate two-way mutual information for *database* feature sets, specifically.

In [None]:
# A function to calculate the normalised mutual information between one *database*
# feature set and the caseness variable, using the 'all' representation: the
# feature-set value is 1 for patients whose record contains every supplied code,
# and 0 otherwise.
#
# The function reads the module-level variables df_ppl_and_codes (columns
# 'person_id' and 'snomedcode'), caseness_array (whose second column is taken as
# the caseness variable), entropy_caseness, savelocation, source, caseness_label
# and representation_label. It updates drop_tally, batch and featureSet_MI.
#
# ARGUMENTS
# 1. *snomedcodes:  The SNOMED-CT codes that make up the feature set.
#
# RETURNS
# n/a. The result is appended to featureSet_MI. Once featureSet_MI holds more than
# nine rows it is written to a parquet file in the save location and emptied.
#
def processdatabasefs(*snomedcodes):
    global drop_tally
    global batch
    global featureSet_MI

    # Define the feature set as a flat list of codes. Wrapping the tuple in a list
    # would make a one-element list, which breaks the membership test, the code
    # count and the name below.
    fs = list(snomedcodes)

    # Name the feature set.
    name_var = '-'.join(map(str, fs))

    # Flag the patients who have every query code in their record. The shared
    # table is only read from, never added to.
    matched_rows = \
        df_ppl_and_codes.loc[df_ppl_and_codes.snomedcode.isin(fs), ['person_id', 'snomedcode']]
    n_codes_matched = matched_rows.groupby('person_id').snomedcode.nunique()
    do_patients_have_all_query_codes = \
        (n_codes_matched == len(set(fs))).astype(int).rename('ary_pAc_in_fsdf').reset_index()

    # Join feature set with caseness_array to match person_id. Patients with none
    # of the codes are absent from the flags, so they are set to zero.
    fs_val = pandas.merge(caseness_array,
                          do_patients_have_all_query_codes,
                          how = 'left',
                          on = 'person_id')
    fs_val['ary_pAc_in_fsdf'] = fs_val['ary_pAc_in_fsdf'].fillna(0).astype(int)

    # Calculate the normalised mutual information between the feature set and
    # caseness variable. The mutual information is normalised to the caseness'
    # entropy. This ensures that:
    # - 0 means no mutual information (in other words, the feature set has done nothing to improve our certainty).
    # - 1 means the feature set is a perfect proxy for the caseness variable.
    # - 0.5 means the feature set has halved the uncertainty of the caseness variable.
    #
    # Both inputs are single columns of the joined table, so they are row-matched.
    f_MI = sklearn.metrics.mutual_info_score(fs_val['ary_pAc_in_fsdf'],
                                             fs_val[caseness_array.columns[1]])
    f_nMI = f_MI / entropy_caseness

    # Test the feature set's normalised mutual information. No threshold is applied
    # because the team want to see all of the scores.
    if f_nMI < 0:
        drop_tally += 1
    else:
        featureSet_MI.loc[len(featureSet_MI),:] = name_var, f_nMI

    if len(featureSet_MI) > 9:
        # Increment batch.
        batch += 1

        # Make an interim save of results. The file is written straight to its
        # destination so that the working directory is never changed.
        os.makedirs(savelocation, exist_ok = True)
        pyarrow.parquet.write_table(
            pyarrow.Table.from_pandas(featureSet_MI),
            os.path.join(savelocation,
                         source + "_" +
                         caseness_label + "_" +
                         representation_label + "_" +
                         "batch" +
                         str(batch) +
                         ".parquet"))

        # Instantiate new storage, with the same columns as the storage that
        # featuresetmi() creates.
        featureSet_MI = \
            pandas.DataFrame(columns = ['Feature_set', 'Normalised_mutual_information'])

### `processfs()`: Calculate two-way mutual information for *non-database* feature sets, specifically.

In [None]:
def processfs(*i_combin_names):
    '''
    Calculate the normalised mutual information between one feature set and
    the caseness variable, and store the result in module-level state.

    ARGUMENTS
    1. *i_combin_names:  Names of the columns in the global `full_array` that
                         together make up the feature set.

    RETURNS
    Nothing. Results are communicated through globals:
    - `featureSet_MI` gains a row holding the feature set's name and its
      normalised mutual information (rounded to 7 decimal places).
    - `drop_tally` is incremented when the feature set is dropped.
    - Once `featureSet_MI` holds more than 9 rows, it is written to a parquet
      file inside `savelocation`, `batch` is incremented, and
      `featureSet_MI` is re-instantiated empty.

    RAISES
    ValueError if the global `representation_label` is not 'ALL', 'ANY' or
    'MULTI'.
    '''
    global full_array
    global representation_label
    global drop_tally
    global batch
    global featureSet_MI
    global savelocation
    global source
    global caseness_label
    global entropy_caseness

    # Define an array indicating the feature set value.
    var_vals = full_array[list(i_combin_names)]
    fs_names = var_vals.columns.values

    # Name the feature set and transform it into the chosen representation.
    if representation_label == 'ALL':
        name_var = 'ALL OF   ' + '   AND   '.join(fs_names)
        # `axis` is passed by keyword because newer pandas releases reject it
        # as a positional argument.
        fs_val = var_vals.all(axis = 1)
    elif representation_label == 'ANY':
        name_var = 'EITHER    ' + '   OR   '.join(fs_names)
        fs_val = var_vals.any(axis = 1)
    elif representation_label == 'MULTI':
        name_var = 'EVERY COMBINATION OF    ' + '   ,   '.join(fs_names)
        fs_val, next_iter = mutlinomRepresentation(var_vals)
        if next_iter:
            # The feature set is counted as dropped, so it must not also be
            # scored and stored below.
            # NOTE(review): presumably `next_iter` flags a feature set that
            # could not be represented - confirm against mutlinomRepresentation().
            drop_tally += 1
            return
    else:
        raise ValueError(
            "representation_label must be 'ALL', 'ANY' or 'MULTI', not "
            + repr(representation_label)
        )

    # Calculate the normalised mutual information between the feature set and caseness
    # variable. The mutual information is normalised to the caseness' entropy. This
    # ensures that:
    # - 0 means no mutual information (in other words, the feature set has done nothing to improve our certainty).
    # - 1 means the feature set is a perfect proxy for the caseness variable.
    # - 0.5 means the feature set has halved the uncertainty of the caseness variable.
    f_MI = sklearn.metrics.mutual_info_score(fs_val, full_array.iloc[:,1])
    f_nMI = f_MI / entropy_caseness

    # Test the feature set's normalised mutual information against a threshold.
    # The threshold is 0 (rather than, say, 0.8) because the team want to see
    # all nMI scores.
    if f_nMI < 0:
        drop_tally += 1
    else:
        # Store the name and mutual information value.
        featureSet_MI.loc[len(featureSet_MI),:] = name_var, round(f_nMI, 7)

    if len(featureSet_MI) > 9:
        # Increment batch.
        batch += 1

        # Make an interim save of results. The file is addressed by its full
        # path instead of changing the working directory, so a failed write
        # cannot leave the process in the wrong directory.
        save_dir = os.getcwd() + "/" + savelocation
        save_name = (source + "_" +
                     caseness_label + "_" +
                     representation_label + "_" +
                     "batch" +
                     str(batch) +
                     ".parquet")
        pyarrow.parquet.write_table(pyarrow.Table.from_pandas(featureSet_MI),
                                    os.path.join(save_dir, save_name))

        # Instantiate new storage.
        featureSet_MI = \
            pandas.DataFrame(columns = ['Feature_set', 'Normalised_mutual_information'])

### `init_worker()`: Initialize worker processes for parallel processing.

In [None]:
def init_worker(shared_var):
    '''
    Publish `shared_var` as the module-level global `df_ppl_and_codes`.

    ARGUMENTS
    1. shared_var:  The object to expose under the name `df_ppl_and_codes`.
                    Presumably this function is used as the initializer of a
                    multiprocessing pool, so that each worker process holds
                    its own reference - confirm against the caller.

    RETURNS
    Nothing.
    '''
    # Write straight into this module's namespace, so the value is visible to
    # every function in this process.
    globals()['df_ppl_and_codes'] = shared_var

### `chaoticlifeentropyfs()`: Calculate the entropy-based statistics for a patient's timeline of events.

In [None]:
def chaoticlifeentropyfs(pt_timeline):
    '''
    Calculate the entropy-based statistics for a patient's timeline of events.

    There are two categories of entropy-based feature sets for both appointments and did-not-attends:
    Sequential
    1.	activeInformation
    2.	entropyRate
    Summative
    3.	spectralEntropy
    4.	sampleEntropy
    5.	eoe (entropy of entropy)
    6.	averageEntropy
    7.	bubbleEntropy
    Use the following parameters for all summative entropy statistics other than spectral entropy, which doesn't require them:
    -	obs = three-monthly count, enough to amass a period of use.
    -	window breadth ("embedding dimension") = 4, to indicate a year's worth of appointments.
    -	window shift ("embedding time delay") = 1, to be sensitive to quarterly changes in behaviour.

    ARGUMENTS
    1. pt_timeline:  A sequence of observations for one patient. It is
                     converted to a numpy.array before use.

    RETURNS
    1. A list of the seven statistics, in the order listed above. The
       sequential statistics are None when the timeline has 4 or fewer
       observations. The summative statistics are None when the timeline has
       10 or fewer observations. spectralEntropy is also None when its
       calculation raises a RuntimeWarning.
    '''
    # Set parameters.
    # ## Window breadth ("embedding dimension") = 4, to indicate a year's worth of appointments.
    embeddingDimension = 4
    # ## Window shift ("embedding time delay") = 1, to be sensitive to quarterly changes in behaviour.
    embeddingTimeDelay = 1
    # ## Summative statistics are only calculated for timelines longer than this.
    minSummativeLength = 10
    # ## Length of the patient's timeline.
    len_timeline = len(pt_timeline)
    # Convert pt_timeline into a numpy.array.
    pt_timeline = numpy.array(pt_timeline)

    # Every statistic defaults to None and is only overwritten when the
    # timeline is long enough to calculate it.
    activeInformation = None
    entropyRate = None
    spectralEntropy = None
    sampleEntropy = None
    eoe = None
    averageEntropy = None
    bubbleEntropy = None

    # Warnings are escalated to errors to handle divide-by-zero issues with
    # spectral entropy. The escalation is scoped to this call so that the
    # process-wide warning filters are restored afterwards.
    with warnings.catch_warnings():
        warnings.filterwarnings("error")

        # Sequential statistics.
        if len_timeline > embeddingDimension:
            activeInformation = \
                pyinform.activeinfo.active_info(pt_timeline, k = embeddingDimension)
            entropyRate = \
                pyinform.entropyrate.entropy_rate(pt_timeline, k = embeddingDimension)

        # Summative statistics.
        if len_timeline > minSummativeLength:
            # spectralEntropy
            try:
                spectralEntropy, _ = EntropyHub.SpecEn(pt_timeline)
            except RuntimeWarning:
                spectralEntropy = None

            # sampleEntropy: keep the last element of the returned estimates.
            sampleEntropy, _, _ = \
                EntropyHub.SampEn(pt_timeline, m = embeddingDimension, tau = embeddingTimeDelay)
            sampleEntropy = sampleEntropy[-1]

            # eoe and averageEntropy
            eoe, averageEntropy, _ = \
                EntropyHub.EnofEn(pt_timeline, tau = embeddingDimension, S = math.floor(len_timeline / 4) )

            # bubbleEntropy: keep the last element of the returned estimates.
            bubbleEntropy, _ = \
                EntropyHub.BubbEn(pt_timeline, m = embeddingDimension, tau = embeddingTimeDelay)
            bubbleEntropy = bubbleEntropy[-1]

    # Package the output.
    ls_entropyBasedFeatureSets = \
        [
        activeInformation
        ,entropyRate
        ,spectralEntropy
        ,sampleEntropy
        ,eoe
        ,averageEntropy
        ,bubbleEntropy
        ]

    return ls_entropyBasedFeatureSets

### NOT IN USE patienthassnomed(): Check whether a patient's record includes some particular SNOMED-CT codes.

In [None]:
def patienthassnomed(fs_df, snomedcode):
    '''
    Check whether a SNOMED-CT code appears in the first column of a
    pandas.DataFrame.

    ARGUMENTS
    1. fs_df:       A pandas.DataFrame whose first column is searched.
    2. snomedcode:  The single code to look for. A missing value (NaN, None)
                    never matches.

    RETURNS
    1. A one-element list containing True if `snomedcode` equals any value in
       the first column of `fs_df`, and False otherwise.
    '''
    # A missing code cannot match anything. pandas.isna() is used rather than
    # math.isnan() so that string codes and None are handled too.
    if pandas.isna(snomedcode):
        return [False]

    # Compare the code against every cell, reduce each column to a single
    # flag, and keep the flag for the first column. `.iloc[0]` selects by
    # position regardless of the column's label.
    matches_per_column = (fs_df == snomedcode).any()
    return [bool(matches_per_column.iloc[0])]

### NOT IN USE featuresetmi_database(): Calculate two-way mutual information for a *database* feature set of order m.

### NOT IN USE getfsarray(): Produce a database feature set for submission to the `featuresetmi()` function

In [None]:
def getfsarray(fs_df):
    '''
    Query Google BigQuery for a table of person IDs against the SNOMED-CT
    codes listed in `fs_df`.

    The work is done in two queries. The first asks BigQuery to write the
    text of a second SQL statement, containing one CASE WHEN column per
    distinct code. The second runs that generated statement.

    ARGUMENTS
    1. fs_df:  A pandas.DataFrame with a 'src_snomedcode' column listing the
               codes of interest.

    RETURNS
    1. The pandas.DataFrame returned by the generated SQL statement.
    '''
    # Instantiate a Google BigQuery client.
    bq_client = bigquery.Client()

    # Quote-separated list of the codes of interest, ready to sit between the
    # opening and closing quotes of the IN (...) clause.
    codes_of_interest = "', '".join(
        str(code) for code in fs_df['src_snomedcode'].to_list()
    )

    # Common table expression shared by both queries.
    cte_opening = """
    # Make a table of person ID and their SNOMED-CT codes from the list of codes of interest.
    WITH tbl_persons_and_codes AS
    (
    SELECT
        DISTINCT person_id
        ,src_snomedcode
    FROM
        yhcr-prd-phm-bia-core.CY_FDM_PrimaryCare_v5.tbl_SRCode
    WHERE
        src_snomedcode IN ('"""
    cte_closing = """')
    )
    """
    cte_sql = cte_opening + codes_of_interest + cte_closing

    # Statement that makes BigQuery compose the text of the second statement.
    pivot_builder_sql = """
        SELECT
        CONCAT("SELECT person_id,"
               , STRING_AGG(CONCAT("CASE WHEN src_snomedcode='",src_snomedcode,"' THEN 1 ELSE 0 END AS `_",src_snomedcode,"`")), 
            " FROM `tbl_persons_and_codes`",
            " GROUP BY person_id, src_snomedcode ORDER BY person_id")
    FROM (  SELECT DISTINCT src_snomedcode FROM `tbl_persons_and_codes` ORDER BY src_snomedcode  )
    """

    # First query: the generated statement comes back in the first row of
    # the result's `f0_` column.
    builder_result = bq_client.query("".join((cte_sql, pivot_builder_sql))).to_dataframe()
    generated_sql = builder_result['f0_'].iloc[0]

    # Second query: run the generated statement on top of the same CTE.
    return bq_client.query(cte_sql + generated_sql).to_dataframe()

### NOT IN USE calculatemi(): Do the work of calculating the MI between a given feature set and caseness variable

In [None]:
def calculatemi(var_vals,
                name_var,
                representation_label,
                drop_tally,
                verbose):
    '''
    Calculate the mutual information between a given feature set and the
    caseness variable, and store it if it is not dropped.

    ARGUMENTS
    1. var_vals:              A pandas.DataFrame holding the feature set's
                              component variables.
    2. name_var:              The name under which the feature set is stored.
    3. representation_label:  Either "ALLrepresentation" or
                              "MULTIrepresentation".
    4. drop_tally:            Running count of dropped feature sets.
                              NOTE(review): this is a parameter, so the
                              increment made here is local and is not seen by
                              the caller.
    5. verbose:               If truthy, print whether the feature set was
                              saved or dropped.

    RETURNS
    1. The `next_iter` flag from mutlinomRepresentation() when that flag is
       truthy; otherwise nothing.

    Results are stored in the global `featureSet_MI`. Once it holds more than
    9 rows, it is saved to a CSV file, the global `batch` is incremented and
    `featureSet_MI` is re-instantiated empty.

    RAISES
    ValueError if `representation_label` is not recognised.
    '''
    # Both names are re-assigned below, so they must be declared global;
    # otherwise Python treats them as locals and every read of them fails.
    global batch
    global featureSet_MI

    # Formulate the representation.
    if representation_label == "ALLrepresentation":
        # `axis` is passed by keyword because newer pandas releases reject it
        # as a positional argument.
        fs_val = var_vals.all(axis = 1)
    elif representation_label == "MULTIrepresentation":
        fs_val, next_iter = mutlinomRepresentation(var_vals)
        if next_iter:
            return next_iter
    else:
        raise ValueError(
            "representation_label must be 'ALLrepresentation' or "
            "'MULTIrepresentation', not " + repr(representation_label)
        )

    # Calculate the mutual information between the feature set and
    # caseness variable.
    f_MI = sklearn.metrics.mutual_info_score(fs_val, full_array.iloc[:,-1])

    if f_MI < entropy_caseness:
        drop_tally += 1
        if verbose:
            print("Dropped",name_var,"because f_MI =", f_MI)
    else:
        # Store the name and mutual information value.
        if verbose:
            print("\tSaved",name_var,"because f_MI =", f_MI)
        featureSet_MI.loc[len(featureSet_MI),:] = name_var, f_MI

    if len(featureSet_MI) > 9:
        # Increment batch.
        batch += 1

        # Make an interim save of results.
        featureSet_MI.to_csv(savelocation +
                             representation_label + "_" +
                             "_batch" +
                             str(batch) +
                             ".csv", index = False)
        print("\nInterim save made")
        # Instantiate new storage.
        featureSet_MI = \
            pandas.DataFrame(columns = ['Feature set', 'Mutual information'])