# Caseness array

The purpose of this notebook is to produce the caseness array. The caseness array is an n-by-2 array containing patient ID and a binary vector indicating whether the patient is clinically coded for complex mental health difficulties.

### Imports

In [1]:
import pandas
import numpy
from google.cloud import bigquery
from datetime import date
import scipy.stats
import math
import os
from IPython.display import Markdown, display

### Prerequisites

In [2]:
client = bigquery.Client()

folder_loc = os.path.dirname(os.path.abspath("UNSEEN create caseness array.ipynb"))
folder = folder_loc + '/codelists/'

# Clinical codes of interest.
codes_to_query_bipolar = pandas.read_csv(folder + "ciaranmci-bipolar-disorder-6a0308d7.csv")
codes_to_query_borderline = pandas.read_csv(folder + "ciaranmci-borderline-personality-disorder-1ed4af38.csv")
codes_to_query_depression = pandas.read_csv(folder + "ciaranmci-chronic-depression-53a65598.csv")
codes_to_query_chronicPTSD = pandas.read_csv(folder + "ciaranmci-chronic-post-traumatic-stress-disorder-3a96e263.csv")
codes_to_query_complexPTSD = pandas.read_csv(folder + "ciaranmci-complex-post-traumatic-stress-disorder-21876f2e.csv")
codes_to_query_devAcademicDisorder = pandas.read_csv(folder + "ciaranmci-developmental-academic-disorder-50f395a2.csv")
codes_to_query_dysthymia = pandas.read_csv(folder + "ciaranmci-dysthymia-6f6888c3.csv")
codes_to_query_personalityDisorder = pandas.read_csv(folder + "ciaranmci-personality-disorder-243a2f24.csv")
codes_to_query_schizophrenia = pandas.read_csv(folder + "ciaranmci-schizophrenia-05c53c03.csv")
codes_to_query_all = pandas.read_csv(folder + "ciaranmci-unseen-snomed-codes-to-identify-cmhd-0b2abbef.csv")

# Medications of interest.
medications_to_query_psychosisAndRelated = pandas.read_csv(folder + "UNSEEN medications_psychosisAndRelated.csv")
medications_to_query_hypnoticsAndAnxiolytics = pandas.read_csv(folder + "UNSEEN medications_hypnoticsAndAnxiolytics.csv")
medications_to_query_antidepressants = pandas.read_csv(folder + "UNSEEN medications_antidepressants.csv")
medications_to_query_all = pandas.read_csv(folder + "UNSEEN medications list.csv")

## Creating the array

In [3]:
sql = """
WITH
# The first CTE will specify the 'spine' of the data table by selecting the unique list of person IDs.
tbl_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        yhcr-prd-phm-bia-core.CY_MYSPACE_CMC.person
    # Limiting to age range 18-70.
    WHERE
        (EXTRACT(YEAR FROM CURRENT_DATE()) - year_of_birth) BETWEEN 18 AND 70
),

# The following CTEs extract each clinical codelist into a SQL table before querying the person_ID 
# associated with the clinical codes.
#
#  ## Bipolar disorder
tbl_bipolar AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_bipolar["code"].tolist())) + """'
                ]) AS snomedcode
),
tbl_bipolar_persons AS (
    SELECT
        DISTINCT person_id
        ,src_snomedcode
    FROM
        yhcr-prd-phm-bia-core.CY_MYSPACE_CMC.tbl_SRCode, tbl_bipolar
    WHERE
        src_snomedcode IN (tbl_bipolar.snomedcode)
),
#  ## Borderline personality disorder
tbl_borderline AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_borderline["code"].tolist())) + """'
                ]) AS snomedcode
),
tbl_borderline_persons AS (
    SELECT
        DISTINCT person_id
        ,src_snomedcode
    FROM
        yhcr-prd-phm-bia-core.CY_MYSPACE_CMC.tbl_SRCode, tbl_borderline
    WHERE
        src_snomedcode IN (tbl_borderline.snomedcode)
),
#  ## Chronic PTSD
tbl_chronicPTSD AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_chronicPTSD["code"].tolist())) + """'
                ]) AS snomedcode
),
tbl_chronicPTSD_persons AS (
    SELECT
        DISTINCT person_id
        ,src_snomedcode
    FROM
        yhcr-prd-phm-bia-core.CY_MYSPACE_CMC.tbl_SRCode, tbl_chronicPTSD
    WHERE
        src_snomedcode IN (tbl_chronicPTSD.snomedcode)
),
#  ## Complex PTSD
tbl_complexPTSD AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_chronicPTSD["code"].tolist())) + """'
                ]) AS snomedcode
),
tbl_complexPTSD_persons AS (
    SELECT
        DISTINCT person_id
        ,src_snomedcode
    FROM
        yhcr-prd-phm-bia-core.CY_MYSPACE_CMC.tbl_SRCode, tbl_complexPTSD
    WHERE
        src_snomedcode IN (tbl_complexPTSD.snomedcode)
),
#  ## Depression
tbl_depression AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_depression["code"].tolist())) + """'
                ]) AS snomedcode
),
tbl_depression_persons AS (
    SELECT
        DISTINCT person_id
        ,src_snomedcode
    FROM
        yhcr-prd-phm-bia-core.CY_MYSPACE_CMC.tbl_SRCode, tbl_depression
    WHERE
        src_snomedcode IN (tbl_depression.snomedcode)
),
#  ## Developmental academic disorder
tbl_devAcademicDisorder AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_devAcademicDisorder["code"].tolist())) + """'
                ]) AS snomedcode
),
tbl_devAcademicDisorder_persons AS (
    SELECT
        DISTINCT person_id
        ,src_snomedcode
    FROM
        yhcr-prd-phm-bia-core.CY_MYSPACE_CMC.tbl_SRCode, tbl_devAcademicDisorder
    WHERE
        src_snomedcode IN (tbl_devAcademicDisorder.snomedcode)
),
#  ## Dysthymia
tbl_dysthymia AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_dysthymia["code"].tolist())) + """'
                ]) AS snomedcode
),
tbl_dysthymia_persons AS (
    SELECT
        DISTINCT person_id
        ,src_snomedcode
    FROM
        yhcr-prd-phm-bia-core.CY_MYSPACE_CMC.tbl_SRCode, tbl_dysthymia
    WHERE
        src_snomedcode IN (tbl_dysthymia.snomedcode)
),
#  ## Personality disorder
tbl_personalityDisorder AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_personalityDisorder["code"].tolist())) + """'
                ]) AS snomedcode
),
tbl_personalityDisorder_persons AS (
    SELECT
        DISTINCT person_id
        ,src_snomedcode
    FROM
        yhcr-prd-phm-bia-core.CY_MYSPACE_CMC.tbl_SRCode, tbl_personalityDisorder
    WHERE
        src_snomedcode IN (tbl_personalityDisorder.snomedcode)
),
#  ## Personality disorder
tbl_schizophrenia AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_schizophrenia["code"].tolist())) + """'
                ]) AS snomedcode
),
tbl_schizophrenia_persons AS (
    SELECT
        DISTINCT person_id
        ,src_snomedcode
    FROM
        yhcr-prd-phm-bia-core.CY_MYSPACE_CMC.tbl_SRCode, tbl_schizophrenia
    WHERE
        src_snomedcode IN (tbl_schizophrenia.snomedcode)
),


# The following CTEs extract each medication list into a SQL table before querying the person_ID 
# associated with the medications (combined into medication type).
#
#  ## Drugs used in psychosis and related disorders.
tbl_meds_psychosisAndRelated AS (
    SELECT
        Medication
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, medications_to_query_psychosisAndRelated["Medication"].tolist())) + """'
                ]) AS Medication
),
tbl_meds_psychosisAndRelated_persons AS (
    SELECT
        DISTINCT Tblb.person_id,
        tbl_meds_psychosisAndRelated.Medication
    FROM
        tbl_meds_psychosisAndRelated
    LEFT JOIN
        yhcr-prd-phm-bia-core.CY_MYSPACE_CMC.tbl_SRPrimaryCareMedication AS Tblb
    ON
        Tblb.src_nameofmedication LIKE CONCAT('%',tbl_meds_psychosisAndRelated.Medication,'%')
    WHERE CAST(src_isrepeatmedication AS BOOL) IS TRUE 
        AND DATE_DIFF(CAST(src_datemedicationstart AS DATE), CURRENT_DATE(), MONTH) < 4
),
#  ## Hypnotics and anxiolyitcs
tbl_meds_hypnoticsAndAnxiolytics AS (
    SELECT
        Medication
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, medications_to_query_hypnoticsAndAnxiolytics["Medication"].tolist())) + """'
                ]) AS Medication
),
tbl_meds_hypnoticsAndAnxiolytics_persons AS (
    SELECT
        DISTINCT Tblb.person_id,
        tbl_meds_hypnoticsAndAnxiolytics.Medication
    FROM
        tbl_meds_hypnoticsAndAnxiolytics
    LEFT JOIN
        yhcr-prd-phm-bia-core.CY_MYSPACE_CMC.tbl_SRPrimaryCareMedication AS Tblb
    ON
        Tblb.src_nameofmedication LIKE CONCAT('%',tbl_meds_hypnoticsAndAnxiolytics.Medication,'%')
    WHERE CAST(src_isrepeatmedication AS BOOL) IS TRUE
        AND DATE_DIFF(CAST(src_datemedicationstart AS DATE), CURRENT_DATE(), MONTH) < 4
),
#  ## Antidepressants
tbl_meds_antidepressants AS (
    SELECT
        Medication
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, medications_to_query_antidepressants["Medication"].tolist())) + """'
                ]) AS Medication
),
tbl_meds_antidepressants_persons AS (
    SELECT
        DISTINCT Tblb.person_id,
        tbl_meds_antidepressants.Medication
    FROM
        tbl_meds_antidepressants
    LEFT JOIN
        yhcr-prd-phm-bia-core.CY_MYSPACE_CMC.tbl_SRPrimaryCareMedication AS Tblb
    ON
        Tblb.src_nameofmedication LIKE CONCAT('%',tbl_meds_antidepressants.Medication,'%')
    WHERE CAST(src_isrepeatmedication AS BOOL) IS TRUE
        AND DATE_DIFF(CAST(src_datemedicationstart AS DATE), CURRENT_DATE(), MONTH) < 4
)


# Finally, we use the above CTEs to define a table with one row per patient and one column for each
# clinical code and medication group. The code and medication columns are populated by interger
# values with '1' indicating that the code or medication is present in patient record and '0' indicating
# otherwise.
SELECT
    DISTINCT tbl_persons.person_id
    ,CASE WHEN tbl_bipolar_persons.person_id IS NULL THEN 0 ELSE 1 END AS Bipolar
    ,CASE WHEN tbl_borderline_persons.person_id IS NULL THEN 0 ELSE 1 END AS Borderline
    ,CASE WHEN tbl_chronicPTSD_persons.person_id IS NULL THEN 0 ELSE 1 END AS ChronicPTSD
    ,CASE WHEN tbl_complexPTSD_persons.person_id IS NULL THEN 0 ELSE 1 END AS ComplexPTSD
    ,CASE WHEN tbl_depression_persons.person_id IS NULL THEN 0 ELSE 1 END AS Depression
    ,CASE WHEN tbl_devAcademicDisorder_persons.person_id IS NULL THEN 0 ELSE 1 END AS DevAcademicDisorder
    ,CASE WHEN tbl_dysthymia_persons.person_id IS NULL THEN 0 ELSE 1 END AS Dysthymia
    ,CASE WHEN tbl_personalityDisorder_persons.person_id IS NULL THEN 0 ELSE 1 END AS PersonalityDisorder
    ,CASE WHEN tbl_schizophrenia_persons.person_id IS NULL THEN 0 ELSE 1 END AS Schizophrenia
    ,CASE WHEN tbl_meds_psychosisAndRelated_persons.person_id IS NULL THEN 0 ELSE 1 END AS Meds_PsychosisAndRelated
    ,CASE WHEN tbl_meds_hypnoticsAndAnxiolytics_persons.person_id IS NULL THEN 0 ELSE 1 END AS Meds_hypnoticsAndAnxiolytics
    ,CASE WHEN tbl_meds_antidepressants_persons.person_id IS NULL THEN 0 ELSE 1 END AS Meds_antidepressants
FROM tbl_persons
LEFT OUTER JOIN tbl_bipolar_persons ON tbl_persons.person_id = tbl_bipolar_persons.person_id
LEFT OUTER JOIN tbl_borderline_persons ON tbl_persons.person_id = tbl_borderline_persons.person_id
LEFT OUTER JOIN tbl_chronicPTSD_persons ON tbl_persons.person_id = tbl_chronicPTSD_persons.person_id
LEFT OUTER JOIN tbl_complexPTSD_persons ON tbl_persons.person_id = tbl_complexPTSD_persons.person_id
LEFT OUTER JOIN tbl_depression_persons ON tbl_persons.person_id = tbl_depression_persons.person_id
LEFT OUTER JOIN tbl_devAcademicDisorder_persons ON tbl_persons.person_id = tbl_devAcademicDisorder_persons.person_id
LEFT OUTER JOIN tbl_dysthymia_persons ON tbl_persons.person_id = tbl_dysthymia_persons.person_id
LEFT OUTER JOIN tbl_personalityDisorder_persons ON tbl_persons.person_id = tbl_personalityDisorder_persons.person_id
LEFT OUTER JOIN tbl_schizophrenia_persons ON tbl_persons.person_id = tbl_schizophrenia_persons.person_id
LEFT OUTER JOIN tbl_meds_psychosisAndRelated_persons ON tbl_persons.person_id = tbl_meds_psychosisAndRelated_persons.person_id
LEFT OUTER JOIN tbl_meds_hypnoticsAndAnxiolytics_persons ON tbl_persons.person_id = tbl_meds_hypnoticsAndAnxiolytics_persons.person_id
LEFT OUTER JOIN tbl_meds_antidepressants_persons ON tbl_persons.person_id = tbl_meds_antidepressants_persons.person_id
ORDER BY tbl_persons.person_id
"""

bqTable = client.query(sql).to_dataframe()

In [9]:
CMHD_dx_only = bqTable.loc[:, ~bqTable.columns.isin(['person_id',
                                                     'Meds_PsychosisAndRelated',
                                                     'Meds_hypnoticsAndAnxiolytics',
                                                     'Meds_antidepressants'])].max(axis = 1)
CMHD_rx_only = bqTable[['Meds_PsychosisAndRelated', 
                        'Meds_hypnoticsAndAnxiolytics',
                        'Meds_antidepressants']].max(axis = 1)
CMHD = []
for i_iter in range(len(CMHD_rx_only)):
    if (CMHD_dx_only[i_iter] == 1 & CMHD_rx_only[i_iter] == 1):
        # If the patient has a diagnostic code AND an active prescription,
        # then they are a definite CMHD = 2.
        CMHD.append(2)
    elif (CMHD_dx_only[i_iter] != 1 & CMHD_rx_only[i_iter] == 1):
        # If the patient has an active prescription but nota diagnostic code,
        # then they are a possible CMHD = 1.
        CMHD.append(1)
    else:
        # If the patient neither has a diagnostic code or an active prescriptions,
        # then they are definitely not CMHD = 0.
        CMHD.append(0)
        
caseness_array = \
    pandas.DataFrame(data = \
                     {"person_id" : bqTable['person_id'],
                      "CMHD" : CMHD,
                      "CMHD_dx_and_rx" : [1 if i_row == 2 else 0 for i_row in CMHD],
                      "CMHD_rx_not_dx" : [1 if i_row == 1 else 0 for i_row in CMHD],
                      "CMHD_control"   : [1 if i_row == 0 else 0 for i_row in CMHD],
                     }
                    )

# Clean up.
del(CMHD_dx_only, CMHD_rx_only)
# Make caseness_array available across notebooks.
%store caseness_array

Stored 'caseness_array' (DataFrame)


In [5]:
# Set parameters for disclosivity adjustments.
redaction_threshold = 7
target_round = 10

# Prepare header and note for presentation.
now = date.today()
#then = (now - relativedelta(years = 1)).strftime('%d-%b-%Y')
now = now.strftime('%d-%b-%Y')
display(
    Markdown("""
## Prevalence of caseness components (per hundred)

To mitigate disclosure, counts $\le 7$ are redacted before remaining values are rounded to the nearest 10.
Only then are proportions calculated.

The prevalence values refer to the period up to %s.
       """
       %(now)
       )
)

# Define the denominator.
denominator_as_int = len(caseness_array)
denominator_as_int = round(denominator_as_int / target_round) * target_round
denominator = \
    numpy.repeat(denominator_as_int,
                 bqTable.shape[1]-1, axis = 0)

# Define the base prevlance dataframe.
df_prevalence = \
    pandas.DataFrame(data = {'numerator'   : bqTable.loc[:,bqTable.columns !=  'person_id'].sum(),
                             'denominator' : denominator})
# Redact low counts.
df_prevalence = \
    df_prevalence.applymap(lambda x: numpy.nan if x <= redaction_threshold else x)

# Round to nearest target_round value.
df_prevalence = \
    (round(df_prevalence / target_round) * target_round).astype(int)

# Compute prevalence.
df_prevalence['prevalence (%)'] = \
    round((df_prevalence['numerator'] / df_prevalence['denominator']) * 100, 2)

# Print prevalence table.
print(df_prevalence)

# Calculate and state the minimum prevalence of any criterion diagnosis, and criterion medication.
min_criterion_prev_diag = \
    df_prevalence.loc[~df_prevalence.index.isin(['Meds_PsychosisAndRelated',
                                                 'Meds_hypnoticsAndAnxiolytics',
                                                 'Meds_antidepressants']),
                      'prevalence (%)'].min()
min_criterion_prev_meds = \
    df_prevalence.loc[df_prevalence.index.isin(['Meds_PsychosisAndRelated',
                                                'Meds_hypnoticsAndAnxiolytics',
                                                'Meds_antidepressants']),
                      'prevalence (%)'].min()
min_criterion_prev = round(min_criterion_prev_diag * min_criterion_prev_meds, 2)

display(
    Markdown(
"""
Minimum prevalence of any criterion diagnosis is __%s%%__ of %s patients.

Minimum prevalence of any criterion medication is __%s%%__ of %s patients.

Therefore, the minimum criteria prevalence is __%s%%__ (i.e. the product of the minimum diagnostic and medication prevalence values).
"""
        %(min_criterion_prev_diag,
          f'{denominator_as_int:,}',
          min_criterion_prev_meds,
          f'{denominator_as_int:,}',
          min_criterion_prev
          )
    )
)


## Prevalence of caseness components (per hundred)

To mitigate disclosure, counts $\le 7$ are redacted before remaining values are rounded to the nearest 10.
Only then are proportions calculated.

The prevalence values refer to the period up to 21-Feb-2023.
       

                              numerator  denominator  prevalence (%)
Bipolar                            2170       703210            0.31
Borderline                          480       703210            0.07
ChronicPTSD                         120       703210            0.02
ComplexPTSD                         120       703210            0.02
Depression                         1190       703210            0.17
DevAcademicDisorder                2330       703210            0.33
Dysthymia                           520       703210            0.07
PersonalityDisorder                3600       703210            0.51
Schizophrenia                      2690       703210            0.38
Meds_PsychosisAndRelated           9670       703210            1.38
Meds_hypnoticsAndAnxiolytics       9260       703210            1.32
Meds_antidepressants              85270       703210           12.13



Minimum prevalence of any criterion diagnosis is __0.02%__ of 703,210 patients.

Minimum prevalence of any criterion medication is __1.32%__ of 703,210 patients.

Therefore, the minimum criteria prevalence is __0.03%__ (i.e. the product of the minimum diagnostic and medication prevalence values).


## Calculating the entropy of the caseness.

In [6]:
# A function to compute and present the entropy.
def entropy_output(column_name):
    entropy_caseness = scipy.stats.entropy(caseness_array[column_name].value_counts(), base = math.e)
    entropy_caseness_scaled = round(entropy_caseness / math.log(2, math.e) * 100, 1)
    entropy_caseness = round(entropy_caseness, 3)
    print("\t Caseness variable entropy = ", entropy_caseness, "nats")
    print("\t Caseness variable scaled entropy = ", entropy_caseness_scaled, "%")
    
    return entropy_caseness_scaled

print("\n Multinomial caseness variable...")
entropy_caseness_scaled_multi = entropy_output('CMHD')
print("\n \'Definite caseness\' variable...")
entropy_caseness_scaled_definite = entropy_output('CMHD_dx_and_rx')
print("\n \'Possible caseness\' variable...")
entropy_caseness_scaled_possibly = entropy_output('CMHD_rx_not_dx')
print("\n \'No caseness\' variable...")
entropy_caseness_scaled_control = entropy_output('CMHD_control')


 Multinomial caseness variable...
	 Caseness variable entropy =  0.422 nats
	 Caseness variable scaled entropy =  60.8 %

 'Definite caseness' variable...
	 Caseness variable entropy =  0.059 nats
	 Caseness variable scaled entropy =  8.5 %

 'Possible caseness' variable...
	 Caseness variable entropy =  0.364 nats
	 Caseness variable scaled entropy =  52.5 %

 'No caseness' variable...
	 Caseness variable entropy =  0.385 nats
	 Caseness variable scaled entropy =  55.5 %


## Calculating hit rates.

In [7]:
# A function to compute and present the entropy.
def hitrate_output(column_name):
    numerator = caseness_array[column_name].sum()
    hitRate_all = round((numerator / denominator_as_int) * 100, 1)
    hitRate_none = 100 - hitRate_all
    Odds_noYes = hitRate_none / (100 - hitRate_none)
    print("\t Hit rate (all) =", hitRate_all, "%")
    print("\t Hit rate (none) =", hitRate_none, "%")
    print("\t Odds (No CMHD : CMHD) = ", round(Odds_noYes, 2), "-times less likely to have CMHD than to have it.")
    
    return hitRate_none, hitRate_all

print("\n \'Definite caseness\' variable...")
hitRate_none, hitRate_all = hitrate_output('CMHD_dx_and_rx')


 'Definite caseness' variable...
	 Hit rate (all) = 1.1 %
	 Hit rate (none) = 98.9 %
	 Odds (No CMHD : CMHD) =  89.91 -times less likely to have CMHD than to have it.


In [8]:
display(
    Markdown(
"""    
We now know that:
1. based on the scaled entropy, our variable for indicating active complex mental health difficulties is $%s\%%$ as uncertain/surprising/unforeseeable
as it could possibly be; _and_
2. we would correctly classify $%s\%%$ of patients in this sample if we simply assumed that no one has active complex mental health difficulties.

The first point tells us that active caseness of complex mental health difficulties can be known with a lot of certainty, in this dataset. This 
encourages us to find feature sets that are proxies for this variable.
Given this encourgement to continue, the second point defines a benchmark for the indicative performance of any feature set that we evaluate in our
study. Specifically, any feature set that we suggest to improve our certainty of knowing that someone has active complex mental health difficulties
must correctly identify $\ge %s \%%$ of patients in our sample. Otherwise, the added feature set is a needless complication to our attempt to know
whether someone has complex mental health difficulties (which we can almost always safely assume they don't).
"""
        %(entropy_caseness_scaled_definite, hitRate_none, hitRate_none)
            )
    )

    
We now know that:
1. based on the scaled entropy, our variable for indicating active complex mental health difficulties is $8.5\%$ as uncertain/surprising/unforeseeable
as it could possibly be; _and_
2. we would correctly classify $98.9\%$ of patients in this sample if we simply assumed that no one has active complex mental health difficulties.

The first point tells us that active caseness of complex mental health difficulties can be known with a lot of certainty, in this dataset. This 
encourages us to find feature sets that are proxies for this variable.
Given this encourgement to continue, the second point defines a benchmark for the indicative performance of any feature set that we evaluate in our
study. Specifically, any feature set that we suggest to improve our certainty of knowing that someone has active complex mental health difficulties
must correctly identify $\ge 98.9 \%$ of patients in our sample. Otherwise, the added feature set is a needless complication to our attempt to know
whether someone has complex mental health difficulties (which we can almost always safely assume they don't).


In [54]:
# Below, I compute the cells of the contingency table for a rule that says no one has active complex mental health difficulties.
#
# True positives. Zero because the rule says no one demonstrates active caseness.
tp = 0
# False positives. Zero because the rule says no one demonstrates active caseness.
fp = 0
# True negatives. The opposite of the hit rates calculated previously, which assumed the rule that everyone demonstrated active caseness.
tn = hitRate_none
# False negatives. The opposite of the hit rates calculated previously, which assumed the rule that everyone demonstrated active caseness.
fn = hitRate_all

# Below, I compute the evaluation statistics.
#
# Class balance accuracy.
cba = round( 0.5 * ( (tp / max( (tp + fn), (tp + fp) ) ) + (tn / max( (tn + fp), (tn +fn) ) ) ), 2)
# Odds ratio.
oddsRatio = 'Not\ a\ number:\ One\ of\ the\ odds\ is\ zero.' if min( (tp * tn) , (fp * fn) ) == 0 else round( (tp * tn) / (fp * fn), 2)
# Positive predictive value.
ppv = 0 if (tp + fp) == 0 else round( tp / (tp + fp), 2)
# Negative predictive value.
npv = 0 if (tn + fn) == 0 else round( tn / (tn + fn), 2)

display(
    Markdown(
"""    
Assuming a rule that says no one demonstrates active caseness of complex mental health difficulties, we get the following values for our evaluation statistics:

- Class Balance Accuracy = $%s$

- Odds ratio = $%s$

- Positive predicitve value = $%s$

- Negative predicitve value = $%s$
"""
        %(cba, oddsRatio, ppv, npv)
            )
    )

    
Assuming a rule that says no one demonstrates active caseness of complex mental health difficulties, we get the following values for our evaluation statistics:

- Class Balance Accuracy = $0.49$

- Odds ratio = $Not\ a\ number:\ One\ of\ the\ odds\ is\ zero.$

- Positive predicitve value = $0$

- Negative predicitve value = $0.99$
