# Create feature sets

The purpose of this notebook is to create the feature sets.

## Imports and helper functions

In [1]:
%run 'UNSEEN_helper_functions.ipynb'
%store -r

## Load codelist CSV files.
We used opencodelist.org to define codelists that define the set of SNOMED-CT codes used to identify patients based on various attributes.

In [2]:
# Instatiate BigQuery client.
client = bigquery.Client()

# Set folder location.
folder_loc = os.path.dirname(os.path.abspath("UNSEEN_create_clinician_feature_sets.ipynb"))
folder = folder_loc + '/codelists/'

# Clinical codes of interest.
codes_to_query_crisisContraception = pandas.read_csv(folder + "ciaranmci-crisis-contraception-409408ed.csv")
codes_to_query_UPSI = pandas.read_csv(folder + "ciaranmci-unprotected-sexual-intercourse-1c772b8e.csv")
codes_to_query_teenagePregnancy = pandas.read_csv(folder + "ciaranmci-teenage-adolescent-pregnancy-3e52c03a.csv")
codes_to_query_attemptedSuicide = pandas.read_csv(folder + "ciaranmci-attempted-suicide-0367b859.csv")
codes_to_query_selfHarm = pandas.read_csv(folder + "ciaranmci-self-harm-6b0c26b8.csv")
codes_to_query_CAMHSrefsAndDisch = pandas.read_csv(folder + "ciaranmci-camhs-referrals-and-discharges-0e87f47d.csv")
codes_to_query_substanceMisuse = pandas.read_csv(folder + "ciaranmci-substance-misuse-363a8058.csv")
codes_to_query_CYPmentalDisorder = pandas.read_csv(folder + "ciaranmci-child-young-person-mental-disorder-279767b2.csv")
codes_to_query_childAbuse = pandas.read_csv(folder + "ciaranmci-child-abuse-041b99ee.csv")
codes_to_query_familialSubstanceMisuse = pandas.read_csv(folder + "ciaranmci-familial-drug-abuse-and-misuse-3aec1eaf.csv")
codes_to_query_socialServicesInvolved = pandas.read_csv(folder + "ciaranmci-social-services-involved-25434d05.csv")
codes_to_query_childProtection = pandas.read_csv(folder + "ciaranmci-child-protection-13ed7469.csv")
codes_to_query_familialMentalHealthConditions = pandas.read_csv(folder + "ciaranmci-family-history-of-mental-health-conditions-or-suicide-53fb91e4.csv")
codes_to_query_partedParents = pandas.read_csv(folder + "ciaranmci-child-of-single-divorced-or-separated-parents-061a784f.csv")
codes_to_query_depressionNotDysthymiaOrChronic = pandas.read_csv(folder + "ciaranmci-depression-not-chronic-not-dysthymia-782b413d.csv")
codes_to_query_IAPTuse = pandas.read_csv(folder + "ciaranmci-iapt-use-51fcb383.csv")
codes_to_query_anxietyOrPanic = pandas.read_csv(folder + "ciaranmci-anxiety-or-panic-finding-and-disorder-2cfce79e.csv")
codes_to_query_agorophobia = pandas.read_csv(folder + "ciaranmci-agorophobia-67a5852d.csv")
codes_to_query_eatingDisorders = pandas.read_csv(folder + "ciaranmci-eating-disorders-3ba00044.csv")
codes_to_query_poorBodyImage = pandas.read_csv(folder + "ciaranmci-poor-body-image-27422b66.csv")
codes_to_query_CYPneurodevDisorder = pandas.read_csv(folder + "ciaranmci-child-young-person-neurodevelopmental-disorder-59b64e1c.csv")
codes_to_query_ADHD = pandas.read_csv(folder + "ciaranmci-attention-deficit-hyperactivity-disorder-752344fb.csv")
codes_to_query_autism = pandas.read_csv(folder + "nhsd-primary-care-domain-refsets-autism_cod-20210127.csv")
codes_to_query_SpLD = pandas.read_csv(folder + "ciaranmci-dyspraxia-dyscalculia-and-dyslexia-7889e686.csv")
codes_to_query_adultVictimOfAbuse = pandas.read_csv(folder + "ciaranmci-adult-victim-of-abuse-447ef3b5.csv")
codes_to_query_DNA = pandas.read_csv(folder + "ciaranmci-did-not-attend-098119da.csv")
codes_to_query_violentBehaviour = pandas.read_csv(folder + "ciaranmci-violent-behaviour-7071a6a4.csv")
codes_to_query_IAPTreferral = pandas.read_csv(folder + "ciaranmci-iapt-referral-16fed9a8.csv")
codes_to_query_paranoia = pandas.read_csv(folder + "ciaranmci-paranoia-5c690e12.csv")
codes_to_query_auditoryHallucinations = pandas.read_csv(folder + "ciaranmci-auditory-hallucinations-5762336f.csv")
codes_to_query_dissociation = pandas.read_csv(folder + "ciaranmci-dissociation-75788147.csv")
codes_to_query_NESA = pandas.read_csv(folder + "ciaranmci-non-epileptic-seizure-attack-depreciated-pseduoseizure-43599add.csv") ########needs updating
codes_to_query_alcoholMisuse = pandas.read_csv(folder + "ciaranmci-alcohol-misuse-53df56ed.csv")
codes_to_query_drugMisuse = pandas.read_csv(folder + "ciaranmci-drug-misuse-3acfe3b8.csv")
codes_to_query_DNA = pandas.read_csv(folder + "ciaranmci-did-not-attend-098119da.csv")
codes_to_query_suicidal = pandas.read_csv(folder + "ciaranmci-suicidal-5eaa56c5.csv")
codes_to_query_historyOfOrCurrentAddiction = pandas.read_csv(folder + "ciaranmci-history-of-or-current-addiction-5bf796cf.csv")
codes_to_query_drugMisuse = pandas.read_csv(folder + "ciaranmci-drug-misuse-3acfe3b8.csv")
codes_to_query_alcoholMisuse = pandas.read_csv(folder + "ciaranmci-alcohol-misuse-53df56ed.csv")
codes_to_query_asylumSeeker = pandas.read_csv(folder + "ciaranmci-asylum-seeker-4972fc5e.csv")
codes_to_query_raisedInCareSystem = pandas.read_csv(folder + "ciaranmci-raised-in-care-system-7e44a2be.csv")
codes_to_query_brainInjury = pandas.read_csv(folder + "ciaranmci-brain-injury-30638929.csv")
codes_to_query_socialServicesInvolved = pandas.read_csv(folder + "ciaranmci-social-services-involved-25434d05.csv")
codes_to_query_homeless = pandas.read_csv(folder + "ciaranmci-homelessness-0e1fe637.csv")
codes_to_query_incarcerationImprisonment = pandas.read_csv(folder + "ciaranmci-incarceration-or-imprisonment-75107301.csv")
codes_to_query_sleepDisturbance = pandas.read_csv(folder + "ciaranmci-sleep-disturbance-dyssomnia-29e21962.csv")
codes_to_query_tinnitus = pandas.read_csv(folder + "ciaranmci-tinnitus-finding-10d2a62d.csv")

codes_to_query_mentalIllHealth = pandas.read_csv(folder + "mental_ill_health_codelist.txt", sep = '\t')
codes_to_query_bipolar = pandas.read_csv(folder + "ciaranmci-bipolar-disorder-6a0308d7.csv")
codes_to_query_schizophrenia = pandas.read_csv(folder + "ciaranmci-schizophrenia-05c53c03.csv")
codes_to_query_mentalIllHealth = pandas.DataFrame(
    list(
        set(codes_to_query_mentalIllHealth["Id"]).difference(
            set(codes_to_query_bipolar["code"]).union(
                set(codes_to_query_schizophrenia["code"])
            )
        )
    )
    ,columns = ["Id"]
)


# Medications of interest.
medications_to_query_all = pandas.read_csv(folder + "UNSEEN_medications_list.csv")
medications_to_query_antidepressants = pandas.read_csv(folder + "UNSEEN medications_antidepressants.csv")
medications_to_query_hypnoticsAndAnxiolytics = pandas.read_csv(folder + "UNSEEN medications_hypnoticsAndAnxiolytics.csv")
medications_to_query_psychosisAndRelated = pandas.read_csv(folder + "UNSEEN medications_psychosisAndRelated.csv")
medications_to_query_alcoholMisuse = pandas.read_csv(folder + "UNSEEN medications indicating treatment for alcohol misuse.csv")
medications_to_query_drugMisuse = pandas.read_csv(folder + "UNSEEN medications indicating treatment for drug misuse.csv")

## Load requisites

In [3]:
%%capture
if 'caseness_array' not in globals():
    print("not here")
    %run ./"UNSEEN_create_caseness_variables.ipynb"
%store -r

## Create BiqQuery syntax

The final query makes use of a few component scripts of BigQuery syntax:

1. sql_declarations: Declares BigQuery variables using variables defined in the 'Load requisites' section of this an other notebooks. Defined in `UNSEEN_create_caseness_variable.ipynb`.
2. sql_studyPopulation: Script of Common Table expressions (CTEs) that define the study population into CTE table called `tbl_studyPopulation_no_caseness`. Defined in `UNSEEN_create_caseness_variable.ipynb`.
3. sql_fs_CTEs: Script of CTEs that define the CTE tables containing the feature sets. These CTE tables require joining.
4. sql_final_select: Script joining the CTE tables that defined the feature sets.

### Defining sql_fs_CTEs

In [4]:
sql_CTEs_body = \
"""
#  ## Homeless
,tbl_homeless AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_homeless["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_homeless_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_homeless
    WHERE
        a.snomedcode IN (tbl_homeless.snomedcode)
)
#  ## Poverty
,tbl_poverty_persons AS (
    SELECT
        DISTINCT person_id
        ,snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode
    WHERE
        snomedcode IN ('11403006', '284477001', '724451000000108', '722221000000105', '719781000000101')
)
#  ## Sleep disturbance
,tbl_sleepDisturbance AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_sleepDisturbance["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_sleepDisturbance_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_sleepDisturbance
    WHERE
        a.snomedcode IN (tbl_sleepDisturbance.snomedcode)
)
#  ## Suicidal ideation
,tbl_suicidal AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_suicidal["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_suicidal_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_suicidal
    WHERE
        a.snomedcode IN (tbl_suicidal.snomedcode)
)
#  ## Tinnitus
,tbl_tinnitus AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_tinnitus["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_tinnitus_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_tinnitus
    WHERE
        a.snomedcode IN (tbl_tinnitus.snomedcode)
)
# ## Food insecurity
,tbl_foodInsecurity AS (
    SELECT
        DISTINCT person_id
        ,snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode
    WHERE
        snomedcode IN ('1078229009', '1004109000', '1002223009')
)
#  ## Age at first admission to psychiatric rehabilitation services
,tbl_admissionToPsychRehabServices_persons AS (
    SELECT
        DISTINCT person_id
        ,snomedcode
        ,dateevent
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode
    WHERE
        snomedcode IN ('306139004')
)
,tbl_ageAtFirstAdmissionToPsychRehabServices_persons AS (
    SELECT
        DISTINCT tbl_studyPopulation_no_caseness.person_id
        ,tbl_admissionToPsychRehabServices_persons.snomedcode
        ,MIN((EXTRACT(YEAR FROM tbl_admissionToPsychRehabServices_persons.dateevent) - tbl_studyPopulation_no_caseness.year_of_birth)) AS ageAtFirstAdmission
    FROM tbl_studyPopulation_no_caseness
    LEFT OUTER JOIN tbl_admissionToPsychRehabServices_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_admissionToPsychRehabServices_persons.person_id
    WHERE
        IS_NAN((EXTRACT(YEAR FROM tbl_admissionToPsychRehabServices_persons.dateevent) - tbl_studyPopulation_no_caseness.year_of_birth)) = FALSE
    GROUP BY
        tbl_studyPopulation_no_caseness.person_id, tbl_admissionToPsychRehabServices_persons.snomedcode
)

#  ## Incarceration or imprisonment
,tbl_incarcerationImprisonment AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_incarcerationImprisonment["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_incarcerationImprisonment_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_incarcerationImprisonment
    WHERE
        a.snomedcode IN (tbl_incarcerationImprisonment.snomedcode)
)
#  ## Metabolic syndrome
,tbl_metabolicSyndrome_persons AS (
    SELECT
        DISTINCT person_id
        ,snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode
    WHERE
        snomedcode IN ('237602007')
)
#  ## Sleep dysfunction
,tbl_sleepDysfunction_persons AS (
    SELECT
        DISTINCT person_id
        ,snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode
    WHERE
        snomedcode IN ('441877007', '442176004')
)
#  ## Count of appointments in the previous year.
,tbl_countAppointmentsPreviousYear_persons AS ( 
    SELECT 
        DISTINCT person_id
        ,COUNT( DISTINCT EXTRACT(DATE FROM datestart) ) AS countAppointmentsPreviousYear
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srappointment
    WHERE
        DATE_DIFF(CURRENT_DATE(), datestart, YEAR) <= 1
    GROUP BY
        person_id
    ORDER BY
        person_id
)
# ## Median annual count of appointments.
,tbl_annualCountOfAppointments AS (
    SELECT 
        DISTINCT person_id
        ,EXTRACT(YEAR FROM datestart) AS year_appointment
        ,COUNT( DISTINCT EXTRACT(DATE FROM datestart) ) AS countAppointmentsPerYear
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srappointment
    GROUP BY
        person_id
        ,year_appointment
    ORDER BY
        person_id
        ,year_appointment
)
,tbl_medianAnnualCountAppointments_persons AS (
    SELECT
        DISTINCT person_id
        ,PERCENTILE_DISC(countAppointmentsPerYear, 0.5) OVER(PARTITION BY person_id) AS medianAnnualCountAppointments
    FROM
        tbl_annualCountOfAppointments
    ORDER BY
        person_id
)
#  ## Count of Did-Not-Attend (DNA) in the previous year.
,tbl_DNAcodes AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_DNA["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_countDNAsPreviousYear_persons AS ( 
    SELECT 
      DISTINCT a.person_id
     ,COUNT(person_id) AS countDNAsPreviousYear
    FROM
      """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_DNAcodes
    WHERE
      a.snomedcode IN (tbl_DNAcodes.snomedcode)
      AND DATE_DIFF(CURRENT_DATE(), dateevent, YEAR) <= 1
    GROUP BY
        person_id
)
# ## Median annual count of Did-Not-Attend (DNA).
,tbl_annualCountOfDNAs AS ( 
    SELECT 
        DISTINCT a.person_id
        ,EXTRACT(YEAR FROM dateevent) AS year_DNA
        ,COUNT( DISTINCT EXTRACT(DATE FROM dateevent) ) AS countDNAsPerYear
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_DNAcodes
    WHERE
        a.snomedcode IN (tbl_DNAcodes.snomedcode)
    GROUP BY
        person_id
        ,year_DNA
)
,tbl_medianAnnualCountDNAs_persons AS (
    SELECT
        DISTINCT tbl_annualCountOfDNAs.person_id
        ,PERCENTILE_DISC(countDNAsPerYear, 0.5) OVER(PARTITION BY person_id) AS medianAnnualCountDNAs
    FROM
        tbl_annualCountOfDNAs
    ORDER BY
        person_id
)
# ## Ratio of annual counts of Did-Not-Attend (DNA) to appointment, in the previous year.
,tbl_ratioDNAtoAppointmentPreviousYear_persons AS (
    SELECT
        DISTINCT tbl_countDNAsPreviousYear_persons.person_id
        ,(countDNAsPreviousYear / countAppointmentsPreviousYear) AS ratioDNAtoAppointmentPreviousYear
    FROM
        tbl_countDNAsPreviousYear_persons
    LEFT OUTER JOIN tbl_countAppointmentsPreviousYear_persons ON tbl_countDNAsPreviousYear_persons.person_id = tbl_countAppointmentsPreviousYear_persons.person_id
        
)
# ## Median annual ratio of DNA to appointments
,tbl_medianAnnualRatioDNAtoAppointment_persons AS (
    SELECT
        DISTINCT tbl_annualCountOfDNAs.person_id
        ,PERCENTILE_DISC( (countDNAsPerYear / countAppointmentsPerYear), 0.5) OVER(PARTITION BY tbl_annualCountOfDNAs.person_id) AS medianAnnualRatioDNAtoAppointment
    FROM
        tbl_annualCountOfDNAs
    LEFT OUTER JOIN
        tbl_annualCountOfAppointments
        ON
        (
        tbl_annualCountOfDNAs.person_id = tbl_annualCountOfAppointments.person_id
        AND tbl_annualCountOfDNAs.year_DNA = tbl_annualCountOfAppointments.year_appointment
        )
)
# ## Trafficked person.
,tbl_trafficked_persons AS (
    SELECT
        DISTINCT person_id
        ,snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode
    WHERE
        snomedcode IN ('734998001')
)
# ## Tortured person.
,tbl_tortured_persons AS (
    SELECT
        DISTINCT person_id
        ,snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode
    WHERE
        snomedcode IN ('248006006', '95318007')
)
# ## Obsessive-compulsive disorder.
,tbl_OCD_persons AS (
    SELECT
        DISTINCT person_id
        ,snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode
    WHERE
        snomedcode IN ('191736004')
)
# ## Non-native English speaker.
,tbl_nonNativeEnglishSpeaker_persons AS (
    SELECT
        DISTINCT person_id
        ,snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode
    WHERE
        snomedcode IN ('161148002', '1047281000000107')
)
# ## Hoarder.
,tbl_hoarder_persons AS (
    SELECT
        DISTINCT person_id
        ,snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode
    WHERE
        snomedcode IN ('248025009', '247968005')
)
#  ## History of or current addiction.
,tbl_historyOfOrCurrentAddiction AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_historyOfOrCurrentAddiction["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_historyOfOrCurrentAddiction_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_historyOfOrCurrentAddiction
    WHERE
        a.snomedcode IN (tbl_historyOfOrCurrentAddiction.snomedcode)
)
# ## Family history of psychosis.
,tbl_familyHistoryOfPsychosis_persons AS (
    SELECT
        DISTINCT person_id
        ,snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode
    WHERE
       snomedcode IN ('266969002', '429399002', '293721000000105', '293731000000107')
)
# ## Family history of alcoholism.
,tbl_familyHistoryOfAlcoholism_persons AS (
    SELECT
        DISTINCT person_id
        ,snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode
    WHERE
       snomedcode IN ('266890009', '293161000000103')
)
# ## Extreme self-neglect.
,tbl_extremeSelfNeglect_persons AS (
    SELECT
        DISTINCT person_id
        ,snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode
    WHERE
       snomedcode IN ('277850002', '439124004', '735939003')
)
# ## Body dysmorphic disorder.
,tbl_bodyDysmorphicDisorder_persons AS (
    SELECT
        DISTINCT person_id
        ,snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode
    WHERE
       snomedcode IN ('83482000')
)
#  ## Asylum seeker.
,tbl_asylumSeeker AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_asylumSeeker["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_asylumSeeker_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_asylumSeeker
    WHERE
        a.snomedcode IN (tbl_asylumSeeker.snomedcode)
)
#  ## Raised in the care system.
,tbl_raisedInCareSystem AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_raisedInCareSystem["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_raisedInCareSystem_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_raisedInCareSystem
    WHERE
        a.snomedcode IN (tbl_raisedInCareSystem.snomedcode)
)
#  ## Brain injury
,tbl_brainInjury AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_brainInjury["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_brainInjury_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_brainInjury
    WHERE
        a.snomedcode IN (tbl_brainInjury.snomedcode)
)
#  ## Chaotic upbringing indicated by multiple instances of social services involvement.
,tbl_socialServicesInvolved AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_socialServicesInvolved["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_socialServicesInvolved_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
        ,a.dateevent
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_socialServicesInvolved
    WHERE
        a.snomedcode IN (tbl_socialServicesInvolved.snomedcode)
)
,tbl_socialServicesInvolvedCYP_countOfCodes_persons AS (
    SELECT
        DISTINCT tbl_socialServicesInvolved_persons.person_id
        ,COUNT(snomedcode) AS socialServicesInvolvedCYP_countOfCodes
    FROM tbl_studyPopulation_no_caseness
    LEFT OUTER JOIN tbl_socialServicesInvolved_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_socialServicesInvolved_persons.person_id
    WHERE (EXTRACT(YEAR FROM tbl_socialServicesInvolved_persons.dateevent) - tbl_studyPopulation_no_caseness.year_of_birth) < 19
    GROUP BY person_id
)
# ## Count of psychological disorders.
,tbl_countPsychologicalDisorders_persons_and_codes AS (
    SELECT
        DISTINCT tbl_studyPopulation_no_caseness.person_id
        ,tbl_srcode.snomedcode
    FROM
        tbl_studyPopulation_no_caseness
    JOIN
        """ + server_id + """.""" + database_id + """.tbl_srcode
        ON tbl_studyPopulation_no_caseness.person_id = tbl_srcode.person_id
    JOIN 
        tbl_codes_mentalIllHealth
        ON tbl_srcode.snomedcode = tbl_codes_mentalIllHealth.my_snomedcode
)
,tbl_countPsychologicalDisorders AS (
    SELECT
        person_id
        ,COUNT(snomedcode) AS countPsychologicalDisorders
    FROM
        tbl_countPsychologicalDisorders_persons_and_codes
    GROUP BY
        person_id
)
# ## Crisis contraception
,tbl_crisisContraception AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_crisisContraception["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_crisisContraception_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
        ,a.dateevent
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_crisisContraception
    WHERE
        a.snomedcode IN (tbl_crisisContraception.snomedcode)
)
,tbl_crisisContraceptionCYP_persons AS (
    SELECT
        DISTINCT tbl_crisisContraception_persons.person_id
    FROM tbl_studyPopulation_no_caseness
    LEFT OUTER JOIN tbl_crisisContraception_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_crisisContraception_persons.person_id
    WHERE (EXTRACT(YEAR FROM tbl_crisisContraception_persons.dateevent) - tbl_studyPopulation_no_caseness.year_of_birth) < 19
)
# ## School refusal
,tbl_schoolRefusal_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode
    WHERE
        snomedcode IN ('248052004')
)
#  ## Unprotected sexual intercourse
,tbl_UPSI AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_UPSI["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_UPSI_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
        ,a.dateevent
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_UPSI
    WHERE
        a.snomedcode IN (tbl_UPSI.snomedcode)
)
,tbl_UPSICYP_persons AS (
    SELECT
        DISTINCT tbl_UPSI_persons.person_id
    FROM tbl_studyPopulation_no_caseness
    LEFT OUTER JOIN tbl_UPSI_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_UPSI_persons.person_id
    WHERE (EXTRACT(YEAR FROM tbl_UPSI_persons.dateevent) - tbl_studyPopulation_no_caseness.year_of_birth) < 19
)
#  ## Teenage pregnancy
,tbl_teenagePregnancy AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_teenagePregnancy["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_teenagePregnancy_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
        ,a.dateevent
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_teenagePregnancy
    WHERE
        a.snomedcode IN (tbl_teenagePregnancy.snomedcode)
)
,tbl_teenagePregnancyCYP_persons AS (
    SELECT
        DISTINCT tbl_teenagePregnancy_persons.person_id
        ,snomedcode
    FROM tbl_studyPopulation_no_caseness
    LEFT OUTER JOIN tbl_teenagePregnancy_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_teenagePregnancy_persons.person_id
    WHERE (EXTRACT(YEAR FROM tbl_teenagePregnancy_persons.dateevent) - tbl_studyPopulation_no_caseness.year_of_birth) < 19
)
#  ## Attempted Suicide
,tbl_attemptedSuicide AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_attemptedSuicide["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_attemptedSuicide_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
        ,a.dateevent
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_attemptedSuicide
    WHERE
        a.snomedcode IN (tbl_attemptedSuicide.snomedcode)
)
,tbl_attemptedSuicideCYP_persons AS (
    SELECT
        DISTINCT tbl_attemptedSuicide_persons.person_id
        ,snomedcode
    FROM tbl_studyPopulation_no_caseness
    LEFT OUTER JOIN tbl_attemptedSuicide_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_attemptedSuicide_persons.person_id
    WHERE (EXTRACT(YEAR FROM tbl_attemptedSuicide_persons.dateevent) - tbl_studyPopulation_no_caseness.year_of_birth) < 19
)
#  ## Self harm
,tbl_selfHarm AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_selfHarm["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_selfHarmAdultpersons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
        ,a.dateevent
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_selfHarm
    WHERE
        a.snomedcode IN (tbl_selfHarm.snomedcode)
)
,tbl_selfHarmCYP_persons AS (
    SELECT
        DISTINCT tbl_selfHarmAdultpersons.person_id
        ,snomedcode
    FROM tbl_studyPopulation_no_caseness
    LEFT OUTER JOIN tbl_selfHarmAdultpersons ON tbl_studyPopulation_no_caseness.person_id = tbl_selfHarmAdultpersons.person_id
    WHERE (EXTRACT(YEAR FROM tbl_selfHarmAdultpersons.dateevent) - tbl_studyPopulation_no_caseness.year_of_birth) < 19
)
#  ## CAMHS referrals and Discharges.
,tbl_CAMHSrefsAndDisch AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_CAMHSrefsAndDisch["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_CAMHSrefsAndDisch_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_CAMHSrefsAndDisch
    WHERE
        a.snomedcode IN (tbl_CAMHSrefsAndDisch.snomedcode)
)
# ## IAPT revolving door.
,tbl_IAPTrevolvingDoor AS (
    SELECT
        DISTINCT person_id
        ,snomedcode
        ,COUNT(snomedcode) AS n_occurrences
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode
    WHERE
        snomedcode IN ('747821000000108')
    GROUP BY
        person_id, snomedcode
)
,tbl_IAPTrevolvingDoor_persons AS (
    SELECT
        DISTINCT person_id
        ,snomedcode
    FROM
        tbl_IAPTrevolvingDoor
    WHERE
        n_occurrences > 1
)
#  ## Substance misuse.
,tbl_substanceMisuse AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_substanceMisuse["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_substanceMisuse_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
        ,a.dateevent
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_substanceMisuse
    WHERE
        a.snomedcode IN (tbl_substanceMisuse.snomedcode)
)
,tbl_substanceMisuseAdult_persons AS (
    SELECT
        DISTINCT tbl_substanceMisuse_persons.person_id
        ,snomedcode
    FROM tbl_studyPopulation_no_caseness
    JOIN tbl_substanceMisuse_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_substanceMisuse_persons.person_id
    WHERE (EXTRACT(YEAR FROM tbl_substanceMisuse_persons.dateevent) - tbl_studyPopulation_no_caseness.year_of_birth) > 18
)
,tbl_substanceMisuseCYP_persons AS (
    SELECT
        DISTINCT tbl_substanceMisuse_persons.person_id
        ,snomedcode
    FROM tbl_studyPopulation_no_caseness
    JOIN tbl_substanceMisuse_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_substanceMisuse_persons.person_id
    WHERE (EXTRACT(YEAR FROM tbl_substanceMisuse_persons.dateevent) - tbl_studyPopulation_no_caseness.year_of_birth) < 19
)
#  ## Child or young person mental disorder
,tbl_CYPmentalDisorder AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_CYPmentalDisorder["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_CYPmentalDisorder_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_CYPmentalDisorder
    WHERE
        a.snomedcode IN (tbl_CYPmentalDisorder.snomedcode)
)
#  ## Child abuse
,tbl_childAbuse AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_childAbuse["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_childAbuse_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_childAbuse
    WHERE
        a.snomedcode IN (tbl_childAbuse.snomedcode)
)
#  ## Familial substance misuse
,tbl_familialSubstanceMisuse AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_familialSubstanceMisuse["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_familialSubstanceMisuse_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_familialSubstanceMisuse
    WHERE
        a.snomedcode IN (tbl_familialSubstanceMisuse.snomedcode)
)
#  ## Social services involved
,tbl_socialServicesInvolvedAdult_persons AS (
    SELECT
        DISTINCT tbl_socialServicesInvolved_persons.person_id
        ,snomedcode
    FROM tbl_studyPopulation_no_caseness
    LEFT OUTER JOIN tbl_socialServicesInvolved_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_socialServicesInvolved_persons.person_id
    WHERE (EXTRACT(YEAR FROM tbl_socialServicesInvolved_persons.dateevent) - tbl_studyPopulation_no_caseness.year_of_birth) > 18
)
,tbl_socialServicesInvolvedCYP_persons AS (
    SELECT
        DISTINCT tbl_socialServicesInvolved_persons.person_id
        ,snomedcode
    FROM tbl_studyPopulation_no_caseness
    LEFT OUTER JOIN tbl_socialServicesInvolved_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_socialServicesInvolved_persons.person_id
    WHERE (EXTRACT(YEAR FROM tbl_socialServicesInvolved_persons.dateevent) - tbl_studyPopulation_no_caseness.year_of_birth) < 19
)
#  ## Child protection
,tbl_childProtection AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_childProtection["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_childProtection_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_childProtection
    WHERE
        a.snomedcode IN (tbl_childProtection.snomedcode)
)
#  ## Family history of mental health conditions or suicide
,tbl_familialMentalHealthConditions AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_familialMentalHealthConditions["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_familialMentalHealthConditions_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_familialMentalHealthConditions
    WHERE
        a.snomedcode IN (tbl_familialMentalHealthConditions.snomedcode)
)
#  ## Child of single divorced or separated parents
,tbl_partedParents AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_partedParents["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_partedParents_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_partedParents
    WHERE
        a.snomedcode IN (tbl_partedParents.snomedcode)
)
#  ## Depression that is not dysthymia or chronic depression
,tbl_depressionNotDysthymiaOrChronic AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_depressionNotDysthymiaOrChronic["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_depressionNotDysthymiaOrChronic_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_depressionNotDysthymiaOrChronic
    WHERE
        a.snomedcode IN (tbl_depressionNotDysthymiaOrChronic.snomedcode)
)
#  ## IAPT use
,tbl_IAPTuse AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_IAPTuse["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_IAPTuse_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_IAPTuse
    WHERE
        a.snomedcode IN (tbl_IAPTuse.snomedcode)
)
#  ## Anxiety or panic
,tbl_anxietyOrPanic AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_anxietyOrPanic["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_anxietyOrPanic_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_anxietyOrPanic
    WHERE
        a.snomedcode IN (tbl_anxietyOrPanic.snomedcode)
)
#  ## Agorophobia
,tbl_agorophobia AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_agorophobia["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_agorophobia_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_agorophobia
    WHERE
        a.snomedcode IN (tbl_agorophobia.snomedcode)
)
#  ## Eating disorders
,tbl_eatingDisorders AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_eatingDisorders["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_eatingDisorders_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_eatingDisorders
    WHERE
        a.snomedcode IN (tbl_eatingDisorders.snomedcode)
)
#  ## poorBodyImage
,tbl_poorBodyImage AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_poorBodyImage["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_poorBodyImage_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_poorBodyImage
    WHERE
        a.snomedcode IN (tbl_poorBodyImage.snomedcode)
)
#  ## Child or young person neurodevelopmental disorder
,tbl_CYPneurodevDisorder AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_CYPneurodevDisorder["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_CYPneurodevDisorder_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_CYPneurodevDisorder
    WHERE
        a.snomedcode IN (tbl_CYPneurodevDisorder.snomedcode)
)
#  ## ADHD
,tbl_ADHD AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_ADHD["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_ADHD_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_ADHD
    WHERE
        a.snomedcode IN (tbl_ADHD.snomedcode)
)
#  ## Autism
,tbl_autism AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_autism["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_autism_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_autism
    WHERE
        a.snomedcode IN (tbl_autism.snomedcode)
)
#  ## Special learning difficulties
,tbl_SpLD AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_SpLD["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_SpLD_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_SpLD
    WHERE
        a.snomedcode IN (tbl_SpLD.snomedcode)
)
#  ## adultVictimOfAbuse
,tbl_adultVictimOfAbuse AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_adultVictimOfAbuse["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_adultVictimOfAbuse_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_adultVictimOfAbuse
    WHERE
        a.snomedcode IN (tbl_adultVictimOfAbuse.snomedcode)
)
#  ## Many Did-Not-Attend (DNA)
,tbl_manyDNA AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_DNA["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_manyDNA_count AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
        ,EXTRACT(YEAR FROM dateevent) AS year_occurrence
        ,COUNT(person_id) AS n_occurrences
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_manyDNA
    WHERE
        a.snomedcode IN (tbl_manyDNA.snomedcode)
    GROUP BY
        person_id, a.snomedcode, year_occurrence
)
,tbl_manyDNA_persons AS (
    SELECT
        DISTINCT person_id
        ,snomedcode
    FROM
        tbl_manyDNA_count
    WHERE
        # Justification for threshold is in section entitled "Rational for the threshold we used for 'manyDNA'".
        n_occurrences > 4
)
#  ## violentBehaviour
,tbl_violentBehaviour AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_violentBehaviour["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_violentBehaviour_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_violentBehaviour
    WHERE
        a.snomedcode IN (tbl_violentBehaviour.snomedcode)
)
#  ## IAPT referral
,tbl_IAPTreferral AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_IAPTreferral["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_IAPTreferral_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_IAPTreferral
    WHERE
        a.snomedcode IN (tbl_IAPTreferral.snomedcode)
)
#  ## Paranoia
,tbl_paranoia AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_paranoia["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_paranoia_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_paranoia
    WHERE
        a.snomedcode IN (tbl_paranoia.snomedcode)
)
#  ## Auditory hallucinations
,tbl_auditoryHallucinations AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_auditoryHallucinations["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_auditoryHallucinations_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_auditoryHallucinations
    WHERE
        a.snomedcode IN (tbl_auditoryHallucinations.snomedcode)
)
#  ## Dissociation
,tbl_dissociation AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_dissociation["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_dissociation_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_dissociation
    WHERE
        a.snomedcode IN (tbl_dissociation.snomedcode)
)
#  ## Non-epileptic siezure attack (depreceated: pseudoseizure), and somatic pain.
,tbl_NESA AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_NESA["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_NESA_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_NESA
    WHERE
        a.snomedcode IN (tbl_NESA.snomedcode)
)
#  ## Alcohol misuse.
,tbl_meds_alcoholMisuse AS (
    SELECT
        my_nameofmedication
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, medications_to_query_alcoholMisuse["Medication"].tolist())) + """'
                ]) AS my_nameofmedication
)
,tbl_codes_alcoholMisuse AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_alcoholMisuse["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_meds_alcoholMisuse_persons AS (
    SELECT
      DISTINCT person_id
    FROM
      """ + server_id + """.""" + database_id + """.tbl_srprimarycaremedication, tbl_meds_alcoholMisuse
    WHERE
        nameofmedication LIKE CAST(CONCAT('%',tbl_meds_alcoholMisuse.my_nameofmedication,'%') AS STRING)
        AND DATE_DIFF(CURRENT_DATE(), CAST(datemedicationstart AS DATE), MONTH) < 4
)
,tbl_codes_alcoholMisuse_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_codes_alcoholMisuse
    WHERE
        a.snomedcode IN (tbl_codes_alcoholMisuse.snomedcode)
)
,tbl_alcoholMisuse_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        (SELECT * FROM tbl_meds_alcoholMisuse_persons
        UNION ALL
        SELECT * FROM tbl_codes_alcoholMisuse_persons)
)
#  ## Drug misuse.
,tbl_meds_drugMisuse AS (
    SELECT
        my_nameofmedication
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, medications_to_query_drugMisuse["Medication"].tolist())) + """'
                ]) AS my_nameofmedication
)
,tbl_codes_drugMisuse AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_drugMisuse["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_meds_drugMisuse_persons AS (
    SELECT
      DISTINCT person_id
    FROM
      """ + server_id + """.""" + database_id + """.tbl_srprimarycaremedication, tbl_meds_drugMisuse
    WHERE
        nameofmedication LIKE CAST(CONCAT('%',tbl_meds_drugMisuse.my_nameofmedication,'%') AS STRING)
        AND DATE_DIFF(CURRENT_DATE(), CAST(datemedicationstart AS DATE), MONTH) < 4
)
,tbl_codes_drugMisuse_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_codes_drugMisuse
    WHERE
        a.snomedcode IN (tbl_codes_drugMisuse.snomedcode)
)
,tbl_drugMisuse_persons AS (
    SELECT
        DISTINCT person_id
    FROM
        (SELECT * FROM tbl_meds_drugMisuse_persons
        UNION ALL
        SELECT * FROM tbl_codes_drugMisuse_persons)
)
"""

### Defining sql_final_select

In [5]:
sql_final_select = \
"""
SELECT
    DISTINCT tbl_studyPopulation_no_caseness.person_id
    ,CASE WHEN tbl_homeless_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS homeless
    ,CASE WHEN tbl_poverty_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS poverty
    ,CASE WHEN tbl_sleepDisturbance_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS sleepDisturbance
    ,CASE WHEN tbl_suicidal_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS suicidal
    ,CASE WHEN tbl_tinnitus_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS tinnitus
    ,CASE WHEN tbl_foodInsecurity.person_id IS NULL THEN FALSE ELSE TRUE END AS foodInsecurity
    ,CASE WHEN tbl_ageAtFirstAdmissionToPsychRehabServices_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS ageAtFirstAdmissionToPsychRehabServices
    ,CASE WHEN tbl_incarcerationImprisonment_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS incarcerationImprisonment
    ,CASE WHEN tbl_metabolicSyndrome_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS metabolicSyndrome
    ,CASE WHEN tbl_sleepDysfunction_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS sleepDysfunction
    ,countAppointmentsPreviousYear
    ,medianAnnualCountAppointments
    ,countDNAsPreviousYear
    ,medianAnnualCountDNAs
    ,ratioDNAtoAppointmentPreviousYear
    ,medianAnnualRatioDNAtoAppointment
    ,CASE WHEN tbl_trafficked_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS trafficked
    ,CASE WHEN tbl_tortured_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS tortured
    ,CASE WHEN tbl_OCD_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS OCD
    ,CASE WHEN tbl_nonNativeEnglishSpeaker_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS nonNativeEnglishSpeaker
    ,CASE WHEN tbl_hoarder_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS hoarder
    ,CASE WHEN tbl_historyOfOrCurrentAddiction_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS historyOfOrCurrentAddiction
    ,CASE WHEN tbl_familyHistoryOfPsychosis_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS familyHistoryOfPsychosis
    ,CASE WHEN tbl_familyHistoryOfAlcoholism_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS familyHistoryOfAlcoholism
    ,CASE WHEN tbl_extremeSelfNeglect_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS extremeSelfNeglect
    ,CASE WHEN tbl_bodyDysmorphicDisorder_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS bodyDysmorphicDisorder
    ,CASE WHEN tbl_raisedInCareSystem_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS raisedInCareSystem
    ,CASE WHEN tbl_brainInjury_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS brainInjury
    ,socialServicesInvolvedCYP_countOfCodes
    ,countPsychologicalDisorders
    ,CASE WHEN tbl_schoolRefusal_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS schoolRefusal
    ,CASE WHEN tbl_crisisContraceptionCYP_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS crisisContraceptionCYP
    ,CASE WHEN tbl_UPSI_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS UPSI
    ,CASE WHEN tbl_UPSICYP_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS UPSICYP
    ,CASE WHEN tbl_teenagePregnancy_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS teenagePregnancy
    ,CASE WHEN tbl_attemptedSuicide_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS attemptedSuicide
    ,CASE WHEN tbl_attemptedSuicideCYP_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS attemptedSuicideCYP
    ,CASE WHEN tbl_selfHarmAdultpersons.person_id IS NULL THEN FALSE ELSE TRUE END AS selfHarmAdult
    ,CASE WHEN tbl_selfHarmCYP_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS selfHarmCYP
    ,CASE WHEN tbl_CAMHSrefsAndDisch_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS CAMHSrefsAndDisch
    ,CASE WHEN tbl_IAPTrevolvingDoor_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS IAPTrevolvingDoor
    ,CASE WHEN tbl_substanceMisuseAdult_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS substanceMisuseAdult
    ,CASE WHEN tbl_substanceMisuseCYP_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS substanceMisuseCYP
    ,CASE WHEN tbl_CYPmentalDisorder_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS CYPmentalDisorder
    ,CASE WHEN tbl_childAbuse_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS childAbuse
    ,CASE WHEN tbl_familialSubstanceMisuse_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS familialSubstanceMisuse
    ,CASE WHEN tbl_socialServicesInvolvedAdult_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS socialServicesInvolvedAdult
    ,CASE WHEN tbl_socialServicesInvolvedCYP_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS socialServicesInvolvedCYP
    ,CASE WHEN tbl_childProtection_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS childProtection
    ,CASE WHEN tbl_familialMentalHealthConditions_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS familialMentalHealthConditions
    ,CASE WHEN tbl_partedParents_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS partedParents
    ,CASE WHEN tbl_depressionNotDysthymiaOrChronic_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS depressionNotDysthymiaOrChronic
    ,CASE WHEN tbl_IAPTuse_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS IAPTuse
    ,CASE WHEN tbl_anxietyOrPanic_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS anxietyOrPanic
    ,CASE WHEN tbl_agorophobia_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS agorophobia
    ,CASE WHEN tbl_eatingDisorders_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS eatingDisorders
    ,CASE WHEN tbl_poorBodyImage_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS poorBodyImage
    ,CASE WHEN tbl_CYPneurodevDisorder_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS CYPneurodevDisorder
    ,CASE WHEN tbl_ADHD_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS ADHD
    ,CASE WHEN tbl_autism_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS autism
    ,CASE WHEN tbl_SpLD_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS SpLD
    ,CASE WHEN tbl_adultVictimOfAbuse_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS adultVictimOfAbuse
    ,CASE WHEN tbl_manyDNA_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS manyDNA
    ,CASE WHEN tbl_violentBehaviour_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS violentBehaviour
    ,CASE WHEN tbl_IAPTreferral_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS IAPTreferral
    ,CASE WHEN tbl_paranoia_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS paranoia
    ,CASE WHEN tbl_auditoryHallucinations_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS auditoryHallucinations
    ,CASE WHEN tbl_dissociation_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS dissociation
    ,CASE WHEN tbl_NESA_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS NESA
    ,CASE WHEN tbl_alcoholMisuse_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS alcoholMisuse
    ,CASE WHEN tbl_drugMisuse_persons.person_id IS NULL THEN FALSE ELSE TRUE END AS drugMisuse
    
FROM tbl_studyPopulation_no_caseness

LEFT OUTER JOIN tbl_homeless_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_homeless_persons.person_id
LEFT OUTER JOIN tbl_poverty_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_poverty_persons.person_id
LEFT OUTER JOIN tbl_sleepDisturbance_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_sleepDisturbance_persons.person_id
LEFT OUTER JOIN tbl_suicidal_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_suicidal_persons.person_id
LEFT OUTER JOIN tbl_tinnitus_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_tinnitus_persons.person_id
LEFT OUTER JOIN tbl_foodInsecurity ON tbl_studyPopulation_no_caseness.person_id = tbl_foodInsecurity.person_id
LEFT OUTER JOIN tbl_ageAtFirstAdmissionToPsychRehabServices_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_ageAtFirstAdmissionToPsychRehabServices_persons.person_id
LEFT OUTER JOIN tbl_incarcerationImprisonment_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_incarcerationImprisonment_persons.person_id
LEFT OUTER JOIN tbl_metabolicSyndrome_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_metabolicSyndrome_persons.person_id
LEFT OUTER JOIN tbl_sleepDysfunction_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_sleepDysfunction_persons.person_id
LEFT OUTER JOIN tbl_countAppointmentsPreviousYear_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_countAppointmentsPreviousYear_persons.person_id
LEFT OUTER JOIN tbl_medianAnnualCountAppointments_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_medianAnnualCountAppointments_persons.person_id
LEFT OUTER JOIN tbl_countDNAsPreviousYear_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_countDNAsPreviousYear_persons.person_id
LEFT OUTER JOIN tbl_medianAnnualCountDNAs_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_medianAnnualCountDNAs_persons.person_id
LEFT OUTER JOIN tbl_ratioDNAtoAppointmentPreviousYear_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_ratioDNAtoAppointmentPreviousYear_persons.person_id
LEFT OUTER JOIN tbl_medianAnnualRatioDNAtoAppointment_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_medianAnnualRatioDNAtoAppointment_persons.person_id
LEFT OUTER JOIN tbl_trafficked_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_trafficked_persons.person_id
LEFT OUTER JOIN tbl_tortured_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_tortured_persons.person_id
LEFT OUTER JOIN tbl_OCD_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_OCD_persons.person_id
LEFT OUTER JOIN tbl_nonNativeEnglishSpeaker_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_nonNativeEnglishSpeaker_persons.person_id
LEFT OUTER JOIN tbl_hoarder_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_hoarder_persons.person_id
LEFT OUTER JOIN tbl_historyOfOrCurrentAddiction_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_historyOfOrCurrentAddiction_persons.person_id
LEFT OUTER JOIN tbl_familyHistoryOfPsychosis_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_familyHistoryOfPsychosis_persons.person_id
LEFT OUTER JOIN tbl_familyHistoryOfAlcoholism_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_familyHistoryOfAlcoholism_persons.person_id
LEFT OUTER JOIN tbl_extremeSelfNeglect_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_extremeSelfNeglect_persons.person_id
LEFT OUTER JOIN tbl_bodyDysmorphicDisorder_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_bodyDysmorphicDisorder_persons.person_id
LEFT OUTER JOIN tbl_asylumSeeker_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_asylumSeeker_persons.person_id
LEFT OUTER JOIN tbl_raisedInCareSystem_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_raisedInCareSystem_persons.person_id
LEFT OUTER JOIN tbl_brainInjury_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_brainInjury_persons.person_id
LEFT OUTER JOIN tbl_socialServicesInvolvedCYP_countOfCodes_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_socialServicesInvolvedCYP_countOfCodes_persons.person_id
LEFT OUTER JOIN tbl_countPsychologicalDisorders  ON tbl_studyPopulation_no_caseness.person_id = tbl_countPsychologicalDisorders.person_id
LEFT OUTER JOIN tbl_schoolRefusal_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_schoolRefusal_persons.person_id
LEFT OUTER JOIN tbl_crisisContraceptionCYP_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_crisisContraceptionCYP_persons.person_id
LEFT OUTER JOIN tbl_UPSI_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_UPSI_persons.person_id
LEFT OUTER JOIN tbl_UPSICYP_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_UPSICYP_persons.person_id
LEFT OUTER JOIN tbl_teenagePregnancy_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_teenagePregnancy_persons.person_id
LEFT OUTER JOIN tbl_attemptedSuicide_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_attemptedSuicide_persons.person_id
LEFT OUTER JOIN tbl_attemptedSuicideCYP_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_attemptedSuicideCYP_persons.person_id
LEFT OUTER JOIN tbl_selfHarmAdultpersons ON tbl_studyPopulation_no_caseness.person_id = tbl_selfHarmAdultpersons.person_id
LEFT OUTER JOIN tbl_selfHarmCYP_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_selfHarmCYP_persons.person_id
LEFT OUTER JOIN tbl_CAMHSrefsAndDisch_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_CAMHSrefsAndDisch_persons.person_id
LEFT OUTER JOIN tbl_IAPTrevolvingDoor_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_IAPTrevolvingDoor_persons.person_id
LEFT OUTER JOIN tbl_substanceMisuseAdult_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_substanceMisuseAdult_persons.person_id
LEFT OUTER JOIN tbl_substanceMisuseCYP_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_substanceMisuseCYP_persons.person_id
LEFT OUTER JOIN tbl_CYPmentalDisorder_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_CYPmentalDisorder_persons.person_id
LEFT OUTER JOIN tbl_childAbuse_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_childAbuse_persons.person_id
LEFT OUTER JOIN tbl_familialSubstanceMisuse_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_familialSubstanceMisuse_persons.person_id
LEFT OUTER JOIN tbl_socialServicesInvolvedAdult_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_socialServicesInvolvedAdult_persons.person_id
LEFT OUTER JOIN tbl_socialServicesInvolvedCYP_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_socialServicesInvolvedCYP_persons.person_id
LEFT OUTER JOIN tbl_childProtection_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_childProtection_persons.person_id
LEFT OUTER JOIN tbl_familialMentalHealthConditions_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_familialMentalHealthConditions_persons.person_id
LEFT OUTER JOIN tbl_partedParents_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_partedParents_persons.person_id
LEFT OUTER JOIN tbl_depressionNotDysthymiaOrChronic_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_depressionNotDysthymiaOrChronic_persons.person_id
LEFT OUTER JOIN tbl_IAPTuse_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_IAPTuse_persons.person_id
LEFT OUTER JOIN tbl_anxietyOrPanic_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_anxietyOrPanic_persons.person_id
LEFT OUTER JOIN tbl_agorophobia_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_agorophobia_persons.person_id
LEFT OUTER JOIN tbl_eatingDisorders_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_eatingDisorders_persons.person_id
LEFT OUTER JOIN tbl_poorBodyImage_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_poorBodyImage_persons.person_id
LEFT OUTER JOIN tbl_CYPneurodevDisorder_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_CYPneurodevDisorder_persons.person_id
LEFT OUTER JOIN tbl_ADHD_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_ADHD_persons.person_id
LEFT OUTER JOIN tbl_autism_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_autism_persons.person_id
LEFT OUTER JOIN tbl_SpLD_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_SpLD_persons.person_id
LEFT OUTER JOIN tbl_adultVictimOfAbuse_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_adultVictimOfAbuse_persons.person_id
LEFT OUTER JOIN tbl_manyDNA_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_manyDNA_persons.person_id
LEFT OUTER JOIN tbl_violentBehaviour_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_violentBehaviour_persons.person_id
LEFT OUTER JOIN tbl_IAPTreferral_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_IAPTreferral_persons.person_id
LEFT OUTER JOIN tbl_paranoia_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_paranoia_persons.person_id
LEFT OUTER JOIN tbl_auditoryHallucinations_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_auditoryHallucinations_persons.person_id
LEFT OUTER JOIN tbl_dissociation_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_dissociation_persons.person_id
LEFT OUTER JOIN tbl_NESA_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_NESA_persons.person_id
LEFT OUTER JOIN tbl_alcoholMisuse_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_alcoholMisuse_persons.person_id
LEFT OUTER JOIN tbl_drugMisuse_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_drugMisuse_persons.person_id

ORDER BY tbl_studyPopulation_no_caseness.person_id
"""

### Running the full query

In [22]:
feature_set_array = client.query(sql_declarations + sql_studyPopulation + sql_CTEs_body + sql_final_select).to_dataframe()

## Add feature sets that are defined by others

Some feature sets aren't defined directly by querying the patient's electronic health records. Instead, they require additional processing to define.

### Mental health treatments

#### Relevant prescriptions.

In [25]:
sql_select = \
"""
SELECT
    DISTINCT tbl_srprimarycaremedication.person_id
    ,1 AS relevantPrescriptions
FROM
    """ + server_id + """.""" + database_id + """.tbl_srprimarycaremedication
    # This CROSS JOIN conveniently creates all possible combinations of values of the
    # tbl_srprimarycaremedication table and my `tbl_medications`. This sets up my interim result to 
    # easily do a row-wise comparison of the medications of interest with the variously-
    # worded `nameofmedication` values in the database.
CROSS JOIN
    tbl_medications
WHERE
    # This filters for the medications of interest.
    REGEXP_CONTAINS(nameofmedication, tbl_medications.my_nameofmedication) = True
    AND
    CAST(isrepeatmedication AS BOOL) IS TRUE
    AND
    DATE_DIFF(myIndexDate, CAST(tbl_srprimarycaremedication.datemedicationstart AS DATE), MONTH) < 3
"""

fs_relevantPrescriptions = client.query(sql_declarations + sql_studyPopulation + sql_select).to_dataframe()
fs_relevantPrescriptions.relevantPrescriptions = fs_relevantPrescriptions.relevantPrescriptions.astype(bool)
# Join relevantPrescriptions
feature_set_array = \
    pandas.merge(feature_set_array,
                 fs_relevantPrescriptions[['person_id',
                                           'relevantPrescriptions']],
                 on = 'person_id',
                 how = 'left')

In [84]:
# MentalHealthTratments
feature_set_array['MentalHealthTreatments'] = \
    feature_set_array[['CAMHSrefsAndDisch',
                  'IAPTrevolvingDoor',
                  'substanceMisuseAdult',
                  'relevantPrescriptions',
                  ]].any(True)

### Young person, mental health concerns

In [85]:
# CYPmentalHealthConcern
feature_set_array['CYPmentalHealthConcern'] = \
    feature_set_array[['attemptedSuicideCYP',
                  'selfHarmCYP',
                  'CAMHSrefsAndDisch']].any(True)

### Family concerns

In [86]:
# FamilyConcerns
feature_set_array['FamilyConcerns'] = \
    feature_set_array[['familialSubstanceMisuse',
                  'socialServicesInvolvedAdult',
                  'childProtection',
                  'familialMentalHealthConditions',
                  'partedParents']].any(True)

### Adulthood concerns

In [87]:
# AdulthoodConcerns
feature_set_array['AdulthoodConcerns'] = \
    feature_set_array[['adultVictimOfAbuse',
                  'substanceMisuseAdult']].any(True)

### Access to healthcare

#### Recurrent ED attendance

In [88]:
sql_CTEs_body = \
"""
,tbl_visit_date_and_lag_date AS (
  SELECT
    person_id
    ,tbl_ae_start_date AS visit_date
    ,LAG(tbl_ae_start_date) OVER(PARTITION BY person_id ORDER BY tbl_ae_start_date) AS lag_visit_date
  FROM
    `""" + server_id + """.CB_FDM_Warehouse_V2.tbl_ae`
)
,tbl_cnt_annual_AE_attendence AS (
  SELECT
    person_id
    ,COUNT(person_id) AS cnt_annual_AE_attendence
  FROM
    tbl_visit_date_and_lag_date
  WHERE
    DATE_DIFF(visit_date, lag_visit_date, DAY) BETWEEN 1 AND 365
  GROUP BY
    person_id
)
"""
sql_select = \
"""
SELECT
  person_id
  ,cnt_annual_AE_attendence
  ,CASE
    WHEN cnt_annual_AE_attendence > 20 THEN 2
    WHEN cnt_annual_AE_attendence > 4 THEN 1
    ELSE 0
  END AS recurrentEDattednances
FROM
  tbl_cnt_annual_AE_attendence
ORDER BY
  person_id
"""

fs_recurrentEDattednances = client.query(sql_declarations + sql_studyPopulation + sql_CTEs_body + sql_select).to_dataframe()

# Join recurrentEDattednances.
feature_set_array = \
    pandas.merge(feature_set_array,
                 fs_recurrentEDattednances[['person_id',
                                     'recurrentEDattednances']],
                 on = 'person_id',
                 how = 'left')

In [89]:
# AccessToHealthcare
feature_set_array['AccessToHealthcare'] = \
    feature_set_array[['manyDNA',
                  'violentBehaviour',
                  'recurrentEDattednances']].any(True)

### Count of antidepressants prescriptions, in the three years before the index date

In [90]:
sql_select = \
"""
,tbl_antidepressants AS (
    SELECT
        my_nameofmedication
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, medications_to_query_antidepressants["Medication"].tolist())) + """'
                ]) AS my_nameofmedication
)
,tbl_AntidepressantPrescriptions_persons AS (
    SELECT
        DISTINCT person_id
        ,datemedicationstart
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srprimarycaremedication
        # This CROSS JOIN conveniently creates all possible combinations of values of the
        # tbl_srprimarycaremedication table and my `tbl_antidepressants`. This sets up my interim result to 
        # easily do a row-wise comparison of the medications of interest with the variously-
        # worded `nameofmedication` values in the database.
    CROSS JOIN
        tbl_antidepressants
    WHERE
        CAST(tbl_srprimarycaremedication.isrepeatmedication AS BOOL) IS TRUE
        AND
        # This filters for the medications of interest.
        REGEXP_CONTAINS(tbl_srprimarycaremedication.nameofmedication, tbl_antidepressants.my_nameofmedication) = True
        AND
        DATE_DIFF(myIndexDate, CAST(tbl_srprimarycaremedication.datemedicationstart AS DATE), YEAR) < 3
)
SELECT
  DISTINCT person_id
  ,COUNT(datemedicationstart) AS countAntidepressantPrescriptions
FROM
  tbl_AntidepressantPrescriptions_persons
GROUP BY
    person_id
ORDER BY
    person_id
"""

fs_countAntidepressantPrescriptions = client.query(sql_declarations + sql_studyPopulation + sql_select).to_dataframe()

# Join countAntidepressantPrescriptions
feature_set_array = \
    pandas.merge(feature_set_array,
                 fs_countAntidepressantPrescriptions[
                     ['person_id',
                      'countAntidepressantPrescriptions']],
                 on = 'person_id',
                 how = 'left')

### Count of unique antidepressant medications, in the three years before the index date

In [91]:
sql_select = \
"""
,tbl_antidepressants AS (
    SELECT
        my_nameofmedication
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, medications_to_query_antidepressants["Medication"].tolist())) + """'
                ]) AS my_nameofmedication
)
,tbl_uniqueAntidepressants_persons AS (
    SELECT
        DISTINCT person_id
        ,my_nameofmedication
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srprimarycaremedication
        # This CROSS JOIN conveniently creates all possible combinations of values of the
        # tbl_srprimarycaremedication table and my `tbl_antidepressants`. This sets up my interim result to 
        # easily do a row-wise comparison of the medications of interest with the variously-
        # worded `nameofmedication` values in the database.
    CROSS JOIN
        tbl_antidepressants
    WHERE
        CAST(tbl_srprimarycaremedication.isrepeatmedication AS BOOL) IS TRUE
        AND
        # This filters for the medications of interest.
        REGEXP_CONTAINS(tbl_srprimarycaremedication.nameofmedication, tbl_antidepressants.my_nameofmedication) = True
        AND
        DATE_DIFF(myIndexDate, CAST(tbl_srprimarycaremedication.datemedicationstart AS DATE), YEAR) < 3
)
SELECT
  DISTINCT person_id
  ,COUNT(my_nameofmedication) AS countUniqueAntidepressants
FROM
  tbl_uniqueAntidepressants_persons
GROUP BY
    person_id
ORDER BY
    person_id
"""

fs_countUniqueAntidepressants = client.query(sql_declarations + sql_studyPopulation + sql_select).to_dataframe()

# Join countUniqueAntidepressants.
feature_set_array = \
    pandas.merge(feature_set_array,
                 fs_countUniqueAntidepressants[
                     ['person_id',
                      'countUniqueAntidepressants']],
                 on = 'person_id',
                 how = 'left')

### Count of hyponotics or anxiolytics prescriptions, in the three years before the index date

In [92]:
sql_select = \
"""
,tbl_hypnoticsAndAnxiolytics AS (
    SELECT
        my_nameofmedication
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, medications_to_query_hypnoticsAndAnxiolytics["Medication"].tolist())) + """'
                ]) AS my_nameofmedication
)
,tbl_hypnoticsAndAnxiolyticsPrescriptions_persons AS (
    SELECT
        DISTINCT person_id
        ,datemedicationstart
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srprimarycaremedication
        # This CROSS JOIN conveniently creates all possible combinations of values of the
        # tbl_srprimarycaremedication table and my `tbl_hypnoticsAndAnxiolytics`. This sets up my interim result to 
        # easily do a row-wise comparison of the medications of interest with the variously-
        # worded `nameofmedication` values in the database.
    CROSS JOIN
        tbl_hypnoticsAndAnxiolytics
    WHERE
        CAST(tbl_srprimarycaremedication.isrepeatmedication AS BOOL) IS TRUE
        AND
        # This filters for the medications of interest.
        REGEXP_CONTAINS(tbl_srprimarycaremedication.nameofmedication, tbl_hypnoticsAndAnxiolytics.my_nameofmedication) = True
        AND
        DATE_DIFF(myIndexDate, CAST(tbl_srprimarycaremedication.datemedicationstart AS DATE), YEAR) < 3
)
SELECT
  DISTINCT person_id
  ,COUNT(datemedicationstart) AS countHypnoticsAndAnxiolyticsPrescriptions
FROM
  tbl_hypnoticsAndAnxiolyticsPrescriptions_persons
GROUP BY
    person_id
ORDER BY
    person_id
"""

fs_countHypnoticsAndAnxiolyticsPrescriptionss = client.query(sql_declarations + sql_studyPopulation + sql_select).to_dataframe()

# Join countHypnoticsAndAnxiolyticsPrescriptions
feature_set_array = \
    pandas.merge(feature_set_array,
                 fs_countHypnoticsAndAnxiolyticsPrescriptionss[
                     ['person_id',
                      'countHypnoticsAndAnxiolyticsPrescriptions']],
                 on = 'person_id',
                 how = 'left')

### Count of unique hyponotic and anxiolytic medications, in the three years before the index date

In [93]:
sql_select = \
"""
,tbl_hypnoticsAndAnxiolytics AS (
    SELECT
        my_nameofmedication
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, medications_to_query_hypnoticsAndAnxiolytics["Medication"].tolist())) + """'
                ]) AS my_nameofmedication
)
,tbl_uniquehypnoticsAndAnxiolytics_persons AS (
    SELECT
        DISTINCT person_id
        ,my_nameofmedication
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srprimarycaremedication
        # This CROSS JOIN conveniently creates all possible combinations of values of the
        # tbl_srprimarycaremedication table and my `tbl_hypnoticsAndAnxiolytics`. This sets up my interim result to 
        # easily do a row-wise comparison of the medications of interest with the variously-
        # worded `nameofmedication` values in the database.
    CROSS JOIN
        tbl_hypnoticsAndAnxiolytics
    WHERE
        CAST(tbl_srprimarycaremedication.isrepeatmedication AS BOOL) IS TRUE
        AND
        # This filters for the medications of interest.
        REGEXP_CONTAINS(tbl_srprimarycaremedication.nameofmedication, tbl_hypnoticsAndAnxiolytics.my_nameofmedication) = True
        AND
        DATE_DIFF(myIndexDate, CAST(tbl_srprimarycaremedication.datemedicationstart AS DATE), YEAR) < 3
)
SELECT
  DISTINCT person_id
  ,COUNT(my_nameofmedication) AS countUniqueHypnoticsAndAnxiolytics
FROM
  tbl_uniquehypnoticsAndAnxiolytics_persons
GROUP BY
    person_id
ORDER BY
    person_id
"""

fs_countUniqueHypnoticsAndAnxiolytics = client.query(sql_declarations + sql_studyPopulation + sql_select).to_dataframe()

# Join countUniqueHypnoticsAndAnxiolytics.
feature_set_array = \
    pandas.merge(feature_set_array,
                 fs_countUniqueHypnoticsAndAnxiolytics[
                     ['person_id',
                      'countUniqueHypnoticsAndAnxiolytics']],
                 on = 'person_id',
                 how = 'left')

### Antipsychotic prescriptions, in the three years before the index date

In [94]:
sql_select = \
"""
,tbl_antipsychotics AS (
    SELECT
        my_nameofmedication
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, medications_to_query_psychosisAndRelated["Medication"].tolist())) + """'
                ]) AS my_nameofmedication
)
SELECT
    DISTINCT person_id
    ,1 AS antipsychoticsPrescription
FROM
    """ + server_id + """.""" + database_id + """.tbl_srprimarycaremedication
    # This CROSS JOIN conveniently creates all possible combinations of values of the
    # tbl_srprimarycaremedication table and my `tbl_antipsychotics`. This sets up my interim result to 
    # easily do a row-wise comparison of the medications of interest with the variously-
    # worded `nameofmedication` values in the database.
CROSS JOIN
    tbl_antipsychotics
WHERE
    CAST(tbl_srprimarycaremedication.isrepeatmedication AS BOOL) IS TRUE
    AND
    # This filters for the medications of interest.
    REGEXP_CONTAINS(tbl_srprimarycaremedication.nameofmedication, tbl_antipsychotics.my_nameofmedication) = True
    AND
    DATE_DIFF(myIndexDate, CAST(tbl_srprimarycaremedication.datemedicationstart AS DATE), YEAR) < 3
"""

fs_antipsychoticsPrescription = client.query(sql_declarations + sql_studyPopulation + sql_select).to_dataframe()

# Join antipsychoticsPrescription
feature_set_array = \
    pandas.merge(feature_set_array,
                 fs_antipsychoticsPrescription[
                     ['person_id',
                      'antipsychoticsPrescription']],
                 on = 'person_id',
                 how = 'left')

### Count of aborted medication regimes

In [95]:
sql_select = \
"""
,tbl_antidepressants AS (
    SELECT
        my_nameofmedication
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, medications_to_query_antidepressants["Medication"].tolist())) + """'
                ]) AS my_nameofmedication
)
,tbl_laggedAndLeadPrescriptions_persons AS (
    SELECT
        DISTINCT person_id
        ,datemedicationstart
        ,LAG(datemedicationstart) OVER(PARTITION BY person_id ORDER BY dateevent) AS lagged_datemedicationstart
        ,LEAD(datemedicationstart, 1) OVER(PARTITION BY person_id ORDER BY dateevent) AS leadBy1_datemedicationstart
        ,LEAD(datemedicationstart, 2) OVER(PARTITION BY person_id ORDER BY dateevent) AS leadBy2_datemedicationstart
        #,EXTRACT(YEAR FROM datemedicationstart) AS year_prescription
        #,EXTRACT(MONTH FROM datemedicationstart) AS month_prescription
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srprimarycaremedication
        # This CROSS JOIN conveniently creates all possible combinations of values of the
        # tbl_srprimarycaremedication table and my `tbl_antidepressants`. This sets up my interim result to 
        # easily do a row-wise comparison of the medications of interest with the variously-
        # worded `nameofmedication` values in the database.
    CROSS JOIN
        tbl_antidepressants
    WHERE
        CAST(tbl_srprimarycaremedication.isrepeatmedication AS BOOL) IS TRUE
        AND
        # This filters for the medications of interest.
        REGEXP_CONTAINS(tbl_srprimarycaremedication.nameofmedication, tbl_antidepressants.my_nameofmedication) = True
        AND
        DATE_DIFF(myIndexDate, CAST(tbl_srprimarycaremedication.datemedicationstart AS DATE), YEAR) < 3
)
SELECT
    DISTINCT person_id
    ,COUNT(*) AS countAbortedMedicationRegimes
FROM
    tbl_laggedAndLeadPrescriptions_persons
WHERE
    # No prescription in previous month.
    DATE_DIFF(datemedicationstart, lagged_datemedicationstart, MONTH) > 1
    AND
    # No prescription in next month.
    DATE_DIFF(leadBy1_datemedicationstart, datemedicationstart, MONTH) > 1
    AND
    # No prescription in next next month.
    DATE_DIFF(leadBy2_datemedicationstart, datemedicationstart, MONTH) > 2
GROUP BY
    person_id
ORDER BY
    person_id
   # ,datemedicationstart
    #,lagged_datemedicationstart
"""

fs_countAbortedMedicationRegimes = client.query(sql_declarations + sql_studyPopulation + sql_select).to_dataframe()

# Join countAbortedMedicationRegimes
feature_set_array = \
    pandas.merge(feature_set_array,
                 fs_countAbortedMedicationRegimes[
                     ['person_id',
                      'countAbortedMedicationRegimes']],
                 on = 'person_id',
                 how = 'left')

### Recurring mental symptoms

In [96]:
# RecurringMentalSymptoms
feature_set_array['RecurringMentalSymptoms'] = \
    feature_set_array[['paranoia',
                  'auditoryHallucinations',
                  'dissociation',
                  'NESA',
                  'violentBehaviour']].any(True)

In [97]:
feature_set_array

Unnamed: 0,person_id,homeless,poverty,sleepDisturbance,suicidal,tinnitus,foodInsecurity,ageAtFirstAdmissionToPsychRehabServices,incarcerationImprisonment,metabolicSyndrome,...,AdulthoodConcerns,recurrentEDattednances,AccessToHealthcare,countAntidepressantPrescriptions,countUniqueAntidepressants,countHypnoticsAndAnxiolyticsPrescriptions,countUniqueHypnoticsAndAnxiolytics,antipsychoticsPrescription,countAbortedMedicationRegimes,RecurringMentalSymptoms
0,70,False,False,False,False,False,False,False,False,False,...,True,,False,,,,,,,False
1,230,False,False,False,False,False,False,False,False,False,...,False,,False,,,,,,,False
2,272,False,False,False,False,False,False,False,False,False,...,False,,False,,,,,,,False
3,275,False,False,False,False,False,False,False,False,False,...,False,,False,,,,,,,False
4,387,False,False,True,False,True,False,False,False,False,...,False,,False,8.0,2.0,,,,2.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208003,14994436,False,False,False,False,False,False,False,False,False,...,False,,False,1.0,1.0,,,,,False
208004,14994448,False,False,False,False,False,False,False,False,False,...,False,,False,,,,,,,False
208005,14994470,False,False,False,False,False,False,False,False,False,...,False,,False,,,,,,,False
208006,14994859,False,False,False,False,False,False,False,False,False,...,False,,False,,,,,,,False


### Entropy-based feature sets potentially indicative of "chaotic life"

The data from BigQuery results needs to be appointments and DNAs tallied in three-month blocks, per person. Specifically, I use BigQuery's built-in `QUARTER()` function for which Q1 = Jan-Mar, Q2 = Apr-Jun, etc.  The query will only return data for quarters in which there was an appointment or a DNA. Each patient's data will be processed in Python to fill in the missing quarters' counts with 0 before calculating the values of the entropy-based feature sets.

## Save `feature_set_array`, then show feature set names.

In [98]:
feature_set_array = feature_set_array.fillna(0)
%store feature_set_array
print(f'The feature sets are {list(feature_set_array.columns)}')

Stored 'feature_set_array' (DataFrame)
The feature sets are ['person_id', 'homeless', 'poverty', 'sleepDisturbance', 'suicidal', 'tinnitus', 'foodInsecurity', 'ageAtFirstAdmissionToPsychRehabServices', 'incarcerationImprisonment', 'metabolicSyndrome', 'sleepDysfunction', 'countAppointmentsPreviousYear', 'medianAnnualCountAppointments', 'countDNAsPreviousYear', 'medianAnnualCountDNAs', 'ratioDNAtoAppointmentPreviousYear', 'medianAnnualRatioDNAtoAppointment', 'trafficked', 'tortured', 'OCD', 'nonNativeEnglishSpeaker', 'hoarder', 'historyOfOrCurrentAddiction', 'familyHistoryOfPsychosis', 'familyHistoryOfAlcoholism', 'extremeSelfNeglect', 'bodyDysmorphicDisorder', 'raisedInCareSystem', 'brainInjury', 'socialServicesInvolvedCYP_countOfCodes', 'countPsychologicalDisorders', 'schoolRefusal', 'crisisContraceptionCYP', 'UPSI', 'UPSICYP', 'teenagePregnancy', 'attemptedSuicide', 'attemptedSuicideCYP', 'selfHarmAdult', 'selfHarmCYP', 'CAMHSrefsAndDisch', 'IAPTrevolvingDoor', 'substanceMisuseAdult'

# ---------------------------------------------------------------------------------

# Rationale for decisions made about how some feature set were defined.

## Discretise float64 feature sets to calculate mutual information.

For consistency, I am computing normalised mutual information for all feature sets using the same function call. But some of our feature sets are not discrete values, e.g. entropy statistics. Rather than compute a continuous entropy (specifically, it is called the [Limiting density of discrete points](https://en.wikipedia.org/wiki/Limiting_density_of_discrete_points) as opposed to the often misnomer "differential entropy"), I convert all continuous-valued feature sets into discrete values using the sklearn function [mutual_info_regression](https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/feature_selection/_mutual_info.py). This function implements [Ross (2014)'s](https://sci-hub.wf/10.1371/journal.pone.0087357) version of the _k_-nearest neighbour approach to calculating MI with continuous variables.

The problem is that repeated trials of the function reveal some randomness as part of the nearest-neighbour seeding. The script below shows that some trials of `sklearn.feature_selection.mutual_info_regression()` (blue dots) estimate the MI at twice the discrete estimate (red line), and the arithmetic mean of the repeated estimates (black horizontal line) variously under- and over-estimates the discrete estimate, depending on the run. The cumulative arithmetic mean (black varying line) does not converge to the discrete MI, rather it bounces around it. My qualitative assessment is that the arithmetic mean of 5 runs appears to provide a sufficient approximation of the discrete MI. Therefore, all my applications of `sklearn.feature_selection.mutual_info_regression()` will use the arithmetic mean of 5 runs as the MI value for that feature set.

In [None]:
# Set parameters.
my_repeats = 10
my_trials = 10
warnings.filterwarnings("ignore", message="A column-vector y was passed when a 1d array was expected")

# Get values for discrete calculation.
# ## Arbitrarily choose the feature set 'suicidal'.
my_fs = fs_interview.suicidal
my_caseness = caseness_array.iloc[:,1]
MI_discrete = sklearn.metrics.mutual_info_score(my_fs, my_caseness)

# Get values for nearest-neighbour calculation.
my_fs = fs_interview.loc[:,fs_interview.columns == 'suicidal'].values
my_caseness = caseness_array.loc[:,caseness_array.columns == 'CMHD_dx_and_rx'].to_numpy()
# ## Define function that does 10 repeats to produce an average and plot
def doit():
    MI_continuous = []
    for i in range(my_trials):
        MI_continuous.append(mutual_info_regression(my_fs, my_caseness, n_neighbors = 2))
    return MI_continuous


# Do plot.
fig, axs = matplotlib.pyplot.subplots(round(my_repeats / 2), 2)
fig.suptitle('Repeated calculation of MI using Ross\'s nearest-neighbour method.\nFeature set is \'suicidal\'')
for i_repeats in range(my_repeats):
    # Do the calculations for this iteration.
    i_MI_continuous = doit()
    # Plot this iteration.
    axs[i_repeats // 2, i_repeats % 2].plot(range(my_trials), i_MI_continuous, 'o')
    #axs.flat.set(xlabel = 'Trials', ylabel = 'Mutual information to base _e_')
    # Add MI_discrete as a line.
    axs[i_repeats // 2, i_repeats % 2].axhline(y = MI_discrete, color='r', linestyle='-')
    axs[i_repeats // 2, i_repeats % 2].axhline(y = sum(i_MI_continuous) / len(i_MI_continuous), color='k', linestyle='-')
    # Add the cumulative average as repeats progress.
    avg = []
    for i in range(len(i_MI_continuous)):
        check = 0
        l = i + 1
        for j in range(i+1):
            check = check + i_MI_continuous[j]
        avg.append(check/l)
    axs[i_repeats // 2, i_repeats % 2].plot(range(my_trials), avg, 'k')



# Rationale for the threshold I used for 'manyDNA'
The threshold for what constitutes "many" in the definition of 'manyDNA' was based on an investigation into the tallies of did-not-attend events in a calendar year, in patients' records.

The BigQuery syntax returns a Python pandas.DataFrame containing:

- n_occurrence: a list from 1 to the maximum count of one-year did-not-attend events observed in the cohort's records, and

- n_patients_with_n_occurrences_in_a_year: the count of patients who had at least one calendar year in which they had _n_occurrence_-many did-not-attend events.


In [None]:
sql_CTEs_body = \
"""
,tbl_manyDNA AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_DNA["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_manyDNA_count AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
        ,EXTRACT(YEAR FROM dateevent) AS year_occurrence
        ,COUNT(person_id) AS n_occurrences
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_manyDNA
    WHERE
        a.snomedcode IN (tbl_manyDNA.snomedcode)
    GROUP BY
        person_id, a.snomedcode, year_occurrence
)
,tbl_manyDNA_persons AS (
    SELECT
        DISTINCT person_id
        ,snomedcode
    FROM
        tbl_manyDNA_count
    WHERE
        n_occurrences > 1
)"""

sql_select = \
"""
SELECT
    DISTINCT n_occurrences
    ,COUNT(person_id) AS n_patients_with_n_occurrences_in_a_year
FROM
    tbl_manyDNA_count
GROUP BY
    n_occurrences
ORDER BY
    n_occurrences
"""

fs_manyDNA = client.query(sql_declarations + sql_studyPopulation + sql_CTEs_body + sql_select).to_dataframe()
display(fs_manyDNA)

Below are two basic plots that show:

- the count of patients with each number of one-year did-not-attend tallies (i.e. a plot of n_patients_with_n_occurrences_in_a_year).

- the rate of change of the count of patients with each number of one-year did-not-attend tallies (in other words, the difference between successive counts)

__I decided that the kink at four did-not-attend events within a calendar year indicated a change in the pattern of one-year did-not-attend tallies.__ Four did-not-attend events in a calendar year are quite common, but more than four in a year is particular.

In [None]:
x_lim = 10

# Unbounded plot of the count of patients with each number of DNAs
matplotlib.pyplot.plot(
        fs_manyDNA['n_occurrences']
         ,fs_manyDNA['n_patients_with_n_occurrences_in_a_year']
        )
matplotlib.pyplot.title('Count of patients with each number of annual DNAs')
matplotlib.pyplot.xlabel('n_occurrences')
matplotlib.pyplot.show()
# Bounded plot of the count of patients with each number of DNAs
matplotlib.pyplot.plot(
        fs_manyDNA['n_occurrences']
         ,fs_manyDNA['n_patients_with_n_occurrences_in_a_year']
        )
matplotlib.pyplot.title('Count of patients with each number of annual DNAs (axis limited)')
matplotlib.pyplot.xlabel('n_occurrences')
matplotlib.pyplot.xlim([0, x_lim])
matplotlib.pyplot.show()
# Bounded plot of the rate of change of the count of patients with each number of DNAs (in other words, the difference between successive counts)
matplotlib.pyplot.plot(
         fs_manyDNA['n_occurrences'][1:]
         ,[i-j for i, j in zip(fs_manyDNA['n_patients_with_n_occurrences_in_a_year'][:-1], fs_manyDNA['n_patients_with_n_occurrences_in_a_year'][1:])]
        )
matplotlib.pyplot.title('Rate of change of the count of patients with each number of DNAs (axis limited)')
matplotlib.pyplot.xlabel('n_occurrences')
matplotlib.pyplot.xlim([0, x_lim])
matplotlib.pyplot.show()

# Rationale for the threshold I used for 'recurrentEDattendance'
The threshold for what constitutes "recurrent" in the definition of 'recurrentEDattendance' was based on an investigation into the tallies of attendances at the emergency department within a year (i.e. 365 days), in patients' records.

The BigQuery syntax returns a Python pandas.DataFrame containing this count, `cnt_annual_ED_attedence`.

In [None]:
sql = \
"""
WITH
tbl_a AS (
  SELECT
    person_id
    ,tbl_ae_start_date AS visit_date
    ,LAG(tbl_ae_start_date) OVER(PARTITION BY person_id ORDER BY tbl_ae_start_date) AS lag_visit_date 
  FROM
    `""" + server_id + """.CB_FDM_Warehouse_V2.tbl_ae`
)
,tbl_b AS (
  SELECT
    person_id
    ,COUNT(person_id) AS cnt_annual_ED_attendence
  FROM
    tbl_a
  WHERE
    DATE_DIFF(visit_date, lag_visit_date, DAY) BETWEEN 1 AND 365
  GROUP BY
    person_id
)
SELECT
  person_id
  ,cnt_annual_ED_attendence
FROM
  tbl_b
WHERE
  cnt_annual_ED_attendence BETWEEN 3 AND 100
ORDER BY
      cnt_annual_ED_attendence
"""

fs_recurrentEDattednances = client.query(sql).to_dataframe()

fs_recurrentEDattednancesBelow is a histograme of the frequency of patients' annual count of ED attendances that suggests:

- Most patients have 4 or fewer ED attendances within a year.
- The bulk of the histogram occurs within 20 of fewer ED attendences.
- Few patients have annual ED attendance counts greater than 20.

</br>
</br>

__I decided that there are three cohorts, defined as:__
- 'Infrequent user' - four or fewer ED attendences within a year.
- 'Frequent user' - between five and twenty ED attendences within a year.
- 'Very frequent user' - more than twenty ED attendences within a year.

These criteria define the three values for the `recurrentEDattendance` variable.

In [None]:
matplotlib.pyplot.hist(
    fs_recurrentEDattednances['cnt_annual_ED_attendence']
    ,bins = 40
)
matplotlib.pyplot.title('Frequency of patients\' annual count of ED attendances\n(capped at 100)')
matplotlib.pyplot.xticks(
        numpy.concatenate((numpy.arange(0, 19, 4), numpy.arange(20, 100, 20)))
)
matplotlib.pyplot.show()

# Rationale for the threshold I used for 'categoryAnnualCountUniqueAntidepressants'
I needed to define categories for categoryAnnualCountUniqueAntidepressants because the counts can be too large for our planned analysis. I decided our categories based on an investigation into the counts of unique antidepressants that patients in the database were prescribed.

</br>
</br>
Below is a histogram of the frequency of patients' count of unique antidepressants that suggests:

- Most patients do not have a prescription for antidepressants.
- Of those identified as having a prescription for antidepressants, most were only prescribed one.
- Few patients are prescribed four or more antidepressants.
- Relatively very few are prescribed more than four.

</br>
</br>

__I decided that there are three cohorts, defined as:__
- 'No antidepressants' - zero antidepressants prescriptions.
- 'Few antidepressants' - between one and three unique antidepressants.
- 'Many antipressants' - four or more unique antidepressants.

These criteria define the three values for the `categoryAnnualCountUniqueAntidepressants` variable.

In [None]:
matplotlib.pyplot.hist(
    fs_clinician['categoryAnnualCountUniqueAntidepressants']
    ,bins = 9
)
matplotlib.pyplot.title('Frequency of patients\' count of unique antidepressants')
matplotlib.pyplot.show()

matplotlib.pyplot.hist(
    fs_uniqueAntidepressants['categoryAnnualCountUniqueAntidepressants']
    ,bins = 9
)
matplotlib.pyplot.title('Frequency of patients\' count of unique antidepressants\n(excluding \'No prescriptions\') ')
matplotlib.pyplot.show()