# Literature feature sets
The purpose of this notebook is to create the array of feature sets suggested by the literature.

### Imports and helper functions

In [1]:
# Execute the shared helper notebook.
# NOTE(review): the names used below without a local import or definition (`os`, `pandas`,
# `bigquery`, `display`, `Markdown`, `boundaryfilter`) presumably come from this helper
# notebook - confirm.
%run 'UNSEEN_helper_functions.ipynb'
# Reload every variable previously saved with `%store`; no names are given, so all are restored.
%store -r

## Load codelist CSV files.
We used opencodelists.org to build the codelists that specify the sets of SNOMED-CT codes used to identify patients based on various attributes.

In [2]:
# BigQuery client used by the query cell further down.
client = bigquery.Client()

# The codelist CSV files sit in a 'codelists' folder inside the current working directory.
# (Previously this directory was obtained by taking the absolute path of an unrelated notebook
# filename and stripping the filename, which resolves to the working directory anyway.)
folder_loc = os.getcwd()
# The trailing empty component keeps the trailing path separator that `folder` has always had.
folder = os.path.join(folder_loc, 'codelists', '')


def _read_codelist(filename):
    """Read one codelist CSV from `folder`, keeping the "code" column as text.

    SNOMED-CT identifiers are labels, not quantities. Reading them as strings stops pandas
    inferring a numeric dtype, which would turn the whole column into floats as soon as one
    cell is blank and so corrupt the codes when they are later pasted into the SQL query.
    """
    return pandas.read_csv(os.path.join(folder, filename), dtype={"code": str})


# Clinical codes of interest.
codes_to_query_homeless = _read_codelist("ciaranmci-homelessness-0e1fe637.csv")
codes_to_query_incarcerationImprisonment = _read_codelist("ciaranmci-incarceration-or-imprisonment-75107301.csv")
codes_to_query_sleepDisturbance = _read_codelist("ciaranmci-sleep-disturbance-dyssomnia-29e21962.csv")
codes_to_query_tinnitus = _read_codelist("ciaranmci-tinnitus-finding-10d2a62d.csv")

## Load prerequisites

In [3]:
%%capture
# `%%capture` suppresses everything this cell prints or displays, including the
# "not here" message below and any output of the notebook that gets run.
# Only run the caseness notebook when `caseness_array` is not already defined in this kernel.
if 'caseness_array' not in globals():
    print("not here")
    %run ./"UNSEEN_create_caseness_variables.ipynb"
# Reload every variable previously saved with `%store`; no names are given, so all are restored.
%store -r

## Query database for base feature sets
The literature feature sets are a mix of feature sets that are simply defined by the presence of SNOMED-CT codes. The code below returns `fs_literature`, which contains the base feature sets.

In [4]:
def _sql_code_list(codelist):
    """Return the codelist's "code" values as text, joined by `', '`.

    The opening and closing single quotes are supplied by the surrounding SQL text, so the
    result slots straight into a SQL array literal of strings.
    """
    return "', '".join(str(code) for code in codelist["code"].tolist())


# Part 1 of the query: the WITH keyword and the patient 'spine' CTE.
sql_CTEs_top = """
WITH
# The first CTE will specify the 'spine' of the data table by selecting the unique list of person IDs.
tbl_persons AS (
    SELECT
        DISTINCT person_id
        ,year_of_birth
    FROM
        yhcr-prd-phm-bia-core.CB_MYSPACE_CMC.person
    # Limiting to age range 18-70.
    WHERE
        (EXTRACT(YEAR FROM CURRENT_DATE()) - year_of_birth) BETWEEN 18 AND 70
)

# The following CTEs extract each clinical codelist into a SQL table before querying the person_ID 
# associated with the clinical codes.
#
"""

# Part 2 of the query: one or two CTEs per feature set. The CSV-based codelists are
# interpolated into the SQL text as quoted string arrays; the short codelists are hard-coded.
sql_CTEs_body = f"""
#  ## Homeless
,tbl_homeless AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '{_sql_code_list(codes_to_query_homeless)}'
                ]) AS snomedcode
)
,tbl_homeless_persons AS (
    SELECT
        DISTINCT person_id
        ,src_snomedcode
    FROM
        yhcr-prd-phm-bia-core.CB_MYSPACE_CMC.tbl_SRCode, tbl_homeless
    WHERE
        src_snomedcode IN (tbl_homeless.snomedcode)
)
#  ## Poverty
,tbl_poverty_persons AS (
    SELECT
        DISTINCT person_id
        ,src_snomedcode
    FROM
        yhcr-prd-phm-bia-core.CB_MYSPACE_CMC.tbl_SRCode
    WHERE
        src_snomedcode IN ('11403006', '284477001', '724451000000108', '722221000000105', '719781000000101')
)
#  ## Sleep disturbance
,tbl_sleepDisturbance AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '{_sql_code_list(codes_to_query_sleepDisturbance)}'
                ]) AS snomedcode
)
,tbl_sleepDisturbance_persons AS (
    SELECT
        DISTINCT person_id
        ,src_snomedcode
    FROM
        yhcr-prd-phm-bia-core.CB_MYSPACE_CMC.tbl_SRCode, tbl_sleepDisturbance
    WHERE
        src_snomedcode IN (tbl_sleepDisturbance.snomedcode)
)
#  ## Tinnitus
,tbl_tinnitus AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '{_sql_code_list(codes_to_query_tinnitus)}'
                ]) AS snomedcode
)
,tbl_tinnitus_persons AS (
    SELECT
        DISTINCT person_id
        ,src_snomedcode
    FROM
        yhcr-prd-phm-bia-core.CB_MYSPACE_CMC.tbl_SRCode, tbl_tinnitus
    WHERE
        src_snomedcode IN (tbl_tinnitus.snomedcode)
)
# ## Food insecurity
,tbl_foodInsecurity AS (
    SELECT
        DISTINCT person_id
        ,src_snomedcode
    FROM
        yhcr-prd-phm-bia-core.CB_MYSPACE_CMC.tbl_SRCode
    WHERE
        src_snomedcode IN ('1078229009', '1004109000', '1002223009')
)
#  ## Age at first admission to psychiatric rehabilitation services
,tbl_admissionToPsychRehabServices_persons AS (
    SELECT
        DISTINCT person_id
        ,src_snomedcode
        ,src_dateevent
    FROM
        yhcr-prd-phm-bia-core.CB_MYSPACE_CMC.tbl_SRCode
    WHERE
        src_snomedcode IN ('306139004')
)
,tbl_ageAtFirstAdmissionToPsychRehabServices_persons AS (
    SELECT
        DISTINCT tbl_persons.person_id
        ,src_snomedcode
        ,MIN((EXTRACT(YEAR FROM tbl_admissionToPsychRehabServices_persons.src_dateevent) - tbl_persons.year_of_birth)) AS ageAtFirstAdmission
    FROM tbl_persons
    LEFT OUTER JOIN tbl_admissionToPsychRehabServices_persons ON tbl_persons.person_id = tbl_admissionToPsychRehabServices_persons.person_id
    WHERE
        IS_NAN((EXTRACT(YEAR FROM tbl_admissionToPsychRehabServices_persons.src_dateevent) - tbl_persons.year_of_birth)) = FALSE
    GROUP BY
        person_id, src_snomedcode
)

#  ## Incarceration or imprisonment
,tbl_incarcerationImprisonment AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '{_sql_code_list(codes_to_query_incarcerationImprisonment)}'
                ]) AS snomedcode
)
,tbl_incarcerationImprisonment_persons AS (
    SELECT
        DISTINCT person_id
        ,src_snomedcode
    FROM
        yhcr-prd-phm-bia-core.CB_MYSPACE_CMC.tbl_SRCode, tbl_incarcerationImprisonment
    WHERE
        src_snomedcode IN (tbl_incarcerationImprisonment.snomedcode)
)
#  ## Metabolic syndrome
,tbl_metabolicSyndrome_persons AS (
    SELECT
        DISTINCT person_id
        ,src_snomedcode
    FROM
        yhcr-prd-phm-bia-core.CB_MYSPACE_CMC.tbl_SRCode
    WHERE
        src_snomedcode IN ('237602007')
)
#  ## Sleep dysfunction
,tbl_sleepDysfunction_persons AS (
    SELECT
        DISTINCT person_id
        ,src_snomedcode
    FROM
        yhcr-prd-phm-bia-core.CB_MYSPACE_CMC.tbl_SRCode
    WHERE
        src_snomedcode IN ('441877007', '442176004')
)
"""

# Part 3 of the query: one row per person with a 0/1 flag per feature set.
# Note that every flag, including `ageAtFirstAdmissionToPsychRehabServices`, records only
# whether the person appears in the corresponding CTE; the computed age itself is not selected.
sql_final_select = """
# Finally, we use the above CTEs to define a table with one row per patient and one column for each
# feature set. The feature-set columns are populated by interger values with '1' indicating that the
# feature set is satisfied and '0' indicating otherwise.
SELECT
    DISTINCT tbl_persons.person_id
    ,CASE WHEN tbl_homeless_persons.person_id IS NULL THEN 0 ELSE 1 END AS homeless
    ,CASE WHEN tbl_poverty_persons.person_id IS NULL THEN 0 ELSE 1 END AS poverty
    ,CASE WHEN tbl_sleepDisturbance_persons.person_id IS NULL THEN 0 ELSE 1 END AS sleepDisturbance
    ,CASE WHEN tbl_tinnitus_persons.person_id IS NULL THEN 0 ELSE 1 END AS tinnitus
    ,CASE WHEN tbl_foodInsecurity.person_id IS NULL THEN 0 ELSE 1 END AS foodInsecurity
    ,CASE WHEN tbl_ageAtFirstAdmissionToPsychRehabServices_persons.person_id IS NULL THEN 0 ELSE 1 END AS ageAtFirstAdmissionToPsychRehabServices
    ,CASE WHEN tbl_incarcerationImprisonment_persons.person_id IS NULL THEN 0 ELSE 1 END AS incarcerationImprisonment
    ,CASE WHEN tbl_metabolicSyndrome_persons.person_id IS NULL THEN 0 ELSE 1 END AS metabolicSyndrome
    ,CASE WHEN tbl_sleepDysfunction_persons.person_id IS NULL THEN 0 ELSE 1 END AS sleepDysfunction
FROM tbl_persons
LEFT OUTER JOIN tbl_homeless_persons ON tbl_persons.person_id = tbl_homeless_persons.person_id
LEFT OUTER JOIN tbl_poverty_persons ON tbl_persons.person_id = tbl_poverty_persons.person_id
LEFT OUTER JOIN tbl_sleepDisturbance_persons ON tbl_persons.person_id = tbl_sleepDisturbance_persons.person_id
LEFT OUTER JOIN tbl_tinnitus_persons ON tbl_persons.person_id = tbl_tinnitus_persons.person_id
LEFT OUTER JOIN tbl_foodInsecurity ON tbl_persons.person_id = tbl_foodInsecurity.person_id
LEFT OUTER JOIN tbl_ageAtFirstAdmissionToPsychRehabServices_persons ON tbl_persons.person_id = tbl_ageAtFirstAdmissionToPsychRehabServices_persons.person_id
LEFT OUTER JOIN tbl_incarcerationImprisonment_persons ON tbl_persons.person_id = tbl_incarcerationImprisonment_persons.person_id
LEFT OUTER JOIN tbl_metabolicSyndrome_persons ON tbl_persons.person_id = tbl_metabolicSyndrome_persons.person_id
LEFT OUTER JOIN tbl_sleepDysfunction_persons ON tbl_persons.person_id = tbl_sleepDysfunction_persons.person_id
ORDER BY tbl_persons.person_id
"""

# Stitch the three parts together, run the query and pull the result into a DataFrame.
fs_literature = client.query("".join((sql_CTEs_top, sql_CTEs_body, sql_final_select))).to_dataframe()

In [5]:
# Display message.
# The text is a *raw* f-string: the LaTeX backslashes (`\le`, `\ `) are not valid Python
# escape sequences, so a non-raw literal triggers an invalid-escape warning on recent
# Python versions. The raw prefix keeps the rendered text exactly the same; `{{`/`}}`
# still produce literal braces and `{...}` is still interpolated.
display(
    Markdown(
rf"""
## Filter feature sets not within bounds

All feature sets must have a prevalence (or count) between the prevalence (or count) bounds defined in `UNSEEN_feature_sets_prevalence_bounds.ipynb`,
i.e. a feature set's count must satisfy:
- ${int(possibleCaseness_count_LB):,}\le$ $count\ of\ patients_{{feature\ set_{{i}}}}$ $\le{int(possibleCaseness_count_UB):,}$, for 'Possible caseness'
- ${int(definiteCaseness_count_LB):,}\le$ $count\ of\ patients_{{feature\ set_{{i}}}}$ $\le{int(definiteCaseness_count_UB):,}$, for 'Definite caseness'
"""
       )
)


## Filter feature sets not within bounds

All feature sets must have a prevalence (or count) between the prevalence (or count) bounds defined in `UNSEEN_feature_sets_prevalence_bounds.ipynb`,
i.e. a feature set's count must satisfy:
- $600\le$ $count\ of\ patients_{feature\ set_{i}}$ $\le1,200$, for 'Possible caseness'
- $40\le$ $count\ of\ patients_{feature\ set_{i}}$ $\le90$, for 'Definite caseness'


In [6]:
fs_literature_filteredPossible = boundaryfilter(my_featureSet_array = fs_literature, caseness = 'possible', verbose = True)[0]
# Final list of variables
print("\nThe final list of feature sets from this source is:\n", fs_literature_filteredPossible.columns.values[1:])
fs_literature_filteredDefinite = boundaryfilter(my_featureSet_array = fs_literature, caseness = 'definite', verbose = True)[0]
# Final list of variables
print("\nThe final list of feature sets from this source is:\n", fs_literature_filteredDefinite.columns.values[1:])



# Store clinician feature sets for use in other notebooks.
%store fs_literature_filteredPossible fs_literature_filteredDefinite


 Filtering complete for 'Possible caseness'...
	 0  feature sets remain.
	 9  feature sets removed, in total.
	 5  feature sets removed because of low prevalence.
	 4  feature sets removed because of high prevalence.

The final list of feature sets from this source is:
 []

 Filtering complete for 'Definite caseness'...
	 1  feature sets remain.
	 8  feature sets removed, in total.
	 4  feature sets removed because of low prevalence.
	 4  feature sets removed because of high prevalence.

The final list of feature sets from this source is:
 ['metabolicSyndrome']
Stored 'fs_literature_filteredPossible' (DataFrame)
Stored 'fs_literature_filteredDefinite' (DataFrame)


# ---------------------------------------------------------------------------------