# Literature feature sets
The purpose of this notebook is to create the array of feature sets suggested by the literature.

### Imports and helper functions

In [1]:
# Execute the shared helper notebook so that its definitions are available to the cells below.
# NOTE(review): presumably this is where `bigquery`, `pandas`, `os`, `display`, `Markdown` and
# `boundaryfilter` come from, since none of them is imported in this notebook - confirm.
%run 'UNSEEN_helper_functions.ipynb'
# Reload every variable previously persisted with `%store`.
# NOTE(review): presumably this supplies `server_id`, `database_id`, the SQL preamble strings and
# the `*_count_LB` / `*_count_UB` bounds used further down - confirm.
%store -r

## Load codelist CSV files.
We used opencodelists.org to build codelists, each of which specifies the set of SNOMED-CT codes used to identify patients with a particular attribute.

In [2]:
# Instantiate BigQuery client.
client = bigquery.Client()

# Set folder location. The codelists live in the 'codelists' sub-folder of the current working
# directory (the original expression resolved a notebook filename against the working directory
# and then took its parent, which is the working directory itself).
folder_loc = os.getcwd()
folder = folder_loc + '/codelists/'


def _read_codelist(filename):
    """
    Load one codelist CSV from `folder`.

    The `code` column is read as text because SNOMED-CT codes are identifiers, not quantities.
    Without this, a single blank cell makes pandas infer float64 for the whole column and the
    codes are later rendered as e.g. '724451000000108.0', which never matches the database.
    """
    return pandas.read_csv(folder + filename, dtype = {"code": str})


# Clinical codes of interest.
codes_to_query_homeless = _read_codelist("ciaranmci-homelessness-0e1fe637.csv")
codes_to_query_incarcerationImprisonment = _read_codelist("ciaranmci-incarceration-or-imprisonment-75107301.csv")
codes_to_query_sleepDisturbance = _read_codelist("ciaranmci-sleep-disturbance-dyssomnia-29e21962.csv")
codes_to_query_suicidal = _read_codelist("ciaranmci-suicidal-5eaa56c5.csv")
codes_to_query_tinnitus = _read_codelist("ciaranmci-tinnitus-finding-10d2a62d.csv")

## Load prerequisites

In [3]:
%%capture
# `%%capture` has to remain the first line of the cell; it silences the output of everything below.
# Only re-run the caseness notebook when `caseness_array` is not already defined in this kernel
# (for example because an earlier `%store -r` restored it).
if 'caseness_array' not in globals():
    print("not here")  # Debug marker; not shown because the cell's output is captured.
    %run ./"UNSEEN_create_caseness_variables.ipynb"
# Reload stored variables again.
# NOTE(review): presumably this picks up whatever the caseness notebook has just stored - confirm.
%store -r

## Query database for base feature sets
The literature feature sets are defined simply by the presence of SNOMED-CT codes in a patient's record. The code below returns `fs_literature`, which contains the base feature sets.

In [3]:
# Fully qualified name of the table of coded clinical events that every feature set is derived from.
_srcode_table = server_id + "." + database_id + ".tbl_srcode"
# Name of the study-population CTE.
# NOTE(review): assumed to be defined by `sql_studyPopulation`, which is created elsewhere - confirm.
_population = "tbl_studyPopulation_no_caseness"


def _sql_string_list(codes):
    """Render `codes` as comma-separated, single-quoted SQL string literals."""
    return ", ".join("'" + str(code) + "'" for code in codes)


def _codelist_ctes(heading, stem, codes, srcode_table):
    """
    Build the pair of CTEs for a feature set that is defined by a codelist.

    `tbl_<stem>` holds the codelist itself and `tbl_<stem>_persons` holds the distinct
    (person_id, snomedcode) pairs found in `srcode_table` for any of those codes.
    The array is explicitly typed as ARRAY<STRING> so that an empty codelist yields an
    empty set of codes rather than an untyped (or single empty-string) array.
    """
    return f"""
#  ## {heading}
,tbl_{stem} AS (
    SELECT
        snomedcode
    FROM
        UNNEST(ARRAY<STRING>[
                {_sql_string_list(codes)}
                ]) AS snomedcode
)
,tbl_{stem}_persons AS (
    SELECT
        DISTINCT person_id
        ,a.snomedcode
    FROM
        {srcode_table} AS a, tbl_{stem}
    WHERE
        a.snomedcode IN (tbl_{stem}.snomedcode)
)"""


def _fixed_codes_cte(heading, cte_name, codes, srcode_table):
    """
    Build a single CTE called `cte_name` holding the distinct (person_id, snomedcode) pairs
    found in `srcode_table` for a short, hard-coded list of SNOMED-CT codes.
    """
    return f"""
#  ## {heading}
,{cte_name} AS (
    SELECT
        DISTINCT person_id
        ,snomedcode
    FROM
        {srcode_table}
    WHERE
        snomedcode IN ({_sql_string_list(codes)})
)"""


# The admission feature set needs the event date and a second, derived CTE, so it is spelled out.
# The `IS_NAN(...) = FALSE` filter evaluates to NULL for patients without an admission record and
# therefore drops them, i.e. only patients with at least one admission remain in the second CTE.
_sql_admission_ctes = f"""
#  ## Age at first admission to psychiatric rehabilitation services
,tbl_admissionToPsychRehabServices_persons AS (
    SELECT
        DISTINCT person_id
        ,snomedcode
        ,dateevent
    FROM
        {_srcode_table}
    WHERE
        snomedcode IN ('306139004')
)
,tbl_ageAtFirstAdmissionToPsychRehabServices_persons AS (
    SELECT
        DISTINCT tbl_studyPopulation_no_caseness.person_id
        ,tbl_admissionToPsychRehabServices_persons.snomedcode
        ,MIN((EXTRACT(YEAR FROM tbl_admissionToPsychRehabServices_persons.dateevent) - tbl_studyPopulation_no_caseness.year_of_birth)) AS ageAtFirstAdmission
    FROM tbl_studyPopulation_no_caseness
    LEFT OUTER JOIN tbl_admissionToPsychRehabServices_persons ON tbl_studyPopulation_no_caseness.person_id = tbl_admissionToPsychRehabServices_persons.person_id
    WHERE
        IS_NAN((EXTRACT(YEAR FROM tbl_admissionToPsychRehabServices_persons.dateevent) - tbl_studyPopulation_no_caseness.year_of_birth)) = FALSE
    GROUP BY
        tbl_studyPopulation_no_caseness.person_id, tbl_admissionToPsychRehabServices_persons.snomedcode
)"""

# Every fragment starts with a newline and a leading comma because the CTEs continue the WITH
# clause opened by the SQL that is concatenated in front of them.
sql_CTEs_body = "".join([
    _codelist_ctes("Homeless", "homeless", codes_to_query_homeless["code"].tolist(), _srcode_table),
    _fixed_codes_cte(
        "Poverty", "tbl_poverty_persons",
        ['11403006', '284477001', '724451000000108', '722221000000105', '719781000000101'],
        _srcode_table),
    _codelist_ctes("Sleep disturbance", "sleepDisturbance", codes_to_query_sleepDisturbance["code"].tolist(), _srcode_table),
    _codelist_ctes("Suicidal ideation", "suicidal", codes_to_query_suicidal["code"].tolist(), _srcode_table),
    _codelist_ctes("Tinnitus", "tinnitus", codes_to_query_tinnitus["code"].tolist(), _srcode_table),
    _fixed_codes_cte(
        "Food insecurity", "tbl_foodInsecurity",
        ['1078229009', '1004109000', '1002223009'],
        _srcode_table),
    _sql_admission_ctes,
    _codelist_ctes("Incarceration or imprisonment", "incarcerationImprisonment", codes_to_query_incarcerationImprisonment["code"].tolist(), _srcode_table),
    _fixed_codes_cte("Metabolic syndrome", "tbl_metabolicSyndrome_persons", ['237602007'], _srcode_table),
    _fixed_codes_cte("Sleep dysfunction", "tbl_sleepDysfunction_persons", ['441877007', '442176004'], _srcode_table),
]) + "\n"

# (output column, source CTE) pairs, in the column order expected of `fs_literature`.
_feature_sources = [
    ("homeless", "tbl_homeless_persons"),
    ("poverty", "tbl_poverty_persons"),
    ("sleepDisturbance", "tbl_sleepDisturbance_persons"),
    ("suicidal", "tbl_suicidal_persons"),
    ("tinnitus", "tbl_tinnitus_persons"),
    ("foodInsecurity", "tbl_foodInsecurity"),
    ("ageAtFirstAdmissionToPsychRehabServices", "tbl_ageAtFirstAdmissionToPsychRehabServices_persons"),
    ("incarcerationImprisonment", "tbl_incarcerationImprisonment_persons"),
    ("metabolicSyndrome", "tbl_metabolicSyndrome_persons"),
    ("sleepDysfunction", "tbl_sleepDysfunction_persons"),
]

# One 0/1 column per feature set: 1 when the patient appears in the feature set's CTE, 0 otherwise.
_flag_columns = "\n".join(
    f"    ,CASE WHEN ids_{feature}.person_id IS NULL THEN 0 ELSE 1 END AS {feature}"
    for feature, _cte in _feature_sources
)

# The source CTEs can hold several rows per patient (one per code), so each one is reduced to its
# distinct person_id values before joining. Joining the CTEs directly multiplies the row count
# across all ten joins before DISTINCT collapses it again; the final result is the same.
_join_clauses = "\n".join(
    f"LEFT OUTER JOIN (SELECT DISTINCT person_id FROM {cte}) AS ids_{feature}\n"
    f"    ON {_population}.person_id = ids_{feature}.person_id"
    for feature, cte in _feature_sources
)

# Finally, the CTEs above are used to define a table with one row per patient and one column for
# each feature set.
sql_final_select = f"""
SELECT
    DISTINCT {_population}.person_id
{_flag_columns}
FROM {_population}
{_join_clauses}
ORDER BY {_population}.person_id
"""

fs_literature = client.query(sql_declarations + sql_studyPopulation + sql_CTEs_body + sql_final_select).to_dataframe()

In [4]:
# Display message.
# The template is a *raw* f-string: the LaTeX backslashes (`\le`, `\ `) are not valid Python escape
# sequences, so a non-raw literal triggers an invalid-escape warning on recent Python versions.
# The rendered text is unchanged; doubled braces still produce literal braces for LaTeX.
display(
    Markdown(
rf"""
## Filter feature sets not within bounds

All feature sets must have a prevalence (or count) between the prevalence (or count) bounds defined in `UNSEEN_feature_sets_prevalence_bounds.ipynb`,
i.e. a feature set's count must satisfy:
- ${int(DxAndRxCaseness_count_LB):,}\le$ $count\ of\ patients_{{feature\ set_{{i}}}}$ $\le{int(DxAndRxCaseness_count_UB):,}$, for 'Definite caseness'
- ${int(DxNotRxCaseness_count_LB):,}\le$ $count\ of\ patients_{{feature\ set_{{i}}}}$ $\le{int(DxNotRxCaseness_count_UB):,}$, for 'Diagnosis-based caseness'
- ${int(RxNotDxCaseness_count_LB):,}\le$ $count\ of\ patients_{{feature\ set_{{i}}}}$ $\le{int(RxNotDxCaseness_count_UB):,}$, for 'Prescription-based caseness'
- ${int(multinomialCaseness_count_LB):,}\le$ $count\ of\ patients_{{feature\ set_{{i}}}}$ $\le{int(multinomialCaseness_count_UB):,}$, for 'Multinomial caseness'
- ${int(prescriptionVsDefiniteCaseness_count_LB):,}\le$ $count\ of\ patients_{{feature\ set_{{i}}}}$ $\le{int(prescriptionVsDefiniteCaseness_count_UB):,}$, for 'Prescription-based -vs- Definite caseness'
"""
       )
)


## Filter feature sets not within bounds

All feature sets must have a prevalence (or count) between the prevalence (or count) bounds defined in `UNSEEN_feature_sets_prevalence_bounds.ipynb`,
i.e. a feature set's count must satisfy:
- $1,510\le$ $count\ of\ patients_{feature\ set_{i}}$ $\le3,350$, for 'Definite caseness'
- $2,640\le$ $count\ of\ patients_{feature\ set_{i}}$ $\le5,860$, for 'Diagnosis-based caseness'
- $43,580\le$ $count\ of\ patients_{feature\ set_{i}}$ $\le96,850$, for 'Prescription-based caseness'
- $47,720\le$ $count\ of\ patients_{feature\ set_{i}}$ $\le106,050$, for 'Multinomial caseness'
- $1,510\le$ $count\ of\ patients_{feature\ set_{i}}$ $\le3,350$, for 'Prescription-based -vs- Definite caseness'


In [5]:
fs_literature_filteredDxAndRx = boundaryfilter(my_featureSet_array = fs_literature, caseness = 'dxandrx', verbose = True)[0]
print("\nThe final list of feature sets from this source is:\n", fs_literature_filteredDxAndRx.columns.values[1:])
fs_literature_filteredDxNotRx = boundaryfilter(my_featureSet_array = fs_literature, caseness = 'dxnotrx', verbose = True)[0]
print("\nThe final list of feature sets from this source is:\n", fs_literature_filteredDxNotRx.columns.values[1:])
fs_literature_filteredRxNotDx = boundaryfilter(my_featureSet_array = fs_literature, caseness = 'rxnotdx', verbose = True)[0]
print("\nThe final list of feature sets from this source is:\n", fs_literature_filteredRxNotDx.columns.values[1:])
fs_literature_filteredMulti = boundaryfilter(my_featureSet_array = fs_literature, caseness = 'multi', verbose = True)[0]
print("\nThe final list of feature sets from this source is:\n", fs_literature_filteredMulti.columns.values[1:])
fs_literature_filteredPrescriptionVsDefinite = boundaryfilter(my_featureSet_array = fs_literature, caseness = 'prescriptionVsDefinite', verbose = True)[0]
print("\nThe final list of feature sets from this source is:\n", fs_literature_filteredPrescriptionVsDefinite.columns.values[1:])

# Store clinician feature sets for use in other notebooks.
%store fs_literature fs_literature_filteredDxAndRx fs_literature_filteredDxNotRx \
    fs_literature_filteredRxNotDx fs_literature_filteredMulti fs_literature_filteredPrescriptionVsDefinite


 Filtering complete for 'Definite caseness'...
	 1  feature sets remain.
	 9  feature sets removed, in total.
	 5  feature sets removed because of low prevalence.
	 4  feature sets removed because of high prevalence.

The final list of feature sets from this source is:
 ['poverty']

 Filtering complete for 'Diagnosis-based caseness'...
	 1  feature sets remain.
	 9  feature sets removed, in total.
	 6  feature sets removed because of low prevalence.
	 3  feature sets removed because of high prevalence.

The final list of feature sets from this source is:
 ['homeless']

 Filtering complete for 'Prescription-based caseness'...
	 0  feature sets remain.
	 10  feature sets removed, in total.
	 10  feature sets removed because of low prevalence.
	 0  feature sets removed because of high prevalence.

The final list of feature sets from this source is:
 []

 Filtering complete for 'Multinomial caseness'...
	 0  feature sets remain.
	 10  feature sets removed, in total.
	 10  feature sets rem

# ---------------------------------------------------------------------------------