# Database feature sets

The purpose of this notebook is to create the array of feature sets suggested by the available SNOMED-CT codes in the Connected Bradford primary care data table.

The database feature sets must be handled differently from the other feature sets because there are so many of them in the database. I will not define an fs_database like fs_clinician or fs_literature, where each feature set is represented by a column in a pandas.DataFrame with a row for each patient. Instead, I will define an n-by-1 pandas.DataFrame containing the list of feature sets (i.e. SNOMED-CT codes) that satisfy the boundary filter of occurring neither too frequently nor too infrequently. This list will be used directly in a script that applies the mutual-information filter.

### Imports and helper functions

In [4]:
%run 'UNSEEN_helper_functions.ipynb'
%store -r

## Load prerequisites

In [6]:
%%capture
# The %%capture cell magic above captures this cell's printed/displayed output,
# so nothing produced below is shown in the notebook.
# Only run the caseness notebook when `caseness_array` is not already defined in
# the notebook's global namespace.
if 'caseness_array' not in globals():
    # Diagnostic message; it is captured by %%capture and therefore not displayed.
    print("not here")
    %run ./"UNSEEN_create_caseness_variables.ipynb"
    
# BigQuery client used to submit the SQL queries in this notebook.
# NOTE(review): `bigquery` is not imported in the visible cells; presumably it is
# brought into scope by the helper notebook run at the top — confirm.
client = bigquery.Client()

## Query database for individual SNOMED-CT codes
The first feature sets to be assessed are individual SNOMED-CT codes found in the Connected Bradford primary care table.

The tables output below show how many unique SNOMED-CT codes occur in a given number of patients' records. The first table aggregates the counts into ranges from $<10$ to $\geq 10,000,000$, increasing by factors of 10. The second table aggregates the counts into the ranges defined by the lower and upper bounds argued for previously.

In [7]:
# Redaction threshold and target rounding number, declared as BigQuery script variables.
redaction_threshold = 7
target_round = 10
sql_variables = f"""
DECLARE redaction_threshold INT64 DEFAULT {redaction_threshold!s};
DECLARE target_round INT64 DEFAULT {target_round!s};
"""

# Lower and upper boundaries for feature-set prevalence, appended to the same
# block of DECLARE statements.
lower_bound = min_criterion_count
upper_bound = max_criterion_count
sql_variables += f"""
DECLARE lower_bound INT64 DEFAULT {lower_bound!s};
DECLARE upper_bound INT64 DEFAULT {upper_bound!s};
"""


# Common table expressions that the later queries in this notebook are appended to:
#   tbl_persons           - distinct person_ids whose (current year - year_of_birth)
#                           lies between 18 and 70.
#   tbl_patients_per_code - one row per SNOMED-CT code with the number of distinct
#                           cohort patients whose primary-care record contains it.
#
# The join is an INNER JOIN with an explicit NOT NULL filter. The previous
# RIGHT JOIN onto tbl_persons produced a NULL src_snomedcode group for cohort
# patients with no coded records, and that group was then counted as if it were
# a SNOMED-CT code. DISTINCT was dropped because GROUP BY already yields one row
# per code.
sql_base = \
"""
WITH
tbl_persons AS (
SELECT
    DISTINCT person_id
FROM
    yhcr-prd-phm-bia-core.CY_MYSPACE_CMC.person
# Limiting to age range 18-70.
WHERE
    (EXTRACT(YEAR FROM CURRENT_DATE()) - year_of_birth) BETWEEN 18 AND 70
)
,tbl_patients_per_code AS (
SELECT
    a.src_snomedcode,
    COUNT(DISTINCT tbl_persons.person_id) AS count_patients_with_code
FROM
    `yhcr-prd-phm-bia-core.CY_FDM_PrimaryCare_v5.tbl_SRCode` AS a
INNER JOIN
    tbl_persons
    ON a.person_id = tbl_persons.person_id
WHERE
    a.src_snomedcode IS NOT NULL
GROUP BY
    a.src_snomedcode
ORDER BY
    count_patients_with_code DESC
)
"""

# Label every code with the order-of-magnitude range of its patient count, then
# count how many codes fall into each range.
sql_full_table = """
,tbl_category_full AS
(
SELECT
  DISTINCT src_snomedcode
  ,CASE
    WHEN count_patients_with_code < 10 THEN "<10"
    WHEN count_patients_with_code < 100 THEN "10 =< code < 100"
    WHEN count_patients_with_code < 1000 THEN "100 =< code < 1,000"
    WHEN count_patients_with_code < 10000 THEN "1,000 =< code < 10,000"
    WHEN count_patients_with_code < 100000 THEN "10,000 =< code < 100,000"
    WHEN count_patients_with_code < 1000000 THEN "100,000 =< code < 1,000,000"
    WHEN count_patients_with_code < 10000000 THEN "1,000,000 =< code < 10,000,000"
    WHEN count_patients_with_code >= 10000000 THEN "code >= 10,000,000"
  END AS cnt_SNOMED
FROM tbl_patients_per_code
ORDER BY cnt_SNOMED
)

SELECT
  COUNT(cnt_SNOMED) AS This_many_codes__
  ,cnt_SNOMED AS __occur_for_this_many_patients
FROM tbl_category_full
GROUP BY cnt_SNOMED
ORDER BY This_many_codes__ DESC
"""
# Full statement = variable declarations + shared CTEs + the categorising query.
full_Table = (
    client
    .query("".join([sql_variables, sql_base, sql_full_table]))
    .to_dataframe()
)
display(full_Table)

# Label every code as too infrequent, within bounds, or too frequent relative to
# the prevalence bounds, then count the codes in each category. The bounds are
# also written (with thousands separators) into the category labels.
sql_boundary_table = f"""
,tbl_category_boundary AS
(
SELECT
  DISTINCT src_snomedcode
  ,CASE
    WHEN count_patients_with_code < lower_bound THEN "too infrequent (occurs in < {lower_bound:,} patients' records)"
    WHEN count_patients_with_code <= upper_bound THEN "within bounds"
    ELSE "too frequent (occurs in > {upper_bound:,} patients' records)"
  END AS cnt_SNOMED
FROM tbl_patients_per_code
ORDER BY cnt_SNOMED
)

SELECT
  COUNT(cnt_SNOMED) AS This_many_codes__
  ,cnt_SNOMED AS __occur_this_often
FROM tbl_category_boundary
GROUP BY cnt_SNOMED
ORDER BY This_many_codes__ DESC
"""
# Full statement = variable declarations + shared CTEs + the categorising query.
boundary_Table = (
    client
    .query("".join([sql_variables, sql_base, sql_boundary_table]))
    .to_dataframe()
)
display(boundary_Table)

# Prepare the table for extracting data: index it by category label so that the
# count can be looked up by name.
boundary_Table = boundary_Table.set_index('__occur_this_often')
# Select the single count cell explicitly. `.loc['within bounds']` on its own
# returns a one-element Series, and calling int() on that is deprecated in
# recent pandas. If no code falls within bounds the category is absent from the
# query result, so fall back to zero instead of raising a KeyError.
n_within_bounds = int(boundary_Table['This_many_codes__'].get('within bounds', 0))

Unnamed: 0,This_many_codes__,__occur_for_this_many_patients
0,40287,<10
1,22996,10 =< code < 100
2,12139,"100 =< code < 1,000"
3,5483,"1,000 =< code < 10,000"
4,1357,"10,000 =< code < 100,000"
5,146,"100,000 =< code < 1,000,000"


Unnamed: 0,This_many_codes__,__occur_this_often
0,42087,within bounds
1,40287,too infrequent (occurs in < 10 patients' records)
2,34,"too frequent (occurs in > 351,605 patients' records)"


In [13]:
# Display a narrative summary of the two tables above, with the number of
# within-bounds codes interpolated (formatted with thousands separators).
# An f-string is used rather than %-formatting so that a literal percent sign in
# the prose cannot be misread as a format specifier.
display(
    Markdown(
f"""
The first table above shows that most SNOMED-CT codes occur infrequently in patients' records, with a
handful of codes showing up in many patients' records. The second table shows that almost as many codes occur too infrequently as occur within our bounds.

#### Interim conclusion
We can infer that __{n_within_bounds:,} feature sets (defined solely by the presence of a single SNOMED-CT code) might be
informative of the caseness of complex mental health difficulties, in our particular cohort within the Connected Bradford dataset__.
"""
    )
)


The first table above shows that most SNOMED-CT codes occur infrequently in patients' records, with a
handfull of codes showing up in many patient's records. he second table shows that almost as many codes occur too infrequently as occur within our bounds.

#### Interim conclusion
We can infer that __42,087 feature sets (defined solely by the presence of a single SNOMED-CT code) might be
informative of the caseness of complex mental health difficulties, in our particular cohort within the Connected Bradford dataset__.


## Making a list of the single-feature feature sets of interest
The following code defines a list of SNOMED-CT codes (that appear in our cohort from the Connected Bradford dataset) that we will carry forward as single-feature feature sets.

In [22]:
# Select the codes whose patient count lies between the lower and upper bounds
# (BETWEEN is inclusive at both ends), ordered by code.
sql_singleFS_select = """
SELECT
    src_snomedcode
FROM
    tbl_patients_per_code
WHERE
    count_patients_with_code BETWEEN lower_bound AND upper_bound
ORDER BY
    src_snomedcode
"""
# Full statement = variable declarations + shared CTEs + the selection above.
df_fs_database = (
    client
    .query("".join([sql_variables, sql_base, sql_singleFS_select]))
    .to_dataframe()
)
display(df_fs_database)

Unnamed: 0,src_snomedcode
0,10001005
1,1000231000000100
2,1000241000000109
3,1000311000000103
4,1000381000000105
...,...
42082,999671000000103
42083,999681000000101
42084,999691000000104
42085,999701000000104


### What is the prevalence of pair-composite feature sets?
The next question we might ask is whether pairs of SNOMED-CT codes might be informative of the cases of complex mental health difficulties.

To answer this question, we re-run the previous analysis but count the patients who have pairs of SNOMED-CT codes in their record instead of individual SNOMED-CT codes.

In [None]:
# Pair-composite feature sets: list, for every cohort patient, each pair of
# distinct SNOMED-CT codes found in that patient's record.
#
# Changes from the previous version of this query:
#   * The code pairs are built with a self-join ON person_id. The earlier
#     CROSS JOIN had no person_id condition, so it paired codes belonging to
#     different patients and attributed code2 to c1's patient.
#   * `code1 < code2` replaces `code1 != code2`, so each unordered pair appears
#     once per patient rather than twice as (A, B) and (B, A).
#   * Rows without a code are removed up front (INNER JOIN + IS NOT NULL); they
#     could never form a pair.
sql_base_2 = \
"""
WITH
tbl_persons AS (
SELECT
    DISTINCT person_id
FROM
    yhcr-prd-phm-bia-core.CY_MYSPACE_CMC.person
# Limiting to age range 18-70.
WHERE
    (EXTRACT(YEAR FROM CURRENT_DATE()) - year_of_birth) BETWEEN 18 AND 70
)
,tbl_patients_per_code AS (
# One row per (patient, code) for patients in the cohort.
SELECT
    DISTINCT
    a.person_id
    ,a.src_snomedcode
FROM
    `yhcr-prd-phm-bia-core.CY_FDM_PrimaryCare_v5.tbl_SRCode` AS a
INNER JOIN
    tbl_persons
    ON a.person_id = tbl_persons.person_id
WHERE
    a.src_snomedcode IS NOT NULL
)
,tbl_code_combins AS
(
# One row per (patient, unordered pair of that patient's codes).
SELECT
    c1.person_id
    ,c1.src_snomedcode AS code1
    ,c2.src_snomedcode AS code2
FROM
    tbl_patients_per_code AS c1
INNER JOIN
    tbl_patients_per_code AS c2
    ON c1.person_id = c2.person_id
    AND c1.src_snomedcode < c2.src_snomedcode
)

#SELECT
#    code1
#    ,code2
#    ,COUNT(person_id) AS count_patients_with_codeCombins
#FROM
#    tbl_code_combins
#GROUP BY
#    code1, code2
#ORDER BY
#    count_patients_with_codeCombins, code1, code2 DESC;
    
    
## 399,857 people have both codes.
SELECT * FROM tbl_code_combins# WHERE code1 IN ('1022551000000104', '1022571000000108') AND code2 IN ('1022551000000104', '1022571000000108') ORDER BY person_id

#SELECT * FROM tbl_patients_per_code WHERE src_snomedcode IN ('1022551000000104', '1022571000000108') ORDER BY person_id, src_snomedcode
"""
bqTable = client.query(sql_base_2).to_dataframe()
display(bqTable)

In [None]:
# First, we must compute all the possible SNOMED-CT code pairs.
sql_snomedcode_vector = \
"""
SELECT
    src_snomedcode
FROM
    tbl_patients_per_code
"""

# Flat list of codes, one element per row of the query result. Taking the column
# (rather than `.values.tolist()`) avoids wrapping every code in its own
# one-element list, which would make each "pair" a tuple of lists.
snomedcode_vector = (
    client.query(sql_base + sql_snomedcode_vector)
    .to_dataframe()['src_snomedcode']
    .tolist()
)
# Show only the first few codes instead of dumping the whole list into the output.
display(snomedcode_vector[:5])

# `itertools.combinations` is lazy, so creating it is cheap; it yields each
# unordered pair once. It must not be materialised with list(): the number of
# pairs grows quadratically with the number of codes.
combins = itertools.combinations(snomedcode_vector, 2)

# Preview the first five pairs from a separate iterator so that `combins` is
# left unconsumed for later use.
list(itertools.islice(itertools.combinations(snomedcode_vector, 2), 5))

## Creating the initial feature-set array

To produce the initial feature-set array, we need to define the list of unique SNOMED CT codes and check whether each patient has that code in their primary care record. The code below produces an n-by-p array where each column contains the count of times that a code is recorded for a given patient.

In [18]:
# I'm thankful for the following stackoverflow thread about pivot queries:
# https://stackoverflow.com/questions/50293482/how-to-create-crosstab-with-two-field-in-bigquery-with-standart-or-legacy-sql.

# Common table expressions ending in `tbl_persons_codes_of_interest`, which holds
# one row per (cohort patient, primary-care record); SNOMEDcode is NULL when the
# record's code is not one of the codes of interest.
sql_with = """
WITH
tbl_persons AS
(
SELECT
    DISTINCT person_id
FROM
    yhcr-prd-phm-bia-core.CY_MYSPACE_CMC.person
# Limiting to age range 18-70.
WHERE
    (EXTRACT(YEAR FROM CURRENT_DATE()) - year_of_birth) BETWEEN 18 AND 70
)
,tbl_codes_and_count AS
(
SELECT
    DISTINCT src_snomedcode
    ,COUNT(src_snomedcode) AS cnt_code
FROM `yhcr-prd-phm-bia-core.CY_FDM_PrimaryCare_v5.tbl_SRCode`
GROUP BY src_snomedcode
)
,tbl_codes_of_interest AS
(
SELECT
  src_snomedcode AS SNOMEDcode
FROM tbl_codes_and_count
WHERE
    cnt_code >= (SELECT COUNT(person_id)/2 FROM tbl_persons)
    # This justification for this filter is described in the
    # previous part of the Jupyter notebook.
)
,tbl_persons_and_codes AS
(
SELECT
    tbl_persons.person_id
    ,tbl_codes.src_snomedcode
FROM 
    tbl_persons
LEFT JOIN
    yhcr-prd-phm-bia-core.CY_FDM_PrimaryCare_v5.tbl_SRCode AS tbl_codes
ON
    tbl_persons.person_id = tbl_codes.person_id
)
,tbl_persons_codes_of_interest AS
(
SELECT
  tbl_persons_and_codes.person_id
  ,tbl_codes_of_interest.SNOMEDcode
FROM
  tbl_persons_and_codes
LEFT JOIN
  tbl_codes_of_interest
ON 
  tbl_codes_of_interest.SNOMEDcode = tbl_persons_and_codes.src_snomedcode
)
"""
# Query that *generates* the text of the pivot statement: one
# COUNTIF(...) AS `_<code>` column per distinct code of interest, grouped by patient.
sql_pivot = """
SELECT
    CONCAT("SELECT person_id,", STRING_AGG(CONCAT("COUNTIF(SNOMEDcode='",SNOMEDcode,"') AS `_",SNOMEDcode,"`")), 
        " FROM `tbl_persons_codes_of_interest`",
        " GROUP BY person_id ORDER BY person_id")
FROM (  SELECT DISTINCT SNOMEDcode FROM `tbl_persons_codes_of_interest` ORDER BY SNOMEDcode  )
"""
# Debugging query that returns the un-pivoted rows. It is kept for reference but
# no longer executed: downloading every patient-level row is what previously had
# to be interrupted, and it left `featureSet_array` undefined for the next cell.
mysql= \
"""
SELECT * FROM tbl_persons_codes_of_interest
"""

# Step 1: ask BigQuery to build the pivot statement. The generated text comes back
# in the single unnamed result column, which BigQuery names 'f0_'.
sql = client.query(sql_with + sql_pivot).to_dataframe()['f0_'].iloc[0]
if not isinstance(sql, str):
    # STRING_AGG yields NULL when there are no codes of interest, which makes the
    # whole CONCAT NULL; fail with a clear message instead of a confusing TypeError.
    raise ValueError(
        "No SNOMED-CT codes of interest were found, so the pivot query could not be built."
    )

# Step 2: run the generated pivot statement to obtain the n-by-p feature-set array
# (one row per patient, one count column per code of interest).
featureSet_array = client.query(sql_with + sql).to_dataframe()
featureSet_array.head()
KeyboardInterrupt: 

## Create the Feature Set ID table.
This table is a look-up table of feature-set IDs that shows which features make up the feature set. The table is instantiated on the assumption that feature sets will include no more than five features.

In [5]:
# Start from an empty look-up table: one ID column plus up to five feature slots.
featureSet_ID_table = pandas.DataFrame(
    columns = [
        'Feature set ID',
        'Feature Set 1',
        'Feature Set 2',
        'Feature Set 3',
        'Feature Set 4',
        'Feature Set 5',
    ]
)
# Every column of the feature-set array other than 'person_id' is a single-feature
# feature set: it serves as its own ID and fills the first feature slot.
single_feature_names = featureSet_array.columns[featureSet_array.columns != 'person_id']
featureSet_ID_table['Feature set ID'] = single_feature_names
featureSet_ID_table['Feature Set 1'] = single_feature_names