# Database feature sets

The purpose of this notebook is to create the array of feature sets suggested by the available SNOMED-CT codes in the Connected Bradford primary care data table.

The database feature sets must be handled differently to the other feature sets because there are so many feature sets in the database. I will not define an fs_database like fs_clinician or fs_literature, where each feature set is represented by a column in a pandas.DataFrame with a row for each patient. Instead, I will define an n-by-1 pandas.DataFrame containing the list of feature sets (i.e. SNOMED-CT codes) that satisfy the boundary filter of occurring neither too frequently nor too infrequently. This list will be used directly in a script that applies the mutual-information filter.

### Imports and helper functions

In [3]:
%run 'UNSEEN_helper_functions.ipynb'
%store -r

## Load prerequisites

In [2]:
%%capture
# The %%capture cell magic above captures this cell's output, so nothing is shown.
# Run the caseness-variables notebook only when `caseness_array` is not already
# present in the global namespace.
# NOTE(review): presumably `caseness_array` is created by that notebook or restored
# by an earlier `%store -r` - confirm.
if 'caseness_array' not in globals():
    print("not here")
    %run ./"UNSEEN_create_caseness_variables.ipynb"

# Instantiate BigQuery client.
client = bigquery.Client()

## Query database for individual SNOMED-CT codes
The first feature sets to be assessed are individual SNOMED-CT codes found in the Connected Bradford primary care table.

The tables output below show the counts of patient records in which unique SNOMED-CT codes occur. The first table provides counts aggregated in ranges from $<10$ to $>10,000,000$ by factors of 10. The second set of tables presents counts aggregated within the bounds defined by the arguments made previously, for each caseness variable.

In [4]:
# Redaction threshold and target rounding number.
# (A `global` statement at module level has no effect in Python, so none is used here;
# plain assignment in a cell already creates notebook-wide names.)
redaction_threshold = 7
target_round = 10

# BigQuery scripting preamble that declares the two values above as SQL variables.
# It is prepended to the SQL text of the queries sent from this notebook.
sql_variables = f"""
DECLARE redaction_threshold INT64 DEFAULT {redaction_threshold};
DECLARE target_round INT64 DEFAULT {target_round};
"""

# Shared SQL preamble: the common table expressions that the queries below build on.
#   tbl_persons           - distinct person_id values for people aged 18-70, where age is
#                           taken as the current calendar year minus year_of_birth.
#   tbl_patients_per_code - for each SNOMED-CT code, the number of distinct cohort
#                           patients with that code.
# `server_id` and `database_id` are defined outside this cell.
# NOTE(review): the RIGHT JOIN keeps cohort patients that have no matching tbl_srcode
# row, which presumably produces a group whose snomedcode is NULL - confirm that this
# is intended before relying on the per-code counts.
sql_base = f"""
WITH
tbl_persons AS (
SELECT
    DISTINCT person_id
FROM
    {server_id}.{database_id}.person
# Limiting to age range 18-70.
WHERE
    (EXTRACT(YEAR FROM CURRENT_DATE()) - year_of_birth) BETWEEN 18 AND 70
)
,tbl_patients_per_code AS (
SELECT
    DISTINCT a.snomedcode,
    COUNT(DISTINCT tbl_persons.person_id) AS count_patients_with_code
FROM
    {server_id}.{database_id}.tbl_srcode AS a    
RIGHT JOIN
    tbl_persons
    ON a.person_id = tbl_persons.person_id
GROUP BY
    a.snomedcode
ORDER BY
    count_patients_with_code DESC
)
"""

# Final SELECT for the overview table: label every SNOMED-CT code with the
# order-of-magnitude band of its patient count, then count the codes in each band.
sql_full_table = """
,tbl_category_full AS
(
SELECT
  DISTINCT snomedcode
  ,CASE
    WHEN count_patients_with_code < 10 THEN "<10"
    WHEN count_patients_with_code < 100 THEN "10 =< code < 100"
    WHEN count_patients_with_code < 1000 THEN "100 =< code < 1,000"
    WHEN count_patients_with_code < 10000 THEN "1,000 =< code < 10,000"
    WHEN count_patients_with_code < 100000 THEN "10,000 =< code < 100,000"
    WHEN count_patients_with_code < 1000000 THEN "100,000 =< code < 1,000,000"
    WHEN count_patients_with_code < 10000000 THEN "1,000,000 =< code < 10,000,000"
    WHEN count_patients_with_code >= 10000000 THEN "code >= 10,000,000"
  END AS cnt_SNOMED
FROM tbl_patients_per_code
ORDER BY cnt_SNOMED
)

SELECT
  COUNT(cnt_SNOMED) AS This_many_codes__
  ,cnt_SNOMED AS __occur_for_this_many_patients
FROM tbl_category_full
GROUP BY cnt_SNOMED
ORDER BY This_many_codes__ DESC
"""

# Assemble the script (variable declarations, shared CTEs, final SELECT), run it
# and show the resulting table.
full_Table = client.query(
    "".join([sql_variables, sql_base, sql_full_table])
).to_dataframe()
display(full_Table)

Unnamed: 0,This_many_codes__,__occur_for_this_many_patients
0,40321,<10
1,23220,10 =< code < 100
2,12332,"100 =< code < 1,000"
3,5637,"1,000 =< code < 10,000"
4,1447,"10,000 =< code < 100,000"
5,155,"100,000 =< code < 1,000,000"


In [6]:
def _print_and_review(label, lower_bound, upper_bound):
    """Print the quoted caseness label, then run the boundary review for these bounds.

    Returns whatever `databasefsboundaryreview` returns; the result is stored below
    under an `*_n_within_bounds` name (presumably the count of codes within bounds).
    """
    print(f"'{label}'")
    return databasefsboundaryreview(lower_bound = lower_bound, upper_bound = upper_bound)


# One review per caseness variable, in the same order as before.
possible_n_within_bounds = _print_and_review(
    'Possible caseness', possibleCaseness_count_LB, possibleCaseness_count_UB
)
definite_n_within_bounds = _print_and_review(
    'Definite caseness', definiteCaseness_count_LB, definiteCaseness_count_UB
)
multinomial_n_within_bounds = _print_and_review(
    'Multinomial caseness', multinomialCaseness_count_LB, multinomialCaseness_count_UB
)
possdef_n_within_bounds = _print_and_review(
    'Possible-vs-Definite caseness', possdefCaseness_count_LB, possdefCaseness_count_UB
)

'Possible caseness'


Unnamed: 0,This_many_codes__,__occur_this_often
0,82191,"too infrequent (occurs in < 18,950 patients' records)"
1,482,within bounds
2,439,"too frequent (occurs in > 37,890 patients' records)"


'Definite caseness'


Unnamed: 0,This_many_codes__,__occur_this_often
0,74888,too infrequent (occurs in < 770 patients' records)
1,5772,"too frequent (occurs in > 1,530 patients' records)"
2,2452,within bounds


'Multinomial caseness'


Unnamed: 0,This_many_codes__,__occur_this_often
0,82222,"too infrequent (occurs in < 19,710 patients' records)"
1,468,within bounds
2,422,"too frequent (occurs in > 39,430 patients' records)"


'Possible-vs-Definite caseness'


Unnamed: 0,This_many_codes__,__occur_this_often
0,74888,too infrequent (occurs in < 770 patients' records)
1,5772,"too frequent (occurs in > 1,530 patients' records)"
2,2452,within bounds


In [8]:
# Display message.
# The narrative sentences are hard-coded, so re-check them against the tables above
# whenever the underlying data or the bounds change. Only the counts in the table
# are filled in from the `*_n_within_bounds` variables.
display(
    Markdown(
f"""
The first table above shows that most SNOMED-CT codes occur infrequently in patients' records, with a
handful of codes showing up in many patients' records. The second set of tables shows that, for the 'Possible' and 'Multinomial' caseness variables, almost as many codes occur too frequently as occur within our bounds.

#### Interim conclusion
We can infer that the following counts of feature sets (defined solely by the presence of a single SNOMED-CT code) might be
informative of the various casenesses of complex mental health difficulties, in our particular cohort within the Connected Bradford dataset.

| Caseness cohort          |     # feature sets within bounds     |
| ------------------------ | ------------------------------------ |
| Possible                 | {possible_n_within_bounds:,}    |
| Definite                 | {definite_n_within_bounds:,}    |
| Multinomial              | {multinomial_n_within_bounds:,} |
| Possible-vs-Definite     | {possdef_n_within_bounds:,}    |

"""
    )
)


The first table above shows that most SNOMED-CT codes occur infrequently in patients' records, with a
handfull of codes showing up in many patient's records. The second table shows that almost as many codes occur too infrequently as occur within our bounds.

#### Interim conclusion
We can infer that the following counts of feature sets (defined solely by the presence of a single SNOMED-CT code) might be
informative of the various casenesses of complex mental health difficulties, in our particular cohort within the Connected Bradford dataset.

| Caseness cohort          |     # feature sets within bounds     |
| ------------------------ | ------------------------------------ |
| Possible                 | 482    |
| Definite                 | 2,452    |
| Multinomial              | 468 |
| Possible-vs-Definite     | 2,452    |



## Making a list of the single-feature feature sets of interest
The following code defines a list of SNOMED-CT codes (that appear in our cohort from the Connected Bradford dataset) that we will carry forward as single-feature feature sets.

In [11]:
possible_sql_singleFS_select = \
"""
SELECT
    CAST(snomedcode AS INT64) AS snomedcode
FROM
    tbl_patients_per_code
WHERE
    count_patients_with_code BETWEEN """ + str(possibleCaseness_count_LB) + """ AND """ + str(possibleCaseness_count_UB) + """
ORDER BY
    snomedcode
"""
global df_fs_database_possible
df_fs_database_possible = client.query(sql_variables + sql_base + possible_sql_singleFS_select).to_dataframe().sort_values(by=['snomedcode']).reset_index(drop=True)
display(df_fs_database_possible)
%store df_fs_database_possible


definite_sql_singleFS_select = \
"""
SELECT
    CAST(snomedcode AS INT64) AS snomedcode
FROM
    tbl_patients_per_code
WHERE
    count_patients_with_code BETWEEN """ + str(definiteCaseness_count_LB) + """ AND """ + str(definiteCaseness_count_UB) + """
ORDER BY
    snomedcode
"""
global df_fs_database_definite
df_fs_database_definite = client.query(sql_variables + sql_base + definite_sql_singleFS_select).to_dataframe().sort_values(by=['snomedcode']).reset_index(drop=True)
display(df_fs_database_definite)
%store df_fs_database_definite


multinomial_sql_singleFS_select = \
"""
SELECT
    CAST(snomedcode AS INT64) AS snomedcode
FROM
    tbl_patients_per_code
WHERE
    count_patients_with_code BETWEEN """ + str(multinomialCaseness_count_LB) + """ AND """ + str(multinomialCaseness_count_UB) + """
ORDER BY
    snomedcode
"""
global df_fs_database_multinomial
df_fs_database_multinomial = client.query(sql_variables + sql_base + multinomial_sql_singleFS_select).to_dataframe().sort_values(by=['snomedcode']).reset_index(drop=True)
display(df_fs_database_multinomial)
%store df_fs_database_multinomial


possdef_sql_singleFS_select = \
"""
SELECT
    CAST(snomedcode AS INT64) AS snomedcode
FROM
    tbl_patients_per_code
WHERE
    count_patients_with_code BETWEEN """ + str(possdefCaseness_count_LB) + """ AND """ + str(possdefCaseness_count_UB) + """
ORDER BY
    snomedcode
"""
global df_fs_database_possdef
df_fs_database_possdef = client.query(sql_variables + sql_base + possdef_sql_singleFS_select).to_dataframe().sort_values(by=['snomedcode']).reset_index(drop=True)
display(df_fs_database_possdef)
%store df_fs_database_possdef

Unnamed: 0,snomedcode
0,967006
1,3457005
2,3895009
3,4556007
4,5880005
...,...
477,1052781000000102
478,1084371000000102
479,1084421000000109
480,1085101000000107


Stored 'df_fs_database_possible' (DataFrame)


Unnamed: 0,snomedcode
0,286009
1,844005
2,1085006
3,1126007
4,1225002
...,...
2447,1087611000000108
2448,1091881000000109
2449,1092221000000101
2450,1094801000000102


Stored 'df_fs_database_definite' (DataFrame)


Unnamed: 0,snomedcode
0,3457005
1,3895009
2,4556007
3,5880005
4,6020002
...,...
463,1052671000000108
464,1084371000000102
465,1084421000000109
466,1085101000000107


Stored 'df_fs_database_multinomial' (DataFrame)


Unnamed: 0,snomedcode
0,286009
1,844005
2,1085006
3,1126007
4,1225002
...,...
2447,1087611000000108
2448,1091881000000109
2449,1092221000000101
2450,1094801000000102


Stored 'df_fs_database_possdef' (DataFrame)
