# Interview feature sets
The purpose of this notebook is to create the array of feature sets suggested by the interviews with GPs.

### Imports and helper functions

In [120]:
%run 'UNSEEN_helper_functions.ipynb'
%store -r

## Load codelist CSV files.
We used opencodelist.org to define codelists that define the set of SNOMED-CT codes used to identify patients based on various attributes.

In [4]:
# Instaniate BigQuery client.
client = bigquery.Client()

# Set folder location.
folder_loc = os.path.dirname(os.path.abspath("UNSEEN create clinician feature sets.ipynb"))
folder = folder_loc + '/codelists/'

# Clinical codes of interest.
codes_to_query_DNA = pandas.read_csv(folder + "ciaranmci-did-not-attend-098119da.csv")

## Load prerequisites

In [128]:
%%capture
if 'caseness_array' not in globals():
    print("not here")
    %run ./"UNSEEN_create_caseness_variables.ipynb"
%store -r

## Query database for base feature sets
The code below returns `fs_interview` that contains the base feature sets.

### Feature sets potentially indicative of "chaotic life"

#### Counts and ratios of appointments and did-not-attend events

In [129]:
sql_CTEs_top = """
WITH
# The first CTE will specify the 'spine' of the data table by selecting the unique list of person IDs.
tbl_persons AS (
    SELECT
        DISTINCT person_id
        ,year_of_birth
    FROM
        """ + server_id + """.""" + database_id + """.person
    # Limiting to age range 18-70.
    WHERE
        (EXTRACT(YEAR FROM CURRENT_DATE()) - year_of_birth) BETWEEN 18 AND 70
)

# The following CTEs extract each clinical codelist into a SQL table before querying the person_ID 
# associated with the clinical codes.
#
"""

sql_CTEs_body = \
"""
#  ## Count of appointments in the previous year.
,tbl_countAppointmentsPreviousYear_persons AS ( 
    SELECT 
        DISTINCT person_id
        ,COUNT( DISTINCT EXTRACT(DATE FROM datestart) ) AS countAppointmentsPreviousYear
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srappointment
    WHERE
        DATE_DIFF(CURRENT_DATE(), datestart, YEAR) <= 1
    GROUP BY
        person_id
    ORDER BY
        person_id
)
# ## Median annual count of appointments.
,tbl_annualCountOfAppointments AS (
    SELECT 
        DISTINCT person_id
        ,EXTRACT(YEAR FROM datestart) AS year_appointment
        ,COUNT( DISTINCT EXTRACT(DATE FROM datestart) ) AS countAppointmentsPerYear
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srappointment
    GROUP BY
        person_id
        ,year_appointment
    ORDER BY
        person_id
        ,year_appointment
)
,tbl_medianAnnualCountAppointments_persons AS (
    SELECT
        DISTINCT person_id
        ,PERCENTILE_DISC(countAppointmentsPerYear, 0.5) OVER(PARTITION BY person_id) AS medianAnnualCountAppointments
    FROM
        tbl_annualCountOfAppointments
    ORDER BY
        person_id
)
#  ## Count of Did-Not-Attend (DNA) in the previous year.
,tbl_DNAcodes AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_DNA["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_countDNAsPreviousYear_persons AS ( 
    SELECT 
      DISTINCT a.person_id
     ,COUNT(person_id) AS countDNAsPreviousYear
    FROM
      """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_DNAcodes
    WHERE
      a.snomedcode IN (tbl_DNAcodes.snomedcode)
      AND DATE_DIFF(CURRENT_DATE(), dateevent, YEAR) <= 1
    GROUP BY
        person_id
)
# ## Median annual count of Did-Not-Attend (DNA).
,tbl_annualCountOfDNAs AS ( 
    SELECT 
        DISTINCT a.person_id
        ,EXTRACT(YEAR FROM dateevent) AS year_DNA
        ,COUNT( DISTINCT EXTRACT(DATE FROM dateevent) ) AS countDNAsPerYear
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_DNAcodes
    WHERE
        a.snomedcode IN (tbl_DNAcodes.snomedcode)
    GROUP BY
        person_id
        ,year_DNA
)
,tbl_medianAnnualCountDNAs_persons AS (
    SELECT
        DISTINCT tbl_annualCountOfDNAs.person_id
        ,PERCENTILE_DISC(countDNAsPerYear, 0.5) OVER(PARTITION BY person_id) AS medianAnnualCountDNAs
    FROM
        tbl_annualCountOfDNAs
    ORDER BY
        person_id
)
# ## Ratio of annual counts of Did-Not-Attend (DNA) to appointment, in the previous year.
,tbl_ratioDNAtoAppointmentPreviousYear_persons AS (
    SELECT
        DISTINCT tbl_countDNAsPreviousYear_persons.person_id
        ,(countDNAsPreviousYear / countAppointmentsPreviousYear) AS ratioDNAtoAppointmentPreviousYear
    FROM
        tbl_countDNAsPreviousYear_persons
    LEFT OUTER JOIN tbl_countAppointmentsPreviousYear_persons ON tbl_countDNAsPreviousYear_persons.person_id = tbl_countAppointmentsPreviousYear_persons.person_id
        
)
# ## Median annual ratio of DNA to appointments
,tbl_medianAnnualRatioDNAtoAppointment_persons AS (
    SELECT
        DISTINCT tbl_annualCountOfDNAs.person_id
        ,PERCENTILE_DISC( (countDNAsPerYear / countAppointmentsPerYear), 0.5) OVER(PARTITION BY tbl_annualCountOfDNAs.person_id) AS medianAnnualRatioDNAtoAppointment
    FROM
        tbl_annualCountOfDNAs
    LEFT OUTER JOIN
        tbl_annualCountOfAppointments
        ON
        (
        tbl_annualCountOfDNAs.person_id = tbl_annualCountOfAppointments.person_id
        AND tbl_annualCountOfDNAs.year_DNA = tbl_annualCountOfAppointments.year_appointment
        )
)

#######################################################

"""

sql_final_select = \
"""
# Finally, we use the above CTEs to define a table with one row per patient and one column for each
# feature set. The feature-set columns are populated by interger values with '1' indicating that the
# feature set is satisfied and '0' indicating otherwise.
SELECT
    DISTINCT tbl_persons.person_id
    ,countAppointmentsPreviousYear
    ,medianAnnualCountAppointments
    ,countDNAsPreviousYear
    ,medianAnnualCountDNAs
    ,ratioDNAtoAppointmentPreviousYear
    ,medianAnnualRatioDNAtoAppointment
    
FROM tbl_persons
LEFT OUTER JOIN tbl_countAppointmentsPreviousYear_persons ON tbl_persons.person_id = tbl_countAppointmentsPreviousYear_persons.person_id
LEFT OUTER JOIN tbl_medianAnnualCountAppointments_persons ON tbl_persons.person_id = tbl_medianAnnualCountAppointments_persons.person_id
LEFT OUTER JOIN tbl_countDNAsPreviousYear_persons ON tbl_persons.person_id = tbl_countDNAsPreviousYear_persons.person_id
LEFT OUTER JOIN tbl_medianAnnualCountDNAs_persons ON tbl_persons.person_id = tbl_medianAnnualCountDNAs_persons.person_id
LEFT OUTER JOIN tbl_ratioDNAtoAppointmentPreviousYear_persons ON tbl_persons.person_id = tbl_ratioDNAtoAppointmentPreviousYear_persons.person_id
LEFT OUTER JOIN tbl_medianAnnualRatioDNAtoAppointment_persons ON tbl_persons.person_id = tbl_medianAnnualRatioDNAtoAppointment_persons.person_id

ORDER BY tbl_persons.person_id
"""

fs_interview = client.query(sql_CTEs_top + sql_CTEs_body + sql_final_select).to_dataframe().fillna(0)

#### Entropy-based feature sets potentially indicative of "chaotic life"

The data from BigQuery needs to be appointments and DNAs tallied in three-month blocks, per person. Specifically, I use BigQuery's built-in `QUARTER()` function for which Q1 = Jan-Mar, Q2 = Apr-Jun, etc.  The query will only return data for quarters in which there was an appointment or a DNA. Each patient's data will be processed in Python to fill in the missing quarters' counts with 0 before calculating the values of the entropy-based feature sets.

In [130]:
sql_CTEs_body = \
"""
WITH
#  ## Count of appointments, per quarter.
tbl_countAppointmentsPerQuarter AS (
    SELECT
        DISTINCT person_id
        ,EXTRACT(YEAR FROM datestart) AS year_appointment
        ,EXTRACT(QUARTER FROM datestart) AS quarter_appointment
        ,COUNT(DISTINCT datestart) AS countAppointmentsPerQuarter
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srappointment
    GROUP BY
         person_id
        ,year_appointment
        ,quarter_appointment

)
"""

sql_final_select = \
"""
SELECT
    person_id
    ,year_appointment
    ,quarter_appointment
    ,countAppointmentsPerQuarter
FROM
    tbl_countAppointmentsPerQuarter
ORDER BY
    person_id
    ,year_appointment
    ,quarter_appointment
"""

bq_countAppointmentsPerQuarter = client.query(sql_CTEs_body + sql_final_select).to_dataframe().fillna(0).astype(int)

In [131]:
sql_CTEs_body = \
"""
WITH
#  ## Count of did-not-attend (DNA), per quarter.
tbl_DNAcodes AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_DNA["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_countDNAsPerQuarter AS ( 
    SELECT 
        DISTINCT a.person_id
        ,EXTRACT(YEAR FROM dateevent) AS year_DNA
        ,EXTRACT(QUARTER FROM dateevent) AS quarter_DNA
        ,COUNT( DISTINCT EXTRACT(DATE FROM dateevent) ) AS countDNAsPerQuarter
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_DNAcodes
    WHERE
        a.snomedcode IN (tbl_DNAcodes.snomedcode)
    GROUP BY
        person_id
        ,year_DNA
        ,quarter_DNA
)
"""

sql_final_select = \
"""
SELECT
    person_id
    ,year_DNA
    ,quarter_DNA
    ,countDNAsPerQuarter
FROM
    tbl_countDNAsPerQuarter
ORDER BY
    person_id
    ,year_DNA
    ,quarter_DNA
"""

bq_countDNAsPerQuarter = client.query(sql_CTEs_body + sql_final_select).to_dataframe().fillna(0).astype(int)

In [152]:
# Make a FOR loop that will loop through patients to 1) produce their timeline, 2) join their actual appointment/DNA data, 
# and 3) create the entropy-based feature sets.
# This for loop will later be parallelised using the `multiprocessing` library.
#
# Set storage.
ls_entropyBased_fs_appts = [['activeInformation', 'entropyRate', 'spectralEntropy', 'sampleEntropy', 'eoe', 'averageEntropy', 'bubbleEntropy']]
ls_entropyBased_fs_DNAs = [['activeInformation', 'entropyRate', 'spectralEntropy', 'sampleEntropy', 'eoe', 'averageEntropy', 'bubbleEntropy']]
ls_pids = set(numpy.concatenate((bq_countAppointmentsPerQuarter.head(100).person_id.unique(), bq_countDNAsPerQuarter.head(100).person_id.unique())))

for pid in ls_pids:
    # Extract this particular patient's range of active years.
    pt_years = \
        bq_countAppointmentsPerQuarter.loc[bq_countAppointmentsPerQuarter.person_id == pid, 'year_appointment'].append(
         bq_countDNAsPerQuarter.loc[bq_countDNAsPerQuarter.person_id == pid, 'year_DNA'])
    
    pt_years_lsrange =  pandas.DataFrame(
        data = { 'year' : list( range( min(pt_years), max(pt_years) ) ) }
        )
    # Create a timeline of years and quarters for this particular patient.
    pt_quarters = pandas.DataFrame( data = {'qtr': [1,2,3,4]} )
    pt_timeline = pt_years_lsrange.merge(pt_quarters, how = 'cross')
    
    # Join the patient's actual count of appointments-per-quarter-per-year to their timeline.
    pt_appts = bq_countAppointmentsPerQuarter.loc[bq_countAppointmentsPerQuarter.person_id == pid, :]
    pt_timeline_appts = \
        pandas.merge(pt_timeline, pt_appts, how = 'left',
                     left_on = ['year', 'qtr'],
                     right_on = ['year_appointment',
                                 'quarter_appointment']).loc[:,'countAppointmentsPerQuarter'].fillna(0).astype(int)
    
    # Repeat for did-not-attend events.
    pt_DNAs = bq_countDNAsPerQuarter.loc[bq_countDNAsPerQuarter.person_id == pid, :]
    pt_timeline_DNAs = \
        pandas.merge(pt_timeline, pt_DNAs, how = 'left',
                     left_on = ['year', 'qtr'],
                     right_on = ['year_DNA',
                                 'quarter_DNA']).loc[:,'countDNAsPerQuarter'].fillna(0).astype(int)

    # Create the entropy-based feature sets.
    # ...
    pt_entropyStats_appts = chaoticlifeentropyfs(pt_timeline_appts)
    ls_entropyBased_fs_appts.append(pt_entropyStats_appts)
    pt_entropyStats_DNAs = chaoticlifeentropyfs(pt_timeline_DNAs)
    ls_entropyBased_fs_DNAs.append(pt_entropyStats_DNAs)

# Convert the nested list into a pandas.DataFrame.
entropyBasedFS = pandas.DataFrame(ls_entropyBased_fs_appts[1:], columns = ls_entropyBased_fs_appts[0])
entropyBasedFS.insert(0, 'person_id', bq_countAppointmentsPerQuarter.head(100).person_id.unique())

entropyBasedFS_DNAs = pandas.DataFrame(ls_entropyBased_fs_DNAs[1:], columns = ls_entropyBased_fs_DNAs[0])
entropyBasedFS_DNAs.insert(0, 'person_id', bq_countDNAsPerQuarter.head(100).person_id.unique())

entropyBasedFS.merge(entropyBasedFS_DNAs
                    ,how = 'outer'
                    ,on = 'person_id')

  Pxx = Pt[:Fx]/sum(Pt[:Fx])


ValueError: Length of values (6) does not match length of index (38)

In [23]:
# Display message.
display(
    Markdown(
f"""
## Filter feature sets not within bounds

All feature sets must have a prevalence (or count) between the prevalence (or count) bounds defined in `UNSEEN_feature_sets_prevalence_bounds.ipynb`,
i.e. a feature set's count must satisfy:
- ${int(possibleCaseness_count_LB):,}\le$ $count\ of\ patients_{{feature\ set_{{i}}}}$ $\le{int(possibleCaseness_count_UB):,}$, for 'Possible caseness'
- ${int(definiteCaseness_count_LB):,}\le$ $count\ of\ patients_{{feature\ set_{{i}}}}$ $\le{int(definiteCaseness_count_UB):,}$, for 'Definite caseness'
- ${int(multinomialCaseness_count_LB):,}\le$ $count\ of\ patients_{{feature\ set_{{i}}}}$ $\le{int(multinomialCaseness_count_UB):,}$, for 'Multinomial caseness'
- ${int(possdefCaseness_count_LB):,}\le$ $count\ of\ patients_{{feature\ set_{{i}}}}$ $\le{int(possdefCaseness_count_UB):,}$, for 'Possible-vs-Definite caseness'
"""
       )
)


## Filter feature sets not within bounds

All feature sets must have a prevalence (or count) between the prevalence (or count) bounds defined in `UNSEEN_feature_sets_prevalence_bounds.ipynb`,
i.e. a feature set's count must satisfy:
- $18,950\le$ $count\ of\ patients_{feature\ set_{i}}$ $\le37,890$, for 'Possible caseness'
- $770\le$ $count\ of\ patients_{feature\ set_{i}}$ $\le1,530$, for 'Definite caseness'
- $19,710\le$ $count\ of\ patients_{feature\ set_{i}}$ $\le39,430$, for 'Multinomial caseness'
- $770\le$ $count\ of\ patients_{feature\ set_{i}}$ $\le1,530$, for 'Possible-vs-Definite caseness'


In [29]:
fs_interview_filteredPossible = boundaryfilter(my_featureSet_array = fs_interview, caseness = 'possible', verbose = True)[0]
print("\nThe final list of feature sets from this source is:\n", fs_interview_filteredPossible.columns.values[1:])
fs_interview_filteredDefinite = boundaryfilter(my_featureSet_array = fs_interview, caseness = 'definite', verbose = True)[0]
print("\nThe final list of feature sets from this source is:\n", fs_interview_filteredDefinite.columns.values[1:])
fs_interview_filteredMulti = boundaryfilter(my_featureSet_array = fs_interview, caseness = 'multi', verbose = True)[0]
print("\nThe final list of feature sets from this source is:\n", fs_interview_filteredMulti.columns.values[1:])
fs_interview_filteredPossDef = boundaryfilter(my_featureSet_array = fs_interview, caseness = 'possdef', verbose = True)[0]
print("\nThe final list of feature sets from this source is:\n", fs_interview_filteredPossDef.columns.values[1:])



# Store clinician feature sets for use in other notebooks.
%store fs_interview_filteredPossible fs_interview_filteredDefinite fs_interview_filteredMulti fs_interview_filteredPossDef


 Filtering complete for 'Possible caseness'...
	 1  feature sets remain.
	 4  feature sets removed, in total.
	 4  feature sets removed because of low prevalence.
	 0  feature sets removed because of high prevalence.

The final list of feature sets from this source is:
 ['sleepDisturbance']

 Filtering complete for 'Definite caseness'...
	 0  feature sets remain.
	 5  feature sets removed, in total.
	 1  feature sets removed because of low prevalence.
	 4  feature sets removed because of high prevalence.

The final list of feature sets from this source is:
 []

 Filtering complete for 'Multi caseness'...
	 1  feature sets remain.
	 4  feature sets removed, in total.
	 4  feature sets removed because of low prevalence.
	 0  feature sets removed because of high prevalence.

The final list of feature sets from this source is:
 ['sleepDisturbance']

 Filtering complete for 'Possible-vs-Definite caseness'...
	 0  feature sets remain.
	 5  feature sets removed, in total.
	 1  feature sets r

# ---------------------------------------------------------------------------------