# Add feature sets of entropy statistics.

The purpose of this notebook is to append feature sets that are defined by the value of entropy statistics.

This notebook is expected to be called by its parent `UNSEEN_create_feature_sets_base.ipynb`. It will not run without the requisites loaded by the parent notebook.

## Refresh store.

In [1]:
# Get helper functions.
# `%run` executes the helper notebook so that whatever it defines becomes available in this kernel.
%run 'UNSEEN_helper_functions.ipynb'
# Refresh stored variables, if they are present.
# `%store -r` restores every variable previously saved with `%store`.
# NOTE(review): later cells use names that are not defined in this notebook (e.g. `server_id`,
# `database_id`, `sql_declarations`, `sql_studyPopulation`, `caseness_array`, `feature_set_array`,
# `chaoticlifeentropyfs`); presumably they come from the helper notebook, the store, or the
# parent notebook - confirm before running this notebook on a fresh kernel.
%store -r

## Load requisites

In [2]:
# Locate the codelists folder.
# `abspath` resolves the relative notebook name against the current working directory,
# so `folder_loc` ends up being the directory the kernel is running in.
folder_loc = os.path.dirname(
    os.path.abspath("UNSEEN_create_clinician_feature_sets.ipynb")
)
folder = f"{folder_loc}/codelists/"

# Clinical codes of interest: the unique values of the `code` column of the
# did-not-attend codelist.
codes_to_query_DNA = set(
    pandas.read_csv(folder + "ciaranmci-did-not-attend-098119da.csv")["code"]
)

### Define functions.

## Entropy-based feature sets potentially indicative of "chaotic life"

The data returned from BigQuery need to be counts of appointments and did-not-attends (DNAs) tallied in three-month blocks, per person. Specifically, I use BigQuery's built-in `EXTRACT(QUARTER FROM ...)`, for which Q1 = Jan-Mar, Q2 = Apr-Jun, etc. The query will only return data for quarters in which there was an appointment or a DNA. Each patient's data will be processed in Python to fill in the missing quarters' counts with 0 before calculating the values of the entropy-based feature sets.

##### SQL syntax for appointments.

In [None]:
sql_CTEs_body = \
"""
#  ## Count of appointments, per quarter.
,tbl_countAppointmentsPerQuarter AS (
    SELECT
        DISTINCT person_id
        ,EXTRACT(YEAR FROM datestart) AS year_appointment
        ,EXTRACT(QUARTER FROM datestart) AS quarter_appointment
        ,COUNT(DISTINCT datestart) AS countAppointmentsPerQuarter
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srappointment
    GROUP BY
         person_id
        ,year_appointment
        ,quarter_appointment

)
"""

sql_final_select = \
"""
SELECT
    tbl_studyPopulation_no_caseness.person_id
    ,year_appointment
    ,quarter_appointment
    ,countAppointmentsPerQuarter
FROM
    tbl_studyPopulation_no_caseness
LEFT JOIN
    tbl_countAppointmentsPerQuarter
    ON tbl_studyPopulation_no_caseness.person_id = tbl_countAppointmentsPerQuarter.person_id
ORDER BY
    person_id
    ,year_appointment
    ,quarter_appointment
"""

global bq_countAppointmentsPerQuarter
bq_countAppointmentsPerQuarter = pandas.read_gbq(sql_declarations + sql_studyPopulation + sql_CTEs_body + sql_final_select).fillna(0).astype(int)
# Double-check that all exclusions have been applied.
bq_countAppointmentsPerQuarter.drop(bq_countAppointmentsPerQuarter[~bq_countAppointmentsPerQuarter.person_id.isin(caseness_array.person_id)].index, inplace=True)
# Remove rows where a patient does not have any did-not-attends.
bq_countAppointmentsPerQuarter = bq_countAppointmentsPerQuarter[bq_countAppointmentsPerQuarter.year_appointment != 0]
%store bq_countAppointmentsPerQuarter

##### SQL syntax for did-not-attends (DNAs).

In [7]:
sql_CTEs_body = \
"""
#  ## Count of did-not-attend (DNA), per quarter.
,tbl_DNAcodes AS ( 
    SELECT
        snomedcode
    FROM
        UNNEST([
                '""" + '\', \''.join(map(str, codes_to_query_DNA["code"].tolist())) + """'
                ]) AS snomedcode
)
,tbl_countDNAsPerQuarter AS ( 
    SELECT 
        DISTINCT person_id
        ,EXTRACT(YEAR FROM dateevent) AS year_DNA
        ,EXTRACT(QUARTER FROM dateevent) AS quarter_DNA
        ,COUNT( DISTINCT EXTRACT(DATE FROM dateevent) ) AS countDNAsPerQuarter
    FROM
        """ + server_id + """.""" + database_id + """.tbl_srcode AS a, tbl_DNAcodes
    WHERE
        a.snomedcode IN (tbl_DNAcodes.snomedcode)
    GROUP BY
        person_id
        ,year_DNA
        ,quarter_DNA
)
"""

sql_final_select = \
"""
SELECT
    tbl_studyPopulation_no_caseness.person_id
    ,year_DNA
    ,quarter_DNA
    ,countDNAsPerQuarter
FROM
    tbl_studyPopulation_no_caseness
LEFT JOIN
    tbl_countDNAsPerQuarter
    ON tbl_studyPopulation_no_caseness.person_id = tbl_countDNAsPerQuarter.person_id
ORDER BY
    person_id
    ,year_DNA
    ,quarter_DNA
"""

global bq_countDNAsPerQuarter
bq_countDNAsPerQuarter = pandas.read_gbq(sql_declarations + sql_studyPopulation + sql_CTEs_body + sql_final_select).fillna(0).astype(int)
# Double-check that all exclusions have been applied.
bq_countDNAsPerQuarter.drop(bq_countDNAsPerQuarter[~bq_countDNAsPerQuarter.person_id.isin(caseness_array.person_id)].index, inplace=True)
# Remove rows where a patient does not have any did-not-attends.
bq_countDNAsPerQuarter = bq_countDNAsPerQuarter[bq_countDNAsPerQuarter.year_DNA != 0]
%store bq_countDNAsPerQuarter

Stored 'bq_countDNAsPerQuarter' (DataFrame)


##### Calculating entropy statistics, using FOR loop

In [None]:
# Largest per-quarter event count passed to the entropy calculations.
# This fudge exists because the Python kernel in Google Cloud Platform dies trying
# to allocate memory to more than 72 states, as part of the entropy calculations.
MAX_EVENTS_PER_QUARTER = 72

pid_processed = []
# Set iterator: every patient with at least one appointment row or one DNA row.
ls_pids = sorted(set(numpy.concatenate((bq_countAppointmentsPerQuarter.person_id.unique(),
                                        bq_countDNAsPerQuarter.person_id.unique()))))

# Set storage.
ls_entropyBasedFS = []

# Set counter of patients who are capped.
cap_counter = 0

# The four quarters are the same for every patient, so build them once.
pt_quarters = pandas.DataFrame( data = {'qtr': [1,2,3,4]} )

# Do the work.
t1 = time.time()
for pid in tqdm.notebook.tqdm_notebook(ls_pids, unit = " patients"):
    # This patient's rows from each query result.
    pt_appts = bq_countAppointmentsPerQuarter.loc[bq_countAppointmentsPerQuarter.person_id == pid, :]
    pt_DNAs = bq_countDNAsPerQuarter.loc[bq_countDNAsPerQuarter.person_id == pid, :]

    # All years in which the patient had an appointment or a DNA.
    # `pandas.concat` is used because `Series.append` was removed in pandas 2.0.
    pt_years = pandas.concat([pt_appts['year_appointment'], pt_DNAs['year_DNA']])

    # Every year from the first to the last year of activity, inclusive.
    # The `+ 1` matters: `range` excludes its stop value, so without it the patient's
    # final year is dropped and a patient active in a single year gets an empty timeline.
    pt_years_lsrange = pandas.DataFrame(
        data = { 'year' : list( range( int(pt_years.min()), int(pt_years.max()) + 1 ) ) }
        )
    # Create a timeline of years and quarters for this particular patient.
    pt_timeline = pt_years_lsrange.merge(pt_quarters, how = 'cross')

    # Join the patient's actual count of appointments-per-quarter-per-year to their timeline.
    # Quarters with no matching row get a count of 0.
    pt_timeline_appts = \
        pandas.merge(pt_timeline, pt_appts, how = 'left',
                     left_on = ['year', 'qtr'],
                     right_on = ['year_appointment',
                                 'quarter_appointment']).loc[:,'countAppointmentsPerQuarter'].fillna(0).astype(int)

    # Repeat for did-not-attend events.
    pt_timeline_DNAs = \
        pandas.merge(pt_timeline, pt_DNAs, how = 'left',
                     left_on = ['year', 'qtr'],
                     right_on = ['year_DNA',
                                 'quarter_DNA']).loc[:,'countDNAsPerQuarter'].fillna(0).astype(int)

    # Increment cap counter if any quarter, in either series, exceeds the cap.
    if ((pt_timeline_DNAs > MAX_EVENTS_PER_QUARTER) | (pt_timeline_appts > MAX_EVENTS_PER_QUARTER)).any():
        cap_counter += 1

    # Cap the count of events and hand plain lists to the entropy function.
    pt_timeline_appts = pt_timeline_appts.clip(upper = MAX_EVENTS_PER_QUARTER).tolist()
    pt_timeline_DNAs = pt_timeline_DNAs.clip(upper = MAX_EVENTS_PER_QUARTER).tolist()

    # Create the entropy-based feature sets.
    pt_entropyStats_appts = chaoticlifeentropyfs(pt_timeline_appts)
    pt_entropyStats_DNAs = chaoticlifeentropyfs(pt_timeline_DNAs)

    # Mark pid as processed, just in case things crash and you need to restart.
    pid_processed.append(pid)

    # Store this patient's row: pid, then appointment statistics, then DNA statistics.
    ls_entropyBasedFS.append([pid] + pt_entropyStats_appts + pt_entropyStats_DNAs)

print(f'It took {time.time() - t1} to process.')
ls_entropyBasedFS.sort()

# Print feedback.
print(f'A total of {cap_counter} patients had more than {MAX_EVENTS_PER_QUARTER} events in a given quarter, thus requiring capping at {MAX_EVENTS_PER_QUARTER}, for that quarter')

In [11]:
entropyBasedFS = \
    pandas.DataFrame(ls_entropyBasedFS,
                     columns =
                     ['person_id'
                     ,'activeInformationAppts'
                     ,'entropyRateAppts'
                     ,'spectralEntropyAppts'
                     ,'sampleEntropyAppts'
                     ,'eoeAppts'
                     ,'averageEntropyAppts'
                     ,'bubbleEntropyAppts'
                     ,'activeInformationDNAs'
                     ,'entropyRateDNAs'
                     ,'spectralEntropyDNAs'
                     ,'sampleEntropyDNAs'
                     ,'eoeDNAs'
                     ,'averageEntropyDNAs'
                     ,'bubbleEntropyDNAs'
        ])
# Double-check that all exclusions have been applied.
entropyBasedFS.drop(entropyBasedFS[~entropyBasedFS.person_id.isin(caseness_array.person_id)].index, inplace=True)
%store entropyBasedFS

Stored 'entropyBasedFS' (DataFrame)


In [5]:
# Join to `feature_set_array`.
# Any entropy columns left over from a previous run of this cell are dropped first.
# Without this, re-running the cell would add suffixed duplicates (`..._x`, `..._y`)
# instead of refreshing the values. On a first run nothing is dropped.
entropy_cols = [col for col in entropyBasedFS.columns if col != 'person_id']
feature_set_array = (
    feature_set_array
    .drop(columns = entropy_cols, errors = 'ignore')
    .merge(entropyBasedFS, on = 'person_id', how = 'left')
)

##### Calculating entropy statistics, using multiprocessing

## Store of feature_set_array.

In [7]:
# Persist the updated `feature_set_array` so that it can be restored with `%store -r`.
%store feature_set_array

Stored 'feature_set_array' (DataFrame)
