<img src="https://raw.githubusercontent.com/AmsterdamUMC/AmsterdamUMCdb/master/img/logo_amds.png" alt="Logo" style="width: 128px;"/>

# AmsterdamUMCdb Scientific Paper

Manuscript submitted for publication. Copyright &copy; 2003-2020 Amsterdam UMC - Amsterdam Medical Data Science

## Imports

In [1]:
%matplotlib inline
import amsterdamumcdb
import psycopg2
import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib as mpl

import io
from IPython.display import display, HTML, Markdown

## Display settings

In [2]:
#matplotlib settings for image size
#needs to be in a different cell from %matplotlib inline
plt.style.use('seaborn-darkgrid')
plt.rcParams["figure.dpi"] = 288
plt.rcParams["figure.figsize"] = [16, 12]
plt.rcParams["font.size"] = 12

pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = 1000

## Connection settings

In [3]:
#Modify config.ini in the root folder of the repository to change the settings to connect to your postgreSQL database
import configparser
import os
config = configparser.ConfigParser()

if os.path.isfile('../config.ini'):
    config.read('../config.ini')
else:
    config.read('../config.SAMPLE.ini')

#Open a connection to the postgres database:
con = psycopg2.connect(database=config['psycopg2']['database'], 
                       user=config['psycopg2']['username'], password=config['psycopg2']['password'], 
                       host=config['psycopg2']['host'], port=config['psycopg2']['port'])
con.set_client_encoding('WIN1252') #Uses code page for Dutch accented characters.
con.set_session(autocommit=True)

cursor = con.cursor()
cursor.execute('SET SCHEMA \'amsterdamumcdb\''); #set search_path to amsterdamumcdb schema

## Table 2. Assessment of re-identification risk 

## Load the admissions table
Used in calculation prevalence and other estimates

In [4]:
admissions = pd.read_sql('SELECT * FROM admissions', con)
admissions.head()

Unnamed: 0,patientid,admissionid,admissioncount,location,urgency,origin,admittedat,admissionyeargroup,dischargedat,lengthofstay,destination,gender,agegroup,dateofdeath,weightgroup,weightsource,heightgroup,heightsource,specialty
0,0,0,1,IC,0,,0,2003-2009,148800000,42,16,Vrouw,80+,,60-69,Anamnestisch,160-169,Anamnestisch,Cardiochirurgie
1,1,1,1,IC,0,,0,2010-2016,96120000,26,15,Man,60-69,,70-79,Anamnestisch,170-179,Anamnestisch,Cardiochirurgie
2,2,2,1,IC,1,Eerste Hulp afdeling zelfde ziekenhuis,0,2010-2016,84240000,23,15,Man,60-69,,90-99,Anamnestisch,180-189,Anamnestisch,Cardiochirurgie
3,3,3,1,IC,0,,0,2003-2009,84900000,23,14,Man,50-59,,90-99,,180-189,Gemeten,Cardiochirurgie
4,4,4,1,IC&MC,0,Verpleegafdeling zelfde ziekenhuis,0,2010-2016,180900000,50,19,Man,70-79,,70-79,Anamnestisch,170-179,Anamnestisch,Cardiochirurgie


# <i>k</i>-Anonymity

## Researcher with presumed background knowledge:
- age (agegroup)
- admission year (admissionyeargroup)
- gender
- survival at discharge (destination/alive)
- weight (weightgroup)
- height (heightgroup)

### Determine equivalence classes, *k*-anonymity and *l*-diversity based on diagnoses in listitems
Re-identification P(re_id) is calculated as 1/*k*. For calculating *l*-diversity with groups containing `null` values or `diagnose anders` (no real diagnoses documented) the following is assumed for calculation:
- Groups with only `null` values: *l*-diversity = `null`
- Groups with at least one 'real' diagnosis, count `null` values as one group

In [5]:
sql_risk_researcher_ids = """
WITH diagnosis_groups AS (
SELECT admissionid,
        item, 
        value as diagnosis_group,
        CASE
            WHEN itemid = 13110 AND valueid BETWEEN 1 AND 3 THEN 1 --D_Hoofdgroep
            WHEN itemid = 16651 AND valueid BETWEEN 7 AND 9 THEN 1 --DMC_Hoofdgroep
            WHEN itemid = 16997 AND valueid BETWEEN 11 AND 20 THEN 1 --APACHE IV Groepen
            WHEN itemid = 18588 AND valueid BETWEEN 1 AND 7 THEN 1 --Apache II Hoofdgroep
            ELSE 0
        END AS surgical,
        ROW_NUMBER() OVER(PARTITION BY admissionid
        ORDER BY measuredat DESC) AS rownum
    FROM listitems
    WHERE itemid IN (
        --MAIN GROUP - LEVEL 0
        13110, --D_Hoofdgroep
        16651 --DMC_Hoofdgroep, Medium Care
    )
),diagnosis_subgroups AS (
SELECT admissionid,
        item, 
        value as diagnosis_subgroup,
        ROW_NUMBER() OVER(PARTITION BY admissionid
        ORDER BY measuredat DESC) AS rownum
    FROM listitems
    WHERE itemid IN (
        --SUB GROUP - LEVEL 1
        13111, --D_Subgroep_Thoraxchirurgie
        16669, --DMC_Subgroep_Thoraxchirurgie
        13112, --D_Subgroep_Algemene chirurgie
        16665, --DMC_Subgroep_Algemene chirurgie
        13113, --D_Subgroep_Neurochirurgie
        16667, --DMC_Subgroep_Neurochirurgie
        13114, --D_Subgroep_Neurologie
        16668, --DMC_Subgroep_Neurologie
        13115, --D_Subgroep_Interne geneeskunde
        16666 --DMC_Subgroep_Interne geneeskunde
    )
), diagnoses AS (
SELECT admissionid,
        item, 
        value as diagnosis,
        ROW_NUMBER() OVER(PARTITION BY admissionid
        ORDER BY measuredat DESC) AS rownum
FROM listitems
WHERE itemid IN (
        -- Diagnosis - LEVEL 2
        --SURGICAL
        13116, --D_Thoraxchirurgie_CABG en Klepchirurgie
        16671, --DMC_Thoraxchirurgie_CABG en Klepchirurgie
        13117, --D_Thoraxchirurgie_Cardio anders
        16672, --DMC_Thoraxchirurgie_Cardio anders
        13118, --D_Thoraxchirurgie_Aorta chirurgie
        16670, --DMC_Thoraxchirurgie_Aorta chirurgie
        13119, --D_Thoraxchirurgie_Pulmonale chirurgie
        16673, --DMC_Thoraxchirurgie_Pulmonale chirurgie
        13141, --D_Algemene chirurgie_Algemeen   
        16642, --DMC_Algemene chirurgie_Algemeen
        13121, --D_Algemene chirurgie_Buikchirurgie
        16643, --DMC_Algemene chirurgie_Buikchirurgie
        13123, --D_Algemene chirurgie_Endocrinologische chirurgie
        16644, --DMC_Algemene chirurgie_Endocrinologische chirurgie
        13145, --D_Algemene chirurgie_KNO/Overige
        16645, --DMC_Algemene chirurgie_KNO/Overige
        13125, --D_Algemene chirurgie_Orthopedische chirurgie
        16646, --DMC_Algemene chirurgie_Orthopedische chirurgie
        13122, --D_Algemene chirurgie_Transplantatie chirurgie
        16647, --DMC_Algemene chirurgie_Transplantatie chirurgie
        13124, --D_Algemene chirurgie_Trauma
        16648, --DMC_Algemene chirurgie_Trauma
        13126, --D_Algemene chirurgie_Urogenitaal
        16649, --DMC_Algemene chirurgie_Urogenitaal
        13120, --D_Algemene chirurgie_Vaatchirurgie
        16650, --DMC_Algemene chirurgie_Vaatchirurgie
        13128, --D_Neurochirurgie _Vasculair chirurgisch
        16661, --DMC_Neurochirurgie _Vasculair chirurgisch
        13129, --D_Neurochirurgie _Tumor chirurgie
        16660, --DMC_Neurochirurgie _Tumor chirurgie
        13130, --D_Neurochirurgie_Overige
        16662, --DMC_Neurochirurgie_Overige

        --MEDICAL
        13133, --D_Interne Geneeskunde_Cardiovasculair
        16653, --DMC_Interne Geneeskunde_Cardiovasculair
        13134, --D_Interne Geneeskunde_Pulmonaal
        16658, --DMC_Interne Geneeskunde_Pulmonaal
        13135, --D_Interne Geneeskunde_Abdominaal
        16652, --DMC_Interne Geneeskunde_Abdominaal
        13136, --D_Interne Geneeskunde_Infectieziekten
        16655, --DMC_Interne Geneeskunde_Infectieziekten
        13137, --D_Interne Geneeskunde_Metabool
        16656, --DMC_Interne Geneeskunde_Metabool
        13138, --D_Interne Geneeskunde_Renaal
        16659, --DMC_Interne Geneeskunde_Renaal
        13139, --D_Interne Geneeskunde_Hematologisch
        16654, --DMC_Interne Geneeskunde_Hematologisch
        13140, --D_Interne Geneeskunde_Overige
        16657, --DMC_Interne Geneeskunde_Overige
        13131, --D_Neurologie_Vasculair neurologisch
        16664, --DMC_Neurologie_Vasculair neurologisch
        13132, --D_Neurologie_Overige
        16663, --DMC_Neurologie_Overige 
        13127 --D_KNO/Overige
        )
),
admissions_cleaned AS (
    SELECT 
        admissionid, 
        patientid,
        lengthofstay,
        agegroup, 
        admissionyeargroup,
        gender,
        CASE
            WHEN destination = 'Overleden' THEN 0
            ELSE 1
        END as alive,
        weightgroup,
        heightgroup
    FROM admissions
),
anonymity_groups AS (
    -- possible diagnosis grouping: diagnosis, diagnosis_subgroup, diagnosis_group, 
    SELECT 
        gender, 
        agegroup, 
        alive,
        admissionyeargroup,
        weightgroup, 
        heightgroup, 
        COUNT(DISTINCT COALESCE(diagnosis,'N/A')) as l_diversity, 
        STRING_AGG(DISTINCT COALESCE(diagnosis,'N/A'), '\n') as l_diversity_diags,
        COUNT(patientid) AS k_anonymity 
    FROM admissions_cleaned
    LEFT JOIN diagnoses ON admissions_cleaned.admissionid = diagnoses.admissionid
    LEFT JOIN diagnosis_subgroups ON admissions_cleaned.admissionid = diagnosis_subgroups.admissionid
    LEFT JOIN diagnosis_groups ON admissions_cleaned.admissionid = diagnosis_groups.admissionid         
    WHERE (diagnoses.rownum = 1 OR diagnoses.rownum IS NULL) AND 
        (diagnosis_subgroups.rownum = 1 OR diagnosis_subgroups.rownum IS NULL) AND
        (diagnosis_groups.rownum = 1 OR diagnosis_groups.rownum IS NULL) --only last updated record
    GROUP BY 
        agegroup, 
        admissionyeargroup,
        gender,
        alive,
        weightgroup, 
        heightgroup
),
anonymity_ids AS (
    SELECT 
        a.*, 
        diagnosis, 
        diagnosis_subgroup, 
        diagnosis_group, 
        CASE 
            WHEN l_diversity = 1 AND 
            (l_diversity_diags = 'N/A' OR l_diversity_diags = 'Diagnose anders') THEN NULL
            ELSE l_diversity
        END AS l_diversity,
        l_diversity_diags,
        k_anonymity,
        1.0/k_anonymity AS risk
    FROM admissions_cleaned a
    LEFT JOIN 
        anonymity_groups k ON 
        a.agegroup = k.agegroup  AND 
        a.admissionyeargroup = k.admissionyeargroup AND
        a.gender = k.gender AND 
        a.alive = k.alive AND 
        (a.weightgroup = k.weightgroup OR (a.weightgroup IS NULL AND k.weightgroup IS NULL)) AND
        (a.heightgroup = k.heightgroup OR (a.heightgroup IS NULL AND k.heightgroup IS NULL))
    LEFT JOIN 
        diagnoses ON a.admissionid = diagnoses.admissionid
    LEFT JOIN 
        diagnosis_subgroups ON a.admissionid = diagnosis_subgroups.admissionid
    LEFT JOIN 
        diagnosis_groups ON a.admissionid = diagnosis_groups.admissionid         
    WHERE 
        (diagnoses.rownum = 1 OR diagnoses.rownum IS NULL) AND 
        (diagnosis_subgroups.rownum = 1 OR diagnosis_subgroups.rownum IS NULL) AND
        (diagnosis_groups.rownum = 1 OR diagnosis_groups.rownum IS NULL) --only last updated record
)
SELECT *
FROM anonymity_ids
ORDER BY 
    l_diversity, 
    k_anonymity DESC, 
    agegroup, 
    admissionyeargroup, 
    gender, 
    alive, 
    weightgroup, 
    heightgroup, 
    patientid,
    admissionid;
"""
risk_researcher_ids = pd.read_sql(sql_risk_researcher_ids,con)
risk_researcher_ids.head()

Unnamed: 0,admissionid,patientid,lengthofstay,agegroup,admissionyeargroup,gender,alive,weightgroup,heightgroup,diagnosis,diagnosis_subgroup,diagnosis_group,l_diversity,l_diversity_diags,k_anonymity,risk
0,2611,2275,299,50-59,2003-2009,Man,1,80-89,,,,,2.0,ASD\nN/A,18,0.055556
1,3517,3054,22,50-59,2003-2009,Man,1,80-89,,,,,2.0,ASD\nN/A,18,0.055556
2,4228,3672,22,50-59,2003-2009,Man,1,80-89,,,,,2.0,ASD\nN/A,18,0.055556
3,4755,4137,19,50-59,2003-2009,Man,1,80-89,,,,,2.0,ASD\nN/A,18,0.055556
4,5188,4505,36,50-59,2003-2009,Man,1,80-89,,,,,2.0,ASD\nN/A,18,0.055556


## Friendly researcher
### P(access)
Access to the database is freely available for genuine researchers

In [6]:
p_access_friendly_researcher = 1.0
p_access_friendly_researcher

1.0

### P(acquaintance)
Inadvertent recognition of acquaintance of the friendly researcher. Uses the Dunbar estimate (150) for the average number of people somebody knows.

$P_{acquaintance} = 1 - (1 - \rho)^{150}$, where $\rho$ is the prevalence of ICU admissions in the population

In [7]:
admissions['patientid'].nunique()

20109

Conservative estimate of $\rho$ (prevalence of ICU admissions in the population):

$\rho = \frac{\text{number of patients in db}}{\text{number of adult patients in the Netherlands}}$


In [8]:
n_db_patients = admissions['patientid'].nunique()
n_citizens_18yo = 13.7e6 #Dutch Citizens > 18 years: 13.7 million (Bron: CBS)
p_prevalence = n_db_patients/n_citizens_18yo


p_acquaintance_friendly_researcher = 1 - (1 - p_prevalence)**150
p_acquaintance_friendly_researcher

0.19774861784974562

### P(re-id) - re-identification risk based on dataset and background knowledge
Using strict average risk

In [9]:
p_re_id_friendly_researcher_avg = risk_researcher_ids['risk'].mean()
p_re_id_friendly_researcher_avg

0.04661126980005158

In [10]:
k_friendly_researcher_avg = risk_researcher_ids['k_anonymity'].mean()
k_friendly_researcher_avg

88.62901410888946

In [11]:
l_friendly_researcher_avg = risk_researcher_ids['l_diversity'].mean()
l_friendly_researcher_avg

26.120404949381328

In [12]:
p_re_id_friendly_researcher_max = risk_researcher_ids['risk'].max()
p_re_id_friendly_researcher_max

0.5

In [13]:
k_friendly_researcher_min =  risk_researcher_ids['k_anonymity'].min()
k_friendly_researcher_min

2

In [14]:
#checks l-diversity but ignores groups without any real diagnoses
l_friendly_researcher_min = risk_researcher_ids['l_diversity'].min()
l_friendly_researcher_min

2.0

### P(final risk)
Risk of re-identification given access to the database, knowing any patient and the strict average risk of identification in the dataset.

In [15]:
p_final_risk_friendly_researcher = p_access_friendly_researcher * p_acquaintance_friendly_researcher * p_re_id_friendly_researcher_avg
p_final_risk_friendly_researcher

0.009217314179181788

## Alternate risk calculation
Based on estimation of P(acquaintance) using citizens in Noord-Holland province

A worst case scenario estimate of $\rho$ (prevalence of ICU admissions in the population):

$\rho = \frac{\text{number of patients in db}}{\text{number of adult patients in Noord-Holland province}}$

In [16]:
n_db_patients = admissions['patientid'].nunique()
n_citizens_province_NH = 2.26e6 #Dutch Citizens > 20 years in Noord-Holland province: 2.4 million (Bron: CBS)
p_prevalence = n_db_patients/n_citizens_province_NH


p_acquaintance_friendly_researcher_province = 1 - (1 - p_prevalence)**150
p_acquaintance_friendly_researcher_province

0.7383222246822686

### Alternate: P(final risk)
Worst case P(final risk) for *friendly researcher*: <0.10

In [17]:
p_final_risk_friendly_researcher_province = p_access_friendly_researcher * p_acquaintance_friendly_researcher_province * p_re_id_friendly_researcher_avg
p_final_risk_friendly_researcher_province

0.03441413641403952

## Rogue researcher
### P(access)
Access to the database is freely available for genuine researchers

In [18]:
p_access_rogue_researcher = 1.0
p_access_rogue_researcher

1.0

### P(intention)
Intentionally re-identifying patients, even though there is data agreement in place. Low risk: 0.01-0.10.

In [19]:
p_intention_rogue_researcher = 0.10 #conservative estimate
p_intention_rogue_researcher

0.1

### P(re-id) - re-identification risk based on dataset and background knowledge
Using strict average risk

In [20]:
p_re_id_rogue_researcher_avg = risk_researcher_ids['risk'].mean()
p_re_id_rogue_researcher_avg

0.04661126980005158

In [21]:
k_rogue_researcher_avg = risk_researcher_ids['k_anonymity'].mean()
k_rogue_researcher_avg

88.62901410888946

In [22]:
l_rogue_researcher_avg = risk_researcher_ids['l_diversity'].mean()
l_rogue_researcher_avg

26.120404949381328

In [23]:
p_re_id_rogue_researcher_max = risk_researcher_ids['risk'].max()
p_re_id_rogue_researcher_max

0.5

In [24]:
k_rogue_researcher_min = risk_researcher_ids['k_anonymity'].min()
k_rogue_researcher_min

2

In [25]:
#checks l-diversity but ignores groups without any real diagnoses
l_rogue_researcher_min =  risk_researcher_ids['l_diversity'].min()
l_rogue_researcher_min

2.0

### P(final risk)
Risk of re-identification given access to the database, ignoring data agreements and using the strict average risk of identification any patient in the dataset.

In [26]:
p_final_risk_rogue_researcher = p_access_rogue_researcher * p_intention_rogue_researcher * p_re_id_rogue_researcher_max
p_final_risk_rogue_researcher

0.05

## Rogue insurance company with assumed background knowledge:
Exclusively for ICU patients (billing)
- age
- gender
- admission year
- alive at discharge
- admissioncount

### Determine equivalence classes, *k*-anonymity and *l*-diversity based on diagnoses in listitems
Re-identification P(re_id) is calculated as 1/*k*. For calculating *l*-diversity with groups containing `null` values or `diagnose anders` (no real diagnoses documented) the following is assumed for calculation:
- Groups with only `null` values: *l*-diversity = `null`
- Groups with at least one 'real' diagnosis, count `null` values as one group

In [27]:
sql_risk_insurance_ids = """
WITH diagnosis_groups AS (
SELECT admissionid,
        item, 
        value as diagnosis_group,
        CASE
            WHEN itemid = 13110 AND valueid BETWEEN 1 AND 3 THEN 1 --D_Hoofdgroep
            WHEN itemid = 16651 AND valueid BETWEEN 7 AND 9 THEN 1 --DMC_Hoofdgroep
            WHEN itemid = 16997 AND valueid BETWEEN 11 AND 20 THEN 1 --APACHE IV Groepen
            WHEN itemid = 18588 AND valueid BETWEEN 1 AND 7 THEN 1 --Apache II Hoofdgroep
            ELSE 0
        END AS surgical,
        ROW_NUMBER() OVER(PARTITION BY admissionid
        ORDER BY measuredat DESC) AS rownum
    FROM listitems
    WHERE itemid IN (
        --MAIN GROUP - LEVEL 0
        13110, --D_Hoofdgroep
        16651 --DMC_Hoofdgroep, Medium Care
    )
),diagnosis_subgroups AS (
SELECT admissionid,
        item, 
        value as diagnosis_subgroup,
        ROW_NUMBER() OVER(PARTITION BY admissionid
        ORDER BY measuredat DESC) AS rownum
    FROM listitems
    WHERE itemid IN (
        --SUB GROUP - LEVEL 1
        13111, --D_Subgroep_Thoraxchirurgie
        16669, --DMC_Subgroep_Thoraxchirurgie
        13112, --D_Subgroep_Algemene chirurgie
        16665, --DMC_Subgroep_Algemene chirurgie
        13113, --D_Subgroep_Neurochirurgie
        16667, --DMC_Subgroep_Neurochirurgie
        13114, --D_Subgroep_Neurologie
        16668, --DMC_Subgroep_Neurologie
        13115, --D_Subgroep_Interne geneeskunde
        16666 --DMC_Subgroep_Interne geneeskunde
    )
),diagnoses AS (
SELECT admissionid,
        item, 
        value as diagnosis,
        ROW_NUMBER() OVER(PARTITION BY admissionid
        ORDER BY measuredat DESC) AS rownum
FROM listitems
WHERE itemid IN (
        -- Diagnosis - LEVEL 2
        --SURGICAL
        13116, --D_Thoraxchirurgie_CABG en Klepchirurgie
        16671, --DMC_Thoraxchirurgie_CABG en Klepchirurgie
        13117, --D_Thoraxchirurgie_Cardio anders
        16672, --DMC_Thoraxchirurgie_Cardio anders
        13118, --D_Thoraxchirurgie_Aorta chirurgie
        16670, --DMC_Thoraxchirurgie_Aorta chirurgie
        13119, --D_Thoraxchirurgie_Pulmonale chirurgie
        16673, --DMC_Thoraxchirurgie_Pulmonale chirurgie
        
        
        13141, --D_Algemene chirurgie_Algemeen   
        16642, --DMC_Algemene chirurgie_Algemeen
        13121, --D_Algemene chirurgie_Buikchirurgie
        16643, --DMC_Algemene chirurgie_Buikchirurgie
        13123, --D_Algemene chirurgie_Endocrinologische chirurgie
        16644, --DMC_Algemene chirurgie_Endocrinologische chirurgie
        13145, --D_Algemene chirurgie_KNO/Overige
        16645, --DMC_Algemene chirurgie_KNO/Overige
        13125, --D_Algemene chirurgie_Orthopedische chirurgie
        16646, --DMC_Algemene chirurgie_Orthopedische chirurgie
        13122, --D_Algemene chirurgie_Transplantatie chirurgie
        16647, --DMC_Algemene chirurgie_Transplantatie chirurgie
        13124, --D_Algemene chirurgie_Trauma
        16648, --DMC_Algemene chirurgie_Trauma
        13126, --D_Algemene chirurgie_Urogenitaal
        16649, --DMC_Algemene chirurgie_Urogenitaal
        13120, --D_Algemene chirurgie_Vaatchirurgie
        16650, --DMC_Algemene chirurgie_Vaatchirurgie
        13128, --D_Neurochirurgie _Vasculair chirurgisch
        16661, --DMC_Neurochirurgie _Vasculair chirurgisch
        13129, --D_Neurochirurgie _Tumor chirurgie
        16660, --DMC_Neurochirurgie _Tumor chirurgie
        13130, --D_Neurochirurgie_Overige
        16662, --DMC_Neurochirurgie_Overige

        --MEDICAL
        13133, --D_Interne Geneeskunde_Cardiovasculair
        16653, --DMC_Interne Geneeskunde_Cardiovasculair
        13134, --D_Interne Geneeskunde_Pulmonaal
        16658, --DMC_Interne Geneeskunde_Pulmonaal
        13135, --D_Interne Geneeskunde_Abdominaal
        16652, --DMC_Interne Geneeskunde_Abdominaal
        13136, --D_Interne Geneeskunde_Infectieziekten
        16655, --DMC_Interne Geneeskunde_Infectieziekten
        13137, --D_Interne Geneeskunde_Metabool
        16656, --DMC_Interne Geneeskunde_Metabool
        13138, --D_Interne Geneeskunde_Renaal
        16659, --DMC_Interne Geneeskunde_Renaal
        13139, --D_Interne Geneeskunde_Hematologisch
        16654, --DMC_Interne Geneeskunde_Hematologisch
        13140, --D_Interne Geneeskunde_Overige
        16657, --DMC_Interne Geneeskunde_Overige
        13131, --D_Neurologie_Vasculair neurologisch
        16664, --DMC_Neurologie_Vasculair neurologisch
        13132, --D_Neurologie_Overige
        16663, --DMC_Neurologie_Overige 
        13127 --D_KNO/Overige
        )
),
admissions_cleaned AS (
    SELECT
        admissionid,
        patientid,
        lengthofstay,
        agegroup, 
        admissionyeargroup,
        gender,
        CASE 
            WHEN destination = 'Overleden' THEN 0
            ELSE 1
        END as alive,
        ROW_NUMBER() OVER
            (PARTITION BY patientid ORDER BY admittedat) AS icu_admissioncount
        FROM admissions
        WHERE location LIKE '%IC%'
),
anonymity_groups AS (
    -- possible diagnosis grouping: diagnosis, diagnosis_subgroup, diagnosis_group, 
    SELECT 
        gender, 
        agegroup, 
        alive,
        admissionyeargroup,
        icu_admissioncount,
        COUNT(DISTINCT COALESCE(diagnosis,'N/A')) AS l_diversity, 
        STRING_AGG(DISTINCT COALESCE(diagnosis,'N/A'), '\n') AS l_diversity_diags,
        COUNT(patientid) AS k_anonymity 
    FROM admissions_cleaned
    LEFT JOIN diagnoses ON 
        admissions_cleaned.admissionid = diagnoses.admissionid
    LEFT JOIN diagnosis_subgroups ON
        admissions_cleaned.admissionid = diagnosis_subgroups.admissionid
    LEFT JOIN diagnosis_groups ON 
        admissions_cleaned.admissionid = diagnosis_groups.admissionid         
    WHERE (diagnoses.rownum = 1 OR diagnoses.rownum IS NULL) AND 
        (diagnosis_subgroups.rownum = 1 OR diagnosis_subgroups.rownum IS NULL) AND
        (diagnosis_groups.rownum = 1 OR diagnosis_groups.rownum IS NULL) --only last updated record
    GROUP BY 
        gender,
        agegroup, 
        alive,
        admissionyeargroup,
        icu_admissioncount  
),
anonymity_ids AS (
    SELECT 
        a.*, 
        diagnosis, 
        diagnosis_subgroup, 
        diagnosis_group, 
        CASE 
            WHEN l_diversity = 1 AND 
            (l_diversity_diags = 'N/A' OR l_diversity_diags = 'Diagnose anders') THEN NULL
            ELSE l_diversity
        END AS l_diversity,
        l_diversity_diags,
        k_anonymity,
        1.0/k_anonymity AS risk
    FROM admissions_cleaned a
    LEFT JOIN anonymity_groups k ON 
        a.agegroup = k.agegroup  AND 
        a.admissionyeargroup = k.admissionyeargroup AND
        a.gender = k.gender AND 
        a.alive = k.alive AND 
        a.icu_admissioncount = k.icu_admissioncount
    LEFT JOIN diagnoses ON 
        a.admissionid = diagnoses.admissionid
    LEFT JOIN diagnosis_subgroups ON 
        a.admissionid = diagnosis_subgroups.admissionid
    LEFT JOIN diagnosis_groups ON 
        a.admissionid = diagnosis_groups.admissionid         
    WHERE 
        (diagnoses.rownum = 1 OR diagnoses.rownum IS NULL) AND 
        (diagnosis_subgroups.rownum = 1 OR diagnosis_subgroups.rownum IS NULL) AND
        (diagnosis_groups.rownum = 1 OR diagnosis_groups.rownum IS NULL) --only last updated record
)
SELECT *
FROM anonymity_ids
ORDER BY 
    l_diversity, 
    k_anonymity DESC, 
    agegroup,
    admissionyeargroup,
    gender, 
    alive,
    icu_admissioncount,
    patientid,
    admissionid;
"""
risk_insurance_ids = pd.read_sql(sql_risk_insurance_ids,con)
risk_insurance_ids.head()

Unnamed: 0,admissionid,patientid,lengthofstay,agegroup,admissionyeargroup,gender,alive,icu_admissioncount,diagnosis,diagnosis_subgroup,diagnosis_group,l_diversity,l_diversity_diags,k_anonymity,risk
0,12373,10685,21,40-49,2010-2016,Man,0,2,Diagnose anders,Overige,Interne geneeskunde,2.0,Diagnose anders\nNa reanimatie,3,0.333333
1,21678,18713,587,40-49,2010-2016,Man,0,2,Diagnose anders,Hematologisch,Interne geneeskunde,2.0,Diagnose anders\nNa reanimatie,3,0.333333
2,22647,19559,72,40-49,2010-2016,Man,0,2,Na reanimatie,Cardiovasculair,Interne geneeskunde,2.0,Diagnose anders\nNa reanimatie,3,0.333333
3,5221,4526,83,50-59,2003-2009,Man,0,3,Diagnose anders,Overige,Neurochirurgie,2.0,Decompensatio cordis\nDiagnose anders,3,0.333333
4,17316,14946,124,50-59,2003-2009,Man,0,3,Decompensatio cordis,Cardiovasculair,Interne geneeskunde,2.0,Decompensatio cordis\nDiagnose anders,3,0.333333


### P(access)
Only after a data breach insurance companies might be able to access the data. Based on historical data this appears to be 
0.27.

In [28]:
p_access_rogue_insurance = 0.27
p_access_rogue_insurance

0.27

### P(intention)
Based on conservative assumptions: 0.10

In [29]:
p_intention_rogue_insurance = 0.10
p_intention_rogue_insurance

0.1

## P(re-id) - re-identification risk based on dataset and background knowledge
Using strict average risk

In [30]:
p_re_id_rogue_insurance_avg = risk_insurance_ids['risk'].mean()
p_re_id_rogue_insurance_avg

0.009028608724030013

In [31]:
k_rogue_insurance_avg = risk_insurance_ids['k_anonymity'].mean()
k_rogue_insurance_avg

682.3492875013598

In [32]:
l_rogue_insurance_avg = risk_insurance_ids['l_diversity'].mean()
l_rogue_insurance_avg

65.11981053218166

In [33]:
p_re_id_rogue_insurance_max = risk_insurance_ids['risk'].max()
p_re_id_rogue_insurance_max

0.5

In [34]:
k_rogue_insurance_min = risk_insurance_ids['k_anonymity'].min()
k_rogue_insurance_min

2

In [35]:
l_rogue_insurance_min = risk_insurance_ids['l_diversity'].min()
l_rogue_insurance_min

2.0

## P(final risk)
Risk of re-identification given data breach of the database, rogue employee using the company database and the strict average risk of identification in the dataset.

In [36]:
p_final_risk_rogue_insurance = p_access_rogue_insurance * p_intention_rogue_insurance * p_re_id_rogue_insurance_avg
p_final_risk_rogue_insurance

0.00024377243554881038

In [37]:
display(HTML("""
<table style="border-spacing: 0.5em; border-collapse: separate">
    <thead style="border-spacing: 1em; border-collapse: separate">
        <tr style="border-spacing: 1em;">
            <th style="text-align:center; border: solid 0; border-bottom-width:2px;" colspan=3>Chance or risk</th>                        
            <th style="text-align:center; border: solid 0; border-bottom-width:2px;" colspan=3>Average risk based</th>
            <th style="text-align:center; border: solid 0; border-bottom-width:2px;" colspan=3>Maximum risk based</th>
        </tr>
        <tr>
            <th style="text-align:left">Adversary</th>
            <th>P(access)</th>
            <th>P(intention)</th>
            <th>P(re-id)</th>
            <th><i>k</i>-anonymity</th>
            <th><i>l</i>-diversity</th>
            <th>P(re-id)</th>
            <th><i>k</i>-anonymity</th>
            <th><i>l</i>-diversity</th>
            <th>P(final risk)</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td style="text-align:left">Friendly Researcher</td>
            <td>{p_access_friendly_researcher:.2f}</td>
            <td>{p_acquaintance_friendly_researcher:.2f} <sup>*</sup>
            <td>{p_re_id_friendly_researcher_avg:.3f}
            <td>{k_friendly_researcher_avg:.0f}
            <td>{l_friendly_researcher_avg:.0f}
            <td>{p_re_id_friendly_researcher_max:.2f}
            <td>{k_friendly_researcher_min:.0f}
            <td>{l_friendly_researcher_min:.0f}
            <td>{p_final_risk_friendly_researcher:.2f} <sup>&dagger;</sup>
        </tr>
        <tr>
            <td style="text-align:left">Rogue Researcher
            <td>{p_access_rogue_researcher:.2f}
            <td>{p_intention_rogue_researcher:.2f}
            <td>{p_re_id_rogue_researcher_avg:.3f} 
            <td>{k_rogue_researcher_avg:.0f}
            <td>{l_rogue_researcher_avg:.0f}
            <td>{p_re_id_rogue_researcher_max:.2f}
            <td>{k_rogue_researcher_min:.0f}
            <td>{l_rogue_researcher_min:.0f}
            <td>{p_final_risk_rogue_researcher:.2f} <sup>&Dagger;</sup>
        </tr>
        <tr>
            <td style="text-align:left">Rogue Insurance Company
            <td>{p_access_rogue_insurance:.2f}
            <td>{p_intention_rogue_insurance:.2f}
            <td>{p_re_id_rogue_insurance_avg:.3f}
            <td>{k_rogue_insurance_avg:.0f}
            <td>{l_rogue_insurance_avg:.0f}       
            <td>{p_re_id_rogue_insurance_max:.2f} 
            <td>{k_rogue_insurance_min:.0f}
            <td>{l_rogue_insurance_min:.0f}
            <td>{p_final_risk_rogue_insurance:.4f} <sup>&dagger;</sup>
        </tr>
    </tbody>
</table>

* acquaintance risk, the risk of knowing somebody in the database. &dagger; using strict average risk. &Dagger; using maximum risk. 
""".format(
            p_access_friendly_researcher = p_access_friendly_researcher,
            p_acquaintance_friendly_researcher = p_acquaintance_friendly_researcher,
            p_re_id_friendly_researcher_avg = p_re_id_friendly_researcher_avg,
            k_friendly_researcher_avg = k_friendly_researcher_avg,
            l_friendly_researcher_avg = l_friendly_researcher_avg,
            p_re_id_friendly_researcher_max = p_re_id_friendly_researcher_max,
            k_friendly_researcher_min = k_friendly_researcher_min,
            l_friendly_researcher_min = l_friendly_researcher_min,
            p_final_risk_friendly_researcher = p_final_risk_friendly_researcher,

            p_access_rogue_researcher = p_access_rogue_researcher,
            p_intention_rogue_researcher = p_intention_rogue_researcher,
            p_re_id_rogue_researcher_avg = p_re_id_rogue_researcher_avg,
            k_rogue_researcher_avg = k_rogue_researcher_avg,
            l_rogue_researcher_avg = l_rogue_researcher_avg,
            p_re_id_rogue_researcher_max = p_re_id_rogue_researcher_max,
            k_rogue_researcher_min = k_rogue_researcher_min,
            l_rogue_researcher_min = l_rogue_researcher_min,
            p_final_risk_rogue_researcher = p_final_risk_rogue_researcher,

            p_access_rogue_insurance = p_access_rogue_insurance,
            p_intention_rogue_insurance = p_intention_rogue_insurance,
            p_re_id_rogue_insurance_avg = p_re_id_rogue_insurance_avg,
            k_rogue_insurance_avg = k_rogue_insurance_avg,
            l_rogue_insurance_avg = l_rogue_insurance_avg,
            p_re_id_rogue_insurance_max = p_re_id_rogue_insurance_max,
            k_rogue_insurance_min = k_rogue_insurance_min,
            l_rogue_insurance_min = l_rogue_insurance_min,
            p_final_risk_rogue_insurance = p_final_risk_rogue_insurance
        )
    )
)

Chance or risk,Chance or risk,Chance or risk,Average risk based,Average risk based,Average risk based,Maximum risk based,Maximum risk based,Maximum risk based,Unnamed: 9_level_0
Adversary,P(access),P(intention),P(re-id),k-anonymity,l-diversity,P(re-id),k-anonymity,l-diversity,P(final risk)
Friendly Researcher,1.0,0.20 *,0.047,89,26,0.5,2,2,0.01 †
Rogue Researcher,1.0,0.10,0.047,89,26,0.5,2,2,0.05 ‡
Rogue Insurance Company,0.27,0.10,0.009,682,65,0.5,2,2,0.0002 †
