In [2]:
#install only if you have not installed before
#!pip install pandas
#!pip install numpy
#!pip install matplotlib
#!pip install seaborn

#Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

## K-Anonymity

In [4]:
#Let's start with a simple dataset, read from testdata.csv (path is relative to the working directory)
data = pd.read_csv('testdata.csv')

#Summarise the columns: names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   UID      10 non-null     object
 1   DoB      10 non-null     object
 2   Gender   10 non-null     object
 3   Disease  10 non-null     object
dtypes: object(4)
memory usage: 452.0+ bytes


### Preprocessing

The given DoB format is difficult to group on. How can we change the representation of the DoB? <br>
Also, at this stage you can remove any columns that do not need to be shared.

In [5]:
# Let's calculate the ages and replace the DoB column with the Age
def calculate_age(dataset):
    """Replace the 'DoB' column of ``dataset`` with the age in whole years.

    The frame is modified in place: 'DoB' is parsed as day/month/year,
    converted to the age as of today, and the column is renamed to 'Age'.

    Calling the function again on a frame that was already converted (it has
    an 'Age' column and no 'DoB' column) leaves it untouched, so the notebook
    cell can be re-run without raising a KeyError.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table with a 'DoB' column of strings formatted as '%d/%m/%Y'.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object that was passed in.

    Raises
    ------
    KeyError
        If the frame has neither a 'DoB' nor an 'Age' column.
    ValueError
        If a 'DoB' value does not match the '%d/%m/%Y' format.
    """
    # Already converted on a previous run: nothing left to do.
    if 'DoB' not in dataset.columns and 'Age' in dataset.columns:
        return dataset

    # Parse into a local first so a parsing error leaves the frame unchanged.
    birth_dates = pd.to_datetime(dataset['DoB'], format='%d/%m/%Y')

    today = datetime.today()

    def age_on_today(dob):
        # The tuple comparison is True (counts as 1) when this year's
        # birthday has not happened yet, so one year is subtracted.
        birthday_pending = (today.month, today.day) < (dob.month, dob.day)
        return today.year - dob.year - birthday_pending

    dataset['DoB'] = birth_dates.apply(age_on_today)
    dataset.rename(columns={"DoB": "Age"}, inplace=True)
    return dataset

# Convert DoB to Age; calculate_age modifies the frame it is given
calculate_age(data)

# Display the updated table
data

Unnamed: 0,UID,Age,Gender,Disease
0,dgre6789,24,M,Cancer
1,spea6271,23,M,Infection
2,jgib9592,22,M,AIDS
3,hbar4405,21,F,AIDS
4,mhan6405,23,M,Cancer
5,amcl3591,24,F,Cancer
6,rrit959,21,M,Cancer
7,csow4222,22,F,Infection
8,asin5401,23,F,AIDS
9,pbur8178,23,F,AIDS


In [6]:
# Remove the UID column.
# A new frame is bound to `data` instead of dropping in place, and
# errors='ignore' makes the cell safe to re-run once UID is already gone.
data = data.drop(columns=['UID'], errors='ignore')
data

Unnamed: 0,Age,Gender,Disease
0,24,M,Cancer
1,23,M,Infection
2,22,M,AIDS
3,21,F,AIDS
4,23,M,Cancer
5,24,F,Cancer
6,21,M,Cancer
7,22,F,Infection
8,23,F,AIDS
9,23,F,AIDS


#### Question 1

What are the quasi identifiers in the dataset?

<font color="blue"> Write your answer </font>

#### Question 2

What are the categorical quasi identifiers in the dataset?

<font color="blue"> Write your answer </font>

#### Question 3

What is the sensitive column?

<font color="blue"> Write your answer </font>

### Identify QIDs

In [23]:
#Save the QIDs (quasi-identifier columns); the partitioning and anonymisation functions read this list
qids = ['Age', 'Gender']

### Partition

In [24]:
#Save the categorical QIDs; only QIDs that are NOT listed here are split at the median and turned into ranges
categorical = ['Gender']

In [32]:
#Partition the dataset

#A function to find the column wise variability of the values in a partition
def get_qid_variability(partition):
    """Count the distinct values of every QID inside one partition.

    Reads the notebook-level ``data`` frame and ``qids`` list.

    Parameters
    ----------
    partition : index labels of the rows of ``data`` that form the partition.

    Returns
    -------
    dict
        ``{qid: number_of_distinct_values}`` ordered from the most to the
        least variable QID. QIDs with equal counts keep their order in ``qids``.
    """
    distinct_counts = {
        name: data.loc[partition, name].nunique()  # e.g. {'col': num, 'col2': num}
        for name in qids
    }
    # sorted() is stable, so ties stay in their original relative order.
    most_variable_first = sorted(
        distinct_counts.items(), key=lambda pair: pair[1], reverse=True
    )
    return dict(most_variable_first)
    
    
#split a partition
def split(partition, column):
    """Split one partition in two along a single QID column.

    Reads the notebook-level ``data`` frame and ``categorical`` list.

    Parameters
    ----------
    partition : index labels of the rows of ``data`` that form the partition.
    column : name of the QID column to split on.

    Returns
    -------
    tuple
        ``(lhs, rhs)``. For a numerical column, ``lhs`` holds the labels whose
        value is below the partition median and ``rhs`` the labels whose value
        is at or above it. For a categorical column no split is implemented
        yet, so two empty lists are returned.
    """
    values = data.loc[partition, column]

    if column in categorical:
        # Exercise (not implemented yet):
        #   get a list of the unique values in "values"
        #   split the above list into 2: lhs_mask, rhs_mask
        #   lhs = indexes of the values that match lhs_mask
        #   (and likewise rhs for rhs_mask)
        return [], []

    midpoint = values.median()
    below_median = values.index[values < midpoint]
    at_or_above_median = values.index[values >= midpoint]
    return below_median, at_or_above_median

#partitioning algorithm
def mondrian(k):
    """Greedily partition the rows of ``data`` for k-anonymity.

    Starting from one partition holding every row, each partition is split in
    two as long as both halves keep at least ``k`` rows. The QIDs are tried in
    the order returned by ``get_qid_variability`` and the first QID that gives
    an acceptable split is used. A partition that cannot be split any further
    is final.

    Reads the notebook-level ``data`` frame and uses ``get_qid_variability``
    and ``split``.

    Parameters
    ----------
    k : int
        Minimum number of rows each half of a split must keep. Must be >= 1.

    Returns
    -------
    list
        The final partitions, each an index of row labels of ``data``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1. With such a ``k`` empty halves would be
        accepted and re-queued forever, so the loop would never terminate.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k!r}")

    final_partitions = []
    partitions = [data.index]

    while partitions:
        partition = partitions.pop(0)

        # Fewer than 2k rows can never give two halves of at least k rows each.
        if len(partition) < k * 2:
            final_partitions.append(partition)
            continue

        for column in get_qid_variability(partition):
            lhs, rhs = split(partition, column)

            if len(lhs) >= k and len(rhs) >= k:
                # Valid split: queue both halves for further splitting.
                partitions.extend((lhs, rhs))
                break
        else:
            # No QID produced a valid split, so the partition stays whole.
            final_partitions.append(partition)

    return final_partitions

In [33]:
#Partition the dataset to achieve 2-Anonymity
#final_partitions is read as a global by anonymise() and styled_data() below
final_partitions = mondrian(2)
final_partitions

[Index([0, 1, 4, 5, 8, 9], dtype='int64'),
 Index([3, 6], dtype='int64'),
 Index([2, 7], dtype='int64')]

### Anonymise

In [34]:
#Let's Anonymise the partitions
#In each partition,replace numerical qids with [min value - max value] format
#In each partition,replace categorical qids with value1~value2...
def anonymise():
    """Return a generalised copy of ``data`` with rows grouped by partition.

    Reads the notebook-level ``data``, ``final_partitions``, ``qids`` and
    ``categorical``. ``data`` itself is not modified.

    Every numerical QID is replaced, partition by partition, with the text
    "[min - max]" of that partition's values. Categorical QIDs are left
    unchanged (their generalisation is not implemented yet).

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` reordered so the rows of each partition are adjacent,
        with the numerical QID columns holding range labels.
    """
    # bring partitions closer: row labels listed partition by partition
    new_order = [row for par in final_partitions for row in par]
    anon_copy = data.copy().reindex(new_order)

    for qid in qids:
        if qid in categorical:
            # TODO (exercise): replace categorical qids with value1~value2...
            continue

        # The range labels are strings. Convert the column to object dtype
        # up front instead of relying on pandas silently upcasting a numeric
        # column on assignment, which is deprecated behaviour.
        anon_copy[qid] = anon_copy[qid].astype(object)

        for par in final_partitions:
            # Take min/max from the untouched source frame so the values
            # still have their original numeric dtype.
            values = data.loc[par, qid]
            anon_copy.loc[par, qid] = f"[{values.min()} - {values.max()}]"

    return anon_copy

# Display the anonymised copy of the table
anonymise()

Unnamed: 0,Age,Gender,Disease
0,[23 - 24],M,Cancer
1,[23 - 24],M,Infection
4,[23 - 24],M,Cancer
5,[23 - 24],F,Cancer
8,[23 - 24],F,AIDS
9,[23 - 24],F,AIDS
3,[21 - 21],F,AIDS
6,[21 - 21],M,Cancer
2,[22 - 22],M,AIDS
7,[22 - 22],F,Infection


### Visualise the anonymised table

In [35]:
# Alternate partitions are colored in yellow for your clarity
def styled_data(data_frame):
    """Give the QID cells of every other partition a yellow background.

    Reads the notebook-level ``final_partitions`` and ``qids``. Partitions at
    even positions (0, 2, 4, ...) are highlighted; the others are left plain.

    Parameters
    ----------
    data_frame : frame whose row labels include the labels in ``final_partitions``.

    Returns
    -------
    The object returned by ``data_frame.style.set_properties`` for the
    highlighted cells.
    """
    highlighted_rows = [
        row
        for position, partition in enumerate(final_partitions)
        if position % 2 == 0
        for row in partition.values
    ]
    highlighted_cells = pd.IndexSlice[highlighted_rows, qids]
    return data_frame.style.set_properties(
        subset=highlighted_cells, **{'background-color': 'yellow'}
    )

# Display the anonymised table with alternate partitions highlighted
styled_data(anonymise())

Unnamed: 0,Age,Gender,Disease
0,[23 - 24],M,Cancer
1,[23 - 24],M,Infection
4,[23 - 24],M,Cancer
5,[23 - 24],F,Cancer
8,[23 - 24],F,AIDS
9,[23 - 24],F,AIDS
3,[21 - 21],F,AIDS
6,[21 - 21],M,Cancer
2,[22 - 22],M,AIDS
7,[22 - 22],F,Infection


In [36]:
# Row labels listed partition by partition, so rows of a partition are adjacent
new_order = []
for partition in final_partitions:
    new_order.extend(partition)

# Reorder the table once, after the loop: part_data is then built a single
# time and is defined even when final_partitions is empty.
part_data = data.reindex(new_order)
    
from IPython.display import HTML

def side_by_side(*dfs):
    """Display several tables next to each other in one flex row.

    Parameters
    ----------
    *dfs : objects exposing ``to_html()``; each one is rendered into its own
        block with a right margin, in the order given.
    """
    blocks = [
        f'<div style="margin-right: 2em">{table.to_html()}</div>'
        for table in dfs
    ]
    markup = '<div style="display:flex">' + ''.join(blocks) + '</div>'
    display(HTML(markup))
    
# Left to right: original table, table reordered by partition, anonymised table
side_by_side(data, styled_data(part_data), styled_data(anonymise()))

Unnamed: 0,Age,Gender,Disease
0,24,M,Cancer
1,23,M,Infection
2,22,M,AIDS
3,21,F,AIDS
4,23,M,Cancer
5,24,F,Cancer
6,21,M,Cancer
7,22,F,Infection
8,23,F,AIDS
9,23,F,AIDS

Unnamed: 0,Age,Gender,Disease
0,24,M,Cancer
1,23,M,Infection
4,23,M,Cancer
5,24,F,Cancer
8,23,F,AIDS
9,23,F,AIDS
3,21,F,AIDS
6,21,M,Cancer
2,22,M,AIDS
7,22,F,Infection

Unnamed: 0,Age,Gender,Disease
0,[23 - 24],M,Cancer
1,[23 - 24],M,Infection
4,[23 - 24],M,Cancer
5,[23 - 24],F,Cancer
8,[23 - 24],F,AIDS
9,[23 - 24],F,AIDS
3,[21 - 21],F,AIDS
6,[21 - 21],M,Cancer
2,[22 - 22],M,AIDS
7,[22 - 22],F,Infection


#### Question 4

What are your comments about the privacy guarantees when K is low?

<font color="blue"> Write your answer </font>