# Producing pseudonymised IDs using double hashing

This notebook pseudonymises a dataset of synthetic IDs using double hashing with SHA3-512 and SHA3-256 encoding of the IDs.

In [1]:
# Import required packages
import pandas as pd
import hashlib
import random as rnd

## Create a dummy dataset for testing

Produce a synthetic dataset with 1 million rows and 2 columns:
* Column 0 is a numerical identifier of 7 digits
* Column 1 is a binary string variable

In [2]:
n = 1000000
gen = ['m', 'f']
id_list = list()
gen_list = list()
for i in range(0,n):
    ids = rnd.randrange(1000000, 9999999)
    id_list.append(ids)
    gen_choice = rnd.choice(gen)
    gen_list.append(gen_choice)
d_dict = {'id': id_list, 'gender': gen_list}
df = pd.DataFrame(d_dict)
print(df.head())
print('Dummy dataset created')

        id gender
0  7260232      f
1  3316241      m
2  1427566      f
3  4825511      m
4  8715645      f
Dummy dataset created


Print the number of unique identifiers in the dummy dataset

In [3]:
uni_id = df['id'].unique()
print('The number of unique identifiers in the dataset is ' + str(len(uni_id)))

The number of unique identifiers in the dataset is 946507


Remove any rows containing duplicate identifiers from the dataset

In [4]:
df_uni = df.drop_duplicates(subset=['id'], keep = 'first').copy(deep=True)
print('Duplicate identifiers have been removed from the dataset')

Duplicate identifiers have been removed from the dataset


Convert the values in the id column to string objects

In [5]:
df_uni['id'] = df_uni['id'].astype(str)
print('The contents of the id column has been converted to string objects')

The contents of the id column has been converted to string objects


# Define functions for pseudonymisation using double hashing

These functions will pseudonymise unique identifiers in a dataset to provide a master list.
* The process first converts each identifier to uft-8 format then encrypts each identifier using a sha3_512 key.
* This value is then divided by 10^n where n is the number of digits to return which must be less than 512. The smaller the number of digits returned the greater the likelihood of duplicate values which will increase processing time.
* This process is then repeated returning a number of digits equal to or if specified greater than that specified in the first hashing process.
* Where duplicates are produced the hashing process will be completed until no duplicates remain.
* Processing time will depend on the number of identifiers being pseudonymised and the number of digits being returned. To reduce processing time, the more identifiers being returned the greater the number of digits being returned should be.

This method of double hashing is GDPR compliant and non-reversible.

### Encode ID using sha encryption

In [6]:
def encode(df, i, org_col, n):
    '''
    Encode an ID using SHA3_512 encryption.

    Parameters
    ----------
    df : pandas dataframe
        Dataframe contained the ID to be encoded
    i : integer
        Row/Index of the ID to be encoded
    org_col: string
        Name of the column with the ID to be encoded
    n : integer
        Desired length of the new ID

    Returns
    -------
    h_id : string
        Newly encoded ID
    '''
    # Convert ID to UTF-8 format, encrypt using SHA3_512 key, then divide by
    # 10^n where n is the number of digits to return - smaller number means
    # greater likelihood of duplicate values which will increase processing time
    h_id = int(hashlib.sha3_512(
        df.loc[df.index[i], org_col].encode("utf-8")).hexdigest(),16) % (10 ** n)
    return h_id

In [7]:
def reencode(df, i, org_col, n):
    '''
    Encode an ID using SHA3_256 encryption.

    Parameters
    ----------
    df : pandas dataframe
        Dataframe contained the ID to be encoded
    i : integer
        Row/Index of the ID to be encoded
    org_col: string
        Name of the column with the ID to be encoded
    n : integer
        Desired length of the new ID

    Returns
    -------
    h_id : string
        Newly encoded ID
    '''
    # Convert ID to UTF-8 format, encrypt using SHA3_512 key, then divide by
    # 10^n where n is the number of digits to return - smaller number means
    # greater likelihood of duplicate values which will increase processing time
    h_id = int(hashlib.sha3_256(
        df.loc[df.index[i], org_col].encode("utf-8")).hexdigest(),16) % (10 ** n)
    return h_id

### Iterate over ID's, run encode function, append to list and concatenate with dataframe

In [8]:
def hash_process(df, org_col, n, new_col):
    '''
    Create a new column with encoded IDs and add to the dataframe.

    Parameters
    ----------
    df : pandas dataframe
        Dataframe with IDs to be pseudonymised
    org_col : string
        Original column of IDs
    n : integer
        Desired length of the pseudonymised IDs
    new_col : string
        Name of new column that will contain the pseudonymised IDs

    Returns
    -------
    df : pandas dataframe
        Dataframe with new column of pseudonymised IDs added
    '''
    # Create an empty list to store the IDs
    hash_id_list = list()

    # Iterate over the IDs
    for i in range(len(df)):
        # Generate a pseudonymised ID and save to list
        h_id = encode(df, i, org_col, n)
        hash_id_list.append(h_id)

    # Create a new column in the dataframe with the pseudonymised IDs
    df[new_col] = hash_id_list

    return df

### Re-encode a duplicate and replace inplace in dataframe

In [9]:
def reencode_duplicate(df, i, org_col, tar_col, n):
    '''
    Replace an ID in the dataframe with a newly encoded ID.

    Parameters
    ----------
    df : pandas dataframe
        Dataframe containing the ID to be replaced
    i : integer
        Row/index of the ID to be replaced
    org_col : string
        Name of column with the original ID, that was encoded to produce the
        ID in tar_col
    tar_col : string
        Name of column with the ID to be replaced
    n : integer
        Desired length of the pseudonymised ID
    '''
    # Re-encode ID
    new_hash = reencode(df, i, org_col,  n)
    # Replace that ID in the dataframe
    df.at[i, tar_col] = new_hash
    return df

### Find indexes of duplicate encoded ID's

In [10]:
def find_duplicates(df, col):
    '''
    Find index of duplicate encoded IDs.

    Parameters
    ----------
    df : pandas dataframe
        Dataframe with the IDs
    col : string
        Name of the column with the IDs

    Returns
    -------
    dup_ind : boolean series
        Series where True if ID is duplicate
    dup_count : integer
        Number of duplicate IDs in the dataframe
    '''
    # Get indices of duplicate IDs
    dup_ind = df.duplicated(subset=[col])
    # Get count of duplicate IDs
    dup_count = dup_ind.sum()
    return dup_ind, dup_count

### Run check for duplicate encoded ID's and re-encode

In [11]:
def duplicate_check(df, org_col, tar_col, n):
    '''
    Check for duplicate encoded IDs, and rencode (using the original ID). Repeat
    until there are no duplicates remaining.

    Parameters
    ----------
    df : pandas dataframe
        Dataframe with the original and pseudonymised IDs
    org_col : string
        Name of column with the original IDs that were just pseudonymised
    targ_col : string
        Name of column of IDs within which we are checking for duplicates
    n : integer
        Desired length of pseudonymised IDs

    Returns
    -------
    df : pandas dataframe
        Dataframe with duplicate IDs replaced
    '''
    # Find the indices and number of duplicates
    dup_ind, dup_count = find_duplicates(df, tar_col)

    # If there are any present...
    while dup_count > 0:

        # Print count of duplicate indices
        print(dup_count)
        # print(dup_ind)

        # Reduce length of produced ID by 1
        n = n - 1

        # Loop through the indices and re-encode
        for i in range(len(dup_ind)):
            if dup_ind[i] == True:
                df = reencode_duplicate(df, i, org_col, tar_col, n)

        # Get indices and counts of any remaining duplicates
        dup_ind, dup_count = find_duplicates(df, tar_col)

    return df

### Convert column to string type

In [12]:
def col_to_string(df, id_col):
    '''
    Convert column data type to string.

    Parameters
    ----------
    df : pandas dataframe
        Dataframe containing the column to be converted
    id_col : string
        Name of the column to be converted

    Returns
    -------
    df : pandas dataframe
        Dataframe with converted column
    '''
    df[id_col] = df[id_col].astype(str)
    return df

### Run the functions required for double hash encryption

In [13]:
def double_hash_pseudo(df, id_col, n1):
    '''
    Pseudonymise IDs using double-hashing

    Parameters
    ----------
    df : pandas dataframe
        Dataframe with the IDs to be pseudonymised
    id_col : string
        Name of the ID column to be pseudonymised
    n1 : integer
        Desired length of the final pseudonymised IDs

    Returns
    -------
    df_h2_rev : pandas dataframe
        Dataframe with two new columns (in-progress and final set of pseudo IDs)
    '''
    # Set names for the ID columns
    h1_col = 'h_id'
    h2_col = 'h_id_two'

    # Convert original ID column to string
    df = col_to_string(df, id_col)

    # Add a new column with pseudymised IDs to the dataframe
    df_h1 = hash_process(df, id_col, n1, h1_col)
    # print(df_h1.head())
    # print(len(df_h1))

    # Run a duplicate check
    df_h1_rev = duplicate_check(df_h1, id_col, h1_col, n1)

    # Print status update
    print(len(df_h1_rev))
    print('Hash one complete')

    # Repeat process a second time
    df = col_to_string(df, h1_col)
    df_h2 = hash_process(df_h1_rev, h1_col, n1, h2_col)
    df_h2_rev = duplicate_check(df_h2, h1_col, h2_col, n1)

    return df_h2_rev

### Run the double hash function

In [14]:
df_pseudo = double_hash_pseudo(df, 'id', 12)
print(len(df_pseudo))
# print(df_pseudo.head())

53493
2005
253
34
5
1000000
Hash one complete
1000000


In [15]:
print(df_pseudo)
df_pseudo['h_id_two'].min()

             id gender          h_id      h_id_two
0       7260232      f  232241645964  347719229216
1       3316241      m  106410238251  191104966042
2       1427566      f  402590042291   87835444441
3       4825511      m  707826129375  709888269764
4       8715645      f  328794091852  177046366543
...         ...    ...           ...           ...
999995  9709425      m  509706073734  701117790328
999996  7975837      m  613809405124   91982678534
999997  8648401      m  974823974511  259188951056
999998  3590847      m   65858948736  200306710672
999999  1673734      f   18167887387  540251442555

[1000000 rows x 4 columns]


25928

### Write the hashed ID's to csv

In [16]:
df_pseudo.to_csv('pseudo_out.csv')