In [None]:
import pandas as pd
import hashlib
import random as rnd

# Create a dummy dataset for testing
The dataset contains 1 million rows and 2 columns:

- Column 0 (`id`) is a 7-digit numerical identifier
- Column 1 (`gender`) is a binary string variable (`m` or `f`)

In [None]:
# Build the synthetic test frame: `n` random 7-digit ids, each with a random 'm'/'f' label.
n = 1000000
gen = ['m', 'f']

# A dedicated, seeded generator makes a fresh-kernel run reproduce the same dummy data
# without touching the global state of the `random` module.
rng = rnd.Random(42)

# randrange() excludes its upper bound, so 10000000 is required for 9999999 to be drawable.
id_list = [rng.randrange(1000000, 10000000) for _ in range(n)]
gen_list = rng.choices(gen, k=n)

d_dict = {'id': id_list, 'gender': gen_list}
df = pd.DataFrame(d_dict)
print(df.head())
print('Dummy dataset created')

## Print the number of unique identifiers in the dummy dataset

In [None]:
# Collect the distinct identifier values and report how many there are.
uni_id = pd.unique(df['id'])
print('The number of unique identifiers in the dataset is ' + str(uni_id.size))

## Remove any rows containing duplicate identifiers from the dataset

In [None]:
# Keep only the first row seen for each identifier, as an independent deep copy of `df`.
is_repeat = df.duplicated(subset=['id'], keep='first')
df_uni = df.loc[~is_repeat].copy(deep=True)
print('Duplicate identifiers have been removed from the dataset')

## Convert the values in the id column to string objects

In [None]:
# The hashing helpers call .encode() on each identifier, so the column must hold strings.
id_as_text = df_uni['id'].astype(str)
df_uni['id'] = id_as_text
print('The contents of the id column has been converted to string objects')

# Function for pseudonymisation using double hashing
This function will pseudonymise unique identifiers in a dataset to provide a master list
The process first converts each identifier to UTF-8 bytes, then hashes each identifier using SHA3-512. The resulting value is then reduced modulo 10^n (i.e. only its last n decimal digits are kept), where n is the number of digits to return; n should be less than 155, because a 512-bit digest has at most 155 decimal digits. The smaller the number of digits returned, the greater the likelihood of duplicate values, which will increase processing time.
This process is then repeated, returning a number of digits equal to or, if specified, greater than the number used in the first hashing process.
Where duplicates are produced, the hashing process will be repeated until no duplicates remain.
Processing time will depend on the number of identifiers being pseudonymised and the number of digits being returned. To reduce processing time, the more identifiers being pseudonymised, the greater the number of digits returned should be.
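This method of double hashing is intended to be GDPR compliant and non-reversible. Note that the hashes are unsalted, so identifiers drawn from a small range (such as 7-digit numbers) can be recovered by hashing every candidate value; confirm that this meets your compliance requirements before relying on it.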

### Encode ID using SHA-3 hashing

In [None]:
def encode(df, i, org_col, n):
    """Hash one identifier with SHA3-512 and keep its last ``n`` decimal digits.

    Args:
        df: DataFrame holding the identifiers.
        i: Zero-based row position (not index label) of the identifier to hash.
        org_col: Name of the column containing the identifier.
        n: Number of trailing decimal digits of the digest to keep.

    Returns:
        int: The hex digest read as an integer, modulo ``10 ** n``. Because the
        result is an integer, leading zeros are not preserved.
    """
    # Read strictly by position. Going through the label (df.loc[df.index[i], ...])
    # yields several rows when index labels repeat, and those cannot be encoded.
    value = df[org_col].iloc[i]
    # str() leaves text unchanged and lets non-string identifiers be hashed too.
    digest = hashlib.sha3_512(str(value).encode("utf-8")).hexdigest()
    return int(digest, 16) % (10 ** n)

In [None]:
def reencode(df, i, org_col, n):
    """Hash one identifier with SHA3-256 and keep its last ``n`` decimal digits.

    Same contract as a SHA3-512 based encoder but with a different hash
    function, so it yields a different value for the same input.

    Args:
        df: DataFrame holding the identifiers.
        i: Zero-based row position (not index label) of the identifier to hash.
        org_col: Name of the column containing the identifier.
        n: Number of trailing decimal digits of the digest to keep.

    Returns:
        int: The hex digest read as an integer, modulo ``10 ** n``. Because the
        result is an integer, leading zeros are not preserved.
    """
    # Read strictly by position. Going through the label (df.loc[df.index[i], ...])
    # yields several rows when index labels repeat, and those cannot be encoded.
    value = df[org_col].iloc[i]
    # str() leaves text unchanged and lets non-string identifiers be hashed too.
    digest = hashlib.sha3_256(str(value).encode("utf-8")).hexdigest()
    return int(digest, 16) % (10 ** n)

### Iterate over IDs, run the encode function, collect the results in a list and add them to the dataframe as a new column

In [None]:
def hash_process(df, org_col, n, new_col):
    """Hash every row of ``org_col`` and store the results in ``new_col``.

    Args:
        df: DataFrame to process; it is modified in place.
        org_col: Name of the column whose values are hashed.
        n: Number of digits requested from ``encode`` for each value.
        new_col: Name of the column that receives the hashed values.

    Returns:
        The same DataFrame object, now carrying ``new_col``.
    """
    # One encode() call per row position, in row order.
    df[new_col] = [encode(df, row_pos, org_col, n) for row_pos in range(len(df))]
    return df

### Re-encode a duplicate and replace inplace in dataframe

In [None]:
def reencode_duplicate(df, i, org_col, tar_col, n):
    """Re-hash the identifier at row position ``i`` and overwrite its ``tar_col`` value.

    Args:
        df: DataFrame to update; it is modified in place.
        i: Zero-based row position (not index label) of the row to re-hash.
        org_col: Name of the column holding the value that is hashed.
        tar_col: Name of the column whose value is replaced by the new hash.
        n: Number of digits requested from ``reencode``.

    Returns:
        The same DataFrame object with the single cell replaced.
    """
    new_hash = reencode(df, i, org_col, n)
    # Write by position so the row that was hashed is the row that is updated.
    # A label-based write (df.at[i, ...]) targets a different row, or appends a new
    # one, whenever the index is not the default 0..len-1 range.
    df.iat[i, df.columns.get_loc(tar_col)] = new_hash
    return df

### Find indexes of duplicate encoded IDs

In [None]:
def find_duplicates(df, col):
    """Flag rows whose value in ``col`` repeats that of an earlier row.

    Args:
        df: DataFrame to inspect.
        col: Name of the column checked for repeated values.

    Returns:
        tuple: ``(mask, count)`` where ``mask`` is a boolean Series that is False
        for the first occurrence of each value and True for later repeats, and
        ``count`` is the number of True entries.
    """
    repeat_mask = df.duplicated(subset=[col], keep='first')
    return repeat_mask, repeat_mask.sum()

### Run check for duplicate encoded IDs and re-encode

In [None]:
def duplicate_check(df, org_col, tar_col, n):
    """Re-hash colliding values in ``tar_col`` until the column has no repeats.

    Each pass lowers the digit count by one and re-hashes every row flagged as a
    repeat (the first occurrence of each value is left untouched). Presumably the
    digit count is lowered because re-hashing the same input with the same digit
    count would reproduce the same value.

    Args:
        df: DataFrame to repair; it is modified in place by the helpers.
        org_col: Name of the column holding the values that are hashed.
        tar_col: Name of the column holding the hashes checked for repeats.
        n: Digit count used to produce ``tar_col``; each pass uses one digit fewer.

    Returns:
        The DataFrame with no repeated values left in ``tar_col``.

    Raises:
        ValueError: If repeats remain once the digit count cannot be lowered any
            further (it must stay at least 1). This typically means ``org_col``
            itself contains repeated values or ``n`` was too small.
    """
    dup_ind, dup_count = find_duplicates(df, tar_col)
    while dup_count > 0:
        print(dup_count)
        # With 0 digits every hash collapses to 0, and a negative count turns the
        # modulus into a float, so stop instead of producing meaningless values.
        if n <= 1:
            raise ValueError(
                'Unable to resolve duplicate hashes in column ' + repr(tar_col)
                + ': the digit count cannot be reduced below 1'
            )
        n = n - 1
        # Visit only the flagged rows, by position; the helpers address rows
        # positionally, so index labels must not be used here.
        for i in dup_ind.to_numpy().nonzero()[0]:
            df = reencode_duplicate(df, int(i), org_col, tar_col, n)
        dup_ind, dup_count = find_duplicates(df, tar_col)
    return df

### Convert column to string type

In [None]:
def col_to_string(df, id_col):
    """Convert the values of ``id_col`` to strings.

    Args:
        df: DataFrame to update; it is modified in place.
        id_col: Name of the column to convert.

    Returns:
        The same DataFrame object with ``id_col`` replaced by its string form.
    """
    as_text = df[id_col].astype(str)
    df[id_col] = as_text
    return df

### Run the functions required for double hashing

In [None]:
def double_hash_pseudo(df, id_col, n1, n2=None):
    """Pseudonymise ``id_col`` by hashing it, then hashing the resulting hash.

    Stage one hashes ``id_col`` into the column ``'h_id'`` and resolves any
    repeated hashes. Stage two converts ``'h_id'`` to strings, hashes it into
    ``'h_id_two'`` and resolves repeats again.

    Args:
        df: DataFrame holding the identifiers. The helper functions assign
            columns on the frame they are given, so the caller's frame is
            modified; pass a copy if the original must stay untouched.
        id_col: Name of the identifier column.
        n1: Number of digits kept by the first hash.
        n2: Number of digits kept by the second hash. Defaults to ``n1``; when
            given it must not be smaller than ``n1``.

    Returns:
        The DataFrame with the ``'h_id'`` and ``'h_id_two'`` columns added.

    Raises:
        ValueError: If ``n2`` is smaller than ``n1``.
    """
    h1_col = 'h_id'
    h2_col = 'h_id_two'
    if n2 is None:
        n2 = n1
    elif n2 < n1:
        raise ValueError('n2 must be greater than or equal to n1')

    # Stage one: hash the original identifiers.
    df = col_to_string(df, id_col)
    df_h1 = hash_process(df, id_col, n1, h1_col)
    df_h1_rev = duplicate_check(df_h1, id_col, h1_col, n1)
    print(len(df_h1_rev))
    print('Hash one complete')

    # Stage two: hash the first-stage hashes. Convert the frame that stage one
    # actually returned rather than relying on `df` still being the same object.
    df_h1_rev = col_to_string(df_h1_rev, h1_col)
    df_h2 = hash_process(df_h1_rev, h1_col, n2, h2_col)
    df_h2_rev = duplicate_check(df_h2, h1_col, h2_col, n2)
    return df_h2_rev

### Run the double hash function

In [None]:
# Pseudonymise the de-duplicated frame, not the raw one: with repeated ids the later
# copies would be treated as hash collisions and given a different pseudonym from
# the first copy. reset_index() passes a fresh frame with a 0..len-1 index, which
# the row-position based helpers expect, and leaves df_uni itself unmodified.
df_pseudo = double_hash_pseudo(df_uni.reset_index(drop=True), 'id', 12)
print(len(df_pseudo))

In [None]:
# Show the pseudonymised frame as plain text.
print(df_pseudo)
# Cell output: the smallest second-stage hash value.
df_pseudo.loc[:, 'h_id_two'].min()

### Write the hashed IDs to csv

In [None]:
# Export the whole frame. It still carries the original `id` column next to both
# hash columns, so the file links identifiers to their pseudonyms.
output_path = 'pseudo_out.csv'
df_pseudo.to_csv(output_path)