In [1]:
import os
import pandas as pd
from hashlib import sha256, blake2b, algorithms_available

# Dataset

In [2]:
url = "https://raw.githubusercontent.com/Admindatosgobes/Laboratorio-de-Datos/main/Data%20Science/Aplicaci%C3%B3n%20pr%C3%A1ctica%20de%20t%C3%A9cnicas%20de%20anonimizaci%C3%B3n/Datos/data.csv"
data = pd.read_csv(url)
data.head()
  

Unnamed: 0,creditcard,zipcode,age,gender,salary
0,5557783527541459,55335,58,Male,8700
1,5418686973265201,55255,36,Female,9700
2,5527060358825468,55559,32,Female,6800
3,5312916958971375,55700,58,Male,4700
4,5541858987662877,55925,52,Male,5700


In [3]:
data['zipcode'] = data['zipcode'].astype(str)
data['creditcard'] = data['creditcard'].astype(str)

# Dataframe

In [4]:
data[:10]


Unnamed: 0,creditcard,zipcode,age,gender,salary
0,5557783527541459,55335,58,Male,8700
1,5418686973265201,55255,36,Female,9700
2,5527060358825468,55559,32,Female,6800
3,5312916958971375,55700,58,Male,4700
4,5541858987662877,55925,52,Male,5700
5,5155271703366251,55338,38,Female,7100
6,5485337334153888,55840,38,Male,6000
7,5293804792403628,55772,32,Female,7000
8,5275938856549264,55641,19,Male,100
9,5303041772852809,55861,82,Male,4000


In [5]:
df = pd.DataFrame(data, columns=['creditcard', 'zipcode','age','gender','salary'])
df_backup = df.copy()
df.head()

Unnamed: 0,creditcard,zipcode,age,gender,salary
0,5557783527541459,55335,58,Male,8700
1,5418686973265201,55255,36,Female,9700
2,5527060358825468,55559,32,Female,6800
3,5312916958971375,55700,58,Male,4700
4,5541858987662877,55925,52,Male,5700


# Comprobar Anonimato-K

In [6]:
#https://programming-dp.com/notebooks/ch2.html

def queryKAnonymized(row):
  return f'zipcode == \'{row.zipcode}\'' \
         f' & gender == \'{row.gender}\'' \
         f' & age == {row.age}' \
         f' & salary == {row.salary}'

def isKAnonymized(df, k, queryFunction = queryKAnonymized):
  for index, row in df.iterrows():
    if df.query(queryFunction(row)).shape[0] < k: return False
  return True

In [7]:
isKAnonymized(df, 1)

True

In [8]:
isKAnonymized(df, 2)

False

In [9]:
def getNotKAnonymized(df, k, queryFunction = queryKAnonymized):
  rowsNotKAnonymized = pd.DataFrame()
  for index, row in df.iterrows():
    group = df.query(queryFunction(row))
    if group.shape[0] < k: 
      rowsNotKAnonymized = pd.concat([rowsNotKAnonymized, group])
  return rowsNotKAnonymized.drop_duplicates()

In [10]:
getNotKAnonymized(df, 1)

In [11]:
getNotKAnonymized(df, 2)

Unnamed: 0,creditcard,zipcode,age,gender,salary
0,5557783527541459,55335,58,Male,8700
1,5418686973265201,55255,36,Female,9700
2,5527060358825468,55559,32,Female,6800
3,5312916958971375,55700,58,Male,4700
4,5541858987662877,55925,52,Male,5700
...,...,...,...,...,...
994,5395287779118434,55640,82,Female,1700
995,5564301173493387,55067,21,Female,7400
996,5193534712511173,55901,23,Female,8100
997,5164269869571382,55601,52,Male,3500


# Generalización por redondeo

## Generalización de números enteros

In [12]:
def generalizeInt(df, column, level):
  return df[column].apply(
    lambda x: round(x / (10**level)) * (10**level)
  )

In [13]:
df.salary = generalizeInt(df, 'salary', 3)

In [14]:
df.age = generalizeInt(df, 'age', 1)

In [15]:
df.head()

Unnamed: 0,creditcard,zipcode,age,gender,salary
0,5557783527541459,55335,60,Male,9000
1,5418686973265201,55255,40,Female,10000
2,5527060358825468,55559,30,Female,7000
3,5312916958971375,55700,60,Male,5000
4,5541858987662877,55925,50,Male,6000


In [16]:
isKAnonymized(df, 2)

False

In [17]:
getNotKAnonymized(df, 2)

Unnamed: 0,creditcard,zipcode,age,gender,salary
0,5557783527541459,55335,60,Male,9000
1,5418686973265201,55255,40,Female,10000
2,5527060358825468,55559,30,Female,7000
3,5312916958971375,55700,60,Male,5000
4,5541858987662877,55925,50,Male,6000
...,...,...,...,...,...
994,5395287779118434,55640,80,Female,2000
995,5564301173493387,55067,20,Female,7000
996,5193534712511173,55901,20,Female,8000
997,5164269869571382,55601,50,Male,4000


## Generalización de códigos alfanuméricos

In [18]:
def generalizeStringCode(df, column, level):
  return df[column].apply(
    lambda x: x[:-level] + ('*' * level)
  )

In [19]:
df.zipcode = generalizeStringCode(df, 'zipcode', 2)

In [20]:
isKAnonymized(df, 2)

False

In [21]:
getNotKAnonymized(df, 2)

Unnamed: 0,creditcard,zipcode,age,gender,salary
0,5557783527541459,553**,60,Male,9000
1,5418686973265201,552**,40,Female,10000
2,5527060358825468,555**,30,Female,7000
4,5541858987662877,559**,50,Male,6000
7,5293804792403628,557**,30,Female,7000
...,...,...,...,...,...
990,5190197578253687,554**,100,Male,4000
993,5342057613343975,556**,50,Male,9000
995,5564301173493387,550**,20,Female,7000
996,5193534712511173,559**,20,Female,8000


# Generalización por agrupación

In [22]:
def applyRules(x, rules):
  for key in rules:
    if (x >= rules[key]['min'] and x <= rules[key]['max']): return key
  return "outlier"

def groupDiscretization(df, column, rules):
  return df[column].apply(
    lambda x: applyRules(x, rules)
  )

## Discretización del salario en 3 grupos

In [23]:
df.salary = df_backup.salary
df.describe()

Unnamed: 0,age,salary
count,999.0,999.0
mean,57.437437,4972.172172
std,24.405555,2941.739971
min,20.0,100.0
25%,40.0,2300.0
50%,60.0,5000.0
75%,80.0,7500.0
max,100.0,10000.0


In [24]:
salaryRules = {
    'low': {'min': 0, 'max': 1500},
    'medium': {'min': 1500, 'max': 3000},
    'high': {'min': 3000, 'max': 10000}
}
df.salary = groupDiscretization(df, 'salary', salaryRules)
df.head()

Unnamed: 0,creditcard,zipcode,age,gender,salary
0,5557783527541459,553**,60,Male,high
1,5418686973265201,552**,40,Female,high
2,5527060358825468,555**,30,Female,high
3,5312916958971375,557**,60,Male,high
4,5541858987662877,559**,50,Male,high


In [25]:
def queryKAnonymized_salaryGrouped(row):
  return f'zipcode == \'{row.zipcode}\'' \
         f' & gender == \'{row.gender}\'' \
         f' & age == {row.age}' \
         f' & salary == \'{row.salary}\''

isKAnonymized(df, 2, queryKAnonymized_salaryGrouped)

False

In [26]:
len(getNotKAnonymized(df, 2, queryKAnonymized_salaryGrouped))

155

## Discretización de edad en 3 grupos

In [27]:
df.age = df_backup.age
df.describe()

Unnamed: 0,age
count,999.0
mean,57.364364
std,24.116729
min,18.0
25%,35.0
50%,57.0
75%,79.0
max,100.0


In [28]:
ageRules = {
    'junior': {'min': 0, 'max': 30},
    'medium': {'min': 30, 'max': 50},
    'senior': {'min': 50, 'max': 101}
}
df.age = groupDiscretization(df, 'age', ageRules)
df.head()

Unnamed: 0,creditcard,zipcode,age,gender,salary
0,5557783527541459,553**,senior,Male,high
1,5418686973265201,552**,medium,Female,high
2,5527060358825468,555**,medium,Female,high
3,5312916958971375,557**,senior,Male,high
4,5541858987662877,559**,senior,Male,high


In [29]:
def queryKAnonymized_salaryAgeGrouped(row):
  return f'zipcode == \'{row.zipcode}\'' \
         f' & gender == \'{row.gender}\'' \
         f' & age == \'{row.age}\'' \
         f' & salary == \'{row.salary}\''

len(getNotKAnonymized(df, 2, queryKAnonymized_salaryAgeGrouped))

30

## Discretización de código postal en 3 grupos

In [30]:
df.zipcode.head()

0    553**
1    552**
2    555**
3    557**
4    559**
Name: zipcode, dtype: object

In [31]:
zipcodeRules = {
    '550**-552**': {'min': '550**', 'max': '552**'},
    '553**-556**': {'min': '553**', 'max': '556**'},
    '557**-559**': {'min': '557**', 'max': '559**'}
}
df.zipcode = groupDiscretization(df, 'zipcode', zipcodeRules)
df.head()

Unnamed: 0,creditcard,zipcode,age,gender,salary
0,5557783527541459,553**-556**,senior,Male,high
1,5418686973265201,550**-552**,medium,Female,high
2,5527060358825468,553**-556**,medium,Female,high
3,5312916958971375,557**-559**,senior,Male,high
4,5541858987662877,557**-559**,senior,Male,high


In [32]:
len(getNotKAnonymized(df, 2, queryKAnonymized_salaryAgeGrouped))

0

In [33]:
getNotKAnonymized(df, 3, queryKAnonymized_salaryAgeGrouped)

Unnamed: 0,creditcard,zipcode,age,gender,salary
233,5148234499441506,550**-552**,medium,Female,low
561,5434614557694547,550**-552**,medium,Female,low
636,5184789207964060,557**-559**,junior,Female,medium
718,5494158926215755,557**-559**,junior,Female,medium


## Discretización de código postal en 2 grupos

In [34]:
df.zipcode = df_backup.zipcode
df.zipcode

0      55335
1      55255
2      55559
3      55700
4      55925
       ...  
994    55640
995    55067
996    55901
997    55601
998    55547
Name: zipcode, Length: 999, dtype: object

In [35]:
zipcodeRulesTwoGroups = {
    '55000-55499': {'min': '55000', 'max': '55499'},
    '55500-55999': {'min': '55500', 'max': '55999'}
}
df.zipcode = groupDiscretization(df, 'zipcode', zipcodeRulesTwoGroups)
df.head()

Unnamed: 0,creditcard,zipcode,age,gender,salary
0,5557783527541459,55000-55499,senior,Male,high
1,5418686973265201,55000-55499,medium,Female,high
2,5527060358825468,55500-55999,medium,Female,high
3,5312916958971375,55500-55999,senior,Male,high
4,5541858987662877,55500-55999,senior,Male,high


In [36]:
len(getNotKAnonymized(df, 5, queryKAnonymized_salaryAgeGrouped))

0

In [37]:
getNotKAnonymized(df, 6, queryKAnonymized_salaryAgeGrouped)

Unnamed: 0,creditcard,zipcode,age,gender,salary
195,5282876440236496,55000-55499,junior,Male,low
709,5338783612672926,55000-55499,junior,Male,low
790,5590375584035431,55000-55499,junior,Male,low
831,5165553732142557,55000-55499,junior,Male,low
926,5467404471699503,55000-55499,junior,Male,low
227,5243495311011488,55000-55499,medium,Female,low
233,5148234499441506,55000-55499,medium,Female,low
405,5342140519111603,55000-55499,medium,Female,low
561,5434614557694547,55000-55499,medium,Female,low
587,5550229459841456,55000-55499,medium,Female,low


# Filtrado de variables

In [38]:
df_nozip = df.copy()
df_nozip.drop(columns=['zipcode'], inplace=True)
df_nozip.head()

Unnamed: 0,creditcard,age,gender,salary
0,5557783527541459,senior,Male,high
1,5418686973265201,medium,Female,high
2,5527060358825468,medium,Female,high
3,5312916958971375,senior,Male,high
4,5541858987662877,senior,Male,high


In [39]:
def queryKAnonymized_noZip(row):
  return f'gender == \'{row.gender}\'' \
         f' & age == \'{row.age}\'' \
         f' & salary == \'{row.salary}\''

len(getNotKAnonymized(df_nozip, 11, queryKAnonymized_noZip))

0

In [40]:
getNotKAnonymized(df_nozip, 12, queryKAnonymized_noZip)

Unnamed: 0,creditcard,age,gender,salary
17,5217036561413487,junior,Female,low
53,5431264098786171,junior,Female,low
373,5492256342135698,junior,Female,low
465,5407119200524540,junior,Female,low
555,5574301016330760,junior,Female,low
630,5308576867572221,junior,Female,low
634,5438231572216713,junior,Female,low
701,5512074870840795,junior,Female,low
802,5416287053945374,junior,Female,low
902,5209381246715967,junior,Female,low


# Cifrado y seudonimización

## Cifrado SHA256

In [41]:
def encodeSHA256(df, column):
  return df[column].apply(
    lambda x: sha256(x.encode('utf-8')).hexdigest()
  )

In [42]:
encodeSHA256(df, 'creditcard')

0      8d184c9660fb4fb4020b668ae72331c058ff2ecee27339...
1      71c24d0c3e9886856df731e8b7f79f8c84549b9199fd83...
2      4be30506958bf04b22543e3854e3b3252956719c0a10d8...
3      4fac46c6a4cec927a80f3fad07befd4008cfc1b409294f...
4      0b403cbfc500b119e1b5eab465a0e0c1cb5bdb9c76e16a...
                             ...                        
994    324e5152c96e530d3a07c783c57d327df37d4750c3ca36...
995    8e9f2919c79ecf9f62642ff4138e99aa63ef95174b9706...
996    7b53f2d60091e7d8fcd953364a1583cfcda26c3c63d856...
997    caf6a4dda3f2c1060b2594e209f0911dc914b0eb0a9c41...
998    64541d50ad0c31425e7f46b609d0ee979a18e4c59bb754...
Name: creditcard, Length: 999, dtype: object

In [43]:
sum(
    encodeSHA256(df, 'creditcard') == 
    encodeSHA256(df, 'creditcard')
)

999

In [44]:
for name in algorithms_available:
	print(name)

sha1
blake2b
sha3_384
sha3_512
sha3_224
sha384
sha512
shake_128
sha3_256
blake2s
md5
shake_256
sha256
sha224


## Cifrado Blake2b

In [45]:
# https://docs.python.org/es/dev/library/hashlib.html#hashlib.blake2b
# https://www.blake2.net/

def encodeStringBLAKE2B(x, size=64, domain=b'', salt=b''):
  h = blake2b(
      digest_size = size, 
      person = domain, 
      salt = salt 
  )
  h.update(x.encode('utf8'))
  return h.hexdigest()

def encodeBLAKE2B(df, column, size=64, domain=b'', salt=b''):
  return df[column].apply(
    lambda x: encodeStringBLAKE2B(x, size, domain, salt)
  )

In [46]:
encodeBLAKE2B(df, 'creditcard', size=10)

0      174f11dd2cc3672d9510
1      36c6c74ac8b337b06745
2      228fce22fb2e00a8a54f
3      734e901f4319cc18f393
4      5f5131cc88aaefa23a43
               ...         
994    30c725768c8584a56197
995    1d964c8df977df307bbb
996    21e72784fdd0d6b9c1c8
997    ede715899bbf4b2dc553
998    ead99cce8f346bbabe78
Name: creditcard, Length: 999, dtype: object

In [47]:
sum(
    encodeBLAKE2B(df, 'creditcard', size=10) == 
    encodeBLAKE2B(df, 'creditcard', size=10)
)

999

## Cifrado Blake2b con gestión de dominio

In [48]:
encodeBLAKE2B(df, 'creditcard', size=10, domain=b'APP1')

0      27fae45b3ce8a1b105d0
1      fe528d1da3b60a2bea4a
2      2efaadf53748c20e7325
3      7c4e863ae6dec29b79be
4      cb2e7c6661dac2228b4d
               ...         
994    af545c82dd1ccd46ab9d
995    8c5acfb28674725164a2
996    84911b85ca1504816131
997    998035e804420ef27724
998    3183dd9039ba610d72b2
Name: creditcard, Length: 999, dtype: object

In [49]:
sum(
    encodeBLAKE2B(df, 'creditcard', size=10) == 
    encodeBLAKE2B(df, 'creditcard', size=10, domain=b'APP1')
)

0

In [50]:
sum(
    encodeBLAKE2B(df, 'creditcard', size=10, domain=b'APP1') == 
    encodeBLAKE2B(df, 'creditcard', size=10, domain=b'APP1')
)

999

In [51]:
encodeBLAKE2B(df, 'creditcard', size=10, domain=b'APP2')

0      ac16a9be96bb0b7d8633
1      baa6787047d38abaa8ac
2      9e4bd96ef3b159b5285a
3      4772e752517a79ad6ae9
4      470dbb03beb86e275ae6
               ...         
994    96b6aff51c05f9b9ca67
995    e0b07b0ce5727765f5ca
996    fec517378845e38cb969
997    5f23fd3f6d9c71d34aa5
998    9f46fe65a2c72979ed5f
Name: creditcard, Length: 999, dtype: object

In [52]:
sum(
    encodeBLAKE2B(df, 'creditcard', size=10, domain=b'APP1') == 
    encodeBLAKE2B(df, 'creditcard', size=10, domain=b'APP2')
)

0

## Cifrado Blake2b con salt (sal)

In [53]:
randomSalt = os.urandom(blake2b.SALT_SIZE)
encodeBLAKE2B(df, 'creditcard', size=10, domain=b'APP1', salt=randomSalt)

0      8f4215d746a97e4f6cb8
1      94e29fd2afc8363bee3e
2      0baea46f0ac62f9ed355
3      d9a6ba5ef2cf3cc13f14
4      1834336a0f4550e72796
               ...         
994    000d6af0303e999bbe50
995    271e66fb9d71f902bb47
996    5ffc5bc2fca5f9c774d0
997    e98a15215cbe3986213d
998    eb5514c608cae08292cb
Name: creditcard, Length: 999, dtype: object

In [54]:
sum(
    encodeBLAKE2B(df, 'creditcard', size=10, domain=b'APP1') == 
    encodeBLAKE2B(df, 'creditcard', size=10, domain=b'APP1', salt=randomSalt)
)

0

In [55]:
sum(
    encodeBLAKE2B(df, 'creditcard', size=10, domain=b'APP1', salt=randomSalt) == 
    encodeBLAKE2B(df, 'creditcard', size=10, domain=b'APP1', salt=randomSalt)
)

999

In [56]:
sum(
    encodeBLAKE2B(df, 'creditcard', size=10, domain=b'APP1', salt=randomSalt) == 
    encodeBLAKE2B(df, 'creditcard', size=10, domain=b'APP1', salt=os.urandom(blake2b.SALT_SIZE))
)

0

# Resultado final de anonimización

## Conjunto anonimizado con Anonimato K=5 y código postal en 2 grupos

In [57]:
df.creditcard = encodeBLAKE2B(df, 'creditcard', size=10, domain=b'Dataset con ZIP', salt=os.urandom(blake2b.SALT_SIZE))
df

Unnamed: 0,creditcard,zipcode,age,gender,salary
0,2cf78d190904b78ed00d,55000-55499,senior,Male,high
1,e3126b172002fcfb4b94,55000-55499,medium,Female,high
2,9b6b66e1415eeeed2380,55500-55999,medium,Female,high
3,9fbf27b11693cbdb4bc8,55500-55999,senior,Male,high
4,98b15057a2b2416eab84,55500-55999,senior,Male,high
...,...,...,...,...,...
994,42cedb546e0f8ea1ba64,55500-55999,senior,Female,medium
995,4bd87b002af99099bb91,55000-55499,junior,Female,high
996,ff38359544062ca42ac8,55500-55999,junior,Female,high
997,96fcf2f7c0945f7731cd,55500-55999,senior,Male,high


In [58]:
isKAnonymized(df, 5, queryKAnonymized_salaryAgeGrouped)

True

## Conjunto anonimizado con Anonimato K=11, sin código postal

In [59]:
df_nozip.creditcard = encodeBLAKE2B(df, 'creditcard', size=10, domain=b'Dataset sin ZIP', salt=os.urandom(blake2b.SALT_SIZE))
df_nozip

Unnamed: 0,creditcard,age,gender,salary
0,8440c89cb3ff21889b0e,senior,Male,high
1,98782ade3c47154b8503,medium,Female,high
2,7f19c90e966f3f4dddca,medium,Female,high
3,cc9ff10eae0063870e40,senior,Male,high
4,f81b4988f329ccdc0b8a,senior,Male,high
...,...,...,...,...
994,6c09813b0d3a9c9dd7d8,senior,Female,medium
995,260d13c880ec89f794e6,junior,Female,high
996,b171d9dc822fa5b48f11,junior,Female,high
997,d4c3e827234230e1f856,senior,Male,high


In [60]:
isKAnonymized(df, 11, queryKAnonymized_noZip)

True