# Part 2 : privacy evaluation

In [1]:
import csv
import numpy as np

In [2]:
import bisect

## Grid parameters
# Top left corner of the area
MAP_LAT = 46.5
MAP_LON = 6.55

# Total area size
MAP_SIZE_LAT = 0.07
MAP_SIZE_LON = 0.10

# Number of cells
CELL_NUM_LAT = 10
CELL_NUM_LON = 10

# Grid lines
GRID_LAT_POINTS = [MAP_LAT + i * (MAP_SIZE_LAT / CELL_NUM_LAT)
                   for i in range(1, CELL_NUM_LAT + 1)]
GRID_LON_POINTS = [MAP_LON + i * (MAP_SIZE_LON / CELL_NUM_LON)
                   for i in range(1, CELL_NUM_LON + 1)]


def location_to_cell_id(lat, lon):
    """Get the grid cell ID for a given latitude and longitude."""
    if not (MAP_LAT <= lat < MAP_LAT + MAP_SIZE_LAT) or not (
        MAP_LON <= lon < MAP_LON + MAP_SIZE_LON
    ):
        raise ValueError("Out of area range.")

    i = bisect.bisect(GRID_LAT_POINTS, lat)
    j = bisect.bisect(GRID_LON_POINTS, lon)
    return i * CELL_NUM_LAT + j + 1


## Load data

In [3]:
# Load queries
queries = np.empty((0,5))
with open('queries.csv') as queries_file:
    query_reader = csv.reader(queries_file, delimiter=" ")
    line_count = 0
    for row in query_reader:
        if line_count == 0:
            line_count += 1
        else:
            queries = np.append(queries, np.array([row]), axis=0)
            line_count += 1
    print(f'Processed {line_count} lines.')

Processed 20444 lines.


In [4]:
# Load pois
pois = np.empty((0,5))
with open('pois.csv') as pois_file:
    pois_reader = csv.reader(pois_file, delimiter=" ")
    line_count = 0
    for row in pois_reader:
        if line_count == 0:
            line_count += 1
        else:
            pois = np.append(pois, np.array([row]), axis=0)
            line_count += 1
    print(f'Processed {line_count} lines.')

Processed 1061 lines.


In [5]:
# Print every different possible queries
diff_queries = np.unique(pois[:,2])
print(diff_queries)

['appartment_block' 'bar' 'cafeteria' 'club' 'company' 'dojo' 'gym'
 'laboratory' 'office' 'restaurant' 'supermarket' 'villa']


## Attacks

In [6]:
# ATTACK 1 : Interests of some users
queries_interests = np.copy(queries)
queries_interests = queries_interests[:,[0,4]] # select ip address and poi

cond_gym = queries[:,4] == 'gym'
cond_dojo = queries[:,4] == 'dojo'
cond_interests = np.logical_or(cond_gym, cond_dojo) # select gyms and dojos

queries_interests = queries_interests[cond_interests]
print("Number of users with interests found : ", np.shape(np.unique(queries_interests[:,0]))[0])

queries_interests = [tuple(row) for row in queries_interests] # to make the queries unique
queries_interests = np.unique(queries_interests, axis=0)
print("Size : ", np.shape(queries_interests)[0])
print(queries_interests)

Number of users with interests found :  166
Size :  213
[['0.98.248.97' 'dojo']
 ['10.229.150.53' 'dojo']
 ['10.229.150.53' 'gym']
 ['100.255.65.73' 'dojo']
 ['101.193.212.180' 'gym']
 ['103.107.27.105' 'gym']
 ['104.149.206.168' 'gym']
 ['105.148.239.144' 'gym']
 ['107.201.148.122' 'dojo']
 ['107.201.148.122' 'gym']
 ['11.105.81.179' 'dojo']
 ['11.173.13.2' 'dojo']
 ['11.173.13.2' 'gym']
 ['113.167.82.177' 'dojo']
 ['113.167.82.177' 'gym']
 ['113.244.164.228' 'dojo']
 ['114.193.164.29' 'dojo']
 ['115.106.150.37' 'dojo']
 ['115.186.150.175' 'dojo']
 ['115.207.37.109' 'dojo']
 ['117.103.63.202' 'gym']
 ['121.125.118.222' 'gym']
 ['121.229.130.24' 'dojo']
 ['123.194.67.60' 'dojo']
 ['123.239.52.251' 'dojo']
 ['123.239.52.251' 'gym']
 ['126.107.209.19' 'gym']
 ['126.150.186.135' 'gym']
 ['127.4.23.40' 'dojo']
 ['127.94.142.97' 'gym']
 ['129.133.79.138' 'gym']
 ['13.103.179.102' 'dojo']
 ['13.103.179.102' 'gym']
 ['13.144.1.235' 'gym']
 ['13.191.142.105' 'gym']
 ['130.144.146.191' 'dojo']


In [7]:
# ATTACK 2 : Home of some users and their interests

# Queries time mod 24 : numpy array 'queries_t' is a copy of queries but
# with time mod 24 so we can filter by the time of the day
time = queries[:,3].astype(np.float)
time = time % 24
queries_t = np.copy(queries)
queries_t[:,3] = time

# Queries that could come from home of the user (between midnight and 6 o'clock, without 'club' or 'bar')
cond_time = queries_t[:,3].astype(np.float) < 6
cond_no_club = queries_t[:,4] != 'club'
cond_no_bar = queries_t[:,4] != 'bar'
all_cond = np.logical_and(cond_time, np.logical_and(cond_no_club, cond_no_bar))

queries_home = queries_t[all_cond]

print("Size : ", np.shape(queries_home)[0])
print(queries_home[:,[0,1,2,4]])

Size :  79
[['82.230.180.56' '46.56596058841714' '6.642228369125065' 'gym']
 ['82.230.180.56' '46.56596058841714' '6.642228369125065' 'restaurant']
 ['46.103.95.199' '46.557453635243554' '6.593135121452934' 'restaurant']
 ['46.103.95.199' '46.557453635243554' '6.593135121452934' 'dojo']
 ['46.103.95.199' '46.5613247900066' '6.59239490482244' 'restaurant']
 ['46.103.95.199' '46.56252868802353' '6.596979919954148' 'cafeteria']
 ['46.103.95.199' '46.56252868802353' '6.596979919954148' 'dojo']
 ['46.103.95.199' '46.55626358771141' '6.591222669318815' 'dojo']
 ['46.103.95.199' '46.55626358771141' '6.591222669318815' 'cafeteria']
 ['46.103.95.199' '46.56075161011403' '6.596973196900907' 'cafeteria']
 ['46.103.95.199' '46.556516722962456' '6.593581353284495' 'cafeteria']
 ['46.103.95.199' '46.56252868802353' '6.596979919954148' 'cafeteria']
 ['231.216.0.104' '46.53795463959577' '6.605128263999426' 'gym']
 ['140.81.135.193' '46.53265953669294' '6.615344714498467' 'cafeteria']
 ['127.94.142.97'

In [8]:
# ATTACK 3 : Workplace of some users

# From monday to friday (24h * 5d = 120h)
queries_work = np.copy(queries)
queries_work = queries[queries[:,3].astype(np.float) < 120]

# Time mod 24
time = queries_work[:,3].astype(np.float)
time = time % 24
queries_work[:,3] = time

# Select time from 9am to 4pm
cond_time1 = queries_work[:,3].astype(np.float) > 9
cond_time2 = queries_work[:,3].astype(np.float) < 16
cond_time = np.logical_and(cond_time1, cond_time2)

# Select interests
cond_gym = queries_work[:,4] == 'gym'
cond_bar = queries_work[:,4] == 'bar'
cond_cafeteria = queries_work[:,4] == 'cafeteria'
cond_restaurant = queries_work[:,4] == 'restaurant'
cond_interests = np.logical_or(np.logical_or(cond_gym, cond_bar), np.logical_or(cond_cafeteria, cond_restaurant))

cond_all = np.logical_and(cond_time, cond_interests)
queries_work = queries_work[cond_all] # Monday to friday, 8am to 4pm, with selected interests

print("Unique users : ", np.shape(np.unique(queries_work[:,0]))[0])
print("Size : ", np.shape(queries_work)[0])
print(queries_work[1:100])

Unique users :  200
Size :  2046
[['34.101.177.245' '46.53294222140508' '6.591174086010503'
  '14.912447506427014' 'restaurant']
 ['34.101.177.245' '46.53294222140508' '6.591174086010503'
  '12.334538864268396' 'cafeteria']
 ['34.101.177.245' '46.53294222140508' '6.591174086010503'
  '12.334538864268396' 'restaurant']
 ['34.101.177.245' '46.53294222140508' '6.591174086010503'
  '11.359388449373135' 'cafeteria']
 ['34.101.177.245' '46.53294222140508' '6.591174086010503'
  '11.359388449373135' 'restaurant']
 ['34.101.177.245' '46.53294222140508' '6.591174086010503'
  '11.78984190081674' 'cafeteria']
 ['34.101.177.245' '46.53294222140508' '6.591174086010503'
  '11.78984190081674' 'restaurant']
 ['34.101.177.245' '46.55034241161493' '6.6028524983313'
  '14.796291220463075' 'restaurant']
 ['34.101.177.245' '46.53294222140508' '6.591174086010503'
  '13.212732990410444' 'cafeteria']
 ['34.101.177.245' '46.53294222140508' '6.591174086010503'
  '13.212732990410444' 'restaurant']
 ['244.190.169.

In [9]:
# Attack 4 : Find home from priors on interests and working place
working_place_coord1 = '46.53294222140508'
working_place_coord2 = '6.591174086010503'

cond_coord1 = queries_t[:,1] == working_place_coord1
cond_coord2 = queries_t[:,2] == working_place_coord2
cond_dojo = queries_t[:,4] == 'restaurant'
cond_club = queries_t[:,4] == 'cafeteria'
cond_time1 = queries_t[:,3].astype(np.float) > 11
cond_time2 = queries_t[:,3].astype(np.float) < 13
cond_all = np.logical_and(np.logical_and(cond_time1, cond_time2), np.logical_and(cond_coord1, cond_coord2), np.logical_or(cond_dojo, cond_club))

queries_user = np.copy(queries)
queries_user = queries_user[cond_all]

print("Number of users with these conditions : ", np.shape(np.unique(queries_user[:,0]))[0])
print("IP addresses found : ", np.unique(queries_user[:,0]))

Number of users with these conditions :  5
IP addresses found :  ['104.149.206.168' '105.148.239.144' '202.161.113.142' '250.163.7.81'
 '34.101.177.245']


## Defence

In [10]:
# Add noise to coordinates, drawn from laplacian distribution with mean 0 and scale sqrt(0.5*4*10^(-4))
mean = 0
scale = np.sqrt(0.5 * 1 * 10**(-6))

queries_noisy = np.copy(queries)
qn_coord = queries_noisy[:,[1,2]].astype(np.float) 
noise = np.random.laplace(mean, scale, np.shape(qn_coord)) 
qn_coord = qn_coord + noise

# Make sure that it did not go out of bounds
for i in range(np.shape(qn_coord)[0]):
    
    if(qn_coord[i][0] < MAP_LAT):
        qn_coord[i][0] = MAP_LAT
    elif(qn_coord[i][0] >= MAP_LAT + MAP_SIZE_LAT):
        qn_coord[i][0] = MAP_LAT + MAP_SIZE_LAT - 10**(-6) # -10^(-6) so that it is smaller than LAT + LAT_SIZE
    
    if(qn_coord[i][1] < MAP_LON):
        qn_coord[i][1] = MAP_LON
    elif(qn_coord[i][1] >= MAP_LON + MAP_SIZE_LON):
        qn_coord[i][1] = MAP_LON + MAP_SIZE_LON - 10**(-6) # -10^(-6) so that it is smaller than LAT + LAT_SIZE
    
queries_noisy[:,[1,2]] = qn_coord
print("STD : ", np.std(queries[:,1].astype(np.float)-queries_noisy[:,1].astype(np.float)))

STD :  0.0010062870957853532


In [11]:
# ATTACK 2 with defence : Home of some users and their interests

# Queries time mod 24 : numpy array 'queries_t' is a copy of queries but
# with time mod 24 so we can filter by the time of the day
time = queries_noisy[:,3].astype(np.float)
time = time % 24
queries_t_n = np.copy(queries_noisy)
queries_t_n[:,3] = time

# Queries that could come from home of the user (between midnight and 6 o'clock, without 'club' or 'bar')
cond_time = queries_t_n[:,3].astype(np.float) < 6
cond_no_club = queries_t_n[:,4] != 'club'
cond_no_bar = queries_t_n[:,4] != 'bar'
all_cond = np.logical_and(cond_time, np.logical_and(cond_no_club, cond_no_bar))

queries_home_n = queries_t_n[all_cond]

print("Size : ", np.shape(queries_home_n)[0])
print(queries_home_n[:,[0,1,2,4]])

Size :  79
[['82.230.180.56' '46.56589485096079' '6.640167248506715' 'gym']
 ['82.230.180.56' '46.564887603332146' '6.642028704565246' 'restaurant']
 ['46.103.95.199' '46.556390867148295' '6.5928720678138095' 'restaurant']
 ['46.103.95.199' '46.55633493721258' '6.593235768410641' 'dojo']
 ['46.103.95.199' '46.56012263350656' '6.592757792828251' 'restaurant']
 ['46.103.95.199' '46.56259124705649' '6.597543517907726' 'cafeteria']
 ['46.103.95.199' '46.56243813163433' '6.59655960264963' 'dojo']
 ['46.103.95.199' '46.555899070226715' '6.5950388874141' 'dojo']
 ['46.103.95.199' '46.555759158902646' '6.591396591190188' 'cafeteria']
 ['46.103.95.199' '46.561784536869986' '6.5961014982837165' 'cafeteria']
 ['46.103.95.199' '46.556915440234306' '6.593641700131736' 'cafeteria']
 ['46.103.95.199' '46.56235362815451' '6.596840009670903' 'cafeteria']
 ['231.216.0.104' '46.53788427690273' '6.605127037757712' 'gym']
 ['140.81.135.193' '46.5327427843106' '6.617930088181798' 'cafeteria']
 ['127.94.142.

In [29]:
# ATTACK 3 with defence : Workplace of some users

# From monday to friday (24h * 5d = 120h)
queries_work_n = np.copy(queries_noisy)
queries_work_n = queries_noisy[queries_noisy[:,3].astype(np.float) < 120]

# Time mod 24
time = queries_work_n[:,3].astype(np.float)
time = time % 24
queries_work_n[:,3] = time

# Select time from 9am to 4pm
cond_time1 = queries_work_n[:,3].astype(np.float) > 9
cond_time2 = queries_work_n[:,3].astype(np.float) < 16
cond_time = np.logical_and(cond_time1, cond_time2)

# Select interests
cond_gym = queries_work_n[:,4] == 'gym'
cond_bar = queries_work_n[:,4] == 'bar'
cond_cafeteria = queries_work_n[:,4] == 'cafeteria'
cond_restaurant = queries_work_n[:,4] == 'restaurant'
cond_interests = np.logical_or(np.logical_or(cond_gym, cond_bar), np.logical_or(cond_cafeteria, cond_restaurant))

cond_all = np.logical_and(cond_time, cond_interests)
queries_work_n = queries_work_n[cond_all] # Monday to friday, 8am to 4pm, with selected interests

print("Unique users : ", np.shape(np.unique(queries_work_n[:,0]))[0])
print("Size : ", np.shape(queries_work_n)[0])
print(queries_work_n[0:100])

Unique users :  200
Size :  2046
[['34.101.177.245' '46.533540446681755' '6.594702607140822'
  '14.912447506427014' 'cafeteria']
 ['34.101.177.245' '46.53305879588325' '6.592037599783317'
  '14.912447506427014' 'restaurant']
 ['34.101.177.245' '46.53317279668529' '6.590604102238297'
  '12.334538864268396' 'cafeteria']
 ['34.101.177.245' '46.53306841651351' '6.591265479498984'
  '12.334538864268396' 'restaurant']
 ['34.101.177.245' '46.53318477416316' '6.588533731356185'
  '11.359388449373135' 'cafeteria']
 ['34.101.177.245' '46.533856207248235' '6.591355694847234'
  '11.359388449373135' 'restaurant']
 ['34.101.177.245' '46.53228241704763' '6.592051888929724'
  '11.78984190081674' 'cafeteria']
 ['34.101.177.245' '46.53268374589016' '6.590977540781997'
  '11.78984190081674' 'restaurant']
 ['34.101.177.245' '46.551225609109814' '6.602762843670195'
  '14.796291220463075' 'restaurant']
 ['34.101.177.245' '46.53231030865269' '6.589350076821599'
  '13.212732990410444' 'cafeteria']
 ['34.101.1

In [49]:
# Results of defence applied to Attack 3, for the first 3 users
first_user = queries_work_n[0:11,[1,2]].astype(np.float)
second_user = queries_work_n[11:21,[1,2]].astype(np.float)
third_user = queries_work_n[21:32,[1,2]].astype(np.float)

print("Real coord 1 : ", queries_work[0,[1,2]].astype(np.float))
print("Reconstructed coord 1 : ", np.mean(first_user[:,0]), np.mean(first_user[:,1]))
print("~200m error")

print("Real coord 2 : ", queries_work[11,[1,2]].astype(np.float))
print("Reconstructed coord 2 : ", np.mean(second_user[:,0]), np.mean(second_user[:,1]))
print("~50m error")

print("Real coord 3 : ", queries_work[21,[1,2]].astype(np.float))
print("Reconstructed coord 3 : ", np.mean(third_user[:,0]), np.mean(third_user[:,1]))
print("~330m error")

Real coord 1 :  [46.53294222  6.59117409]
Reconstructed coord 1 :  46.53474762414503 6.5922327797454905
~200m error
Real coord 2 :  [46.54078235  6.59189659]
Reconstructed coord 2 :  46.54107132888438 6.592535743058119
~50m error
Real coord 3 :  [46.5463774   6.57535305]
Reconstructed coord 3 :  46.54822508096495 6.578853001269809
~334m error


In [13]:
# Attack 4 with defence : Find home from priors on interests and working place
working_place_coord1 = '46.53294222140508'
working_place_coord2 = '6.591174086010503'

cond_coord1 = queries_noisy[:,1] == working_place_coord1
cond_coord2 = queries_noisy[:,2] == working_place_coord2
cond_dojo = queries_noisy[:,4] == 'restaurant'
cond_club = queries_noisy[:,4] == 'cafeteria'
cond_time1 = queries_t[:,3].astype(np.float) > 11
cond_time2 = queries_t[:,3].astype(np.float) < 13
cond_all = np.logical_and(np.logical_and(cond_time1, cond_time2), np.logical_and(cond_coord1, cond_coord2), np.logical_or(cond_dojo, cond_club))

queries_user_n = np.copy(queries_noisy)
queries_user_n = queries_user_n[cond_all]

print("Number of users with these conditions : ", np.shape(np.unique(queries_user_n[:,0]))[0])
print("IP addresses found : ", np.unique(queries_user_n[:,0]))

Number of users with these conditions :  0
IP addresses found :  []


In [14]:
# Compare cell differences between real and noisy queries
real_coord = queries[:,[1,2]].astype(np.float)
noisy_coord = queries_noisy[:,[1,2]].astype(np.float)

real_res = np.array([location_to_cell_id(coord[0], coord[1]) for coord in real_coord])
noisy_res = np.array([location_to_cell_id(coord[0], coord[1]) for coord in noisy_coord])

diff = real_res - noisy_res
diff = [0 if d == 0 else 1 for d in diff]

print("Error [%] : ", 100*np.sum(diff)/np.size(diff))

Error [%] :  15.486963752873844
