# Assumption and  Adversial Model

We assume that each IP address is unique and associated to one user and that IP address does not change overt time, it is not necessary a realistic assumption since the ISP provide an random address ip if the user use 4g for example.
We assume that 2 successive request send in the same cell count has 1 visit in the cell.
We assume the connection is encrypted so the adversary cannot eavesdrop and read the packets but we assume that if the adversary have access to any datapresented in section 2.1, that the adversary can do data analysis and that the adversary can do request on its own. More concretely the service provider could be considered as the adversary and search to know information about indivual users.

These assumptions are usefull for privacy analysis because they give to the adversary a possibility to exploits data while not being unrealistic.

# Attack Strategy

## frequency attack

With a given ip we can count the number of time the user with this ip visited a given cell or given type of POI to learn what the hobbies of the user are and where he lives. We could also infer with timestamp to learn where the user work and what are his work hours. After this we can compare between different users if they are visiting the same place at the same time and learning which users are knowing each other. 

# Demonstration

In [53]:
from query import get_nearby_pois
from os import path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [64]:
def parse_queries():
    query_by_user = {}
    cwd = path.abspath('')
    queries = np.loadtxt(path.join(cwd, 'queries.csv'), delimiter=" ", dtype=object, skiprows=1)
    
    for query in queries:
        if query[0] in query_by_user:
            query_by_user[query[0]].append({'location': np.array([float(query[1]), float(query[2])]), 
                                            'timestamp': int(query[3].split(".")[0]), 'filter': query[4]})
        else:
            query_by_user[query[0]] = [{'location': np.array([float(query[1]), float(query[2])]), 
                                            'timestamp': int(query[3].split(".")[0]), 'filter': query[4]}]
    return query_by_user
    
def parse_pois():
    pois_by_id = {}
    min_lat, max_lat = float("inf"), 0
    min_long, max_long = float("inf"), 0
    with open('pois.csv') as file:
        next(file)
        for line in file.readlines():
            line = line.split()
            pois_by_id[int(line[0])] = {"cell":int(line[1]), "type": line[2], 
                                        "location": (float(line[3]), float(line[4]))}
            if float(line[3]) < min_lat:
                min_lat = float(line[3])
            elif float(line[3]) > max_lat:
                max_lat = float(line[3])
            
            if float(line[4]) < min_long:
                min_long = float(line[4])
            elif float(line[4]) > max_long:
                max_long = float(line[4])
    print("min_lat: " + str(min_lat) + ", max_lat: "+ str(max_lat))
    print("min_long: " + str(min_long) + ", max_long: "+ str(max_long))
    return pois_by_id

query_by_user = parse_queries()
pois_by_id = parse_pois()

min_lat: 46.50003195953242, max_lat: 46.569850585348725
min_long: 6.55005143958594, max_long: 6.649892848816985


In [3]:
## create plot for type of pois visited
def plot_type_searched(queries):
    for user, queries in queries.items():
        pois_type_visited = {}
        for query in queries:
            if query['filter'] in pois_type_visited:
                pois_type_visited[query['filter']] += 1
            else:
                pois_type_visited[query['filter']] = 1
                
        types = list(pois_type_visited.keys())
        nb_of_visit = list(pois_type_visited.values())
        
        fig = plt.figure(figsize = (10, 5))
        plt.bar(types, nb_of_visit, width = 0.4)
        
        plt.xlabel("poi type")
        plt.ylabel("Number of visit")
        plt.title("visited type by {user_id}".format(user_id=user))
        path_name = "figures/type_searched/type_searched_{user_id}.png".format(user_id = user)
        plt.savefig(path_name)
        plt.close()

plot_type_searched(query_by_user)

In [94]:
## find most visited coordinates
def plot_visited_coordinates(queries, pois):
    for user, queries in queries.items():
        coord_visited = {}
        previous_cell = 0
        
        for query in queries:
            #as all nearby_pois are in same cell, we can just take the first poi of the list and retrive its cell
            location = tuple(list(query['location']))
            if location in coord_visited:
                coord_visited[location] += 1
            else:
                coord_visited[location] = 1
        
        coord = list(coord_visited.keys())
        nb_of_visit = list(coord_visited.values())
        fig = plt.figure(figsize = (20, 5))
        x_pos = np.arange(len(coord))
        plt.bar(x_pos, nb_of_visit, width = 0.4)
    
        plt.xlabel("coord")
        plt.ylabel("Number of visit")
        plt.title("coord visited by {user_id}".format(user_id=user))
        plt.xticks(x_pos,coord)
        path_name = "figures/coord_visited/coord_visited_{user_id}.png".format(user_id = user)
        plt.savefig(path_name)
        plt.close()
        
plot_visited_coordinates(query_by_user, pois_by_id)

In [16]:
def plot_number_of_meeting_between_user(queries):
    for user1, queries1 in queries.items():
        meeting = {}
        for query1 in queries1:
            for user2, queries2 in queries.items():
                if user1 != user2:
                    for query2 in queries2:
                        if query1['timestamp'] == query2['timestamp']:
                            if user2 in meeting:
                                meeting[user2] += 1
                            else:
                                meeting[user2] = 1
        users = list(meeting.keys())
        nb_of_meeting = list(meeting.values())
        fig = plt.figure(figsize = (30,5))
        plt.bar(users, nb_of_meeting, width=0.4)
        
        plt.xlabel("user met")
        plt.ylabel("Number of meeting")
        plt.title("meeting of {user_id} with other users".format(user_id=user1))
        path_name = "figures/meetings/meeting_of_{user_id}.png".format(user_id = user1)
        plt.savefig(path_name)
        plt.close()
plot_number_of_meeting_between_user(query_by_user)

In [None]:
#TODO: recreate wich poi were visited by users during 1 day

# Defense

## definition

We state the definition of the privacy as the k-anonmity achieve by the databse. A great k leads to a better privacy.
We state de definiton of utility as the capacity of a given data give the correct result of a querry.

## defense description

We will use databse sanitization as a defense on different data.
First we will make the user location more generalized. In fact we could track the exact position of a user since the location is accurate to the less than the milimeters. We have a 10 by 10 grid covering 0.069818625816305 in latitude and 0.099841409231045 in longitude so 0.06 latitude and 0.09 longitude par cells. So each cells have a size about 0.78 times 1.11 km². So clearly having the current precision is overkill and a precision about 100 meters is cleary sufficient and we can only keep 3 decimals for this. It should increase the k-anonmity of our database.

Then the timestamp are currently not use by the application so we can just delete them completely. It would also incerease k-anonmity and prevent inference on meetings for exemple.

In [43]:
queries = []

with open('queries.csv') as file:
        next(file)
        for line in file.readlines():
            queries.append(line.split())

for query in queries:
    query.remove(query[3])
    query[1] = round(float(query[1]), 3)
    query[2] = round(float(query[2]), 3)

# measure k-anonymity
k_list = []
for i in range(len(queries)):
    k = 1
    for j in range(len(queries)):
        if i != j:
            if queries[i][1] == queries[j][1] and queries[i][2] == queries[j][2]:
                k+=1
    k_list.append(k)
print("k-anonymity: "+str(min(k_list)))

k-anonymity: 1


We achieve 1-anonymity set that's a poor result. A way to improve the anonimity of the databse would be to not sent de geographical coordinates but the cell id, letting the user compute the cell from his location and only sent the cell id to the service provider. This would increase the anonymity.

In [None]:
queries = []

with open('queries.csv') as file:
        next(file)
        for line in file.readlines():
            queries.append(line.split())

for query in queries:
    nearby_pois = get_nearby_pois(np.array([float(query[1]), float(query[2])]), query[4])
    if query[0] == '104.149.206.168':
        if int(pois_by_id[nearby_pois[0]]['cell']) == 1:
            print("cell 1")
    query.remove(query[3])
    query.remove(query[2])
    if nearby_pois:
        if int(pois_by_id[nearby_pois[0]]['cell']) == 1:
            print(int(pois_by_id[nearby_pois[0]]['cell']))
        query[1] = int(pois_by_id[nearby_pois[0]]['cell'])
    else:
        query[1] = 0

# measure k-anonymity
k_list = []
for i in range(len(queries)):
    k = 1
    for j in range(len(queries)):
        if i != j:
            if queries[i][1] == queries[j][1]:
                k+=1
    if k == 1:
        print(queries[i])
    k_list.append(k)
print("k-anonymity: "+str(min(k_list)))

1
