# CRYPTECS Mobility Use Case

In [13]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

import json

from tqdm.auto import tqdm
tqdm.pandas()

Uncomment the relevant lines in the following cell to install the dependencies.

In [14]:
# Basic packages needed
# ! pip install numpy pandas tqdm

# For visualisation
# ! pip install keplergl geopandas

# For privacy attacks
# ! pip install scikit-mobility

## Load data

We used the Shanghai Telecom dataset. However, because it contains only a small number of individuals (about 9,000) over a long time span, we generated 25,000 additional synthetic individuals based on the diestra algorithm.

UIDs in [1, 25 000] are synthetic data. UIDs in [25 001, 34 616] are real individuals.

In [15]:
data = pd.read_csv('./data.csv')

In [16]:
data.head()

Unnamed: 0,datetime,lat,lng,station,uid
0,2014-11-14 02:00:00,31.248836,121.463246,2781,1
1,2014-10-22 06:00:00,31.248836,121.463246,2781,1
2,2014-10-22 03:00:00,31.276982,121.460649,496,1
3,2014-10-22 01:00:00,31.25393,121.455087,69,1
4,2014-10-21 23:00:00,31.281363,121.459935,456,1


In [17]:
data['datetime'] = pd.to_datetime(data['datetime'])

In [18]:
# Extract month, day, weekday and hour from datetime.
# The vectorised `.dt` accessor computes each column in one pass instead of
# calling a Python lambda once per row (about 11 million calls per column).
data['month'] = data['datetime'].dt.month
data['day'] = data['datetime'].dt.day
data['weekday'] = data['datetime'].dt.weekday  # Monday=0 ... Sunday=6
data['hour'] = data['datetime'].dt.hour

  0%|          | 0/11243864 [00:00<?, ?it/s]

  0%|          | 0/11243864 [00:00<?, ?it/s]

  0%|          | 0/11243864 [00:00<?, ?it/s]

  0%|          | 0/11243864 [00:00<?, ?it/s]

In [19]:
data.head()

Unnamed: 0,datetime,lat,lng,station,uid,month,day,weekday,hour
0,2014-11-14 02:00:00,31.248836,121.463246,2781,1,11,14,4,2
1,2014-10-22 06:00:00,31.248836,121.463246,2781,1,10,22,2,6
2,2014-10-22 03:00:00,31.276982,121.460649,496,1,10,22,2,3
3,2014-10-22 01:00:00,31.25393,121.455087,69,1,10,22,2,1
4,2014-10-21 23:00:00,31.281363,121.459935,456,1,10,21,1,23


## Visualization

In this section we plot all the antennas on a map to help visualize the problem. 
We will use similar map plotting to visualize the results of requests.

In [35]:
import geopandas as gpd
from keplergl import KeplerGl
#from shapely.geometry import Point, LineString, MultiLineString

In [36]:
# Load the lookup table mapping each station id (as a string key) to its coordinates
with open('id_to_gps.json', 'r') as gps_file:
    station_to_gps = json.load(gps_file)

In [37]:
# Flatten the station lookup into parallel lists (integer id, latitude, longitude).
# `ids`, `lat` and `long` are reused further down when building the per-station counts.
ids = [int(station_key) for station_key in station_to_gps]
lat = [coords[0] for coords in station_to_gps.values()]
long = [coords[1] for coords in station_to_gps.values()]
df_stations = pd.DataFrame({'id': ids, 'latitude': lat, 'longitude': long})

In [38]:
def plot_map(df = None, add_stations=True, map_height=900, add_ref_point=False):
    """Build a KeplerGl map widget, optionally populated with data layers.

    Parameters
    ----------
    df : DataFrame-like, optional
        Extra dataset added to the map under the layer name "data".
    add_stations : bool
        When True, the module-level `df_stations` frame is added as the
        "stations" layer.
    map_height : int
        Height passed to the KeplerGl widget.
    add_ref_point : bool
        When True, the map is created with a fixed initial view
        (latitude/longitude/zoom below) instead of KeplerGl's default view.

    Returns
    -------
    The KeplerGl map object.
    """
    kepler_kwargs = {'height': map_height}
    if add_ref_point:
        # Fixed initial camera position used for the single-station plots
        kepler_kwargs['config'] = {
            'version': 'v1',
            'config': {
                'mapState': {
                    'latitude': 31.23696246196823,
                    'longitude': 121.5124232964245,
                    'zoom': 9,
                },
            },
        }
    map_cluster = KeplerGl(**kepler_kwargs)

    # Stations go first so the layer order matches the order of the flags
    layers = []
    if add_stations:
        layers.append(("stations", df_stations))
    if df is not None:
        layers.append(("data", df))
    for layer_name, layer_data in layers:
        map_cluster.add_data(data=layer_data, name=layer_name)

    return map_cluster

In [40]:
# Number of records observed at each station
val_counts = data['station'].value_counts()

# One row per known station; a station absent from the records gets a count of 0
records_per_station = [val_counts.get(station, 0) for station in ids]
df = pd.DataFrame({
    'id': ids,
    'latitude': lat,
    'longitude': long,
    'count': records_per_station,
})

In [42]:
plot_map(df, add_stations=False)

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'data': {'index': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21…

## Examples of request

### 1. How many people were at a station *s=702* at the time *t=20/10/2014 11h*?

In [25]:
station_id = 702
month = 10
day = 20
hour = 11

# Records made at the requested station during the requested hour
at_station = data['station'] == station_id
at_time = (data['month'] == month) & (data['day'] == day) & (data['hour'] == hour)

# Each person counts once, however many records they produced
res = len(data.loc[at_time & at_station, 'uid'].unique())

print(f'There were {res} peoples at station {station_id} the {day}/{month}/2014 at {hour}h.')

There were 5 peoples at station 702 the 20/10/2014 at 11h.


In [26]:
# Single-row frame: coordinates of the queried station plus the head count
station_coords = station_to_gps[str(station_id)]
df = pd.DataFrame({
    'lat': [station_coords[0]],
    'lng': [station_coords[1]],
    'total': [res],
})
plot_map(df, add_stations=False, add_ref_point=True)

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(config={'version': 'v1', 'config': {'mapState': {'latitude': 31.23696246196823, 'longitude': 121.5124…

### 2. How many people were at a station *s=702* from the *15/10/2014 5h* to *21/10/2014 21h*?

In [27]:
station_id = 702
start_date = datetime(year=2014, month=10, day=15, hour=5)
end_date = datetime(year=2014, month=10, day=21, hour=21)

# Both ends of the time window are inclusive
in_window = data['datetime'].between(start_date, end_date)
at_station = data['station'] == station_id

# Distinct people seen at the station at any point in the window
res = len(data.loc[in_window & at_station, 'uid'].unique())
print(f'From the {start_date} to the {end_date} there were {res} unique peoples that went by station {station_id}.')

From the 2014-10-15 05:00:00 to the 2014-10-21 21:00:00 there were 263 unique peoples that went by station 702.


In [28]:
# Single-row frame: coordinates of the queried station plus the head count
station_coords = station_to_gps[str(station_id)]
df = pd.DataFrame({
    'lat': [station_coords[0]],
    'lng': [station_coords[1]],
    'total': [res],
})
plot_map(df, add_stations=False, add_ref_point=True)

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(config={'version': 'v1', 'config': {'mapState': {'latitude': 31.23696246196823, 'longitude': 121.5124…

### 3. How many people were at stations *2593, 2865, 1463, 398, 2794* the 14/09/2014?

In [29]:
stations_id = [2593, 2865, 1463, 398, 2794]
start_date = datetime(year=2014, month=9, day=14)
end_date = start_date + timedelta(days=1)

# Half-open window [start_date, end_date): a record stamped exactly at midnight
# on the following day belongs to that next day and must not be counted here.
in_window = (data['datetime'] >= start_date) & (data['datetime'] < end_date)
at_stations = data['station'].isin(stations_id)

# Distinct people seen at any of the listed stations during the day
res = len(data.loc[in_window & at_stations, 'uid'].unique())
print(f'From the {start_date} to the {end_date} there were {res} peoples that were at one of the following station station {stations_id}.')

From the 2014-09-14 00:00:00 to the 2014-09-15 00:00:00 there were 22 peoples that were at one of the following station station [2593, 2865, 1463, 398, 2794].


In [30]:
# One row per queried station; the same overall head count is repeated on every row
queried_coords = [station_to_gps[str(s)] for s in stations_id]
df = pd.DataFrame({
    'lat': [coords[0] for coords in queried_coords],
    'lng': [coords[1] for coords in queried_coords],
    'total': [res] * len(stations_id),
})
plot_map(df, add_stations=False)

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'data': {'index': [0, 1, 2, 3, 4], 'columns': ['lat', 'lng', 'total'], 'data': [[31.208153, 121…

### 4. How many of the people who were at stations *2593, 2865, 1463, 398, 2794* on the 14/09/2014 were also at station *31* on the 17/09/2014?

In [31]:
# People seen at the first group of stations on 14/09/2014.
# The window is half-open, [start_date, end_date), so that a record stamped at
# midnight on the following day is not attributed to this day.
first_stations_id = [2593, 2865, 1463, 398, 2794]
start_date = datetime(year=2014, month=9, day=14)
end_date = start_date + timedelta(days=1)
in_first_window = (data['datetime'] >= start_date) & (data['datetime'] < end_date)
uids_first = data.loc[in_first_window & data['station'].isin(first_stations_id), 'uid'].unique()

# People seen at the second group of stations on 17/09/2014 (same half-open rule)
second_stations_id = [31]
start_date = datetime(year=2014, month=9, day=17)
end_date = start_date + timedelta(days=1)
in_second_window = (data['datetime'] >= start_date) & (data['datetime'] < end_date)
uids_second = data.loc[in_second_window & data['station'].isin(second_stations_id), 'uid'].unique()

# Both arrays already hold unique uids, so the size of their intersection is
# the number of people present in both groups.
res = len(np.intersect1d(uids_first, uids_second))

print(f'There were {res} peoples that were at stations {first_stations_id} the 14/09/2014 that also at station {second_stations_id} the 17/09/2014')

There were 2 peoples that were at stations [2593, 2865, 1463, 398, 2794] the 14/09/2014 that also at station [31] the 17/09/2014


In [32]:
# Every (source station, target station) combination, with the source varying slowest
station_pairs = [(src, dst) for src in first_stations_id for dst in second_stations_id]
source_coords = [station_to_gps[str(src)] for src, dst in station_pairs]
target_coords = [station_to_gps[str(dst)] for src, dst in station_pairs]

# One row per pair; the same overall count is repeated on every row
df = pd.DataFrame({
    'Source Lat': [coords[0] for coords in source_coords],
    'Source Lng': [coords[1] for coords in source_coords],
    'Target Lat': [coords[0] for coords in target_coords],
    'Target Lng': [coords[1] for coords in target_coords],
    'total': [res] * len(station_pairs),
})
plot_map(df, add_stations=False)

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'data': {'index': [0, 1, 2, 3, 4], 'columns': ['Source Lat', 'Source Lng', 'Target Lat', 'Targe…

### 5. How many of the people who were at stations *2593, 2865, 1463, 398, 2794* during the first week of September were also at stations *31, 1461, 105, 1470* during the first week of November?

In [33]:
# People seen at the first group of stations during the week starting Monday 01/09/2014.
# The window is half-open, [start_date, end_date): exactly seven days, without
# the record slot at midnight of the eighth day.
first_stations_id = [2593, 2865, 1463, 398, 2794]
start_date = datetime(year=2014, month=9, day=1)
end_date = start_date + timedelta(days=7)
in_first_window = (data['datetime'] >= start_date) & (data['datetime'] < end_date)
uids_first = data.loc[in_first_window & data['station'].isin(first_stations_id), 'uid'].unique()

# People seen at the second group of stations during the week starting
# Monday 03/11/2014 (same half-open rule).
second_stations_id = [31, 1461, 105, 1470]
start_date = datetime(year=2014, month=11, day=3)
end_date = start_date + timedelta(days=7)
in_second_window = (data['datetime'] >= start_date) & (data['datetime'] < end_date)
uids_second = data.loc[in_second_window & data['station'].isin(second_stations_id), 'uid'].unique()

# Both arrays already hold unique uids, so the size of their intersection is
# the number of people present in both groups.
res = len(np.intersect1d(uids_first, uids_second))

print(f'There were {res} peoples that were at stations {first_stations_id} the first week of september that also at stations {second_stations_id} the first of november.')

There were 18 peoples that were at stations [2593, 2865, 1463, 398, 2794] the first week of september that also at stations [31, 1461, 105, 1470] the first of november.


In [34]:
# Every (source station, target station) combination, with the source varying slowest
station_pairs = [(src, dst) for src in first_stations_id for dst in second_stations_id]
source_coords = [station_to_gps[str(src)] for src, dst in station_pairs]
target_coords = [station_to_gps[str(dst)] for src, dst in station_pairs]

# One row per pair; the same overall count is repeated on every row
df = pd.DataFrame({
    'Source Lat': [coords[0] for coords in source_coords],
    'Source Lng': [coords[1] for coords in source_coords],
    'Target Lat': [coords[0] for coords in target_coords],
    'Target Lng': [coords[1] for coords in target_coords],
    'total': [res] * len(station_pairs),
})

plot_map(df, add_stations=False)

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'data': {'index': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], 'colu…

## Privacy attacks

*Most attacks are very slow; it is recommended not to run them on the whole dataset.*

Basic privacy attacks on mobility data, as implemented in scikit-mobility. The attacks are based on the following papers:
 - Roberto Pellungrini, Luca Pappalardo, Francesca Pratesi, and Anna Monreale. 2017. A Data Mining Approach to Assess Privacy Risk in Human Mobility Data. ACM Trans. Intell. Syst. Technol. 9, 3, Article 31 (December 2017), 27 pages.
 - Roberto Pellungrini, Luca Pappalardo, Francesca Pratesi, Anna Monreale: Analyzing Privacy Risk in Human Mobility Data. STAF Workshops 2018: 114-129
 
More information in the scikit-mobility docs:
 - https://scikit-mobility.github.io/scikit-mobility/index.html

In [43]:
import skmob
from skmob.privacy import attacks
from skmob.core.trajectorydataframe import TrajDataFrame

In [44]:
# We need to convert our pandas DataFrame to sk mobility's TrajDataFrame
tdf = TrajDataFrame(data)

### 1. Home Work Attack

In a home and work attack the adversary knows the coordinates of the two locations most frequently visited by an individual, and matches them against frequency vectors. A frequency vector is an aggregation on trajectory data showing the unique locations visited by an individual and the frequency with which he visited those locations. This attack does not require the generation of combinations to build the possible instances of background knowledge.

In [45]:
at = attacks.HomeWorkAttack()

In [48]:
# Assess the risk for four individuals: three synthetic (1, 22, 9965) and one real (28651)
target_uids = [28651, 9965, 22, 1]
r = at.assess_risk(tdf, targets=target_uids)

In [49]:
r

Unnamed: 0,uid,risk
0,1,1.0
1,22,1.0
2,9965,1.0
3,28651,0.005618


### 2. Location Attack

In a location attack the adversary knows the coordinates of the locations visited by an individual and matches them against trajectories.

In [52]:
at = attacks.LocationAttack(knowledge_length=2)

In [None]:
# Assess the risk for four individuals: three synthetic (1, 22, 9965) and one real (28651)
target_uids = [28651, 9965, 22, 1]
r = at.assess_risk(tdf, targets=target_uids)

In [None]:
r

### 3. Location Frequency Attack

In a location frequency attack the adversary knows the coordinates of the unique locations visited by an individual and the frequency with which he visited them, and matches them against frequency vectors. A frequency vector is an aggregation on trajectory data showing the unique locations visited by an individual and the frequency with which he visited those locations. It is possible to specify a tolerance level for the matching of the frequency.

In [None]:
at = attacks.LocationFrequencyAttack(knowledge_length=2)

In [None]:
# Assess the risk for four individuals: three synthetic (1, 22, 9965) and one real (28651)
target_uids = [28651, 9965, 22, 1]
r = at.assess_risk(tdf, targets=target_uids)

In [None]:
r

### 4. Location Probability Attack

In a location probability attack the adversary knows the coordinates of the unique locations visited by an individual and the probability with which he visited them, and matches them against probability vectors. A probability vector is an aggregation on trajectory data showing the unique locations visited by an individual and the probability with which he visited those locations. It is possible to specify a tolerance level for the matching of the probability.

In [None]:
at = attacks.LocationProbabilityAttack(knowledge_length=2)

In [None]:
# Assess the risk for four individuals: three synthetic (1, 22, 9965) and one real (28651)
target_uids = [28651, 9965, 22, 1]
r = at.assess_risk(tdf, targets=target_uids)

In [None]:
r

### 5. Location Proportion Attack

In a location proportion attack the adversary knows the coordinates of the unique locations visited by an individual and the relative proportions between their frequencies of visit, and matches them against frequency vectors. A frequency vector is an aggregation on trajectory data showing the unique locations visited by an individual and the frequency with which he visited those locations. It is possible to specify a tolerance level for the matching of the proportion.

In [None]:
at = attacks.LocationProportionAttack(knowledge_length=2)

In [None]:
# Assess the risk for four individuals: three synthetic (1, 22, 9965) and one real (28651)
target_uids = [28651, 9965, 22, 1]
r = at.assess_risk(tdf, targets=target_uids)

In [None]:
r

### 6. Location Sequence Attack 
In a location sequence attack the adversary knows the coordinates of locations visited by an individual and the order in which they were visited and matches them against trajectories.

In [None]:
at = attacks.LocationSequenceAttack(knowledge_length=2)

In [None]:
# Assess the risk for four individuals: three synthetic (1, 22, 9965) and one real (28651)
target_uids = [28651, 9965, 22, 1]
r = at.assess_risk(tdf, targets=target_uids)

In [None]:
r

### 7. Location Time Attack

In a location time attack the adversary knows the coordinates of locations visited by an individual and the time in which they were visited and matches them against trajectories. The precision at which to consider the temporal information can also be specified.

In [None]:
at = attacks.LocationTimeAttack(knowledge_length=2)

In [None]:
# Assess the risk for four individuals: three synthetic (1, 22, 9965) and one real (28651)
target_uids = [28651, 9965, 22, 1]
r = at.assess_risk(tdf, targets=target_uids)

In [None]:
r

### 8. Unique Location Attack

In a unique location attack the adversary knows the coordinates of unique locations visited by an individual, and matches them against frequency vectors. A frequency vector is an aggregation on trajectory data showing the unique locations visited by an individual and the frequency with which he visited those locations.

In [None]:
at = attacks.UniqueLocationAttack(knowledge_length=2)

In [None]:
# Assess the risk for four individuals: three synthetic (1, 22, 9965) and one real (28651)
target_uids = [28651, 9965, 22, 1]
r = at.assess_risk(tdf, targets=target_uids)

In [None]:
r