In [51]:
original_filename = 'data/sample.csv'
anonymized_filename = 'data/fake_uuids.csv'

In [52]:
import pandas as pd

# Load both datasets
original_df = pd.read_csv(original_filename, sep=';')
anonymized_df = pd.read_csv(anonymized_filename, sep=';')

In [53]:
def project_to_tiles(dataframe, fmt_str= '{0:.3f}'):
    """
    Project latitude and longitude to a 3 decimal place tile identifier.
    """

    # Transform latitude and longitude to string, format to 3 decimal places
    dataframe['latitude'] = dataframe['latitude'].apply(lambda x: fmt_str.format(x))
    dataframe['longitude'] = dataframe['longitude'].apply(lambda x: fmt_str.format(x))

    # Concatenate latitude and longitude to create a unique tile identifier
    dataframe['tile'] = dataframe['latitude'] + '|' + dataframe['longitude']

    # Only keep uuid and tile columns
    return dataframe[['uuid', 'tile']]

In [54]:
# Project both datasets to tiles
original_df = project_to_tiles(original_df)
anonymized_df = project_to_tiles(anonymized_df)

In [55]:
def find_visitors(dataframe):
    """
    Create a list of visitors per tile.
    """

    # Group by tile and create a list of visitors per tile
    visitors_per_tile = dataframe.groupby('tile')['uuid'].apply(list)

    # Create a dataframe with tile and visitors
    visitors_per_tile_df = pd.DataFrame({'tile': visitors_per_tile.index, 'visitors': visitors_per_tile.values})

    return visitors_per_tile_df

In [56]:
original_visitors = find_visitors(original_df)
anonymized_visitors = find_visitors(anonymized_df)

# Only keep tiles with only one visitor
original_visitors = original_visitors[original_visitors['visitors'].apply(lambda x: len(x) == 1)]
anonymized_visitors = anonymized_visitors[anonymized_visitors['visitors'].apply(lambda x: len(x) == 1)]

# Transform list to string
original_visitors['visitors'] = original_visitors['visitors'].apply(lambda x: x[0])
anonymized_visitors['visitors'] = anonymized_visitors['visitors'].apply(lambda x: x[0])

# Merge on tile identifier
merged_visitors = pd.merge(original_visitors, anonymized_visitors, on='tile', how='inner')
merged_visitors = merged_visitors[['visitors_x', 'visitors_y']]

# Drop duplicates rows
merged_visitors = merged_visitors.drop_duplicates()

# Show the result
merged_visitors.head(25)


Unnamed: 0,visitors_x,visitors_y
0,1,2ef33267-9a55-4f7b-9906-818dc64603f3
205,31,0c8fbf2c-43c4-4b2a-b57d-7f35fc49861c
257,8,b7397855-7b67-4fb6-b8ce-8ff097e49911
362,73,65a10763-e605-406e-ab06-a8ec6a95eed3
607,21,725375ae-83c2-41f0-99e2-3c9a336c9f3e
622,11,9831be2c-901d-4159-b5da-785bd0ba5345
636,14,305b2762-1f2a-4b97-992e-55ea8b6cdaa2
641,15,c214c01f-1e15-42eb-87d0-24234caee94c
715,25,175aedab-548c-46da-a518-de18ab5034e6
739,17,c9650a22-f739-437c-b589-44b2fd3e163e


In [60]:
# Show row with visitors_x = 48
original_df[original_df['uuid'] == 1]

Unnamed: 0,uuid,tile
50,1,4.880|45.787
62,1,4.870|45.770
104,1,4.880|45.786
313,1,4.880|45.786
328,1,4.870|45.770
...,...,...
5182609,1,4.851|45.513
5182722,1,4.876|45.781
5182729,1,-97.760|30.262
5182750,1,4.870|45.771


In [61]:
anonymized_df[anonymized_df['uuid'] == '2ef33267-9a55-4f7b-9906-818dc64603f3']

Unnamed: 0,uuid,tile
50,2ef33267-9a55-4f7b-9906-818dc64603f3,4.880|45.787
62,2ef33267-9a55-4f7b-9906-818dc64603f3,4.870|45.770
104,2ef33267-9a55-4f7b-9906-818dc64603f3,4.880|45.786
313,2ef33267-9a55-4f7b-9906-818dc64603f3,4.880|45.786
328,2ef33267-9a55-4f7b-9906-818dc64603f3,4.870|45.770
...,...,...
5182609,2ef33267-9a55-4f7b-9906-818dc64603f3,4.851|45.513
5182722,2ef33267-9a55-4f7b-9906-818dc64603f3,4.876|45.781
5182729,2ef33267-9a55-4f7b-9906-818dc64603f3,-97.760|30.262
5182750,2ef33267-9a55-4f7b-9906-818dc64603f3,4.870|45.771
