In [57]:
import pandas as pd

class Identifier:
    """
    Link rows of an original dataset to rows of an anonymized dataset.

    Both datasets must provide a ``key`` column and a ``uuid`` column by the
    time ``identify`` is called. A key is treated as identifying when, within
    one dataset, all rows carrying that key share a single distinct ``uuid``.
    """

    def __init__(self, original: pd.DataFrame, anonymous: pd.DataFrame) -> None:
        """
        Store the two datasets to compare.

        Args:
            original: The non-anonymized dataset.
            anonymous: The anonymized dataset.
        """
        self.original = original
        self.anonymous = anonymous

    def apply(self, f) -> None:
        """
        Replace each stored dataset with the result of ``f(dataset)``.

        Args:
            f: Callable taking a DataFrame and returning a DataFrame. It is
                called on the original dataset first, then on the anonymous
                one.
        """
        self.original = f(self.original)
        self.anonymous = f(self.anonymous)

    def identify(self, verbose: bool = True) -> "tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]":
        """
        Match keys that belong to exactly one uuid in each dataset.

        The stored datasets are left untouched, so this method can be called
        repeatedly (previously the helper ``count`` column was merged into
        ``self.original`` / ``self.anonymous``, which made a second call fail).

        Args:
            verbose: When True (the default) print progress messages and the
                intermediate identified frames, as before.

        Returns:
            A 3-tuple ``(identified, unidentified_original,
            unidentified_anonymous)``:

            * ``identified`` - inner join on ``key`` of the single-uuid keys of
              both datasets; columns ``key``, ``uuid_x`` (original) and
              ``uuid_y`` (anonymous).
            * ``unidentified_original`` - rows of the original dataset (plus a
              ``count`` column) whose key maps to more than one uuid.
            * ``unidentified_anonymous`` - same, for the anonymous dataset.

        Raises:
            KeyError: If a dataset lacks the ``key`` or ``uuid`` column.
        """
        log = print if verbose else (lambda *args, **kwargs: None)

        log("Starting identification")
        # Count the number of distinct uuids seen for each key in each dataset
        original_count = self.original.groupby('key')['uuid'].nunique()
        anonymous_count = self.anonymous.groupby('key')['uuid'].nunique()

        log("Count done...")
        log("Transforming to dataframe...")

        # Convert the series to a dataframe
        original_count = original_count.to_frame("count").reset_index()
        anonymous_count = anonymous_count.to_frame("count").reset_index()

        log("Transform done...")
        log("Merging...")

        # Attach the counts to LOCAL copies; writing them back to self would
        # leave a stale 'count' column that breaks the next call.
        original = pd.merge(self.original, original_count, on='key')
        anonymous = pd.merge(self.anonymous, anonymous_count, on='key')

        log("Merge done...")
        log("Splitting...")

        # Split dataframes into identified and unidentified
        anonymous_mask = anonymous['count'] == 1
        original_mask = original['count'] == 1

        log("Mask created...")
        log("Filtering...")

        unidentified_anonymous = anonymous[~anonymous_mask]
        unidentified_original = original[~original_mask]

        log("Filter done...")
        log("Merging identified...")

        # Keep one (key, uuid) row per identifying key. Done without
        # inplace=True so no slice of another frame is mutated.
        identified_original = (
            original.loc[original_mask, ['key', 'uuid']].drop_duplicates(subset=['key'])
        )
        identified_anonymous = (
            anonymous.loc[anonymous_mask, ['key', 'uuid']].drop_duplicates(subset=['key'])
        )

        log(identified_original)
        log(identified_anonymous)

        # Merge identified dataframes
        identified = pd.merge(identified_original, identified_anonymous, on='key', how='inner')

        log("Merging identified done...")

        return identified, unidentified_original, unidentified_anonymous
        

In [58]:
def transform_coordinates(dataframe, precision=2):
    """
    Round latitude and longitude and convert them to strings.

    The input frame is not modified; a new frame is returned, so re-running
    the call on the same input gives the same result.

    Args:
        dataframe: DataFrame with numeric ``latitude`` and ``longitude``
            columns.
        precision: Number of decimal places to round to (default 2).

    Returns:
        A new DataFrame with the same columns, where ``latitude`` and
        ``longitude`` hold the rounded values as strings.
    """
    def to_label(column):
        labels = dataframe[column].round(precision).astype(str)
        # Small negative values round to -0.0, which would stringify
        # differently from 0.0 although both denote the same tile.
        return labels.replace('-0.0', '0.0')

    return dataframe.assign(
        latitude=to_label('latitude'),
        longitude=to_label('longitude'),
    )

In [59]:
def build_tile_id(dataframe):
    """
    Add a ``key`` column joining latitude and longitude with ``'|'``.

    The input frame is not modified; a new frame carrying the extra column is
    returned.

    Args:
        dataframe: DataFrame whose ``latitude`` and ``longitude`` columns hold
            strings.

    Returns:
        A new DataFrame with a ``key`` column of the form
        ``"<latitude>|<longitude>"``.
    """
    return dataframe.assign(
        key=dataframe['latitude'] + '|' + dataframe['longitude']
    )

In [60]:
# Paths of the two CSV exports to compare.
# NOTE(review): both names point at the same file, so the two datasets loaded
# below are identical - confirm whether the anonymized path should differ.
original_filename = 'data/output/week_10.csv'
anonymized_filename = 'data/output/week_10.csv'

# Read each export into its own DataFrame (original first, then anonymized)
original_df, anonymized_df = (
    pd.read_csv(csv_path) for csv_path in (original_filename, anonymized_filename)
)

In [61]:
# Pair the two loaded datasets for comparison
identifier = Identifier(original=original_df, anonymous=anonymized_df)

In [62]:
# Round the coordinates of both datasets and convert them to strings
identifier.apply(transform_coordinates)

In [63]:
# Add the 'key' column ("latitude|longitude") to both datasets
identifier.apply(build_tile_id)

In [64]:
# identify() returns the matched keys plus the leftover rows of each dataset
identified, unidentified_original, unidentified_anonymous = identifier.identify()

Starting identification
Count done...
Transforming to dataframe...
Transform done...
Merging...
Merge done...
Splitting...
Mask created...
Filtering...
Filter done...
Merging identified...
                 key  uuid
1540729   4.95|45.74     1
1540730   4.96|45.74     1
1540731   5.03|45.74     1
1540732   5.04|45.74     1
1540733   5.05|45.74     1
...              ...   ...
4408333  -1.67|48.11   110
4421420   2.35|48.83   110
4423553   2.34|48.83   110
4423575   2.35|48.89   110
4423811   2.31|48.82   110

[3005 rows x 2 columns]
                 key  uuid
1540729   4.95|45.74     1
1540730   4.96|45.74     1
1540731   5.03|45.74     1
1540732   5.04|45.74     1
1540733   5.05|45.74     1
...              ...   ...
4408333  -1.67|48.11   110
4421420   2.35|48.83   110
4423553   2.34|48.83   110
4423575   2.35|48.89   110
4423811   2.31|48.82   110

[3005 rows x 2 columns]
Merging identified done...


In [65]:
# Display the matched keys with the uuid from each dataset (uuid_x, uuid_y)
identified

Unnamed: 0,key,uuid_x,uuid_y
0,4.95|45.74,1,1
1,4.96|45.74,1,1
2,5.03|45.74,1,1
3,5.04|45.74,1,1
4,5.05|45.74,1,1
...,...,...,...
3000,-1.67|48.11,110,110
3001,2.35|48.83,110,110
3002,2.34|48.83,110,110
3003,2.35|48.89,110,110


In [66]:
# Show the matched keys whose uuid differs between the two datasets
identified.loc[identified['uuid_x'].ne(identified['uuid_y'])]

Unnamed: 0,key,uuid_x,uuid_y
