# Data Cleaning & Wrangling
## Location Data

In [2]:
import pandas as pd

# Loading the data
toilet_df = pd.read_csv("toiletmapexport_250301_074429.csv")

toilet_df.head(5)

Unnamed: 0,FacilityID,URL,Name,FacilityType,Address1,Town,State,AddressNote,Latitude,Longitude,...,Ambulant,Accessible,LHTransfer,RHTransfer,ToiletNote,SharpsDisposal,DrinkingWater,SanitaryDisposal,MensPadDisposal,Shower
0,1,https://toiletmap.gov.au/facility/1,Sandy Beach Reserve,Park or reserve,"Lost 15525, West Road",Bassendean,WA,,-31.921836,115.950206,...,False,True,False,False,,True,True,True,False,False
1,2,https://toiletmap.gov.au/facility/2,Point Reserve,Park or reserve,"Lot 197, North Road",Bassendean,WA,,-31.90441,115.960991,...,False,True,False,False,,True,True,True,False,False
2,3,https://toiletmap.gov.au/facility/3,Success Hill Reserve,Park or reserve,"Lot 2838, Seventh Avenue",Bassendean,WA,,-31.896289,115.955781,...,False,True,False,False,,True,True,True,False,True
3,4,https://toiletmap.gov.au/facility/4,Jubilee Reserve,Park or reserve,"Lot 6322, Robinson Road",Eden Hill,WA,,-31.891474,115.940164,...,False,False,False,False,,True,False,True,False,False
4,5,https://toiletmap.gov.au/facility/5,Ashfield Reserve,Park or reserve,"Lot 12061, 2 Coulston Road",Ashfield,WA,,-31.913433,115.936477,...,False,True,False,False,,True,True,True,False,False


In [3]:
toilet_df.columns = toilet_df.columns.str.lower()

toilet_df.columns

Index(['facilityid', 'url', 'name', 'facilitytype', 'address1', 'town',
       'state', 'addressnote', 'latitude', 'longitude', 'parking',
       'parkingaccessible', 'parkingnote', 'keyrequired', 'mlak24',
       'mlakafterhours', 'paymentrequired', 'accessnote', 'adultchange',
       'changingplaces', 'byosling', 'acshower', 'acmlak', 'adultchangenote',
       'babychange', 'babycareroom', 'babychangenote', 'dumppoint',
       'dpwashout', 'dpafterhours', 'dumppointnote', 'openinghours',
       'openinghoursnote', 'male', 'female', 'unisex', 'allgender', 'ambulant',
       'accessible', 'lhtransfer', 'rhtransfer', 'toiletnote',
       'sharpsdisposal', 'drinkingwater', 'sanitarydisposal',
       'menspaddisposal', 'shower'],
      dtype='object')

In [4]:
# Extracting needed columns
toilet_df = toilet_df[['facilityid', 'name', 'facilitytype', 'address1', 'town', 'state',
                      'latitude', 'longitude', 'parkingaccessible', 'male', 'female',
                      'unisex', 'allgender', 'accessible', 'paymentrequired', 'babychange']]



In [5]:
# Rounding latitude and longitude to 6 decimal places
toilet_df['latitude'] = toilet_df['latitude'].round(6)
toilet_df['longitude'] = toilet_df['longitude'].round(6)

toilet_df.head(5)

Unnamed: 0,facilityid,name,facilitytype,address1,town,state,latitude,longitude,parkingaccessible,male,female,unisex,allgender,accessible,paymentrequired,babychange
0,1,Sandy Beach Reserve,Park or reserve,"Lost 15525, West Road",Bassendean,WA,-31.921836,115.950206,False,False,False,True,False,True,False,True
1,2,Point Reserve,Park or reserve,"Lot 197, North Road",Bassendean,WA,-31.90441,115.960991,False,False,False,True,False,True,False,True
2,3,Success Hill Reserve,Park or reserve,"Lot 2838, Seventh Avenue",Bassendean,WA,-31.896289,115.955781,True,True,True,False,False,True,False,True
3,4,Jubilee Reserve,Park or reserve,"Lot 6322, Robinson Road",Eden Hill,WA,-31.891474,115.940164,False,True,True,False,False,False,False,False
4,5,Ashfield Reserve,Park or reserve,"Lot 12061, 2 Coulston Road",Ashfield,WA,-31.913433,115.936477,True,False,False,True,False,True,False,True


In [6]:
# Filtering to only VIC
toilet_df = toilet_df[toilet_df['state'] == 'VIC']

toilet_df.head(5)

Unnamed: 0,facilityid,name,facilitytype,address1,town,state,latitude,longitude,parkingaccessible,male,female,unisex,allgender,accessible,paymentrequired,babychange
181,252,Lloyd Street,Other,Lloyd Street,Dimboola,VIC,-36.454567,142.026468,False,True,True,False,False,True,False,False
182,253,Lloyd Street,Bus station,Lloyd Street,Dimboola,VIC,-36.454121,142.026374,False,True,True,False,False,True,False,True
183,254,Charles Street,Other,Charles Street,Jeparit,VIC,-36.142494,141.988242,False,True,True,False,False,True,False,False
184,255,Roy Street,Other,Roy Street,Jeparit,VIC,-36.143049,141.987369,False,True,True,False,False,False,False,False
185,256,Western Highway,Bus station,24 Victoria Street,Nhill,VIC,-36.333924,141.649202,False,True,True,False,False,True,False,False


In [None]:
# Standardising strings to lowercase
toilet_df = toilet_df.map(lambda x: x.lower() if isinstance(x, str) else x)

# Resetting index
toilet_df.reset_index(drop=True, inplace=True)

toilet_df.head(5)

Unnamed: 0,facilityid,name,facilitytype,address1,town,state,latitude,longitude,parkingaccessible,male,female,unisex,allgender,accessible,paymentrequired,babychange
0,252,lloyd street,other,lloyd street,dimboola,vic,-36.454567,142.026468,False,True,True,False,False,True,False,False
1,253,lloyd street,bus station,lloyd street,dimboola,vic,-36.454121,142.026374,False,True,True,False,False,True,False,True
2,254,charles street,other,charles street,jeparit,vic,-36.142494,141.988242,False,True,True,False,False,True,False,False
3,255,roy street,other,roy street,jeparit,vic,-36.143049,141.987369,False,True,True,False,False,False,False,False
4,256,western highway,bus station,24 victoria street,nhill,vic,-36.333924,141.649202,False,True,True,False,False,True,False,False


In [8]:
# Checking for duplicates
toilet_df[toilet_df[['name','facilitytype', 'latitude', 'longitude']].duplicated()]

Unnamed: 0,facilityid,name,facilitytype,address1,town,state,latitude,longitude,parkingaccessible,male,female,unisex,allgender,accessible,paymentrequired,babychange


In [9]:
toilet_df['facilitytype'].unique()

array(['other', 'bus station', 'park or reserve', 'sporting facility',
       'car park', 'community building', 'beach', 'shopping centre',
       'jetty', 'camping ground', 'airport', 'rest area', 'swimming pool',
       'caravan park', 'train station', 'food outlet', 'cemetery',
       'service station'], dtype=object)

In [10]:
# Flitering out Restaurants
toilet_df = toilet_df[toilet_df['facilitytype'] != 'food outlet']

# Renaming columns
toilet_df = toilet_df.rename(columns={'facilityid': 'Location_ID', 'address1': 'address', 'town': 'suburb',
                                      'latitude': 'Location_Lat', 'longitude': 'Location_Lon',
                                      'accessible': 'wheelchair', 'babychange': 'changing_table'})

toilet_df.head(5)

Unnamed: 0,Location_ID,name,facilitytype,address,suburb,state,Location_Lat,Location_Lon,parkingaccessible,male,female,unisex,allgender,wheelchair,paymentrequired,changing_table
0,252,lloyd street,other,lloyd street,dimboola,vic,-36.454567,142.026468,False,True,True,False,False,True,False,False
1,253,lloyd street,bus station,lloyd street,dimboola,vic,-36.454121,142.026374,False,True,True,False,False,True,False,True
2,254,charles street,other,charles street,jeparit,vic,-36.142494,141.988242,False,True,True,False,False,True,False,False
3,255,roy street,other,roy street,jeparit,vic,-36.143049,141.987369,False,True,True,False,False,False,False,False
4,256,western highway,bus station,24 victoria street,nhill,vic,-36.333924,141.649202,False,True,True,False,False,True,False,False


In [11]:
# Standardising boolean values
bool_cols = toilet_df.columns[-8:]

for col in bool_cols:
    toilet_df[col] = toilet_df[col].map({True: 'yes', False: 'no'})

toilet_df.head(5)

Unnamed: 0,Location_ID,name,facilitytype,address,suburb,state,Location_Lat,Location_Lon,parkingaccessible,male,female,unisex,allgender,wheelchair,paymentrequired,changing_table
0,252,lloyd street,other,lloyd street,dimboola,vic,-36.454567,142.026468,no,yes,yes,no,no,yes,no,no
1,253,lloyd street,bus station,lloyd street,dimboola,vic,-36.454121,142.026374,no,yes,yes,no,no,yes,no,yes
2,254,charles street,other,charles street,jeparit,vic,-36.142494,141.988242,no,yes,yes,no,no,yes,no,no
3,255,roy street,other,roy street,jeparit,vic,-36.143049,141.987369,no,yes,yes,no,no,no,no,no
4,256,western highway,bus station,24 victoria street,nhill,vic,-36.333924,141.649202,no,yes,yes,no,no,yes,no,no


# Creating Final Table Structure

In [12]:
toilet_data = toilet_df

# Creating a column with dictionary of metadata
metadata_cols = ['name', 'facilitytype', 'address', 'suburb', 'state']
toilet_data['Metadata'] = toilet_data[metadata_cols].apply(lambda row: row.to_dict(), axis=1)

toilet_data.head(5)

Unnamed: 0,Location_ID,name,facilitytype,address,suburb,state,Location_Lat,Location_Lon,parkingaccessible,male,female,unisex,allgender,wheelchair,paymentrequired,changing_table,Metadata
0,252,lloyd street,other,lloyd street,dimboola,vic,-36.454567,142.026468,no,yes,yes,no,no,yes,no,no,"{'name': 'lloyd street', 'facilitytype': 'othe..."
1,253,lloyd street,bus station,lloyd street,dimboola,vic,-36.454121,142.026374,no,yes,yes,no,no,yes,no,yes,"{'name': 'lloyd street', 'facilitytype': 'bus ..."
2,254,charles street,other,charles street,jeparit,vic,-36.142494,141.988242,no,yes,yes,no,no,yes,no,no,"{'name': 'charles street', 'facilitytype': 'ot..."
3,255,roy street,other,roy street,jeparit,vic,-36.143049,141.987369,no,yes,yes,no,no,no,no,no,"{'name': 'roy street', 'facilitytype': 'other'..."
4,256,western highway,bus station,24 victoria street,nhill,vic,-36.333924,141.649202,no,yes,yes,no,no,yes,no,no,"{'name': 'western highway', 'facilitytype': 'b..."


In [13]:
# Creating a column with dictionary of tags
tags_cols = ['parkingaccessible', 'male', 'female', 'unisex', 'allgender', 'wheelchair', 'changing_table']
toilet_data['Tags'] = toilet_data[tags_cols].apply(lambda row: row.to_dict(), axis = 1)

# Adding a column for type of accessibility feature
toilet_data['Accesibility_Type_Name'] = 'toilets'

toilet_data.head(5)

Unnamed: 0,Location_ID,name,facilitytype,address,suburb,state,Location_Lat,Location_Lon,parkingaccessible,male,female,unisex,allgender,wheelchair,paymentrequired,changing_table,Metadata,Tags,Accesibility_Type_Name
0,252,lloyd street,other,lloyd street,dimboola,vic,-36.454567,142.026468,no,yes,yes,no,no,yes,no,no,"{'name': 'lloyd street', 'facilitytype': 'othe...","{'parkingaccessible': 'no', 'male': 'yes', 'fe...",toilets
1,253,lloyd street,bus station,lloyd street,dimboola,vic,-36.454121,142.026374,no,yes,yes,no,no,yes,no,yes,"{'name': 'lloyd street', 'facilitytype': 'bus ...","{'parkingaccessible': 'no', 'male': 'yes', 'fe...",toilets
2,254,charles street,other,charles street,jeparit,vic,-36.142494,141.988242,no,yes,yes,no,no,yes,no,no,"{'name': 'charles street', 'facilitytype': 'ot...","{'parkingaccessible': 'no', 'male': 'yes', 'fe...",toilets
3,255,roy street,other,roy street,jeparit,vic,-36.143049,141.987369,no,yes,yes,no,no,no,no,no,"{'name': 'roy street', 'facilitytype': 'other'...","{'parkingaccessible': 'no', 'male': 'yes', 'fe...",toilets
4,256,western highway,bus station,24 victoria street,nhill,vic,-36.333924,141.649202,no,yes,yes,no,no,yes,no,no,"{'name': 'western highway', 'facilitytype': 'b...","{'parkingaccessible': 'no', 'male': 'yes', 'fe...",toilets


In [14]:
# Retrieving final table

toilet_data = toilet_data[['Location_ID', 'Location_Lat', 'Location_Lon', 'Accesibility_Type_Name',
                           'Metadata', 'Tags']]

toilet_data.head(5)

Unnamed: 0,Location_ID,Location_Lat,Location_Lon,Accesibility_Type_Name,Metadata,Tags
0,252,-36.454567,142.026468,toilets,"{'name': 'lloyd street', 'facilitytype': 'othe...","{'parkingaccessible': 'no', 'male': 'yes', 'fe..."
1,253,-36.454121,142.026374,toilets,"{'name': 'lloyd street', 'facilitytype': 'bus ...","{'parkingaccessible': 'no', 'male': 'yes', 'fe..."
2,254,-36.142494,141.988242,toilets,"{'name': 'charles street', 'facilitytype': 'ot...","{'parkingaccessible': 'no', 'male': 'yes', 'fe..."
3,255,-36.143049,141.987369,toilets,"{'name': 'roy street', 'facilitytype': 'other'...","{'parkingaccessible': 'no', 'male': 'yes', 'fe..."
4,256,-36.333924,141.649202,toilets,"{'name': 'western highway', 'facilitytype': 'b...","{'parkingaccessible': 'no', 'male': 'yes', 'fe..."


In [15]:
# Exporting data 
toilet_data.to_csv("toilets_CSV.csv", index=False)

# Combining OSM and Static data

In [16]:
import pandas as pd

toilet_static = pd.read_csv('toilets_CSV.csv')
toilet_osm = pd.read_csv('toilets_OSM.csv')

# Renaming column of toilet OSM data
toilet_static = toilet_static.rename(columns={'Accesibility_Type_Name':'Accessibility_Type_Name'})
toilet_osm = toilet_osm.rename(columns={'ACCESSIBILITY_TYPE_NAME': 'Accessibility_Type_Name'})

toilets_full = pd.concat([toilet_static, toilet_osm], axis=0)

toilets_full.head(5)

Unnamed: 0,Location_ID,Location_Lat,Location_Lon,Accessibility_Type_Name,Metadata,Tags
0,252,-36.454567,142.026468,toilets,"{'name': 'lloyd street', 'facilitytype': 'othe...","{'parkingaccessible': 'no', 'male': 'yes', 'fe..."
1,253,-36.454121,142.026374,toilets,"{'name': 'lloyd street', 'facilitytype': 'bus ...","{'parkingaccessible': 'no', 'male': 'yes', 'fe..."
2,254,-36.142494,141.988242,toilets,"{'name': 'charles street', 'facilitytype': 'ot...","{'parkingaccessible': 'no', 'male': 'yes', 'fe..."
3,255,-36.143049,141.987369,toilets,"{'name': 'roy street', 'facilitytype': 'other'...","{'parkingaccessible': 'no', 'male': 'yes', 'fe..."
4,256,-36.333924,141.649202,toilets,"{'name': 'western highway', 'facilitytype': 'b...","{'parkingaccessible': 'no', 'male': 'yes', 'fe..."


In [17]:
# standardising lat and lon to 6 decimal places
toilets_full['Location_Lat'] = toilets_full['Location_Lat'].round(6)
toilets_full['Location_Lon'] = toilets_full['Location_Lon'].round(6)

toilets_full.head()

Unnamed: 0,Location_ID,Location_Lat,Location_Lon,Accessibility_Type_Name,Metadata,Tags
0,252,-36.454567,142.026468,toilets,"{'name': 'lloyd street', 'facilitytype': 'othe...","{'parkingaccessible': 'no', 'male': 'yes', 'fe..."
1,253,-36.454121,142.026374,toilets,"{'name': 'lloyd street', 'facilitytype': 'bus ...","{'parkingaccessible': 'no', 'male': 'yes', 'fe..."
2,254,-36.142494,141.988242,toilets,"{'name': 'charles street', 'facilitytype': 'ot...","{'parkingaccessible': 'no', 'male': 'yes', 'fe..."
3,255,-36.143049,141.987369,toilets,"{'name': 'roy street', 'facilitytype': 'other'...","{'parkingaccessible': 'no', 'male': 'yes', 'fe..."
4,256,-36.333924,141.649202,toilets,"{'name': 'western highway', 'facilitytype': 'b...","{'parkingaccessible': 'no', 'male': 'yes', 'fe..."


Exact matching duplicates
- Number of records counted as duplicates: 8

In [18]:
# Checking for duplicated data
# Check for duplicate rows based on both latitude and longitude
duplicates = toilets_full.duplicated(subset=['Location_Lat', 'Location_Lon'], keep=False)

# Show all duplicate rows (keeping all occurrences)
duplicate_rows = toilets_full[duplicates]
print(f"Found {len(duplicate_rows)} duplicate location entries")
duplicate_rows.sort_values(['Location_Lat', 'Location_Lon'])

Found 8 duplicate location entries


Unnamed: 0,Location_ID,Location_Lat,Location_Lon,Accessibility_Type_Name,Metadata,Tags
5293,62142,-38.412562,144.189241,toilets,"{'name': 'four kings ', 'facilitytype': 'beach...","{'parkingaccessible': 'yes', 'male': 'yes', 'f..."
5294,62143,-38.412562,144.189241,toilets,"{'name': 'four kings changing place', 'facilit...","{'parkingaccessible': 'yes', 'male': 'no', 'fe..."
5360,62501,-38.172926,145.927828,toilets,{'name': 'west gippsland hospital - consulting...,"{'parkingaccessible': 'yes', 'male': 'yes', 'f..."
5361,62502,-38.172926,145.927828,toilets,{'name': 'west gippsland hospital - emergency ...,"{'parkingaccessible': 'yes', 'male': 'yes', 'f..."
3266,41747,-37.754995,142.008182,toilets,"{'name': 'hamilton livestock exchange', 'facil...","{'parkingaccessible': 'no', 'male': 'yes', 'fe..."
115,4010647837,-37.754995,142.008182,toilets,"{""operator"": ""southern grampians shire council""}","{""wheelchair"": ""no"", ""name"": ""hamilton - lives..."
190,1423,-37.733013,142.039729,toilets,"{'name': 'lake hamilton - boat ramp', 'facilit...","{'parkingaccessible': 'yes', 'male': 'yes', 'f..."
132,4010648831,-37.733013,142.039729,toilets,"{""operator"": ""southern grampians shire council""}","{""wheelchair"": ""yes"", ""name"": ""hamilton - lake..."


In [19]:
duplicate_rows.to_csv('toilets_test.csv')

In [20]:
duplicate_rows.count()

Location_ID                8
Location_Lat               8
Location_Lon               8
Accessibility_Type_Name    8
Metadata                   8
Tags                       8
dtype: int64

Rough Duplicates (if rounded up to 4 decimals)
- Number of records counted as duplicates: 142

In [21]:
# Round coordinates to 4 decimal places (~11 meter precision)
toilets_full['rounded_lat'] = toilets_full['Location_Lat'].round(4)
toilets_full['rounded_lon'] = toilets_full['Location_Lon'].round(4)

# Check for duplicates on rounded coordinates
fuzzy_duplicates = toilets_full.duplicated(subset=['rounded_lat', 'rounded_lon'], keep=False)

# Count how many times each location appears
location_counts = toilets_full.groupby(['rounded_lat', 'rounded_lon']).size().reset_index(name='count')
duplicate_counts = location_counts[location_counts['count'] > 1]

duplicate_counts

Unnamed: 0,rounded_lat,rounded_lon,count
58,-38.7594,143.6717,2
67,-38.7399,143.6749,2
81,-38.6935,146.0823,2
102,-38.6621,143.1054,2
116,-38.6416,145.7147,2
...,...,...,...
6245,-35.1363,142.0252,2
6257,-35.0691,142.3154,2
6345,-34.3909,141.5947,2
6418,-34.1807,142.1640,2


20 Meter threshold
- Number of records detected as duplicates: 291

In [22]:
import pandas as pd
import numpy as np

def flag_nearby_duplicates(toilets_df, lat_col='Location_Lat', lon_col='Location_Lon', threshold_meters=20):
    # Calculate precision needed (~0.0002 degrees ≈ 20 meters)
    precision = int(np.ceil(-np.log10(threshold_meters/111000)))
    
    # Create rounded coordinates for grouping
    toilets_df = toilets_df.copy()
    toilets_df['coord_group'] = (
        toilets_df[lat_col].round(precision).astype(str) + "|" + 
        toilets_df[lon_col].round(precision).astype(str)
    )
    
    # Calculate group stats
    group_stats = toilets_df.groupby('coord_group').agg({
        lat_col: 'first',
        lon_col: 'first',
        'coord_group': 'size'
    }).rename(columns={'coord_group': 'nearby_count'})
    
    # Only keep groups with duplicates
    duplicate_groups = group_stats[group_stats['nearby_count'] > 1]
    
    # Merge back with original data
    result = toilets_df.merge(
        duplicate_groups, 
        on='coord_group',
        how='left',
        suffixes=('', '_ref')
    )
    
    # Clean up columns
    result['duplicate_group'] = result.groupby('coord_group').ngroup()
    result.loc[result['nearby_count'].isna(), 'duplicate_group'] = np.nan
    
    # Rename reference columns
    result = result.rename(columns={
        f'{lat_col}_ref': 'reference_lat',
        f'{lon_col}_ref': 'reference_lon'
    })
    
    # Remove temporary column
    result = result.drop(columns=['coord_group'])
    
    return result

# Usage
marked_toilets = flag_nearby_duplicates(toilets_full)

# Show only the duplicates (optional)
marked_toilets[marked_toilets['duplicate_group'].notna()].sort_values('duplicate_group')


Unnamed: 0,Location_ID,Location_Lat,Location_Lon,Accessibility_Type_Name,Metadata,Tags,rounded_lat,rounded_lon,reference_lat,reference_lon,nearby_count,duplicate_group
4892,61061,-34.165007,142.054815,toilets,"{'name': 'kenny park', 'facilitytype': 'park o...","{'parkingaccessible': 'no', 'male': 'yes', 'fe...",-34.1650,142.0548,-34.165025,142.054790,2.0,15.0
232,1725,-34.165025,142.054790,toilets,"{'name': 'kenny park 2', 'facilitytype': 'park...","{'parkingaccessible': 'yes', 'male': 'yes', 'f...",-34.1650,142.0548,-34.165025,142.054790,2.0,15.0
6585,4936339521,-34.180739,142.163978,toilets,{},"{""fee"": ""no"", ""wheelchair"": ""yes""}",-34.1807,142.1640,-34.180746,142.163986,2.0,30.0
213,1704,-34.180746,142.163986,toilets,"{'name': 'jaycee park', 'facilitytype': 'park ...","{'parkingaccessible': 'yes', 'male': 'yes', 'f...",-34.1807,142.1640,-34.180746,142.163986,2.0,30.0
5302,62175,-34.390857,141.594673,toilets,"{'name': 'werrimull public hall', 'facilitytyp...","{'parkingaccessible': 'no', 'male': 'yes', 'fe...",-34.3909,141.5947,-34.390857,141.594673,2.0,102.0
...,...,...,...,...,...,...,...,...,...,...,...,...
5753,5913251715,-38.693482,146.082283,toilets,{},"{""fee"": ""no"", ""wheelchair"": ""yes""}",-38.6935,146.0823,-38.693512,146.082261,2.0,6367.0
5662,11843900525,-38.739946,143.674887,toilets,{},"{""access"": ""customers"", ""male"": ""yes"", ""wheelc...",-38.7399,143.6749,-38.739946,143.674887,2.0,6381.0
5663,11843900526,-38.739899,143.674917,toilets,{},"{""access"": ""customers"", ""female"": ""yes"", ""whee...",-38.7399,143.6749,-38.739946,143.674887,2.0,6381.0
5659,2484556678,-38.759397,143.671667,toilets,"{""check_date"": ""2023-10-30""}","{""fee"": ""no"", ""wheelchair"": ""no""}",-38.7594,143.6717,-38.759415,143.671734,2.0,6390.0


In [23]:
def remove_duplicates_keep_one(toilets_df, lat_col='Location_Lat', lon_col='Location_Lon', threshold_meters=20):
    """
    Removes nearby duplicates (keeping one per group) while preserving all unique records.
    
    Returns:
    - DataFrame with only one record per duplicate group (plus all non-duplicated records)
    """
    # First flag all duplicates
    marked_df = flag_nearby_duplicates(toilets_df, lat_col, lon_col, threshold_meters)
    
    # For duplicate groups, keep the first record; always keep non-duplicates
    deduped_df = marked_df[~marked_df['duplicate_group'].notna() | 
                           (marked_df['duplicate_group'].notna() & 
                            (marked_df.groupby('duplicate_group').cumcount() == 0))]
    
    return deduped_df

# Usage
deduped_toilets = remove_duplicates_keep_one(toilets_full)

len(deduped_toilets)

6449

In [24]:
final_toilets = deduped_toilets[['Location_Lat', 'Location_Lon', 'Accessibility_Type_Name',
                                 'Metadata', 'Tags']]

final_toilets

Unnamed: 0,Location_Lat,Location_Lon,Accessibility_Type_Name,Metadata,Tags
0,-36.454567,142.026468,toilets,"{'name': 'lloyd street', 'facilitytype': 'othe...","{'parkingaccessible': 'no', 'male': 'yes', 'fe..."
1,-36.454121,142.026374,toilets,"{'name': 'lloyd street', 'facilitytype': 'bus ...","{'parkingaccessible': 'no', 'male': 'yes', 'fe..."
2,-36.142494,141.988242,toilets,"{'name': 'charles street', 'facilitytype': 'ot...","{'parkingaccessible': 'no', 'male': 'yes', 'fe..."
3,-36.143049,141.987369,toilets,"{'name': 'roy street', 'facilitytype': 'other'...","{'parkingaccessible': 'no', 'male': 'yes', 'fe..."
4,-36.333924,141.649202,toilets,"{'name': 'western highway', 'facilitytype': 'b...","{'parkingaccessible': 'no', 'male': 'yes', 'fe..."
...,...,...,...,...,...
6593,-33.922352,147.199795,toilets,{},"{""access"": ""yes"", ""disposal"": ""flush"", ""wheelc..."
6594,-33.924102,147.234089,toilets,{},"{""access"": ""yes"", ""disposal"": ""flush"", ""wheelc..."
6595,-33.906437,148.160552,toilets,"{""operator"": ""Weddin Shire Council""}","{""access"": ""yes"", ""fee"": ""no"", ""female"": ""yes""..."
6596,-34.167363,148.458657,toilets,{},"{""access"": ""yes"", ""fee"": ""no"", ""disposal"": ""fl..."


In [27]:
import ast

def safe_parse_dict(val):
    if isinstance(val, str) and val.strip().startswith('{') and val.strip().endswith('}'):
        try:
            return ast.literal_eval(val)
        except (ValueError, SyntaxError):
            return {}  # fallback in case it's not valid
    return val if isinstance(val, dict) else {}

final_toilets['Metadata'] = final_toilets['Metadata'].apply(safe_parse_dict)
final_toilets['Tags'] = final_toilets['Tags'].apply(safe_parse_dict)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_toilets['Metadata'] = final_toilets['Metadata'].apply(safe_parse_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_toilets['Tags'] = final_toilets['Tags'].apply(safe_parse_dict)


In [30]:
final_toilets.to_json('final_toilets.json', orient='records', indent=2)