# Data Cleaning & Wrangling
## Metro Data

In [178]:
import pandas as pd

metro_df = pd.read_csv("metro-train-stations-with-accessibility-information.csv")
metro_df.head()

Unnamed: 0,Geo Point,Geo Shape,he_loop,lift,pids,station
0,"-37.77839599999999, 145.031251","{""coordinates"": [145.031251, -37.7783959999999...",No,No,Dot Matrix,Alphington
1,"-37.86724899999996, 144.830604","{""coordinates"": [144.830604, -37.8672489999999...",No,No,LCD,Altona
2,"-37.761897999999974, 144.96056099999998","{""coordinates"": [144.96056099999998, -37.76189...",No,No,No,Anstey
3,"-37.82241099999999, 145.045617","{""coordinates"": [145.045617, -37.8224109999999...",No,No,No,Auburn
4,"-37.73345899999998, 144.96274700000004","{""coordinates"": [144.96274700000004, -37.73345...",No,No,No,Batman


In [179]:
# Checking for duplicates
metro_df[metro_df.duplicated()]

Unnamed: 0,Geo Point,Geo Shape,he_loop,lift,pids,station


In [180]:
# Splitting geo point col to lat and lon
metro_df[['latitude', 'longitude']] = metro_df['Geo Point'].str.split(',', expand=True).astype(float)

metro_df.head(5)

Unnamed: 0,Geo Point,Geo Shape,he_loop,lift,pids,station,latitude,longitude
0,"-37.77839599999999, 145.031251","{""coordinates"": [145.031251, -37.7783959999999...",No,No,Dot Matrix,Alphington,-37.778396,145.031251
1,"-37.86724899999996, 144.830604","{""coordinates"": [144.830604, -37.8672489999999...",No,No,LCD,Altona,-37.867249,144.830604
2,"-37.761897999999974, 144.96056099999998","{""coordinates"": [144.96056099999998, -37.76189...",No,No,No,Anstey,-37.761898,144.960561
3,"-37.82241099999999, 145.045617","{""coordinates"": [145.045617, -37.8224109999999...",No,No,No,Auburn,-37.822411,145.045617
4,"-37.73345899999998, 144.96274700000004","{""coordinates"": [144.96274700000004, -37.73345...",No,No,No,Batman,-37.733459,144.962747


In [181]:
# Retrieving only relevant information
metro_df = metro_df[['he_loop', 'lift', 'pids', 'station', 'latitude', 'longitude']]

metro_df.head(5)

Unnamed: 0,he_loop,lift,pids,station,latitude,longitude
0,No,No,Dot Matrix,Alphington,-37.778396,145.031251
1,No,No,LCD,Altona,-37.867249,144.830604
2,No,No,No,Anstey,-37.761898,144.960561
3,No,No,No,Auburn,-37.822411,145.045617
4,No,No,No,Batman,-37.733459,144.962747


In [182]:
# Standardising letter case
metro_df = metro_df.map(lambda x: x.lower() if isinstance(x, str) else x)
metro_df['station'] = metro_df['station'].str.title()


# Renaming columns
metro_df = metro_df.rename(columns={'station': 'name', 'latitude': 'Location_Lat',
                                    'longitude': 'Location_Lon',
                                    'pids': 'passenger_information_display'})

# Adding category column
metro_df['Accessibility_Type_Name'] = 'trains'

# Adding location ID column
metro_df['Location_ID'] = pd.NA

metro_df.head(5)

Unnamed: 0,he_loop,lift,passenger_information_display,name,Location_Lat,Location_Lon,Accessibility_Type_Name,Location_ID
0,no,no,dot matrix,Alphington,-37.778396,145.031251,trains,
1,no,no,lcd,Altona,-37.867249,144.830604,trains,
2,no,no,no,Anstey,-37.761898,144.960561,trains,
3,no,no,no,Auburn,-37.822411,145.045617,trains,
4,no,no,no,Batman,-37.733459,144.962747,trains,


# Creating Final Table Structure

In [183]:
metro_data = metro_df

# Creating a column with dictionary of metadata
metro_data['Metadata'] = "{}"

# Creating a column with dictionary of tags
tags_cols = ['name','passenger_information_display', 'lift', 'he_loop']
metro_data['Tags'] = metro_data[tags_cols].apply(lambda row: row.to_dict(), axis=1)

metro_data.head(5)

Unnamed: 0,he_loop,lift,passenger_information_display,name,Location_Lat,Location_Lon,Accessibility_Type_Name,Location_ID,Metadata,Tags
0,no,no,dot matrix,Alphington,-37.778396,145.031251,trains,,{},"{'name': 'Alphington', 'passenger_information_..."
1,no,no,lcd,Altona,-37.867249,144.830604,trains,,{},"{'name': 'Altona', 'passenger_information_disp..."
2,no,no,no,Anstey,-37.761898,144.960561,trains,,{},"{'name': 'Anstey', 'passenger_information_disp..."
3,no,no,no,Auburn,-37.822411,145.045617,trains,,{},"{'name': 'Auburn', 'passenger_information_disp..."
4,no,no,no,Batman,-37.733459,144.962747,trains,,{},"{'name': 'Batman', 'passenger_information_disp..."


In [184]:
# Retrieving final table
metro_data = metro_data[['Location_ID', 'Location_Lat', 'Location_Lon', 'Accessibility_Type_Name',
                         'Metadata', 'Tags']]

metro_data.head(5)

Unnamed: 0,Location_ID,Location_Lat,Location_Lon,Accessibility_Type_Name,Metadata,Tags
0,,-37.778396,145.031251,trains,{},"{'name': 'Alphington', 'passenger_information_..."
1,,-37.867249,144.830604,trains,{},"{'name': 'Altona', 'passenger_information_disp..."
2,,-37.761898,144.960561,trains,{},"{'name': 'Anstey', 'passenger_information_disp..."
3,,-37.822411,145.045617,trains,{},"{'name': 'Auburn', 'passenger_information_disp..."
4,,-37.733459,144.962747,trains,{},"{'name': 'Batman', 'passenger_information_disp..."


# Combining Static Metro data and OSM Metro data

In [185]:
import pandas as pd
import json

metro_static = metro_data
metro_OSM = pd.read_csv('trains_OSM.csv')

# Parse Metadata and Tags into Dictionaries
def safe_parse(s):
    """Safely convert string or NaN to a dictionary."""
    if isinstance(s, dict):
        return s
    if pd.isna(s) or str(s).strip() in ('', 'nan', 'None', '{}'):
        return {}
    try:
        s = str(s).replace("'", '"')  # Fix single quotes for JSON
        return json.loads(s)
    except json.JSONDecodeError:
        return {}

# Apply parsing to both DataFrames
for col in ['Metadata', 'Tags']:
    metro_static[col] = metro_static[col].apply(safe_parse)
    metro_OSM[col] = metro_OSM[col].apply(safe_parse)

# Extract and Normalize Station Names
metro_static['station_name'] = metro_static['Tags'].apply(
    lambda x: x.get('name', '').strip().title()
)
metro_OSM['station_name'] = metro_OSM['Tags'].apply(
    lambda x: x.get('name', '').strip().title()
)


# Merge DataFrames (Outer Join)
combined_df = pd.merge(
    metro_static,
    metro_OSM,
    on='station_name',
    how='outer',
    suffixes=('_static', '_osm')
)

# Combine Metadata and Tags Dictionaries
def merge_dicts(row, col):
    """Merge dictionaries with static data taking priority."""
    static_data = row.get(f'{col}_static', {})
    osm_data = row.get(f'{col}_osm', {})

    # Ensure both are dicts
    if not isinstance(static_data, dict):
        static_data = {}
    if not isinstance(osm_data, dict):
        osm_data = {}

    return {**osm_data, **static_data}

combined_df['Metadata'] = combined_df.apply(lambda x: merge_dicts(x, 'Metadata'), axis=1)
combined_df['Tags'] = combined_df.apply(lambda x: merge_dicts(x, 'Tags'), axis=1)


# Handle Location and Accessibility Data
location_cols = ['Location_Lat', 'Location_Lon']

for col in location_cols:
    col_static = f'{col}_static'
    col_osm = f'{col}_osm'

    if col_static in combined_df.columns and col_osm in combined_df.columns:
        combined_df[col] = combined_df[col_static].combine_first(combined_df[col_osm])
    elif col_static in combined_df.columns:
        combined_df[col] = combined_df[col_static]
    elif col_osm in combined_df.columns:
        combined_df[col] = combined_df[col_osm]
    else:
        combined_df[col] = None  # default if neither exists


# Handle Accessibility_Type_Name
if 'Accessibility_Type_Name_static' in combined_df.columns or 'Accessibility_Type_Name_osm' in combined_df.columns:
    combined_df['Accessibility_Type_Name'] = combined_df.get('Accessibility_Type_Name_static', '').fillna(
        combined_df.get('Accessibility_Type_Name_osm', '')
    ).fillna('trains')  # Default value if both are missing

# Select Final Columns (Only Existing Ones)
available_columns = [
    'Location_Lat', 'Location_Lon', 'Metadata', 'Tags'
]

# Add Accessibility_Type_Name only if it exists
if 'Accessibility_Type_Name' in combined_df.columns:
    available_columns.append('Accessibility_Type_Name')

final_df = combined_df[available_columns]

# Ensure Metadata and Tags are always dicts
for col in ['Metadata', 'Tags']:
    final_df[col] = final_df[col].apply(lambda x: x if isinstance(x, dict) else {})


final_df = final_df[['Location_Lat', 'Location_Lon', 'Accessibility_Type_Name',
                     'Metadata', 'Tags']]

final_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metro_static[col] = metro_static[col].apply(safe_parse)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metro_static['station_name'] = metro_static['Tags'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df[col] = final_df[col].apply(lambda x: x if isinstance(x, dict) else {})


Unnamed: 0,Location_Lat,Location_Lon,Accessibility_Type_Name,Metadata,Tags
0,-37.778396,,trains,"{'network': 'PTV - Metropolitan Trains', 'ref'...","{'name': 'Alphington', 'railway': 'station', '..."
1,-37.867249,,trains,{},"{'name': 'Altona', 'passenger_information_disp..."
2,-37.761898,,trains,{},"{'name': 'Anstey', 'passenger_information_disp..."
3,-37.822411,,trains,"{'network': 'PTV - Metropolitan Trains', 'ref'...","{'name': 'Auburn', 'railway': 'station', 'publ..."
4,-37.733459,,trains,"{'network': 'PTV - Metropolitan Trains', 'wiki...","{'name': 'Batman', 'railway': 'station', 'publ..."


In [187]:
def get_name(x):
    return x.get('name') if isinstance(x, dict) else ''

# Create a temporary column with the extracted names
final_df['temp_name'] = final_df['Tags'].apply(get_name)

# Filter out rows where temp_name contains "Station" or a "/" followed by non-space characters
final_df = final_df[
    ~final_df['temp_name'].str.contains(r'Station|/\S+', case=False, na=False)
]

# Drop the temporary column if it's no longer needed
final_df = final_df.drop(columns=['temp_name'])

final_df.head(5)

Unnamed: 0,Location_Lat,Location_Lon,Accessibility_Type_Name,Metadata,Tags
0,-37.778396,,trains,"{'network': 'PTV - Metropolitan Trains', 'ref'...","{'name': 'Alphington', 'railway': 'station', '..."
1,-37.867249,,trains,{},"{'name': 'Altona', 'passenger_information_disp..."
2,-37.761898,,trains,{},"{'name': 'Anstey', 'passenger_information_disp..."
3,-37.822411,,trains,"{'network': 'PTV - Metropolitan Trains', 'ref'...","{'name': 'Auburn', 'railway': 'station', 'publ..."
4,-37.733459,,trains,"{'network': 'PTV - Metropolitan Trains', 'wiki...","{'name': 'Batman', 'railway': 'station', 'publ..."


In [157]:
len(final_df['temp_name'])

228

In [188]:
final_df.to_json("final_metros.json", orient='records', indent=2)

# Combining OSM and Static data

In [191]:
tram_osm = pd.read_csv('tram_OSM.csv')

# Renaming columns
tram_osm = tram_osm.rename(columns={'Accesibility_Type_Name': 'Accessibility_Type_Name',
                                    'Location_Lan': 'Location_Lon'})

tram_osm = tram_osm[['Location_Lat', 'Location_Lon',
                     'Accessibility_Type_Name', 'Metadata', 'Tags']]

tram_osm['Metadata'] = tram_osm['Metadata'].apply(safe_parse)
tram_osm['Tags'] = tram_osm['Tags'].apply(safe_parse)

tram_osm.head(5)

Unnamed: 0,Location_Lat,Location_Lon,Accessibility_Type_Name,Metadata,Tags
0,-37.901195,145.019339,trams,"{'operator': 'Yarra Trams', 'public_transport'...","{'name': 'Stop 63: North Road', 'wheelchair': ..."
1,-37.901537,145.019066,trams,"{'operator': 'Yarra Trams', 'public_transport'...","{'name': 'Stop 63: North Road', 'wheelchair': ..."
2,-37.903804,145.01879,trams,"{'operator': 'Yarra Trams', 'public_transport'...","{'name': 'Stop 64: Taylor Street', 'wheelchair..."
3,-37.904071,145.01858,trams,"{'operator': 'Yarra Trams', 'public_transport'...","{'name': 'Stop 64: Taylor Street', 'wheelchair..."
4,-37.906579,145.018113,trams,"{'operator': 'Yarra Trams', 'public_transport'...","{'name': 'Stop 65: Davey Avenue', 'wheelchair'..."


In [192]:
tram_osm.to_json('final_trams.json', orient='records', indent=2)