# Data Cleaning & Wrangling
## Metro Data

In [257]:
import pandas as pd

metro_df = pd.read_csv("metro-train-stations-with-accessibility-information.csv")
metro_df.head()

Unnamed: 0,Geo Point,Geo Shape,he_loop,lift,pids,station
0,"-37.77839599999999, 145.031251","{""coordinates"": [145.031251, -37.7783959999999...",No,No,Dot Matrix,Alphington
1,"-37.86724899999996, 144.830604","{""coordinates"": [144.830604, -37.8672489999999...",No,No,LCD,Altona
2,"-37.761897999999974, 144.96056099999998","{""coordinates"": [144.96056099999998, -37.76189...",No,No,No,Anstey
3,"-37.82241099999999, 145.045617","{""coordinates"": [145.045617, -37.8224109999999...",No,No,No,Auburn
4,"-37.73345899999998, 144.96274700000004","{""coordinates"": [144.96274700000004, -37.73345...",No,No,No,Batman


In [258]:
# Checking for duplicates
metro_df[metro_df.duplicated()]

Unnamed: 0,Geo Point,Geo Shape,he_loop,lift,pids,station


In [259]:
# Splitting geo point col to lat and lon
metro_df[['latitude', 'longitude']] = metro_df['Geo Point'].str.split(',', expand=True).astype(float)

metro_df.head(5)

Unnamed: 0,Geo Point,Geo Shape,he_loop,lift,pids,station,latitude,longitude
0,"-37.77839599999999, 145.031251","{""coordinates"": [145.031251, -37.7783959999999...",No,No,Dot Matrix,Alphington,-37.778396,145.031251
1,"-37.86724899999996, 144.830604","{""coordinates"": [144.830604, -37.8672489999999...",No,No,LCD,Altona,-37.867249,144.830604
2,"-37.761897999999974, 144.96056099999998","{""coordinates"": [144.96056099999998, -37.76189...",No,No,No,Anstey,-37.761898,144.960561
3,"-37.82241099999999, 145.045617","{""coordinates"": [145.045617, -37.8224109999999...",No,No,No,Auburn,-37.822411,145.045617
4,"-37.73345899999998, 144.96274700000004","{""coordinates"": [144.96274700000004, -37.73345...",No,No,No,Batman,-37.733459,144.962747


In [260]:
# Retrieving only relevant information
metro_df = metro_df[['he_loop', 'lift', 'pids', 'station', 'latitude', 'longitude']]

metro_df.head(5)

Unnamed: 0,he_loop,lift,pids,station,latitude,longitude
0,No,No,Dot Matrix,Alphington,-37.778396,145.031251
1,No,No,LCD,Altona,-37.867249,144.830604
2,No,No,No,Anstey,-37.761898,144.960561
3,No,No,No,Auburn,-37.822411,145.045617
4,No,No,No,Batman,-37.733459,144.962747


In [261]:
# Standardising letter case
metro_df = metro_df.map(lambda x: x.title() if isinstance(x, str) else x)

# Renaming columns
metro_df = metro_df.rename(columns={'station': 'name', 'latitude': 'Location_Lat',
                                    'longitude': 'Location_Lon',
                                    'pids': 'passenger_information_display'})

# Adding category column
metro_df['Accessibility_Type_Name'] = 'trains'

# Adding location ID column
metro_df['Location_ID'] = pd.NA

metro_df.head(5)

Unnamed: 0,he_loop,lift,passenger_information_display,name,Location_Lat,Location_Lon,Accessibility_Type_Name,Location_ID
0,No,No,Dot Matrix,Alphington,-37.778396,145.031251,trains,
1,No,No,Lcd,Altona,-37.867249,144.830604,trains,
2,No,No,No,Anstey,-37.761898,144.960561,trains,
3,No,No,No,Auburn,-37.822411,145.045617,trains,
4,No,No,No,Batman,-37.733459,144.962747,trains,


In [262]:
import pandas as pd

metro_static = metro_df
tram_osm = pd.read_csv('tram_OSM.csv')


metro_static.head(5)

Unnamed: 0,he_loop,lift,passenger_information_display,name,Location_Lat,Location_Lon,Accessibility_Type_Name,Location_ID
0,No,No,Dot Matrix,Alphington,-37.778396,145.031251,trains,
1,No,No,Lcd,Altona,-37.867249,144.830604,trains,
2,No,No,No,Anstey,-37.761898,144.960561,trains,
3,No,No,No,Auburn,-37.822411,145.045617,trains,
4,No,No,No,Batman,-37.733459,144.962747,trains,


# Creating Final Table Structure

In [263]:
metro_data = metro_df

# Creating a column with dictionary of metadata
metro_data['Metadata'] = "{}"

# Creating a column with dictionary of tags
tags_cols = ['name','passenger_information_display', 'lift', 'he_loop']
metro_data['Tags'] = metro_data[tags_cols].apply(lambda row: row.to_dict(), axis=1)

metro_data.head(5)

Unnamed: 0,he_loop,lift,passenger_information_display,name,Location_Lat,Location_Lon,Accessibility_Type_Name,Location_ID,Metadata,Tags
0,No,No,Dot Matrix,Alphington,-37.778396,145.031251,trains,,{},"{'name': 'Alphington', 'passenger_information_..."
1,No,No,Lcd,Altona,-37.867249,144.830604,trains,,{},"{'name': 'Altona', 'passenger_information_disp..."
2,No,No,No,Anstey,-37.761898,144.960561,trains,,{},"{'name': 'Anstey', 'passenger_information_disp..."
3,No,No,No,Auburn,-37.822411,145.045617,trains,,{},"{'name': 'Auburn', 'passenger_information_disp..."
4,No,No,No,Batman,-37.733459,144.962747,trains,,{},"{'name': 'Batman', 'passenger_information_disp..."


In [264]:
# Retrieving final table
metro_data = metro_data[['Location_ID', 'Location_Lat', 'Location_Lon', 'Accessibility_Type_Name',
                         'Metadata', 'Tags']]

metro_data.head(5)

Unnamed: 0,Location_ID,Location_Lat,Location_Lon,Accessibility_Type_Name,Metadata,Tags
0,,-37.778396,145.031251,trains,{},"{'name': 'Alphington', 'passenger_information_..."
1,,-37.867249,144.830604,trains,{},"{'name': 'Altona', 'passenger_information_disp..."
2,,-37.761898,144.960561,trains,{},"{'name': 'Anstey', 'passenger_information_disp..."
3,,-37.822411,145.045617,trains,{},"{'name': 'Auburn', 'passenger_information_disp..."
4,,-37.733459,144.962747,trains,{},"{'name': 'Batman', 'passenger_information_disp..."


# Combining Static Metro data and OSM Metro data

In [265]:
import json

metro_static = metro_data
metro_OSM = pd.read_csv('trains_OSM.csv')

In [266]:
def safe_parse_dict(s):
    try:
        s = str(s).replace("'", '"')
        return json.loads(s)
    except:
        return {}

# Rename columns
metro_OSM = metro_OSM.rename(columns={'Location_Lan': 'Location_Lon',
                                      'ACCESSIBILITY_TYPE_NAME': 'Accessibility_Type_Name'})

# Parse Metadata and Tags
for col in ['Metadata', 'Tags']:
    metro_static[col] = metro_static[col].apply(safe_parse_dict)
    metro_OSM[col] = metro_OSM[col].apply(safe_parse_dict)

# Extract station names
metro_static['station_name'] = metro_static['Metadata'].apply(lambda x: x.get('Station_Name'))
metro_OSM['station_name'] = metro_OSM['Tags'].apply(lambda x: x.get('name'))

metro_OSM['station_name'] = metro_OSM['station_name'].str.lower()

# Merge (outer join to keep all stations)
combined_df = pd.merge(
    metro_static,
    metro_OSM,
    on='station_name',
    how='outer',
    suffixes=('_static', '_osm')
)

# Combine Tags and Metadata (static data overrides OSM)
for col in ['Tags', 'Metadata']:
    combined_df[col] = combined_df.apply(
        lambda row: {**(row[f'{col}_osm'] if isinstance(row[f'{col}_osm'], dict) else {}),
                     **(row[f'{col}_static'] if isinstance(row[f'{col}_static'], dict) else {})},
        axis=1
    )

# Fill missing location data
for col in ['Location_ID', 'Location_Lat', 'Location_Lon']:
    combined_df[col] = combined_df[f'{col}_static'].fillna(combined_df[f'{col}_osm'])

# Final columns
final_df = combined_df[[
    'Location_ID', 'Location_Lat', 'Location_Lon',
    'station_name', 'Metadata', 'Tags',
    'Accessibility_Type_Name_static', 'Accessibility_Type_Name_osm'
]].rename(columns={
    'Accessibility_Type_Name_static': 'Accessibility_Type_Name'
})

# Fill missing Accessibility_Type_Name
final_df['Accessibility_Type_Name'] = final_df['Accessibility_Type_Name'].fillna('trains')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metro_static[col] = metro_static[col].apply(safe_parse_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metro_static['station_name'] = metro_static['Metadata'].apply(lambda x: x.get('Station_Name'))


In [268]:
final_df = final_df[['Location_ID', 'Location_Lat', 'Location_Lon', 
                     'Accessibility_Type_Name', 'Metadata', 'Tags']]

final_df

Unnamed: 0,Location_ID,Location_Lat,Location_Lon,Accessibility_Type_Name,Metadata,Tags
0,,-37.778396,145.031251,trains,{},"{'name': 'Alphington', 'passenger_information_..."
1,,-37.867249,144.830604,trains,{},"{'name': 'Altona', 'passenger_information_disp..."
2,,-37.761898,144.960561,trains,{},"{'name': 'Anstey', 'passenger_information_disp..."
3,,-37.822411,145.045617,trains,{},"{'name': 'Auburn', 'passenger_information_disp..."
4,,-37.733459,144.962747,trains,{},"{'name': 'Batman', 'passenger_information_disp..."
...,...,...,...,...,...,...
315,2.965347e+09,-37.665843,145.017276,trains,"{'network': 'PTV - Metropolitan Trains', 'netw...","{'name': 'Lalor', 'railway': 'station', 'publi..."
316,2.965353e+09,-37.680498,145.014288,trains,"{'network': 'PTV - Metropolitan Trains', 'netw...","{'name': 'Thomastown', 'railway': 'station', '..."
317,5.255117e+09,-37.602502,145.100902,trains,"{'network': 'PTV - Metropolitan Trains', 'wiki...","{'name': 'Mernda', 'railway': 'station', 'publ..."
318,5.139486e+09,-37.579136,144.727983,trains,"{'wikidata': 'Q7639009', 'wikipedia': 'en:Sunb...","{'name': 'Sunbury', 'railway': 'station', 'pub..."


In [232]:
final_df.to_csv("metros_test.csv")

# Combining OSM and Static data

In [269]:
tram_osm = pd.read_csv('tram_OSM.csv')

# Renaming columns
tram_osm = tram_osm.rename(columns={'Accesibility_Type_Name': 'Accessibility_Type_Name',
                                    'Location_Lan': 'Location_Lon'})

tram_osm = tram_osm[['Location_ID', 'Location_Lat', 'Location_Lon',
                     'Accessibility_Type_Name', 'Metadata', 'Tags']]

tram_osm.head(5)

Unnamed: 0,Location_ID,Location_Lat,Location_Lon,Accessibility_Type_Name,Metadata,Tags
0,3991484688,-37.901195,145.019339,trams,"{""operator"": ""Yarra Trams"", ""public_transport""...","{""name"": ""Stop 63: North Road"", ""wheelchair"": ..."
1,3991484689,-37.901537,145.019066,trams,"{""operator"": ""Yarra Trams"", ""public_transport""...","{""name"": ""Stop 63: North Road"", ""wheelchair"": ..."
2,3991484690,-37.903804,145.01879,trams,"{""operator"": ""Yarra Trams"", ""public_transport""...","{""name"": ""Stop 64: Taylor Street"", ""wheelchair..."
3,3991484691,-37.904071,145.01858,trams,"{""operator"": ""Yarra Trams"", ""public_transport""...","{""name"": ""Stop 64: Taylor Street"", ""wheelchair..."
4,3991484692,-37.906579,145.018113,trams,"{""operator"": ""Yarra Trams"", ""public_transport""...","{""name"": ""Stop 65: Davey Avenue"", ""wheelchair""..."


In [270]:
# Combining the data
transpo_full = pd.concat([final_df, tram_osm], axis=0)

transpo_full.head(5)

Unnamed: 0,Location_ID,Location_Lat,Location_Lon,Accessibility_Type_Name,Metadata,Tags
0,,-37.778396,145.031251,trains,{},"{'name': 'Alphington', 'passenger_information_..."
1,,-37.867249,144.830604,trains,{},"{'name': 'Altona', 'passenger_information_disp..."
2,,-37.761898,144.960561,trains,{},"{'name': 'Anstey', 'passenger_information_disp..."
3,,-37.822411,145.045617,trains,{},"{'name': 'Auburn', 'passenger_information_disp..."
4,,-37.733459,144.962747,trains,{},"{'name': 'Batman', 'passenger_information_disp..."


In [272]:
transpo_full

Unnamed: 0,Location_ID,Location_Lat,Location_Lon,Accessibility_Type_Name,Metadata,Tags
0,,-37.778396,145.031251,trains,{},"{'name': 'Alphington', 'passenger_information_..."
1,,-37.867249,144.830604,trains,{},"{'name': 'Altona', 'passenger_information_disp..."
2,,-37.761898,144.960561,trains,{},"{'name': 'Anstey', 'passenger_information_disp..."
3,,-37.822411,145.045617,trains,{},"{'name': 'Auburn', 'passenger_information_disp..."
4,,-37.733459,144.962747,trains,{},"{'name': 'Batman', 'passenger_information_disp..."
...,...,...,...,...,...,...
1463,4.312275e+09,-37.731013,145.014753,trams,"{""operator"": ""Yarra Trams"", ""public_transport""...","{""name"": ""Stop 52: Tyler Street"", ""wheelchair""..."
1464,4.312275e+09,-37.729143,145.017185,trams,"{""operator"": ""Yarra Trams"", ""public_transport""...","{""name"": ""Stop 53: Ethel Grove"", ""wheelchair"":..."
1465,4.312275e+09,-37.729129,145.017491,trams,"{""operator"": ""Yarra Trams"", ""public_transport""...","{""name"": ""Stop 53: Ethel Grove"", ""wheelchair"":..."
1466,4.312275e+09,-37.727924,145.019351,trams,"{""operator"": ""Yarra Trams"", ""public_transport""...","{""name"": ""Stop 54: McColl Street"", ""wheelchair..."


In [273]:
transpo_full = transpo_full.drop('Location_ID', axis=1)

transpo_full

Unnamed: 0,Location_Lat,Location_Lon,Accessibility_Type_Name,Metadata,Tags
0,-37.778396,145.031251,trains,{},"{'name': 'Alphington', 'passenger_information_..."
1,-37.867249,144.830604,trains,{},"{'name': 'Altona', 'passenger_information_disp..."
2,-37.761898,144.960561,trains,{},"{'name': 'Anstey', 'passenger_information_disp..."
3,-37.822411,145.045617,trains,{},"{'name': 'Auburn', 'passenger_information_disp..."
4,-37.733459,144.962747,trains,{},"{'name': 'Batman', 'passenger_information_disp..."
...,...,...,...,...,...
1463,-37.731013,145.014753,trams,"{""operator"": ""Yarra Trams"", ""public_transport""...","{""name"": ""Stop 52: Tyler Street"", ""wheelchair""..."
1464,-37.729143,145.017185,trams,"{""operator"": ""Yarra Trams"", ""public_transport""...","{""name"": ""Stop 53: Ethel Grove"", ""wheelchair"":..."
1465,-37.729129,145.017491,trams,"{""operator"": ""Yarra Trams"", ""public_transport""...","{""name"": ""Stop 53: Ethel Grove"", ""wheelchair"":..."
1466,-37.727924,145.019351,trams,"{""operator"": ""Yarra Trams"", ""public_transport""...","{""name"": ""Stop 54: McColl Street"", ""wheelchair..."


In [274]:
transpo_full.to_json('final_transpo.json', orient='records', indent=2)