# Data Cleaning & Wrangling
## Metro Data

In [1]:
import pandas as pd

# Read the station accessibility CSV into a DataFrame, then preview the
# first rows to confirm the columns came through as expected.
metro_df = pd.read_csv(
    filepath_or_buffer="metro-train-stations-with-accessibility-information.csv",
)
metro_df.head()

Unnamed: 0,Geo Point,Geo Shape,he_loop,lift,pids,station
0,"-37.77839599999999, 145.031251","{""coordinates"": [145.031251, -37.7783959999999...",No,No,Dot Matrix,Alphington
1,"-37.86724899999996, 144.830604","{""coordinates"": [144.830604, -37.8672489999999...",No,No,LCD,Altona
2,"-37.761897999999974, 144.96056099999998","{""coordinates"": [144.96056099999998, -37.76189...",No,No,No,Anstey
3,"-37.82241099999999, 145.045617","{""coordinates"": [145.045617, -37.8224109999999...",No,No,No,Auburn
4,"-37.73345899999998, 144.96274700000004","{""coordinates"": [144.96274700000004, -37.73345...",No,No,No,Batman


In [3]:
# Show every row that exactly repeats an earlier row (all columns equal).
# An empty frame here means the data contains no duplicated records.
metro_df.loc[metro_df.duplicated(keep='first')]

Unnamed: 0,Geo Point,Geo Shape,he_loop,lift,pids,station


In [4]:
# Each 'Geo Point' value is a "lat, lon" string: break it on the comma,
# convert both pieces to floats, and store them as two numeric columns.
lat_lon = metro_df['Geo Point'].str.split(',', expand=True).astype(float)
metro_df[['latitude', 'longitude']] = lat_lon

metro_df.head(n=5)

Unnamed: 0,Geo Point,Geo Shape,he_loop,lift,pids,station,latitude,longitude
0,"-37.77839599999999, 145.031251","{""coordinates"": [145.031251, -37.7783959999999...",No,No,Dot Matrix,Alphington,-37.778396,145.031251
1,"-37.86724899999996, 144.830604","{""coordinates"": [144.830604, -37.8672489999999...",No,No,LCD,Altona,-37.867249,144.830604
2,"-37.761897999999974, 144.96056099999998","{""coordinates"": [144.96056099999998, -37.76189...",No,No,No,Anstey,-37.761898,144.960561
3,"-37.82241099999999, 145.045617","{""coordinates"": [145.045617, -37.8224109999999...",No,No,No,Auburn,-37.822411,145.045617
4,"-37.73345899999998, 144.96274700000004","{""coordinates"": [144.96274700000004, -37.73345...",No,No,No,Batman,-37.733459,144.962747


In [5]:
# Keep just the accessibility flags, the station name and the parsed
# coordinates; the raw 'Geo Point' / 'Geo Shape' columns are dropped.
metro_df = metro_df.loc[
    :,
    [
        'he_loop',
        'lift',
        'pids',
        'station',
        'latitude',
        'longitude',
    ],
]

metro_df.head(n=5)

Unnamed: 0,he_loop,lift,pids,station,latitude,longitude
0,No,No,Dot Matrix,Alphington,-37.778396,145.031251
1,No,No,LCD,Altona,-37.867249,144.830604
2,No,No,No,Anstey,-37.761898,144.960561
3,No,No,No,Auburn,-37.822411,145.045617
4,No,No,No,Batman,-37.733459,144.962747


In [6]:
def _to_upper(value):
    """Return ``value`` upper-cased if it is a string; otherwise return it unchanged."""
    if isinstance(value, str):
        return value.upper()
    return value


# Normalise the case of every text cell so later comparisons and joins are
# not affected by capitalisation; numeric cells pass through untouched.
metro_df = metro_df.map(_to_upper)

metro_df.head(n=5)

Unnamed: 0,he_loop,lift,pids,station,latitude,longitude
0,NO,NO,DOT MATRIX,ALPHINGTON,-37.778396,145.031251
1,NO,NO,LCD,ALTONA,-37.867249,144.830604
2,NO,NO,NO,ANSTEY,-37.761898,144.960561
3,NO,NO,NO,AUBURN,-37.822411,145.045617
4,NO,NO,NO,BATMAN,-37.733459,144.962747
