In [1]:
import pandas as pd
import numpy as np
import requests as rq
from geopandas import GeoDataFrame as gpd
from bs4 import BeautifulSoup

### Extracting Station Names and Location

In [2]:
# Load the PTV metro train station layer into a GeoDataFrame.
# NOTE: `gpd` is bound to the geopandas.GeoDataFrame class in the import cell (not to the
# geopandas module), so this calls the GeoDataFrame.from_file classmethod.
station_df = gpd.from_file('data/raw/PTV_METRO_TRAIN_STATION.mid')
# Preview five random rows; no random_state is passed, so the rows differ between runs.
station_df.sample(5)

Unnamed: 0,STOP_ID,STOP_NAME,LATITUDE,LONGITUDE,TICKETZONE,ROUTES_USING_STOP,geometry
19,19844,Belgrave Railway Station (Belgrave),-37.909102,145.355291,2,Belgrave,POINT (145.355291 -37.909102)
47,19872,Highett Railway Station (Highett),-37.948425,145.041872,2,Frankston,POINT (145.041872 -37.948425)
95,19920,Carnegie Railway Station (Carnegie),-37.886241,145.058575,1,"Pakenham,Cranbourne",POINT (145.058575 -37.886241)
119,19944,Malvern Railway Station (Malvern),-37.866253,145.029294,1,"Frankston,Pakenham,Cranbourne",POINT (145.029294 -37.866253)
45,19870,Bayswater Railway Station (Bayswater),-37.84173,145.268136,2,Belgrave,POINT (145.268136 -37.84173)


In [3]:
# Boolean mask of missing values. Describing the mask shows, per column, whether any cell is
# missing: a single unique value of False means the column has no missing entries.
station_df_na = station_df.isna()
station_df_na.describe()

Unnamed: 0,STOP_ID,STOP_NAME,LATITUDE,LONGITUDE,TICKETZONE,ROUTES_USING_STOP,geometry
count,221,221,221,221,221,221,221
unique,1,1,1,1,1,1,1
top,False,False,False,False,False,False,False
freq,221,221,221,221,221,221,221


In [4]:
# Normalise station names: keep the text before the "(suburb)" suffix, strip the
# " Railway Station " label and lower-case the result,
# e.g. "Belgrave Railway Station (Belgrave)" -> "belgrave".
# Vectorised .str methods are used instead of DataFrame.applymap, which is deprecated
# in pandas >= 2.1; the resulting values are the same.
station_df['STOP_NAME'] = (
    station_df['STOP_NAME']
    .str.split('(').str[0]
    .str.replace(' Railway Station ', '', regex=False)
    .str.lower()
)
# Number of routes serving a stop = number of comma-separated entries in ROUTES_USING_STOP.
station_df['NUMBER_OF_ROUTES'] = station_df['ROUTES_USING_STOP'].str.split(',').str.len()

In [5]:
# Keep only the name, coordinates and route count; the other columns are not referenced
# by the later cells shown in this notebook.
unused_station_columns = ['geometry', 'ROUTES_USING_STOP', 'TICKETZONE', 'STOP_ID']
station_df.drop(columns=unused_station_columns, inplace=True)
# STATION_NAME is the key name shared with the other station tables.
station_df.rename(columns={'STOP_NAME': 'STATION_NAME'}, inplace=True)

In [6]:
# Persist the cleaned station table. to_csv is called without index=False, so the row index
# is written as an extra leading column.
station_df.to_csv('data/STATION_NAMES.csv')
station_df.head()

Unnamed: 0,STATION_NAME,LATITUDE,LONGITUDE,NUMBER_OF_ROUTES
0,sunbury,-37.579091,144.727319,1
1,diggers rest,-37.627017,144.719922,1
2,stony point,-38.374235,145.221837,1
3,crib point,-38.366123,145.204043,1
4,morradoo,-38.354033,145.189602,1


### Scraping Station Codes

In [7]:
# Download the Wikipedia list of Melbourne railway stations and parse its HTML.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_Melbourne_railway_stations'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
wiki_content = rq.get(wiki_url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of parsing an error page as if it were
# the station list.
wiki_content.raise_for_status()
soup = BeautifulSoup(wiki_content.text, 'html.parser')

In [8]:
# Locate the first sortable wikitable and collect its body rows.
table = soup.find('table', attrs={'class': 'wikitable sortable'})
rows = table.find('tbody').find_all('tr')

# Skip the header row; column 0 holds the linked station name, column 1 the station code.
data = []
for row in rows[1:]:
    cells = row.find_all('td')
    station_name = cells[0].find('a').get_text().lower()
    station_code = cells[1].get_text()
    data.append([station_name, station_code])

station_code_df = pd.DataFrame(data, columns=['STATION_NAME', 'STATION_CODE'])

In [9]:
# Check the scraped table for missing values (a single unique value of False per column
# means there are none).
station_code_df.isna().describe()

Unnamed: 0,STATION_NAME,STATION_CODE
count,222,222
unique,1,1
top,False,False
freq,222,222


In [10]:
# Persist the scraped name -> code table (the row index is written as well).
# NOTE(review): this file goes to the working directory, whereas the station, parking and
# bike tables are written under 'data/' — confirm this location is intended.
station_code_df.to_csv('STATION_CODES.csv')
station_code_df.head()

Unnamed: 0,STATION_NAME,STATION_CODE
0,aircraft,ACF
1,alamein,ALM
2,albion,ALB
3,alphington,ALP
4,altona,ALT


### Extracting Parking Capacity

In [11]:
# Load the PTV train car park layer; `gpd` is the GeoDataFrame class bound in the import cell.
parking_df = gpd.from_file('data/raw/PTV_TRAIN_CARPARK.mid')
# Preview five random rows (unseeded, so the rows differ between runs).
parking_df.sample(5)

Unnamed: 0,STATION_NAME,COMMUTER_CAPACITY,geometry
439,ECM,22,"POLYGON ((145.067516 -37.82564, 145.067546 -37..."
139,GWY,45,"POLYGON ((145.161197 -37.879354, 145.161181 -3..."
395,NWG,10,"POLYGON ((145.181271 -37.819718, 145.181281 -3..."
127,WTL,31,"POLYGON ((145.135858 -37.9359, 145.135855 -37...."
302,CGB,96,"POLYGON ((144.944657 -37.592655, 144.944651 -3..."


In [12]:
# In this layer the STATION_NAME column holds three-letter station codes (e.g. ECM, GWY),
# so it is renamed to STATION_CODE; the polygon geometry is not needed.
parking_columns = {'STATION_NAME': 'STATION_CODE', 'COMMUTER_CAPACITY': 'PARKING_CAPACITY'}
parking_df.drop(columns=['geometry'], inplace=True)
parking_df.rename(columns=parking_columns, inplace=True)
parking_df.sample(5)

Unnamed: 0,STATION_CODE,PARKING_CAPACITY
304,BMS,40
79,CDN,54
116,SNP,500
221,BFD,338
365,OMD,117


In [13]:
# Check the car park table for missing values (a single unique value of False per column
# means there are none).
parking_df.isna().describe()

Unnamed: 0,STATION_CODE,PARKING_CAPACITY
count,443,443
unique,1,1
top,False,False
freq,443,443


In [14]:
# Make the capacity numeric so that it can be summed.
capacity = parking_df['PARKING_CAPACITY'].astype('int64')
parking_df['PARKING_CAPACITY'] = capacity
# A station can have several car parks; total them into one row per station code.
parking_df = parking_df.groupby(by='STATION_CODE').sum().reset_index()

In [15]:
# Persist the per-station parking totals (the row index is written as well).
parking_df.to_csv('data/PARKING_CAPACITY.csv')
parking_df.head()

Unnamed: 0,STATION_CODE,PARKING_CAPACITY
0,ACF,201
1,ALB,673
2,ALP,119
3,ASH,252
4,ASP,78


### Extracting Bike Storage Capacity

In [16]:
# Load the PTV bike storage layer; `gpd` is the GeoDataFrame class bound in the import cell.
bike_df = gpd.from_file('data/raw/PTV_TRAIN_STATION_BIKE_STORAGE.mid')
# Preview five random rows (unseeded, so the rows differ between runs).
bike_df.sample(5)

Unnamed: 0,STATION_NAME,TYPE,CAPACITY,geometry
672,Huntingdale,Locker,1,POINT (145.102743 -37.911325)
56,Oakleigh,Hoops,5,POINT (145.088325 -37.9004)
714,Lilydale,Locker,1,POINT (145.347694 -37.755948)
490,Berwick,Locker,1,POINT (145.345028 -38.039416)
512,Carrum,Hoops,4,POINT (145.122782 -38.076527)


In [17]:
# Treat blank or 'Unknown' capacities as zero, then convert the column to integers.
capacity = bike_df['CAPACITY'].replace(['', 'Unknown'], '0')
bike_df['CAPACITY'] = capacity.astype('int64')

In [18]:
# The point geometry is not needed for the per-station capacity totals.
bike_df.drop(columns=['geometry'], inplace=True)

In [19]:
# One-hot encode the storage TYPE and append the indicator columns to the frame.
type_indicators = pd.get_dummies(bike_df.TYPE)
bike_df = pd.concat([bike_df, type_indicators], axis=1, sort=False)

In [20]:
# The raw TYPE label is redundant once its indicator columns are part of the frame.
bike_df.drop(columns=['TYPE'], inplace= True)

In [21]:
# Lower-case the names so they line up with the other station tables.
bike_df['STATION_NAME'] = bike_df['STATION_NAME'].str.lower()

# Entries of three characters or fewer are treated as station codes rather than names and
# are translated through the scraped code table.
is_code = bike_df['STATION_NAME'].str.len() <= 3
codes = bike_df.loc[is_code, 'STATION_NAME'].str.upper()

# One lookup Series (code -> name) replaces a scan of station_code_df per row; keeping the
# first occurrence of a duplicated code matches the previous "take row 0" behaviour.
code_to_name = (
    station_code_df
    .drop_duplicates('STATION_CODE')
    .set_index('STATION_CODE')['STATION_NAME']
)

# Still a KeyError for an unknown code, but one that names the offending value(s).
unknown_codes = sorted(set(codes) - set(code_to_name.index))
if unknown_codes:
    raise KeyError('No station name found for code(s): {}'.format(unknown_codes))

# update() aligns on the index, so only the code-like rows are overwritten.
alt_names = codes.map(code_to_name).to_frame()
bike_df.update(alt_names)

In [22]:
# Collapse to one row per station. CAPACITY becomes the station's total bike capacity and each
# storage-type indicator becomes the number of source rows of that type at the station.
bike_df = bike_df.groupby(by='STATION_NAME').sum().reset_index()

In [23]:
# Give the capacity column a name that stays unambiguous next to PARKING_CAPACITY,
# then confirm there are no missing values.
bike_df.rename(columns={'CAPACITY':'BIKE_CAPACITY'}, inplace=True)
bike_df.isna().describe()

Unnamed: 0,STATION_NAME,BIKE_CAPACITY,Cages,Hoops,Locker,Other,Parkiteer
count,106,106,106,106,106,106,106
unique,1,1,1,1,1,1,1
top,False,False,False,False,False,False,False
freq,106,106,106,106,106,106,106


In [24]:
# Persist the per-station bike storage totals (the row index is written as well).
bike_df.to_csv('data/BIKE_CAPACITY.csv')
# Preview ten random rows (unseeded, so the rows differ between runs).
bike_df.sample(10)

Unnamed: 0,STATION_NAME,BIKE_CAPACITY,Cages,Hoops,Locker,Other,Parkiteer
14,cardinia road,21,1,0,0,0,0
93,syndal,26,0,0,0,1,1
4,bayswater,46,0,1,0,0,1
31,fairfield,12,0,0,12,0,0
36,gardiner,29,0,1,0,0,1
77,preston,32,0,1,0,0,1
72,nunawading,15,0,1,10,0,0
76,prahran,34,0,0,8,0,1
60,mentone,12,0,1,8,0,0
49,highett,26,0,0,0,0,1


### Merging Data

In [25]:
# Outer join: stations without bike storage are kept (their bike columns become NaN), and a
# bike-storage name that matches no station would still show up as an extra row.
full_data = pd.merge(station_df, bike_df, on='STATION_NAME', how='outer')

In [26]:
# Inspect which columns picked up missing values from the outer join.
full_data.isna().describe()

Unnamed: 0,STATION_NAME,LATITUDE,LONGITUDE,NUMBER_OF_ROUTES,BIKE_CAPACITY,Cages,Hoops,Locker,Other,Parkiteer
count,221,221,221,221,221,221,221,221,221,221
unique,1,1,1,1,2,2,2,2,2,2
top,False,False,False,False,True,True,True,True,True,True
freq,221,221,221,221,115,115,115,115,115,115


In [27]:
# Stations with no bike storage have no match in bike_df, so their bike columns are NaN
# after the outer merge; for those columns a missing value really means zero.
# Only the bike columns are filled: zero-filling every column would also turn a missing
# LATITUDE/LONGITUDE/NUMBER_OF_ROUTES (a bike-storage name that matched no station) into a
# bogus 0 instead of leaving it visible to the later dropna().
bike_columns = [column for column in bike_df.columns if column != 'STATION_NAME']
full_data[bike_columns] = full_data[bike_columns].fillna(0)

In [28]:
# Attach the station codes scraped from Wikipedia. The outer join keeps names that appear in
# only one of the two tables so that mismatches can be inspected.
full_data = pd.merge(full_data, station_code_df, on='STATION_NAME', how='outer')

In [29]:
# Inspect which columns picked up missing values from the code join (names present in only
# one of the two tables).
full_data.isna().describe()

Unnamed: 0,STATION_NAME,LATITUDE,LONGITUDE,NUMBER_OF_ROUTES,BIKE_CAPACITY,Cages,Hoops,Locker,Other,Parkiteer,STATION_CODE
count,223,223,223,223,223,223,223,223,223,223,223
unique,1,2,2,2,2,2,2,2,2,2,2
top,False,False,False,False,False,False,False,False,False,False,False
freq,223,221,221,221,221,221,221,221,221,221,222


In [30]:
# Rows with no coordinates are names that exist only in the scraped code table.
missing_location = full_data['LONGITUDE'].isna()
full_data.loc[missing_location]

Unnamed: 0,STATION_NAME,LATITUDE,LONGITUDE,NUMBER_OF_ROUTES,BIKE_CAPACITY,Cages,Hoops,Locker,Other,Parkiteer,STATION_CODE
221,jolimont,,,,,,,,,,JLI
222,showgrounds,,,,,,,,,,SGS


In [31]:
# The station table names this stop 'jolimont-mcg' while the scraped table lists it as
# 'jolimont', so the join on STATION_NAME could not pair them; assign the code by hand.
full_data.loc[full_data.STATION_NAME == 'jolimont-mcg', 'STATION_CODE'] = 'JLI'

In [32]:
# Drop every row that still contains a missing value — going by the inspection output, these
# are the code-only rows ('jolimont', 'showgrounds') that carry no coordinates or capacities.
full_data.dropna(inplace=True)

In [33]:
# Preview the station table now that each row carries its station code.
full_data.head()

Unnamed: 0,STATION_NAME,LATITUDE,LONGITUDE,NUMBER_OF_ROUTES,BIKE_CAPACITY,Cages,Hoops,Locker,Other,Parkiteer,STATION_CODE
0,sunbury,-37.579091,144.727319,1.0,25.0,0.0,0.0,0.0,0.0,1.0,SUY
1,diggers rest,-37.627017,144.719922,1.0,25.0,0.0,0.0,0.0,0.0,1.0,DIT
2,stony point,-38.374235,145.221837,1.0,0.0,0.0,0.0,0.0,0.0,0.0,STY
3,crib point,-38.366123,145.204043,1.0,0.0,0.0,0.0,0.0,0.0,0.0,CPT
4,morradoo,-38.354033,145.189602,1.0,0.0,0.0,0.0,0.0,0.0,0.0,MRO


In [34]:
# Attach the parking totals by station code. The outer join keeps stations that have no car
# park row as well as parking codes that match no station.
full_data = pd.merge(full_data, parking_df, on='STATION_CODE', how='outer')

In [35]:
# Inspect the missing values introduced by the parking join.
full_data.isna().describe()

Unnamed: 0,STATION_NAME,LATITUDE,LONGITUDE,NUMBER_OF_ROUTES,BIKE_CAPACITY,Cages,Hoops,Locker,Other,Parkiteer,STATION_CODE,PARKING_CAPACITY
count,226,226,226,226,226,226,226,226,226,226,226,226
unique,2,2,2,2,2,2,2,2,2,2,1,2
top,False,False,False,False,False,False,False,False,False,False,False,False
freq,221,221,221,221,221,221,221,221,221,221,226,165


In [36]:
# Keep only rows that are complete after the parking merge. This removes parking codes that
# matched no station and every station that has no car park row.
# NOTE(review): missing bike storage was filled with 0 earlier, whereas stations with missing
# parking are dropped here — confirm the asymmetry is intended.
full_data.dropna(inplace=True)
full_data.isna().describe()

Unnamed: 0,STATION_NAME,LATITUDE,LONGITUDE,NUMBER_OF_ROUTES,BIKE_CAPACITY,Cages,Hoops,Locker,Other,Parkiteer,STATION_CODE,PARKING_CAPACITY
count,160,160,160,160,160,160,160,160,160,160,160,160
unique,1,1,1,1,1,1,1,1,1,1,1,1
top,False,False,False,False,False,False,False,False,False,False,False,False
freq,160,160,160,160,160,160,160,160,160,160,160,160


In [37]:
# Write the merged dataset (the row index is written as well).
# NOTE(review): saved to the working directory rather than under 'data/' like the station,
# parking and bike tables — confirm this location is intended.
full_data.to_csv('FULL_DATA.csv')
full_data.head(5)

Unnamed: 0,STATION_NAME,LATITUDE,LONGITUDE,NUMBER_OF_ROUTES,BIKE_CAPACITY,Cages,Hoops,Locker,Other,Parkiteer,STATION_CODE,PARKING_CAPACITY
0,sunbury,-37.579091,144.727319,1.0,25.0,0.0,0.0,0.0,0.0,1.0,SUY,565.0
1,diggers rest,-37.627017,144.719922,1.0,25.0,0.0,0.0,0.0,0.0,1.0,DIT,548.0
2,stony point,-38.374235,145.221837,1.0,0.0,0.0,0.0,0.0,0.0,0.0,STY,8.0
3,crib point,-38.366123,145.204043,1.0,0.0,0.0,0.0,0.0,0.0,0.0,CPT,20.0
4,morradoo,-38.354033,145.189602,1.0,0.0,0.0,0.0,0.0,0.0,0.0,MRO,10.0
