In [176]:
import pandas as pd
import os, sys
import numpy as np


# Build the path to the stops file relative to the notebook's working directory.
# The previous code passed the *string* "__file__" to os.path.abspath(); that is not the
# __file__ variable (which a notebook kernel does not define), so it only ever resolved to
# "<cwd>/__file__" and dirname() stripped it back to the working directory. Asking for the
# working directory directly yields the same paths without the misleading detour.
file_dir = os.getcwd()
parent_dir = os.path.dirname(file_dir)
newPath = os.path.join(parent_dir, 'data/gtdb/stops.txt')

# Load the comma-separated stops file into a DataFrame
stopsstatic = pd.read_csv(newPath, sep=',')

# Remove pandas' display limits so no column or row is elided when a frame is shown
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Show the stops table that was just loaded
stopsstatic


Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon
0,8220B007612,Davenport Hotel Merrion Street,53.341347,-6.250529
1,8220DB000002,"Rotunda, Parnell Square West",53.352244,-6.263723
2,8220DB000003,"Rotunda, Granby Place",53.352309,-6.263811
3,8220DB000004,"Rotunda, Rotunda Hospital",53.352575,-6.264175
4,8220DB000006,"Rotunda, Saint Martin's Chapel",53.352749,-6.264454
5,8220DB000007,"Rotunda, Rotunda Hospital",53.352841,-6.26457
6,8220DB000008,"Rotunda, Parnell Square West",53.353272,-6.265184
7,8220DB000010,"Rotunda, Parnell Square West",53.353392,-6.265389
8,8220DB000011,"Phibsborough, Upper Dorset St (St.Joseph's Par...",53.356958,-6.264628
9,8220DB000012,"Phibsborough, Upper Dorset St (St.Joseph's Par...",53.356805,-6.264589


In [177]:
# Size of the stops table as a (rows, columns) tuple
stopsstatic.shape

(4691, 4)

In [178]:
# Data type of each column
stopsstatic.dtypes

stop_id       object
stop_name     object
stop_lat     float64
stop_lon     float64
dtype: object

In [179]:
# Count the missing (null) values in every column
stopsstatic.isnull().sum()

stop_id      0
stop_name    0
stop_lat     0
stop_lon     0
dtype: int64

In [180]:
# Check for irregular cardinality by listing the distinct values of each feature.
# Series.unique() is used instead of pd.unique(series.ravel()): Series.ravel() is
# deprecated in recent pandas, and flattening a one-dimensional Series is a no-op anyway.
# stop_id is printed in order of first appearance; the other features are sorted.
print("Unique values for:\n- Stop_id:", stopsstatic['stop_id'].unique())
print("\n- stop_name:", sorted(stopsstatic['stop_name'].unique()))
print("\n- stop_lat:", sorted(stopsstatic['stop_lat'].unique()))
print("\n- stop_lon:", sorted(stopsstatic['stop_lon'].unique()))


Unique values for:
- Stop_id: ['8220B007612' '8220DB000002' '8220DB000003' ... 'gen:57102:3606:0:1'
 'gen:57102:3607:0:1' 'gen:57102:5245:0:1']

- stop_name: ['Abberley', 'Abbey Park', 'Abbeyvale Brackenstown Road', 'Adamstown (Dublin), Finnstown Abbey Estate', 'Adamstown (South Dublin), Outer Ring Road', 'Adamstown (Wexford), Hotel', 'Adamstown, Adamstown Avenue', 'Adamstown, Castlegate Estate', 'Adamstown, Castlegate Way', 'Adamstown, Grange Manor Close', 'Adamstown, Lock Road', 'Adamstown, Outside Train Station', 'Adamstown, Rosberry Avenue', 'Airside, Veterinary Hospital', 'All Saints Road (Sybill Hill Ave)', 'Amiens Street Junction Upper Sheriff Str', 'Applewood Glen Ellan Road Swords', 'Arbour Hill, Arbour Place', 'Arbour Hill, Aughrim Street', 'Arbour Hill, Brunswick Street', 'Arbour Hill, Drumalee Estate', 'Arbour Hill, Military Hospital', 'Arbour Hill, Montpelier Gardens', 'Arbour Hill, Phoenix Park Gate', 'Arbour Hill, Prussia Street', 'Arbour Hill, Stanhope Street Convent', 

In [181]:
# Flag every row that is an exact copy of a row appearing earlier in the frame
duplicate_mask = stopsstatic.duplicated()
stopsstatic["is_duplicate"] = duplicate_mask

stopsstatic

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,is_duplicate
0,8220B007612,Davenport Hotel Merrion Street,53.341347,-6.250529,False
1,8220DB000002,"Rotunda, Parnell Square West",53.352244,-6.263723,False
2,8220DB000003,"Rotunda, Granby Place",53.352309,-6.263811,False
3,8220DB000004,"Rotunda, Rotunda Hospital",53.352575,-6.264175,False
4,8220DB000006,"Rotunda, Saint Martin's Chapel",53.352749,-6.264454,False
5,8220DB000007,"Rotunda, Rotunda Hospital",53.352841,-6.26457,False
6,8220DB000008,"Rotunda, Parnell Square West",53.353272,-6.265184,False
7,8220DB000010,"Rotunda, Parnell Square West",53.353392,-6.265389,False
8,8220DB000011,"Phibsborough, Upper Dorset St (St.Joseph's Par...",53.356958,-6.264628,False
9,8220DB000012,"Phibsborough, Upper Dorset St (St.Joseph's Par...",53.356805,-6.264589,False


In [182]:
# Cardinality of stop_id: the number of distinct values, with a missing value
# (if any) counted as one distinct value
stopsstatic['stop_id'].nunique(dropna=False)

4691

In [183]:
# Number of distinct stop names (a missing value, if any, counts as one)
stopsstatic['stop_name'].nunique(dropna=False)

3264

In [184]:
# Number of distinct longitudes (a missing value, if any, counts as one)
stopsstatic['stop_lon'].nunique(dropna=False)

4690

In [185]:
# Number of distinct latitudes (a missing value, if any, counts as one)
stopsstatic['stop_lat'].nunique(dropna=False)

4690

In [186]:
# Number of distinct values in the duplicate flag; 1 means every row carries the same flag
stopsstatic['is_duplicate'].nunique(dropna=False)

1

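This means that every stop_id is unique (4691 distinct values for 4691 rows). Some bus stop names are repeated (only 3264 distinct names), which happens where several stops lie on the same road. Finally, is_duplicate takes a single value (False), so no row is duplicated in its entirety.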

In [187]:
# The duplicate flag was only needed for the check above; remove the helper column
stopsstatic = stopsstatic.drop(columns='is_duplicate')

In [188]:
# Preview the first rows to confirm the helper column is gone
stopsstatic.head()

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon
0,8220B007612,Davenport Hotel Merrion Street,53.341347,-6.250529
1,8220DB000002,"Rotunda, Parnell Square West",53.352244,-6.263723
2,8220DB000003,"Rotunda, Granby Place",53.352309,-6.263811
3,8220DB000004,"Rotunda, Rotunda Hospital",53.352575,-6.264175
4,8220DB000006,"Rotunda, Saint Martin's Chapel",53.352749,-6.264454


In [189]:
# Add an empty (all-NaN) true_stop_id column as a placeholder
stopsstatic = stopsstatic.assign(true_stop_id=np.nan)

In [190]:
# Preview the first rows, now including the true_stop_id column
stopsstatic.head()

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,true_stop_id
0,8220B007612,Davenport Hotel Merrion Street,53.341347,-6.250529,
1,8220DB000002,"Rotunda, Parnell Square West",53.352244,-6.263723,
2,8220DB000003,"Rotunda, Granby Place",53.352309,-6.263811,
3,8220DB000004,"Rotunda, Rotunda Hospital",53.352575,-6.264175,
4,8220DB000006,"Rotunda, Saint Martin's Chapel",53.352749,-6.264454,


In [191]:
# Read a single scalar out of the frame: the stop_id of the first row
mystring = stopsstatic['stop_id'].iloc[0]

print(mystring)

8220B007612


In [192]:
# Read a second sample scalar: the stop_id of the third row (position 2)
mystring2 = stopsstatic['stop_id'].iloc[2]

print(mystring2)

8220DB000003


In [193]:
# Keep only the last four characters of the first sample stop_id
newstring=mystring[-4:]
print(newstring)

7612


In [194]:
# Keep only the last four characters of the second sample stop_id
newstring2=mystring2[-4:]
print(newstring2)

0003


In [195]:
def lastFourDigits(stop_id):
    """Return the final four characters of ``stop_id``.

    The slice is purely positional, so despite the name the result is not
    guaranteed to consist of digits. An id shorter than four characters is
    returned unchanged.
    """
    return stop_id[-4:]

# Derive true_stop_id from stop_id alone. Series.map visits only that column, whereas
# DataFrame.apply(axis=1) hands a whole row to the callback just to read one field.
stopsstatic['true_stop_id'] = stopsstatic['stop_id'].map(lastFourDigits)

stopsstatic.head()

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,true_stop_id
0,8220B007612,Davenport Hotel Merrion Street,53.341347,-6.250529,7612
1,8220DB000002,"Rotunda, Parnell Square West",53.352244,-6.263723,2
2,8220DB000003,"Rotunda, Granby Place",53.352309,-6.263811,3
3,8220DB000004,"Rotunda, Rotunda Hospital",53.352575,-6.264175,4
4,8220DB000006,"Rotunda, Saint Martin's Chapel",53.352749,-6.264454,6


In [196]:
# Show the stops table with the derived true_stop_id column
stopsstatic


Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,true_stop_id
0,8220B007612,Davenport Hotel Merrion Street,53.341347,-6.250529,7612
1,8220DB000002,"Rotunda, Parnell Square West",53.352244,-6.263723,0002
2,8220DB000003,"Rotunda, Granby Place",53.352309,-6.263811,0003
3,8220DB000004,"Rotunda, Rotunda Hospital",53.352575,-6.264175,0004
4,8220DB000006,"Rotunda, Saint Martin's Chapel",53.352749,-6.264454,0006
5,8220DB000007,"Rotunda, Rotunda Hospital",53.352841,-6.26457,0007
6,8220DB000008,"Rotunda, Parnell Square West",53.353272,-6.265184,0008
7,8220DB000010,"Rotunda, Parnell Square West",53.353392,-6.265389,0010
8,8220DB000011,"Phibsborough, Upper Dorset St (St.Joseph's Par...",53.356958,-6.264628,0011
9,8220DB000012,"Phibsborough, Upper Dorset St (St.Joseph's Par...",53.356805,-6.264589,0012


In [197]:

# Drop the trailing row only when it is a generated "gen:" virtual stop. The last four
# characters of such an id (":0:1") are not numeric, so the row cannot take part in the
# int64 conversion of true_stop_id. Testing the id instead of hard-coding position 4690
# keeps the cell independent of the exact row count and harmless to re-run once no
# virtual stop is left at the end of the frame.
if not stopsstatic.empty and str(stopsstatic['stop_id'].iloc[-1]).startswith('gen:'):
    stopsstatic = stopsstatic.drop(stopsstatic.index[-1])


In [199]:
# Inspect the last rows to see which stops now sit at the end of the frame
stopsstatic.tail()

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,true_stop_id
4685,8350DB007574,Balywaltrim,53.182519,-6.130072,7574
4686,gen:57102:3542:0:1,"Near Tara Station, Virtual Stop Townsend Street",53.346602,-6.257058,:0:1
4687,gen:57102:3543:0:1,"Near Tara Station, Virtual Stop Townsend St",53.346512,-6.257062,:0:1
4688,gen:57102:3606:0:1,"Glenageary, Barnhill Rd",53.276828,-6.119329,:0:1
4689,gen:57102:3607:0:1,"Dublin City, Poolbeg Street",53.346822,-6.256779,:0:1


In [200]:
# Drop the trailing row only when it is a generated "gen:" virtual stop, whose
# non-numeric ":0:1" suffix cannot be converted to an integer true_stop_id. Testing the
# id instead of hard-coding position 4689 keeps the cell independent of the exact row
# count and harmless to re-run once no virtual stop is left at the end of the frame.
if not stopsstatic.empty and str(stopsstatic['stop_id'].iloc[-1]).startswith('gen:'):
    stopsstatic = stopsstatic.drop(stopsstatic.index[-1])

In [201]:
# Inspect the last rows again after the removal
stopsstatic.tail()

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,true_stop_id
4684,8350DB007522,Boghall Road Oldcourt,53.188131,-6.118873,7522
4685,8350DB007574,Balywaltrim,53.182519,-6.130072,7574
4686,gen:57102:3542:0:1,"Near Tara Station, Virtual Stop Townsend Street",53.346602,-6.257058,:0:1
4687,gen:57102:3543:0:1,"Near Tara Station, Virtual Stop Townsend St",53.346512,-6.257062,:0:1
4688,gen:57102:3606:0:1,"Glenageary, Barnhill Rd",53.276828,-6.119329,:0:1


In [202]:
# Drop the trailing row only when it is a generated "gen:" virtual stop, whose
# non-numeric ":0:1" suffix cannot be converted to an integer true_stop_id. Testing the
# id instead of hard-coding position 4688 keeps the cell independent of the exact row
# count and harmless to re-run once no virtual stop is left at the end of the frame.
if not stopsstatic.empty and str(stopsstatic['stop_id'].iloc[-1]).startswith('gen:'):
    stopsstatic = stopsstatic.drop(stopsstatic.index[-1])

In [203]:
# Inspect the last rows again after the removal
stopsstatic.tail()

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,true_stop_id
4683,8350DB007462,Charlesland Road (Seaborne View Apts),53.128639,-6.062503,7462
4684,8350DB007522,Boghall Road Oldcourt,53.188131,-6.118873,7522
4685,8350DB007574,Balywaltrim,53.182519,-6.130072,7574
4686,gen:57102:3542:0:1,"Near Tara Station, Virtual Stop Townsend Street",53.346602,-6.257058,:0:1
4687,gen:57102:3543:0:1,"Near Tara Station, Virtual Stop Townsend St",53.346512,-6.257062,:0:1


In [205]:
# Drop the trailing row only when it is a generated "gen:" virtual stop, whose
# non-numeric ":0:1" suffix cannot be converted to an integer true_stop_id. Testing the
# id instead of hard-coding position 4687 keeps the cell independent of the exact row
# count and harmless to re-run once no virtual stop is left at the end of the frame.
if not stopsstatic.empty and str(stopsstatic['stop_id'].iloc[-1]).startswith('gen:'):
    stopsstatic = stopsstatic.drop(stopsstatic.index[-1])

In [206]:
# Inspect the last rows again after the removal
stopsstatic.tail()

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,true_stop_id
4682,8350DB007461,Charlesland Road Charlesland Shopping Ctr,53.128806,-6.062809,7461
4683,8350DB007462,Charlesland Road (Seaborne View Apts),53.128639,-6.062503,7462
4684,8350DB007522,Boghall Road Oldcourt,53.188131,-6.118873,7522
4685,8350DB007574,Balywaltrim,53.182519,-6.130072,7574
4686,gen:57102:3542:0:1,"Near Tara Station, Virtual Stop Townsend Street",53.346602,-6.257058,:0:1


In [207]:
# Drop the trailing row only when it is a generated "gen:" virtual stop, whose
# non-numeric ":0:1" suffix cannot be converted to an integer true_stop_id. Testing the
# id instead of hard-coding position 4686 keeps the cell independent of the exact row
# count and harmless to re-run once no virtual stop is left at the end of the frame.
if not stopsstatic.empty and str(stopsstatic['stop_id'].iloc[-1]).startswith('gen:'):
    stopsstatic = stopsstatic.drop(stopsstatic.index[-1])

In [209]:
# Inspect the last rows to confirm no "gen:" stop remains at the end of the frame
stopsstatic.tail()

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,true_stop_id
4681,8350DB007369,"Bray, Upper Dargle Road #2",53.203539,-6.121791,7369
4682,8350DB007461,Charlesland Road Charlesland Shopping Ctr,53.128806,-6.062809,7461
4683,8350DB007462,Charlesland Road (Seaborne View Apts),53.128639,-6.062503,7462
4684,8350DB007522,Boghall Road Oldcourt,53.188131,-6.118873,7522
4685,8350DB007574,Balywaltrim,53.182519,-6.130072,7574


In [210]:
# Cast true_stop_id from object (string) to int64, then list the resulting column dtypes
stopsstatic = stopsstatic.astype({'true_stop_id': 'int64'})
stopsstatic.dtypes

stop_id          object
stop_name        object
stop_lat        float64
stop_lon        float64
true_stop_id      int64
dtype: object

In [None]:
# Show the cleaned stops table
stopsstatic

In [213]:
# Write the cleaned stops table to disk; the row index is written as the first,
# unnamed column (pandas' default, spelled out here)
stopsstatic.to_csv('truestops.csv', index=True)