In [1]:
import pandas as pd
import mysql.connector
import numpy as np

import sqlalchemy
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, Float, String, DateTime

# hide ipykernel warnings 
import warnings
warnings.filterwarnings('ignore')

# Connect to database
db = mysql.connector.connect(
    host=myhost,
    user=myuser,
    password=mypasswd,
    database=mydatabase
)

cur = db.cursor()

In [2]:
engine = create_engine(f"mysql+mysqlconnector://{myuser}:{mypasswd}@{myhost}:3306/{mydatabase}", echo=True)
connection = engine.connect()

2021-08-04 23:00:58,479 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'sql_mode'
2021-08-04 23:00:58,481 INFO sqlalchemy.engine.Engine [raw sql] {}
2021-08-04 23:00:58,489 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'lower_case_table_names'
2021-08-04 23:00:58,490 INFO sqlalchemy.engine.Engine [generated in 0.00355s] {}
2021-08-04 23:00:58,504 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2021-08-04 23:00:58,505 INFO sqlalchemy.engine.Engine [raw sql] {}


<h4>Create list of lines in 2018 data</h4>

In [3]:
trips = pd.read_csv("rt_trips_DB_2018.txt", sep=";")

In [4]:
all_lines = trips.LINEID.unique().tolist()
all_lines.sort()

In [5]:
print(all_lines)

['1', '102', '104', '11', '111', '114', '116', '118', '120', '122', '123', '13', '130', '14', '140', '142', '145', '14C', '15', '150', '151', '15A', '15B', '15D', '16', '161', '16C', '16D', '17', '17A', '18', '184', '185', '220', '236', '238', '239', '25', '25A', '25B', '25D', '25X', '26', '27', '270', '27A', '27B', '27X', '29A', '31', '31A', '31B', '31D', '32', '32X', '33', '33A', '33B', '33D', '33E', '33X', '37', '38', '38A', '38B', '38D', '39', '39A', '39X', '4', '40', '40B', '40D', '40E', '41', '41A', '41B', '41C', '41D', '41X', '42', '42D', '43', '44', '44B', '45A', '46A', '46E', '47', '49', '51D', '51X', '53', '54A', '56A', '59', '61', '63', '65', '65B', '66', '66A', '66B', '66X', '67', '67X', '68', '68A', '68X', '69', '69X', '7', '70', '70D', '75', '76', '76A', '77A', '77X', '79', '79A', '7A', '7B', '7D', '83', '83A', '84', '84A', '84X', '9']


<h4>First fill names with most up-to-date GTFS data</h4>

In [6]:
gtfs_stops = pd.read_csv("stops.txt")

In [7]:
gtfs_stops.head()

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon
0,8220DB000002,"Parnell Square West, stop 2",53.352244,-6.263723
1,8220DB000003,"Parnell Square West, stop 3",53.352309,-6.263811
2,8220DB000004,"Parnell Square West, stop 4",53.352575,-6.264175
3,8220DB000006,"Parnell Square West, stop 6",53.352749,-6.264454
4,8220DB000007,"Parnell Square West, stop 7",53.352841,-6.26457


In [8]:
gtfs_stops.shape

(4208, 4)

In [9]:
# Check for duplicates
gtfs_stops.drop_duplicates(inplace=True)

In [10]:
gtfs_stops.shape

(4208, 4)

In [11]:
# Create stoppointid feature
gtfs_stops["stoppointid"] = gtfs_stops.apply(lambda row: row.stop_id[-4:], axis=1)
gtfs_stops["stoppointid"] = gtfs_stops.apply(lambda row: row.stoppointid.lstrip("0"), axis=1)
gtfs_stops["stoppointid"] = gtfs_stops["stoppointid"].astype("int64")

In [12]:
gtfs_stops.head()

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,stoppointid
0,8220DB000002,"Parnell Square West, stop 2",53.352244,-6.263723,2
1,8220DB000003,"Parnell Square West, stop 3",53.352309,-6.263811,3
2,8220DB000004,"Parnell Square West, stop 4",53.352575,-6.264175,4
3,8220DB000006,"Parnell Square West, stop 6",53.352749,-6.264454,6
4,8220DB000007,"Parnell Square West, stop 7",53.352841,-6.26457,7


In [13]:
gtfs_stops.shape

(4208, 5)

In [14]:
# Create list of unique stoppointids that appear in GTFS data
gtfs_stop_list = gtfs_stops.stoppointid.unique().tolist()

In [15]:
print(gtfs_stop_list)

[2, 3, 4, 6, 7, 8, 10, 11, 12, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 63, 64, 65, 66, 68, 69, 71, 72, 73, 74, 75, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 90, 91, 92, 93, 94, 102, 104, 105, 107, 110, 112, 113, 114, 115, 117, 118, 119, 126, 127, 129, 130, 131, 133, 134, 136, 137, 138, 141, 142, 143, 146, 147, 148, 149, 150, 151, 153, 154, 155, 156, 157, 158, 159, 164, 165, 166, 167, 168, 169, 173, 174, 175, 176, 177, 178, 179, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 220, 228, 233, 234, 235, 236, 237, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 261, 262, 263, 264, 265, 270, 271, 272, 273, 274, 276, 277, 278, 279, 281, 284, 286, 288, 289, 292, 297, 298, 299, 300, 301, 302, 303, 

In [16]:
# Create new dataframe to hold 2018 stops and matched names
all_stops_df = pd.DataFrame(columns=["lineid", "stoppointid", "stop_name"])

In [17]:
all_stops_df

Unnamed: 0,lineid,stoppointid,stop_name


In [18]:
# For each line in 2018 data add all stops to all_stops_df
for line in all_lines:
    line_df = pd.read_csv("stoptimes/" + line + "_stops.csv")
    stops_on_line = line_df.STOPPOINTID.unique().tolist()
    for stop in stops_on_line:
        if stop in gtfs_stop_list:
            stop_name = gtfs_stops.loc[gtfs_stops["stoppointid"] == stop].iloc[0]["stop_name"]
        else:
            stop_name = np.nan
        new_row = {"lineid":line, "stoppointid":stop, "stop_name":stop_name}
        all_stops_df = all_stops_df.append(new_row, ignore_index=True)
    print("Line", line, "complete")

Line 1 complete
Line 102 complete
Line 104 complete
Line 11 complete
Line 111 complete
Line 114 complete
Line 116 complete
Line 118 complete
Line 120 complete
Line 122 complete
Line 123 complete
Line 13 complete
Line 130 complete
Line 14 complete
Line 140 complete
Line 142 complete
Line 145 complete
Line 14C complete
Line 15 complete
Line 150 complete
Line 151 complete
Line 15A complete
Line 15B complete
Line 15D complete
Line 16 complete
Line 161 complete
Line 16C complete
Line 16D complete
Line 17 complete
Line 17A complete
Line 18 complete
Line 184 complete
Line 185 complete
Line 220 complete
Line 236 complete
Line 238 complete
Line 239 complete
Line 25 complete
Line 25A complete
Line 25B complete
Line 25D complete
Line 25X complete
Line 26 complete
Line 27 complete
Line 270 complete
Line 27A complete
Line 27B complete
Line 27X complete
Line 29A complete
Line 31 complete
Line 31A complete
Line 31B complete
Line 31D complete
Line 32 complete
Line 32X complete
Line 33 complete
Line 33

In [21]:
all_stops_df.shape

(13142, 3)

In [24]:
all_stops_df.head()

Unnamed: 0,lineid,stoppointid,stop_name
0,1,226,"Shanard Avenue, stop 226"
1,1,225,"Shanowen Avenue, stop 225"
2,1,224,"Shanard Road, stop 224"
3,1,223,"Shanliss Drive, stop 223"
4,1,222,"Shanliss Road, stop 222"


In [22]:
all_stops_df.drop_duplicates(inplace=True)

In [33]:
all_stops_df.shape

(13142, 3)

In [25]:
all_stops_df.isna().sum()

lineid           0
stoppointid      0
stop_name      856
dtype: int64

<h4>Create copy of all_stops_df and clean stop names</h4>

In [28]:
# Create copy of all_stops_df to add to 
all_stops_df_new = all_stops_df.copy()

In [29]:
all_stops_df_new.shape

(13142, 3)

In [30]:
all_stops_df_new.head()

Unnamed: 0,lineid,stoppointid,stop_name
0,1,226,"Shanard Avenue, stop 226"
1,1,225,"Shanowen Avenue, stop 225"
2,1,224,"Shanard Road, stop 224"
3,1,223,"Shanliss Drive, stop 223"
4,1,222,"Shanliss Road, stop 222"


In [31]:
# Drop these rows from all_stops_df_new, they will be added back later of names can be found
all_stops_df_new.dropna(inplace=True)

In [32]:
all_stops_df_new.shape

(12286, 3)

In [46]:
odd_names = pd.DataFrame(columns=all_stops_df_new.columns)
for index, row in all_stops_df_new.iterrows():
    if row["stop_name"].split(" ")[-2] != "stop":
        odd_names = odd_names.append(row)
        all_stops_df_new.drop(index, inplace=True)

In [47]:
odd_names

Unnamed: 0,lineid,stoppointid,stop_name
1441,140,7491,Virtual Stop 1
3360,220,4377,"Sycamore Road, Stop 4377"
7388,40E,7676,"Northwest, 7676"
7914,41X,7654,"Knocksedan Drive,stop 7654"
8650,46A,7491,Virtual Stop 1
8740,46E,7491,Virtual Stop 1
9178,53,286,"Cathal Brugha Street, Stop No. 286"
13012,9,4377,"Sycamore Road, Stop 4377"


In [48]:
all_stops_df_new.shape

(12278, 3)

In [54]:
test = []
for index, row in all_stops_df_new.iterrows():
    test.append(row["stop_name"].split("stop")[-1])

In [55]:
print(test)

[' 226', ' 225', ' 224', ' 223', ' 222', ' 221', ' 220', ' 1620', ' 205', ' 204', ' 203', ' 85', ' 7602', ' 21', ' 18', ' 17', ' 15', ' 14', ' 12', ' 10', ' 7615', ' 278', ' 319', ' 400', ' 399', ' 398', ' 397', ' 395', ' 392', ' 391', ' 371', ' 393', ' 389', ' 388', ' 387', ' 381', ' 396', ' 19', ' 374', ' 373', ' 372', ' 390', ' 357', ' 356', ' 355', ' 354', ' 352', ' 350', ' 271', ' 265', ' 52', ' 51', ' 49', ' 47', ' 45', ' 7603', ' 44', ' 119', ' 4432', ' 214', ' 1642', ' 1641', ' 230', ' 227', ' 229', ' 231', ' 340', ' 213', ' 228', ' 48', ' 351', ' 353', ' 46', ' 1073', ' 3641', ' 3643', ' 3583', ' 3584', ' 3585', ' 3586', ' 3587', ' 3588', ' 3590', ' 3589', ' 3591', ' 3592', ' 3605', ' 3606', ' 3607', ' 3608', ' 3598', ' 4465', ' 945', ' 947', ' 928', ' 3642', ' 3609', ' 944', ' 948', ' 3636', ' 905', ' 4331', ' 4339', ' 6010', ' 3656', ' 3653', ' 6054', ' 3691', ' 3701', ' 3704', ' 3705', ' 733', ' 913', ' 3663', ' 7348', ' 938', ' 940', ' 941', ' 942', ' 943', ' 4503', ' 3599

In [60]:
test_new = []
for item in test:
    test_new.append(item.lstrip())

In [61]:
print(test_new)

['226', '225', '224', '223', '222', '221', '220', '1620', '205', '204', '203', '85', '7602', '21', '18', '17', '15', '14', '12', '10', '7615', '278', '319', '400', '399', '398', '397', '395', '392', '391', '371', '393', '389', '388', '387', '381', '396', '19', '374', '373', '372', '390', '357', '356', '355', '354', '352', '350', '271', '265', '52', '51', '49', '47', '45', '7603', '44', '119', '4432', '214', '1642', '1641', '230', '227', '229', '231', '340', '213', '228', '48', '351', '353', '46', '1073', '3641', '3643', '3583', '3584', '3585', '3586', '3587', '3588', '3590', '3589', '3591', '3592', '3605', '3606', '3607', '3608', '3598', '4465', '945', '947', '928', '3642', '3609', '944', '948', '3636', '905', '4331', '4339', '6010', '3656', '3653', '6054', '3691', '3701', '3704', '3705', '733', '913', '3663', '7348', '938', '940', '941', '942', '943', '4503', '3599', '3610', '3611', '6135', '3612', '3613', '3615', '3616', '3618', '3619', '3620', '3622', '4387', '3635', '4330', '3572',

In [64]:
for item in test_new:
    if not item.isdigit():
        print(item)
print("Finished check")

Finished check


In [109]:
stop_names_cleaned = pd.DataFrame(columns=all_stops_df_new.columns)
for index, row in all_stops_df_new.iterrows():
    lineid = row["lineid"]
    stoppointid = row["stoppointid"]
    stop_name = row["stop_name"].split("stop")[0]
    new_row = {"lineid":lineid, "stoppointid":stoppointid, "stop_name":stop_name}
    stop_names_cleaned = stop_names_cleaned.append(new_row, ignore_index=True)

In [110]:
stop_names_cleaned.head()

Unnamed: 0,lineid,stoppointid,stop_name
0,1,226,"Shanard Avenue,"
1,1,225,"Shanowen Avenue,"
2,1,224,"Shanard Road,"
3,1,223,"Shanliss Drive,"
4,1,222,"Shanliss Road,"


In [111]:
stop_names_cleaned["stop_name"] = stop_names_cleaned.apply(lambda row: row.stop_name.strip(), axis=1)

In [112]:
stop_names_cleaned.head()

Unnamed: 0,lineid,stoppointid,stop_name
0,1,226,"Shanard Avenue,"
1,1,225,"Shanowen Avenue,"
2,1,224,"Shanard Road,"
3,1,223,"Shanliss Drive,"
4,1,222,"Shanliss Road,"


In [113]:
stop_names_cleaned["stop_name"] = stop_names_cleaned.apply(lambda row: row.stop_name.strip(","), axis=1)

In [114]:
stop_names_cleaned.head()

Unnamed: 0,lineid,stoppointid,stop_name
0,1,226,Shanard Avenue
1,1,225,Shanowen Avenue
2,1,224,Shanard Road
3,1,223,Shanliss Drive
4,1,222,Shanliss Road


In [115]:
stop_names_cleaned.shape

(12278, 3)

In [116]:
odd_names

Unnamed: 0,lineid,stoppointid,stop_name
1441,140,7491,Virtual Stop 1
3360,220,4377,"Sycamore Road, Stop 4377"
7388,40E,7676,"Northwest, 7676"
7914,41X,7654,"Knocksedan Drive,stop 7654"
8650,46A,7491,Virtual Stop 1
8740,46E,7491,Virtual Stop 1
9178,53,286,"Cathal Brugha Street, Stop No. 286"
13012,9,4377,"Sycamore Road, Stop 4377"


<p>Not the most elegant approach but manually dealing with odd names...</p>

In [117]:
stop_names_cleaned.dtypes

lineid         object
stoppointid    object
stop_name      object
dtype: object

In [118]:
list_1 = [0, 4, 5]

In [119]:
for index in list_1:
    new_row = odd_names.iloc[index]
    stop_names_cleaned = stop_names_cleaned.append(new_row, ignore_index=True)

In [120]:
stop_names_cleaned.tail()

Unnamed: 0,lineid,stoppointid,stop_name
12276,9,189,Fire Station
12277,9,2458,St James's Road
12278,140,7491,Virtual Stop 1
12279,46A,7491,Virtual Stop 1
12280,46E,7491,Virtual Stop 1


In [121]:
list_2 = [1, 2, 3, 6, 7]

In [122]:
for index in list_2:
    row = odd_names.iloc[index]
    new_row = {"lineid":row["lineid"], "stoppointid":row["stoppointid"], "stop_name":row["stop_name"].split(",")[0]}
    stop_names_cleaned = stop_names_cleaned.append(new_row, ignore_index=True)

In [123]:
stop_names_cleaned.tail()

Unnamed: 0,lineid,stoppointid,stop_name
12281,220,4377,Sycamore Road
12282,40E,7676,Northwest
12283,41X,7654,Knocksedan Drive
12284,53,286,Cathal Brugha Street
12285,9,4377,Sycamore Road


In [131]:
stop_names_cleaned.shape

(12286, 3)

<h4>Create list of stop numbers with missing names</h4>

In [124]:
names_missing = all_stops_df.loc[all_stops_df["stop_name"].isna()]

In [125]:
names_missing.shape

(856, 3)

In [126]:
names_missing.head()

Unnamed: 0,lineid,stoppointid,stop_name
35,1,384,
36,1,383,
37,1,4451,
40,1,385,
41,1,382,


In [127]:
# names_missing["stoppointid"] = names_missing["stoppointid"].astype("int64")

In [128]:
names_missing_list = names_missing.stoppointid.unique().tolist()

In [129]:
print(names_missing_list)

[384, 383, 4451, 385, 382, 380, 378, 376, 375, 377, 2804, 949, 951, 953, 4381, 952, 1039, 1040, 935, 815, 816, 936, 788, 3078, 3087, 1050, 1041, 4791, 4473, 4597, 1645, 1646, 1652, 5141, 1651, 4784, 1650, 4792, 1644, 1605, 4474, 4790, 674, 4389, 4390, 4785, 1766, 7129, 662, 663, 7607, 7576, 7220, 401, 6103, 2037, 3046, 3058, 6082, 4983, 7648, 7647, 3201, 3200, 3199, 3198, 3194, 3193, 3192, 3191, 3190, 3189, 6196, 6201, 6200, 3195, 3187, 3188, 6199, 6198, 6197, 3182, 4457, 3179, 3178, 3177, 3176, 3175, 3174, 3173, 3171, 3169, 3168, 3167, 3166, 3165, 3183, 7669, 4529, 7589, 7164, 116, 293, 7490, 7659, 7617, 6044, 2826, 7638, 4844, 4533, 6000, 5190, 7558, 7216, 7620, 7621, 7580, 4337, 7447, 4582, 4332, 7448, 4391, 765, 867, 869, 873, 874, 2464, 3360, 1392, 1394, 1395, 3362, 870, 3361, 863, 864, 865, 866, 2052, 462, 464, 465, 466, 467, 468, 3353, 3354, 1400, 1391, 3357, 3358, 2449, 2451, 3083, 431, 432, 433, 434, 430, 7298, 2294, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1006, 1007, 100

In [130]:
all_stops_df_new.shape

(12278, 3)

<h4>Next attempt to fill in missing stop names with data from May 2021</h4>

In [132]:
# Read in May 2021 data
may_df = pd.read_csv("May21_stops_all.csv")

In [133]:
may_df.head()

Unnamed: 0,PlateCode,ShortCommonName_en,RouteData
0,100021,Strand Street,100
1,100281,St Paul's Cresent,"100, 101, 101x"
2,132621,Laytown Road,"100, 101, 101x, 912"
3,135951,Drogheda Bus Station,"100, 100x, 101, 101x, 105, 163, 168, 182, 182..."
4,131631,Dunleer,"100, 100x"


In [134]:
may_stops_list = may_df.PlateCode.unique().tolist()

In [135]:
print(may_stops_list[0:3])

[100021, 100281, 132621]


In [136]:
matched_from_may = pd.DataFrame(columns=all_stops_df_new.columns)

for index, row in names_missing.iterrows():
    lineid = row["lineid"]
    stop_num = row["stoppointid"]
    if stop_num in may_stops_list:
        stop_name = may_df.loc[may_df["PlateCode"] == stop_num].iloc[0]["ShortCommonName_en"]
        new_row = {"lineid":lineid, "stoppointid":stop_num, "stop_name":stop_name}
        matched_from_may = matched_from_may.append(new_row, ignore_index=True)
        names_missing.drop(index, inplace=True)
        print(new_row)

{'lineid': '1', 'stoppointid': 385, 'stop_name': 'Newgrove Avenue'}
{'lineid': '1', 'stoppointid': 375, 'stop_name': 'Dromard Terrace'}
{'lineid': '102', 'stoppointid': 949, 'stop_name': 'The Coast'}
{'lineid': '102', 'stoppointid': 951, 'stop_name': 'Strand Road'}
{'lineid': '102', 'stoppointid': 953, 'stop_name': 'Station Road'}
{'lineid': '102', 'stoppointid': 4381, 'stop_name': 'Sutton Station'}
{'lineid': '102', 'stoppointid': 952, 'stop_name': 'Burrowfield Road'}
{'lineid': '102', 'stoppointid': 1039, 'stop_name': 'Seamount View'}
{'lineid': '102', 'stoppointid': 1040, 'stop_name': 'Ashley Drive'}
{'lineid': '102', 'stoppointid': 935, 'stop_name': 'Station Road'}
{'lineid': '102', 'stoppointid': 815, 'stop_name': 'Forrest Little Golf'}
{'lineid': '102', 'stoppointid': 936, 'stop_name': 'Strand Road'}
{'lineid': '102', 'stoppointid': 788, 'stop_name': 'The Oaks'}
{'lineid': '102', 'stoppointid': 3078, 'stop_name': 'Forrest Little Golf'}
{'lineid': '102', 'stoppointid': 3087, 'stop

{'lineid': '17A', 'stoppointid': 1004, 'stop_name': 'Rathvale'}
{'lineid': '17A', 'stoppointid': 1006, 'stop_name': 'Millwood Villas'}
{'lineid': '17A', 'stoppointid': 1007, 'stop_name': 'Millbrook Drive'}
{'lineid': '17A', 'stoppointid': 1008, 'stop_name': 'Kilbarrack Road'}
{'lineid': '17A', 'stoppointid': 988, 'stop_name': "St Canice's GNS"}
{'lineid': '17A', 'stoppointid': 1009, 'stop_name': "St. Donagh's"}
{'lineid': '17A', 'stoppointid': 1010, 'stop_name': 'Howth Junction'}
{'lineid': '17A', 'stoppointid': 139, 'stop_name': 'Grove Park Road'}
{'lineid': '17A', 'stoppointid': 140, 'stop_name': 'Beneavin Park'}
{'lineid': '17A', 'stoppointid': 1012, 'stop_name': 'Naomh Barróg GAA'}
{'lineid': '17A', 'stoppointid': 989, 'stop_name': 'Santry Avenue'}
{'lineid': '17A', 'stoppointid': 6009, 'stop_name': 'Hazelwood'}
{'lineid': '17A', 'stoppointid': 991, 'stop_name': 'Santry Ave Ind Est'}
{'lineid': '17A', 'stoppointid': 990, 'stop_name': 'Shanliss Way'}
{'lineid': '17A', 'stoppointid':

{'lineid': '185', 'stoppointid': 4190, 'stop_name': 'Cookstown'}
{'lineid': '185', 'stoppointid': 4105, 'stop_name': 'Hill View'}
{'lineid': '185', 'stoppointid': 4103, 'stop_name': 'Kilgarron Park'}
{'lineid': '185', 'stoppointid': 4104, 'stop_name': 'Parknasillogue Ct'}
{'lineid': '185', 'stoppointid': 6139, 'stop_name': 'Bridge Road'}
{'lineid': '185', 'stoppointid': 4108, 'stop_name': 'Enniskerry Golf Club'}
{'lineid': '185', 'stoppointid': 4106, 'stop_name': 'Kilmolin'}
{'lineid': '185', 'stoppointid': 4110, 'stop_name': 'Kilmolin'}
{'lineid': '185', 'stoppointid': 4111, 'stop_name': 'Hill View'}
{'lineid': '185', 'stoppointid': 4112, 'stop_name': 'Parknasillogue Ct'}
{'lineid': '185', 'stoppointid': 4113, 'stop_name': 'Kilgarron Park'}
{'lineid': '185', 'stoppointid': 4192, 'stop_name': 'Cookstown'}
{'lineid': '185', 'stoppointid': 4194, 'stop_name': 'Kilbride Lane'}
{'lineid': '185', 'stoppointid': 4196, 'stop_name': 'Upper Dargle Road'}
{'lineid': '185', 'stoppointid': 7369, 's

{'lineid': '45A', 'stoppointid': 3241, 'stop_name': 'Sallynoggin Park'}
{'lineid': '45A', 'stoppointid': 3534, 'stop_name': 'Churchview Road'}
{'lineid': '45A', 'stoppointid': 3242, 'stop_name': "O'Rourke Park"}
{'lineid': '45A', 'stoppointid': 4176, 'stop_name': "O'Byrne Road"}
{'lineid': '45A', 'stoppointid': 5088, 'stop_name': 'Woodview Drive'}
{'lineid': '45A', 'stoppointid': 5089, 'stop_name': 'Wolfe Tone Square'}
{'lineid': '45A', 'stoppointid': 4147, 'stop_name': 'Wolfe Tone Square'}
{'lineid': '45A', 'stoppointid': 5167, 'stop_name': 'Woodview Drive'}
{'lineid': '45A', 'stoppointid': 3548, 'stop_name': 'Shanganagh Cliffs'}
{'lineid': '45A', 'stoppointid': 3547, 'stop_name': 'Shanganagh Park'}
{'lineid': '45A', 'stoppointid': 3551, 'stop_name': 'Rathsallagh Drive'}
{'lineid': '45A', 'stoppointid': 3546, 'stop_name': 'Shankill Station'}
{'lineid': '46A', 'stoppointid': 2041, 'stop_name': "St Michael's Hosp"}
{'lineid': '47', 'stoppointid': 375, 'stop_name': 'Dromard Terrace'}
{'l

{'lineid': '76', 'stoppointid': 5006, 'stop_name': 'Le Fanu Road'}
{'lineid': '76', 'stoppointid': 5007, 'stop_name': 'Convent Lawns'}
{'lineid': '76', 'stoppointid': 2130, 'stop_name': 'Station Road'}
{'lineid': '76', 'stoppointid': 2763, 'stop_name': 'Newlands Golf Club'}
{'lineid': '76', 'stoppointid': 2620, 'stop_name': 'Belgard'}
{'lineid': '76', 'stoppointid': 2345, 'stop_name': 'Belgard Retail Park'}
{'lineid': '76', 'stoppointid': 4341, 'stop_name': 'The Square'}
{'lineid': '76', 'stoppointid': 2619, 'stop_name': 'Belgard Road'}
{'lineid': '76', 'stoppointid': 2112, 'stop_name': 'Clondalkin Main St'}
{'lineid': '76', 'stoppointid': 2116, 'stop_name': 'Michael Collins Pk'}
{'lineid': '76', 'stoppointid': 2117, 'stop_name': 'Station Road'}
{'lineid': '76', 'stoppointid': 4414, 'stop_name': 'Convent Lawns'}
{'lineid': '76', 'stoppointid': 2199, 'stop_name': 'Glenaulin'}
{'lineid': '76', 'stoppointid': 2363, 'stop_name': 'Belgard Road'}
{'lineid': '76', 'stoppointid': 5011, 'stop_n

In [137]:
matched_from_may.shape

(583, 3)

In [138]:
matched_from_may.head()

Unnamed: 0,lineid,stoppointid,stop_name
0,1,385,Newgrove Avenue
1,1,375,Dromard Terrace
2,102,949,The Coast
3,102,951,Strand Road
4,102,953,Station Road


In [139]:
remaining_unnamed = names_missing.stoppointid.unique().tolist()
print("Number of unnamed stops remaining:", len(remaining_unnamed))

Number of unnamed stops remaining: 119


In [140]:
print(remaining_unnamed)

[384, 383, 4451, 382, 380, 378, 376, 377, 2804, 816, 7576, 401, 7648, 7647, 7669, 4529, 7589, 7164, 116, 7490, 7659, 7617, 6044, 2826, 7638, 5190, 7558, 7216, 7620, 7621, 7580, 4391, 997, 998, 6180, 4188, 4097, 2237, 347, 7626, 7566, 7627, 318, 6058, 4849, 4326, 5033, 7479, 7485, 7484, 7486, 7487, 5062, 6207, 403, 7592, 7594, 7205, 349, 7497, 7495, 7457, 7480, 7478, 7481, 1595, 7682, 7674, 7655, 7604, 7685, 7593, 7405, 7544, 7404, 7407, 7539, 7540, 7537, 7538, 7541, 7542, 5034, 2827, 2828, 7658, 805, 806, 7688, 7689, 7662, 7663, 7529, 2079, 7433, 2265, 2269, 2753, 7652, 2576, 7269, 3895, 3944, 7483, 7547, 7546, 3429, 7653, 6333, 2038, 4586, 4587, 7492, 7500, 7501, 7502, 7503, 7504, 4678]


In [173]:
all_stops_final = stop_names_cleaned.copy()

In [174]:
all_stops_final = all_stops_final.append(matched_from_may, ignore_index=True)

In [175]:
all_stops_final.shape

(12869, 3)

In [176]:
all_stops_final.duplicated().sum()

0

In [177]:
all_stops_final.loc[all_stops_final["stop_name"] == "College Green"]

Unnamed: 0,lineid,stoppointid,stop_name
676,122,1359,College Green
733,122,1278,College Green
769,123,1359,College Green
820,123,1278,College Green
859,13,1359,College Green
988,13,4521,College Green
1869,150,1279,College Green
1902,151,4522,College Green
2247,16,1279,College Green
2352,16,1359,College Green


In [178]:
all_stops_final["stop_name"] = all_stops_final.apply(lambda row: row.stop_name + "_" + str(row.lineid), axis=1)

In [179]:
all_stops_final.head()

Unnamed: 0,lineid,stoppointid,stop_name
0,1,226,Shanard Avenue_1
1,1,225,Shanowen Avenue_1
2,1,224,Shanard Road_1
3,1,223,Shanliss Drive_1
4,1,222,Shanliss Road_1


In [180]:
all_stops_final.drop(["lineid"], axis=1, inplace=True)

In [181]:
all_stops_final.tail()

Unnamed: 0,stoppointid,stop_name
12864,2364,Belgard Retail Park_76A
12865,2363,Belgard Road_76A
12866,416,RDS Ballsbridge_7A
12867,2041,St Michael's Hosp_7A
12868,2041,St Michael's Hosp_7D


In [182]:
test = all_stops_final[all_stops_final.duplicated(subset=["stop_name"], keep=False)]
test.sort_values(by=["stop_name"])

Unnamed: 0,stoppointid,stop_name
2381,1630,ALSAA Sports Club_16
2269,1631,ALSAA Sports Club_16
5019,1630,ALSAA Sports Club_33
4893,1631,ALSAA Sports Club_33
6914,1630,ALSAA Sports Club_41
...,...,...
2071,2915,Zion Road_15B
2212,2988,Zion Road_15D
2168,2915,Zion Road_15D
6915,7348,Zone 15_41


In [183]:
all_stops_final.drop_duplicates(subset=["stop_name"], keep=False, inplace=True)

In [184]:
all_stops_final.shape

(6073, 2)

In [185]:
all_stops_final.head()

Unnamed: 0,stoppointid,stop_name
0,226,Shanard Avenue_1
1,225,Shanowen Avenue_1
2,224,Shanard Road_1
6,220,Swords Road_1
10,203,Whitehall College_1


In [186]:
# Empty table, for refilling
truncate_query = sqlalchemy.text("TRUNCATE TABLE match_stop_names")
connection.execution_options(autocommit=True).execute(truncate_query)

2021-08-05 01:05:59,042 INFO sqlalchemy.engine.Engine TRUNCATE TABLE match_stop_names
2021-08-05 01:05:59,044 INFO sqlalchemy.engine.Engine [cached since 168.1s ago] {}
2021-08-05 01:05:59,082 INFO sqlalchemy.engine.Engine COMMIT


<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7fa371377460>

In [187]:
# Refill table
def create_and_fill_table(df):
    meta = MetaData()
    
    table = Table(
        "match_stop_names", meta, 
        Column("stoppointid", Integer),
        Column("stop_name", String(30))
    )
    
    meta.create_all(engine)
    df.to_sql("match_stop_names", con=engine, if_exists="append", index=False)

In [188]:
create_and_fill_table(all_stops_final)

2021-08-05 01:06:01,376 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2021-08-05 01:06:01,380 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %(table_schema)s AND table_name = %(table_name)s
2021-08-05 01:06:01,381 INFO sqlalchemy.engine.Engine [cached since 73.57s ago] {'table_schema': 'dubbusdb', 'table_name': 'match_stop_names'}
2021-08-05 01:06:01,386 INFO sqlalchemy.engine.Engine COMMIT
2021-08-05 01:06:01,397 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %(table_schema)s AND table_name = %(table_name)s
2021-08-05 01:06:01,398 INFO sqlalchemy.engine.Engine [cached since 73.59s ago] {'table_schema': 'dubbusdb', 'table_name': 'match_stop_names'}
2021-08-05 01:06:01,406 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2021-08-05 01:06:01,433 INFO sqlalchemy.engine.Engine INSERT INTO match_stop_names (stoppointid, stop_name) VALUES (%(stoppointid)s, %(stop_name)s)
2021-08-05 01:06:01,4