In [1]:
!pip install fuzzywuzzy pandas python-Levenshtein




[notice] A new release of pip available: 22.2.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import pandas as pd
import requests
from fuzzywuzzy import process

In [3]:
# paths
input_directory = '../data/raw'
output_directory = '../data/processed'
stations_file_path = os.path.join(input_directory, 'stations.csv')

os.makedirs(output_directory, exist_ok=True)

stations_df = pd.read_csv(stations_file_path)
stations_df.head()

Unnamed: 0,code,name,latitude,longitude,yearid
0,0,Dundas St E / Regent Park Blvd,,,2016
1,1,Riverdale Park North (Broadview Ave),,,2016
2,2,Union Station,,,2016
3,3,Front St W / Blue Jays Way,,,2016
4,4,Queens Park / Bloor St W,,,2016


In [4]:
stations_df = pd.read_csv(stations_file_path)

response = requests.get("https://tor.publicbikesystem.net/ube/gbfs/v1/en/station_information")
station_data = response.json()

# only keep relevant fields
station_coordinates = {}
for station in station_data['data']['stations']:
    station_coordinates[station['name']] = (station['lat'], station['lon'])

In [5]:
# keep tally of stations found or not with brute force
found_count = 0
not_found_count = 0

missing_coordinates = []

stations_df['latitude'] = None
stations_df['longitude'] = None

for index, row in stations_df.iterrows():
    station_name = row['name']
    
    # check direct station match
    if station_name in station_coordinates:
        stations_df.at[index, 'latitude'], stations_df.at[index, 'longitude'] = station_coordinates[station_name]
        found_count += 1
    else:
        # flip the station name and check again
        parts = station_name.split(' / ')
        flipped_station = ' / '.join(parts[::-1])
        
        if flipped_station in station_coordinates:
            stations_df.at[index, 'latitude'], stations_df.at[index, 'longitude'] = station_coordinates[flipped_station]
            found_count += 1
        else:
            missing_coordinates.append(station_name)
            not_found_count += 1

print(f"Found stations: {found_count}")
print(f"Not found stations: {not_found_count}")

Found stations: 1258
Not found stations: 669


In [6]:
from fuzzywuzzy import process

# string similarity search for not found stations
not_found_count = len(missing_coordinates)
closest_matches = []

for station_name in missing_coordinates:
    closest_match, score = process.extractOne(station_name, station_coordinates.keys())
    closest_matches.append(closest_match)  # Log the closest match

# fill closest match
for index, station_name in enumerate(missing_coordinates):
    closest_station = closest_matches[index]
    stations_df.loc[stations_df['name'] == station_name, 'closest_match'] = closest_station  
    stations_df.loc[stations_df['name'] == station_name, 'latitude'] = station_coordinates[closest_station][0] 
    stations_df.loc[stations_df['name'] == station_name, 'longitude'] = station_coordinates[closest_station][1] 

# TODO compare with closest matches manually
for station, closest_station in zip(missing_coordinates, closest_matches):
    print(f" - {station} (Closest match: {closest_station})")

 - Queens Park / Bloor St W (Closest match: Queen's Park / Bloor St W)
 - Simcoe St / Wellington St W (Closest match: Simcoe St / Wellington St W South)
 - Parliament St / Gerrard St (Closest match: Sherbourne St / Carlton St (Allan Gardens))
 - Wellesley St / Queen's Park Cres (Closest match: Wellesley St W / Queen's Park Cres)
 - College St W / Major St (Closest match: College St / Major St)
 - 424 Wellington St. W (Closest match: 420 Wellington St W)
 - Dundas St / Yonge St (Closest match: Dundas St W / Yonge St)
 - Victoria St / Gould St (Ryerson University) (Closest match: St. George St / Bloor St W)
 - Front St / Bay St (North Side) (Closest match: Front St W / Bay St (North Side))
 - Fort York  Blvd / Capreol Crt (Closest match: Fort York  Blvd / Capreol Ct)
 - College St W / Huron St (Closest match: College St / Huron St)
 - HTO Park (Queen's Quay W) (Closest match: HTO Park (Queens Quay W))
 - Beverly St / College St W (Closest match: Beverley St / College St)
 - Navy Wharf Ct

In [7]:
output_file_path = os.path.join(output_directory, 'stations.csv')
stations_df.iloc[:, :-1].to_csv(output_file_path, index=False)

In [8]:
# show where coordinates still empty
empty_coordinates_df = stations_df[stations_df['latitude'].isnull() & stations_df['longitude'].isnull()]
empty_coordinates_df

Unnamed: 0,code,name,latitude,longitude,yearid,closest_match
