In [None]:
import datetime as dt
import glob
import os
import time
from datetime import datetime

import googlemaps
import pandas as pd
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
from tqdm import tqdm

# Import bus data and streetcar data csv files

In [None]:
# combine excel sheets together

df_bus = pd.read_csv('BusDelayData_updated.csv')
df_st = pd.read_csv('StreetcarDelayData.csv')

In [None]:
# shape of the data

print(df_bus.shape)
print(df_st.shape)

In [None]:
df_bus.describe()

### create column for categorizing bus and streetcar prior to merging datasets

In [None]:
df_bus['vtype'] = 'bus'
df_st['vtype'] = 'streetcar'

In [None]:
df_st.describe()

In [None]:
# combine bus and streetcar
df_tot = pd.concat([df_bus, df_st])

In [None]:
df_tot.shape

In [None]:
df_tot.head(10)

In [None]:
df_tot[df_tot['vtype'] == 'bus'].count() 

In [None]:
df_tot.tail()

In [None]:
df_tot = df_tot.drop(columns = 'Unnamed: 0')

In [None]:
df_tot.info()

## convert report date to datetime format and create year, month and day columns
## parse the time column as a datetime to create hour and minute columns

In [None]:
df_tot['Report Date'] = pd.to_datetime(df_tot['Report Date'])

In [None]:
df_tot.head(5)

In [None]:
df_tot = df_tot.reset_index().drop(columns = 'index')

In [None]:
df_tot.info()

In [None]:
df_tot['year'] = pd.DatetimeIndex(df_tot['Report Date']).year
df_tot['month'] = pd.DatetimeIndex(df_tot['Report Date']).month
df_tot['day'] = pd.DatetimeIndex(df_tot['Report Date']).day

In [None]:
import datetime as dt

df_tot['hour'] = pd.to_datetime(df_tot['Time']).dt.hour
df_tot['minute'] = pd.to_datetime(df_tot['Time']).dt.minute

In [None]:
df_tot['Location'].str.lower()

In [None]:
df_tot['Location'] = df_tot['Location'].str.lower()
df_tot['Incident'] = df_tot['Incident'].str.lower()
df_tot['Day'] = df_tot['Day'].str.lower()
df_tot['Direction'] = df_tot['Direction'].str.lower()

df_tot

## Remove null values
## Drop every row that contains at least one null value

In [None]:
df_tot.isnull().sum()

In [None]:
#df_tot.shape

In [None]:
df_tot = df_tot.dropna()

In [None]:
df_tot.isnull().sum()

In [None]:
df_tot.shape

## Clean location data first, prior to retrieving lat and long coordinates

In [None]:
df_tot = df_tot.reset_index()

In [None]:
df_tot['Location'] = df_tot['Location'].str.replace('&', 'and')

In [None]:
df_tot['Location'] = df_tot['Location'].str.replace('stn', 'station')

In [None]:
df_tot['Location'] = df_tot['Location'].str.replace('ave', 'avenue')

In [None]:
df_tot['Location'] = df_tot['Location'].str.replace('str', 'street')

In [None]:
df_tot['Day'] = df_tot['Day'].str.capitalize()

In [None]:
df_tot['Direction'] = df_tot['Direction'].str.replace('/', '')

In [None]:
df_tot = df_tot.drop(columns = 'index')

In [None]:
df_tot['Vehicle'] = df_tot['Vehicle'].astype(int)

In [None]:
df_tot['Location'] = df_tot['Location'].str.replace('/', '')

In [None]:
df_tot['Location'] = df_tot['Location'].str.replace('#', '')
df_tot['Location'] = df_tot['Location'].str.replace('!', '')

In [None]:
df_tot.head(5)

In [None]:
df_tot['Location'].value_counts()[:20]

In [None]:
location = df_tot['Location'].unique().tolist()

len(location)

In [None]:
# Smoke-test the geocoder on a single address before the bulk run.
# The geocoder is created in this cell because nothing above defines it; on a
# fresh kernel the lookup used to fail with a NameError.
geolocator = Nominatim(user_agent="my_user_agent")

address = 'howardpark and dundas' + ' Toronto'
loc = geolocator.geocode(address)
if loc is not None:
    print(loc.latitude, loc.longitude)
else:
    # geocode() returned None: no match for the address.
    print(0)

## Use geopy first to retrieve lat and long coordinates, and fall back to the Google API only for the locations geopy cannot resolve, since the number of coordinates we can retrieve from Google is limited.

In [None]:
geolocator = Nominatim(user_agent="my_user_agent")

# Send every lookup through RateLimiter (imported at the top of the notebook).
# It waits min_delay_seconds between calls and, with its default settings,
# retries service errors and then returns None for that address instead of
# raising, so one failed request no longer aborts the run and discards
# everything collected so far.
# NOTE(review): the public Nominatim service expects about one request per
# second and an identifying user_agent - confirm against its usage policy.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

latlist_total = []
longlist_total = []

for place in tqdm(location):
    address = place + ', Toronto,' + ' Canada'
    loc = geocode(address)
    if loc is not None:
        latlist_total.append(loc.latitude)
        longlist_total.append(loc.longitude)
    else:
        # 0 marks "not found"; the later cells test for exactly this value.
        latlist_total.append(0)
        longlist_total.append(0)

In [None]:
print(len(latlist_total))
print(len(longlist_total))

In [None]:
d = {'location': location, 'lat': latlist_total, 'long': longlist_total}

In [None]:
# put lat and long data into a dataframe
df_location = pd.DataFrame(data = d)

df_location
print(len(df_location[df_location['lat'] != 0.000000]))
print(len(df_location[df_location['lat'] == 0.000000]))

#lat: 42
#long: 58

In [None]:
df_location[:40000][df_location['lat'] == 0.000000]

In [None]:
df_location.to_csv('location_coor.csv') # save coordinates retrieved using geopy

In [None]:
latlist_g = df_location['lat'].tolist()
longlist_g = df_location['long'].tolist()

In [None]:
len(location)

In [None]:
len(latlist_g)

## Retrieve remaining missing lat and long coordinates using googlemaps API

In [None]:
# Second pass: query Google only for the locations the geopy pass left at the
# 0 sentinel; every other row is copied through unchanged.

# Prefer a key supplied through the GOOGLE_MAPS_API_KEY environment variable so
# it does not have to be saved inside the notebook. The original placeholder
# stays as the fallback for anyone who pastes a key in place.
GOOGLE_API_KEY = os.environ.get('GOOGLE_MAPS_API_KEY', '{insert api key here}')

# Build the client once, outside the loop: it does not depend on the row.
try:
    gmaps = googlemaps.Client(key=GOOGLE_API_KEY)
except Exception as err:
    # A client that could not be created used to be swallowed silently on every
    # row; report it once and leave the geopy results as they are.
    gmaps = None
    print('Google geocoding skipped, client could not be created:', err)

latlist_g2 = []
longlist_g2 = []
location_g2 = []
failed_lookups = 0

# zip() has no length, so tqdm needs total= to draw a progress bar.
for i, j, k in tqdm(zip(latlist_g, longlist_g, location), total=len(location)):
    # Default to the geopy values; only a successful lookup replaces them.
    lat, long = i, j

    if i == 0 and gmaps is not None:
        try:
            address_c = k + ', Toronto,' + ' Canada'
            geocode_result = gmaps.geocode(address_c)
            if geocode_result:
                point = geocode_result[0]['geometry']['location']
                # Both keys are read before either name is rebound, so a
                # malformed result cannot leave a half-updated pair.
                lat, long = point['lat'], point['lng']
        except Exception:
            # Best effort: this row keeps its sentinel. Catching Exception
            # rather than using a bare except lets a keyboard/kernel interrupt
            # stop the loop.
            failed_lookups += 1

    latlist_g2.append(lat)
    longlist_g2.append(long)
    location_g2.append(k)

if failed_lookups:
    print(failed_lookups, 'Google lookups raised an error and kept their 0 sentinel')
            
            

# Look up an address with reverse geocoding
#reverse_geocode_result = gmaps.reverse_geocode((40.714224, -73.961452))

# Request directions via public transit

In [None]:
print(len(latlist_g2))
print(len(longlist_g2))

In [None]:
len(latlist_g)

In [None]:
len(longlist_g)

In [None]:
len(latlist_g2)

In [None]:
len(longlist_g2)

In [None]:
len(location)

In [None]:
d = {'location': location, 'lat': latlist_g2, 'long': longlist_g2}

In [None]:
df_location = pd.DataFrame(data = d)

df_location.set_index('location')

df_location = df_location.rename(columns = {'location': 'Location'})
df_location.set_index('Location')

In [None]:
df_location.loc[39999]

In [None]:
latlist_g2[39999]

## random test case for coordinates

In [None]:
df_location[df_location['Location'] == 'sloane and sweeney']

In [None]:
df_location[df_location['Location'] == 'exhibition place']

## Merge the location coordinates into the bus and streetcar data on the shared Location column

In [None]:
df_tot_v1 = pd.merge(df_tot, df_location)

In [None]:
df_tot_v1

In [None]:
df_tot_v1.info()

In [None]:
df_tot_v1.to_csv('BusandStreetcarDelayData_v2.csv')