In [None]:
import os
import json
import requests
import pandas as pd
import numpy as np
import time 
import matplotlib.pyplot as plt

from functions import load_api_key, haversine

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# Reload imported modules automatically before running code, so edits to local
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Credential sent as the `Authorization` header of the routing requests.
api_key = load_api_key()

In [None]:
# Load the rental records and keep one row (the first seen) per address,
# restricted to the location columns.
rental_data_df = pd.read_csv('inputs/rental_with_coordinates.csv')

location_columns = ['address', 'postal_code', 'latitude', 'longitude']
first_row_per_address = rental_data_df.drop_duplicates(subset='address', keep='first')
unique_addresses_df = first_row_per_address[location_columns]

print('No of unique addresses', rental_data_df['address'].nunique())

In [None]:
# Load the stations and prefix every column with `station_` so the names stay
# distinct from the address columns.
station_df = (
    pd.read_csv('inputs/stations_with_coordinates.csv')
    .add_prefix('station_')
)
print('No of unique stations', station_df['station_station_name'].nunique())

In [None]:
# Pair every unique address with every station (cross join) and add empty
# placeholder columns for the walking time and distance.
time_distance_df = (
    unique_addresses_df
    .merge(station_df, how='cross')
    .assign(walking_time_s=None, walking_distance_m=None)
)
print('No of possible combination from each address to each stations', len(time_distance_df))

In [None]:
# Preview the first pairs generated for one sample address.
is_sample_address = time_distance_df['address'].eq('213 CHOA CHU KANG CTRL')
time_distance_df[is_sample_address].head()

In [None]:
# Resolve incorrect values: list the addresses whose postal code is the
# placeholder string 'NIL' ...
has_placeholder_postal_code = time_distance_df['postal_code'] == 'NIL'
print(time_distance_df.loc[has_placeholder_postal_code, 'address'].unique())

# ... then set the postal code for '215 CHOA CHU KANG CTRL' in both the pair
# table and the rental data.
for frame in (time_distance_df, rental_data_df):
    frame.loc[frame['address'] == '215 CHOA CHU KANG CTRL', 'postal_code'] = 680215

# Write the corrected rental data back over the input file.
rental_data_df.to_csv('inputs/rental_with_coordinates.csv', index=False)

In [None]:
# Cast both postal-code columns to integers in a single pass.
time_distance_df = time_distance_df.astype(
    {'station_postal_code': int, 'postal_code': int}
)

In [None]:
# Distance between each address and station, computed row by row with the
# project's `haversine` helper. The column name keeps its original spelling
# because later cells refer to it.
pair_coordinate_columns = ["latitude", "longitude", "station_latitude", "station_longitude"]
time_distance_df["hervsine_distance"] = time_distance_df.apply(
    lambda pair: haversine(*(pair[column] for column in pair_coordinate_columns)),
    axis=1,
)

In [None]:
# Display the address-station table, now including the `hervsine_distance` column.
time_distance_df

In [None]:
# For every address keep only the three stations with the smallest
# `hervsine_distance`.
# NOTE(review): with `include_groups=False` and `group_keys=False` the
# `address` column appears to be left out of the result - confirm that is intended.
def _three_nearest(candidates):
    """Return the three rows of `candidates` with the smallest hervsine_distance."""
    return candidates.nsmallest(3, "hervsine_distance")


pairs_by_address = time_distance_df.groupby("address", group_keys=False)
filtered_time_distance_df = (
    pairs_by_address
    .apply(_three_nearest, include_groups=False)
    .reset_index(drop=True)
)
filtered_time_distance_df

In [None]:
# Number of distinct postal codes remaining after the nearest-station filter.
filtered_time_distance_df.loc[:, 'postal_code'].nunique()

In [None]:
# Query the OneMap routing service for the walking time and distance of every
# (address, station) pair in `filtered_time_distance_df` and write the results
# into the `walking_time_s` / `walking_distance_m` columns.
#
# Failed requests (non-200 status, timeout, connection error) are retried after
# a pause, but only a bounded number of times: a permanent error (for example
# an expired token) raises instead of leaving the cell sleeping forever.
ROUTE_URL = "https://www.onemap.gov.sg/api/public/routingsvc/route"
REQUEST_TIMEOUT_S = 30   # abort a stalled request instead of hanging the kernel
RETRY_WAIT_S = 169       # pause between attempts after a failed request
MAX_ATTEMPTS = 5         # total tries per pair before giving up

start = time.time()
headers = {"Authorization": api_key}

# A single session reuses the HTTP connection across all requests.
with requests.Session() as session:
    session.headers.update(headers)

    for row in range(len(filtered_time_distance_df)):
        # Rows already filled in (e.g. by an interrupted earlier run of this
        # cell) are not fetched again, so the cell can simply be re-run.
        if pd.notna(filtered_time_distance_df.loc[row, 'walking_time_s']):
            continue

        start_latitude  = float(filtered_time_distance_df.loc[row, 'latitude'])
        start_longitude = float(filtered_time_distance_df.loc[row, 'longitude'])
        end_latitude    = float(filtered_time_distance_df.loc[row, 'station_latitude'])
        end_longitude   = float(filtered_time_distance_df.loc[row, 'station_longitude'])

        # `requests` URL-encodes the values (the comma becomes %2C).
        params = {
            "start": f"{start_latitude},{start_longitude}",
            "end": f"{end_latitude},{end_longitude}",
            "routeType": "walk",
        }

        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                response = session.get(ROUTE_URL, params=params, timeout=REQUEST_TIMEOUT_S)
            except requests.exceptions.RequestException as error:
                failure = repr(error)
            else:
                if response.status_code == 200:
                    break
                failure = f"HTTP {response.status_code}"
            if attempt < MAX_ATTEMPTS:
                time.sleep(RETRY_WAIT_S)
        else:
            # Every attempt failed.
            raise RuntimeError(
                f"Routing request for row {row} failed after {MAX_ATTEMPTS} attempts ({failure})"
            )

        parsed_data = response.json()
        if 'route_summary' not in parsed_data:
            raise RuntimeError(f"No route_summary in the response for row {row}: {parsed_data}")

        route_summary = parsed_data['route_summary']
        filtered_time_distance_df.loc[row, 'walking_time_s'] = route_summary['total_time']
        filtered_time_distance_df.loc[row, 'walking_distance_m'] = route_summary['total_distance']

        # Progress report: row index and elapsed minutes.
        if row % 100 == 0:
            print(row, (time.time() - start)/60)

In [None]:
# Display the table after the routing loop has written the walking time and distance.
filtered_time_distance_df

# Handle the previously incorrect postal code

In [None]:
# Load the saved travelling-distance table from disk.
# NOTE(review): this rebinds `filtered_time_distance_df`, replacing whatever the
# cells above left in memory - confirm the file on disk is the intended source.
travelling_distance_csv = 'inputs/travelling_distance.csv'
filtered_time_distance_df = pd.read_csv(travelling_distance_csv)

In [None]:
# One-row frame holding the postal code and coordinates of the address to add.
df = pd.DataFrame(
    [
        {
            'postal_code': 530021,
            'latitude': 1.364246,
            'longitude': 103.8914777,
        }
    ]
)

In [None]:
# Pair the single address with every station (cross join).
special_df = pd.merge(df, station_df, how='cross')

In [None]:
# Distance from the address to each station, computed row by row with the
# project's `haversine` helper.
special_coordinate_columns = ["latitude", "longitude", "station_latitude", "station_longitude"]
special_df["hervsine_distance"] = special_df.apply(
    lambda pair: haversine(*(pair[column] for column in special_coordinate_columns)),
    axis=1,
)

In [None]:
# Display every station paired with the single address, with its `hervsine_distance`.
special_df

In [None]:
# Keep the three rows with the smallest `hervsine_distance`, re-indexed from 0.
three_nearest_stations = special_df.sort_values(by='hervsine_distance').head(3)
filtered_special_df = three_nearest_stations.reset_index(drop=True)
filtered_special_df

In [None]:
# Query the OneMap routing service for the walking time and distance of each
# row in `filtered_special_df` and store them in the `walking_time_s` /
# `walking_distance_m` columns.
#
# Failed requests (non-200 status, timeout, connection error) are retried after
# a pause, but only a bounded number of times, so a permanent error raises
# instead of leaving the cell sleeping forever.
SPECIAL_ROUTE_URL = "https://www.onemap.gov.sg/api/public/routingsvc/route"
SPECIAL_REQUEST_TIMEOUT_S = 30   # abort a stalled request instead of hanging
SPECIAL_RETRY_WAIT_S = 169       # pause between attempts after a failed request
SPECIAL_MAX_ATTEMPTS = 5         # total tries per row before giving up

headers = {"Authorization": api_key}

# A single session reuses the HTTP connection across the requests.
with requests.Session() as session:
    session.headers.update(headers)

    for row in range(len(filtered_special_df)):
        start_latitude  = float(filtered_special_df.loc[row, 'latitude'])
        start_longitude = float(filtered_special_df.loc[row, 'longitude'])
        end_latitude    = float(filtered_special_df.loc[row, 'station_latitude'])
        end_longitude   = float(filtered_special_df.loc[row, 'station_longitude'])

        # `requests` URL-encodes the values (the comma becomes %2C).
        params = {
            "start": f"{start_latitude},{start_longitude}",
            "end": f"{end_latitude},{end_longitude}",
            "routeType": "walk",
        }

        for attempt in range(1, SPECIAL_MAX_ATTEMPTS + 1):
            try:
                response = session.get(
                    SPECIAL_ROUTE_URL, params=params, timeout=SPECIAL_REQUEST_TIMEOUT_S
                )
            except requests.exceptions.RequestException as error:
                failure = repr(error)
            else:
                if response.status_code == 200:
                    break
                failure = f"HTTP {response.status_code}"
            if attempt < SPECIAL_MAX_ATTEMPTS:
                time.sleep(SPECIAL_RETRY_WAIT_S)
        else:
            # Every attempt failed.
            raise RuntimeError(
                f"Routing request for row {row} failed after {SPECIAL_MAX_ATTEMPTS} attempts ({failure})"
            )

        parsed_data = response.json()
        if 'route_summary' not in parsed_data:
            raise RuntimeError(f"No route_summary in the response for row {row}: {parsed_data}")

        route_summary = parsed_data['route_summary']
        filtered_special_df.loc[row, 'walking_time_s'] = route_summary['total_time']
        filtered_special_df.loc[row, 'walking_distance_m'] = route_summary['total_distance']



In [None]:
# Display the three candidate stations with their walking time and distance.
filtered_special_df

In [None]:
# Append the routes computed for the corrected postal code to the main table.
# Rows already stored for the same postal code(s) are dropped first, so running
# this cell again (or re-running the notebook against a CSV that already
# contains them) replaces those rows instead of accumulating duplicates.
corrected_postal_codes = filtered_special_df['postal_code'].unique()
is_superseded = filtered_time_distance_df['postal_code'].isin(corrected_postal_codes)
filtered_time_distance_df = pd.concat(
    [filtered_time_distance_df.loc[~is_superseded], filtered_special_df],
    axis=0,
).reset_index(drop=True)

In [None]:
# Write the combined table to disk without the index column.
travelling_distance_output = 'inputs/travelling_distance.csv'
filtered_time_distance_df.to_csv(travelling_distance_output, index=False)