# filtering
> Raw data filtering based on date and location (service area, workdays, excluding festivals, rush hours, weekends),
> currency,
> and thresholding of inliers by distance(?)/duration(?).  

In [None]:
#| default_exp filtering

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| hide
# Run nbdev's export step from inside the notebook.
import nbdev; nbdev.nbdev_export()

In [None]:
#| hide
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the
# last one; several cells below rely on this to show multiple results.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
#| export
import os
import numpy as np
import pandas as pd
import requests
import xml.etree.ElementTree as ET
from tqdm import tqdm
import plotly.express as px
from datetime import timedelta
from google.oauth2 import service_account
from dotenv import load_dotenv
import pandas_gbq


In [None]:
from geopy.distance import geodesic as GD
import haversine as hs
from haversine import Unit

In [None]:
from workalendar.usa import Florida, NewYork

In [None]:
from pricing.data.database import QUERY_PRICING_TRAINING, get_dataframe_from_sql

In [None]:
# Load environment variables from the .env file one directory above the notebook.
load_dotenv("../.env")
# Value of GC_QUOTE_API_CREDENTIALS; `os.getenv` returns None when it is unset.
# A later cell uses it as a file path relative to "../".
key = os.getenv('GC_QUOTE_API_CREDENTIALS')

In [None]:

# Build service-account credentials from the key file whose path (relative to
# the parent directory of this notebook) is stored in the
# GC_QUOTE_API_CREDENTIALS environment variable.
if not key:
    # Without this guard a missing variable surfaces as an opaque TypeError
    # from the string concatenation below.
    raise RuntimeError(
        "GC_QUOTE_API_CREDENTIALS is not set; define it in ../.env before running this notebook"
    )
credential = service_account.Credentials.from_service_account_file("../" + key)


In [None]:
# workalendar's Florida calendar, used here for Orlando.
cal_orlando = Florida()
# Call `holidays` for 2024; the result is shown as cell output.
cal_orlando.holidays(2024)

In [None]:
# workalendar's NewYork calendar.
cal_newyork = NewYork()
# Call `holidays` for 2024; the result is shown as cell output.
cal_newyork.holidays(2024)

In [None]:
from workalendar.asia import China
# workalendar's China calendar, used here for Shanghai.
cal_shanghai = China()
# Call `holidays` for 2023 (note: the US calendars above are queried for 2024).
cal_shanghai.holidays(2023)

In [None]:
import holidays

In [None]:
# Alternative source: the `holidays` package's China object, displayed for
# comparison with the workalendar calendar built in the previous cell.
cn_holidays = holidays.China()
cn_holidays

In [None]:

# Run the pricing-training query through the project helper, passing the
# service-account credentials built above; the result is bound to `df`.
df = get_dataframe_from_sql(credential=credential,query=QUERY_PRICING_TRAINING)
# Earlier direct pandas_gbq call, kept for reference (disabled):
# df = pandas_gbq.read_gbq(
#     QUERY_PRICING_TRAINING,credential
# )

In [None]:
# Display the loaded dataframe.
df

In [None]:
# Display the trip start/end coordinate columns ...
df.loc[:,['start_latit','start_longt','end_latit','end_longt']]
# ... and the full list of column names.
df.columns

In [None]:
def _trip_endpoints(row):
    """Return ((start_lat, start_lon), (end_lat, end_lon)) for one trip row."""
    origin = (row['start_latit'], row['start_longt'])
    destination = (row['end_latit'], row['end_longt'])
    return origin, destination


# Straight-line distance between trip start and end, computed row by row with
# two different formulas so they can be compared with the stored `distance`.
# Haversine distance in kilometres:
df['haversine_distance'] = df.apply(
    lambda row: hs.haversine(*_trip_endpoints(row), unit=Unit.KILOMETERS),
    axis=1,
)
# geopy geodesic distance in kilometres:
df['geodesic_distance'] = df.apply(
    lambda row: GD(*_trip_endpoints(row)).km,
    axis=1,
)

In [None]:
# Display the stored distance next to the two computed distances, together
# with ride status, trip type, vehicle class and partner.
df.loc[:,['distance','haversine_distance','geodesic_distance','ride_status','trip_type','vehicle_class','partner']]


In [None]:
# Columns needed for the routing comparison: coordinates, stored distance and
# duration, geodesic distance, dispatch amount/currency and the origin timezone.
trip_columns = [
    'start_latit', 'start_longt',
    'end_latit', 'end_longt',
    'distance', 'duration',
    'geodesic_distance',
    'dispatch_amount', 'dispatch_currency',
    'from_timezone_str',
]
df_trips = df.loc[:, trip_columns]
# Stored distance divided by 1000 (presumably metres -> kilometres; confirm
# the unit of `distance` against the query).
df_trips['saved_distance'] = df['distance'] / 1000.0
df_trips

In [None]:
# Split the trip coordinates into start (source) and end (destin) frames.
source = df_trips.loc[:,['start_latit','start_longt']]
destin = df_trips.loc[:,['end_latit','end_longt']]
# Display the row labelled 0 as a sanity check.
source.loc[0]

In [None]:
# Display the start latitude of the row labelled 0.
source.loc[0,'start_latit']

In [None]:
# Build "longitude,latitude" strings for the row labelled 0. Longitude comes
# first because the string is substituted into the OSRM route URL below.
start = f"{df_trips.loc[0,'start_longt']},{df_trips.loc[0,'start_latit']}"
end = f"{df_trips.loc[0,'end_longt']},{df_trips.loc[0,'end_latit']}"
# Display both strings.
start
end

In [None]:
# Query the OSRM server for the driving route between `start` and `end`.
# `annotations=nodes` requests the per-leg node list that a later cell reads
# from the response.
url = f'http://router.project-osrm.org/route/v1/driving/{start};{end}?alternatives=false&annotations=nodes'

headers = { 'Content-type': 'application/json'}
# requests applies no timeout by default; without one a stalled connection
# would hang the notebook indefinitely.
r = requests.get(url, headers = headers, timeout=30)
print("Calling API ...:", r.status_code) # Status Code 200 is success

In [None]:
# Decode the OSRM response and extract what the later cells use.
routejson = r.json()
first_route = routejson['routes'][0]  # only the first returned route is used
# Node list of the first leg (later looked up one by one on the OSM API).
route_nodes = first_route['legs'][0]['annotation']['nodes']
# Route length; the batch loop divides this field by 1000 (presumably metres -> km).
route_distance = first_route["distance"]
# Route duration; treated as seconds when converted to a timedelta below.
route_duration = first_route["duration"]

In [None]:
# Interpret the routed duration as seconds and show it in several forms:
# the timedelta object, its string form, and the raw number.
td = timedelta(seconds=route_duration)
td
str(td)
route_duration

In [None]:

# Check that the timedelta string form embeds cleanly in an f-string.
f"{str(td)}"


In [None]:
# Number of rows in the loaded dataframe.
len(df)

In [None]:
# Pre-create the three result columns filled with zeros, one row per row of
# `df` (`df_trips` was built from all rows of `df`).
df_trips.loc[:, ['route_distance', 'route_duration','saved_duration']] = np.zeros(shape=(len(df),3))
# The two duration columns are cast to object because the routing loop below
# stores strings in them ('N/A' or the string form of a timedelta).
df_trips['route_duration'] = df_trips['route_duration'].astype('object')
df_trips['saved_duration'] = df_trips['saved_duration'].astype('object') 
# Display the frame and its dtypes to confirm the casts.
df_trips
df_trips.dtypes

In [None]:

# Same conversion as for the routed duration, applied to the stored duration
# of the row labelled 0.
#td = timedelta(seconds=route_duration)
df.loc[0,'duration']
# float() is applied before building the timedelta (the stored value's type
# is not visible here).
td = timedelta(seconds=float(df.loc[0,'duration']))
str(td)


In [None]:
# Display the start longitude and stored duration of the row labelled 0,
# then the whole trips frame.
df_trips.loc[0,'start_longt']
df_trips.loc[0,'duration']
df_trips

In [None]:
from tqdm import tqdm

# Query OSRM once per trip and record the routed distance and duration next
# to the stored values. Rows that cannot be routed get sentinel values
# (-100.0 / 'N/A') instead of aborting the whole run.
headers = { 'Content-type': 'application/json'}  # loop-invariant, built once
# Iterate over the actual index labels so `.loc` stays valid even if the
# frame does not carry a default 0..n-1 index.
for i in tqdm(df_trips.index):
    # OSRM expects "longitude,latitude" pairs.
    start = f"{float(df_trips.loc[i,'start_longt'])},{float(df_trips.loc[i,'start_latit'])}"
    end   = f"{float(df_trips.loc[i,'end_longt'])},{float(df_trips.loc[i,'end_latit'])}"
    url = f'http://router.project-osrm.org/route/v1/driving/{start};{end}?alternatives=false&annotations=nodes'
    try:
        # requests applies no timeout by default; one stalled connection
        # would otherwise block the entire loop.
        r = requests.get(url, headers = headers, timeout=30)
        # Non-2xx responses (e.g. no route found, rate limiting) do not
        # contain a usable "routes" list.
        r.raise_for_status()
        routejson = r.json()
        route_dist = routejson["routes"][0]["distance"]
        route_dura = routejson["routes"][0]["duration"]
    except (requests.exceptions.RequestException, ValueError, KeyError, IndexError):
        # Network failure, HTTP error, undecodable body, or a response
        # without routes: mark the row as unrouted and move on.
        df_trips.loc[i,'route_distance'] = -100.0
        df_trips.loc[i,'route_duration'] = 'N/A'
        df_trips.loc[i,'saved_duration'] = 'N/A'
        continue

    df_trips.loc[i,'route_distance'] = route_dist/1000
    df_trips.loc[i,'route_duration'] = str(timedelta(seconds=route_dura))
    df_trips.loc[i,'saved_duration'] = str(timedelta(seconds=float(df_trips.loc[i,'duration'])))

#df['route_distance'] = df.apply(lambda x: get_route_distance((x['start_latit'],x['start_longt']),(x['end_latit'],x['end_longt'])), axis=1)  # not working since it is not vectorized  (need to create UFUNC)


In [None]:
# Display stored and routed distance/duration side by side.
df_trips.loc[:,['distance','saved_duration','route_distance','route_duration']]

In [None]:
# Copy the routing results from the trips frame back onto the full dataframe
# (pandas aligns the assignment on the index).
for routed_col in ('route_duration', 'route_distance', 'saved_duration'):
    df[routed_col] = df_trips[routed_col]
# Stored distance divided by 1000 (presumably metres -> kilometres).
df['saved_distance'] = df['distance'] / 1000.0



In [None]:
# Re-copy `saved_distance` from df into the trips frame. The same
# `distance / 1000.0` value was already assigned when df_trips was built, so
# this looks redundant unless df changed in between (NOTE(review): confirm).
df_trips['saved_distance'] = df['saved_distance']
df_trips

In [None]:
### keeping every third element in the node list (indices 1, 4, 7, ...) to optimise time
route_list = route_nodes[1::3]

# Resolve each kept node id to a (lat, lon) pair through the OSM API.
# Coordinates are kept as the strings found in the XML; a later cell casts
# them to float. Nodes that cannot be resolved are skipped.
coordinates = []

for node in tqdm(route_list):
    url = 'https://api.openstreetmap.org/api/0.6/node/' + str(node)
    try:
        # requests applies no timeout by default; avoid hanging on one node.
        r = requests.get(url, headers = headers, timeout=30)
        r.raise_for_status()
        node_elem = ET.fromstring(r.text).find('node')
        if node_elem is None:
            # No <node> element in the response: skip instead of re-using
            # the coordinates left over from the previous iteration.
            continue
        lat, long = node_elem.attrib['lat'], node_elem.attrib['lon']
    except (requests.exceptions.RequestException, ET.ParseError, KeyError):
        # Network/HTTP failure, unparsable XML, or a node without lat/lon.
        # Only these are swallowed, so Ctrl-C still interrupts the loop.
        continue
    coordinates.append((lat, long))
print(coordinates[:10])

In [None]:
# One row per resolved node: a running node number plus its coordinate tuple.
df_out = pd.DataFrame({
    'Node': np.arange(len(coordinates)),
    'coordinates': coordinates,
})
# Expand each coordinate tuple into separate 'lat' and 'long' columns.
lat_long_frame = pd.DataFrame(df_out['coordinates'].tolist())
df_out[['lat', 'long']] = lat_long_frame

In [None]:
# The coordinates were read from XML attributes as strings; cast to float
# before plotting.
for coord_col in ('lat', 'long'):
    df_out[coord_col] = df_out[coord_col].astype(float)

# Scatter the route nodes on a map.
color_scale = [(0, 'red'), (1,'green')]  # defined but not passed to the figure below
fig = px.scatter_mapbox(df_out, lat="lat", lon="long", zoom=8, height=600, width=900)

# Use the open-street-map style and remove the figure margins; the returned
# figure is the cell's displayed output.
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r":0,"t":0,"l":0,"b":0},
)

In [None]:
# Persist the enriched full dataframe and the trips frame as CSV files under
# ../data (default to_csv options, so the index is written as well).
for frame, csv_path in (
    (df, "../data/pricing_training_dataset_sample1.csv"),
    (df_trips, "../data/pricing_training_dataset_trip_sample1.csv"),
):
    frame.to_csv(csv_path)

In [None]:
#| hide
# Run nbdev's export step again at the end of the notebook.
import nbdev; nbdev.nbdev_export()