# data database scraper
> database scraper

In [None]:
#| default_exp data.database.scraper

In [None]:
#| hide
from nbdev.showdoc import *
from pyasn1_modules.rfc3279 import id_fieldType

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()

In [None]:
#| hide
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
#| export
import os
import pandas as pd 

In [None]:
#| export
from sqlalchemy import MetaData, create_engine, asc, desc, and_, or_, not_, case, extract, cast, text, distinct, Column, update, bindparam
from sqlalchemy.types import DateTime, Date, Time, String
from sqlalchemy.schema import *
from sqlalchemy.sql import func as F, Selectable, select
from sqlalchemy.dialects import registry
from sqlalchemy.engine.row import Row
from sqlalchemy.orm import Session
from sqlalchemy.orm import sessionmaker


In [None]:
#| export
import requests
from tqdm.notebook import tqdm

In [None]:
#| export
from google.oauth2 import service_account
from google.oauth2.service_account import Credentials
from geopy.timezone import from_timezone_name

In [None]:
#| hide
from google.cloud import bigquery
from dotenv import load_dotenv


In [None]:

load_dotenv("../.env")

In [None]:
# Check only for the variable this notebook needs. Echoing the whole of
# os.environ would write every secret in the environment into the saved
# notebook output.
'GC_QUOTE_API_CREDENTIALS' in os.environ

In [None]:
key = os.getenv('GC_QUOTE_API_CREDENTIALS')

In [None]:
credential = service_account.Credentials.from_service_account_file("../" + key)


In [None]:
# Register the BigQuery dialect under the 'bigquery' URL scheme, then build an
# engine for project quote-api-365206 that authenticates with the credentials
# file at '../' + key.
registry.register('bigquery', 'sqlalchemy_bigquery', 'BigQueryDialect')
engine = create_engine('bigquery://quote-api-365206',
                       credentials_path='../' + key,)


In [None]:
Session = sessionmaker(bind=engine)
session = Session()

In [None]:
metadata = MetaData()
def get_table(project_name: str, dataset_name: str, table_name: str)-> Table:
    """Reflect the table ``project_name.dataset_name.table_name``.

    The table definition is loaded through the notebook-level ``engine`` and
    registered on the notebook-level ``metadata``.
    """
    qualified_name = f'{project_name}.{dataset_name}.{table_name}'
    return Table(qualified_name, metadata, autoload_with=engine)

In [None]:
connection = engine.connect()
connection

# local table from local sqlite database

In [None]:
data_from_date = '2024-01-01'

In [None]:
# Open the local SQLite database and reflect the raw training table.
# NOTE: metadata, Session and session are rebound here, replacing the objects
# bound to the BigQuery engine in earlier cells.
sql_eng = create_engine('sqlite:///../data/price_training_raw.db', echo=False)
conn = sql_eng.connect()
metadata = MetaData()
raw = Table('price_training_raw_2024_usd', metadata, autoload_with=sql_eng)
Session = sessionmaker(bind=sql_eng)
session = Session()
# Overwritten with 1000 in a later cell before the chunked read runs.
chunk_size = 100

In [None]:
metadata2 = MetaData()
fix_zones = Table('price_training_dispatch_fixed_zones', metadata2, autoload_with=sql_eng)

In [None]:
total_rows = session.query(raw).count()
total_rows

# Two new columns for the fixed-price zones, start_zone and end_zone, were inserted when creating the SQL table from the CSV file

In [None]:
# # engine.execute('ALTER TABLE price_training_raw ADD COLUMN start_zone TEXT')
# # engine.execute('ALTER TABLE price_training_raw ADD COLUMN end_zone TEXT')
# start_zone_column = Column('start_zone', String)
# end_zone_column = Column('end_zone', String)
# add_column_op = AddColumn(start_zone_column, raw)


In [None]:

url = 'https://j1j495o5pk.execute-api.us-east-2.amazonaws.com/upncoming/ride-pricings'


In [None]:
# Query the table in chunks
query = session.query(raw)
chunk_size =1000

In [None]:

# Parameterised UPDATE on `raw`: sets route_start / route_end on the row whose
# dispatch_id equals the bound value 'b_dispatch_id'. Built once so it can be
# executed with a list of parameter dictionaries.
batch_stmt = update(raw).where(
    raw.c.dispatch_id == bindparam('b_dispatch_id')
).values(
    route_start=bindparam('route_start'),
    route_end=bindparam('route_end'),
)
# Show the generated SQL and the (still empty) bind parameters.
print(batch_stmt)
compiled = batch_stmt.compile()
print(compiled.params)


In [None]:
result_csv = '../data/dispatch_fixed_zones_all.csv'
# Seconds to wait on the pricing API. requests waits indefinitely unless a
# timeout is passed, so without it one stalled connection would hang the whole
# run and the Timeout handler below could never fire.
request_timeout = 30
for chunk in tqdm(pd.read_sql(query.statement, conn, index_col='dispatch_id', chunksize=chunk_size), total=total_rows//chunk_size+1, desc='Overall Processing'):
    # (dispatch_id, from zone, to zone) for each row of this chunk for which
    # the API returned a fixed-price detail.
    route_list = []
    # len(chunk) rather than chunk_size: the last chunk can be shorter.
    for i, r in tqdm(chunk.iterrows(), total=len(chunk), desc='Chunk Processing', leave=False):
        params = {
            'from_lat': r['start_ltt'],
            'from_lng': r['start_lng'],
            'to_lat': r['end_ltt'],
            'to_lng': r['end_lng'],
        }
        # Every failure for a row is reported and the row skipped, so a single
        # bad response does not abort the scrape.
        try:
            response = requests.get(url=url, params=params, timeout=request_timeout)
        except requests.exceptions.Timeout:
            print('Timeout')
            continue
        except requests.exceptions.TooManyRedirects:
            print('TooManyRedirects')
            continue
        except requests.exceptions.RequestException as e:
            print('RequestException, Catastrophic error!')
            continue
        except Exception as e:
            print(f"request: {e}")
            continue

        try:
            res = response.json()
        except Exception as e:
            print(f"json: {e}")
            continue

        # Only the first fleet / first vehicle class of the response is read.
        try:
            fix_price_zones = res['fleets'][0]['vehicle_classes'][0]['price_detail']['base_pricing']['fix_price_detail']
        except KeyError:
            print("No Fixed Price!")
            continue
        except IndexError:
            print("IndexError for fix_price_zones")
            continue
        except Exception as e:
            print(f"dict: {e}")
            continue

        if not isinstance(fix_price_zones, dict):
            print(f"No fix price: {fix_price_zones}")
            continue

        try:
            route_list.append((i, fix_price_zones['from'], fix_price_zones['to']))
        except KeyError:
            print("KeyError for route")

    # Append this chunk's routes to the CSV. No header is written, and the
    # DataFrame's default row index is written as the first column, so each
    # line is: row number, dispatch_id, route_start, route_end.
    try:
        pd.DataFrame(data=route_list, columns=['dispatch_id', 'route_start', 'route_end']).to_csv(result_csv, mode='a', header=False)
    except Exception as e:
        print(f"csv: {e}")
        continue
    
    # with sqlite_eng.begin() as conn:
    #     conn.execute(
    #         stmt, 
    #         [
    #             {
    #                 'b_dispatch_id': i,
    #                 'route_start': r['route_start'],
    #                 'route_end': r['route_end']
    #             }
    #             for i,r in chunk.iterrows()
    #         ],
    #     )
    #     conn.commit()


In [None]:
# NOTE(review): `result` is not assigned in any of the visible cells, so this
# cell raises NameError on a fresh top-to-bottom run — confirm where it was
# meant to come from.
len(result)
result[0]

In [None]:

# Apply the prepared batch UPDATE, one parameter set per row of `res`.
# NOTE(review): in the visible cells `res` is only ever assigned a parsed JSON
# response, which has no iterrows(); presumably a DataFrame indexed by
# dispatch_id with route_start / route_end columns is intended — confirm.
#
# engine.begin() commits when the block exits without error and rolls back
# otherwise, so no explicit commit() is needed. A separate name is used so the
# notebook-level `conn` that the chunked read relies on is not rebound to a
# connection that is closed once this block ends.
with sql_eng.begin() as update_conn:
    update_conn.execute(
        batch_stmt,
        [
            {
                'b_dispatch_id': i,
                'route_start': r['route_start'],
                'route_end': r['route_end']
            }
            for i, r in res.iterrows()
        ],
    )


In [None]:
# NOTE(review): this cell looks like an unfinished draft of the chunked scrape;
# several names it uses are not defined in the visible cells before it —
# confirm whether it is still needed.
# NOTE(review): local_conn is not assigned in any visible cell.
for chunk in pd.read_sql(query.statement, local_conn, chunksize=20):
    # [chunk[r] for r in chunk]
    # l = [r for r in chunk.iterrows()]
    # l
    for i,r in chunk.iterrows():
        # l = [r[0], r[1]['start_ltt'], r[1]['start_lng'], r[1]['end_ltt'], r[1]['end_lng']]
        # Query parameters for the pricing API: start and end coordinates of the row.
        params = {
            'from_lat': r['start_ltt'],
            'from_lng': r['start_lng'],
            'to_lat': r['end_ltt'],
            'to_lng': r['end_lng'],
        }
        # No timeout is passed to requests.get here.
        try:
            response = requests.get(url=url, params=params)
        except requests.exceptions.Timeout:
            print('Timeout')
            continue
        except requests.exceptions.TooManyRedirects:
            print('TooManyRedirects')
            continue
            # Tell the user their URL was bad and try a different one
        except requests.exceptions.RequestException as e:
            print('RequestException, Catastrophic error!')
            continue
            # catastrophic error. bail.
            # raise SystemExit(e)
        
        except Exception as e:
            print(f"request: {e}")
            continue
        # print('2')
        try:
            res = response.json()
        except Exception as e:
            print(f"json: {e}")
            continue
        # print('3')
        # Only the first fleet / first vehicle class of the response is read.
        try:
            fix_price_zones = res['fleets'][0]['vehicle_classes'][0]['price_detail']['base_pricing']['fix_price_detail']
        except KeyError as e:
            # NOTE(review): j is first assigned (j = 0) in the next cell; if
            # that cell has not run yet this line raises NameError.
            j = j +1
            print(f"{j} Not Fixed Price!")
            continue
        except IndexError as e:
            print("IndexError for fix_price_zones")
            continue
        except Exception as e:
            print(f"dict: {e}")
            continue
        # print('4')
        if not isinstance(fix_price_zones,dict):
            print(f"No fix price: {fix_price_zones}")
        else:
            try:
                # route = (fix_price_zones['from'], fix_price_zones['to'],pt._mapping['dispatch_id'])
                # fix_zone_routes_list.append(route)
                # Expression only: the value is read and discarded, nothing is stored.
                chunk.at[i, 'route_start']
                
                # ins = insert(fixed_zone_routes).values(
                #     start=fix_price_zones['from'], end=fix_price_zones['from'], dispatch_id= pt        ._mapping['dispatch_id'])
                # Rebinds the loop variable; iterrows() assigns i again on the next row.
                i = i +1
            except KeyError as e:
                print("KeyError for route")
                continue
        
        # This statement sits inside the per-row loop, so it runs for every row
        # that reaches it. NOTE(review): price_training_t is not defined in the
        # visible cells.
        price_training_q = session.query(price_training_t).limit(500)
# Accumulator for (start, end, dispatch_id) tuples, filled by the next cell.
fix_zone_routes_list = []


In [None]:
# Query the pricing API for every row of price_training_q and store the
# fixed-price zone pair of each priced row in the 'fixed_zone_routes' table.
# sqlite_eng is created in a later cell of this notebook, so that cell must
# have been run before this one.
i = 0  # routes collected so far
j = 0  # rows whose response had no fixed-price detail
for pt in price_training_q:
    params = {
        'from_lat': pt._mapping['start_ltt'],
        'from_lng': pt._mapping['start_lng'],
        'to_lat': pt._mapping['end_ltt'],
        'to_lng': pt._mapping['end_lng'],
    }
    try:
        # requests waits indefinitely unless a timeout is given; without it the
        # Timeout handler below could never fire.
        response = requests.get(url=url, params=params, timeout=30)
    except requests.exceptions.Timeout:
        print('Timeout')
        continue
    except requests.exceptions.TooManyRedirects:
        print('TooManyRedirects')
        # Skip the row. Falling through would parse the previous row's
        # response (or raise NameError on the first row) and record a wrong
        # route for this dispatch_id.
        continue
    except requests.exceptions.RequestException as e:
        print('RequestException, Catastrophic error!')
        # catastrophic error. bail.
        raise SystemExit(e)
    except Exception as e:
        print(f"request: {e}")
        continue

    try:
        res = response.json()
    except Exception as e:
        print(f"json: {e}")
        continue

    # Only the first fleet / first vehicle class of the response is read.
    try:
        fix_price_zones = res['fleets'][0]['vehicle_classes'][0]['price_detail']['base_pricing']['fix_price_detail']
    except KeyError:
        j = j + 1
        print(f"{j} Not Fixed Price!")
        continue
    except IndexError:
        print("IndexError for fix_price_zones")
        continue
    except Exception as e:
        print(f"dict: {e}")
        continue

    if not isinstance(fix_price_zones, dict):
        print(f"No fix price: {fix_price_zones}")
        continue

    try:
        route = (fix_price_zones['from'], fix_price_zones['to'], pt._mapping['dispatch_id'])
        fix_zone_routes_list.append(route)
        i = i + 1
    except KeyError:
        print("KeyError for route")
        continue

    # Flush to SQLite every 50 collected routes so progress survives an
    # interrupted run.
    if i % 50 == 0:
        df = pd.DataFrame(fix_zone_routes_list, columns=['start', 'end', 'dispatch_id'])
        df.to_sql('fixed_zone_routes', sqlite_eng, if_exists='append')
        fix_zone_routes_list = []
        print(f"Created {i} records")

# Flush whatever is left after the last full batch of 50.
df = pd.DataFrame(fix_zone_routes_list, columns=['start', 'end', 'dispatch_id'])
df.to_sql('fixed_zone_routes', sqlite_eng, if_exists='append')
fix_zone_routes_list = []
print(f"Created {i} records")
    # print('6')
    # print(pt._mapping['ride_id'], pt._mapping['dispatch_amount'], pt._mapping['dispatch_currency'])
    # # print(pt._mapping['start_place'])
    # print(pt._mapping['start_lng'], pt._mapping['start_ltt'])
    # # print(pt._mapping['end_place']) 
    # print(pt._mapping['end_lng'], pt._mapping['end_ltt'])
    # print("------------")

# Use ORM to retrieve records


In [None]:
sample_count = session.query(F.count(price_training_t.c.ride_id)).scalar()
print(sample_count)


In [None]:
url = 'https://j1j495o5pk.execute-api.us-east-2.amazonaws.com/upncoming/ride-pricings'

In [None]:

# params = {
#     'from_lat': 37.61911449999999,
#     'from_lng':-122.3816274,
#     'to_lat':37.3635295,
#     'to_lng':-121.9285932,
#     'from_utc':1727352000,
# }
# response = requests.get(url=url, params=params)
# response.json()


In [None]:
# res = response.json()
# fix_price_zones = res['fleets'][0]['vehicle_classes'][0]['price_detail']['base_pricing']['fix_price_detail']
# print(fix_price_zones)

In [None]:
# ride_samples = []

In [None]:
# pt = price_training_q.first()
# pt

In [None]:
# params = {
#     'from_lat': (pt._mapping['start_ltt']),
#     'from_lng': (pt._mapping['start_lng']),
#     'to_lat': (pt._mapping['end_ltt']),
#     'to_lng': (pt._mapping['end_lng']),
#     # 'from_utc': int(pt._mapping['from_utc']),
# }
# params

# response = requests.get(url=url, params=params)
# res = response.json()
# res

In [None]:
from sqlalchemy import String,Integer,insert

In [None]:
fix_zone_routes_list

In [None]:
# Pull the first ten rows, keep them in ride_samples and print the id, price
# and start / end coordinates of each.
price_training_q = session.query(price_training_t).limit(10)
ride_samples = []
for pt in price_training_q:
    ride_samples.append(pt)
    print(*(pt._mapping[col] for col in ('ride_id', 'dispatch_amount', 'dispatch_currency')))
    print(*(pt._mapping[col] for col in ('start_lng', 'start_ltt')))
    print(*(pt._mapping[col] for col in ('end_lng', 'end_ltt')))
    print("------------")



In [None]:
ride_samples[0]._mapping

In [None]:
df = pd.DataFrame(ride_samples)
df

In [None]:

rides_q = session.query(price_training_t).limit(10)

In [None]:

for ride in rides_q:
    print(ride)

In [None]:
rides_q = session.query(ride_ride_t).limit(10)
for ride in rides_q:
    print(ride)

In [None]:
# print(price_training_t)

In [None]:
# use core to retrieve records
# rp = connection.execute(price_training_t)
# for i, record in enumerate(rp):
#     print(i, record.ride_id)

In [None]:
# results = rp.fetchall()
# results

In [None]:
# df = pd.read_sql(price_training_t, engine)
# df

In [None]:
sqlite_eng = create_engine('sqlite:///../data/price_training_from_gbq_raw.db', echo=False)

In [None]:
df.to_sql('price_training_orlando_mpv5', sqlite_eng, if_exists='replace')

# Pandas Processing


## Traffic peak time exclusion
Source: https://www.quora.com/What-is-the-trickiest-time-of-the-day-to-drive-in-Orlando

In [None]:
from workalendar.usa import Florida

In [None]:
cal_orlando = Florida()


In [None]:
exclude_dates = [d[0] for d in cal_orlando.holidays(2024)]
exclude_dates

In [None]:
from workalendar.usa import Florida 
import numpy as np
cal_florida = Florida()
exclude_dates_str = [str(d[0]) for d in cal_florida.holidays(2024)]
exclude_dates_str

In [None]:
exclude_dates = [d[0] for d in cal_florida.holidays(2024)]
# exclude_dates
# res = df['from_datetime_utc'].apply(lambda x: x in exclude_dates)

In [None]:
# np.any(res.apply(lambda x: x in exclude_dates))
# res[0]=True
# res
# np.any(res)


In [None]:
from pricing.data.utils import validate_datetime_in_iso_format, validate_timezone_in_iana, get_timezone_abbreviation, fix_timezone

In [None]:
df_invalid_datetime = df[df.apply(lambda x: not validate_datetime_in_iso_format(x['from_datetime_fix_str']), axis=1)]
df_invalid_datetime

In [None]:
df_invalid_timezone = df[df.apply(lambda x: not validate_timezone_in_iana(x['from_timezone_str']), axis=1)]
df_invalid_timezone

In [None]:
#orlando_airport = pd.read_csv('../../data/orlando_all_output.csv')
#orlando_airport.head()
#orlando_airport.dtypes
#orlando_airport.to_sql('orlando_airport', sqlite_eng, if_exists='append')

In [None]:
df['from_timezone_fix_str'] = df.apply(lambda x: fix_timezone(x['from_timezone_str']), axis=1) 
df

In [None]:

df_valid_datetime = df[df.apply(lambda x: validate_datetime_in_iso_format(x['from_datetime_fix_str']), axis=1)]
df_valid_timezone = df_valid_datetime[df_valid_datetime.apply(lambda x: validate_timezone_in_iana(x['from_timezone_fix_str']), axis=1)]
df_valid_timezone


In [None]:

df_valid_timezone.loc[df_invalid_timezone.index, ['from_timezone_str', 'from_timezone_fix_str']]


In [None]:
from datetime import datetime
import pytz
# UTC offset string (strftime '%z') for the timezone of each row.
# NOTE(review): the offset is taken at the moment this cell runs
# (datetime.now), not at the ride's own datetime, so it reflects the DST state
# at run time — confirm this is intended.
df_utc_offset = (df_valid_timezone.apply(lambda x: datetime.now(pytz.timezone(x['from_timezone_fix_str'])), axis=1)
                            .apply(lambda x: x.strftime('%z')))
df_utc_offset.name = 'utc_offset'
df_utc_offset


In [None]:
df_dt_str = df_valid_timezone['from_datetime_fix_str']
df_valid_timezone['from_datetime_local'] = df_valid_timezone.apply(lambda x: (pd.to_datetime(x['from_datetime_fix_str']).to_datetime64()), axis=1)
df_valid_timezone

In [None]:
df_valid_timezone.dtypes

In [None]:

# df_valid_timezone['from_datetime_local_tz'] = df_valid_timezone.apply(lambda x: pytz.timezone(x.loc[:,'from_timezone_str']).localize(x.loc[:,'from_datetime_local']), axis=1)
df_valid_timezone['from_timezone'] = df_valid_timezone.apply(lambda x: pytz.timezone(x['from_timezone_fix_str']), axis=1)
df_valid_timezone


In [None]:
df_valid_timezone['from_datetime_tz'] = df_valid_timezone.apply(lambda x: x['from_timezone'].localize(x['from_datetime_local']), axis=1)
df_valid_timezone
                                    # .apply(lambda x: x.localize(x.loc[:,'from_timezone_str']), axis=1))
# df_valid_timezone


# filtering out peak traffic time

In [None]:
peak_time_str = [('07:00:00', '09:00:00'), ('16:00:00', '19:00:00')]
night_time_str = [('22:00:00', '6:00:00')]  # Shanghai, US usually no overtime extra fees New York 8pm ~ 6am
ind = []
td = []
for pt in peak_time_str:
    ind.append(pd.DatetimeIndex(pt))
ind
for i in ind:
    i[1]-i[0]

In [None]:
peak_time = []
for pt in peak_time_str:
    peak_time.append(pd.date_range(pt[0], pt[1], freq='h'))
for pt in peak_time:
    print(pt, pt.time)

In [None]:
peak_time_str

In [None]:

# Rides whose local time of day falls in either peak window. The time is
# compared as a zero-padded 'HH:MM:SS' string, and between() includes both
# end points.
df_peak_traffic_time = df_valid_timezone[
    df_valid_timezone['from_datetime_tz']
    .apply(lambda x: x.strftime('%H:%M:%S'))
    .pipe(lambda t: t.between(*peak_time_str[0]) | t.between(*peak_time_str[1]))
]
df_peak_traffic_time

In [None]:

# Complement of the peak selection: rides whose local time of day is in
# neither peak window.
# NOTE(review): the round-trip filter further down starts again from
# df_valid_timezone, not from this frame — confirm whether the peak exclusion
# is meant to reach the training data.
df_out_of_peak_traffic_time = df_valid_timezone[
    df_valid_timezone['from_datetime_tz']
    .apply(lambda x: x.strftime('%H:%M:%S'))
    .pipe(lambda t: ~t.between(*peak_time_str[0]) & ~t.between(*peak_time_str[1]))
]
df_out_of_peak_traffic_time


In [None]:

# df_utc_offset = (df_valid_timezone.apply(lambda x: datetime.now(pytz.timezone(x.loc['timezone'])))
#                             .apply(lambda x: x.strftime('%z')))
# df_utc_offset.name = 'utc_offset'
# df_utc_offset
# 

# Filter out round trips (time reservations), identified by the feature distance = 1 (m)

In [None]:
df_no_round_trip = df_valid_timezone[df_valid_timezone['distance'] > 1]

In [None]:
# Keep only the columns used for training and derive the unit price:
# dispatch_amount / distance * 100.
# NOTE(review): the heading above this section gives distance in metres
# ("distance = 1 (m)"); if that is the unit, this ratio is per metre rather
# than per km — confirm the units of distance and dispatch_amount.
df_training = df_no_round_trip.loc[:, [
    'ride_id', 'trip_type', 'trip_no', 'trip_count', 'ride_status', 'partner', 'fleet',
    'start_place', 'end_place',
    'passenger_count', 'luggage_count',
    'dispatch_amount', 'dispatch_currency',
    'distance', 'duration', 'vehicle_class',
    'from_datetime_tz',
]].assign(
    cent_price_per_km=lambda d: d['dispatch_amount'] / d['distance'] * 100.0
)
df_training

In [None]:
def average_cent_per_km(x):
    """Return a copy of *x* with an ``average_cent_per_km`` column added.

    Every row of the new column holds the mean of ``x['cent_price_per_km']``.
    Written for use with ``groupby(...).apply``: the frame passed in is left
    untouched, because pandas does not support functions that mutate the
    group they are given.
    """
    return x.assign(average_cent_per_km=x['cent_price_per_km'].mean())

In [None]:
def fleet_trip_no(x):
    """Return a copy of *x* with a ``fleet_trip_count`` column added.

    Every row of the new column holds the number of rows in *x*. Written for
    use with ``groupby(...).apply``: the frame passed in is left untouched,
    because pandas does not support functions that mutate the group they are
    given.
    """
    return x.assign(fleet_trip_count=len(x))

In [None]:
df_fleet_statistics = df_training.loc[:, ['ride_id','fleet']]
df_fleet_statistics

In [None]:

# Number of rows per fleet, largest fleet first. aggregate([len]) yields a
# two-level column index, hence the ('ride_id', 'len') sort key.
df_fleet_trip_no = (
    df_fleet_statistics.groupby('fleet')
    .aggregate([len])
    .sort_values(by=('ride_id', 'len'), ascending=False)
)
df_fleet_trip_no


In [None]:
df_big_fleets = df_fleet_trip_no[df_fleet_trip_no[('ride_id','len')] >100]
df_big_fleets

In [None]:
df_big_fleet_data = []
for f in df_big_fleets.index:
    print(f)
    df_big_fleet_data.append(df_training[df_training['fleet'] == f])

In [None]:
df_big_fleet_data[0]

In [None]:
df_big_fleet_data[1]

In [None]:
df_big_fleet_data[2]

In [None]:

# df_training['average_cent_per_km'] = df_training.groupby('fleet').apply(average_cent_per_km)
df_analysis = df_training.groupby('fleet').apply(average_cent_per_km)
df_analysis


In [None]:

df_training

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()