# data database scraper
> database scraper

In [None]:
#| default_exp data.database.scraper

In [None]:
#| hide
from nbdev.showdoc import *
from pyasn1_modules.rfc3279 import id_fieldType

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()

In [None]:
#| hide
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
#| export
import os
import pandas as pd 

In [None]:
#| export
from sqlalchemy import MetaData, create_engine, asc, desc, and_, or_, not_, case, extract, cast, text, distinct
from sqlalchemy.types import DateTime, Date, Time
from sqlalchemy.schema import *
from sqlalchemy.sql import func as F, Selectable, select
from sqlalchemy.dialects import registry
from sqlalchemy.engine.row import Row
from sqlalchemy.orm import Session
from sqlalchemy.orm import sessionmaker


In [None]:
#| export
import requests
from tqdm import tqdm

In [None]:
#| export
from google.oauth2 import service_account
from google.oauth2.service_account import Credentials
from geopy.timezone import from_timezone_name

In [None]:
#| hide
from google.cloud import bigquery
from dotenv import load_dotenv


In [None]:

load_dotenv("../.env")

In [None]:
# NOTE(review): as the last expression of the cell this displays every environment
# variable in the notebook output - presumably including values just loaded from
# ../.env. Confirm outputs are cleared before the notebook is shared or committed.
os.environ

In [None]:
key = os.getenv('GC_QUOTE_API_CREDENTIALS')

In [None]:
credential = service_account.Credentials.from_service_account_file("../" + key)


In [None]:
registry.register('bigquery', 'sqlalchemy_bigquery', 'BigQueryDialect')
engine = create_engine('bigquery://quote-api-365206',
                       credentials_path='../' + key,)


In [None]:
Session = sessionmaker(bind=engine)

In [None]:
session = Session()

In [None]:
metadata = MetaData()
def get_table(project_name: str, dataset_name: str, table_name: str)-> Table:
    """Reflect a single table through the module-level ``engine``.

    The three name parts are combined into one dotted
    ``project.dataset.table`` identifier. The column definitions are loaded
    from the database (``autoload_with=engine``) and the resulting ``Table``
    is registered on the module-level ``metadata``.
    """
    qualified_name = '{}.{}.{}'.format(project_name, dataset_name, table_name)
    return Table(qualified_name, metadata, autoload_with=engine)

In [None]:
connection = engine.connect()

In [None]:
connection

# find all tables in the database

In [None]:
# Reflect the warehouse tables used by the queries in this notebook. Each
# reflected table is wrapped in .alias() before it is bound to its name.
(
    ride_trip_t,
    ride_dispatch_t,
    ride_enum_t,
    ride_ride_t,
    ride_partner_tran_t,
    ride_partner_t,
    dim_place_t,
    ride_vehicle_class_t,
) = (
    get_table('elife-data-warehouse-prod', dataset_name, table_name).alias()
    for dataset_name, table_name in (
        ('ods', 'ride_trip'),
        ('ods', 'ride_dispatch'),
        ('ods', 'ride_enum'),
        ('ods', 'ride_ride'),
        ('ods', 'ride_partner_tran'),
        ('ods', 'ride_partner'),
        ('dim', 'dim_place'),
        ('ods', 'ride_vehicle_class'),
    )
)

In [None]:
# Reflect the auction and fleet tables from the 'ods' dataset, each aliased.
auction_ride_t, auction_fleet_t, ride_fleet_t = (
    get_table('elife-data-warehouse-prod', 'ods', table_name).alias()
    for table_name in ('ride_auction_ride', 'ride_auction_fleet', 'ride_fleet')
)

In [None]:
# Pair each dispatch's ride_id with the trip_type value of the trip row that
# shares the same ride_id (inner join), exposed as 'trip_type_id'.
_dispatch_with_trip = ride_dispatch_t.join(
    ride_trip_t,
    ride_dispatch_t.c.ride_id == ride_trip_t.c.ride_id,
)
trip_type_id_t = (
    select(
        ride_dispatch_t.c.ride_id,
        ride_trip_t.c.trip_type.label('trip_type_id'),
    )
    .select_from(_dispatch_with_trip)
    .alias()
)
                  
# trip_type_id_t = trip_type_id_t.limit(10)
# df = pd.read_sql(trip_type_id_t, engine
# df

In [None]:

# trip_type_id_q = session.query(trip_type_id_t).limit(10)
# # for trip in trip_type_id_q:
#     print(trip)



In [None]:
# Attach the ride_enum name for each trip_type_id. The join is a LEFT OUTER
# join, so rows without a matching enum entry are kept.
_trip_type_with_enum = trip_type_id_t.join(
    ride_enum_t,
    trip_type_id_t.c.trip_type_id == ride_enum_t.c.id,
    isouter=True,
)
trip_type_t = (
    select(
        trip_type_id_t.c.ride_id,
        trip_type_id_t.c.trip_type_id,
        ride_enum_t.c.name.label('trip_type'),
    )
    .select_from(_trip_type_with_enum)
    .alias()
)


In [None]:
# Ride id together with its status code and the ride_enum name for that code
# (LEFT OUTER join on ride_ride.stat == ride_enum.id).
_ride_with_status_enum = ride_ride_t.join(
    ride_enum_t,
    ride_ride_t.c.stat == ride_enum_t.c.id,
    isouter=True,
)
ride_status_t = (
    select(
        ride_ride_t.c.id.label('ride_id'),
        ride_ride_t.c.stat.label('ride_status_id'),
        ride_enum_t.c.name.label('ride_status'),
    )
    .select_from(_ride_with_status_enum)
    .alias()
)
# ride_status_t = ride_status_t.limit(10)
# df = pd.read_sql(ride_status_t, engine)
# df

In [None]:
# Per-ride date/time helper columns derived from ride_ride.from_time_str and
# ride_ride.from_utc.
#
# SUBSTR/SUBSTRING takes (value, position, length). The time part is sliced as
# 5 characters starting at position 12; a length of 16 would also carry along
# anything after the minutes (e.g. seconds) and then ':00' would be appended on
# top of it.
# NOTE(review): assumes from_time_str looks like 'YYYY-MM-DD?HH:MM[...]'
# (date in chars 1-10, hours:minutes in chars 12-16) - confirm against the data.
_from_date_part = F.substring(ride_ride_t.c.from_time_str, 1, 10)
_from_hhmm_part = F.substring(ride_ride_t.c.from_time_str, 12, 5)

# '<date> <HH:MM>:00' - the local pickup datetime with seconds forced to zero.
_from_datetime_fix = F.concat(_from_date_part, ' ', _from_hhmm_part, ':00')

# from_utc is passed to TIMESTAMP_SECONDS, i.e. it is treated as epoch seconds.
_from_ts_utc = F.timestamp_seconds(ride_ride_t.c.from_utc)

ride_datetime_t = (
    select(
        ride_ride_t.c.id.label('ride_id'),
        _from_date_part.label('from_date_str'),
        F.concat(_from_hhmm_part, ':00').label('from_time_fix_str'),
        _from_datetime_fix.label('from_datetime_fix_str'),
        # Day of week from the local wall-clock string ...
        extract('DAYOFWEEK', cast(_from_datetime_fix, DateTime)).label('day_of_week_local'),
        # ... and from the UTC instant.
        extract('DAYOFWEEK', cast(_from_ts_utc, DateTime)).label('day_of_week_utc'),
        F.datetime(_from_ts_utc).label('from_datetime_utc'),
        ride_ride_t.c.from_timezone_str,
    )
    .select_from(ride_ride_t)
    .alias()
)
# ride_datetime_t = ride_datetime_t.limit(100)
# df = pd.read_sql(ride_datetime_t, engine)
# df

In [None]:
# Dispatch status per ride: status code plus the ride_enum name for that code
# (LEFT OUTER join on ride_dispatch.stat == ride_enum.id).
#
# The 'ride_id' column must come from ride_dispatch.ride_id. ride_dispatch.id is
# the dispatch's own id (it is selected elsewhere as 'dispatch_id'), and this
# subquery is joined on ride_ride.id == dispatch_status_t.c.ride_id.
# The 'distpatch_status' label spelling is kept as-is because the training
# select reads dispatch_status_t.c.distpatch_status.
dispatch_status_t = (
    select(
        ride_dispatch_t.c.ride_id.label('ride_id'),
        ride_dispatch_t.c.stat.label('dispatch_status_id'),
        ride_enum_t.c.name.label('distpatch_status'),
    )
    .select_from(
        ride_dispatch_t.join(
            ride_enum_t,
            ride_dispatch_t.c.stat == ride_enum_t.c.id,
            isouter=True,
        )
    )
    .alias()
)
#dispatch_status_t = dispatch_status_t.limit(10)
#df = pd.read_sql(dispatch_status_t, engine)
#df

In [None]:
# Starting from auction rides: the auction's fleet, the fleet the ride was
# dispatched to, and a flag telling whether those two are the same fleet.
_won_by_auction_fleet = ride_dispatch_t.c.to_fleet_id == auction_fleet_t.c.fleet_id

# All joins are LEFT OUTER, so auction rides without a fleet or dispatch row
# are kept.
_auction_joins = (
    auction_ride_t
    .join(auction_fleet_t, auction_ride_t.c.auction_id == auction_fleet_t.c.auction_id, isouter=True)
    .join(ride_dispatch_t, auction_ride_t.c.ride_id == ride_dispatch_t.c.ride_id, isouter=True)
    .join(ride_fleet_t, ride_dispatch_t.c.to_fleet_id == ride_fleet_t.c.id, isouter=True)
)

auction_type_t = (
    select(
        auction_ride_t.c.ride_id.label('ride_id'),
        auction_ride_t.c.auction_id.label('auction_id'),
        auction_fleet_t.c.fleet_id.label('auction_fleet_id'),
        ride_dispatch_t.c.to_fleet_id.label('dispatch_fleet_id'),
        ride_fleet_t.c.name.label('fleet'),
        # 'auction' when the dispatched fleet equals the auction fleet, else 'dispatch'.
        case((_won_by_auction_fleet, 'auction'), else_='dispatch').label('dispatch_type'),
    )
    .select_from(_auction_joins)
    .alias()
)
# auction_type_t = auction_type_t.limit(100)
# df = pd.read_sql(auction_type_t, engine)
# df

In [None]:
# Starting from dispatches: classify each one as 'auction' (dispatched fleet ==
# auction fleet) or 'dispatch' (anything else, including no auction at all).
#
# 'ride_id' is taken from ride_dispatch.ride_id - the same column the auction
# join below uses. ride_dispatch.id is the dispatch's own id, and this subquery
# is joined on ride_ride.id == dispatch_type_t.c.ride_id.
dispatch_type_t = (
    select(
        ride_dispatch_t.c.ride_id.label('ride_id'),
        ride_dispatch_t.c.to_fleet_id.label('dispatch_fleet_id'),
        auction_ride_t.c.auction_id.label('auction_id'),
        auction_fleet_t.c.fleet_id.label('auction_fleet_id'),
        case(
            (ride_dispatch_t.c.to_fleet_id == auction_fleet_t.c.fleet_id, 'auction'),
            else_='dispatch',
        ).label('dispatch_type'),
        ride_fleet_t.c.name.label('fleet'),
    )
    .select_from(
        # All joins are LEFT OUTER so dispatches without an auction are kept.
        ride_dispatch_t
        .join(auction_ride_t, ride_dispatch_t.c.ride_id == auction_ride_t.c.ride_id, isouter=True)
        .join(auction_fleet_t, auction_ride_t.c.auction_id == auction_fleet_t.c.auction_id, isouter=True)
        .join(ride_fleet_t, ride_dispatch_t.c.to_fleet_id == ride_fleet_t.c.id, isouter=True)
    )
    .alias()
)
# dispatch_type_t = dispatch_type_t.limit(100)
# df = pd.read_sql(dispatch_type_t, engine)
# df

In [None]:
# Ride id -> partner id, resolved through the ride's partner transaction
# (LEFT OUTER join on ride_ride.partner_tran_id == ride_partner_tran.id).
_ride_with_partner_tran = ride_ride_t.join(
    ride_partner_tran_t,
    ride_ride_t.c.partner_tran_id == ride_partner_tran_t.c.id,
    isouter=True,
)
partner_id_t = (
    select(
        ride_ride_t.c.id.label('ride_id'),
        ride_partner_tran_t.c.partner_id.label('partner_id'),
    )
    .select_from(_ride_with_partner_tran)
    .alias()
)
#partner_id_t = partner_id_t.limit(10)
#df = pd.read_sql(partner_id_t, engine)
#df

In [None]:
# Add the partner's name to each (ride_id, partner_id) pair
# (LEFT OUTER join on partner_id == ride_partner.id).
_partner_id_with_name = partner_id_t.join(
    ride_partner_t,
    partner_id_t.c.partner_id == ride_partner_t.c.id,
    isouter=True,
)
partner_t = (
    select(
        partner_id_t.c.ride_id,
        partner_id_t.c.partner_id,
        ride_partner_t.c.name.label('partner'),
    )
    .select_from(_partner_id_with_name)
    .alias()
)
# partner_t = partner_t.limit(10)
# df = pd.read_sql(partner_t,engine)
# df

In [None]:
# Pickup place per ride: id, name and coordinates from dim_place
# (LEFT OUTER join on ride_ride.from_place_id == dim_place.id).
# dim_place.lat is exposed under the label 'ltt'.
_ride_with_from_place = ride_ride_t.join(
    dim_place_t,
    ride_ride_t.c.from_place_id == dim_place_t.c.id,
    isouter=True,
)
from_place_t = (
    select(
        ride_ride_t.c.id.label('ride_id'),
        ride_ride_t.c.from_place_id.label('start_place_id'),
        dim_place_t.c.name.label('start_place'),
        dim_place_t.c.lng.label('lng'),
        dim_place_t.c.lat.label('ltt'),
    )
    .select_from(_ride_with_from_place)
    .alias()
)
#from_place_t = from_place_t.limit(10)
#df = pd.read_sql(from_place_t, engine)
#df

In [None]:
# Drop-off place per ride: id, name and coordinates from dim_place
# (LEFT OUTER join on ride_ride.to_place_id == dim_place.id).
# dim_place.lat is exposed under the label 'ltt'.
_ride_with_to_place = ride_ride_t.join(
    dim_place_t,
    ride_ride_t.c.to_place_id == dim_place_t.c.id,
    isouter=True,
)
to_place_t = (
    select(
        ride_ride_t.c.id.label('ride_id'),
        ride_ride_t.c.to_place_id.label('end_place_id'),
        dim_place_t.c.name.label('end_place'),
        dim_place_t.c.lng.label('lng'),
        dim_place_t.c.lat.label('ltt'),
    )
    .select_from(_ride_with_to_place)
    .alias()
)
#to_place_t = to_place_t.limit(10)
#df = pd.read_sql(to_place_t, engine)
#df

In [None]:
# Vehicle class id and name per ride
# (LEFT OUTER join on ride_ride.vehicle_class_id == ride_vehicle_class.id).
_ride_with_vehicle_class = ride_ride_t.join(
    ride_vehicle_class_t,
    ride_ride_t.c.vehicle_class_id == ride_vehicle_class_t.c.id,
    isouter=True,
)
vehicle_class_t = (
    select(
        ride_ride_t.c.id.label('ride_id'),
        ride_ride_t.c.vehicle_class_id.label('vehicle_class_id'),
        ride_vehicle_class_t.c.name.label('vehicle_class'),
    )
    .select_from(_ride_with_vehicle_class)
    .alias()
)
#vehicle_class_t = vehicle_class_t.limit(10)
#df = pd.read_sql(vehicle_class_t, engine)
#df

In [None]:

data_from_date = '2024-01-01'


In [None]:
# Columns of the price-training extract. The list order is the column order of
# the result set.
_training_columns = [
    # --- ride_ride ---
    ride_ride_t.c.id.label('ride_id'),
    ride_ride_t.c.trip_count,
    ride_ride_t.c.from_utc,
    ride_ride_t.c.from_time_str,
    ride_ride_t.c.from_timezone_str,
    ride_ride_t.c.to_time_str,
    ride_ride_t.c.to_timezone_str,
    ride_ride_t.c.passenger_count,
    ride_ride_t.c.luggage_count,
    ride_ride_t.c.children_count,
    ride_ride_t.c.infant_count,
    ride_ride_t.c.distance,
    ride_ride_t.c.duration,
    # --- ride_dispatch ---
    ride_dispatch_t.c.id.label('dispatch_id'),
    ride_dispatch_t.c.trip_no,
    ride_dispatch_t.c.amount.label('dispatch_amount'),
    ride_dispatch_t.c.currency.label('dispatch_currency'),
    # --- derived date/time strings ---
    ride_datetime_t.c.from_date_str,
    ride_datetime_t.c.from_time_fix_str,
    ride_datetime_t.c.from_datetime_fix_str,
    # ride_datetime_t.c.day_of_week,
    # text("EXTRACT(DAYOFWEEK FROM TIMESTAMP_SECONDS(ride_ride_t.from_utc))").label('day_of_week'),
    # --- lookups ---
    trip_type_t.c.trip_type_id,
    trip_type_t.c.trip_type,
    ride_status_t.c.ride_status_id,
    ride_status_t.c.ride_status,
    dispatch_status_t.c.dispatch_status_id,
    dispatch_status_t.c.distpatch_status,
    dispatch_type_t.c.dispatch_type,
    ride_fleet_t.c.name.label('fleet'),
    partner_t.c.partner_id,
    partner_t.c.partner,
    # --- pickup / drop-off places ---
    from_place_t.c.start_place_id,
    from_place_t.c.start_place,
    from_place_t.c.lng.label('start_lng'),
    from_place_t.c.ltt.label('start_ltt'),
    to_place_t.c.end_place_id,
    to_place_t.c.end_place,
    to_place_t.c.lng.label('end_lng'),
    to_place_t.c.ltt.label('end_ltt'),
    # --- vehicle class ---
    vehicle_class_t.c.vehicle_class_id,
    vehicle_class_t.c.vehicle_class,
]

# Active filters: rides whose from_time_str compares greater than
# data_from_date, dispatched in USD.
_from_cutoff = ride_ride_t.c.from_time_str > data_from_date
_currency_filter = or_(
    ride_dispatch_t.c.currency == 'USD',
    # ride_dispatch_t.c.currency == 'CNY',
)

# Optional filters, currently disabled:
#   vehicle_class_t.c.vehicle_class== 'MPV-5',
#   or_ (dispatch_type_t.c.dispatch_type == 'auction',
#        dispatch_type_t.c.dispatch_type == 'dispatch'),
#   F.lower(from_place_t.c.start_place).like('%orlando%'),
#   F.lower(to_place_t.c.end_place).like('%orlando%'),
#   not_(ride_datetime_t.c.day_of_week.in_([1, 7])),
#   not_(ride_ride_t.c.from_time_str.in_(exclude_dates)),
price_training_t = select(*_training_columns).where(
    and_(_from_cutoff, _currency_filter)
)

In [None]:

# Attach the FROM clause: ride_ride inner-joined to its dispatches and to the
# derived date/time subquery, then LEFT OUTER joined to every lookup subquery
# and to the fleet the dispatch went to.
#
# De-duplication uses a plain SELECT DISTINCT. Passing an expression to
# .distinct() asks for DISTINCT ON, which only PostgreSQL supports; on other
# dialects the expression is ignored (with a deprecation warning) and a future
# SQLAlchemy release will raise. Rows are therefore unique over *all* selected
# columns, not per dispatch id.
price_training_t = (
    price_training_t
    .select_from(
        ride_ride_t
        .join(ride_dispatch_t, ride_ride_t.c.id == ride_dispatch_t.c.ride_id)
        .join(ride_datetime_t, ride_ride_t.c.id == ride_datetime_t.c.ride_id)
        .join(trip_type_t, ride_ride_t.c.id == trip_type_t.c.ride_id, isouter=True)
        .join(ride_status_t, ride_ride_t.c.id == ride_status_t.c.ride_id, isouter=True)
        .join(dispatch_status_t, ride_ride_t.c.id == dispatch_status_t.c.ride_id, isouter=True)
        .join(dispatch_type_t, ride_ride_t.c.id == dispatch_type_t.c.ride_id, isouter=True)
        .join(partner_t, ride_ride_t.c.id == partner_t.c.ride_id, isouter=True)
        .join(from_place_t, ride_ride_t.c.id == from_place_t.c.ride_id, isouter=True)
        .join(to_place_t, ride_ride_t.c.id == to_place_t.c.ride_id, isouter=True)
        .join(vehicle_class_t, ride_ride_t.c.id == vehicle_class_t.c.ride_id, isouter=True)
        .join(ride_fleet_t, ride_dispatch_t.c.to_fleet_id == ride_fleet_t.c.id, isouter=True)
    )
    .distinct()
)

In [None]:
# price_training_t = price_training_t.alias()

In [None]:
print(price_training_t)

In [None]:
def read_sql_with_progress(query, engine, chunksize=1000):
    """Run ``query`` via ``pd.read_sql`` in chunks while showing a tqdm counter.

    Args:
        query: anything ``pd.read_sql`` accepts as its SQL argument.
        engine: connectable passed straight through to ``pd.read_sql``.
        chunksize: number of rows fetched per chunk; the progress bar advances
            by the size of each chunk.

    Returns:
        One DataFrame holding all rows with a fresh 0..n-1 index, or an empty
        DataFrame when the reader yields no chunks.
    """
    # Collect the chunks and concatenate once at the end: concatenating inside
    # the loop re-copies every row already read on each iteration.
    chunks = []
    with tqdm(total=None, desc="Reading SQL") as pbar:
        for chunk in pd.read_sql(query, engine, chunksize=chunksize):
            chunks.append(chunk)
            pbar.update(len(chunk))
    if not chunks:
        # pd.concat raises on an empty list.
        return pd.DataFrame()
    return pd.concat(chunks, ignore_index=True)


In [None]:
# chunk = pd.read_sql(price_training_t,engine, chunksize=100)
# df = pd.concat(chunk, ignore_index=True)
# df

In [None]:
df_raw_rides = read_sql_with_progress(price_training_t, engine, chunksize=1000)

In [None]:
import pandas_gbq

In [None]:
# pandas_gbq.read_gbq takes the query as a SQL string, not a SQLAlchemy Select.
# Render the statement for the engine's dialect with the bound values inlined
# so the resulting text is a complete, self-contained query.
price_training_sql = str(
    price_training_t.compile(engine, compile_kwargs={"literal_binds": True})
)
df_raw_rides = pandas_gbq.read_gbq(price_training_sql, credentials=credential, progress_bar_type='tqdm_notebook')

# Use ORM to retrieve records


In [None]:
# Count the rows of the training extract. price_training_t is a Select, so it
# is turned into an explicit subquery before its columns are referenced;
# reaching for Select.c directly relies on a deprecated implicit subquery.
_price_training_sq = price_training_t.subquery()
sample_count = session.query(F.count(_price_training_sq.c.ride_id)).scalar()
print(sample_count)


In [None]:
url = 'https://j1j495o5pk.execute-api.us-east-2.amazonaws.com/upncoming/ride-pricings'

In [None]:

# params = {
#     'from_lat': 37.61911449999999,
#     'from_lng':-122.3816274,
#     'to_lat':37.3635295,
#     'to_lng':-121.9285932,
#     'from_utc':1727352000,
# }
# response = requests.get(url=url, params=params)
# response.json()


In [None]:
# res = response.json()
# fix_price_zones = res['fleets'][0]['vehicle_classes'][0]['price_detail']['base_pricing']['fix_price_detail']
# print(fix_price_zones)

In [None]:
# ride_samples = []

In [None]:
# pt = price_training_q.first()
# pt

In [None]:
# params = {
#     'from_lat': (pt._mapping['start_ltt']),
#     'from_lng': (pt._mapping['start_lng']),
#     'to_lat': (pt._mapping['end_ltt']),
#     'to_lng': (pt._mapping['end_lng']),
#     # 'from_utc': int(pt._mapping['from_utc']),
# }
# params

# response = requests.get(url=url, params=params)
# res = response.json()
# res

In [None]:
from sqlalchemy import String,Integer,insert

In [None]:


sqlite_metadata = MetaData()
# fixed_zone_routes = Table('fixed_zone_routes', sqlite_metadata,
#                           Column('start', String),
#                           Column('end', String),
#                           Column('dispatch_id', Integer),
#                           )
# SQLite file that receives the scraped fixed-price zone routes.
sqlite_eng = create_engine('sqlite:///../data/dispatch_fix_zones.db', echo=False)
# NOTE(review): this rebinds `connection`, which an earlier cell assigned from
# the BigQuery engine; neither connection is closed in the visible cells.
connection = sqlite_eng.connect()
# sqlite_metadata has no tables registered in the visible code (the Table
# definition above is commented out), so this presumably creates nothing.
sqlite_metadata.create_all(sqlite_eng)  


In [None]:

# First 500 rows of the training extract. A Select must be wrapped in an
# explicit subquery before it can be used as a Query entity.
price_training_q = session.query(price_training_t.subquery()).limit(500)
# Buffer of (from_zone, to_zone, dispatch_id) tuples awaiting a write to SQLite.
fix_zone_routes_list = []


In [None]:
# For every sampled ride, ask the pricing API for a quote between the ride's
# start and end coordinates. When the first fleet / first vehicle class of the
# response carries a fixed-price detail, store (from, to, dispatch_id) in the
# SQLite table 'fixed_zone_routes', written in batches of 50.

# Seconds to wait on the pricing API. Without a timeout requests.get can block
# indefinitely and the Timeout handler below can never fire.
REQUEST_TIMEOUT_S = 30

i = 0  # number of routes collected so far
j = 0  # number of responses that lacked the fixed-price path
for pt in price_training_q:
    params = {
        'from_lat': pt._mapping['start_ltt'],
        'from_lng': pt._mapping['start_lng'],
        'to_lat': pt._mapping['end_ltt'],
        'to_lng': pt._mapping['end_lng'],
        # 'from_utc':pt._mapping['from_utc'],
    }

    # --- 1. call the API; recoverable failures skip this ride ---
    try:
        response = requests.get(url=url, params=params, timeout=REQUEST_TIMEOUT_S)
    except requests.exceptions.Timeout:
        print('Timeout')
        continue
    except requests.exceptions.TooManyRedirects:
        print('TooManyRedirects')
        continue
    except requests.exceptions.RequestException as e:
        # Any other requests failure is treated as fatal for the whole run.
        print('RequestException, Catastrophic error!')
        raise SystemExit(e)
    except Exception as e:
        print(f"request: {e}")
        continue

    # --- 2. decode the body ---
    try:
        res = response.json()
    except Exception as e:
        print(f"json: {e}")
        continue

    # --- 3. dig out the fixed-price detail of the first fleet / vehicle class ---
    try:
        fix_price_zones = res['fleets'][0]['vehicle_classes'][0]['price_detail']['base_pricing']['fix_price_detail']
    except KeyError as e:
        j = j +1
        print(f"{j} Not Fixed Price!")
        continue
    except IndexError as e:
        print("IndexError for fix_price_zones")
        continue
    except Exception as e:
        print(f"dict: {e}")
        continue

    # --- 4. record the route and flush every 50 collected routes ---
    if not isinstance(fix_price_zones,dict):
        print(f"No fix price: {fix_price_zones}")
    else:
        try:
            route = (fix_price_zones['from'], fix_price_zones['to'],pt._mapping['dispatch_id'])
            fix_zone_routes_list.append(route)
            i = i +1
        except KeyError as e:
            print("KeyError for route")
            continue

        if i%50 == 0:
            df = pd.DataFrame(fix_zone_routes_list, columns=['start', 'end', 'dispatch_id'])
            df.to_sql('fixed_zone_routes', sqlite_eng, if_exists='append')
            fix_zone_routes_list = []
            print(f"Created {i} records")

# Write whatever is left in the buffer after the last full batch.
df = pd.DataFrame(fix_zone_routes_list, columns=['start', 'end', 'dispatch_id'])
df.to_sql('fixed_zone_routes', sqlite_eng, if_exists='append')
fix_zone_routes_list = []
print(f"Created {i} records")
    # print('6')
    # print(pt._mapping['ride_id'], pt._mapping['dispatch_amount'], pt._mapping['dispatch_currency'])
    # # print(pt._mapping['start_place'])
    # print(pt._mapping['start_lng'], pt._mapping['start_ltt'])
    # # print(pt._mapping['end_place']) 
    # print(pt._mapping['end_lng'], pt._mapping['end_ltt'])
    # print("------------")

In [None]:
fix_zone_routes_list

In [None]:
# Fetch 10 sample rows and print id, amount/currency and both coordinate pairs.
# A Select must be wrapped in an explicit subquery before it can be used as a
# Query entity.
price_training_q = session.query(price_training_t.subquery()).limit(10)
ride_samples = []
for pt in price_training_q:
    ride_samples.append(pt)
    row = pt._mapping
    print(row['ride_id'], row['dispatch_amount'], row['dispatch_currency'])
    # print(pt._mapping['start_place'])
    print(row['start_lng'], row['start_ltt'])
    # print(pt._mapping['end_place']) 
    print(row['end_lng'], row['end_ltt'])
    print("------------")



In [None]:
ride_samples[0]._mapping

In [None]:
df = pd.DataFrame(ride_samples)
df

In [None]:

# A Select must be wrapped in an explicit subquery before it can be used as a
# Query entity.
rides_q = session.query(price_training_t.subquery()).limit(10)

In [None]:

for ride in rides_q:
    print(ride)

In [None]:
rides_q = session.query(ride_ride_t).limit(10)
for ride in rides_q:
    print(ride)

In [None]:
# print(price_training_t)

In [None]:
# use core to retrieve records
# rp = connection.execute(price_training_t)
# for i, record in enumerate(rp):
#     print(i, record.ride_id)

In [None]:
# results = rp.fetchall()
# results

In [None]:
# df = pd.read_sql(price_training_t, engine)
# df

In [None]:
sqlite_eng = create_engine('sqlite:///../data/price_training_from_gbq_raw.db', echo=False)

In [None]:
df.to_sql('price_training_orlando_mpv5', sqlite_eng, if_exists='replace')

# Pandas Processing


## Traffic peak time exclusion
Source: https://www.quora.com/What-is-the-trickiest-time-of-the-day-to-drive-in-Orlando

In [None]:
from workalendar.usa import Florida

In [None]:
cal_orlando = Florida()


In [None]:
exclude_dates = [d[0] for d in cal_orlando.holidays(2024)]
exclude_dates

In [None]:
from workalendar.usa import Florida 
import numpy as np
cal_florida = Florida()
exclude_dates_str = [str(d[0]) for d in cal_florida.holidays(2024)]
exclude_dates_str

In [None]:
exclude_dates = [d[0] for d in cal_florida.holidays(2024)]
# exclude_dates
# res = df['from_datetime_utc'].apply(lambda x: x in exclude_dates)

In [None]:
# np.any(res.apply(lambda x: x in exclude_dates))
# res[0]=True
# res
# np.any(res)


In [None]:
from pricing.data.utils import validate_datetime_in_iso_format, validate_timezone_in_iana, get_timezone_abbreviation, fix_timezone

In [None]:
df_invalid_datetime = df[df.apply(lambda x: not validate_datetime_in_iso_format(x['from_datetime_fix_str']), axis=1)]
df_invalid_datetime

In [None]:
df_invalid_timezone = df[df.apply(lambda x: not validate_timezone_in_iana(x['from_timezone_str']), axis=1)]
df_invalid_timezone

In [None]:
#orlando_airport = pd.read_csv('../../data/orlando_all_output.csv')
#orlando_airport.head()
#orlando_airport.dtypes
#orlando_airport.to_sql('orlando_airport', sqlite_eng, if_exists='append')

In [None]:
df['from_timezone_fix_str'] = df.apply(lambda x: fix_timezone(x['from_timezone_str']), axis=1) 
df

In [None]:

# Keep only rows whose repaired datetime string validates, then of those only
# the rows whose repaired timezone validates.
_datetime_ok = df.apply(lambda x: validate_datetime_in_iso_format(x['from_datetime_fix_str']), axis=1)
df_valid_datetime = df[_datetime_ok]
_timezone_ok = df_valid_datetime.apply(lambda x: validate_timezone_in_iana(x['from_timezone_fix_str']), axis=1)
# .copy() detaches the result from df: later cells add columns to
# df_valid_timezone, and assigning into a filtered slice triggers pandas'
# SettingWithCopyWarning.
df_valid_timezone = df_valid_datetime[_timezone_ok].copy()
df_valid_timezone


In [None]:

# Show original vs. repaired timezone for the rows whose original timezone
# failed validation. Only labels still present in df_valid_timezone are looked
# up: .loc raises KeyError when any requested label is missing, which happens
# for rows dropped by the datetime/timezone filters.
_repaired_index = df_invalid_timezone.index.intersection(df_valid_timezone.index)
df_valid_timezone.loc[_repaired_index, ['from_timezone_str', 'from_timezone_fix_str']]


In [None]:
from datetime import datetime
import pytz
# Per-row UTC offset string (strftime '%z') of the row's timezone, evaluated at
# the moment this cell runs.
# NOTE(review): datetime.now(...) gives the zone's *current* offset rather than
# the offset at the ride's own date - presumably differs across DST changes;
# confirm this is intended.
df_utc_offset = (df_valid_timezone.apply(lambda x: datetime.now(pytz.timezone(x['from_timezone_fix_str'])), axis=1)
                            .apply(lambda x: x.strftime('%z')))
df_utc_offset.name = 'utc_offset'
df_utc_offset


In [None]:
df_dt_str = df_valid_timezone['from_datetime_fix_str']
df_valid_timezone['from_datetime_local'] = df_valid_timezone.apply(lambda x: (pd.to_datetime(x['from_datetime_fix_str']).to_datetime64()), axis=1)
df_valid_timezone

In [None]:
df_valid_timezone.dtypes

In [None]:

# df_valid_timezone['from_datetime_local_tz'] = df_valid_timezone.apply(lambda x: pytz.timezone(x.loc[:,'from_timezone_str']).localize(x.loc[:,'from_datetime_local']), axis=1)
df_valid_timezone['from_timezone'] = df_valid_timezone.apply(lambda x: pytz.timezone(x['from_timezone_fix_str']), axis=1)
df_valid_timezone


In [None]:
df_valid_timezone['from_datetime_tz'] = df_valid_timezone.apply(lambda x: x['from_timezone'].localize(x['from_datetime_local']), axis=1)
df_valid_timezone
                                    # .apply(lambda x: x.localize(x.loc[:,'from_timezone_str']), axis=1))
# df_valid_timezone


# filtering out peak traffic time

In [None]:
# Time-of-day windows as zero-padded 'HH:MM:SS' strings, so they order
# correctly when compared as text against strftime('%H:%M:%S') output.
peak_time_str = [('07:00:00', '09:00:00'), ('16:00:00', '19:00:00')]
# This window wraps past midnight (start > end), so a plain start <= t <= end
# test cannot match it.
night_time_str = [('22:00:00', '06:00:00')]  # Shanghai, US usually no overtime extra fees New York 8pm ~ 6am

# Each peak window parsed into a two-element DatetimeIndex (start, end).
ind = [pd.DatetimeIndex(pt) for pt in peak_time_str]
ind

# Length of each peak window, kept in td instead of being computed and dropped.
td = [window[1] - window[0] for window in ind]

In [None]:
# One hourly DatetimeIndex per peak window, printed together with its
# time-of-day values.
peak_time = [
    pd.date_range(window_start, window_end, freq='h')
    for window_start, window_end in peak_time_str
]
for hourly_index in peak_time:
    print(hourly_index, hourly_index.time)

In [None]:
peak_time_str

In [None]:

# Rides whose local pickup time of day ('HH:MM:SS') lies in either peak window.
_pickup_clock = df_valid_timezone['from_datetime_tz'].apply(lambda x: x.strftime('%H:%M:%S'))
_in_first_peak = _pickup_clock.between(*peak_time_str[0])
_in_second_peak = _pickup_clock.between(*peak_time_str[1])
df_peak_traffic_time = df_valid_timezone[_in_first_peak | _in_second_peak]
df_peak_traffic_time

In [None]:

# Rides whose local pickup time of day ('HH:MM:SS') lies in neither peak window.
_pickup_clock = df_valid_timezone['from_datetime_tz'].apply(lambda x: x.strftime('%H:%M:%S'))
_in_any_peak = (
    _pickup_clock.between(*peak_time_str[0])
    | _pickup_clock.between(*peak_time_str[1])
)
df_out_of_peak_traffic_time = df_valid_timezone[~_in_any_peak]
df_out_of_peak_traffic_time


In [None]:

# df_utc_offset = (df_valid_timezone.apply(lambda x: datetime.now(pytz.timezone(x.loc['timezone'])))
#                             .apply(lambda x: x.strftime('%z')))
# df_utc_offset.name = 'utc_offset'
# df_utc_offset
# 

# Filter out round trips (time reservations), which have the feature distance = 1 (m)

In [None]:
df_no_round_trip = df_valid_timezone[df_valid_timezone['distance'] > 1]

In [None]:
# Columns kept for the training frame.
_training_feature_columns = [
    'ride_id', 'trip_type', 'trip_no', 'trip_count', 'ride_status', 'partner', 'fleet',
    'start_place', 'end_place',
    'passenger_count', 'luggage_count',
    'dispatch_amount', 'dispatch_currency',
    'distance', 'duration', 'vehicle_class',
    'from_datetime_tz',
]
df_training = df_no_round_trip.loc[:, _training_feature_columns]

# Price per unit of distance, scaled by 100.
# NOTE(review): the column is named per-km, while the heading above mentions
# distance in "(m)" - confirm the unit of 'distance' and of 'dispatch_amount'.
_amount_per_distance = df_training['dispatch_amount'] / df_training['distance']
df_training['cent_price_per_km'] = _amount_per_distance * 100.0
df_training

In [None]:
def average_cent_per_km(x):
    """Return ``x`` with an ``average_cent_per_km`` column added.

    The new column holds the mean of ``x['cent_price_per_km']`` on every row.
    A new frame is returned and ``x`` itself is left untouched, so the function
    is safe to use with ``groupby(...).apply``, where mutating the group that
    is passed in is not supported by pandas.
    """
    return x.assign(average_cent_per_km=x['cent_price_per_km'].mean())

In [None]:
def fleet_trip_no(x):
    """Return ``x`` with a ``fleet_trip_count`` column added.

    The new column holds ``len(x)`` (the number of rows) on every row. A new
    frame is returned and ``x`` itself is left untouched, so the function is
    safe to use with ``groupby(...).apply``, where mutating the group that is
    passed in is not supported by pandas.
    """
    return x.assign(fleet_trip_count=len(x))

In [None]:
df_fleet_statistics = df_training.loc[:, ['ride_id','fleet']]
df_fleet_statistics

In [None]:

# Number of rows per fleet (aggregating with len produces the
# ('ride_id', 'len') column), largest fleets first.
df_fleet_trip_no = (
    df_fleet_statistics
    .groupby('fleet')
    .aggregate([len])
    .sort_values(by=('ride_id', 'len'), ascending=False)
)
df_fleet_trip_no


In [None]:
df_big_fleets = df_fleet_trip_no[df_fleet_trip_no[('ride_id','len')] >100]
df_big_fleets

In [None]:
# One training sub-frame per big fleet, in the order of df_big_fleets' index.
df_big_fleet_data = []
for fleet_name in df_big_fleets.index:
    print(fleet_name)
    rows_of_fleet = df_training[df_training['fleet'] == fleet_name]
    df_big_fleet_data.append(rows_of_fleet)

In [None]:
df_big_fleet_data[0]

In [None]:
df_big_fleet_data[1]

In [None]:
df_big_fleet_data[2]

In [None]:

# df_training['average_cent_per_km'] = df_training.groupby('fleet').apply(average_cent_per_km)
df_analysis = df_training.groupby('fleet').apply(average_cent_per_km)
df_analysis


In [None]:

df_training

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()