# data database scraper
> database scraper

In [1]:
#| default_exp data.database.scraper

In [2]:
#| hide
from nbdev.showdoc import *
from pyasn1_modules.rfc3279 import id_fieldType

In [3]:
#| hide
import nbdev; nbdev.nbdev_export()

In [4]:
#| hide
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [5]:
#| export
import os
import pandas as pd 

In [6]:
#| export
from sqlalchemy import MetaData, create_engine, asc, desc, and_, or_, not_, case, extract, cast, text, distinct
from sqlalchemy.types import DateTime, Date, Time
from sqlalchemy.schema import *
from sqlalchemy.sql import func as F, Selectable, select
from sqlalchemy.dialects import registry
from sqlalchemy.engine.row import Row
from sqlalchemy.orm import Session
from sqlalchemy.orm import sessionmaker


In [7]:
#| export
import requests
from tqdm import tqdm

In [8]:
#| export
from google.oauth2 import service_account
from google.oauth2.service_account import Credentials
from geopy.timezone import from_timezone_name

In [9]:
#| hide
from google.cloud import bigquery
from dotenv import load_dotenv


In [10]:

load_dotenv("../.env")

True

In [11]:
os.environ

environ{'GSM_SKIP_SSH_AGENT_WORKAROUND': 'true',
        'LC_ALL': 'en_US.UTF-8',
        'WINDOWPATH': '2',
        'PATH': '/home/x/.pyenv/versions/miniforge3-latest/envs/prc/bin:/home/x/.pyenv/versions/miniforge3-latest/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/snap/bin',
        'CONDA_PREFIX': '/home/x/.pyenv/versions/miniforge3-latest/envs/prc',
        'LIBVIRT_DEFAULT_URI': 'qemu:///system',
        'LOGNAME': 'x',
        'XDG_CONFIG_DIRS': '/etc/xdg/xdg-cinnamon:/etc/xdg',
        'XAUTHORITY': '/run/user/1000/gdm/Xauthority',
        'XDG_SESSION_ID': '2',
        'CONDA_SHLVL': '1',
        'XMODIFIERS': '@im=ibus',
        'GIO_LAUNCHED_DESKTOP_FILE_PID': '6997',
        'GIO_LAUNCHED_DESKTOP_FILE': '/home/x/Desktop/pycharm.desktop',
        'QT_ACCESSIBILITY': '1',
        'CONDA_DEFAULT_ENV': 'prc',
        'XDG_SEAT': 'seat0',
        'XDG_VTNR': '2',
        'QTWEBENGINE_DICTIONARIES_PATH': '/usr/share/

In [12]:
key = os.getenv('GC_QUOTE_API_CREDENTIALS')

In [13]:
credential = service_account.Credentials.from_service_account_file("../" + key)


In [14]:
registry.register('bigquery', 'sqlalchemy_bigquery', 'BigQueryDialect')
engine = create_engine('bigquery://quote-api-365206',
                       credentials_path='../' + key,)


In [15]:
Session = sessionmaker(bind=engine)

In [16]:
session = Session()

In [17]:
metadata = MetaData()
def get_table(project_name: str, dataset_name: str, table_name: str)-> Table:
    table = Table(f'{project_name}.{dataset_name}.{table_name}', metadata, autoload_with=engine)
    return table

In [18]:
connection = engine.connect()

In [19]:
connection

<sqlalchemy.engine.base.Connection at 0x737bd2550b30>

# find all tables in the database

In [20]:
ride_trip_t = get_table('elife-data-warehouse-prod','ods', 'ride_trip').alias()
ride_dispatch_t = get_table('elife-data-warehouse-prod','ods', 'ride_dispatch').alias()
ride_enum_t = get_table('elife-data-warehouse-prod','ods', 'ride_enum').alias()
ride_ride_t = get_table('elife-data-warehouse-prod','ods', 'ride_ride').alias()
ride_partner_tran_t = get_table('elife-data-warehouse-prod','ods', 'ride_partner_tran').alias()
ride_partner_t = get_table('elife-data-warehouse-prod','ods', 'ride_partner').alias()
dim_place_t = get_table('elife-data-warehouse-prod','dim', 'dim_place').alias()
ride_vehicle_class_t = get_table('elife-data-warehouse-prod','ods', 'ride_vehicle_class').alias()

In [21]:
auction_ride_t = get_table('elife-data-warehouse-prod','ods', 'ride_auction_ride').alias()
auction_fleet_t = get_table('elife-data-warehouse-prod','ods', 'ride_auction_fleet').alias()
ride_fleet_t = get_table('elife-data-warehouse-prod','ods', 'ride_fleet').alias()

In [22]:
trip_type_id_t = select(ride_dispatch_t.c.ride_id, ride_trip_t.c.trip_type.label('trip_type_id'))
trip_type_id_t = trip_type_id_t.select_from(ride_dispatch_t
                                      .join(ride_trip_t, ride_dispatch_t.c.ride_id == ride_trip_t.c.ride_id))
trip_type_id_t = trip_type_id_t.alias()
                  
# trip_type_id_t = trip_type_id_t.limit(10)
# df = pd.read_sql(trip_type_id_t, engine
# df

In [23]:

# trip_type_id_q = session.query(trip_type_id_t).limit(10)
# # for trip in trip_type_id_q:
#     print(trip)



In [24]:
trip_type_t = select(trip_type_id_t.c.ride_id, trip_type_id_t.c.trip_type_id, ride_enum_t.c.name.label('trip_type'))
trip_type_t = trip_type_t.select_from(trip_type_id_t
                                      .join(ride_enum_t, trip_type_id_t.c.trip_type_id == ride_enum_t.c.id, isouter=True))
trip_type_t = trip_type_t.alias()


In [25]:
ride_status_t = select(ride_ride_t.c.id.label('ride_id'), ride_ride_t.c.stat.label('ride_status_id'), ride_enum_t.c.name.label('ride_status'))
ride_status_t = ride_status_t.select_from(ride_ride_t
                                          .join(ride_enum_t, ride_ride_t.c.stat == ride_enum_t.c.id, isouter=True))
ride_status_t = ride_status_t.alias()
# ride_status_t = ride_status_t.limit(10)
# df = pd.read_sql(ride_status_t, engine)
# df

In [26]:
ride_datetime_t = select(ride_ride_t.c.id.label('ride_id'), 
                         F.substring(ride_ride_t.c.from_time_str, 1, 10).label('from_date_str'),
                         F.concat(F.substring(ride_ride_t.c.from_time_str, 12, 16),':00').label('from_time_fix_str'),
                         F.concat(F.substring(ride_ride_t.c.from_time_str, 1, 10), 
                                  ' ', 
                                  F.substring(ride_ride_t.c.from_time_str, 12, 16),
                                  ':00'
                                  ).label('from_datetime_fix_str'),
                         extract(
                             'DAYOFWEEK',
                             cast(F.concat(F.substring(ride_ride_t.c.from_time_str, 1, 10),
                                       ' ',
                                       F.substring(ride_ride_t.c.from_time_str, 12, 16),
                                       ':00'
                                       ),
                              DateTime)
                         ).label('day_of_week_local'),
                         extract(
                             'DAYOFWEEK',
                             cast(F.timestamp_seconds(ride_ride_t.c.from_utc), 
                                  DateTime)
                         ).label('day_of_week_utc'),
                         F.datetime(F.timestamp_seconds(ride_ride_t.c.from_utc)).label('from_datetime_utc'),
                         ride_ride_t.c.from_timezone_str,
                         # F.current_date(ride_ride_t.c.from_timezone_str).label('current_date'),
                         #cast(ride_ride_t.c.from_time_str,DateTime).label('from_datetime'),
                         #cast(F.concat(F.substring(ride_ride_t.c.from_time_str, 1, 10), 'T', F.substring(ride_ride_t.c.from_time_str, 12, 16)),DateTime).label('from_datetime'),
                         #cast(ride_ride_t.c.from_time_str, DateTime).label('from_time_dt'),
                         # F.timezone(ride_ride_t.c.from_timezone_str, cast(ride_ride_t.c.from_time_str, DateTime).label('from_time_dt')),
                         )                  
ride_datetime_t = ride_datetime_t.select_from(ride_ride_t)
ride_datetime_t = ride_datetime_t.alias()
# ride_datetime_t = ride_datetime_t.limit(100)
# df = pd.read_sql(ride_datetime_t, engine)
# df

In [27]:
dispatch_status_t = select(ride_dispatch_t.c.id.label('ride_id'), ride_dispatch_t.c.stat.label('dispatch_status_id'), ride_enum_t.c.name.label('distpatch_status'))
dispatch_status_t = dispatch_status_t.select_from(ride_dispatch_t
                                          .join(ride_enum_t, ride_dispatch_t.c.stat == ride_enum_t.c.id, isouter=True))
dispatch_status_t = dispatch_status_t.alias()
#dispatch_status_t = dispatch_status_t.limit(10)
#df = pd.read_sql(dispatch_status_t, engine)
#df

In [28]:
auction_type_t = select(
    auction_ride_t.c.ride_id.label('ride_id'),
    auction_ride_t.c.auction_id.label('auction_id'),
    auction_fleet_t.c.fleet_id.label('auction_fleet_id'),
    ride_dispatch_t.c.to_fleet_id.label('dispatch_fleet_id'),
    ride_fleet_t.c.name.label('fleet'),
    case(
        (ride_dispatch_t.c.to_fleet_id == auction_fleet_t.c.fleet_id, 'auction'),
        else_ = 'dispatch',
    ).label('dispatch_type')
)
auction_type_t = auction_type_t.select_from(auction_ride_t
                                            .join(auction_fleet_t, auction_ride_t.c.auction_id == auction_fleet_t.c.auction_id, isouter=True)
                                            .join(ride_dispatch_t, auction_ride_t.c.ride_id == ride_dispatch_t.c.ride_id, isouter=True)
                                            .join(ride_fleet_t, ride_dispatch_t.c.to_fleet_id == ride_fleet_t.c.id, isouter=True))
auction_type_t = auction_type_t.alias()
# auction_type_t = auction_type_t.limit(100)
# df = pd.read_sql(auction_type_t, engine)
# df

In [29]:
dispatch_type_t = (select(
    ride_dispatch_t.c.id.label('ride_id'),
    ride_dispatch_t.c.to_fleet_id.label('dispatch_fleet_id'),
    auction_ride_t.c.auction_id.label('auction_id'),
    auction_fleet_t.c.fleet_id.label('auction_fleet_id'),
    case(
        (ride_dispatch_t.c.to_fleet_id == auction_fleet_t.c.fleet_id, 'auction'), 
        else_ = 'dispatch',
    ).label('dispatch_type'),
    ride_fleet_t.c.name.label('fleet')
))
# .where(
#     ride_dispatch_t.c.to_fleet_id == auction_fleet_t.c.fleet_id
# ))
dispatch_type_t = dispatch_type_t.select_from(ride_dispatch_t
                                          .join(auction_ride_t, ride_dispatch_t.c.ride_id == auction_ride_t.c.ride_id, isouter=True)
                                          .join(auction_fleet_t, auction_ride_t.c.auction_id == auction_fleet_t.c.auction_id, isouter=True)
                                          .join(ride_fleet_t, ride_dispatch_t.c.to_fleet_id == ride_fleet_t.c.id, isouter=True))
dispatch_type_t = dispatch_type_t.alias()
# dispatch_type_t = dispatch_type_t.limit(100)
# df = pd.read_sql(dispatch_type_t, engine)
# df

In [30]:
partner_id_t = select(ride_ride_t.c.id.label('ride_id'), ride_partner_tran_t.c.partner_id.label('partner_id'))
partner_id_t = partner_id_t.select_from(ride_ride_t
                                        .join(ride_partner_tran_t, ride_ride_t.c.partner_tran_id == ride_partner_tran_t.c.id, isouter=True))
partner_id_t = partner_id_t.alias()
#partner_id_t = partner_id_t.limit(10)
#df = pd.read_sql(partner_id_t, engine)
#df

In [31]:
partner_t = select(partner_id_t.c.ride_id, partner_id_t.c.partner_id, ride_partner_t.c.name.label('partner'))
partner_t = partner_t.select_from(partner_id_t
                                  .join(ride_partner_t, partner_id_t.c.partner_id == ride_partner_t.c.id, isouter=True))
partner_t = partner_t.alias()
# partner_t = partner_t.limit(10)
# df = pd.read_sql(partner_t,engine)
# df

In [32]:
from_place_t = select(ride_ride_t.c.id.label('ride_id'), 
                      ride_ride_t.c.from_place_id.label('start_place_id'), 
                      dim_place_t.c.name.label('start_place'), 
                      dim_place_t.c.lng.label('lng'), 
                      dim_place_t.c.lat.label('ltt'),
                      )
from_place_t = from_place_t.select_from(ride_ride_t
                                        .join(dim_place_t, ride_ride_t.c.from_place_id == dim_place_t.c.id, isouter=True))
from_place_t = from_place_t.alias()
#from_place_t = from_place_t.limit(10)
#df = pd.read_sql(from_place_t, engine)
#df

In [33]:
to_place_t = select(ride_ride_t.c.id.label('ride_id'), 
                      ride_ride_t.c.to_place_id.label('end_place_id'), 
                      dim_place_t.c.name.label('end_place'), 
                      dim_place_t.c.lng.label('lng'), 
                      dim_place_t.c.lat.label('ltt'))
to_place_t = to_place_t.select_from(ride_ride_t
                                        .join(dim_place_t, ride_ride_t.c.to_place_id == dim_place_t.c.id, isouter=True))
to_place_t = to_place_t.alias()
#to_place_t = to_place_t.limit(10)
#df = pd.read_sql(to_place_t, engine)
#df

In [34]:
vehicle_class_t = select(ride_ride_t.c.id.label('ride_id'),
                         ride_ride_t.c.vehicle_class_id.label('vehicle_class_id'), 
                         ride_vehicle_class_t.c.name.label('vehicle_class'))
vehicle_class_t = vehicle_class_t.select_from(ride_ride_t
                                              .join(ride_vehicle_class_t, ride_ride_t.c.vehicle_class_id == ride_vehicle_class_t.c.id, isouter=True))
vehicle_class_t = vehicle_class_t.alias()
#vehicle_class_t = vehicle_class_t.limit(10)
#df = pd.read_sql(vehicle_class_t, engine)
#df

In [35]:

data_from_date = '2024-01-01'


In [46]:
price_training_t = select(ride_ride_t.c.id.label('ride_id'),
                          ride_ride_t.c.trip_count,
                          ride_ride_t.c.from_utc,
                          ride_ride_t.c.from_time_str,
                          ride_ride_t.c.from_timezone_str,
                          ride_ride_t.c.to_time_str,
                          ride_ride_t.c.to_timezone_str,
                          ride_ride_t.c.passenger_count,
                          ride_ride_t.c.luggage_count,
                          ride_ride_t.c.children_count,
                          ride_ride_t.c.infant_count,
                          ride_ride_t.c.distance,
                          ride_ride_t.c.duration,
                          ride_dispatch_t.c.id.label('dispatch_id'),
                          ride_dispatch_t.c.trip_no,
                          ride_dispatch_t.c.amount.label('dispatch_amount'),
                          ride_dispatch_t.c.currency.label('dispatch_currency'),
                          ride_datetime_t.c.from_date_str,
                          ride_datetime_t.c.from_time_fix_str,
                          ride_datetime_t.c.from_datetime_fix_str,
                          # ride_datetime_t.c.day_of_week,
                          # text("EXTRACT(DAYOFWEEK FROM TIMESTAMP_SECONDS(ride_ride_t.from_utc))").label('day_of_week'),
                          trip_type_t.c.trip_type_id,
                          trip_type_t.c.trip_type,
                          ride_status_t.c.ride_status_id,
                          ride_status_t.c.ride_status,
                          dispatch_status_t.c.dispatch_status_id,
                          dispatch_status_t.c.distpatch_status,
                          dispatch_type_t.c.dispatch_type,
                          ride_fleet_t.c.name.label('fleet'),
                          partner_t.c.partner_id,
                          partner_t.c.partner,
                          from_place_t.c.start_place_id,
                          from_place_t.c.start_place,
                          from_place_t.c.lng.label('start_lng'),
                          from_place_t.c.ltt.label('start_ltt'),
                          to_place_t.c.end_place_id,
                          to_place_t.c.end_place,
                          to_place_t.c.lng.label('end_lng'),
                          to_place_t.c.ltt.label('end_ltt'),
                          vehicle_class_t.c.vehicle_class_id,
                          vehicle_class_t.c.vehicle_class,
                          ).where(
                              and_(
                                  ride_ride_t.c.from_time_str > data_from_date,
                                  or_(
                                    ride_dispatch_t.c.currency == 'USD',
                                    # ride_dispatch_t.c.currency == 'CNY',
                                  ),
                                  # vehicle_class_t.c.vehicle_class== 'MPV-5',
                                  # or_ (dispatch_type_t.c.dispatch_type == 'auction',
                                  #      dispatch_type_t.c.dispatch_type == 'dispatch'),
                                  # F.lower(from_place_t.c.start_place).like('%orlando%'),
                                  # F.lower(to_place_t.c.end_place).like('%orlando%'),
                                  
                                  # not_(ride_datetime_t.c.day_of_week.in_([1, 7])),
                                  # not_(ride_ride_t.c.from_time_str.in_(exclude_dates)),
                              )
                          )

In [47]:

price_training_t = price_training_t.select_from(ride_ride_t
                                                .join(ride_dispatch_t, ride_ride_t.c.id == ride_dispatch_t.c.ride_id)
                                                .join(ride_datetime_t, ride_ride_t.c.id == ride_datetime_t.c.ride_id)
                                                .join(trip_type_t, ride_ride_t.c.id == trip_type_t.c.ride_id, isouter=True)
                                                .join(ride_status_t, ride_ride_t.c.id == ride_status_t.c.ride_id, isouter=True)
                                                .join(dispatch_status_t, ride_ride_t.c.id == dispatch_status_t.c.ride_id, isouter=True)
                                                .join(dispatch_type_t, ride_ride_t.c.id == dispatch_type_t.c.ride_id, isouter=True)
                                                .join(partner_t, ride_ride_t.c.id == partner_t.c.ride_id, isouter=True)
                                                .join(from_place_t, ride_ride_t.c.id == from_place_t.c.ride_id, isouter=True)
                                                .join(to_place_t, ride_ride_t.c.id == to_place_t.c.ride_id, isouter=True)
                                                .join(vehicle_class_t, ride_ride_t.c.id == vehicle_class_t.c.ride_id, isouter=True)
                                                .join(ride_fleet_t, ride_dispatch_t.c.to_fleet_id == ride_fleet_t.c.id, isouter=True)
                                                ).distinct(ride_dispatch_t.c.id)

In [38]:
# price_training_t = price_training_t.alias()

In [48]:
print(price_training_t)

SELECT DISTINCT "elife-data-warehouse-prod.ods.ride_ride_1".id AS ride_id, "elife-data-warehouse-prod.ods.ride_ride_1".trip_count, "elife-data-warehouse-prod.ods.ride_ride_1".from_utc, "elife-data-warehouse-prod.ods.ride_ride_1".from_time_str, "elife-data-warehouse-prod.ods.ride_ride_1".from_timezone_str, "elife-data-warehouse-prod.ods.ride_ride_1".to_time_str, "elife-data-warehouse-prod.ods.ride_ride_1".to_timezone_str, "elife-data-warehouse-prod.ods.ride_ride_1".passenger_count, "elife-data-warehouse-prod.ods.ride_ride_1".luggage_count, "elife-data-warehouse-prod.ods.ride_ride_1".children_count, "elife-data-warehouse-prod.ods.ride_ride_1".infant_count, "elife-data-warehouse-prod.ods.ride_ride_1".distance, "elife-data-warehouse-prod.ods.ride_ride_1".duration, "elife-data-warehouse-prod.ods.ride_dispatch_1".id AS dispatch_id, "elife-data-warehouse-prod.ods.ride_dispatch_1".trip_no, "elife-data-warehouse-prod.ods.ride_dispatch_1".amount AS dispatch_amount, "elife-data-warehouse-prod.ods

  print(price_training_t)


In [49]:
def read_sql_with_progress(query, engine, chunksize=1000):
    with tqdm(total=None, desc="Reading SQL") as pbar:
        df = pd.DataFrame()
        for chunk in pd.read_sql(query, engine, chunksize=chunksize):
            df = pd.concat([df, chunk], ignore_index=True)
            pbar.update(len(chunk))
        return df


In [51]:
# chunk = pd.read_sql(price_training_t,engine, chunksize=100)
# df = pd.concat(chunk, ignore_index=True)
# df

KeyboardInterrupt: 

In [50]:
df_raw_rides = read_sql_with_progress(price_training_t, engine, chunksize=1000)

Reading SQL: 0it [09:34, ?it/s]


RetryError: Timeout of 600.0s exceeded, last exception: 503 failed to connect to all addresses; last error: UNKNOWN: ipv4:142.250.217.106:443: Failed to connect to remote host: FD Shutdown

In [41]:
import pandas_gbq

In [42]:
df_raw_rides = pandas_gbq.read_gbq(price_training_t, credentials=credential, progress_bar_type='tqdm_notebook')

AttributeError: 'Select' object has no attribute 'strip'

# Use ORM to retrieve records


In [39]:
sample_count = session.query(F.count(price_training_t.c.ride_id)).scalar()
print(sample_count)


1061971


In [41]:
url = 'https://j1j495o5pk.execute-api.us-east-2.amazonaws.com/upncoming/ride-pricings'

In [42]:

# params = {
#     'from_lat': 37.61911449999999,
#     'from_lng':-122.3816274,
#     'to_lat':37.3635295,
#     'to_lng':-121.9285932,
#     'from_utc':1727352000,
# }
# response = requests.get(url=url, params=params)
# response.json()


In [43]:
# res = response.json()
# fix_price_zones = res['fleets'][0]['vehicle_classes'][0]['price_detail']['base_pricing']['fix_price_detail']
# print(fix_price_zones)

In [44]:
# ride_samples = []

In [45]:
# pt = price_training_q.first()
# pt

In [46]:
# params = {
#     'from_lat': (pt._mapping['start_ltt']),
#     'from_lng': (pt._mapping['start_lng']),
#     'to_lat': (pt._mapping['end_ltt']),
#     'to_lng': (pt._mapping['end_lng']),
#     # 'from_utc': int(pt._mapping['from_utc']),
# }
# params

# response = requests.get(url=url, params=params)
# res = response.json()
# res

In [47]:
from sqlalchemy import String,Integer,insert

In [49]:


sqlite_metadata = MetaData()
# fixed_zone_routes = Table('fixed_zone_routes', sqlite_metadata,
#                           Column('start', String),
#                           Column('end', String),
#                           Column('dispatch_id', Integer),
#                           )
sqlite_eng = create_engine('sqlite:///../data/dispatch_fix_zones.db', echo=False)
connection = sqlite_eng.connect()
sqlite_metadata.create_all(sqlite_eng)  


In [48]:

price_training_q = session.query(price_training_t).limit(500)
fix_zone_routes_list = []


In [50]:
i = 0
j = 0
for pt in price_training_q:
    # print('1')
    # ride_samples.append(pt)
    params = {
        'from_lat': pt._mapping['start_ltt'],
        'from_lng': pt._mapping['start_lng'],
        'to_lat': pt._mapping['end_ltt'],
        'to_lng': pt._mapping['end_lng'],
        # 'from_utc':pt._mapping['from_utc'],
    }
    try:
        response = requests.get(url=url, params=params)
    except requests.exceptions.Timeout:
        print('Timeout')
        continue
    except requests.exceptions.TooManyRedirects:
        print('TooManyRedirects')
        continue                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  
        # Tell the user their URL was bad and try a different one
    except requests.exceptions.RequestException as e:
        print('RequestException, Catastrophic error!')
        # continue
        # catastrophic error. bail.
        raise SystemExit(e)    
        
    except Exception as e:
        print(f"request: {e}")
        continue
    # print('2')
    try:
        res = response.json()
    except Exception as e:
        print(f"json: {e}")
        continue
    # print('3')
    try:
        fix_price_zones = res['fleets'][0]['vehicle_classes'][0]['price_detail']['base_pricing']['fix_price_detail']
    except KeyError as e:
        j = j +1
        print(f"{j} Not Fixed Price!")
        continue
    except IndexError as e:
        print("IndexError for fix_price_zones")
        continue
    except Exception as e:
        print(f"dict: {e}")
        continue
    # print('4')
    if not isinstance(fix_price_zones,dict):
        print(f"No fix price: {fix_price_zones}")
    else:
        try:
            route = (fix_price_zones['from'], fix_price_zones['to'],pt._mapping['dispatch_id'])
            fix_zone_routes_list.append(route)
            # ins = insert(fixed_zone_routes).values(
            #     start=fix_price_zones['from'], end=fix_price_zones['from'], dispatch_id= pt._mapping['dispatch_id'])
            i = i +1
        except KeyError as e:
            print("KeyError for route")
            continue

        # print('5')
        if i%50 == 0:
            # connection.execute(ins)
            df = pd.DataFrame(fix_zone_routes_list, columns=['start', 'end', 'dispatch_id'])
            df.to_sql('fixed_zone_routes', sqlite_eng, if_exists='append')
            fix_zone_routes_list = []
            print(f"Created {i} records")

    # print('6')
df = pd.DataFrame(fix_zone_routes_list, columns=['start', 'end', 'dispatch_id'])
df.to_sql('fixed_zone_routes', sqlite_eng, if_exists='append')
fix_zone_routes_list = []
print(f"Created {i} records")
    # print('6')
    # print(pt._mapping['ride_id'], pt._mapping['dispatch_amount'], pt._mapping['dispatch_currency'])
    # # print(pt._mapping['start_place'])
    # print(pt._mapping['start_lng'], pt._mapping['start_ltt'])
    # # print(pt._mapping['end_place']) 
    # print(pt._mapping['end_lng'], pt._mapping['end_ltt'])
    # print("------------")

50

Created 50 records
1 Not Fixed Price!
2 Not Fixed Price!
3 Not Fixed Price!


50

Created 100 records
4 Not Fixed Price!
5 Not Fixed Price!
6 Not Fixed Price!


50

Created 150 records
7 Not Fixed Price!
8 Not Fixed Price!
9 Not Fixed Price!


50

Created 200 records
10 Not Fixed Price!
11 Not Fixed Price!
12 Not Fixed Price!


50

Created 250 records
13 Not Fixed Price!
14 Not Fixed Price!


50

Created 300 records
15 Not Fixed Price!
16 Not Fixed Price!
17 Not Fixed Price!
18 Not Fixed Price!


50

Created 350 records
19 Not Fixed Price!
20 Not Fixed Price!
21 Not Fixed Price!
22 Not Fixed Price!


50

Created 400 records
23 Not Fixed Price!


50

Created 450 records
24 Not Fixed Price!
25 Not Fixed Price!


25

Created 475 records


In [51]:
fix_zone_routes_list

[('HAN', '市区', 3244754),
 ('Las vegas stripe', 'LAS', 2409067),
 ('MCO', 'Four coners', 2603740),
 ('JFK', '曼哈顿', 3023710),
 ('NRT', '东京3', 2443358),
 ('FOR', 'Fortaleza', 2951565),
 ('BKK', '市区中心', 1869755),
 ('LIS', 'Lisbon', 2454102),
 ('SAW', 'European Side,İstanbul', 3066812),
 ('LAS', 'Las vegas stripe', 1953221)]

In [46]:
price_training_q = session.query(price_training_t).limit(10)
ride_samples = []
for pt in price_training_q:
    ride_samples.append(pt)
    print(pt._mapping['ride_id'], pt._mapping['dispatch_amount'], pt._mapping['dispatch_currency'])
    # print(pt._mapping['start_place'])
    print(pt._mapping['start_lng'], pt._mapping['start_ltt'])
    # print(pt._mapping['end_place']) 
    print(pt._mapping['end_lng'], pt._mapping['end_ltt'])
    print("------------")



3174224 37 USD
-80.288455 25.796914
-80.1787252 25.7788675
------------
3139585 45.56 USD
-122.308849 47.450243
-122.3380565 47.6178573
------------
2867487 69.32 USD
-73.778236 40.641319
-73.984006 40.7614242
------------
3387752 47.15 USD
29.3168603 40.905371
28.9734003 41.0028873
------------
3092081 51.54 USD
-118.40714 33.942049
-118.3600099 34.1374974
------------
3492939 67.5 USD
13.2368773 -8.815474
13.2352778 -8.8480556
------------
3481148 36.8 USD
28.7050148 41.2628732
28.9787674 41.0138689
------------
3340201 9 USD
36.9260693 -1.3227102
36.8134193 -1.2750472
------------
3334898 117.12 CNY
121.808361 31.143941
121.3012581 31.1899364
------------
3554427 4.17 USD
108.2355635 16.0708675
108.199 16.0439
------------


In [47]:
ride_samples[0]._mapping

{'ride_id': 3174224, 'trip_count': 1, 'from_utc': 1706368200, 'from_time_str': '2024-01-27 10:10', 'from_timezone_str': 'America/New_York', 'to_time_str': None, 'to_timezone_str': None, 'passenger_count': 2, 'luggage_count': 2, 'children_count': None, 'infant_count': None, 'distance': 14782, 'duration': 1736, 'dispatch_id': 2057871, 'trip_no': 0, 'dispatch_amount': Decimal('37'), 'dispatch_currency': 'USD', 'from_date_str': '2024-01-27', 'from_time_fix_str': '10:10:00', 'from_datetime_fix_str': '2024-01-27 10:10:00', 'trip_type_id': 136708097, 'trip_type': 'point2point', 'ride_status_id': 134610945, 'ride_status': 'Pending', 'dispatch_status_id': 134217729, 'distpatch_status': 'Pending', 'dispatch_type': 'dispatch', 'fleet': 'SAFE TRASFER SERVICE LLC', 'partner_id': 2, 'partner': 'Booking', 'start_place_id': 1043298, 'start_place': 'Miami International Airport (MIA), 2100 NW 42nd Ave, Miami, FL 33142, EUA', 'start_lng': Decimal('-80.288455'), 'start_ltt': Decimal('25.796914'), 'end_pla

In [48]:
df = pd.DataFrame(ride_samples)
df

Unnamed: 0,ride_id,trip_count,from_utc,from_time_str,from_timezone_str,to_time_str,to_timezone_str,passenger_count,luggage_count,children_count,...,start_place_id,start_place,start_lng,start_ltt,end_place_id,end_place,end_lng,end_ltt,vehicle_class_id,vehicle_class
0,3174224,1,1706368200,2024-01-27 10:10,America/New_York,,,2,2,,...,1043298,"Miami International Airport (MIA), 2100 NW 42n...",-80.288455,25.796914,1043299,"Porto de Miami, 1015 N America Wy #2, Miami, F...",-80.1787252,25.7788675,6,MPV-4
1,3139585,1,1705474980,2024-01-16 23:03,Pacific Standard Time,,,2,2,,...,978711,"Seattle Tacoma International Airport (SEA), 17...",-122.308849,47.450243,978712,"Pan Pacific Seattle, 2190 9th Ave, Seattle, WA...",-122.3380565,47.6178573,3,MPV-5
2,2867487,1,1704828300,2024-01-09 14:25,America/New_York,,,2,2,,...,490365,"John F Kennedy International Airport (JFK), Qu...",-73.778236,40.641319,490366,"citizenM New York Times Square, New York, NY 1...",-73.984006,40.7614242,1,Sedan
3,3387752,1,1715619600,2024-05-13 20:00,Europe/Istanbul,,,2,2,,...,1458764,"Istanbul Aiport Sabiha Gökçen (SAW), Sanayi, 3...",29.3168603,40.905371,1458765,"Küçük Ayasofya, Akburçak Sk. No:19, 34122 Fati...",28.9734003,41.0028873,1,Sedan
4,3092081,1,1716142980,2024-05-19 11:23,America/Los_Angeles,,,2,2,,...,894856,"Los Angeles International Airport (LAX), 1 Wor...",-118.40714,33.942049,894857,"Sheraton Universal Hotel, 333 Universal Hollyw...",-118.3600099,34.1374974,1,Sedan
5,3492939,1,1717149600,2024-05-31 11:00,West Africa Standard Time,,,1,2,,...,1459315,"Hotel Trópico R. da Missão 103, Luanda, Angola",13.2368773,-8.815474,96260,"Luanda Airport Ñlio. Dtfralio Sesa, Luanda, An...",13.2352778,-8.8480556,1,Sedan
6,3481148,1,1719996300,2024-07-03 11:45,Europe/Istanbul,,,4,4,,...,1643065,"Istanbul Airport (IST), Tayakadın, Terminal Ca...",28.7050148,41.2628732,1643066,"Gülhanepark Hotel & Spa, Hoca Paşa, Nöbethane ...",28.9787674,41.0138689,6,MPV-4
7,3340201,1,1719514500,2024-06-27 21:55,Africa/Nairobi,,,1,1,,...,1368576,Nairobi Jomo Kenyatta International Airport (N...,36.9260693,-1.3227102,1368577,"Hotel Boulevard, P.O. Box 42381, Nairobi, Kenya",36.8134193,-1.2750472,1,Sedan
8,3334898,1,1718720400,2024-06-18 22:20,Asia/Shanghai,,,2,2,,...,1358051,"Shanghai Pudong International Airport (PVG), 4...",121.808361,31.143941,1358052,Guo Jia Hui Zhan Zhong Xin ( Shang Hai ) Ban G...,121.3012581,31.1899364,2,Business Sedan
9,3554427,1,1719293400,2024-06-25 12:30,Asia/Ho_Chi_Minh,,,2,2,,...,1786042,"Luxtery Hotel & Spa, An Hải, An Hải Bắc, Sơn T...",108.2355635,16.0708675,1786043,"Da Nang International Airport (DAD), Đ Nguyễn ...",108.199,16.0439,1,Sedan


In [50]:

rides_q = session.query(price_training_t).limit(10)

ArgumentError: Column expression, FROM clause, or other columns clause element expected, got <sqlalchemy.sql.selectable.Select object at 0x7683c5756b10>. To create a FROM clause from a <class 'sqlalchemy.sql.selectable.Select'> object, use the .subquery() method. (Background on this error at: https://sqlalche.me/e/20/89ve)

In [45]:

for ride in rides_q:
    print(ride)

(2168131, 134610947, 3826, 'TYS', 3005, 1610901000, '2021-01-17 11:30', 'Eastern Standard Time', 0, 1542, None, None, None, None, 3813, 66808, 4472, 3, 3, 4, 1, 0, datetime.datetime(2021, 1, 13, 23, 53, 15, tzinfo=datetime.timezone.utc), datetime.datetime(2021, 1, 14, 0, 10, 4, tzinfo=datetime.timezone.utc), 585, '', '', None, None, None, None, 0, 0, 408, None, {'uuid': '9354f93f-de0a-4927-9785-d2e311110010', 'source_timestamp': 1706161022000}, '9354f93f-de0a-4927-9785-d2e311110010', 1706161022000)
(2168027, 134610947, 3711, 'LAS', 131, 1610220600, '2021-01-09 11:30', 'Pacific Standard Time', 0, 314, 1610911800, '2021-01-17 11:30', 'Pacific Standard Time', 0, 3698, 5352, 619, 1, 2, 2, 2, 1, datetime.datetime(2021, 1, 5, 22, 33, 24, tzinfo=datetime.timezone.utc), datetime.datetime(2021, 1, 5, 22, 33, 38, tzinfo=datetime.timezone.utc), 263, 'Welcome Vermel Beatty !', '', None, 'AA1475', None, None, 0, 0, 62, None, {'uuid': '9354f93f-de0a-4927-9785-d2e310001010', 'source_timestamp': 17061

In [45]:
rides_q = session.query(ride_ride_t).limit(10)
for ride in rides_q:
    print(ride)

(2168131, 134610947, 3826, 'TYS', 3005, 1610901000, '2021-01-17 11:30', 'Eastern Standard Time', 0, 1542, None, None, None, None, 3813, 66808, 4472, 3, 3, 4, 1, 0, datetime.datetime(2021, 1, 13, 23, 53, 15, tzinfo=datetime.timezone.utc), datetime.datetime(2021, 1, 14, 0, 10, 4, tzinfo=datetime.timezone.utc), 585, '', '', None, None, None, None, 0, 0, 408, None, {'uuid': '9354f93f-de0a-4927-9785-d2e311110010', 'source_timestamp': 1706161022000}, '9354f93f-de0a-4927-9785-d2e311110010', 1706161022000)
(2168027, 134610947, 3711, 'LAS', 131, 1610220600, '2021-01-09 11:30', 'Pacific Standard Time', 0, 314, 1610911800, '2021-01-17 11:30', 'Pacific Standard Time', 0, 3698, 5352, 619, 1, 2, 2, 2, 1, datetime.datetime(2021, 1, 5, 22, 33, 24, tzinfo=datetime.timezone.utc), datetime.datetime(2021, 1, 5, 22, 33, 38, tzinfo=datetime.timezone.utc), 263, 'Welcome Vermel Beatty !', '', None, 'AA1475', None, None, 0, 0, 62, None, {'uuid': '9354f93f-de0a-4927-9785-d2e310001010', 'source_timestamp': 17061

In [None]:
# print(price_training_t)

In [54]:
# use core to retrieve records
# rp = connection.execute(price_training_t)
# for i, record in enumerate(rp):
#     print(i, record.ride_id)

0 2806505
1 3084481
2 3084481
3 3084481
4 3084481
5 3135833
6 3035388
7 3035388
8 3272386
9 3297609
10 3297609
11 3297609
12 3297609
13 2815527
14 2815527
15 3318375
16 3318375
17 3318375
18 3318375
19 3318375
20 3318375
21 3318375
22 3318375
23 3318375
24 3318375
25 3318375
26 3318375
27 3318375
28 3318375
29 3318375
30 3318375
31 3318375
32 3318375
33 2792645
34 2792645
35 3127714
36 3127714
37 3127714
38 3127714
39 3312816
40 3266949
41 3266949
42 3266949
43 3266949
44 3101135
45 3101135
46 3016261
47 3287559
48 3313441
49 3313441
50 3313441
51 3313441
52 3315547
53 3315547
54 3315547
55 3315547
56 2894069
57 3102445
58 3102445
59 3284060
60 3284060
61 3284060
62 3284060
63 3284060
64 3284060
65 3284060
66 3284060
67 3284060
68 3284060
69 3284060
70 3284060
71 3284060
72 3284060
73 3284060
74 3284060
75 3284060
76 3284060
77 3335613
78 3335613
79 3335613
80 3335613
81 3315386
82 3315386
83 3315386
84 3315386
85 3088964
86 3088964
87 3196171
88 3196171
89 3196171
90 3196171
91 319332

In [47]:
# results = rp.fetchall()
# results

[(3302172, 1, 1710240900, '2024-03-12 06:55', 'America/New_York', None, None, 2, 4, 0, 0, 26048, 1389, 2313088, 0, Decimal('42'), 'USD', '2024-03-12', '06:55:00', '2024-03-12 06:55:00', 136708097, 'point2point', 134610945, 'Pending', 134217729, 'Pending', 'dispatch', 'Safe Transfer Service LLC', 10, 'China Ctrip', 1293217, 'Orlando International Airport B(Orlando International Airport B)', Decimal('-81.3078871'), Decimal('28.4302277'), 1293218, '7389 Alpine Butterfly Lane(7389 Alpine Butterfly Ln, Orlando, FL 32819美国)', Decimal('-81.480123'), Decimal('28.441818'), 3, 'MPV-5'),
 (3302172, 1, 1710240900, '2024-03-12 06:55', 'America/New_York', None, None, 2, 4, 0, 0, 26048, 1389, 2313088, 0, Decimal('42'), 'USD', '2024-03-12', '06:55:00', '2024-03-12 06:55:00', 136708097, 'point2point', 134610945, 'Pending', 134217729, 'Pending', 'dispatch', 'Safe Transfer Service LLC', 10, 'China Ctrip', 1293217, 'Orlando International Airport B(Orlando International Airport B)', Decimal('-81.3078871'),

In [54]:
# df = pd.read_sql(price_training_t, engine)
# df

Unnamed: 0,ride_id,trip_count,from_utc,from_time_str,from_timezone_str,to_time_str,to_timezone_str,passenger_count,luggage_count,children_count,...,start_place_id,start_place,start_lng,start_ltt,end_place_id,end_place,end_lng,end_ltt,vehicle_class_id,vehicle_class
0,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,1293055,Orlando Sanford International Airport,-81.234288,28.775940,1293056,"12235 Regency Village Dr, Orlando, FL 32821, USA",-81.486903,28.388792,3,MPV-5
1,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,1293055,Orlando Sanford International Airport,-81.234288,28.775940,1293056,"12235 Regency Village Dr, Orlando, FL 32821, USA",-81.486903,28.388792,3,MPV-5
2,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,1293055,Orlando Sanford International Airport,-81.234288,28.775940,1293056,"12235 Regency Village Dr, Orlando, FL 32821, USA",-81.486903,28.388792,3,MPV-5
3,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,1293055,Orlando Sanford International Airport,-81.234288,28.775940,1293056,"12235 Regency Village Dr, Orlando, FL 32821, USA",-81.486903,28.388792,3,MPV-5
4,3324918,1,1711479180,2024-03-26 14:53,America/New_York,,,4,4.0,,...,1338363,Orlando International Airport,-81.310547,28.424599,1338364,Towneplace Suites Orlando Downtown,-81.378756,28.528344,3,MPV-5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839,3121321,1,1705789800,2024-01-20 17:30,America/New_York,,,2,2.0,,...,944935,"13351 FL-535, 13351 FL-535, Orlando, FL 32821,...",-81.502616,28.372109,944936,"Orlando International Airport (MCO), 1 Jeff Fu...",-81.308332,28.429425,3,MPV-5
840,3121321,1,1705789800,2024-01-20 17:30,America/New_York,,,2,2.0,,...,944935,"13351 FL-535, 13351 FL-535, Orlando, FL 32821,...",-81.502616,28.372109,944936,"Orlando International Airport (MCO), 1 Jeff Fu...",-81.308332,28.429425,3,MPV-5
841,3121321,1,1705789800,2024-01-20 17:30,America/New_York,,,2,2.0,,...,944935,"13351 FL-535, 13351 FL-535, Orlando, FL 32821,...",-81.502616,28.372109,944936,"Orlando International Airport (MCO), 1 Jeff Fu...",-81.308332,28.429425,3,MPV-5
842,3099498,1,1704488400,2024-01-05 16:00,America/New_York,,,2,2.0,,...,908266,DoubleTree Suites by Hilton Orlando - Disney S...,-81.506486,28.379053,908267,"Orlando International Airport (MCO), 1 Jeff Fu...",-81.308332,28.429425,3,MPV-5


In [57]:
sqlite_eng = create_engine('sqlite:///../data/price_training_from_gbq_raw.db', echo=False)

In [58]:
df.to_sql('price_training_orlando_mpv5', sqlite_eng, if_exists='replace')

844

# Pandas Processing


## Traffic peak time exclusion
source https://www.quora.com/What-is-the-trickiest-time-of-the-day-to-drive-in-Orlando

In [39]:
from workalendar.usa import Florida

In [40]:
cal_orlando = Florida()


In [41]:
exclude_dates = [d[0] for d in cal_orlando.holidays(2024)]
exclude_dates

[datetime.date(2024, 1, 1),
 datetime.date(2024, 1, 15),
 datetime.date(2024, 5, 27),
 datetime.date(2024, 7, 4),
 datetime.date(2024, 9, 2),
 datetime.date(2024, 11, 11),
 datetime.date(2024, 11, 28),
 datetime.date(2024, 11, 29),
 datetime.date(2024, 12, 25)]

In [25]:
from workalendar.usa import Florida 
import numpy as np
cal_florida = Florida()
exclude_dates_str = [str(d[0]) for d in cal_florida.holidays(2024)]
exclude_dates_str

['2024-01-01',
 '2024-01-15',
 '2024-05-27',
 '2024-07-04',
 '2024-09-02',
 '2024-11-11',
 '2024-11-28',
 '2024-11-29',
 '2024-12-25']

In [28]:
exclude_dates = [d[0] for d in cal_florida.holidays(2024)]
# exclude_dates
# res = df['from_datetime_utc'].apply(lambda x: x in exclude_dates)

In [29]:
# np.any(res.apply(lambda x: x in exclude_dates))
# res[0]=True
# res
# np.any(res)


In [59]:
from pricing.data.utils import validate_datetime_in_iso_format, validate_timezone_in_iana, get_timezone_abbreviation, fix_timezone

In [60]:
df_invalid_datetime = df[df.apply(lambda x: not validate_datetime_in_iso_format(x['from_datetime_fix_str']), axis=1)]
df_invalid_datetime

Unnamed: 0,ride_id,trip_count,from_utc,from_time_str,from_timezone_str,to_time_str,to_timezone_str,passenger_count,luggage_count,children_count,...,start_place_id,start_place,start_lng,start_ltt,end_place_id,end_place,end_lng,end_ltt,vehicle_class_id,vehicle_class


In [61]:
df_invalid_timezone = df[df.apply(lambda x: not validate_timezone_in_iana(x['from_timezone_str']), axis=1)]
df_invalid_timezone

Unnamed: 0,ride_id,trip_count,from_utc,from_time_str,from_timezone_str,to_time_str,to_timezone_str,passenger_count,luggage_count,children_count,...,start_place_id,start_place,start_lng,start_ltt,end_place_id,end_place,end_lng,end_ltt,vehicle_class_id,vehicle_class
91,3280576,1,1709583900,2024-03-04 15:25,Eastern Standard Time,,,2,2.0,0.0,...,695,Orlando International Airport,-81.308083,28.431158,1250311,"Rosen Inn at Pointe Orlando, 9000 Internationa...",-81.472724,28.433655,3,MPV-5
92,3280576,1,1709583900,2024-03-04 15:25,Eastern Standard Time,,,2,2.0,0.0,...,695,Orlando International Airport,-81.308083,28.431158,1250311,"Rosen Inn at Pointe Orlando, 9000 Internationa...",-81.472724,28.433655,3,MPV-5
93,3280576,1,1709583900,2024-03-04 15:25,Eastern Standard Time,,,2,2.0,0.0,...,695,Orlando International Airport,-81.308083,28.431158,1250311,"Rosen Inn at Pointe Orlando, 9000 Internationa...",-81.472724,28.433655,3,MPV-5
94,3280576,1,1709583900,2024-03-04 15:25,Eastern Standard Time,,,2,2.0,0.0,...,695,Orlando International Airport,-81.308083,28.431158,1250311,"Rosen Inn at Pointe Orlando, 9000 Internationa...",-81.472724,28.433655,3,MPV-5
121,3127714,1,1706126460,2024-01-24 15:01,Eastern Standard Time,,,5,5.0,,...,956728,"Orlando International Airport (MCO), 1 Jeff Fu...",-81.308332,28.429425,956729,"Staybridge Suites Orlando Royale Parc Suites, ...",-81.519718,28.331611,3,MPV-5
122,3127714,1,1706126460,2024-01-24 15:01,Eastern Standard Time,,,5,5.0,,...,956728,"Orlando International Airport (MCO), 1 Jeff Fu...",-81.308332,28.429425,956729,"Staybridge Suites Orlando Royale Parc Suites, ...",-81.519718,28.331611,3,MPV-5
123,3127714,1,1706126460,2024-01-24 15:01,Eastern Standard Time,,,5,5.0,,...,956728,"Orlando International Airport (MCO), 1 Jeff Fu...",-81.308332,28.429425,956729,"Staybridge Suites Orlando Royale Parc Suites, ...",-81.519718,28.331611,3,MPV-5
124,3127714,1,1706126460,2024-01-24 15:01,Eastern Standard Time,,,5,5.0,,...,956728,"Orlando International Airport (MCO), 1 Jeff Fu...",-81.308332,28.429425,956729,"Staybridge Suites Orlando Royale Parc Suites, ...",-81.519718,28.331611,3,MPV-5
154,3315653,1,1716796800,2024-05-27 04:00,Eastern Daylight Time,,,4,5.0,,...,30214,Wyndham Orlando Resort & Conference Center Cel...,-81.587489,28.337133,695,Orlando International Airport,-81.308083,28.431158,3,MPV-5
185,3294242,1,1710079200,2024-03-10 10:00,Eastern Daylight Time,,,4,2.0,,...,1277441,"Conrad Orlando 1500 Eastbeach Wy, Orlando, FL ...",-81.52734,28.40968,1277441,"Conrad Orlando 1500 Eastbeach Wy, Orlando, FL ...",-81.52734,28.40968,3,MPV-5


In [60]:
#orlando_airport = pd.read_csv('../../data/orlando_all_output.csv')
#orlando_airport.head()
#orlando_airport.dtypes
#orlando_airport.to_sql('orlando_airport', sqlite_eng, if_exists='append')

In [78]:
df['from_timezone_fix_str'] = df.apply(lambda x: fix_timezone(x['from_timezone_str']), axis=1) 
df

Unnamed: 0,ride_id,trip_count,from_utc,from_time_str,from_timezone_str,to_time_str,to_timezone_str,passenger_count,luggage_count,children_count,...,start_ltt,end_place_id,end_place,end_lng,end_ltt,vehicle_class_id,vehicle_class,timezone_fixed,from_timezone_fixed_str,from_timezone_fix_str
0,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,28.775940,1293056,"12235 Regency Village Dr, Orlando, FL 32821, USA",-81.486903,28.388792,3,MPV-5,America/New_York,America/New_York,America/New_York
1,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,28.775940,1293056,"12235 Regency Village Dr, Orlando, FL 32821, USA",-81.486903,28.388792,3,MPV-5,America/New_York,America/New_York,America/New_York
2,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,28.775940,1293056,"12235 Regency Village Dr, Orlando, FL 32821, USA",-81.486903,28.388792,3,MPV-5,America/New_York,America/New_York,America/New_York
3,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,28.775940,1293056,"12235 Regency Village Dr, Orlando, FL 32821, USA",-81.486903,28.388792,3,MPV-5,America/New_York,America/New_York,America/New_York
4,3324918,1,1711479180,2024-03-26 14:53,America/New_York,,,4,4.0,,...,28.424599,1338364,Towneplace Suites Orlando Downtown,-81.378756,28.528344,3,MPV-5,America/New_York,America/New_York,America/New_York
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839,3121321,1,1705789800,2024-01-20 17:30,America/New_York,,,2,2.0,,...,28.372109,944936,"Orlando International Airport (MCO), 1 Jeff Fu...",-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York
840,3121321,1,1705789800,2024-01-20 17:30,America/New_York,,,2,2.0,,...,28.372109,944936,"Orlando International Airport (MCO), 1 Jeff Fu...",-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York
841,3121321,1,1705789800,2024-01-20 17:30,America/New_York,,,2,2.0,,...,28.372109,944936,"Orlando International Airport (MCO), 1 Jeff Fu...",-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York
842,3099498,1,1704488400,2024-01-05 16:00,America/New_York,,,2,2.0,,...,28.379053,908267,"Orlando International Airport (MCO), 1 Jeff Fu...",-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York


In [79]:

df_valid_datetime = df[df.apply(lambda x: validate_datetime_in_iso_format(x['from_datetime_fix_str']), axis=1)]
df_valid_timezone = df_valid_datetime[df_valid_datetime.apply(lambda x: validate_timezone_in_iana(x['from_timezone_fix_str']), axis=1)]
df_valid_timezone


Unnamed: 0,ride_id,trip_count,from_utc,from_time_str,from_timezone_str,to_time_str,to_timezone_str,passenger_count,luggage_count,children_count,...,start_ltt,end_place_id,end_place,end_lng,end_ltt,vehicle_class_id,vehicle_class,timezone_fixed,from_timezone_fixed_str,from_timezone_fix_str
0,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,28.775940,1293056,"12235 Regency Village Dr, Orlando, FL 32821, USA",-81.486903,28.388792,3,MPV-5,America/New_York,America/New_York,America/New_York
1,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,28.775940,1293056,"12235 Regency Village Dr, Orlando, FL 32821, USA",-81.486903,28.388792,3,MPV-5,America/New_York,America/New_York,America/New_York
2,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,28.775940,1293056,"12235 Regency Village Dr, Orlando, FL 32821, USA",-81.486903,28.388792,3,MPV-5,America/New_York,America/New_York,America/New_York
3,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,28.775940,1293056,"12235 Regency Village Dr, Orlando, FL 32821, USA",-81.486903,28.388792,3,MPV-5,America/New_York,America/New_York,America/New_York
4,3324918,1,1711479180,2024-03-26 14:53,America/New_York,,,4,4.0,,...,28.424599,1338364,Towneplace Suites Orlando Downtown,-81.378756,28.528344,3,MPV-5,America/New_York,America/New_York,America/New_York
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839,3121321,1,1705789800,2024-01-20 17:30,America/New_York,,,2,2.0,,...,28.372109,944936,"Orlando International Airport (MCO), 1 Jeff Fu...",-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York
840,3121321,1,1705789800,2024-01-20 17:30,America/New_York,,,2,2.0,,...,28.372109,944936,"Orlando International Airport (MCO), 1 Jeff Fu...",-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York
841,3121321,1,1705789800,2024-01-20 17:30,America/New_York,,,2,2.0,,...,28.372109,944936,"Orlando International Airport (MCO), 1 Jeff Fu...",-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York
842,3099498,1,1704488400,2024-01-05 16:00,America/New_York,,,2,2.0,,...,28.379053,908267,"Orlando International Airport (MCO), 1 Jeff Fu...",-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York


In [80]:

df_valid_timezone.loc[df_invalid_timezone.index, ['from_timezone_str', 'from_timezone_fix_str']]


Unnamed: 0,from_timezone_str,from_timezone_fix_str
91,Eastern Standard Time,US/Eastern
92,Eastern Standard Time,US/Eastern
93,Eastern Standard Time,US/Eastern
94,Eastern Standard Time,US/Eastern
121,Eastern Standard Time,US/Eastern
122,Eastern Standard Time,US/Eastern
123,Eastern Standard Time,US/Eastern
124,Eastern Standard Time,US/Eastern
154,Eastern Daylight Time,US/Eastern
185,Eastern Daylight Time,US/Eastern


In [81]:
from datetime import datetime
import pytz
df_utc_offset = (df_valid_timezone.apply(lambda x: datetime.now(pytz.timezone(x['from_timezone_fix_str'])), axis=1)
                            .apply(lambda x: x.strftime('%z')))
df_utc_offset.name = 'utc_offset'
df_utc_offset


0      -0400
1      -0400
2      -0400
3      -0400
4      -0400
       ...  
839    -0400
840    -0400
841    -0400
842    -0400
843    -0400
Name: utc_offset, Length: 844, dtype: object

In [82]:
df_dt_str = df_valid_timezone['from_datetime_fix_str']
df_valid_timezone['from_datetime_local'] = df_valid_timezone.apply(lambda x: (pd.to_datetime(x['from_datetime_fix_str']).to_datetime64()), axis=1)
df_valid_timezone

Unnamed: 0,ride_id,trip_count,from_utc,from_time_str,from_timezone_str,to_time_str,to_timezone_str,passenger_count,luggage_count,children_count,...,end_place_id,end_place,end_lng,end_ltt,vehicle_class_id,vehicle_class,timezone_fixed,from_timezone_fixed_str,from_timezone_fix_str,from_datetime_local
0,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,1293056,"12235 Regency Village Dr, Orlando, FL 32821, USA",-81.486903,28.388792,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-03-12 09:55:00
1,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,1293056,"12235 Regency Village Dr, Orlando, FL 32821, USA",-81.486903,28.388792,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-03-12 09:55:00
2,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,1293056,"12235 Regency Village Dr, Orlando, FL 32821, USA",-81.486903,28.388792,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-03-12 09:55:00
3,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,1293056,"12235 Regency Village Dr, Orlando, FL 32821, USA",-81.486903,28.388792,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-03-12 09:55:00
4,3324918,1,1711479180,2024-03-26 14:53,America/New_York,,,4,4.0,,...,1338364,Towneplace Suites Orlando Downtown,-81.378756,28.528344,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-03-26 14:53:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839,3121321,1,1705789800,2024-01-20 17:30,America/New_York,,,2,2.0,,...,944936,"Orlando International Airport (MCO), 1 Jeff Fu...",-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-01-20 17:30:00
840,3121321,1,1705789800,2024-01-20 17:30,America/New_York,,,2,2.0,,...,944936,"Orlando International Airport (MCO), 1 Jeff Fu...",-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-01-20 17:30:00
841,3121321,1,1705789800,2024-01-20 17:30,America/New_York,,,2,2.0,,...,944936,"Orlando International Airport (MCO), 1 Jeff Fu...",-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-01-20 17:30:00
842,3099498,1,1704488400,2024-01-05 16:00,America/New_York,,,2,2.0,,...,908267,"Orlando International Airport (MCO), 1 Jeff Fu...",-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-01-05 16:00:00


In [93]:
df_valid_timezone.dtypes

ride_id                           int64
trip_count                        int64
from_utc                          int64
from_time_str                    object
from_timezone_str                object
to_time_str                      object
to_timezone_str                  object
passenger_count                   int64
luggage_count                   float64
children_count                  float64
infant_count                    float64
distance                          int64
duration                          int64
dispatch_id                       int64
trip_no                           int64
dispatch_amount                 float64
dispatch_currency                object
from_date_str                    object
from_time_fix_str                object
from_datetime_fix_str            object
trip_type_id                    float64
trip_type                        object
ride_status_id                    int64
ride_status                      object
dispatch_status_id                int64


In [83]:

# df_valid_timezone['from_datetime_local_tz'] = df_valid_timezone.apply(lambda x: pytz.timezone(x.loc[:,'from_timezone_str']).localize(x.loc[:,'from_datetime_local']), axis=1)
df_valid_timezone['from_timezone'] = df_valid_timezone.apply(lambda x: pytz.timezone(x['from_timezone_fix_str']), axis=1)
df_valid_timezone


Unnamed: 0,ride_id,trip_count,from_utc,from_time_str,from_timezone_str,to_time_str,to_timezone_str,passenger_count,luggage_count,children_count,...,end_place,end_lng,end_ltt,vehicle_class_id,vehicle_class,timezone_fixed,from_timezone_fixed_str,from_timezone_fix_str,from_datetime_local,from_timezone
0,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,"12235 Regency Village Dr, Orlando, FL 32821, USA",-81.486903,28.388792,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-03-12 09:55:00,America/New_York
1,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,"12235 Regency Village Dr, Orlando, FL 32821, USA",-81.486903,28.388792,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-03-12 09:55:00,America/New_York
2,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,"12235 Regency Village Dr, Orlando, FL 32821, USA",-81.486903,28.388792,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-03-12 09:55:00,America/New_York
3,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,"12235 Regency Village Dr, Orlando, FL 32821, USA",-81.486903,28.388792,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-03-12 09:55:00,America/New_York
4,3324918,1,1711479180,2024-03-26 14:53,America/New_York,,,4,4.0,,...,Towneplace Suites Orlando Downtown,-81.378756,28.528344,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-03-26 14:53:00,America/New_York
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839,3121321,1,1705789800,2024-01-20 17:30,America/New_York,,,2,2.0,,...,"Orlando International Airport (MCO), 1 Jeff Fu...",-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-01-20 17:30:00,America/New_York
840,3121321,1,1705789800,2024-01-20 17:30,America/New_York,,,2,2.0,,...,"Orlando International Airport (MCO), 1 Jeff Fu...",-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-01-20 17:30:00,America/New_York
841,3121321,1,1705789800,2024-01-20 17:30,America/New_York,,,2,2.0,,...,"Orlando International Airport (MCO), 1 Jeff Fu...",-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-01-20 17:30:00,America/New_York
842,3099498,1,1704488400,2024-01-05 16:00,America/New_York,,,2,2.0,,...,"Orlando International Airport (MCO), 1 Jeff Fu...",-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-01-05 16:00:00,America/New_York


In [84]:
df_valid_timezone['from_datetime_tz'] = df_valid_timezone.apply(lambda x: x['from_timezone'].localize(x['from_datetime_local']), axis=1)
df_valid_timezone
                                    # .apply(lambda x: x.localize(x.loc[:,'from_timezone_str']), axis=1))
# df_valid_timezone


Unnamed: 0,ride_id,trip_count,from_utc,from_time_str,from_timezone_str,to_time_str,to_timezone_str,passenger_count,luggage_count,children_count,...,end_lng,end_ltt,vehicle_class_id,vehicle_class,timezone_fixed,from_timezone_fixed_str,from_timezone_fix_str,from_datetime_local,from_timezone,from_datetime_tz
0,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,-81.486903,28.388792,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-03-12 09:55:00,America/New_York,2024-03-12 09:55:00-04:00
1,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,-81.486903,28.388792,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-03-12 09:55:00,America/New_York,2024-03-12 09:55:00-04:00
2,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,-81.486903,28.388792,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-03-12 09:55:00,America/New_York,2024-03-12 09:55:00-04:00
3,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,-81.486903,28.388792,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-03-12 09:55:00,America/New_York,2024-03-12 09:55:00-04:00
4,3324918,1,1711479180,2024-03-26 14:53,America/New_York,,,4,4.0,,...,-81.378756,28.528344,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-03-26 14:53:00,America/New_York,2024-03-26 14:53:00-04:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839,3121321,1,1705789800,2024-01-20 17:30,America/New_York,,,2,2.0,,...,-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-01-20 17:30:00,America/New_York,2024-01-20 17:30:00-05:00
840,3121321,1,1705789800,2024-01-20 17:30,America/New_York,,,2,2.0,,...,-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-01-20 17:30:00,America/New_York,2024-01-20 17:30:00-05:00
841,3121321,1,1705789800,2024-01-20 17:30,America/New_York,,,2,2.0,,...,-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-01-20 17:30:00,America/New_York,2024-01-20 17:30:00-05:00
842,3099498,1,1704488400,2024-01-05 16:00,America/New_York,,,2,2.0,,...,-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-01-05 16:00:00,America/New_York,2024-01-05 16:00:00-05:00


# filtering out peak traffic time

In [95]:
peak_time_str = [('07:00:00', '09:00:00'), ('16:00:00', '19:00:00')]
night_time_str = [('22:00:00', '6:00:00')]  # Shanghai, US usually no overtime extra fees New York 8pm ~ 6am
ind = []
td = []
for pt in peak_time_str:
    ind.append(pd.DatetimeIndex(pt))
ind
for i in ind:
    i[1]-i[0]

[DatetimeIndex(['2024-09-18 07:00:00', '2024-09-18 09:00:00'], dtype='datetime64[ns]', freq=None),
 DatetimeIndex(['2024-09-18 16:00:00', '2024-09-18 19:00:00'], dtype='datetime64[ns]', freq=None)]

Timedelta('0 days 02:00:00')

Timedelta('0 days 03:00:00')

In [96]:
peak_time = []
for pt in peak_time_str:
    peak_time.append(pd.date_range(pt[0], pt[1], freq='h'))
for pt in peak_time:
    print(pt, pt.time)

DatetimeIndex(['2024-09-18 07:00:00', '2024-09-18 08:00:00',
               '2024-09-18 09:00:00'],
              dtype='datetime64[ns]', freq='h') [datetime.time(7, 0) datetime.time(8, 0) datetime.time(9, 0)]
DatetimeIndex(['2024-09-18 16:00:00', '2024-09-18 17:00:00',
               '2024-09-18 18:00:00', '2024-09-18 19:00:00'],
              dtype='datetime64[ns]', freq='h') [datetime.time(16, 0) datetime.time(17, 0) datetime.time(18, 0)
 datetime.time(19, 0)]


In [35]:
peak_time_str

[('07:00:00', '09:00:00'), ('16:00:00', '19:00:00')]

In [99]:

df_peak_traffic_time = df_valid_timezone[
    df_valid_timezone['from_datetime_tz'].apply(lambda x: x.strftime('%H:%M:%S')).between(*peak_time_str[0])
    | df_valid_timezone['from_datetime_tz'].apply(lambda x: x.strftime('%H:%M:%S')).between(*peak_time_str[1])
]
df_peak_traffic_time

Unnamed: 0,ride_id,trip_count,from_utc,from_time_str,from_timezone_str,to_time_str,to_timezone_str,passenger_count,luggage_count,children_count,...,end_lng,end_ltt,vehicle_class_id,vehicle_class,timezone_fixed,from_timezone_fixed_str,from_timezone_fix_str,from_datetime_local,from_timezone,from_datetime_tz
16,2946548,1,1711540800,2024-03-27 08:00,America/New_York,,,2,2.0,,...,-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-03-27 08:00:00,America/New_York,2024-03-27 08:00:00-04:00
36,3110745,1,1707346500,2024-02-07 17:55,America/New_York,,,5,0.0,,...,-81.650126,28.361338,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-02-07 17:55:00,America/New_York,2024-02-07 17:55:00-05:00
37,3110745,1,1707346500,2024-02-07 17:55,America/New_York,,,5,0.0,,...,-81.650126,28.361338,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-02-07 17:55:00,America/New_York,2024-02-07 17:55:00-05:00
46,3108855,1,1720478100,2024-07-08 18:35,America/New_York,,,3,3.0,,...,-81.472724,28.433655,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-07-08 18:35:00,America/New_York,2024-07-08 18:35:00-04:00
47,3108855,1,1720478100,2024-07-08 18:35,America/New_York,,,3,3.0,,...,-81.472724,28.433655,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-07-08 18:35:00,America/New_York,2024-07-08 18:35:00-04:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839,3121321,1,1705789800,2024-01-20 17:30,America/New_York,,,2,2.0,,...,-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-01-20 17:30:00,America/New_York,2024-01-20 17:30:00-05:00
840,3121321,1,1705789800,2024-01-20 17:30,America/New_York,,,2,2.0,,...,-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-01-20 17:30:00,America/New_York,2024-01-20 17:30:00-05:00
841,3121321,1,1705789800,2024-01-20 17:30,America/New_York,,,2,2.0,,...,-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-01-20 17:30:00,America/New_York,2024-01-20 17:30:00-05:00
842,3099498,1,1704488400,2024-01-05 16:00,America/New_York,,,2,2.0,,...,-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-01-05 16:00:00,America/New_York,2024-01-05 16:00:00-05:00


In [100]:

df_out_of_peak_traffic_time = df_valid_timezone[
    ~ (df_valid_timezone['from_datetime_tz'].apply(lambda x: x.strftime('%H:%M:%S')).between(*peak_time_str[0])
    | df_valid_timezone['from_datetime_tz'].apply(lambda x: x.strftime('%H:%M:%S')).between(*peak_time_str[1]))
]
df_out_of_peak_traffic_time


Unnamed: 0,ride_id,trip_count,from_utc,from_time_str,from_timezone_str,to_time_str,to_timezone_str,passenger_count,luggage_count,children_count,...,end_lng,end_ltt,vehicle_class_id,vehicle_class,timezone_fixed,from_timezone_fixed_str,from_timezone_fix_str,from_datetime_local,from_timezone,from_datetime_tz
0,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,-81.486903,28.388792,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-03-12 09:55:00,America/New_York,2024-03-12 09:55:00-04:00
1,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,-81.486903,28.388792,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-03-12 09:55:00,America/New_York,2024-03-12 09:55:00-04:00
2,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,-81.486903,28.388792,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-03-12 09:55:00,America/New_York,2024-03-12 09:55:00-04:00
3,3302091,1,1710251700,2024-03-12 09:55,America/New_York,,,4,0.0,,...,-81.486903,28.388792,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-03-12 09:55:00,America/New_York,2024-03-12 09:55:00-04:00
4,3324918,1,1711479180,2024-03-26 14:53,America/New_York,,,4,4.0,,...,-81.378756,28.528344,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-03-26 14:53:00,America/New_York,2024-03-26 14:53:00-04:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
829,3245633,1,1713607200,2024-04-20 06:00,America/New_York,,,4,4.0,,...,-81.308332,28.429425,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-04-20 06:00:00,America/New_York,2024-04-20 06:00:00-04:00
830,3329995,1,1711050900,2024-03-21 15:55,America/New_York,,,5,0.0,,...,-81.463575,28.459513,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-03-21 15:55:00,America/New_York,2024-03-21 15:55:00-04:00
831,3329995,1,1711050900,2024-03-21 15:55,America/New_York,,,5,0.0,,...,-81.463575,28.459513,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-03-21 15:55:00,America/New_York,2024-03-21 15:55:00-04:00
832,3329995,1,1711050900,2024-03-21 15:55,America/New_York,,,5,0.0,,...,-81.463575,28.459513,3,MPV-5,America/New_York,America/New_York,America/New_York,2024-03-21 15:55:00,America/New_York,2024-03-21 15:55:00-04:00


In [100]:

# df_utc_offset = (df_valid_timezone.apply(lambda x: datetime.now(pytz.timezone(x.loc['timezone'])))
#                             .apply(lambda x: x.strftime('%z')))
# df_utc_offset.name = 'utc_offset'
# df_utc_offset
# 

# filter out round trip (time reservation) with feature distance = 1 (m)

In [113]:
df_no_round_trip = df_valid_timezone[df_valid_timezone['distance'] > 1]

In [123]:
df_training = df_no_round_trip.loc[:,['ride_id', 'trip_type', 'trip_no', 'trip_count', 'ride_status', 'partner', 'fleet', 
                'start_place', 'end_place',
                'passenger_count', 'luggage_count',
                'dispatch_amount', 'dispatch_currency',
                'distance', 'duration', 'vehicle_class', 
                'from_datetime_tz']]
 
df_training['cent_price_per_km'] = df_training['dispatch_amount'] / df_training['distance']*100.0
df_training

Unnamed: 0,ride_id,trip_type,trip_no,trip_count,ride_status,partner,fleet,start_place,end_place,passenger_count,luggage_count,dispatch_amount,dispatch_currency,distance,duration,vehicle_class,from_datetime_tz,cent_price_per_km
0,3302091,point2point,0,1,Pending,Book Taxi,Leonardo Carvalhal de Aguiar,Orlando Sanford International Airport,"12235 Regency Village Dr, Orlando, FL 32821, USA",4,0.0,83.36,USD,77232,3328,MPV-5,2024-03-12 09:55:00-04:00,0.107935
1,3302091,point2point,0,1,Pending,Book Taxi,Leonardo Carvalhal de Aguiar,Orlando Sanford International Airport,"12235 Regency Village Dr, Orlando, FL 32821, USA",4,0.0,83.36,USD,77232,3328,MPV-5,2024-03-12 09:55:00-04:00,0.107935
2,3302091,point2point,-1,1,Pending,Book Taxi,Elife,Orlando Sanford International Airport,"12235 Regency Village Dr, Orlando, FL 32821, USA",4,0.0,159.60,USD,77232,3328,MPV-5,2024-03-12 09:55:00-04:00,0.206650
3,3302091,point2point,-1,1,Pending,Book Taxi,Elife,Orlando Sanford International Airport,"12235 Regency Village Dr, Orlando, FL 32821, USA",4,0.0,159.60,USD,77232,3328,MPV-5,2024-03-12 09:55:00-04:00,0.206650
4,3324918,point2point,-1,1,Pending,KKDay,Elife,Orlando International Airport,Towneplace Suites Orlando Downtown,4,4.0,67.00,USD,17924,1323,MPV-5,2024-03-26 14:53:00-04:00,0.373800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839,3121321,point2point,0,1,Pending,Booking,Safe Transfer Service LLC,"13351 FL-535, 13351 FL-535, Orlando, FL 32821,...","Orlando International Airport (MCO), 1 Jeff Fu...",2,2.0,42.00,USD,27047,1286,MPV-5,2024-01-20 17:30:00-05:00,0.155285
840,3121321,point2point,-1,1,Pending,Booking,Elife,"13351 FL-535, 13351 FL-535, Orlando, FL 32821,...","Orlando International Airport (MCO), 1 Jeff Fu...",2,2.0,51.00,USD,27047,1286,MPV-5,2024-01-20 17:30:00-05:00,0.188561
841,3121321,point2point,-1,1,Pending,Booking,Elife,"13351 FL-535, 13351 FL-535, Orlando, FL 32821,...","Orlando International Airport (MCO), 1 Jeff Fu...",2,2.0,51.00,USD,27047,1286,MPV-5,2024-01-20 17:30:00-05:00,0.188561
842,3099498,,0,1,Cancelled,Booking,Safe Transfer Service LLC,DoubleTree Suites by Hilton Orlando - Disney S...,"Orlando International Airport (MCO), 1 Jeff Fu...",2,2.0,42.00,USD,28354,1344,MPV-5,2024-01-05 16:00:00-05:00,0.148127


In [117]:
def average_cent_per_km(x):
    x['average_cent_per_km'] = x['cent_price_per_km'].mean()
    return x

In [141]:
def fleet_trip_no(x):
    x['fleet_trip_count'] = len(x)
    return x

In [142]:
df_fleet_statistics = df_training.loc[:, ['ride_id','fleet']]
df_fleet_statistics

Unnamed: 0,ride_id,fleet
0,3302091,Leonardo Carvalhal de Aguiar
1,3302091,Leonardo Carvalhal de Aguiar
2,3302091,Elife
3,3302091,Elife
4,3324918,Elife
...,...,...
839,3121321,Safe Transfer Service LLC
840,3121321,Elife
841,3121321,Elife
842,3099498,Safe Transfer Service LLC


In [151]:

df_fleet_trip_no = df_fleet_statistics.groupby('fleet').aggregate([len])
df_fleet_trip_no.sort_values(by=('ride_id', 'len'), ascending=False, inplace=True)
df_fleet_trip_no


Unnamed: 0_level_0,ride_id
Unnamed: 0_level_1,len
fleet,Unnamed: 1_level_2
Elife,379
Safe Transfer Service LLC,134
Yorvis Hernández,107
xiaorong wu,46
Lumasini Transportation,36
CARLOS RODRIGO BELTRAN LOPEZ,27
Maudeline DiogeneCharles,21
Joao Moraes,19
Ucruise Luxury Transportation,10
wagner Valladao de Araujo Filho,8


In [161]:
df_big_fleets = df_fleet_trip_no[df_fleet_trip_no[('ride_id','len')] >100]
df_big_fleets

Unnamed: 0_level_0,ride_id
Unnamed: 0_level_1,len
fleet,Unnamed: 1_level_2
Elife,379
Safe Transfer Service LLC,134
Yorvis Hernández,107


In [163]:
df_big_fleet_data = []
for f in df_big_fleets.index:
    print(f)
    df_big_fleet_data.append(df_training[df_training['fleet'] == f])

Elife
Safe Transfer Service LLC
Yorvis Hernández


In [164]:
df_big_fleet_data[0]

Unnamed: 0,ride_id,trip_type,trip_no,trip_count,ride_status,partner,fleet,start_place,end_place,passenger_count,luggage_count,dispatch_amount,dispatch_currency,distance,duration,vehicle_class,from_datetime_tz,cent_price_per_km
2,3302091,point2point,-1,1,Pending,Book Taxi,Elife,Orlando Sanford International Airport,"12235 Regency Village Dr, Orlando, FL 32821, USA",4,0.0,159.60,USD,77232,3328,MPV-5,2024-03-12 09:55:00-04:00,0.206650
3,3302091,point2point,-1,1,Pending,Book Taxi,Elife,Orlando Sanford International Airport,"12235 Regency Village Dr, Orlando, FL 32821, USA",4,0.0,159.60,USD,77232,3328,MPV-5,2024-03-12 09:55:00-04:00,0.206650
4,3324918,point2point,-1,1,Pending,KKDay,Elife,Orlando International Airport,Towneplace Suites Orlando Downtown,4,4.0,67.00,USD,17924,1323,MPV-5,2024-03-26 14:53:00-04:00,0.373800
5,3324918,point2point,-1,1,Pending,KKDay,Elife,Orlando International Airport,Towneplace Suites Orlando Downtown,4,4.0,67.00,USD,17924,1323,MPV-5,2024-03-26 14:53:00-04:00,0.373800
8,3074203,,-1,1,Cancelled,Book Taxi,Elife,Orlando International Airport,"Hard Rock Hotel at Universal Orlando, 5800 Uni...",5,0.0,60.20,USD,27843,1370,MPV-5,2024-02-18 12:00:00-05:00,0.216212
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
834,3274112,point2point,-1,1,Cancelled,Booking,Elife,Universal’s Endless Summer Resort - Dockside I...,"Orlando International Airport (MCO), 1 Jeff Fu...",3,3.0,40.88,USD,24652,1470,MPV-5,2024-03-23 16:15:00-04:00,0.165828
835,3274112,point2point,-1,1,Cancelled,Booking,Elife,Universal’s Endless Summer Resort - Dockside I...,"Orlando International Airport (MCO), 1 Jeff Fu...",3,3.0,40.88,USD,24652,1470,MPV-5,2024-03-23 16:15:00-04:00,0.165828
840,3121321,point2point,-1,1,Pending,Booking,Elife,"13351 FL-535, 13351 FL-535, Orlando, FL 32821,...","Orlando International Airport (MCO), 1 Jeff Fu...",2,2.0,51.00,USD,27047,1286,MPV-5,2024-01-20 17:30:00-05:00,0.188561
841,3121321,point2point,-1,1,Pending,Booking,Elife,"13351 FL-535, 13351 FL-535, Orlando, FL 32821,...","Orlando International Airport (MCO), 1 Jeff Fu...",2,2.0,51.00,USD,27047,1286,MPV-5,2024-01-20 17:30:00-05:00,0.188561


In [165]:
df_big_fleet_data[1]

Unnamed: 0,ride_id,trip_type,trip_no,trip_count,ride_status,partner,fleet,start_place,end_place,passenger_count,luggage_count,dispatch_amount,dispatch_currency,distance,duration,vehicle_class,from_datetime_tz,cent_price_per_km
9,3156301,point2point,0,1,Pending,Booking,Safe Transfer Service LLC,"Buena Vista Suites Orlando, 8203 World Center ...","Brightline Orlando Station, 10705 Jeff Fuqua B...",2,2.0,49.5,USD,25297,1829,MPV-5,2024-01-21 06:45:00-05:00,0.195675
10,3156301,point2point,0,1,Pending,Booking,Safe Transfer Service LLC,"Buena Vista Suites Orlando, 8203 World Center ...","Brightline Orlando Station, 10705 Jeff Fuqua B...",2,2.0,49.5,USD,25297,1829,MPV-5,2024-01-21 06:45:00-05:00,0.195675
25,3136707,,0,1,Cancelled,Book Taxi,Safe Transfer Service LLC,Orlando International Airport,The Grove Resort & Water Park Orlando,5,0.0,70.0,USD,45658,2352,MPV-5,2024-02-23 20:23:00-05:00,0.153314
26,3136707,,0,1,Cancelled,Book Taxi,Safe Transfer Service LLC,Orlando International Airport,The Grove Resort & Water Park Orlando,5,0.0,70.0,USD,45658,2352,MPV-5,2024-02-23 20:23:00-05:00,0.153314
33,3238580,point2point,0,1,Pending,Booking,Safe Transfer Service LLC,Universal’s Endless Summer Resort - Dockside I...,"Orlando International Airport (MCO), 1 Jeff Fu...",5,5.0,42.0,USD,24652,1528,MPV-5,2024-04-13 14:30:00-04:00,0.170372
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
832,3329995,point2point,0,1,Pending,Book Taxi,Safe Transfer Service LLC,Orlando International Airport,"G5 - BEST WESTERN Orlando Gateway, Orlando, FL...",5,0.0,45.0,USD,24324,1349,MPV-5,2024-03-21 15:55:00-04:00,0.185002
833,3329995,point2point,0,1,Pending,Book Taxi,Safe Transfer Service LLC,Orlando International Airport,"G5 - BEST WESTERN Orlando Gateway, Orlando, FL...",5,0.0,45.0,USD,24324,1349,MPV-5,2024-03-21 15:55:00-04:00,0.185002
838,3121321,point2point,0,1,Pending,Booking,Safe Transfer Service LLC,"13351 FL-535, 13351 FL-535, Orlando, FL 32821,...","Orlando International Airport (MCO), 1 Jeff Fu...",2,2.0,42.0,USD,27047,1286,MPV-5,2024-01-20 17:30:00-05:00,0.155285
839,3121321,point2point,0,1,Pending,Booking,Safe Transfer Service LLC,"13351 FL-535, 13351 FL-535, Orlando, FL 32821,...","Orlando International Airport (MCO), 1 Jeff Fu...",2,2.0,42.0,USD,27047,1286,MPV-5,2024-01-20 17:30:00-05:00,0.155285


In [None]:
df_big_fleet_data[2]

In [118]:

# df_training['average_cent_per_km'] = df_training.groupby('fleet').apply(average_cent_per_km)
df_analysis = df_training.groupby('fleet').apply(average_cent_per_km)
df_analysis


  df_analysis = df_training.groupby('fleet').apply(average_cent_per_km)


Unnamed: 0_level_0,Unnamed: 1_level_0,ride_id,trip_type,trip_no,trip_count,ride_status,partner,fleet,start_place,end_place,passenger_count,luggage_count,dispatch_amount,dispatch_currency,distance,duration,vehicle_class,from_datetime_tz,cent_price_per_km,average_cent_per_km,fleet_trip_no
fleet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ab Chauffeur,587,3084481,,0,1,Pending,Booking,Ab Chauffeur,"Orlando International Airport (MCO), 1 Jeff Fu...","Universal's Loews Portofino Bay Hotel, 5601 Un...",2,2.0,45.0,USD,29746,1452,MPV-5,2024-01-16 23:20:00-05:00,0.151281,0.151281,1
Bingsheng luo,689,3154426,point2point,0,1,Pending,China Ctrip,Bingsheng luo,Orlando International Airport B (Orlando Inter...,奥兰多 - 迪士尼之泉®区假日酒店 - IHG 旗下酒店(Holiday Inn Orlan...,3,3.0,65.0,USD,28898,1431,MPV-5,2024-01-21 00:19:00-05:00,0.224929,0.224929,2
Bingsheng luo,690,3154426,point2point,0,1,Pending,China Ctrip,Bingsheng luo,Orlando International Airport B (Orlando Inter...,奥兰多 - 迪士尼之泉®区假日酒店 - IHG 旗下酒店(Holiday Inn Orlan...,3,3.0,65.0,USD,28898,1431,MPV-5,2024-01-21 00:19:00-05:00,0.224929,0.224929,2
CARLOS RODRIGO BELTRAN LOPEZ,55,3062624,,0,1,Cancelled,Book Taxi,CARLOS RODRIGO BELTRAN LOPEZ,Orlando International Airport,"7460 International Dr, Orlando, FL 32819, USA",5,0.0,43.0,USD,24160,1274,MPV-5,2024-01-20 16:48:00-05:00,0.177980,0.159873,27
CARLOS RODRIGO BELTRAN LOPEZ,56,3062624,,0,1,Cancelled,Book Taxi,CARLOS RODRIGO BELTRAN LOPEZ,Orlando International Airport,"7460 International Dr, Orlando, FL 32819, USA",5,0.0,43.0,USD,24160,1274,MPV-5,2024-01-20 16:48:00-05:00,0.177980,0.159873,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xinghanzhou,155,3272264,point2point,0,1,Pending,China Ctrip,xinghanzhou,Terminal C of Orlando International Airport(Te...,奥兰多环球影城温德姆花园酒店 - 国际大道(Wyndham Garden Orlando U...,6,6.0,55.0,USD,30548,1471,MPV-5,2024-03-02 11:27:00-05:00,0.180045,0.194471,5
xinghanzhou,156,3272264,point2point,0,1,Pending,China Ctrip,xinghanzhou,Terminal C of Orlando International Airport(Te...,奥兰多环球影城温德姆花园酒店 - 国际大道(Wyndham Garden Orlando U...,6,6.0,55.0,USD,30548,1471,MPV-5,2024-03-02 11:27:00-05:00,0.180045,0.194471,5
xinghanzhou,746,3143936,point2point,0,1,Pending,China Ctrip,xinghanzhou,Terminal C of Orlando International Airport (T...,Signia by Hilton Orlando Bonnet Creek (迪士尼度假区/...,5,5.0,65.0,USD,31849,1740,MPV-5,2024-01-17 13:54:00-05:00,0.204088,0.194471,5
xinghanzhou,747,3143936,point2point,0,1,Pending,China Ctrip,xinghanzhou,Terminal C of Orlando International Airport (T...,Signia by Hilton Orlando Bonnet Creek (迪士尼度假区/...,5,5.0,65.0,USD,31849,1740,MPV-5,2024-01-17 13:54:00-05:00,0.204088,0.194471,5


In [108]:

df_training

Unnamed: 0,ride_id,trip_type,trip_no,trip_count,ride_status,partner,fleet,start_place,end_place,distance,duration,vehicle_class,passenger_count,luggage_count,dispatch_amount,dispatch_currency,from_datetime_tz,cent_price_per_km
0,3302091,point2point,0,1,Pending,Book Taxi,Leonardo Carvalhal de Aguiar,Orlando Sanford International Airport,"12235 Regency Village Dr, Orlando, FL 32821, USA",77232,3328,MPV-5,4,0.0,83.36,USD,2024-03-12 09:55:00-04:00,0.107935
1,3302091,point2point,0,1,Pending,Book Taxi,Leonardo Carvalhal de Aguiar,Orlando Sanford International Airport,"12235 Regency Village Dr, Orlando, FL 32821, USA",77232,3328,MPV-5,4,0.0,83.36,USD,2024-03-12 09:55:00-04:00,0.107935
2,3302091,point2point,-1,1,Pending,Book Taxi,Elife,Orlando Sanford International Airport,"12235 Regency Village Dr, Orlando, FL 32821, USA",77232,3328,MPV-5,4,0.0,159.60,USD,2024-03-12 09:55:00-04:00,0.206650
3,3302091,point2point,-1,1,Pending,Book Taxi,Elife,Orlando Sanford International Airport,"12235 Regency Village Dr, Orlando, FL 32821, USA",77232,3328,MPV-5,4,0.0,159.60,USD,2024-03-12 09:55:00-04:00,0.206650
4,3324918,point2point,-1,1,Pending,KKDay,Elife,Orlando International Airport,Towneplace Suites Orlando Downtown,17924,1323,MPV-5,4,4.0,67.00,USD,2024-03-26 14:53:00-04:00,0.373800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839,3121321,point2point,0,1,Pending,Booking,Safe Transfer Service LLC,"13351 FL-535, 13351 FL-535, Orlando, FL 32821,...","Orlando International Airport (MCO), 1 Jeff Fu...",27047,1286,MPV-5,2,2.0,42.00,USD,2024-01-20 17:30:00-05:00,0.155285
840,3121321,point2point,-1,1,Pending,Booking,Elife,"13351 FL-535, 13351 FL-535, Orlando, FL 32821,...","Orlando International Airport (MCO), 1 Jeff Fu...",27047,1286,MPV-5,2,2.0,51.00,USD,2024-01-20 17:30:00-05:00,0.188561
841,3121321,point2point,-1,1,Pending,Booking,Elife,"13351 FL-535, 13351 FL-535, Orlando, FL 32821,...","Orlando International Airport (MCO), 1 Jeff Fu...",27047,1286,MPV-5,2,2.0,51.00,USD,2024-01-20 17:30:00-05:00,0.188561
842,3099498,,0,1,Cancelled,Booking,Safe Transfer Service LLC,DoubleTree Suites by Hilton Orlando - Disney S...,"Orlando International Airport (MCO), 1 Jeff Fu...",28354,1344,MPV-5,2,2.0,42.00,USD,2024-01-05 16:00:00-05:00,0.148127


In [60]:
#| hide
import nbdev; nbdev.nbdev_export()