In [None]:
#| default_exp data.database.processing

In [None]:
#| hide
from nbdev.showdoc import *
from pyasn1_modules.rfc3279 import id_fieldType

In [None]:
#| hide
# Run the nbdev export step from inside the notebook.
import nbdev

nbdev.nbdev_export()

In [None]:
#| hide
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
#| export
import os
import pandas as pd 

In [None]:
#| export
from sqlalchemy import MetaData, create_engine, asc, desc, and_, or_, not_, case, extract, cast, Numeric, text, distinct, Column, update, bindparam, Engine
from sqlalchemy.types import DateTime, Date, Time, String
from sqlalchemy.schema import *
from sqlalchemy.sql import func as F, Selectable, select, union, insert
from sqlalchemy.dialects import registry
from sqlalchemy.engine.row import Row
from sqlalchemy.orm import Session
from sqlalchemy.orm import sessionmaker


In [None]:
#| export
import requests
from tqdm.notebook import tqdm

# Local table from the local SQLite database

## SQL lookup by joining a lookup table

In [None]:
# Engine, connection and session for the local SQLite database file.
sql_eng = create_engine('sqlite:///../data/price_training_raw.db', echo=False)
connection = sql_eng.connect()
# Table registry; `get_table` registers the tables it reflects here.
metadata = MetaData()
# Keep the session factory under its own name so that it does not shadow the
# `sqlalchemy.orm.Session` class imported at the top of the notebook.
SessionFactory = sessionmaker(bind=sql_eng)
session = SessionFactory()


In [None]:
# Load the route table from CSV, indexed by the (route_start, route_end) pair.
routes_dispatches = pd.read_csv(
    filepath_or_buffer='../data/route_dispatches.csv',
    index_col=['route_start', 'route_end'],
)
# routes_dispatches.sort_values(by=['count'], ascending=False, inplace=True)
routes_dispatches

In [None]:
# routes_dispatches.to_sql('route_dispatches', sql_eng, if_exists='replace')
# Peek at a single value: first row, second column.
routes_dispatches.iat[0, 1]

# Connect to Google BigQuery

In [None]:
#| export
from google.oauth2 import service_account
import os
from google.oauth2.service_account import Credentials

#| hide
from google.cloud import bigquery
from dotenv import load_dotenv

In [None]:

load_dotenv("../.env")
# Report only whether the credential variables are set. Echoing the whole of
# os.environ would write every secret in the environment into the saved
# notebook output.
{name: name in os.environ
 for name in ('GC_QUOTE_API_CREDENTIALS', 'GC_TEST_API_CREDENTIALS')}


In [None]:
# Paths of the service-account key files, read from the environment and
# resolved relative to the parent directory ("../").
key_ods = os.getenv('GC_QUOTE_API_CREDENTIALS')
key_test = os.getenv('GC_TEST_API_CREDENTIALS')

# Fail early and name the missing variable: without this check an unset
# variable only surfaces as a TypeError from `"../" + None`.
_missing_keys = [
    name
    for name, value in (
        ('GC_QUOTE_API_CREDENTIALS', key_ods),
        ('GC_TEST_API_CREDENTIALS', key_test),
    )
    if not value
]
if _missing_keys:
    raise RuntimeError(
        f"Missing environment variable(s): {', '.join(_missing_keys)}; check ../.env"
    )

credential_ods = service_account.Credentials.from_service_account_file("../" + key_ods)
credential_test = service_account.Credentials.from_service_account_file("../" + key_test)

In [None]:
# Make the 'bigquery' dialect known to SQLAlchemy before creating the engine.
registry.register('bigquery', 'sqlalchemy_bigquery', 'BigQueryDialect')

engine_ods = create_engine(
    'bigquery://quote-api-365206',
    credentials_path='../' + key_ods,
)
metadata_ods = MetaData()


In [None]:
# Alternative target kept for reference: 'bigquery://elife-data-warehouse-test'
engine_test = create_engine(
    'bigquery://quote-api-365206',
    credentials_path='../' + key_test,
)
metadata_test = MetaData()


In [None]:
# Inspect the concrete class of the engine object.
type(engine_test)

In [None]:

def get_table(project_name: str, dataset_name: str, table_name: str, engine: Engine,
              table_metadata: "MetaData | None" = None) -> Table:
    """Reflect the table named ``project.dataset.table`` through ``engine``.

    Args:
        project_name: First component of the qualified table name.
        dataset_name: Second component of the qualified table name.
        table_name: Third component of the qualified table name.
        engine: Engine the table definition is loaded from (``autoload_with``).
        table_metadata: ``MetaData`` collection the table is registered in.
            When omitted, the notebook-level ``metadata`` object is used, which
            is what this function did before the parameter existed.

    Returns:
        The reflected ``Table`` object.
    """
    # Fall back to the notebook-level registry so existing four-argument calls
    # behave exactly as before.
    target_metadata = metadata if table_metadata is None else table_metadata
    qualified_name = f'{project_name}.{dataset_name}.{table_name}'
    return Table(qualified_name, target_metadata, autoload_with=engine)

In [None]:

# Route lookup table (presumably the fixed-price-zone lookup, going by the
# name). The table name is passed without the stray trailing space that used
# to end up inside the qualified name.
fpz_lut_t = get_table('elife-data-warehouse-test', 'price', 'fixed_zone_routes', engine_test).alias()


In [None]:
# Reflect the ODS tables used by the queries below. Each table is aliased
# exactly once (ride_dispatch was previously aliased twice in a row).
ride_dispatch_t = get_table('elife-data-warehouse-prod', 'ods', 'ride_dispatch', engine_ods).alias()
ride_ride_t = get_table('elife-data-warehouse-prod', 'ods', 'ride_ride', engine_ods).alias()
ride_partner_tran_t = get_table('elife-data-warehouse-prod', 'ods', 'ride_partner_tran', engine_ods).alias()
ride_partner_t = get_table('elife-data-warehouse-prod', 'ods', 'ride_partner', engine_ods).alias()



In [None]:
# Step 1: ride id -> partner id, via a LEFT OUTER JOIN from ride_ride to
# ride_partner_tran on ride_ride.partner_tran_id.
partner_id_t = (
    select(
        ride_ride_t.c.id.label('ride_id'),
        ride_partner_tran_t.c.partner_id.label('partner_id'),
    )
    .select_from(
        ride_ride_t.outerjoin(
            ride_partner_tran_t,
            ride_ride_t.c.partner_tran_id == ride_partner_tran_t.c.id,
        )
    )
    .alias()
)

# Step 2: attach the partner name with a LEFT OUTER JOIN to ride_partner.
partner_t = (
    select(
        partner_id_t.c.ride_id,
        partner_id_t.c.partner_id,
        ride_partner_t.c.name.label('partner'),
    )
    .select_from(
        partner_id_t.outerjoin(
            ride_partner_t,
            partner_id_t.c.partner_id == ride_partner_t.c.id,
        )
    )
    .alias()
)

In [None]:
# Fetch three rows of ride_ride as a quick look at its columns.
ride_ride_t0 = select(ride_ride_t).limit(3)
df = pd.read_sql(sql=ride_ride_t0, con=engine_ods)
df


In [None]:
# Fetch three rows of ride_dispatch as a quick look at its columns.
ride_dispatch_t0 = select(ride_dispatch_t).limit(3)
df = pd.read_sql(sql=ride_dispatch_t0, con=engine_ods)
df

In [None]:
import ast
# Parse the value stored at row 0, column 1 as a Python literal and count
# its elements.
x = routes_dispatches.iat[0, 1]
l = ast.literal_eval(x)
len(l)

In [None]:
# Dispatch amount/currency plus partner information for the dispatch ids
# listed in the first row of routes_dispatches (column 1, parsed as a literal).
route_dispatches_partner_t = (
    select(
        ride_dispatch_t.c.id.label('dispatch_id'),
        ride_dispatch_t.c.amount.label('dispatch_amount'),
        ride_dispatch_t.c.currency.label('dispatch_currency'),
        partner_t.c.partner_id,
        partner_t.c.partner,
    )
    .select_from(
        # LEFT OUTER JOIN: dispatches without a matching partner row are kept.
        ride_dispatch_t.outerjoin(
            partner_t,
            ride_dispatch_t.c.ride_id == partner_t.c.ride_id,
        )
    )
    .where(ride_dispatch_t.c.id.in_(ast.literal_eval(routes_dispatches.iloc[0,1])))
)

In [None]:
# Run the query against BigQuery, capped at 1000 rows.
df = pd.read_sql(
    sql=route_dispatches_partner_t.limit(1000),
    con=engine_ods,
)
df

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


In [None]:
# Natural log of the per-route 'count' column. np.log is applied to the whole
# Series at once rather than through a per-element Python lambda.
routes_count = np.log(routes_dispatches['count'])

In [None]:
# routes_count holds log(count) at this point, hence the labels.
routes_count.plot(figsize=(20, 6), title='log(count) per route', ylabel='log(count)')
# routes_count


In [None]:
# Running total of routes_count, normalised by its final value, then logged.
# NOTE(review): routes_count is already log(count) here, so this accumulates
# log values rather than raw counts -- confirm that this is intended.
cumulative_count = routes_count.cumsum()
cumulative_count = np.log(cumulative_count / cumulative_count.iloc[-1])

In [None]:

cumulative_count.plot(
    figsize=(20, 6),
    title='log of normalised cumulative sum of routes_count',
    ylabel='log(cumulative share)',
)

In [None]:
# df_routes = df.dropna(subset=['route_start', 'route_end'],how='all')
# len(df_routes)

In [None]:
# # df_routes.set_index(['dispatch_id', 'route_end'], inplace=True)
# df_routes.iloc[:5,:]

In [None]:
# cols = df.columns
# cols

In [None]:

# cols = [c.name for c in query.subquery().columns]
# result = [dict(zip(cols, row)) for row in df.to_numpy()]
# # result = [row for row in df.to_numpy()]
# result

In [None]:

# from concurrent.futures import ThreadPoolExecutor, as_completed
# from concurrent import futures
# from enum import Enum
# 
# chunk_size = 1000
# non_fpz_count = 0
# max_concurrent = 24
# 
# connection2 = sql_eng.connect()


In [None]:
# query1000= query.limit(1000)
# chunk_size=100
# raw_total_rows = 1000
# # df = pd.read_sql(query1000, sql_eng)
# # df
# for i,chunk in enumerate(tqdm(pd.read_sql(query1000,connection,index_col='dispatch_id', chunksize=chunk_size), total=raw_total_rows//chunk_size+1, desc='Overall Progress')):
#     # for chunk in tqdm(pd.read_csv(csv_file_path, chunksize=chunk_size), total=total_lines//chunk_size +1):
#     # chunk.to_sql("price_training_labeled_2024_usd", sql_eng, if_exists='append', index=True)
#     dict_to_insert = [dict(zip(cols,row)) for row in chunk.to_numpy()]
#     insert_stmt = insert(label_archive).values(dict_to_insert)
#     session.execute(insert_stmt)
#     session.commit()



In [None]:
# FPZQueryStatus = Enum('FPZQueryStatus', 'JsonError KeyError DictError NoFixedPrice Success')
# fpz = {'route_start': None, 'route_end': None}
# def get_one_fpz(start_ltt_lp, start_lng_lp, end_ltt_lp, end_lng_lp) -> FPZQueryStatus:
#     global fpz, df_lut
#     idx = (start_ltt_lp, start_lng_lp, end_ltt_lp, end_lng_lp)
#     try:
#         fpz = dict(df_lut.loc[idx,['route_start', 'route_end']])
#     except KeyError as e:
#         fpz = {'route_start': None, 'route_end': None}
#         return FPZQueryStatus.KeyError
#     except Exception as exc:
#         raise exc
# 
#     return FPZQueryStatus.Success

# for i,chunk in enumerate(tqdm(pd.read_sql(query,connection,index_col='dispatch_id', chunksize=chunk_size), total=raw_total_rows//chunk_size+1, desc='Overall Progress')):
#     try:
#         # pd.DataFrame(data=route_list, columns=['dispatch_id', 'route_start', 'route_end']).to_csv(result_csv,     mode='a', header=False)
#         # if i<500:
#         #     file_path = "../data/price_training_labeled_2024_usd1.csv"       
#         # else:
#         #     file_path = "../data/price_training_labeled_2024_usd2.csv"
#         file_path = "../data/price_training_labeled_2024_usd.csv"
#         with open(file_path, 'a') as f:
#             # chunk.to_csv(f, header=f.tell()==0,chunksize=chunk_size)
#             chunk.to_sql(result_table[0], sql_eng, if_exists='append', index=True)
#     except Exception as e:
#         print(f"\r {e}")
#         continue

In [None]:
# df1 = df_lut.iloc[:10,:]
# df1
# len(df1)
# idx = df1.index.drop_duplicates(keep='first')
# df1[~df1.index.duplicated(keep='first')]
# df1

In [None]:
# # df1 = df_lut.iloc[:10,:]
# # df1
# # len(df1)
# # idx = df1.index.drop_duplicates(keep='first')
# # df1[~df1.index.duplicated(keep='first')]
# # df1
# route_t = Table('route_lp', metadata, autoload_with=sql_eng)
# route_t = select(route_t)
# # noinspection PyUnboundLocalVariable
# connection = sql_eng.connect()


In [None]:
# s = route_t.limit(10)
# df = pd.read_sql(s,sql_eng)
# df

In [None]:
# chunk_size = 10000

In [None]:
# from concurrent.futures import ThreadPoolExecutor, as_completed
# from concurrent import futures
# from enum import Enum

In [None]:
# csv_lp_route_result = '../data/lp_route_result.csv'


In [None]:
# non_fpz_count = 0
# max_concurrent = 24
# FPZQueryStatus = Enum('FPZQueryStatus', 'JsonError KeyError DictError NoFixedPrice Success')
# fpz = {'route_start': None, 'route_end': None}
# def get_one_fpz(start_ltt_lp, start_lng_lp, end_ltt_lp, end_lng_lp) -> FPZQueryStatus:
#     global fpz, df_lut
#     idx = (start_ltt_lp, start_lng_lp, end_ltt_lp, end_lng_lp)
#     try:
#         fpz = dict(df_lut.loc[idx,['route_start', 'route_end']])
#     except KeyError as e:
#         fpz = {'route_start': None, 'route_end': None}
#         return FPZQueryStatus.KeyError
#     except Exception as exc:
#         raise exc
#     
#     return FPZQueryStatus.Success
# 
# for i,chunk in enumerate(tqdm(pd.read_sql(route_t,connection,index_col='dispatch_id', chunksize=chunk_size), total=total_rows//chunk_size+1, desc='Overall Progress')):
#     chunk = chunk.astype({'start_ltt_lp': float, 'start_lng_lp': float, 'end_ltt_lp': float, 'end_lng_lp': float, 'route_start': str, 'route_end': str})
#     non_fpz_count_chunk = 0
#     # get_fixed_zone_chunk(chunk)
#     # for ind,r in chunk.iterrows():
#     #     result = get_one_fpz(r['start_ltt_lp'], r['start_lng_lp'],r['end_ltt_lp'],r['end_lng_lp'])
#         
#     
#     with ThreadPoolExecutor(max_workers=max_concurrent) as executor:
#         to_do_map = {} # list[futures.Future] = [] 
#         # for ind,r in tqdm(chunk.iterrows(),total=chunk_size,desc='chunk progress', leave=False):
#         for ind,r in chunk.iterrows():
#             future = executor.submit(get_one_fpz, r['start_ltt_lp'], r['start_lng_lp'],r['end_ltt_lp'],r['end_lng_lp'])
#             to_do_map[future] = ind
#         done_iter = as_completed(to_do_map)
#         done_iter = tqdm(done_iter, total=len(to_do_map), desc='chunk progress', leave=False)
# 
#         for future in done_iter:
#             try:
#                 status = future.result()
#             except Exception as e:
#                 # print(f"Exception: {e}")
#                 continue
# 
#             if status == FPZQueryStatus.Success:
#                 ind = to_do_map[future]
#                 try:
#                     chunk.at[ind, 'route_start'] = fpz['route_start']
#                     chunk.at[ind, 'route_end'] = fpz['route_end']
#                 except KeyError as e:
#                     non_fpz_count_chunk += 1
#                     # print(f"\r {e}: {non_fzp_count_chunk}/{chunk_size}")
#                     continue
#                 except Exception as e:
#                     non_fpz_count_chunk += 1
#                     # print(f"\r {e}: {non_fzp_count_chunk}/{chunk_size}")
#                     continue
#             else:
#                 non_fpz_count_chunk += 1
#                 # print(f"\r non fzp chunk: {non_fzp_count_chunk}/{chunk_size}")
#                 continue
#         
#     non_fpz_count += non_fpz_count_chunk
#     scanned_row_number = (i+1)*chunk_size
#     try:
#         # pd.DataFrame(data=route_list, columns=['dispatch_id', 'route_start', 'route_end']).to_csv(result_csv,     mode='a', header=False)
#         with open(csv_lp_route_result, 'a') as f:
#             chunk.to_csv(f, header=f.tell()==0,chunksize=chunk_size)
#         # chunk.to_csv(path_or_buf=csv_result_file_list[0],mode='a',chunksize=chunk_size)
#         # chunk.to_sql(result_table[0], sql_eng, if_exists='append', index=True)
#         print(f"\r Non_FZP count: {non_fpz_count}/{scanned_row_number}")
#     except Exception as e:
#         print(f"\r {e}, Non_FZP count: {non_fpz_count}/{scanned_row_number} ")
#         continue

# Insert two new columns for the fixed-price zones (`start_zone` and `end_zone`) when creating the SQL table from the CSV file

In [None]:
# # engine.execute('ALTER TABLE price_training_raw ADD COLUMN start_zone TEXT')
# # engine.execute('ALTER TABLE price_training_raw ADD COLUMN end_zone TEXT')
# start_zone_column = Column('start_zone', String)
# end_zone_column = Column('end_zone', String)
# add_column_op = AddColumn(start_zone_column, raw)


In [None]:
# url = 'https://j1j495o5pk.execute-api.us-east-2.amazonaws.com/upncoming/ride-pricings'


In [None]:
# # Query the table in chunks
# query = session.query(raw)
# chunk_size =1000

In [None]:
# batch_stmt = (
#     update(raw)  # 'raw' is your table object
#     .where(raw.c.dispatch_id == bindparam('b_dispatch_id'))
#     .values(
#         route_start=bindparam('route_start'),
#         route_end=bindparam('route_end')
#     )
# )
# print(batch_stmt)
# compiled = batch_stmt.compile()
# print(compiled.params)


In [None]:
# result_csv = '../data/dispatch_fixed_zones_all.csv'
# for chunk in tqdm(pd.read_sql(query.statement, conn, index_col='dispatch_id', chunksize=chunk_size), total=total_rows//chunk_size+1, desc='Overall Processing'):
#     # [chunk[r] for r in chunk]
#     # l = [r for r in chunk.iterrows()]
#     # l
#     # print(chunk.dtypes)
#     route_list = []
#     for i,r in tqdm(chunk.iterrows(),total=chunk_size, desc='Chunk Processing', leave=False):
#         # l = [i, r['start_ltt'], r['start_lng'], r['end_ltt'], r['end_lng']]
#         # print(l)
#         params = {
#             'from_lat': r['start_ltt'],
#             'from_lng': r['start_lng'],
#             'to_lat': r['end_ltt'],
#             'to_lng': r['end_lng'],
#         }
#         try:
#             response = requests.get(url=url, params=params)
#         except requests.exceptions.Timeout:
#             print('Timeout')
#             continue
#         except requests.exceptions.TooManyRedirects:
#             print('TooManyRedirects')
#             continue
#             # Tell the user their URL was bad and try a different one
#         except requests.exceptions.RequestException as e:
#             print('RequestException, Catastrophic error!')
#             continue
#             # catastrophic error. bail.
#             # raise SystemExit(e)
# 
#         except Exception as e:
#             print(f"request: {e}")
#             continue
#         # print('2')
#         try:
#             res = response.json()
#         except Exception as e:
#             print(f"json: {e}")
#             continue
#         # print('3')
#         try:
#             fix_price_zones = res['fleets'][0]['vehicle_classes'][0]['price_detail']['base_pricing']['fix_price_detail']
#         except KeyError as e:
#             print(f"No Fixed Price!")
#             continue
#         except IndexError as e:
#             print("IndexError for fix_price_zones")
#             continue
#         except Exception as e:
#             print(f"dict: {e}")
#             continue
#         # print('4')
#         if not isinstance(fix_price_zones,dict):
#             print(f"No fix price: {fix_price_zones}")
#         else:
#             try:
#                 route_list.append((i, fix_price_zones['from'], fix_price_zones['to']))
#                 # fix_zone_routes_list.append(route)
#                 # chunk.at[i, 'route_start'] = fix_price_zones['from']
#                 # chunk.at[i, 'route_end'] = fix_price_zones['to']
#                 # ins = insert(fixed_zone_routes).values(
#                 #     start=fix_price_zones['from'], end=fix_price_zones['from'], dispatch_id= pt        ._mapping['dispatch_id'])
#                 # stmt = (
#                 #     update(raw)  # 'raw' is your table object
#                 #     .where(raw.c.dispatch_id == int(i))
#                 #     .values(
#                 #         route_start = fix_price_zones['from'],
#                 #         route_end = fix_price_zones['to']
#                 #     )
#                 # )
#                 # conn.execute(stmt)
#                 # conn.commit()
#             except KeyError as e:
#                 print("KeyError for route")
#                 continue
#     
#     try:
#         pd.DataFrame(data=route_list, columns=['dispatch_id', 'route_start', 'route_end']).to_csv(result_csv, mode='a', header=False)
#     except Exception as e:
#         print(f"csv: {e}")
#         continue
#     
#     # with sqlite_eng.begin() as conn:
#     #     conn.execute(
#     #         stmt, 
#     #         [
#     #             {
#     #                 'b_dispatch_id': i,
#     #                 'route_start': r['route_start'],
#     #                 'route_end': r['route_end']
#     #             }
#     #             for i,r in chunk.iterrows()
#     #         ],
#     #     )
#     #     conn.commit()


In [None]:
# len(result)
# result[0]

In [None]:

# with sql_eng.begin() as conn:
#     conn.execute(
#         batch_stmt, 
#         [
#             {
#                 'b_dispatch_id': i,
#                 'route_start': r['route_start'],
#                 'route_end': r['route_end']
#             }
#             for i,r in res.iterrows()
#         ],
#     )
#     conn.commit()


In [None]:
# for chunk in pd.read_sql(query.statement, local_conn, chunksize=20):
#     # [chunk[r] for r in chunk]
#     # l = [r for r in chunk.iterrows()]
#     # l
#     for i,r in chunk.iterrows():
#         # l = [r[0], r[1]['start_ltt'], r[1]['start_lng'], r[1]['end_ltt'], r[1]['end_lng']]
#         params = {
#             'from_lat': r['start_ltt'],
#             'from_lng': r['start_lng'],
#             'to_lat': r['end_ltt'],
#             'to_lng': r['end_lng'],
#         }
#         try:
#             response = requests.get(url=url, params=params)
#         except requests.exceptions.Timeout:
#             print('Timeout')
#             continue
#         except requests.exceptions.TooManyRedirects:
#             print('TooManyRedirects')
#             continue
#             # Tell the user their URL was bad and try a different one
#         except requests.exceptions.RequestException as e:
#             print('RequestException, Catastrophic error!')
#             continue
#             # catastrophic error. bail.
#             # raise SystemExit(e)
#         
#         except Exception as e:
#             print(f"request: {e}")
#             continue
#         # print('2')
#         try:
#             res = response.json()
#         except Exception as e:
#             print(f"json: {e}")
#             continue
#         # print('3')
#         try:
#             fix_price_zones = res['fleets'][0]['vehicle_classes'][0]['price_detail']['base_pricing']['fix_price_detail']
#         except KeyError as e:
#             j = j +1
#             print(f"{j} Not Fixed Price!")
#             continue
#         except IndexError as e:
#             print("IndexError for fix_price_zones")
#             continue
#         except Exception as e:
#             print(f"dict: {e}")
#             continue
#         # print('4')
#         if not isinstance(fix_price_zones,dict):
#             print(f"No fix price: {fix_price_zones}")
#         else:
#             try:
#                 # route = (fix_price_zones['from'], fix_price_zones['to'],pt._mapping['dispatch_id'])
#                 # fix_zone_routes_list.append(route)
#                 chunk.at[i, 'route_start']
#                 
#                 # ins = insert(fixed_zone_routes).values(
#                 #     start=fix_price_zones['from'], end=fix_price_zones['from'], dispatch_id= pt        ._mapping['dispatch_id'])
#                 i = i +1
#             except KeyError as e:
#                 print("KeyError for route")
#                 continue
#         
#         price_training_q = session.query(price_training_t).limit(500)
# fix_zone_routes_list = []


In [None]:
# i = 0
# j = 0
# for pt in price_training_q:
#     # print('1')
#     # ride_samples.append(pt)
#     params = {
#         'from_lat': pt._mapping['start_ltt'],
#         'from_lng': pt._mapping['start_lng'],
#         'to_lat': pt._mapping['end_ltt'],
#         'to_lng': pt._mapping['end_lng'],
#         # 'from_utc':pt._mapping['from_utc'],
#     }
#     try:
#         response = requests.get(url=url, params=params)
#     except requests.exceptions.Timeout:
#         print('Timeout')
#         continue
#     except requests.exceptions.TooManyRedirects:
#         print('TooManyRedirects')
#         # Tell the user their URL was bad and try a different one
#     except requests.exceptions.RequestException as e:
#         print('RequestException, Catastrophic error!')
#         # continue
#         # catastrophic error. bail.
#         raise SystemExit(e)    
#         
#     except Exception as e:
#         print(f"request: {e}")
#         continue
#     # print('2')
#     try:
#         res = response.json()
#     except Exception as e:
#         print(f"json: {e}")
#         continue
#     # print('3')
#     try:
#         fix_price_zones = res['fleets'][0]['vehicle_classes'][0]['price_detail']['base_pricing']['fix_price_detail']
#     except KeyError as e:
#         j = j +1
#         print(f"{j} Not Fixed Price!")
#         continue
#     except IndexError as e:
#         print("IndexError for fix_price_zones")
#         continue
#     except Exception as e:
#         print(f"dict: {e}")
#         continue
#     # print('4')
#     if not isinstance(fix_price_zones,dict):
#         print(f"No fix price: {fix_price_zones}")
#     else:
#         try:
#             route = (fix_price_zones['from'], fix_price_zones['to'],pt._mapping['dispatch_id'])
#             fix_zone_routes_list.append(route)
#             # ins = insert(fixed_zone_routes).values(
#             #     start=fix_price_zones['from'], end=fix_price_zones['from'], dispatch_id= pt._mapping['dispatch_id'])
#             i = i +1
#         except KeyError as e:
#             print("KeyError for route")
#             continue
# 
#         # print('5')
#         if i%50 == 0:
#             # connection.execute(ins)
#             df = pd.DataFrame(fix_zone_routes_list, columns=['start', 'end', 'dispatch_id'])
#             df.to_sql('fixed_zone_routes', sqlite_eng, if_exists='append')
#             fix_zone_routes_list = []
#             print(f"Created {i} records")
# 
#     # print('6')
# df = pd.DataFrame(fix_zone_routes_list, columns=['start', 'end', 'dispatch_id'])
# df.to_sql('fixed_zone_routes', sqlite_eng, if_exists='append')
# fix_zone_routes_list = []
# print(f"Created {i} records")
#     # print('6')
#     # print(pt._mapping['ride_id'], pt._mapping['dispatch_amount'], pt._mapping['dispatch_currency'])
#     # # print(pt._mapping['start_place'])
#     # print(pt._mapping['start_lng'], pt._mapping['start_ltt'])
#     # # print(pt._mapping['end_place']) 
#     # print(pt._mapping['end_lng'], pt._mapping['end_ltt'])
#     # print("------------")

# Use ORM to retrieve records


In [None]:
# sample_count = session.query(F.count(price_training_t.c.ride_id)).scalar()
# print(sample_count)
# 

In [None]:
# url = 'https://j1j495o5pk.execute-api.us-east-2.amazonaws.com/upncoming/ride-pricings'

In [None]:

# params = {
#     'from_lat': 37.61911449999999,
#     'from_lng':-122.3816274,
#     'to_lat':37.3635295,
#     'to_lng':-121.9285932,
#     'from_utc':1727352000,
# }
# response = requests.get(url=url, params=params)
# response.json()


In [None]:
# res = response.json()
# fix_price_zones = res['fleets'][0]['vehicle_classes'][0]['price_detail']['base_pricing']['fix_price_detail']
# print(fix_price_zones)

In [None]:
# ride_samples = []

In [None]:
# pt = price_training_q.first()
# pt

In [None]:
# params = {
#     'from_lat': (pt._mapping['start_ltt']),
#     'from_lng': (pt._mapping['start_lng']),
#     'to_lat': (pt._mapping['end_ltt']),
#     'to_lng': (pt._mapping['end_lng']),
#     # 'from_utc': int(pt._mapping['from_utc']),
# }
# params

# response = requests.get(url=url, params=params)
# res = response.json()
# res

In [None]:
from sqlalchemy import String,Integer,insert

In [None]:
# fix_zone_routes_list

In [None]:
# price_training_q = session.query(price_training_t).limit(10)
# ride_samples = []
# for pt in price_training_q:
#     ride_samples.append(pt)
#     print(pt._mapping['ride_id'], pt._mapping['dispatch_amount'], pt._mapping['dispatch_currency'])
#     # print(pt._mapping['start_place'])
#     print(pt._mapping['start_lng'], pt._mapping['start_ltt'])
#     # print(pt._mapping['end_place']) 
#     print(pt._mapping['end_lng'], pt._mapping['end_ltt'])
#     print("------------")



In [None]:
# ride_samples[0]._mapping

In [None]:
# df = pd.DataFrame(ride_samples)
# df

In [None]:

# rides_q = session.query(price_training_t).limit(10)

In [None]:

# for ride in rides_q:
#     print(ride)

In [None]:
# rides_q = session.query(ride_ride_t).limit(10)
# for ride in rides_q:
#     print(ride)

In [None]:
# print(price_training_t)

In [None]:
# use core to retrieve records
# rp = connection.execute(price_training_t)
# for i, record in enumerate(rp):
#     print(i, record.ride_id)

In [None]:
# results = rp.fetchall()
# results

In [None]:
# df = pd.read_sql(price_training_t, engine)
# df

In [None]:
# sqlite_eng = create_engine('sqlite:///../data/price_training_from_gbq_raw.db', echo=False)

In [None]:
# df.to_sql('price_training_orlando_mpv5', sqlite_eng, if_exists='replace')

# Pandas Processing


## Traffic peak time exclusion
Source: https://www.quora.com/What-is-the-trickiest-time-of-the-day-to-drive-in-Orlando

In [None]:
# from workalendar.usa import Florida

In [None]:
# cal_orlando = Florida()


In [None]:
# exclude_dates = [d[0] for d in cal_orlando.holidays(2024)]
# exclude_dates

In [None]:
# from workalendar.usa import Florida 
# import numpy as np
# cal_florida = Florida()
# exclude_dates_str = [str(d[0]) for d in cal_florida.holidays(2024)]
# exclude_dates_str

In [None]:
# exclude_dates = [d[0] for d in cal_florida.holidays(2024)]
# exclude_dates
# res = df['from_datetime_utc'].apply(lambda x: x in exclude_dates)

In [None]:
# np.any(res.apply(lambda x: x in exclude_dates))
# res[0]=True
# res
# np.any(res)


In [None]:
# from pricing.data.utils import validate_datetime_in_iso_format, validate_timezone_in_iana, get_timezone_abbreviation, fix_timezone

In [None]:
# df_invalid_datetime = df[df.apply(lambda x: not validate_datetime_in_iso_format(x['from_datetime_fix_str']), axis=1)]
# df_invalid_datetime

In [None]:
# df_invalid_timezone = df[df.apply(lambda x: not validate_timezone_in_iana(x['from_timezone_str']), axis=1)]
# df_invalid_timezone

In [None]:
#orlando_airport = pd.read_csv('../../data/orlando_all_output.csv')
#orlando_airport.head()
#orlando_airport.dtypes
#orlando_airport.to_sql('orlando_airport', sqlite_eng, if_exists='append')

In [None]:
# df['from_timezone_fix_str'] = df.apply(lambda x: fix_timezone(x['from_timezone_str']), axis=1) 
# df

In [None]:

# df_valid_datetime = df[df.apply(lambda x: validate_datetime_in_iso_format(x['from_datetime_fix_str']), axis=1)]
# df_valid_timezone = df_valid_datetime[df_valid_datetime.apply(lambda x: validate_timezone_in_iana(x['from_timezone_fix_str']), axis=1)]
# df_valid_timezone


In [None]:

# df_valid_timezone.loc[df_invalid_timezone.index, ['from_timezone_str', 'from_timezone_fix_str']]


In [None]:
# from datetime import datetime
# import pytz
# df_utc_offset = (df_valid_timezone.apply(lambda x: datetime.now(pytz.timezone(x['from_timezone_fix_str'])), axis=1)
#                             .apply(lambda x: x.strftime('%z')))
# df_utc_offset.name = 'utc_offset'
# df_utc_offset


In [None]:
# df_dt_str = df_valid_timezone['from_datetime_fix_str']
# df_valid_timezone['from_datetime_local'] = df_valid_timezone.apply(lambda x: (pd.to_datetime(x['from_datetime_fix_str']).to_datetime64()), axis=1)
# df_valid_timezone

In [None]:
# df_valid_timezone.dtypes

In [None]:

# df_valid_timezone['from_datetime_local_tz'] = df_valid_timezone.apply(lambda x: pytz.timezone(x.loc[:,'from_timezone_str']).localize(x.loc[:,'from_datetime_local']), axis=1)
# df_valid_timezone['from_timezone'] = df_valid_timezone.apply(lambda x: pytz.timezone(x['from_timezone_fix_str']), axis=1)
# df_valid_timezone


In [None]:
# df_valid_timezone['from_datetime_tz'] = df_valid_timezone.apply(lambda x: x['from_timezone'].localize(x['from_datetime_local']), axis=1)
# df_valid_timezone
                                    # .apply(lambda x: x.localize(x.loc[:,'from_timezone_str']), axis=1))
# df_valid_timezone


# Filtering out peak traffic time

In [None]:
# peak_time_str = [('07:00:00', '09:00:00'), ('16:00:00', '19:00:00')]
# night_time_str = [('22:00:00', '6:00:00')]  # Shanghai, US usually no overtime extra fees New York 8pm ~ 6am
# ind = []
# td = []
# for pt in peak_time_str:
#     ind.append(pd.DatetimeIndex(pt))
# ind
# for i in ind:
#     i[1]-i[0]

In [None]:
# peak_time = []
# for pt in peak_time_str:
#     peak_time.append(pd.date_range(pt[0], pt[1], freq='h'))
# for pt in peak_time:
#     print(pt, pt.time)

In [None]:
# peak_time_str

In [None]:

# df_peak_traffic_time = df_valid_timezone[
#     df_valid_timezone['from_datetime_tz'].apply(lambda x: x.strftime('%H:%M:%S')).between(*peak_time_str[0])
#     | df_valid_timezone['from_datetime_tz'].apply(lambda x: x.strftime('%H:%M:%S')).between(*peak_time_str[1])
# ]
# df_peak_traffic_time

In [None]:

# df_out_of_peak_traffic_time = df_valid_timezone[
#     ~ (df_valid_timezone['from_datetime_tz'].apply(lambda x: x.strftime('%H:%M:%S')).between(*peak_time_str[0])
#     | df_valid_timezone['from_datetime_tz'].apply(lambda x: x.strftime('%H:%M:%S')).between(*peak_time_str[1]))
# ]
# df_out_of_peak_traffic_time


In [None]:

# df_utc_offset = (df_valid_timezone.apply(lambda x: datetime.now(pytz.timezone(x.loc['timezone'])))
#                             .apply(lambda x: x.strftime('%z')))
# df_utc_offset.name = 'utc_offset'
# df_utc_offset
# 

# Filter out round trips (time reservations), i.e. rows whose `distance` feature is 1 (m)

In [None]:
# df_no_round_trip = df_valid_timezone[df_valid_timezone['distance'] > 1]

In [None]:
# df_training = df_no_round_trip.loc[:,['ride_id', 'trip_type', 'trip_no', 'trip_count', 'ride_status', 'partner', 'fleet', 
#                 'start_place', 'end_place',
#                 'passenger_count', 'luggage_count',
#                 'dispatch_amount', 'dispatch_currency',
#                 'distance', 'duration', 'vehicle_class', 
#                 'from_datetime_tz']]
#  
# df_training['cent_price_per_km'] = df_training['dispatch_amount'] / df_training['distance']*100.0
# df_training

In [None]:
# def average_cent_per_km(x):
#     x['average_cent_per_km'] = x['cent_price_per_km'].mean()
#     return x

In [None]:
# def fleet_trip_no(x):
#     x['fleet_trip_count'] = len(x)
#     return x

In [None]:
# df_fleet_statistics = df_training.loc[:, ['ride_id','fleet']]
# df_fleet_statistics

In [None]:

# df_fleet_trip_no = df_fleet_statistics.groupby('fleet').aggregate([len])
# df_fleet_trip_no.sort_values(by=('ride_id', 'len'), ascending=False, inplace=True)
# df_fleet_trip_no


In [None]:
# df_big_fleets = df_fleet_trip_no[df_fleet_trip_no[('ride_id','len')] >100]
# df_big_fleets

In [None]:
# df_big_fleet_data = []
# for f in df_big_fleets.index:
#     print(f)
#     df_big_fleet_data.append(df_training[df_training['fleet'] == f])

In [None]:
# df_big_fleet_data[0]

In [None]:
# df_big_fleet_data[1]

In [None]:
# df_big_fleet_data[2]

In [None]:

# # df_training['average_cent_per_km'] = df_training.groupby('fleet').apply(average_cent_per_km)
# df_analysis = df_training.groupby('fleet').apply(average_cent_per_km)
# df_analysis


In [None]:

# df_training

In [None]:
#| hide
# Run the nbdev export step from inside the notebook.
import nbdev

nbdev.nbdev_export()