# Propdesk Data Pipeline - OHLCV

In [4]:
%load_ext autoreload
%autoreload 2
from propdesk_tardis.tardis_transfero import tardis_transfero as tardis
from propdesk_estimators.exchange_storage import ExchangeStorage
from propdesk_azure_services.azure_databricks import single_run_job
from propdesk_azure_services.azure_databricks import list_databricks_src_files

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Job Type and Dataset Type (don't need to change this)

In [7]:
job_type = 'io_intensive'
dataset_type = 'ohlcv'
script_to_run = 'spark_ohlcv.py'

In [6]:
from propdesk_azure_services.azure_databricks import create_periodic_job

exchange = 'binance'
# daily job
job_name = f'daily_ohlcv_btcusdt_binance'
script_to_run = 'spark_ohlcv.py'

##############
# make sure to have a schedule that is compatible with the period
# you can validate it in https://www.freeformatter.com/cron-expression-generator-quartz.html
# this one reads 'everyday at 4AM'
# defautl timezone is AMERICAS-SAO PAULO
job_schedule = "0 0 6 * * ?"
#############

period = 'daily'
# no need for start_date and end_date in daily jobs 
job_params = {
    'exchange': 'binance',
    'pair': 'btcusdt',
    'resampling_rule': '1T'
}


create_periodic_job(job_name_str=job_name,
                    filename_str=script_to_run,
                    params_dict=job_params,
                    period=period,
                    cron_expression_str=job_schedule)

{'job': {'job_id': 1059812247481555},
 'schedule_job': {'job_id': 291503446434075}}

### Quick and Dirty - Get Data

In [5]:
from propdesk_estimators.exchange_storage import get_dataframe_by_params

In [5]:
exchange_str = 'binance'

query_dict = {
    'dataset_type': 'ohlcv',
    'pair': 'btcusdt', 
    'date_from': '2022-01-01', 
    'date_to': '2022-02-01', 
    'resampling_rule': '1H',
}
query_dict

# pass the flag keep_local to keep raw files instead of downloading them again if needed
# tip: use params_dict define above after running the job to get that data
ohlcv_df = get_dataframe_by_params(exchange_str=exchange_str, params_dict=query_dict, keep_local=True)

files saved to: /tmp/tmpd7wlpoxa


In [6]:
ohlcv_df

Unnamed: 0,datetime,open,high,low,close,volume
0,2022-01-01 00:00:00,46216.93,46731.39,46208.37,46656.14,1503.31036
1,2022-01-01 01:00:00,46656.13,46949.99,46574.06,46778.14,943.79747
2,2022-01-01 02:00:00,46778.14,46928.94,46721.96,46811.76,485.17359
3,2022-01-01 03:00:00,46811.77,46916.63,46760.12,46813.20,562.91090
4,2022-01-01 04:00:00,46813.20,46887.33,46591.23,46711.04,861.88096
...,...,...,...,...,...,...
739,2022-01-31 19:00:00,38468.29,38744.00,38383.91,38415.79,1944.85030
740,2022-01-31 20:00:00,38415.80,38563.37,38236.69,38450.62,1639.71534
741,2022-01-31 21:00:00,38450.62,38531.53,38335.00,38410.09,923.74653
742,2022-01-31 22:00:00,38410.09,38737.99,38398.58,38412.45,927.84485


### Checking for missing data to compute

In [9]:
exchange_str = 'binance'

In [10]:
exchange_stg = ExchangeStorage(exchange_str) # -- ExchangeStorage('binance')

Check for datasets that were already computed

In [20]:
exchange_stg.amend_datasets_by_params(query_dict)


Missing datasets: []


False

In [11]:
from propdesk_tardis.tardis_transfero.tardis_transfero import get_exchange_dataset_info_from_symbol

### Quick and dirty - Process data: paste params_dict here and go

In [14]:
params_dict = {'dataset_type': 'ohlcv', 
               # 'pair': pair,
               'pair': 'btcusdt',
                'exchange': 'binance',
                'start_date': '2022-05-01', 
                'end_date': '2022-05-08', 
                'resampling_rule': '1T', 
                   }
    job_name = f'{params_dict["pair"]}_{params_dict["exchange"]}_{params_dict["dataset_type"]}'
    j = single_run_job(job_name, script_to_run, params_dict, job_type=job_type)
    print(j)

    # query_dict = {
    # 'dataset_type': 'ohlcv',
    # 'pair': pair, 
    # 'date_from': '2022-01-01', 
    # 'date_to': '2022-01-02', 
    # 'resampling_rule': '1t',
    # }
    # print(pair)
    # # pass the flag keep_local to keep raw files instead of downloading them again if needed
    # # tip: use params_dict define above after running the job to get that data
    # ohlcv_df = get_dataframe_by_params(exchange_str='binance', params_dict=query_dict, keep_local=True)
    # print(ohlcv_df)

{'run_id': 1534330, 'job_id': 31115347136908, 'run_page_url': 'https://adb-3928083337264192.12.azuredatabricks.net/?o=3928083337264192#job/31115347136908/run/1534330'}
{'run_id': 1535948, 'job_id': 228537747564885, 'run_page_url': 'https://adb-3928083337264192.12.azuredatabricks.net/?o=3928083337264192#job/228537747564885/run/1535948'}


In [None]:
job_name = f'{params_dict["pair"]}_{params_dict["exchange"]}_{params_dict["dataset_type"]}'

# UNCOMMENT HERE
# single_run_job(job_name, script_to_run, params_dict, job_type=job_type)

# Success :)

## That's it. **check Databricks UI to make sure everything is ok**

### Have fun, move fast, break things, buy btc (or dcr or algorand) ⚡.
#### -- Propdesk Transfero