# Propdesk Data Pipeline - Volatility

In [2]:
%load_ext autoreload
%autoreload 2
from propdesk_services.exchange_storage import ExchangeStorage
from propdesk_services.azure_databricks import single_run_job
from propdesk_services.azure_databricks import list_databricks_src_files

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Job Type and Dataset Type (don't need to change this)

In [3]:
# Fixed settings shared by the cells below:
#   script_to_run - script file name handed to the job helpers
#   dataset_type  - dataset type used in storage queries and job names
#   job_type      - passed as job_type= when submitting a single-run job
script_to_run = 'spark_volatility_zma.py'
dataset_type = 'volatility_zma_mqc'
job_type = 'cpu_intensive'

### Quick and Dirty - Get Data
Note: `volatility_zma_mqc` has been the default ZMA dataset type since 2022-04-13.

In [4]:
from propdesk_services.exchange_storage import get_dataframe_by_params
# Exchange whose storage is queried.
exchange_str = 'binance'

# Query for one pair over a date window, using the dataset_type set in the
# configuration cell.
# NOTE(review): the sample output ends on 2021-12-31, so date_to looks
# exclusive - confirm against get_dataframe_by_params.
query_dict = dict(
    dataset_type=dataset_type,
    pair='btcbrl',
    date_from='2021-12-01',
    date_to='2022-01-01',
)
query_dict

{'dataset_type': 'volatility_zma_mqc',
 'pair': 'btcbrl',
 'date_from': '2021-12-01',
 'date_to': '2022-01-01'}

In [5]:
# keep_local=True keeps the downloaded raw files so they do not have to be
# fetched again; the helper prints the directory they were saved to.
# Tip: after running a compute job, reuse the query_dict defined above to
# load the freshly computed data.
volatility_df = get_dataframe_by_params(
    exchange_str=exchange_str,
    params_dict=query_dict,
    keep_local=True,
)
volatility_df

files saved to: /tmp/tmp32qaefxh


Unnamed: 0,datetime,volatility_estimation
0,2021-12-01 00:00:00.000,1.054974e-08
1,2021-12-01 00:00:00.100,1.054974e-08
2,2021-12-01 00:00:00.200,1.054974e-08
3,2021-12-01 00:00:00.300,1.054974e-08
4,2021-12-01 00:00:00.400,1.054974e-08
...,...,...
26763350,2021-12-31 23:59:59.500,3.879932e-09
26763351,2021-12-31 23:59:59.600,3.879932e-09
26763352,2021-12-31 23:59:59.700,3.879932e-09
26763353,2021-12-31 23:59:59.800,3.894929e-09


### Checking for missing data to compute

In [6]:
# Storage handle for the exchange; the cells below use it to check which
# datasets have already been computed.
exchange_str = 'binance'
exchange_stg = ExchangeStorage(exchange_str) # -- ExchangeStorage('binance')

Check for datasets that were already computed

In [7]:
# Prints the dates still missing for query_dict and returns a flag:
# False when nothing is missing (as in the output below), True otherwise.
exchange_stg.amend_datasets_by_params(query_dict)


Missing datasets: []


False

### Quick and dirty - Process data: paste params_dict here and go
#### ! The last loop in the cell below submits jobs through `single_run_job` — comment it out if you only want to check for missing data

In [4]:
# BRL-quoted pairs whose datasets are checked (and recomputed when missing).
pairs = [
    'adabrl',
    'bnbbrl',
    'btcbrl',
    'busdbrl',
    'dogebrl',
    'dotbrl',
    'ethbrl',
    'ftmbrl',
    'galabrl',
    'ltcbrl',
    'shibbrl',
    'solbrl',
    'usdtbrl',
]

In [15]:
# Date window checked for every pair in `pairs`.
check_from = '2022-05-01'
check_to = '2022-05-16'

# Step 1 - report, pair by pair, which dates are still missing.
# amend_datasets_by_params prints the missing dates together with a ready-made
# params dict for the job, and returns True when something is missing.
for pair in pairs:
    print(pair)
    # A dedicated name is used here so the notebook-level `query_dict`
    # (consumed by the "Get Data" cells) is not silently overwritten.
    pair_query = {
        'pair': pair,
        'date_from': check_from,
        'date_to': check_to,
        'dataset_type': dataset_type,
    }
    has_missing = exchange_stg.amend_datasets_by_params(pair_query)
    print(has_missing)
    print('----------------')

# Step 2 - submit one job per params dict reported as missing in step 1.
# These dicts are pasted from the step 1 output; refresh them before re-running.
# WARNING: this loop really submits jobs. Comment it out to only run the check.
missing_params = [
    {'pair': 'busdbrl', 'dataset_type': 'volatility_zma_mqc', 'start_date': '2022-05-15', 'end_date': '2022-05-16', 'exchange': 'binance'},
    {'pair': 'dogebrl', 'dataset_type': 'volatility_zma_mqc', 'start_date': '2022-05-15', 'end_date': '2022-05-16', 'exchange': 'binance'},
    {'pair': 'ethbrl', 'dataset_type': 'volatility_zma_mqc', 'start_date': '2022-05-13', 'end_date': '2022-05-14', 'exchange': 'binance'},
    {'pair': 'ltcbrl', 'dataset_type': 'volatility_zma_mqc', 'start_date': '2022-05-15', 'end_date': '2022-05-16', 'exchange': 'binance'},
]
for params_dict in missing_params:
    job_name = f'{params_dict["pair"]}_{params_dict["exchange"]}_{dataset_type}'
    print(single_run_job(job_name, script_to_run, params_dict, job_type=job_type))

adabrl

Missing datasets: []
False
----------------
bnbbrl

Missing datasets: []
False
----------------
btcbrl

Missing datasets: []
False
----------------
busdbrl

Missing datasets: ['2022-05-15']

Run appropriate job with these parameters:
{'pair': 'busdbrl', 'dataset_type': 'volatility_zma_mqc', 'start_date': '2022-05-15', 'end_date': '2022-05-16', 'exchange': 'binance'}
True
----------------
dogebrl

Missing datasets: ['2022-05-15']

Run appropriate job with these parameters:
{'pair': 'dogebrl', 'dataset_type': 'volatility_zma_mqc', 'start_date': '2022-05-15', 'end_date': '2022-05-16', 'exchange': 'binance'}
True
----------------
dotbrl

Missing datasets: []
False
----------------
ethbrl

Missing datasets: ['2022-05-13']

Run appropriate job with these parameters:
{'pair': 'ethbrl', 'dataset_type': 'volatility_zma_mqc', 'start_date': '2022-05-13', 'end_date': '2022-05-14', 'exchange': 'binance'}
True
----------------
ftmbrl

Missing datasets: []
False
----------------
galabrl

Miss

## Creating a periodic job
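Now, after using and checking the data, we can create a periodic (weekly or daily) job to keep this dataset updated. For example, a weekly job on Mondays at 04:00 AM São Paulo time would use the cluster while it is idle. Note that the example cell below is configured as a daily job (`period = 'daily'`); whichever period you choose, make sure the cron expression matches it.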

## Ideally, **check the Databricks UI to see whether other jobs are already scheduled for that day and time, to avoid overloading the cluster**

In [None]:
from propdesk_services.azure_databricks import create_periodic_job

pair = 'btcusdt'
exchange = 'binance'

# Period of the recurring job; it must agree with job_schedule below.
period = 'daily'

# The name is derived from period and dataset_type so it cannot drift from the
# actual configuration (evaluates to 'daily_volatility_zma_mqc_btcusdt_binance').
job_name = f'{period}_{dataset_type}_{pair}_{exchange}'

# Quartz-style cron (sec min hour day-of-month month day-of-week):
# fires at 01:15:00 every day.
# NOTE(review): the time zone depends on how create_periodic_job configures
# the schedule - confirm before relying on the local time.
job_schedule = "0 15 1 ? * *"

# Unlike the single-run params, no start_date / end_date is passed for
# periodic jobs.
job_params = {
    'exchange': exchange,
    'pair': pair,
    'bandwidth': 5,
    'lookback_seconds': 60,
    'log_price': True,
    'annualized_volatility': False,
    'resampling_rule': '100mS',
}

create_periodic_job(
    job_name_str=job_name,
    filename_str=script_to_run,
    params_dict=job_params,
    period=period,
    cron_expression_str=job_schedule,
)

{'job': {'job_id': 837105722299197},
 'schedule_job': {'job_id': 874249017617680}}

# Success :)

## That's it. **check Databricks UI to make sure everything is ok**

### Have fun, move fast, break things, buy btc (or dcr or algorand) ⚡.
#### -- Propdesk Transfero