In [1]:
import pandas as pd
import os
import json
import subprocess
import numpy as np
import pickle
import matplotlib.pyplot as plt
from tqdm import tqdm

plt.style.use('classic')


In [2]:
if os.getcwd().split('/')[-1] != 'market-congestion':
    os.chdir('..')

print(f'working directory: {os.getcwd()}')


working directory: /Users/crinstaniev/Dev/market-congestion


In [3]:
timestamp_file = open('data/blocks_timestamp.json', 'r')
timestamp_list = json.loads(timestamp_file.read())
timestamp_df = pd.DataFrame(timestamp_list)

display(timestamp_df.head())


Unnamed: 0,date,block,timestamp
0,2022-04-29T00:00:00Z,14675906,1651190458
1,2022-04-29T01:00:00Z,14676172,1651194000
2,2022-04-29T02:00:00Z,14676441,1651197605
3,2022-04-29T03:00:00Z,14676696,1651201211
4,2022-04-29T04:00:00Z,14676966,1651204817


In [4]:
block_max = timestamp_df['block'].max()
block_min = timestamp_df['block'].min()

print(f'block range: {block_min} to {block_max}')


block range: 14675906 to 14695011


In [5]:
# fetch_blocks_command = f'ethereumetl export_blocks_and_transactions\
#         --start-block {block_min}\
#         --end-block {block_max}\
#         --blocks-output data/blocks.csv\
#         --provider-uri https://mainnet.infura.io/v3/b5502deb425f4629a1c886601e332e56'

# # we collect data during the drop
# fetch_blocks_command = [x for x in fetch_blocks_command.split(' ') if x != '']


In [6]:
# subprocess.check_output(fetch_blocks_command)


preprocess blocks data

In [7]:
blocks_df = pd.read_csv('data/blocks.csv')
# exclude object types
blocks_df = blocks_df.select_dtypes(
    exclude=['object']).sort_values(by='number')
blocks_df['timestamp'] = pd.to_datetime(blocks_df['timestamp'], unit='s')
blocks_df.head()


Unnamed: 0,number,difficulty,size,gas_limit,gas_used,timestamp,transaction_count,base_fee_per_gas
700,14675906,13344403225862636,102974,30029295,30015490,2022-04-29 00:00:58,396,31828676324
701,14675907,13344540664816108,40173,30000000,29963755,2022-04-29 00:01:15,224,35803603077
702,14675908,13351193992766072,104088,30000000,29989140,2022-04-29 00:01:18,482,40268239281
703,14675909,13357850569411324,142087,30029295,9419465,2022-04-29 00:01:19,134,45298124915
704,14675910,13364510396338141,1116,30000000,110060,2022-04-29 00:01:28,4,43188091516


In [8]:
blocks_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 19106 entries, 700 to 18805
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   number             19106 non-null  int64         
 1   difficulty         19106 non-null  int64         
 2   size               19106 non-null  int64         
 3   gas_limit          19106 non-null  int64         
 4   gas_used           19106 non-null  int64         
 5   timestamp          19106 non-null  datetime64[ns]
 6   transaction_count  19106 non-null  int64         
 7   base_fee_per_gas   19106 non-null  int64         
dtypes: datetime64[ns](1), int64(7)
memory usage: 1.3 MB


In [9]:
CONGESTION_THRESHOLD = 0.95

blocks_df['theta'] = blocks_df['gas_used'] / blocks_df['gas_limit']
blocks_df['congested'] = blocks_df['theta'] >= CONGESTION_THRESHOLD
# blocks_df['timestamp'] = pd.to_datetime(blocks_df['timestamp'])

blocks_df


Unnamed: 0,number,difficulty,size,gas_limit,gas_used,timestamp,transaction_count,base_fee_per_gas,theta,congested
700,14675906,13344403225862636,102974,30029295,30015490,2022-04-29 00:00:58,396,31828676324,0.999540,True
701,14675907,13344540664816108,40173,30000000,29963755,2022-04-29 00:01:15,224,35803603077,0.998792,True
702,14675908,13351193992766072,104088,30000000,29989140,2022-04-29 00:01:18,482,40268239281,0.999638,True
703,14675909,13357850569411324,142087,30029295,9419465,2022-04-29 00:01:19,134,45298124915,0.313676,False
704,14675910,13364510396338141,1116,30000000,110060,2022-04-29 00:01:28,4,43188091516,0.003669,False
...,...,...,...,...,...,...,...,...,...,...
18801,14695007,13861510659046042,164026,29970705,29963991,2022-05-01 23:58:50,315,45620868512,0.999776,True
18802,14695008,13854879782248027,542,29999972,0,2022-05-01 23:59:08,0,51320922473,0.000000,False
18803,14695009,13861782299220174,28878,30000000,8460641,2022-05-01 23:59:09,104,44905807164,0.282021,False
18804,14695010,13868688186561937,109617,30029295,30016587,2022-05-01 23:59:13,324,42458680546,0.999577,True


train test split

In [10]:
from sklearn.model_selection import TimeSeriesSplit


In [11]:
tss = TimeSeriesSplit()
for train_idx, test_idx in tss.split(blocks_df):
    pass

train_df = blocks_df.filter(items=train_idx, axis=0)
test_df = blocks_df.filter(items=test_idx, axis=0)


In [12]:
MA_WINDOW = 100


In [13]:
def prepare_ma_prev(df, window):
    return df['theta'].rolling(window=window).mean()


In [14]:
train_df['theta_ma_prev'] = prepare_ma_prev(train_df, MA_WINDOW)
train_df.dropna(inplace=True)

test_df['theta_ma_prev'] = prepare_ma_prev(test_df, MA_WINDOW)
test_df.dropna(inplace=True)


In [15]:
train_df.to_pickle('data/train.pkl')
test_df.to_pickle('data/test.pkl')


## Discrete time series data

In [16]:
train_df_clean = train_df[[
    'size', 'transaction_count', 'theta', 'theta_ma_prev', 'congested'
]]

test_df_clean = test_df[[
    'size', 'transaction_count', 'theta', 'theta_ma_prev', 'congested'
]]


In [17]:
ROLLING_WINDOW = 10


In [18]:
X_train_dis = []
y_train_dis = []

for i in tqdm(range(len(train_df_clean))):
    if i < ROLLING_WINDOW:
        continue

    X_train_dis.append(
        train_df_clean.iloc[i-ROLLING_WINDOW:i].values.flatten())
    y_train_dis.append(train_df_clean.iloc[i])

X_train_dis = np.array(X_train_dis)
y_train_dis = pd.DataFrame(y_train_dis)


100%|██████████| 15823/15823 [00:01<00:00, 10681.46it/s]


In [19]:
X_test_dis = []
y_test_dis = []

for i in tqdm(range(len(test_df_clean))):
    if i < ROLLING_WINDOW:
        continue

    X_test_dis.append(test_df_clean.iloc[i-ROLLING_WINDOW:i].values.flatten())
    y_test_dis.append(test_df_clean.iloc[i])

X_test_dis = np.array(X_test_dis)
y_test_dis = pd.DataFrame(y_test_dis)


100%|██████████| 3085/3085 [00:00<00:00, 10488.91it/s]


In [20]:
f = open('data/train_test_dis.pkl', 'wb')
train_test_dis = dict(
    X_train_dis=X_train_dis,
    y_train_dis=y_train_dis,
    X_test_dis=X_test_dis,
    y_test_dis=y_test_dis
)
pickle.dump(train_test_dis, f)


In [21]:
train_df['theta_ma_future'] = train_df['theta'].shift(1 - MA_WINDOW)\
    .rolling(window=MA_WINDOW).mean()
test_df['theta_ma_future'] = test_df['theta'].shift(1 - MA_WINDOW)\
    .rolling(window=MA_WINDOW).mean()


In [22]:
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)


In [23]:
train_df.to_pickle('data/regression_train_df.pkl')
test_df.to_pickle('data/regression_test_df.pkl')


## Introduce Volatility

In [24]:
prices_df = pd.read_csv(
    'data/price.csv')[['block_number', 'timestamp', 'price']]
prices_df['timestamp'] = pd.to_datetime(prices_df['timestamp'])

prices_df = prices_df.sort_values(by=['block_number', 'timestamp']
                                  ).groupby('block_number').last().reset_index()

prices_df.columns = ['number', 'timestamp', 'price']
prices_df = prices_df[['number', 'price']]
prices_df


Unnamed: 0,number,price
0,14675974,2936.587290
1,14675977,2936.587290
2,14675979,2936.587290
3,14675987,2936.880949
4,14675990,2938.055878
...,...,...
861,14694952,2828.226965
862,14694958,2828.226965
863,14694974,2828.226965
864,14694988,2828.226965


train dataset

In [25]:
train_df_with_price = train_df.merge(right=prices_df, how='left', on='number')
train_df_with_price['price'] = train_df_with_price['price'].interpolate(
    method='nearest')

train_df_with_price = train_df_with_price.dropna()
train_df_with_price['volatility'] = (
    np.log(train_df_with_price['price'] /
           train_df_with_price['price'].shift(1))
).rolling(window=MA_WINDOW).std()

train_df_with_price = train_df_with_price.dropna()
train_df_with_price.head()


Unnamed: 0,number,difficulty,size,gas_limit,gas_used,timestamp,transaction_count,base_fee_per_gas,theta,congested,theta_ma_prev,theta_ma_future,price,volatility
198,14676202,13410159236673811,46834,29970705,6210660,2022-04-29 01:06:47,37,44315023112,0.207224,False,0.510784,0.53999,2933.945683,0.000139
199,14676203,13416844604942065,150861,29999972,11478912,2022-04-29 01:06:51,144,41071433316,0.382631,False,0.511508,0.545139,2933.945683,0.000139
200,14676204,13416982043895537,153787,29970677,29964551,2022-04-29 01:07:02,365,39866302558,0.999796,True,0.514764,0.541313,2933.945683,0.000139
201,14676205,13404016961321769,26332,29999944,4504524,2022-04-29 01:07:33,44,44847553544,0.150151,False,0.51526,0.541249,2933.945683,0.000139
202,14676406,13509974915803665,51444,30029295,7865495,2022-04-29 01:50:57,95,52022209326,0.261927,False,0.509954,0.541559,2933.945683,5e-05


test dataset

In [26]:
test_df_with_price = test_df.merge(right=prices_df, how='left', on='number')
test_df_with_price['price'] = test_df_with_price['price'].interpolate(
    method='nearest')

test_df_with_price = test_df_with_price.dropna()
test_df_with_price['volatility'] = (
    np.log(test_df_with_price['price'] /
           test_df_with_price['price'].shift(1))
).rolling(window=MA_WINDOW).std()

test_df_with_price = test_df_with_price.dropna()
test_df_with_price.head()


Unnamed: 0,number,difficulty,size,gas_limit,gas_used,timestamp,transaction_count,base_fee_per_gas,theta,congested,theta_ma_prev,theta_ma_future,price,volatility
178,14692304,13912858796216222,11156,29970590,1854711,2022-05-01 13:45:12,24,47322372777,0.061884,False,0.522559,0.509312,2762.817994,2e-05
179,14692305,13919789623253783,89171,29999857,17814572,2022-05-01 13:45:13,193,42139204955,0.593822,False,0.522963,0.515832,2762.817994,2e-05
180,14692406,13926497177739541,175688,29999972,29993664,2022-05-01 14:08:09,990,59199101169,0.99979,True,0.531884,0.516308,2762.817994,2e-05
181,14692407,13892634379442673,326267,29970677,29970474,2022-05-01 14:09:08,271,66595876912,0.999993,True,0.537365,0.507174,2762.817994,2e-05
182,14692408,13899555331276732,96124,29941410,2615028,2022-05-01 14:09:14,24,74920249313,0.087338,False,0.528239,0.500203,2762.817994,2e-05


In [27]:
# save data
train_df_with_price.to_pickle('data/train_volatility.pkl')
test_df_with_price.to_pickle('data/test_volatility.pkl')
