### Step 1: Load the datasets

In [18]:
import pandas as pd

# Read both parquet inputs in one pass: the first path becomes `fills_data`,
# the second `market_data`.
fills_data, market_data = map(
    pd.read_parquet,
    ("datasets/fills_data.parq", "datasets/market_data.parq"),
)

### Step 2: Preprocessing
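Here we convert both indexes to timezone-aware datetimes (treating naive timestamps as UTC), cast the `side` and `liquidity` columns of `fills_data` to categoricals, and then join each fill to the most recent `market_data` row at or before its `timestamp` (a backward as-of merge).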

In [None]:
# Parse each index as datetimes. A timezone-naive index is labelled as UTC;
# an index that already carries a timezone is left unchanged.
for frame in (fills_data, market_data):
    parsed_index = pd.to_datetime(frame.index)
    frame.index = (
        parsed_index
        if parsed_index.tz is not None
        else parsed_index.tz_localize('UTC')
    )

# Store the two label columns of the fills as pandas categoricals.
for column in ('side', 'liquidity'):
    fills_data[column] = fills_data[column].astype('category')

# merge_asof requires both inputs to be sorted on the join key.
# NOTE(review): 'timestamp' is presumably the index name, which sort_values
# accepts as a key and reset_index() turns into a regular column -- confirm.
fills_data = fills_data.sort_values(by='timestamp')
market_data = market_data.sort_values(by='timestamp')

# Backward as-of join: each fill row is paired with the last market_data row
# whose timestamp is less than or equal to the fill's timestamp.
merged_data = pd.merge_asof(
    left=fills_data.reset_index(),
    right=market_data.reset_index(),
    on='timestamp',
    direction='backward',
)

### Step 3: Feature Engineering

In [21]:
# Mid-quote of the market snapshot joined onto each fill.
merged_data['mid_price'] = merged_data['bid_prc'].add(merged_data['ask_prc']).div(2)

# Signed quantity: +fill_qty for side 'B', -fill_qty for side 'S'
# (presumably buy / sell -- confirm). Any other side value maps to NaN,
# which would make the integer cast fail.
side_sign = merged_data['side'].map({'B': 1, 'S': -1}).astype(int)
merged_data['q_i'] = merged_data['fill_qty'] * side_sign

# Per-fill PnL marked against the mid-quote, and its running total.
price_edge = merged_data['mid_price'] - merged_data['fill_prc']
merged_data['trade_pnl'] = merged_data['q_i'] * price_edge
merged_data['cumulative_pnl'] = merged_data['trade_pnl'].cumsum()

# One-row lags of the PnL and balance series (first row becomes NaN).
for lagged_name, source_name in {
    'trade_pnl_lag1': 'trade_pnl',
    'cumulative_pnl_lag1': 'cumulative_pnl',
    'balance_lag1': 'balance',
}.items():
    merged_data[lagged_name] = merged_data[source_name].shift(1)

# Rolling standard deviation of the mid-quote over 20 rows (rows, not a time
# span), plus its one-row lag.
mid_window_20 = merged_data['mid_price'].rolling(window=20)
merged_data['volatility'] = mid_window_20.std()
merged_data['volatility_lag1'] = merged_data['volatility'].shift(1)

# Deviation of the lagged balance from its mean, and the row-to-row change.
# NOTE(review): the mean is taken over the whole column, so every row sees
# later rows' balances -- confirm this is intended.
mean_balance = merged_data['balance_lag1'].mean()
balance_deviation = merged_data['balance_lag1'].sub(mean_balance)
merged_data['dev_from_mean_balance_lag1'] = balance_deviation
merged_data['dev_from_mean_balance_lag1_diff'] = balance_deviation.diff()

# Deviation of the mid-quote from the previous row's 100-row rolling mean,
# and the row-to-row change.
# NOTE(review): despite the `_lag1` suffix, the mid_price term itself is the
# current row's value; only the rolling mean is shifted -- confirm.
mean_mid_price = merged_data['mid_price'].rolling(window=100).mean().shift(1)
mid_deviation = merged_data['mid_price'].sub(mean_mid_price)
merged_data['dev_from_mean_mid_price_lag1'] = mid_deviation
merged_data['dev_from_mean_mid_price_lag1_diff'] = mid_deviation.diff()