In [1]:
import os
import numpy as np
import pandas as pd

# Folder holding the competition input files.
directory = '../input/g-research-crypto-forecasting'

# --- train.csv -------------------------------------------------------------
# Explicit dtypes so pandas does not have to infer them while parsing.
dtypes = dict(
    timestamp=np.int64,
    Asset_ID=np.int8,
    Count=np.int32,
    Open=np.float64,
    High=np.float64,
    Low=np.float64,
    Close=np.float64,
    Volume=np.float64,
    VWAP=np.float64,
    Target=np.float64,
)
file_path = os.path.join(directory, 'train.csv')
data = pd.read_csv(file_path, dtype=dtypes)

# Swap the integer `timestamp` column (interpreted as seconds) for a datetime
# `Time` column. `pop` removes the old column, the assignment appends the new one.
data['Time'] = pd.to_datetime(data.pop('timestamp'), unit='s')

# --- asset_details.csv -----------------------------------------------------
file_path = os.path.join(directory, 'asset_details.csv')
details = pd.read_csv(file_path)

In [2]:
# Recreate, per asset, the forward log return
#     r(t) = log(P(t + 16 min) / P(t + 1 min))
# where P is the `price_column` price, then stitch the assets back together.
price_column = 'Close'
ids = list(details.Asset_ID)
chunks = []
# `asset_id` rather than `id`: the loop variable lives on in the notebook's
# global namespace, and `id` would shadow the builtin for every later cell.
for asset_id in ids:
    asset = data[data.Asset_ID == asset_id].copy()
    asset.sort_values(by='Time', inplace=True)
    asset.set_index(keys='Time', inplace=True)
    # shift(freq=...) moves the timestamps, not the rows, so the column
    # assignment aligns on time: the value stored at t is the price observed
    # at t+1min / t+16min, and NaN where that minute is missing for the asset.
    # 'min' is used instead of the deprecated 'T' minute alias.
    asset['p1'] = asset[price_column].shift(freq='-1min')
    asset['p16'] = asset[price_column].shift(freq='-16min')
    asset['r'] = np.log(asset.p16/asset.p1)
    asset.drop(['p1', 'p16'], axis=1, inplace=True)
    asset.reset_index(inplace=True)
    chunks.append(asset)

# Recombine the per-asset frames and restore chronological order.
data = pd.concat(chunks)
data.sort_values(by='Time', inplace=True)

In [3]:
# Attach each asset's weight (from the asset details table) to its rows.
asset_weights = details.set_index(keys='Asset_ID')['Weight']
data['w'] = data['Asset_ID'].map(asset_weights)
weight_sum = details.Weight.sum()

# Weighted return per row, grouped by timestamp.
data['weighted_asset_r'] = data['w'] * data['r']
time_group = data.groupby('Time')

# Market return m(t): weighted sum of returns divided by the summed weights of
# the rows present at t. The grouped sum skips NaN returns, while the weight
# of such a row still counts in the denominator.
m = time_group['weighted_asset_r'].sum().div(time_group['w'].sum())
# Alternative denominator: the total weight of all assets.
#m = time_group['weighted_asset_r'].sum() / weight_sum

# Broadcast m(t) onto every row by aligning on the Time index, then turn Time
# back into a regular column (this also leaves a fresh RangeIndex).
data.set_index('Time', inplace=True)
data['m'] = m
data.reset_index(inplace=True)

In [4]:
# Rolling beta of each asset against the market return:
#     beta(t) = mean_window(m * r) / mean_window(m ** 2)
data['m2'] = data.m ** 2
data['mr'] = data.r * data.m

# Trailing, time-based window shared by both rolling means. 'min' replaces the
# deprecated 'T' minute alias.
beta_window = '3750min'

chunks = []
# `asset_id` rather than `id`, so the builtin is not shadowed in later cells.
for asset_id in ids:
    asset = data[data.Asset_ID == asset_id].copy()
    # A time-based rolling window needs a monotonic datetime index.
    asset.sort_values(by='Time', inplace=True)
    asset.set_index(keys='Time', inplace=True)
    # min_periods=1: a mean is produced as soon as the window holds a single
    # non-NaN observation, so early rows rest on very few samples.
    asset['mr_rolling'] = asset['mr'].rolling(window=beta_window, min_periods=1).mean()
    asset['m2_rolling'] = asset['m2'].rolling(window=beta_window, min_periods=1).mean()
    asset.reset_index(inplace=True)
    chunks.append(asset)

# Recombine the per-asset frames and restore chronological order.
data = pd.concat(chunks)
data.sort_values(by='Time', inplace=True)
# NOTE: where m2_rolling is 0 this division yields inf/NaN; it is left as-is.
data['beta'] = data['mr_rolling'] / data['m2_rolling']

In [5]:
# Recreated target: the asset return minus its beta-scaled market return.
data['Target_recreated'] = data['r'].sub(data['beta'].mul(data['m']))

In [6]:
# Row-wise absolute gap between the provided Target and the recreated one
# (NaN wherever either side is missing).
data['Target_diff'] = (data['Target'] - data['Target_recreated']).abs()

# Summary statistics of the gap; pandas reductions skip NaN rows.
print(f'Average absolute error {data.Target_diff.mean()}')
print(f'Max absolute error {data.Target_diff.max()}')
print(f'Total absolute error {data.Target_diff.sum()}')

Average absolute error 0.0009917834290718956
Max absolute error 2.441505610472013
Total absolute error 23292.329383219516


In [7]:
# Show the final frame with all derived columns; the notebook renders an
# abbreviated view (leading/trailing rows, "..." for the elided rows and columns).
data

Unnamed: 0,Time,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,...,w,weighted_asset_r,m,m2,mr,mr_rolling,m2_rolling,beta,Target_recreated,Target_diff
0,2018-01-01 00:01:00,2,40,2376.580000,2399.500000,2357.14000,2374.590000,1.923301e+01,2373.116392,-0.004218,...,2.397895,-0.010136,-0.009731,0.000095,0.000041,0.000041,0.000095,0.434390,0.000000e+00,0.004218
0,2018-01-01 00:01:00,9,167,225.330000,227.780000,222.98000,225.206667,4.118966e+02,225.197944,-0.009791,...,2.397895,-0.023595,-0.009731,0.000095,0.000096,0.000096,0.000095,1.011162,1.734723e-18,0.009791
0,2018-01-01 00:01:00,6,173,738.302500,746.000000,732.51000,738.507500,3.359879e+02,738.839291,-0.004809,...,5.894403,-0.028412,-0.009731,0.000095,0.000047,0.000047,0.000095,0.495342,0.000000e+00,0.004809
0,2018-01-01 00:01:00,7,5,25.920000,25.920000,25.87400,25.877000,1.210873e+02,25.891363,-0.008264,...,2.079442,-0.017255,-0.009731,0.000095,0.000081,0.000081,0.000095,0.852717,0.000000e+00,0.008264
0,2018-01-01 00:01:00,5,32,7.659600,7.659600,7.65670,7.657600,6.626713e+03,7.657713,-0.013922,...,1.386294,-0.019436,-0.009731,0.000095,0.000136,0.000136,0.000095,1.440777,0.000000e+00,0.013922
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1956029,2021-09-21 00:00:00,9,775,157.181571,157.250000,156.70000,156.943857,4.663725e+03,156.994319,,...,2.397895,,0.000000,0.000000,,0.000024,0.000024,0.996955,,
1942618,2021-09-21 00:00:00,0,429,364.115000,364.500000,363.70000,363.757500,1.000677e+03,364.161317,,...,4.304065,,0.000000,0.000000,,0.000022,0.000024,0.932803,,
1874559,2021-09-21 00:00:00,13,380,0.091390,0.091527,0.09126,0.091349,2.193732e+06,0.091388,,...,1.791759,,0.000000,0.000000,,0.000026,0.000024,1.095707,,
1953536,2021-09-21 00:00:00,2,403,542.093333,542.620000,539.63000,541.045000,5.828307e+02,541.684504,,...,2.397895,,0.000000,0.000000,,0.000021,0.000024,0.897499,,
