In [1]:
data_location = 'sqlite:///../../../data/data.db'

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce


import pickle

In [3]:
def vol_ohlc(df, lookback=10):
    o = df.open
    h = df.high
    l = df.low
    c = df.close
    
    k = 0.34 / (1.34 + (lookback+1)/(lookback-1))
    cc = np.log(c/c.shift(1))
    ho = np.log(h/o)
    lo = np.log(l/o)
    co = np.log(c/o)
    oc = np.log(o/c.shift(1))
    oc_sq = oc**2
    cc_sq = cc**2
    rs = ho*(ho-co)+lo*(lo-co)
    close_vol = cc_sq.rolling(lookback).sum() * (1.0 / (lookback - 1.0))
    open_vol = oc_sq.rolling(lookback).sum() * (1.0 / (lookback - 1.0))
    window_rs = rs.rolling(lookback).sum() * (1.0 / (lookback - 1.0))
    result = (open_vol + k * close_vol + (1-k) * window_rs).apply(np.sqrt) * np.sqrt(252)
    result[:lookback-1] = np.nan
    
    return result

In [4]:
ohlc = pd.read_sql('SELECT * FROM ohlc', data_location)
ohlc.shape

(11627, 9)

In [5]:
ohlc.head()

Unnamed: 0,ts,open,high,low,close,volume,volumeUSD,token,chain
0,2021-11-01 00:00:00,61421.37,61669.14,61239.6,61343.68,256.433869,15757510.0,BTC,BTC
1,2021-11-01 01:00:00,61346.17,61709.82,61171.22,61610.93,332.481185,20445580.0,BTC,BTC
2,2021-11-01 02:00:00,61610.94,61779.87,61299.89,61333.17,314.25072,19353900.0,BTC,BTC
3,2021-11-01 03:00:00,61333.17,61457.28,60050.0,60589.06,1059.931358,64146250.0,BTC,BTC
4,2021-11-01 04:00:00,60590.23,60655.0,59752.92,59971.89,621.419878,37447440.0,BTC,BTC


In [6]:
ohlc.describe()

Unnamed: 0,open,high,low,close,volume,volumeUSD
count,11627.0,11627.0,11627.0,11627.0,11627.0,11627.0
mean,5708.198992,5737.512791,5676.641523,5706.967946,778289.3,9847622.0
std,16518.161143,16599.532113,16430.972527,16514.73153,2057505.0,17690570.0
min,0.9999,1.0,0.9951,0.9999,6.713,1960.784
25%,4.5559,4.611,4.49605,4.55435,2565.695,966475.5
50%,92.59,93.71,91.0,92.6,46242.3,3420994.0
75%,307.9245,309.7,305.501,307.7965,176843.6,10683090.0
max,68638.47,69000.0,68456.5,68639.63,39788950.0,398803500.0


To predict SOL return, I need to feature engineer the most relevant elements. 

In [7]:
tokens = ohlc.token.unique()

In [8]:
def df_merge(left, right):
    return pd.merge(left, right, on='ts', how='inner')

X = reduce(df_merge, [
    (lambda df: 
    (
        df
        .assign(
            vol=vol_ohlc(df).fillna(0),
            ret=df.close.pct_change()
        )[['ts', 'vol', 'ret']]
        .rename(columns={
            col: f'{col}_{token}' for col in ['ts', 'vol', 'ret'] if col != 'ts'
        })
    ))(ohlc[ohlc.token == token])
    for token in tokens
]).set_index('ts')

In [9]:
X.tail()

Unnamed: 0_level_0,vol_BTC,ret_BTC,vol_ETH,ret_ETH,vol_USDT,ret_USDT,vol_SOL,ret_SOL,vol_ADA,ret_ADA,...,vol_AVAX,ret_AVAX,vol_ATOM,ret_ATOM,vol_CRV,ret_CRV,vol_AAVE,ret_AAVE,vol_COMP,ret_COMP
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-12-14 20:00:00,0.136358,0.00481,0.158369,0.005961,0.002463,0.0001,0.21674,0.002798,0.225286,0.014437,...,0.239258,0.008227,0.254118,-0.000471,0.255464,0.003049,0.207758,0.009387,0.284268,0.006813
2021-12-14 21:00:00,0.142237,0.019797,0.170096,0.016737,0.002652,0.0,0.218492,0.025892,0.224116,0.012141,...,0.241603,0.019207,0.25435,0.019303,0.263456,0.021277,0.222014,0.02249,0.281497,0.017473
2021-12-14 22:00:00,0.151148,0.010414,0.172081,0.004623,0.002684,0.0,0.246122,0.015624,0.232362,0.009295,...,0.268875,0.026851,0.253472,0.012933,0.270895,0.02381,0.220598,0.006242,0.275083,0.009868
2021-12-14 23:00:00,0.149424,-0.000302,0.170257,-0.003195,0.002823,0.0001,0.238235,-0.010027,0.231115,-0.000157,...,0.301778,0.032506,0.245708,-0.009576,0.268758,-0.011628,0.219474,-0.003013,0.271229,-0.000864
2021-12-15 00:00:00,0.143079,-0.001448,0.159883,-0.000641,0.002816,0.0,0.228867,0.002517,0.220068,-0.007715,...,0.296444,0.005576,0.234522,-0.003223,0.242546,-0.002941,0.21336,0.001659,0.246067,-0.001027


Add more feature information from SOL (open, high, low, close, volume, volumeUSD) for its return prediction. Meanwhile, I will use some of technical analysis indicators for prediction purpose.

In [10]:
SOL_df = ohlc[ohlc.token == 'SOL'].set_index('ts')

In [11]:
SOL_df

Unnamed: 0_level_0,open,high,low,close,volume,volumeUSD,token,chain
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-11-01 00:00:00,202.194,203.225,200.701,202.673,38155.628,7.720360e+06,SOL,SOL
2021-11-01 01:00:00,202.673,209.736,202.087,208.218,157689.578,3.260716e+07,SOL,SOL
2021-11-01 02:00:00,208.349,210.203,205.693,206.161,112186.097,2.337273e+07,SOL,SOL
2021-11-01 03:00:00,206.161,206.788,199.212,201.689,141171.549,2.858246e+07,SOL,SOL
2021-11-01 04:00:00,201.628,203.995,200.562,201.076,63368.043,1.280768e+07,SOL,SOL
...,...,...,...,...,...,...,...,...
2021-12-14 20:00:00,153.660,154.510,152.890,154.100,44436.935,6.826619e+06,SOL,SOL
2021-12-14 21:00:00,154.110,158.090,154.020,158.090,72284.661,1.127975e+07,SOL,SOL
2021-12-14 22:00:00,158.090,163.360,157.360,160.560,104942.890,1.686540e+07,SOL,SOL
2021-12-14 23:00:00,160.560,161.110,158.350,158.950,51647.440,8.259753e+06,SOL,SOL


In [12]:
!pip install pandas_ta



In [13]:
import pandas_ta as ta

In [14]:
SOL_df.ta.sma(length=10, append=True)

ts
2021-11-01 00:00:00        NaN
2021-11-01 01:00:00        NaN
2021-11-01 02:00:00        NaN
2021-11-01 03:00:00        NaN
2021-11-01 04:00:00        NaN
                        ...   
2021-12-14 20:00:00    156.632
2021-12-14 21:00:00    156.874
2021-12-14 22:00:00    157.112
2021-12-14 23:00:00    157.072
2021-12-15 00:00:00    157.204
Name: SMA_10, Length: 1057, dtype: float64

In [15]:
SOL_df.ta.macd(append=True)

Unnamed: 0_level_0,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-11-01 00:00:00,,,
2021-11-01 01:00:00,,,
2021-11-01 02:00:00,,,
2021-11-01 03:00:00,,,
2021-11-01 04:00:00,,,
...,...,...,...
2021-12-14 20:00:00,-1.266631,0.172408,-1.439038
2021-12-14 21:00:00,-0.980853,0.366548,-1.347401
2021-12-14 22:00:00,-0.548738,0.638931,-1.187669
2021-12-14 23:00:00,-0.332366,0.684242,-1.016608


In [16]:
SOL_df.ta.rsi(append=True)

ts
2021-11-01 00:00:00          NaN
2021-11-01 01:00:00          NaN
2021-11-01 02:00:00          NaN
2021-11-01 03:00:00          NaN
2021-11-01 04:00:00          NaN
                         ...    
2021-12-14 20:00:00    42.919353
2021-12-14 21:00:00    52.038937
2021-12-14 22:00:00    56.655598
2021-12-14 23:00:00    53.069689
2021-12-15 00:00:00    53.851201
Name: RSI_14, Length: 1057, dtype: float64

In [17]:
SOL_df.shape

(1057, 13)

In [18]:
X = X.join(SOL_df)

In [19]:
X.head(50)

Unnamed: 0_level_0,vol_BTC,ret_BTC,vol_ETH,ret_ETH,vol_USDT,ret_USDT,vol_SOL,ret_SOL,vol_ADA,ret_ADA,...,close,volume,volumeUSD,token,chain,SMA_10,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,RSI_14
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-11-01 00:00:00,0.0,,0.0,,0.0,,0.0,,0.0,,...,202.673,38155.628,7720360.0,SOL,SOL,,,,,
2021-11-01 01:00:00,0.0,0.004357,0.0,0.006874,0.0,0.0,0.0,0.027359,0.0,0.003203,...,208.218,157689.578,32607160.0,SOL,SOL,,,,,
2021-11-01 02:00:00,0.0,-0.004508,0.0,-0.005322,0.0,-0.0002,0.0,-0.009879,0.0,-0.008667,...,206.161,112186.097,23372730.0,SOL,SOL,,,,,
2021-11-01 03:00:00,0.0,-0.012132,0.0,-0.013126,0.0,0.0001,0.0,-0.021692,0.0,-0.007618,...,201.689,141171.549,28582460.0,SOL,SOL,,,,,
2021-11-01 04:00:00,0.0,-0.010186,0.0,-0.010679,0.0,0.0,0.0,-0.003039,0.0,-0.006903,...,201.076,63368.043,12807680.0,SOL,SOL,,,,,
2021-11-01 05:00:00,0.0,0.009699,0.0,0.008355,0.0,0.0001,0.0,0.007863,0.0,0.001349,...,202.657,85632.361,17210960.0,SOL,SOL,,,,,
2021-11-01 06:00:00,0.0,0.001454,0.0,-0.000376,0.0,-0.0001,0.0,0.003938,0.0,0.002487,...,203.455,46693.992,9522210.0,SOL,SOL,,,,,
2021-11-01 07:00:00,0.0,0.002006,0.0,0.003566,0.0,-0.0001,0.0,0.020442,0.0,0.004289,...,207.614,61474.116,12618250.0,SOL,SOL,,,,,
2021-11-01 08:00:00,0.0,0.02363,0.0,0.014728,0.0,0.0,0.0,0.011189,0.0,0.017598,...,209.937,102762.42,21473370.0,SOL,SOL,,,,,
2021-11-01 09:00:00,0.0,-0.003965,0.0,0.00189,0.0,-0.0001,0.0,-0.012218,0.0,-0.000253,...,207.372,66040.166,13825700.0,SOL,SOL,205.0852,,,,


Use this dataframe as the input data

In [20]:
y = X.ret_SOL.shift(-1)[:-1]
X = X[:-1]

In [23]:
X = X.drop(columns=['token', 'chain'])

In [26]:
pd.isnull(X).sum()

vol_BTC           0
ret_BTC           1
vol_ETH           0
ret_ETH           1
vol_USDT          0
ret_USDT          1
vol_SOL           0
ret_SOL           1
vol_ADA           0
ret_ADA           1
vol_DOT           0
ret_DOT           1
vol_AVAX          0
ret_AVAX          1
vol_ATOM          0
ret_ATOM          1
vol_CRV           0
ret_CRV           1
vol_AAVE          0
ret_AAVE          1
vol_COMP          0
ret_COMP          1
open              0
high              0
low               0
close             0
volume            0
volumeUSD         0
SMA_10            9
MACD_12_26_9     25
MACDh_12_26_9    33
MACDs_12_26_9    33
RSI_14           14
dtype: int64

I decided to drop the first row, as returns are not availabe. Meanwhile, for technical analysis indicators, I will need to impute their values due to high N.A. 

In [27]:
X=X[1:]
y=y[1:]

In [28]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge

from sklearn.model_selection import cross_validate
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, make_scorer

from sklearn.model_selection import learning_curve

In [29]:
X = X.bfill(axis ='rows') #backfill the data as they are correlated along the time axis

In [30]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.columns]


In [31]:
def evaluate_model(model, X, y, test_size=0.2):
    cv =  TimeSeriesSplit(n_splits=int(y.shape[0] * test_size), test_size=1)
    scorer = make_scorer(mean_squared_error, greater_is_better=False, squared=False)
    
    return np.mean(cross_validate(model, X, y, cv=cv, scoring=scorer, n_jobs=-1)['test_score'])

In [32]:
pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('pca', PCA(n_components=15)),
    ('model', Ridge(alpha=1.0))
])

evaluate_model(pipeline, X, y)

-0.008791592816754252

In [33]:
pipeline = Pipeline([
    ('model', DecisionTreeRegressor(random_state=0))
])

#scaling might not be a good thing for this model

evaluate_model(pipeline, X, y)

-0.012431111845871479

In [34]:
pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('model', Ridge(alpha=1.0))
])

evaluate_model(pipeline, X, y)

-0.008733135956658982

In [35]:
X.columns

Index(['vol_BTC', 'ret_BTC', 'vol_ETH', 'ret_ETH', 'vol_USDT', 'ret_USDT',
       'vol_SOL', 'ret_SOL', 'vol_ADA', 'ret_ADA', 'vol_DOT', 'ret_DOT',
       'vol_AVAX', 'ret_AVAX', 'vol_ATOM', 'ret_ATOM', 'vol_CRV', 'ret_CRV',
       'vol_AAVE', 'ret_AAVE', 'vol_COMP', 'ret_COMP', 'open', 'high', 'low',
       'close', 'volume', 'volumeUSD', 'SMA_10', 'MACD_12_26_9',
       'MACDh_12_26_9', 'MACDs_12_26_9', 'RSI_14'],
      dtype='object')

In [57]:
pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('model', Ridge(alpha=1.0))
])

evaluate_model(pipeline, X, y)

-0.008733135956658982

In [41]:
from sklearn import svm

In [42]:
pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('model', svm.SVR())
])

evaluate_model(pipeline, X, y)

-0.008546201345938906

In [43]:
pipeline = Pipeline([
    ('feature_selector', FeatureSelector(['ret_SOL', 'SMA_10', 'MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_12_26_9', 'RSI_14'])),
    ('scale', StandardScaler()),
    ('model', svm.SVR())
])

evaluate_model(pipeline, X, y)

-0.008546201345938906

For svm, it looks like the featureselector doesn't change the score

Try to optimize the hyperparameter

In [46]:
svm.SVC().get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [52]:
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('model', svm.SVR())
])

evaluate_model(pipeline, X, y)

test_size = 0.2
cv = TimeSeriesSplit(n_splits=int(y.shape[0] * test_size), test_size=1)
scorer = make_scorer(mean_squared_error, greater_is_better=False, squared=False)

search = GridSearchCV(pipeline, {
    'model__C': [0.1, 0.5, 1, 2],
    'model__gamma': [1, 0.1, 0.01, 0.001, 0.0001]
}, scoring=scorer, refit=True, cv=cv, n_jobs=-1)
#here you can search the hyperparameter of the pipeline; 
#the name of the hyperparameter is 'name of the pipeline item'__'hyperparameter_name' 

search.fit(X, y)

GridSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=211, test_size=1),
             estimator=Pipeline(steps=[('scale', StandardScaler()),
                                       ('model', SVR())]),
             n_jobs=-1,
             param_grid={'model__C': [0.1, 0.5, 1, 2],
                         'model__gamma': [1, 0.1, 0.01, 0.001, 0.0001]},
             scoring=make_scorer(mean_squared_error, greater_is_better=False, squared=False))

In [53]:
search.best_params_

{'model__C': 0.1, 'model__gamma': 1}

In [54]:
best_model = search.best_estimator_

In [55]:
evaluate_model(best_model, X, y)

-0.008546201345938906

In [56]:
pipeline = Pipeline([
    ('feature_selector', FeatureSelector(['vol_BTC', 'ret_BTC', 'vol_ETH', 'ret_ETH', 'vol_USDT', 'ret_USDT',
       'vol_SOL', 'ret_SOL', 'vol_ADA', 'ret_ADA', 'vol_DOT', 'ret_DOT',
       'vol_AVAX', 'ret_AVAX', 'vol_ATOM', 'ret_ATOM', 'vol_CRV', 'ret_CRV',
       'vol_AAVE', 'ret_AAVE', 'vol_COMP', 'ret_COMP'])),
    ('scale', StandardScaler()),
    ('model', svm.SVR())
])

evaluate_model(pipeline, X, y)



-0.008546201345938906