In [24]:
import pandas as pd
from src.get_data import get_coinmetrics_data, get_binance_data
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [25]:
# Binance trading pairs to download and model; every later cell keys off this list.
pairs = [
    "ETHUSDT",
]


# 1. Get Data 

In [26]:
# Fetch Binance market data for the configured pairs using the project helper
# imported from src.get_data. Later cells read the 'dateTime', 'ticker' and
# 'close' columns of the result.
# NOTE(review): whether this call hits the network or a local cache is not
# visible from this notebook — confirm before relying on Restart & Run All.
binance_data = get_binance_data(pairs)

In [27]:
# Load the on-chain metrics from a parquet file in the working directory.
# Later cells expect it to contain 'time' and 'asset' columns.
# NOTE(review): the file name is spelled 'coinmrkets' — confirm it matches the
# file on disk; get_coinmetrics_data is imported but not used to produce it here.
coinmetrics_data = pd.read_parquet('coinmrkets_data.parquet')

In [28]:
# Process coinmetrics data: keep only the non-object metric columns and convert
# them to row-over-row percentage changes, computed separately for each asset.
#
# 'time' and 'asset' are moved into the index before inspecting dtypes so that
# they are not themselves picked up by the object-dtype scan.
types = (
    coinmetrics_data
    .set_index(['time', 'asset'])
    .dtypes
)
get_ride_of_these_columns = list(types[types == 'object'].index)
coinmetrics_data = (
    coinmetrics_data
    .drop(columns=get_ride_of_these_columns)
    .set_index(['time', 'asset'])
    # Group by asset so that a row is only ever compared with the previous row
    # of the SAME asset; an ungrouped pct_change() on the stacked frame would
    # compute changes across asset boundaries.
    # Assumes rows are in chronological order within each asset — TODO confirm.
    .groupby(level='asset')
    .pct_change()
)

  .pct_change()


In [29]:
# Keep only the ETH rows (index level 1 is 'asset').
#
# The metrics were already converted to percentage changes when the frame was
# indexed by (time, asset). Calling pct_change() again here would produce the
# change of a change rather than the change of the underlying metric, so this
# cell only filters.
coinmetrics_data = coinmetrics_data[
    coinmetrics_data.index.get_level_values(1) == 'eth'
]


In [30]:
# Coin Metrics asset code -> Binance ticker.
translate_cm_to_binance = dict(
    eth='ETHUSDT',
    btc='BTCUSDT',
    xrp='XRPUSDT',
)
# Reverse lookup: Binance ticker -> Coin Metrics asset code.
translate_binance_to_cm = dict(
    zip(translate_cm_to_binance.values(), translate_cm_to_binance.keys())
)
# Quote currency recorded for each base asset as its "fees free" pair.
fees_free_pairs = dict.fromkeys(['ETH', 'BTC', 'XRP'], 'FDUSD')

In [31]:
# Build the price-based features from the Binance closes.
# Wide layout first: one row per dateTime, one column per ticker.
close = binance_data.pivot_table(index='dateTime', columns='ticker', values='close')

# Period-over-period return, next-period return (the prediction target) and a
# 20-observation rolling standard deviation of returns.
returns = close.pct_change()
target = returns.shift(-1)
vols = returns.rolling(20).std()

# Reshape each wide frame into a long Series indexed by (ticker, dateTime) and
# give it the name that becomes its column label once the Series are concatenated.
returns = returns.unstack().rename('returns')
vols = vols.unstack().rename('vols_20D')
target = target.unstack().rename('target')

binance_data = (
    pd.concat([returns, vols, target], axis=1)
    .reset_index()
    .assign(asset=lambda frame: frame['ticker'])
)

In [32]:
# Translate Binance tickers into Coin Metrics asset codes so both sources share a key.
cm_asset_codes = binance_data['ticker'].map(translate_binance_to_cm)
# Coin Metrics timestamps are merged against this column, so make the naive
# Binance timestamps timezone-aware (UTC).
utc_timestamps = binance_data['dateTime'].dt.tz_localize('UTC')

binance_data['asset'] = cm_asset_codes
binance_data['time'] = utc_timestamps

In [33]:
# Join on-chain features with the price features on (asset, time); only
# observations present in both sources survive the inner join.
merged_features = coinmetrics_data.merge(
    binance_data,
    how='inner',
    on=['asset', 'time'],
)

# Drop unusable rows by content rather than by position:
#   * rows without a target (e.g. the final observation of each ticker, whose
#     next-period return does not exist yet);
#   * rows without a return (e.g. the first observation of each ticker).
# Slicing off the last len(pairs) rows instead relies on those rows being last
# after the merge, and returns an empty frame when `pairs` is empty.
has_target = merged_features['target'].notna()
has_return = merged_features['returns'].notna()
all_data = (
    merged_features[has_target & has_return]
    .set_index(['time', 'asset', 'ticker', 'dateTime'])
)

In [36]:
# Continue from here

In [None]:
# Treat +/-inf (e.g. from a percentage change off a zero base) as missing, then
# replace every missing value with the sentinel -10000.
all_data = (
    all_data
    .replace([np.inf, -np.inf], np.nan)
    .fillna(-10000)
)

In [None]:
# Chronological split point: the timestamp sitting `proption` of the way
# through the distinct values of the first index level ('time').
proption = 0.8
_distinct_times = all_data.index.get_level_values(0).unique()
nb_obs = len(_distinct_times)
cut_off = _distinct_times[int(nb_obs * proption)]

In [None]:
#all_data = all_data[all_data.index.get_level_values(1)=='eth']

In [None]:
# Binary label: 1 when the next-period return is strictly positive, else 0.
y = (all_data['target'] > 0).astype(int)
# Features are every remaining column; drop() already returns a new frame.
X = all_data.drop(columns='target')

# Chronological split on the first index level ('time').
# Label slices such as .loc[:cut_off] and .loc[cut_off:] both INCLUDE the
# cut-off label, which would put the cut-off rows in train and test at once.
# Boolean masks keep the two sets disjoint: train is strictly before the
# cut-off, test is the cut-off onwards.
_timestamps = all_data.index.get_level_values(0)
_is_train = _timestamps < cut_off
X_train, X_test = X[_is_train], X[~_is_train]
y_train, y_test = y[_is_train], y[~_is_train]

In [None]:
# Standardise the features. The scaler is fitted on the training rows only, so
# no test-set statistics enter the scaling; the same fitted transform is then
# applied to both splits.
# StandardScaler is already imported in the notebook's import cell, so it is
# not re-imported here.
# Note: X_train / X_test are rebound to whatever transform() returns (NumPy
# arrays by default), so they no longer carry the DataFrame index afterwards.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:

# Candidate models, declared as (display name, estimator) pairs so that a label
# and its estimator cannot drift out of step; `names` and `classifiers` are the
# two parallel lists derived from them.
_named_models = [
    ("Neural Net", MLPClassifier(alpha=1, max_iter=1000, random_state=42)),
    ("AdaBoost", AdaBoostClassifier(random_state=42)),
]
names = [label for label, _ in _named_models]
classifiers = [model for _, model in _named_models]


In [None]:
# Metric column names to check against the loaded Coin Metrics frame.
# Same entries and order as before, laid out in rows by shared name prefix.
cols = [
    'AdrActCnt', 'AdrBalCnt', 'AssetEODCompletionTime',
    'BlkCnt', 'BlkSizeMeanByte',
    'CapAct1yrUSD', 'CapMVRVCur', 'CapMVRVFF', 'CapMrktCurUSD', 'CapMrktFFUSD', 'CapRealUSD',
    'DiffLast', 'DiffMean',
    'FeeByteMeanNtv', 'FeeMeanNtv', 'FeeMeanUSD', 'FeeMedNtv', 'FeeMedUSD', 'FeeTotNtv', 'FeeTotUSD',
    'FlowInExNtv', 'FlowInExUSD', 'FlowOutExNtv', 'FlowOutExUSD', 'FlowTfrFromExCnt',
    'GasLmtBlk', 'GasLmtBlkMean', 'GasLmtTx', 'GasLmtTxMean', 'GasUsedTx', 'GasUsedTxMean',
    'IssContNtv', 'IssContPctAnn', 'IssContPctDay', 'IssContUSD', 'IssTotNtv', 'IssTotUSD',
    'NDF', 'NVTAdj', 'NVTAdjFF',
    'PriceBTC', 'PriceUSD',
    'ReferenceRate', 'ReferenceRateBTC', 'ReferenceRateEUR', 'ReferenceRateUSD',
    'RevNtv', 'RevUSD',
    'SER',
    'SplyAct10yr', 'SplyAct180d', 'SplyAct1d', 'SplyAct1yr', 'SplyAct2yr', 'SplyAct30d',
    'SplyAct3yr', 'SplyAct4yr', 'SplyAct5yr', 'SplyAct7d', 'SplyAct90d', 'SplyActEver',
    'SplyActPct1yr',
    'SplyAdrBal1in100K', 'SplyAdrBal1in100M', 'SplyAdrBal1in10B', 'SplyAdrBal1in10K',
    'SplyAdrBal1in10M', 'SplyAdrBal1in1B', 'SplyAdrBal1in1K', 'SplyAdrBal1in1M',
    'SplyAdrBalNtv0.001', 'SplyAdrBalNtv0.01', 'SplyAdrBalNtv0.1', 'SplyAdrBalNtv1',
    'SplyAdrBalNtv10', 'SplyAdrBalNtv100', 'SplyAdrBalNtv100K', 'SplyAdrBalNtv10K',
    'SplyAdrBalNtv1K', 'SplyAdrBalNtv1M',
    'SplyAdrBalUSD1', 'SplyAdrBalUSD10', 'SplyAdrBalUSD100', 'SplyAdrBalUSD100K',
    'SplyAdrBalUSD10K', 'SplyAdrBalUSD10M', 'SplyAdrBalUSD1K', 'SplyAdrBalUSD1M',
    'SplyAdrTop100', 'SplyAdrTop10Pct', 'SplyAdrTop1Pct',
    'SplyCur', 'SplyExpFut10yr', 'SplyFF',
    'TxCnt', 'TxCntSec', 'TxTfrCnt',
    'TxTfrValAdjNtv', 'TxTfrValAdjUSD', 'TxTfrValMeanNtv', 'TxTfrValMeanUSD',
    'TxTfrValMedNtv', 'TxTfrValMedUSD',
    'VelCur1yr',
]

In [None]:
# Names listed in `cols` that are absent from the loaded Coin Metrics frame.
set(cols).difference(coinmetrics_data.columns)

{'GasLmtBlk',
 'GasLmtBlkMean',
 'GasLmtTx',
 'GasLmtTxMean',
 'GasUsedTx',
 'GasUsedTxMean',
 'ReferenceRateBTC'}

In [None]:
# Fit every candidate model on the training split and collect the fitted
# estimators in the same order as `classifiers`. scikit-learn's fit() returns
# the estimator itself, so its result can be collected directly.
trained_classifiers = [
    model.fit(X_train, y_train)
    for model in classifiers
]

ValueError: Input X contains NaN.
MLPClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Per-row strategy return of the first trained model on the test period.
# The signal is the second predict_proba column minus the first — presumably
# P(up) - P(down), given the 0/1 labels; verify against the model's classes_.
test_probabilities = trained_classifiers[0].predict_proba(X_test)
signal = test_probabilities.dot([-1, 1])
# Realised next-period return for every row from the cut-off onwards.
realised_return = all_data.loc[cut_off:]['target']
res = signal * realised_return

In [None]:
# Equity curve: sum the strategy returns per timestamp, then compound them.
pnl_by_time = res.reset_index()[['time', 'target']].groupby('time').sum()
pnl_by_time.add(1).cumprod().plot()

In [None]:
# Mean over standard deviation of the per-timestamp strategy return, scaled by
# sqrt(365) — presumably an annualised Sharpe-style ratio on daily data; confirm
# the sampling frequency before quoting it.
pnl_by_time = res.reset_index()[['time', 'target']].groupby('time').sum()
pnl_by_time.mean() / pnl_by_time.std() * np.sqrt(365)