# Load rankings and categories

In [3]:
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import json,os,pdb
import tqdm
import warnings
import yfinance as yf

# Load data


In [2]:
# Daily price history; the frame carries a two-level index whose second level holds dates.
df = pd.read_parquet('~/work/project/daily_20221004.parquet.gzip')  # modify the path according to your needs
# Keep only the last dot-separated part of every column name and strip underscores from it.
df.columns = df.columns.map(lambda name: name.split('.')[-1].replace('_', ''))
# Parse the second index level into timestamps; the first level is left as it is.
df.index = df.index.set_levels(pd.to_datetime(df.index.levels[1]), level=1)

Load the listing file and check for the rankings

In [3]:
# Read the listing snapshot and turn its 'data' records into a table keyed by rank.
with open('listing_map_20221004.json', encoding='utf-8') as json_data:
    data = json.load(json_data)
listmap = (
    pd.DataFrame(data['data'])
    .sort_values('cmc_rank')
    .set_index('cmc_rank')
)
# Composite "<symbol>_<id>" key.
listmap['symbol_id'] = listmap['symbol'] + '_' + listmap['id'].astype(str)

In [4]:
# Peek at the first rows of the loaded price frame.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,timeopen,timehigh,timelow,open,high,low,close,volume,marketcap,timestamp,symbol,id
symbol_id,time_close,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
MPH_7742,2020-11-23 00:59:59.999000+00:00,2020-11-23T00:00:00.000Z,2020-11-23T00:00:07.000Z,2020-11-23T00:37:05.000Z,90.554933,90.554933,79.85233,85.599205,16008510.0,0.0,2020-11-23T00:59:59.999Z,MPH,7742
MPH_7742,2020-11-24 00:59:59.999000+00:00,2020-11-24T00:00:00.000Z,2020-11-24T00:00:07.000Z,2020-11-24T00:40:06.000Z,85.16103,85.16103,80.694133,81.645754,10097380.0,0.0,2020-11-24T00:59:59.999Z,MPH,7742
MPH_7742,2020-11-25 00:59:59.999000+00:00,2020-11-25T00:00:00.000Z,2020-11-25T00:00:06.000Z,2020-11-25T00:58:05.000Z,57.527366,57.527366,53.679986,53.704963,8899399.0,0.0,2020-11-25T00:59:59.999Z,MPH,7742
MPH_7742,2020-11-26 00:59:59.999000+00:00,2020-11-26T00:00:00.000Z,2020-11-26T00:56:05.000Z,2020-11-26T00:13:06.000Z,39.466839,40.121814,38.980499,40.056399,4225065.0,0.0,2020-11-26T00:59:59.999Z,MPH,7742
MPH_7742,2020-11-27 00:59:59.999000+00:00,2020-11-27T00:00:00.000Z,2020-11-27T00:57:05.000Z,2020-11-27T00:16:05.000Z,30.002092,30.391962,29.690305,30.383005,2933446.0,0.0,2020-11-27T00:59:59.999Z,MPH,7742


# Beginning Analysis

In [None]:
# Work on a fresh frame with the index levels flattened into ordinary columns.
df2 = df.reset_index()
# Calendar parts of each row's opening timestamp.
opened = pd.DatetimeIndex(df2['timeopen'])
df2['year'], df2['month'], df2['day'] = opened.year, opened.month, opened.day
# First and last stored row per symbol (depends on the frame's existing row order).
df2_first = df2.drop_duplicates(subset='symbol_id', keep='first')
df2_last = df2.drop_duplicates(subset='symbol_id', keep='last').copy()
# Each coin's share of the summed last-known market caps, largest first,
# followed by the running total of those shares.
share = df2_last['marketcap'] / df2_last['marketcap'].sum()
df2_last = df2_last.assign(market_perc=share).sort_values('market_perc', ascending=False)
df2_last['cum_perc'] = df2_last['market_perc'].cumsum()

In [None]:
# Ten largest coins by share of the summed last-known market cap
# (df2_last was sorted in descending order of that share above).
df2_last.head(10)

In [None]:
# Largest coins whose cumulative share of the summed market cap stays within 91%.
df_cutoff = df2_last[df2_last['cum_perc'] <= .91]
# Coins whose last close lies strictly between 0.99 and 1.01 are set aside.
near_one = (df_cutoff['close'] > .99) & (df_cutoff['close'] < 1.01)
tethers = df_cutoff.loc[near_one, 'symbol_id'].values
df_cutoff = df_cutoff[~df_cutoff['symbol_id'].isin(tethers)]
# Keep only coins whose last row falls in October-December 2022.
recent = (df_cutoff['year'] == 2022) & (df_cutoff['month'] >= 10)
df_cutoff = df_cutoff[recent]
# First-row records of the coins that survived the filters above.
df2_cutoff = df2_first[df2_first['symbol_id'].isin(df_cutoff['symbol_id'])]
# Coins whose first row is dated after August 2020, collected in two passes.
first_year, first_month = df2_cutoff['year'], df2_cutoff['month']
symbols = df2_cutoff.loc[first_year > 2020, 'symbol_id'].values
symbols2 = df2_cutoff.loc[(first_year == 2020) & (first_month > 8), 'symbol_id'].values
all_symbols = np.concatenate((symbols, symbols2))

In [None]:
# Style and default size are applied BEFORE the figure is created: rcParams changed
# after plt.figure() do not resize the figure that already exists.
# The 'seaborn' style was renamed 'seaborn-v0_8' in newer Matplotlib releases and the
# old name was later removed, so pick whichever name this installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
plt.rcParams['figure.figsize'] = [15, 15]
plt.figure(1)
# Plot only the selected coins that are NOT in all_symbols.
plotted_coins = df_cutoff.loc[~df_cutoff['symbol_id'].isin(all_symbols)]
sns.barplot(data=plotted_coins, x="symbol_id", y="market_perc")
plt.xlabel('Symbol ID')
plt.ylabel('Market Cap Percentage')
plt.title('Market Cap Percentage for all Coins in List')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Number of coins left after the cumulative-share, near-1 close and last-date filters.
print(len(df2_cutoff))

In [None]:
# Number of those coins whose first stored row is dated after August 2020.
print(len(all_symbols))

In [None]:
# Start from the full price frame with its index levels as ordinary columns.
df3 = df.reset_index()
# Keep rows of the selected coins, excluding those listed in all_symbols.
selected = df3['symbol_id'].isin(df_cutoff['symbol_id'])
excluded = df3['symbol_id'].isin(all_symbols)
# .copy() makes df3 an independent frame, so the column assignments in later cells
# write to df3 itself instead of to a slice (which triggers SettingWithCopyWarning).
df3 = df3.loc[selected & ~excluded].copy()

In [None]:
# Calendar features taken from the closing timestamp.
closing = df3['time_close'].dt
df3['month'], df3['day'], df3['weekday'] = closing.month, closing.day, closing.weekday
# Despite its name, 'day of week' is a flag: 0 when weekday > 4 (Saturday/Sunday), otherwise 1.
is_weekend = df3['weekday'] > 4
df3['day of week'] = np.where(is_weekend, 0, 1)

In [None]:
# Reduce the closing timestamp to a plain calendar date.
df3['time_close'] = df3['time_close'].dt.date
# Every coin starts at the earliest date available for DOT_6636.
dot_start = df3.loc[df3['symbol_id'] == 'DOT_6636', 'time_close'].min()
df3 = df3[df3['time_close'] >= dot_start]
# Discard rows without a volume figure.
df3 = df3.dropna(subset=['volume'])
# Filter against the frame's own earliest date (keeps every remaining row).
earliest = df3['time_close'].min()
df3 = df3[df3['time_close'] >= earliest]
all_syms = df3['symbol_id'].unique()

In [None]:
# Keep a single row per (coin, date) pair.
df3 = df3.drop_duplicates(subset=['symbol_id', 'time_close'])
# Align all coins on a shared calendar: for each coin in turn, keep only the rows
# (of every coin) whose date also appears among that coin's rows. df3 shrinks on
# every pass, so the end result holds only dates on which every coin in all_syms
# has a row.
for syms_in in all_syms:
    miss = df3.loc[df3['symbol_id'] == syms_in]  # rows of the current coin that are still present
    df3 = df3[df3['time_close'].isin(miss['time_close'])]

In [None]:
# Day-over-day changes are computed per coin, so a change is never measured between
# the last row of one symbol and the first row of the next one.
per_coin = df3.groupby('symbol_id')
df3['price change'] = per_coin['close'].pct_change()
df3['volume change'] = per_coin['volume'].pct_change()
# Prediction target: the following row's price change for the same coin.
df3['tomorrow change'] = df3.groupby('symbol_id')['price change'].shift(-1)

# The first row of every coin has no previous day and the last row has no next day;
# drop both so that no NaN changes remain at the edges.
df3 = df3.drop(df3.groupby('symbol_id')['price change'].head(1).index)
df3 = df3.drop(df3.groupby('symbol_id')['tomorrow change'].tail(1).index)

In [None]:
def get_info(tick, start='2020-09-24', end='2022-10-3'):
    """Download one Yahoo Finance series and add it to the global ``df3`` as a column.

    Parameters
    ----------
    tick : str
        Ticker to download; also used as the name of the new ``df3`` column.
    start, end : str
        Date range handed to ``yf.download``. The defaults are the dates that
        were previously hard-coded in this function.

    Returns
    -------
    None
        The function works by side effect: it writes ``df3[tick]``.

    Notes
    -----
    The assignment aligns the downloaded series with ``df3`` on the index, so
    ``df3`` must already be indexed by date when this is called (the calling
    cell sets ``time_close`` as the index first).
    """
    # auto_adjust=False keeps the separate 'Adj Close' column; recent yfinance
    # releases adjust prices by default and then no longer return that column.
    stock = yf.download(tick,
                        start=start,
                        end=end,
                        progress=False,
                        auto_adjust=False)
    df3[tick] = stock['Adj Close']
    # Dates without a downloaded value take the most recent earlier value,
    # filled separately within each symbol_id group.
    df3[tick] = df3.groupby(['symbol_id'])[tick].ffill()

In [None]:
# get_info aligns the downloaded series on the index, so index df3 by date first.
df3 = df3.set_index('time_close')
ticks = ['SPY', '^VIX', '^TNX', 'GC=F', 'CL=F', 'EURUSD=X']
for tick in ticks:
    get_info(tick)
# Replace the raw ticker symbols with readable column names ('SPY' keeps its name).
readable_names = {
    '^TNX': '10Y Treasury',
    '^VIX': 'VIX',
    'GC=F': 'Gold',
    'CL=F': 'Oil',
    'EURUSD=X': 'EUR_USD',
}
df3 = df3.rename(columns=readable_names)

In [None]:
# Drop the raw columns that are no longer needed, then index by (coin, date).
unused_columns = ['timeopen', 'timehigh', 'timelow', 'open', 'high', 'low',
                  'timestamp', 'symbol', 'id']
df3 = (
    df3.drop(columns=unused_columns)
    .reset_index()
    .set_index(['symbol_id', 'time_close'])
)

## Removed all rows with NaN volume, plus the first and last entry for each coin. The data starts at 2020-09-24 and ends at 2022-10-03.

In [None]:
# Preview the cleaned frame, now indexed by (symbol_id, time_close).
df3.head()

In [None]:
# One-off export of the cleaned frame, left disabled. The next cell reads
# clean_df.csv, so that file must already exist for the notebook to run end to end;
# uncomment the line below to regenerate it.
# df3.to_csv('clean_df.csv',index=True)

In [181]:
# Reload the cleaned data from disk. No date parsing is requested here, so
# time_close comes back as text rather than as date objects.
df3 = pd.read_csv("clean_df.csv")
# NOTE: this rebinds `df`, replacing the raw price frame loaded at the top of the
# notebook; earlier cells that expect the raw frame cannot be re-run after this one.
df = df3.set_index(["symbol_id","time_close"])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,close,volume,marketcap,month,day,weekday,day of week,price change,volume change,tomorrow change,SPY,VIX,10Y Treasury,Gold,Oil,EUR_USD
symbol_id,time_close,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
BTCB_4023,2020-09-24,10475.684721,1.138708e+04,0.000000e+00,9,24,3,1,-0.010546,1.915524,0.039526,314.251007,28.510000,0.666,1868.300049,40.310001,1.165854
BTCB_4023,2020-09-25,10889.743248,1.384086e+03,0.000000e+00,9,25,4,1,0.039526,-0.878451,-0.029131,319.331482,26.379999,0.659,1857.699951,40.250000,1.167324
BTCB_4023,2020-09-26,10572.518703,3.690903e+03,0.000000e+00,9,26,5,0,-0.029131,1.666671,0.019034,319.331482,26.379999,0.659,1857.699951,40.250000,1.167324
BTCB_4023,2020-09-27,10773.752004,6.771302e+03,0.000000e+00,9,27,6,0,0.019034,0.834592,0.006752,319.331482,26.379999,0.659,1857.699951,40.250000,1.167324
BTCB_4023,2020-09-28,10846.499092,3.646642e+03,0.000000e+00,9,28,0,1,0.006752,-0.461456,-0.013387,324.635376,26.190001,0.663,1872.800049,40.599998,1.163535
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FTT_4195,2022-09-29,24.068050,7.976498e+07,3.224523e+09,9,29,3,1,0.015881,-0.219820,0.009143,362.790009,31.840000,3.747,1658.500000,81.230003,0.970817
FTT_4195,2022-09-30,24.288116,7.723532e+07,3.254006e+09,9,30,4,1,0.009143,-0.031714,-0.002232,357.179993,31.620001,3.804,1662.400024,79.489998,0.982956
FTT_4195,2022-10-01,24.233912,7.605522e+07,3.246744e+09,10,1,5,0,-0.002232,-0.015279,0.000811,357.179993,31.620001,3.804,1662.400024,79.489998,0.982956
FTT_4195,2022-10-02,24.253569,4.738931e+07,3.249378e+09,10,2,6,0,0.000811,-0.376909,-0.007096,357.179993,31.620001,3.804,1662.400024,79.489998,0.982956


In [182]:
# List the distinct coins present in the cleaned frame.
df.index.unique(level='symbol_id')

Index(['BTCB_4023', 'CRO_3635', 'LEO_3957', 'XMR_328', 'DOGE_74', 'QNT_3155',
       'ADA_2010', 'XLM_512', 'ALGO_4030', 'SOL_5426', 'BCH_1831', 'TRX_1958',
       'HEX_5015', 'BNB_1839', 'WBTC_3717', 'XRP_52', 'SHIB_5994', 'VET_3077',
       'DOT_6636', 'LTC_2', 'BTC_1', 'ATOM_3794', 'LINK_1975', 'FIL_2280',
       'MATIC_3890', 'ETH_1027', 'ETC_1321', 'FTT_4195'],
      dtype='object', name='symbol_id')

In [189]:
# Take the rows stored under 'ETH_1027' and turn the remaining date index back into a column.
ETH = df.loc['ETH_1027'].reset_index()
ETH

Unnamed: 0,time_close,close,volume,marketcap,month,day,weekday,day of week,price change,volume change,tomorrow change,SPY,VIX,10Y Treasury,Gold,Oil,EUR_USD
0,2020-09-24,322.801017,1.406652e+10,3.638855e+10,9,24,3,1,-0.059389,-0.001510,0.078154,314.251007,28.510000,0.666,1868.300049,40.310001,1.165854
1,2020-09-25,348.029329,1.531143e+10,3.923720e+10,9,25,4,1,0.078154,0.088501,0.010122,319.331482,26.379999,0.659,1857.699951,40.250000,1.167324
2,2020-09-26,351.552190,1.362715e+10,3.963910e+10,9,26,5,0,0.010122,-0.110002,0.021704,319.331482,26.379999,0.659,1857.699951,40.250000,1.167324
3,2020-09-27,359.182175,1.245056e+10,4.050423e+10,9,27,6,0,0.021704,-0.086342,-0.002503,319.331482,26.379999,0.659,1857.699951,40.250000,1.167324
4,2020-09-28,358.283277,1.363118e+10,4.040762e+10,9,28,0,1,-0.002503,0.094825,-0.008407,324.635376,26.190001,0.663,1872.800049,40.599998,1.163535
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,2022-09-29,1333.964086,1.884855e+10,1.635036e+11,9,29,3,1,0.000887,0.064061,0.002108,362.790009,31.840000,3.747,1658.500000,81.230003,0.970817
685,2022-09-30,1336.776474,1.412469e+10,1.638675e+11,9,30,4,1,0.002108,-0.250622,-0.006693,357.179993,31.620001,3.804,1662.400024,79.489998,0.982956
686,2022-10-01,1327.829013,1.365689e+10,1.627897e+11,10,1,5,0,-0.006693,-0.033120,-0.012576,357.179993,31.620001,3.804,1662.400024,79.489998,0.982956
687,2022-10-02,1311.130753,6.175119e+09,1.607612e+11,10,2,6,0,-0.012576,-0.547839,-0.022535,357.179993,31.620001,3.804,1662.400024,79.489998,0.982956


In [184]:
# Previous row's 'tomorrow change', moved down by one position: row i receives the
# value of row i-1 and the first row is NaN.
# shift(1) builds a new Series. Reassigning `.index` on the Series returned by
# ETH[...] would instead relabel the column object that the frame itself may hand
# out again on later lookups.
Lagged_variables = ETH["tomorrow change"].shift(1)
ETH['lagged change'] = Lagged_variables
ETH


Unnamed: 0,time_close,close,volume,marketcap,month,day,weekday,day of week,price change,volume change,tomorrow change,SPY,VIX,10Y Treasury,Gold,Oil,EUR_USD,lagged change
0,2020-09-24,322.801017,1.406652e+10,3.638855e+10,9,24,3,1,-0.059389,-0.001510,0.078154,314.251007,28.510000,0.666,1868.300049,40.310001,1.165854,
1,2020-09-25,348.029329,1.531143e+10,3.923720e+10,9,25,4,1,0.078154,0.088501,0.010122,319.331482,26.379999,0.659,1857.699951,40.250000,1.167324,0.078154
2,2020-09-26,351.552190,1.362715e+10,3.963910e+10,9,26,5,0,0.010122,-0.110002,0.021704,319.331482,26.379999,0.659,1857.699951,40.250000,1.167324,0.010122
3,2020-09-27,359.182175,1.245056e+10,4.050423e+10,9,27,6,0,0.021704,-0.086342,-0.002503,319.331482,26.379999,0.659,1857.699951,40.250000,1.167324,0.021704
4,2020-09-28,358.283277,1.363118e+10,4.040762e+10,9,28,0,1,-0.002503,0.094825,-0.008407,324.635376,26.190001,0.663,1872.800049,40.599998,1.163535,-0.002503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,2022-09-29,1333.964086,1.884855e+10,1.635036e+11,9,29,3,1,0.000887,0.064061,0.002108,362.790009,31.840000,3.747,1658.500000,81.230003,0.970817,0.000887
685,2022-09-30,1336.776474,1.412469e+10,1.638675e+11,9,30,4,1,0.002108,-0.250622,-0.006693,357.179993,31.620001,3.804,1662.400024,79.489998,0.982956,0.002108
686,2022-10-01,1327.829013,1.365689e+10,1.627897e+11,10,1,5,0,-0.006693,-0.033120,-0.012576,357.179993,31.620001,3.804,1662.400024,79.489998,0.982956,-0.006693
687,2022-10-02,1311.130753,6.175119e+09,1.607612e+11,10,2,6,0,-0.012576,-0.547839,-0.022535,357.179993,31.620001,3.804,1662.400024,79.489998,0.982956,-0.012576


In [123]:
# Column names currently present in the ETH frame.
ETH.columns

Index(['time_close', 'close', 'volume', 'marketcap', 'month', 'day', 'weekday',
       'day of week', 'price change', 'volume change', 'tomorrow change',
       'SPY', 'VIX', '10Y Treasury', 'Gold', 'Oil', 'EUR_USD'],
      dtype='object')

In [201]:

# Feature matrix: the six market series that were added to the frame from Yahoo Finance.
X = ETH[['SPY','VIX','10Y Treasury','Gold','Oil','EUR_USD']]
X

Unnamed: 0,SPY,VIX,10Y Treasury,Gold,Oil,EUR_USD
0,314.251007,28.510000,0.666,1868.300049,40.310001,1.165854
1,319.331482,26.379999,0.659,1857.699951,40.250000,1.167324
2,319.331482,26.379999,0.659,1857.699951,40.250000,1.167324
3,319.331482,26.379999,0.659,1857.699951,40.250000,1.167324
4,324.635376,26.190001,0.663,1872.800049,40.599998,1.163535
...,...,...,...,...,...,...
684,362.790009,31.840000,3.747,1658.500000,81.230003,0.970817
685,357.179993,31.620001,3.804,1662.400024,79.489998,0.982956
686,357.179993,31.620001,3.804,1662.400024,79.489998,0.982956
687,357.179993,31.620001,3.804,1662.400024,79.489998,0.982956


In [210]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column. The scaler is fitted on all rows of X,
# i.e. before any train/test split is made.
sc = StandardScaler()
X = sc.fit_transform(X)
X

array([[-2.64163448,  1.15923175, -1.55612578,  0.74095543, -1.79744512,
         0.44393679],
       [-2.50025879,  0.73946326, -1.56515739,  0.59196342, -1.80033554,
         0.46531213],
       [-2.50025879,  0.73946326, -1.56515739,  0.59196342, -1.80033554,
         0.46531213],
       ...,
       [-1.44703844,  1.77213298,  2.49260068, -2.15311774,  0.08995605,
        -2.21587516],
       [-1.44703844,  1.77213298,  2.49260068, -2.15311774,  0.08995605,
        -2.21587516],
       [-1.44703844,  1.77213298,  2.49260068, -2.15311774,  0.08995605,
        -2.26948947]])

In [207]:
# Copy the target out of the frame. `.values` can hand back a view of ETH's own
# data, and y is overwritten with class labels further down, which would silently
# rewrite ETH's 'tomorrow change' column as well.
y = ETH["tomorrow change"].to_numpy(copy=True)
# NOTE(review): X is rebuilt from the raw columns here, so any scaling applied to X
# earlier in the notebook is discarded from this point on. Confirm this is intended.
X = ETH[['SPY','VIX','10Y Treasury','Gold','Oil','EUR_USD']].values

In [211]:
# Display the target array.
y

array([ 7.81543759e-02,  1.01223102e-02,  2.17037050e-02, -2.50262293e-03,
       -8.40729632e-03,  7.54903838e-03,  5.98871352e-03, -2.27273817e-02,
       -1.98105828e-02,  7.48560397e-03,  1.98573353e-02, -5.08189303e-03,
       -3.69563149e-02,  2.14396646e-03,  2.73080016e-02,  4.97872690e-02,
        8.80601436e-03,  1.02920594e-02,  3.32829438e-02, -3.52488425e-03,
       -1.66415177e-02, -7.50556186e-04, -3.66759286e-02,  1.42985147e-02,
        1.45485765e-02,  1.00240029e-02, -1.59384039e-02,  6.28402463e-02,
        3.03572029e-02, -4.79298358e-02,  5.34736466e-03, -1.72300238e-02,
        7.59421245e-03,  3.94077674e-02, -4.74603480e-02,  4.88614818e-03,
        4.38624854e-02,  5.20879502e-02,  9.30920855e-02, -3.96407210e-02,
        4.05348838e-02,  2.22324206e-02, -2.42060796e-02, -2.61799847e-02,
        3.43436467e-02,  4.43356874e-02, -1.19434457e-02, -1.44964186e-02,
        9.49466789e-02,  5.68928894e-02,  1.44939497e-02, -6.44150502e-02,
        3.64612644e-02,  

In [159]:
# Turn next-day returns r into four ordinal classes (kept as floats):
#   0: r <= -5%     1: -5% < r < 0     2: 0 <= r < 5%     3: r >= 5%
# A return of exactly 5% now lands in class 3; previously it matched none of the
# conditions and stayed unlabelled. NaN returns stay NaN.
# The labels go into a NEW array, so the array y came from is left untouched.
# NOTE: re-running this cell on an already labelled y relabels it; rebuild y from
# the raw returns first.
y = np.asarray(y, dtype=float)
y = np.select(
    [y <= -0.05, y < 0, y < 0.05, y >= 0.05],  # the first matching condition wins
    [0.0, 1.0, 2.0, 3.0],
    default=np.nan,
)
y

array([3., 2., 2., 1., 1., 2., 2., 1., 1., 2., 2., 1., 1., 2., 2., 2., 2.,
       2., 2., 1., 1., 1., 1., 2., 2., 2., 1., 3., 2., 1., 2., 1., 2., 2.,
       1., 2., 2., 3., 3., 1., 2., 2., 1., 1., 2., 2., 1., 1., 3., 3., 2.,
       0., 2., 3., 2., 0., 3., 1., 0., 2., 2., 2., 3., 1., 3., 2., 1., 3.,
       3., 3., 3., 0., 0., 0., 3., 1., 2., 2., 2., 3., 0., 3., 0., 1., 0.,
       3., 1., 0., 3., 3., 3., 1., 3., 1., 1., 3., 2., 1., 2., 2., 1., 1.,
       1., 1., 3., 2., 2., 1., 1., 0., 0., 3., 0., 2., 1., 1., 3., 1., 3.,
       0., 2., 3., 2., 2., 2., 1., 2., 1., 3., 0., 2., 2., 1., 2., 1., 1.,
       1., 1., 1., 2., 2., 2., 1., 3., 2., 2., 2., 3., 1., 2., 2., 1., 0.,
       3., 1., 2., 2., 2., 3., 3., 2., 1., 0., 1., 0., 3., 3., 1., 1., 1.,
       3., 2., 3., 2., 2., 3., 2., 3., 2., 2., 2., 2., 3., 2., 1., 3., 0.,
       1., 3., 0., 0., 1., 2., 0., 3., 1., 0., 3., 1., 3., 1., 0., 0., 3.,
       3., 0., 2., 2., 1., 0., 3., 0., 0., 3., 0., 1., 2., 2., 2., 1., 0.,
       0., 1., 2., 0., 2.

In [185]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed, stratified on y so that both parts keep the same
# label proportions.
# NOTE(review): train_test_split shuffles rows by default, while the rows here are
# ordered by date; confirm that mixing past and future rows is acceptable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
X_train, X_test, y_train, y_test = split_parts
y

array([3., 2., 2., 1., 1., 2., 2., 1., 1., 2., 2., 1., 1., 2., 2., 2., 2.,
       2., 2., 1., 1., 1., 1., 2., 2., 2., 1., 3., 2., 1., 2., 1., 2., 2.,
       1., 2., 2., 3., 3., 1., 2., 2., 1., 1., 2., 2., 1., 1., 3., 3., 2.,
       0., 2., 3., 2., 0., 3., 1., 0., 2., 2., 2., 3., 1., 3., 2., 1., 3.,
       3., 3., 3., 0., 0., 0., 3., 1., 2., 2., 2., 3., 0., 3., 0., 1., 0.,
       3., 1., 0., 3., 3., 3., 1., 3., 1., 1., 3., 2., 1., 2., 2., 1., 1.,
       1., 1., 3., 2., 2., 1., 1., 0., 0., 3., 0., 2., 1., 1., 3., 1., 3.,
       0., 2., 3., 2., 2., 2., 1., 2., 1., 3., 0., 2., 2., 1., 2., 1., 1.,
       1., 1., 1., 2., 2., 2., 1., 3., 2., 2., 2., 3., 1., 2., 2., 1., 0.,
       3., 1., 2., 2., 2., 3., 3., 2., 1., 0., 1., 0., 3., 3., 1., 1., 1.,
       3., 2., 3., 2., 2., 3., 2., 3., 2., 2., 2., 2., 3., 2., 1., 3., 0.,
       1., 3., 0., 0., 1., 2., 0., 3., 1., 0., 3., 1., 3., 1., 0., 0., 3.,
       3., 0., 2., 2., 1., 0., 3., 0., 0., 3., 0., 1., 2., 2., 2., 1., 0.,
       0., 1., 2., 0., 2.

In [171]:
# from sklearn.ensemble import BaggingClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import BaggingRegressor
# from sklearn.tree import DecisionTreeRegressor
# # for comparison
# tree = DecisionTreeClassifier(criterion='entropy', 
#                               max_depth=None,
#                               random_state=1)

# bag = BaggingClassifier(base_estimator=tree,
#                         n_estimators=500, 
#                         max_samples=1.0, 
#                         max_features=1.0, 
#                         bootstrap=True, 
#                         bootstrap_features=False, 
#                         n_jobs=1, 
#                         random_state=1)

In [212]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
# Single fully grown regression tree, kept for comparison with the bagged ensemble.
tree = DecisionTreeRegressor(criterion='squared_error',
                             splitter='best',
                             max_depth=None,
                             min_samples_split=2,
                             min_samples_leaf=1,
                             min_weight_fraction_leaf=0.0,
                             max_features=None,
                             random_state=1,
                             max_leaf_nodes=None,
                             min_impurity_decrease=0.0,
                             ccp_alpha=0.0)

# 500 bootstrap-resampled copies of that tree.
# The estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was later
# removed, whereas the first positional argument works in both old and new releases.
bag = BaggingRegressor(tree,
                       n_estimators=500,
                       max_samples=1.0,
                       max_features=1.0,
                       bootstrap=True,
                       bootstrap_features=False,
                       n_jobs=1,
                       random_state=1)

In [172]:
# from sklearn.metrics import accuracy_score

# tree = tree.fit(X_train, y_train)
# y_train_pred = tree.predict(X_train)
# y_test_pred = tree.predict(X_test)

# tree_train = accuracy_score(y_train, y_train_pred)
# tree_test = accuracy_score(y_test, y_test_pred)
# print('Decision tree train/test accuracies %.3f/%.3f'
#       % (tree_train, tree_test))

# bag = bag.fit(X_train, y_train)
# y_train_pred = bag.predict(X_train)
# y_test_pred = bag.predict(X_test)

# bag_train = accuracy_score(y_train, y_train_pred) 
# bag_test = accuracy_score(y_test, y_test_pred) 
# print('Bagging train/test accuracies %.3f/%.3f'
#       % (bag_train, bag_test))

Decision tree train/test accuracies 0.853/0.377
Bagging train/test accuracies 0.853/0.406


In [213]:
from sklearn.metrics import r2_score

# Fit the single tree and score it on both splits.
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)

tree_train = r2_score(y_train, y_train_pred)
tree_test = r2_score(y_test, y_test_pred)
# The reported numbers are R^2 scores, so the label says so (it used to read "accuracies").
print('Decision tree train/test R^2 %.3f/%.3f'
      % (tree_train, tree_test))

# Same procedure for the bagged ensemble.
bag = bag.fit(X_train, y_train)
y_train_pred = bag.predict(X_train)
y_test_pred = bag.predict(X_test)

bag_train = r2_score(y_train, y_train_pred)
bag_test = r2_score(y_test, y_test_pred)
print('Bagging train/test R^2 %.3f/%.3f'
      % (bag_train, bag_test))

Decision tree train/test accuracies 0.775/-0.934
Bagging train/test accuracies 0.674/-0.209


In [175]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Depth-1 tree used as the weak learner.
tree = DecisionTreeClassifier(criterion='entropy',
                              max_depth=1,
                              random_state=1)

# The weak learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was later
# removed, whereas the first positional argument works in both old and new releases.
ada = AdaBoostClassifier(tree,
                         n_estimators=500,
                         learning_rate=0.1,
                         random_state=1)

In [176]:
# accuracy_score is imported here because the only other import of it in this
# notebook sits in a commented-out cell, so a fresh kernel would raise NameError.
from sklearn.metrics import accuracy_score

# Fit the single weak learner and score it on both splits.
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)

tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f'
      % (tree_train, tree_test))

# Same procedure for the boosted ensemble.
ada = ada.fit(X_train, y_train)
y_train_pred = ada.predict(X_train)
y_test_pred = ada.predict(X_test)

ada_train = accuracy_score(y_train, y_train_pred)
ada_test = accuracy_score(y_test, y_test_pred)
print('AdaBoost train/test accuracies %.3f/%.3f'
      % (ada_train, ada_test))

Decision tree train/test accuracies 0.379/0.377
AdaBoost train/test accuracies 0.475/0.377


In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# NOTE(review): AdaBoostRegressor is imported but never used; the code below builds
# the same classifier as the previous AdaBoost cell. If a regression variant was
# intended, both the estimators here and the metric in the following cell need to change.
tree = DecisionTreeClassifier(criterion='entropy',
                              max_depth=1,
                              random_state=1)

# The weak learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was later
# removed, whereas the first positional argument works in both old and new releases.
ada = AdaBoostClassifier(tree,
                         n_estimators=500,
                         learning_rate=0.1,
                         random_state=1)

In [None]:
# accuracy_score is imported here because the only other import of it in this
# notebook sits in a commented-out cell, so a fresh kernel would raise NameError.
from sklearn.metrics import accuracy_score

# Fit the single weak learner and score it on both splits.
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)

tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f'
      % (tree_train, tree_test))

# Same procedure for the boosted ensemble.
ada = ada.fit(X_train, y_train)
y_train_pred = ada.predict(X_train)
y_test_pred = ada.predict(X_test)

ada_train = accuracy_score(y_train, y_train_pred)
ada_test = accuracy_score(y_test, y_test_pred)
print('AdaBoost train/test accuracies %.3f/%.3f'
      % (ada_train, ada_test))

In [None]:
# NOTE(review): `btc` is not defined in the part of the notebook shown here;
# presumably it is a per-coin frame built like ETH. Confirm before running.
# DataFrame.plot opens a figure of its own when no axes are supplied, which left
# the figure created by plt.figure(2) empty. Create figure 2 together with its
# axes and draw on those instead.
fig, ax = plt.subplots(num=2, figsize=(15, 10), clear=True)
btc.plot(y=['close', 'volume', 'marketcap',
            'SPY', 'VIX', '10Y Treasury', 'Gold', 'Oil'],
         logy=True,
         cmap='tab10',
         ax=ax)
ax.set_xlabel('Date')
ax.set_ylabel('Price or volume')
ax.set_title('BTC price plotted against other prices and volume')
plt.show()

In [None]:
plt.figure(3)
# Histogram of next-day changes, keeping only values strictly between the
# 1st and the 99th percentile.
next_day = btc['tomorrow change']
q3, q1 = next_day.quantile([0.99, 0.01])
inside = (next_day < q3) & (next_day > q1)
next_day[inside].plot.hist(density=1, bins=55)
plt.title('BTC price change histogram')
plt.show()

In [None]:
plt.figure(4)
# Spread of the next-day change for each weekday value, with outliers hidden.
weekday_ax = sns.boxplot(x='weekday', y='tomorrow change', data=btc, showfliers=False)
weekday_ax.set_xlabel('Day of the week')
weekday_ax.set_ylabel('Price change')
weekday_ax.set_title('Boxplot for price change and day of week')
plt.show()

In [None]:
plt.figure(5)
# numeric_only=True restricts the correlation to numeric columns. From pandas 2.0
# on, corr() no longer skips non-numeric columns by itself and raises on them.
# NOTE(review): `btc` is not defined in the part of the notebook shown here;
# presumably it has the same columns as ETH, including a text date column.
corrmat = btc.corr(numeric_only=True)
sns.heatmap(corrmat, cmap="RdBu", linewidths=.5, vmin=-1, vmax=1, annot=True)
plt.title('BTC correlation matrix')
plt.show()

In [None]:
plt.figure(6)
non_day_columns = ['close', 'volume', 'marketcap','SPY', 'VIX', '10Y Treasury', 'Gold', 'Oil']
plt.subplots_adjust(hspace=0.3, wspace=0.3)
plt.suptitle("Boxplot for all BTC columns", fontsize=18, y=0.95)
# One panel per column on a 2 x 4 grid; subplot positions are counted from 1.
# Every panel gets the same y label, including the volume panel.
for position, column in enumerate(non_day_columns, start=1):
    ax = plt.subplot(2, 4, position)
    sns.boxplot(y=btc[column], showfliers=False)
    ax.set_title(column)
    ax.set_ylabel("Price ($)", labelpad=0.02)