# Load rankings and categories

In [1]:
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import json,os,pdb
import tqdm
import warnings
import yfinance as yf

# Load data


In [2]:
# Daily price data; adjust the path to wherever the parquet file lives on your machine.
df = pd.read_parquet('~/work/project/daily_20221004.parquet.gzip')
# Keep only the text after the last '.' in each column name and strip underscores.
df.columns = [name.rsplit('.', 1)[-1].replace('_', '') for name in df.columns]
# Rebuild the two index levels, parsing the second one into datetimes.
symbol_level, close_level = df.index.levels
df.index = df.index.set_levels([symbol_level, pd.to_datetime(close_level)])

Load the listing file and check for the rankings

In [3]:
# Read the listing snapshot and turn its 'data' records into a frame indexed by rank.
with open('listing_map_20221004.json', encoding='utf-8') as json_data:
    data = json.load(json_data)
listmap = pd.DataFrame(data['data']).sort_values('cmc_rank').set_index('cmc_rank')
# Build a '<symbol>_<id>' key in the same format as df's symbol_id index level (e.g. MPH_7742).
listmap['symbol_id'] = listmap['symbol'] + '_' + listmap['id'].astype(str)

In [4]:
# Preview the first rows to check the shortened column names and the (symbol_id, time_close) index.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,timeopen,timehigh,timelow,open,high,low,close,volume,marketcap,timestamp,symbol,id
symbol_id,time_close,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
MPH_7742,2020-11-23 00:59:59.999000+00:00,2020-11-23T00:00:00.000Z,2020-11-23T00:00:07.000Z,2020-11-23T00:37:05.000Z,90.554933,90.554933,79.85233,85.599205,16008510.0,0.0,2020-11-23T00:59:59.999Z,MPH,7742
MPH_7742,2020-11-24 00:59:59.999000+00:00,2020-11-24T00:00:00.000Z,2020-11-24T00:00:07.000Z,2020-11-24T00:40:06.000Z,85.16103,85.16103,80.694133,81.645754,10097380.0,0.0,2020-11-24T00:59:59.999Z,MPH,7742
MPH_7742,2020-11-25 00:59:59.999000+00:00,2020-11-25T00:00:00.000Z,2020-11-25T00:00:06.000Z,2020-11-25T00:58:05.000Z,57.527366,57.527366,53.679986,53.704963,8899399.0,0.0,2020-11-25T00:59:59.999Z,MPH,7742
MPH_7742,2020-11-26 00:59:59.999000+00:00,2020-11-26T00:00:00.000Z,2020-11-26T00:56:05.000Z,2020-11-26T00:13:06.000Z,39.466839,40.121814,38.980499,40.056399,4225065.0,0.0,2020-11-26T00:59:59.999Z,MPH,7742
MPH_7742,2020-11-27 00:59:59.999000+00:00,2020-11-27T00:00:00.000Z,2020-11-27T00:57:05.000Z,2020-11-27T00:16:05.000Z,30.002092,30.391962,29.690305,30.383005,2933446.0,0.0,2020-11-27T00:59:59.999Z,MPH,7742


# Beginning Analysis

In [None]:
# Flat working copy; reset_index returns a new frame, so the original `df` is left untouched.
df2 = df.reset_index()
# Parse the open timestamps once and derive every calendar part from that single result.
open_times = pd.DatetimeIndex(df2['timeopen'])
df2['year'] = open_times.year    # calendar year of the row's open time
df2['month'] = open_times.month  # calendar month of the row's open time
df2['day'] = open_times.day      # day of month of the row's open time
# drop_duplicates keeps one row per symbol_id: the first row in file order here ...
df2_first = df2.drop_duplicates(subset='symbol_id')
# ... and the last row in file order here (copied because new columns are added to it below).
# NOTE(review): "first"/"last" are chronological only if rows are time-sorted within each coin.
df2_last = df2.drop_duplicates(subset='symbol_id', keep='last').copy()
# Each coin's share of the summed last-known market caps.
df2_last['market_perc'] = df2_last['marketcap'] / df2_last['marketcap'].sum()
# Largest share first, then a running total of the share down the ranking.
df2_last = df2_last.sort_values('market_perc', ascending=False)
df2_last['cum_perc'] = df2_last['market_perc'].cumsum()

In [None]:
# df2_last was sorted by market_perc (descending) when it was built.
df2_last.head(10)  # shows the 10 coins with the largest market-cap share

In [None]:
# Keep the top-ranked coins whose cumulative market-cap share stays at or below 91%.
df_cutoff = df2_last[df2_last['cum_perc'] <= .91]
# Coins whose last close lies strictly between 0.99 and 1.01 are removed
# (the original note describes these as 4 tether coins).
near_one = (df_cutoff['close'] > .99) & (df_cutoff['close'] < 1.01)
tethers = df_cutoff.loc[near_one, 'symbol_id'].values
df_cutoff = df_cutoff[~df_cutoff['symbol_id'].isin(tethers)]
# Keep only coins whose last row falls in October 2022 or later that year.
has_recent_row = (df_cutoff['year'] == 2022) & (df_cutoff['month'] >= 10)
df_cutoff = df_cutoff[has_recent_row]
# First rows of the surviving coins, used to find those that start too late.
df2_cutoff = df2_first[df2_first['symbol_id'].isin(df_cutoff['symbol_id'])]
starts_after_2020 = df2_cutoff['year'] > 2020
starts_late_2020 = (df2_cutoff['year'] == 2020) & (df2_cutoff['month'] > 8)
symbols = df2_cutoff.loc[starts_after_2020, 'symbol_id'].values
symbols2 = df2_cutoff.loc[starts_late_2020, 'symbol_id'].values
# Every coin whose first row is after August 2020.
all_symbols = np.concatenate((symbols, symbols2))

In [None]:
# Configure the style and default size BEFORE creating the figure; rcParams are only
# read when a figure is created, so setting them afterwards leaves this figure at the old size.
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*'; use whichever exists.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
plt.rcParams['figure.figsize'] = [15, 15]
plt.figure(1)
# Bar chart of market-cap share for the selected coins that are not in the late-starting set.
sns.barplot(data=df_cutoff.loc[~df_cutoff['symbol_id'].isin(all_symbols)], x="symbol_id", y="market_perc")
plt.xlabel('Symbol ID')
plt.ylabel('Market Cap Percentage')
plt.title('Market Cap Percentage for all Coins in List')
plt.xticks(rotation=90)  # symbol ids are long; rotate so they do not overlap
plt.show()

In [None]:
# Number of coins that survived the market-cap, near-1 close and last-row-date filters.
print(df2_cutoff.shape[0])

In [None]:
# Number of those coins whose first row is after August 2020.
print(all_symbols.shape[0])

In [None]:
# Flat copy of the price data restricted to coins that passed the cutoff
# and are not in the late-starting set.
df3 = df.reset_index()
passed_cutoff = df3['symbol_id'].isin(df_cutoff['symbol_id'])
late_starter = df3['symbol_id'].isin(all_symbols)
df3 = df3.loc[passed_cutoff & ~late_starter]

In [None]:
# Calendar features taken from the close timestamp.
close_parts = df3['time_close'].dt
df3['month'] = close_parts.month
df3['day'] = close_parts.day
df3['weekday'] = close_parts.weekday
# Despite its name, 'day of week' is a flag: 0 when weekday > 4, otherwise 1.
df3['day of week'] = np.where(df3['weekday'] > 4, 0, 1)

In [None]:
# Coin whose earliest close date is used as the common start of the sample.
reference_symbol = 'DOT_6636'
# From here on time_close holds plain calendar dates instead of full timestamps.
df3['time_close'] = df3['time_close'].dt.date
start_date = df3.loc[df3['symbol_id'] == reference_symbol, 'time_close'].min()
if pd.isna(start_date):
    # Without this check the comparison below fails with an unrelated-looking TypeError.
    raise ValueError(f"reference symbol {reference_symbol!r} has no rows in df3")
df3 = df3[df3['time_close'] >= start_date]
# Drop rows without a volume; a list-valued subset also works on pandas releases before 1.5.
df3 = df3.dropna(subset=['volume'])
# (The former extra filter `time_close >= time_close.min()` kept every row and was removed.)
all_syms = df3['symbol_id'].unique()

In [None]:
# One row per (coin, date), then keep only the dates on which every coin in all_syms has a row.
df3 = df3.drop_duplicates(subset=['symbol_id', 'time_close'])
tracked_rows = df3[df3['symbol_id'].isin(all_syms)]
coins_per_date = tracked_rows.groupby('time_close')['symbol_id'].nunique()
# A date is shared by all coins exactly when its distinct-coin count equals the number of coins.
shared_dates = coins_per_date.index[coins_per_date == len(all_syms)]
df3 = df3[df3['time_close'].isin(shared_dates)]

In [None]:
# Compute changes per coin so a coin's rows are never compared with another coin's rows,
# regardless of how the coins are laid out in the frame.
# Rows are assumed to be in chronological order within each coin.
df3['price change'] = df3.groupby('symbol_id')['close'].pct_change()
df3['volume change'] = df3.groupby('symbol_id')['volume'].pct_change()
# The same coin's price change on its next row.
df3['tomorrow change'] = df3.groupby('symbol_id')['price change'].shift(-1)

# A coin's first row has no previous price and its last row has no next-row change; drop both.
first_rows = df3.groupby('symbol_id').head(1).index
last_rows = df3.groupby('symbol_id').tail(1).index
df3 = df3.drop(first_rows.union(last_rows))

In [None]:
def get_info(tick, start_date='2020-09-24', end_date='2022-10-3'):
    """Download one Yahoo Finance series and attach it to the global ``df3`` as column ``tick``.

    Parameters
    ----------
    tick : str
        Ticker passed to ``yf.download``; also used as the name of the new column.
    start_date, end_date : str
        Date window passed to ``yf.download`` as ``start`` / ``end``. The defaults are
        the values that were previously hard-coded in the function body.

    Returns
    -------
    None
        ``df3`` is modified in place.

    Notes
    -----
    The downloaded series is matched to ``df3`` by index label, so ``df3`` must be
    indexed by date when this is called.
    NOTE(review): yfinance documents ``end`` as exclusive - confirm whether the end
    date itself is meant to be included.
    NOTE(review): the 'Adj Close' column depends on the installed yfinance version and
    its ``auto_adjust`` default - confirm against the environment.
    """
    stock = yf.download(tick,
                        start=start_date,
                        end=end_date,
                        progress=False)
    # Column assignment aligns the downloaded values to df3's index labels;
    # labels with no downloaded value become NaN.
    df3[tick] = stock['Adj Close']
    # Fill those gaps with the last available value, separately within each coin.
    df3[tick] = df3.groupby(['symbol_id'])[tick].ffill()

In [None]:
# Index by date so get_info can match each downloaded series to df3's rows by label.
df3 = df3.set_index('time_close')
# Downloaded ticker -> column name used in the rest of the notebook.
column_for_ticker = {
    'SPY': 'SPY',
    '^VIX': 'VIX',
    '^TNX': '10Y Treasury',
    'GC=F': 'Gold',
    'CL=F': 'Oil',
    'EURUSD=X': 'EUR_USD',
}
ticks = list(column_for_ticker)
for tick in ticks:
    get_info(tick)  # adds a column named after the ticker to the global df3
df3 = df3.rename(columns=column_for_ticker)

In [None]:
# Drop the raw columns that are not used further, then restore the (symbol_id, time_close) index.
unused_columns = ['timeopen', 'timehigh', 'timelow', 'open', 'high', 'low', 'timestamp', 'symbol', 'id']
df3 = (
    df3.drop(columns=unused_columns)
       .reset_index()
       .set_index(['symbol_id', 'time_close'])
)

## Removed all rows with NaN volume and the first and last entries for each coin. Data starts at 2020-09-24 and ends at 2022-10-03.

In [None]:
# Preview the cleaned frame, now indexed by (symbol_id, time_close).
df3.head()

In [None]:
# df3.to_csv('clean_df.csv',index=True)

In [None]:
# List the distinct coins that remain in the cleaned frame.
df3.index.unique(level='symbol_id')

In [None]:
# Cross-section for the 'BTC_1' coin; the symbol_id level is dropped, leaving time_close as the index.
btc = df3.xs('BTC_1', level='symbol_id')

In [None]:
# DataFrame.plot opens its own figure unless it is given an axes, which previously left
# figure 2 empty. Create the axes on figure 2 and draw into it explicitly.
fig = plt.figure(2, figsize=(15, 10))
ax = fig.add_subplot(111)
btc.plot(y=['close', 'volume', 'marketcap',
            'SPY', 'VIX', '10Y Treasury', 'Gold', 'Oil'],
         logy=True,  # the series differ by orders of magnitude, so use a log y-axis
         cmap='tab10',
         ax=ax)
ax.set_xlabel('Date')
ax.set_ylabel('Price or volume')
ax.set_title('BTC price plotted against other prices and volume')
plt.show()

In [None]:
plt.figure(3)
# q3 / q1 are the 99th and 1st percentiles of the next-row price change.
q3, q1 = btc['tomorrow change'].quantile([0.99, 0.01])
# Plot only the values strictly between those two percentiles.
within_tails = (btc['tomorrow change'] < q3) & (btc['tomorrow change'] > q1)
btc.loc[within_tails, 'tomorrow change'].plot.hist(density=1, bins=55)
plt.title('BTC price change histogram')
plt.show()

In [None]:
# Distribution of the next-row price change for each weekday value, outliers hidden.
weekday_fig = plt.figure(4)
weekday_ax = weekday_fig.add_subplot(111)
sns.boxplot(data=btc, x='weekday', y='tomorrow change', showfliers=False, ax=weekday_ax)
weekday_ax.set_xlabel('Day of the week')
weekday_ax.set_ylabel('Price change')
weekday_ax.set_title('Boxplot for price change and day of week')
plt.show()

In [None]:
# Pairwise correlations between all BTC columns, drawn as an annotated heatmap on a fixed -1..1 scale.
corr_fig = plt.figure(5)
corr_ax = corr_fig.add_subplot(111)
corrmat = btc.corr()
sns.heatmap(corrmat, ax=corr_ax, cmap="RdBu", linewidths=.5, vmin=-1, vmax=1, annot=True)
corr_ax.set_title('BTC correlation matrix')
plt.show()

In [None]:
# 2 x 4 grid of outlier-free boxplots, one panel per listed BTC column.
grid_fig = plt.figure(6)
non_day_columns = ['close', 'volume', 'marketcap','SPY', 'VIX', '10Y Treasury', 'Gold', 'Oil']
grid_fig.subplots_adjust(hspace=0.3, wspace=0.3)
grid_fig.suptitle("Boxplot for all BTC columns", fontsize=18, y=0.95)
for position, column in enumerate(non_day_columns, start=1):
    ax = grid_fig.add_subplot(2, 4, position)
    sns.boxplot(y=btc[column], showfliers=False, ax=ax)
    ax.set_title(column)
    # Every panel gets the same y-label, including the volume panel.
    ax.set_ylabel("Price ($)", labelpad=0.02)