# Load rankings and categories

In [1]:
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import json,os,pdb
import tqdm
import warnings

# Choose the universe

Select the number of coins to consider. For now, the universe is still selected using a market-cap criterion.

In [2]:
# Universe size: later cells keep the listing rows whose `cmc_rank` label is <= N.
N = 50

# Load data


In [3]:
# Read the daily price history (modify the path according to your needs).
df = pd.read_parquet('~/work/project/daily_20221004.parquet.gzip')

# Keep only the last dot-separated part of each column name and drop underscores.
df.columns = [name.rpartition('.')[2].replace('_', '') for name in df.columns]

# Convert the second index level to datetimes; the first level is left unchanged.
df.index = df.index.set_levels(pd.to_datetime(df.index.levels[1]), level=1)

Load the listing file and check for the rankings

In [4]:
# Load the listing file and index it by market-cap rank.
with open('listing_map_20221004.json', encoding='utf-8') as json_data:
    data = json.load(json_data)

listmap = (
    pd.DataFrame(data['data'])
    .sort_values('cmc_rank')
    .set_index('cmc_rank')
    # "<symbol>_<id>" key, built the same way as the `symbol_id` used for stablecoins.
    .assign(symbol_id=lambda listing: listing['symbol'] + '_' + listing['id'].astype(str))
)

# Filter by the universe
Select only the symbols that interest you. According to our previous choice of N, in this case we are filtering for the top 50 by market cap

In [5]:
# Label-based slice on the `cmc_rank` index: every row with rank <= N (inclusive).
symbols = listmap.loc[:N, 'symbol_id'].tolist()

Remove stablecoins by using the category JSON (available in sakai as the listing file)

In [6]:
# Load the stablecoin category file and build the list of stablecoin keys.
with open('category_stablecoin.json', encoding='utf-8') as json_data:
    data = json.load(json_data)

stable_df = pd.json_normalize(data['data']['coins'], max_level=5).assign(
    # Same "<symbol>_<id>" key format as `listmap['symbol_id']`.
    symbol_id=lambda coins: coins['symbol'] + '_' + coins['id'].astype(str)
)
stablecoins = stable_df['symbol_id'].tolist()

In [7]:
# Symbols ranked within the top N by market cap, with stablecoins removed.
# NOTE: `symbols` is rebound to a set here (the earlier cell built a list).
symbols = set(listmap.loc[:N, 'symbol_id']) - set(stablecoins)

# Sanity check: after the set difference no stablecoin may remain in the universe.
if not symbols.isdisjoint(stablecoins):
    print('Stablecoins in the list!')

In [8]:
# Preview the first rows of the daily frame (indexed by symbol_id and time_close).
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,timeopen,timehigh,timelow,open,high,low,close,volume,marketcap,timestamp,symbol,id
symbol_id,time_close,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
MPH_7742,2020-11-23 00:59:59.999000+00:00,2020-11-23T00:00:00.000Z,2020-11-23T00:00:07.000Z,2020-11-23T00:37:05.000Z,90.554933,90.554933,79.85233,85.599205,16008510.0,0.0,2020-11-23T00:59:59.999Z,MPH,7742
MPH_7742,2020-11-24 00:59:59.999000+00:00,2020-11-24T00:00:00.000Z,2020-11-24T00:00:07.000Z,2020-11-24T00:40:06.000Z,85.16103,85.16103,80.694133,81.645754,10097380.0,0.0,2020-11-24T00:59:59.999Z,MPH,7742
MPH_7742,2020-11-25 00:59:59.999000+00:00,2020-11-25T00:00:00.000Z,2020-11-25T00:00:06.000Z,2020-11-25T00:58:05.000Z,57.527366,57.527366,53.679986,53.704963,8899399.0,0.0,2020-11-25T00:59:59.999Z,MPH,7742
MPH_7742,2020-11-26 00:59:59.999000+00:00,2020-11-26T00:00:00.000Z,2020-11-26T00:56:05.000Z,2020-11-26T00:13:06.000Z,39.466839,40.121814,38.980499,40.056399,4225065.0,0.0,2020-11-26T00:59:59.999Z,MPH,7742
MPH_7742,2020-11-27 00:59:59.999000+00:00,2020-11-27T00:00:00.000Z,2020-11-27T00:57:05.000Z,2020-11-27T00:16:05.000Z,30.002092,30.391962,29.690305,30.383005,2933446.0,0.0,2020-11-27T00:59:59.999Z,MPH,7742


## Number of missing values per column in the dataset

In [10]:
# Count missing values in each column.
df.isna().sum()

timeopen          0
timehigh        520
timelow         520
open              0
high              0
low               0
close             0
volume       501572
marketcap         0
timestamp         0
symbol            0
id                0
dtype: int64

## Number of distinct symbols with at least one daily row where the market cap is 0

In [54]:
# Distinct symbols having at least one row with a zero market cap.
# dropna=False keeps the count identical to len(Series.unique()).
df.loc[df['marketcap'] == 0, 'symbol'].nunique(dropna=False)

1113

# Beginning Analysis

In [13]:
# Work on a copy so the raw frame `df` is left untouched.
df2 = df.copy()

# Parse the open timestamps once and derive the calendar parts from them.
_open_times = pd.DatetimeIndex(df2['timeopen'])
df2['year'] = _open_times.year
df2['month'] = _open_times.month
df2['day'] = _open_times.day

# First row seen for each symbol, in the frame's current row order.
# NOTE(review): rows are deduplicated on `symbol`, not on `id`/`symbol_id`;
# if two coins share a ticker they are merged here — TODO confirm this is intended.
# NOTE(review): "first"/"last" rely on rows already being in time order — TODO confirm.
df2_first = df2.drop_duplicates(subset='symbol', keep='first')

# Last row seen for each symbol, with its share of the summed last-known market caps,
# ranked from largest to smallest share, plus the running (cumulative) share.
df2_last = (
    df2.drop_duplicates(subset='symbol', keep='last')
    .assign(market_perc=lambda last: last['marketcap'] / last['marketcap'].sum())
    .sort_values('market_perc', ascending=False)
    .assign(cum_perc=lambda ranked: ranked['market_perc'].cumsum())
)

In [35]:
# Ten largest symbols by last recorded market cap (df2_last is sorted by market_perc, descending).
df2_last.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,timeopen,timehigh,timelow,open,high,low,close,volume,marketcap,timestamp,symbol,id,year,month,day,market_perc,cum_perc
symbol_id,time_close,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
BTC_1,2022-10-04 00:59:59.999000+00:00,2022-10-04T00:00:00.000Z,2022-10-04T00:20:00.000Z,2022-10-04T00:59:00.000Z,19623.584689,19682.097005,19569.057599,19569.057599,30195210000.0,375107200000.0,2022-10-04T00:59:59.999Z,BTC,1,2022,10,4,0.369433,0.369433
ETH_1027,2022-10-04 00:59:59.999000+00:00,2022-10-04T00:00:00.000Z,2022-10-04T00:17:00.000Z,2022-10-04T00:59:00.000Z,1323.278296,1327.501947,1321.478301,1321.478301,9889020000.0,162067800000.0,2022-10-04T00:59:59.999Z,ETH,1027,2022,10,4,0.159616,0.529049
USDT_825,2022-10-04 00:59:59.999000+00:00,2022-10-04T00:00:00.000Z,2022-10-04T00:39:00.000Z,2022-10-04T00:10:00.000Z,1.000091,1.000136,1.000065,1.000099,39481200000.0,67956170000.0,2022-10-04T00:59:59.999Z,USDT,825,2022,10,4,0.066928,0.595977
USDC_3408,2022-10-04 00:59:59.999000+00:00,2022-10-04T00:00:00.000Z,2022-10-04T00:21:00.000Z,2022-10-04T00:05:00.000Z,0.999916,1.000129,0.999848,0.999932,4347674000.0,47176490000.0,2022-10-04T00:59:59.999Z,USDC,3408,2022,10,4,0.046463,0.64244
BNB_1839,2022-10-04 00:59:59.999000+00:00,2022-10-04T00:00:00.000Z,2022-10-04T00:17:00.000Z,2022-10-04T00:59:00.000Z,287.024182,287.688765,286.528964,286.528964,693443200.0,46227800000.0,2022-10-04T00:59:59.999Z,BNB,1839,2022,10,4,0.045528,0.687968
XRP_52,2022-10-04 00:59:59.999000+00:00,2022-10-04T00:00:00.000Z,2022-10-04T00:00:00.000Z,2022-10-04T00:58:00.000Z,0.462878,0.462878,0.456886,0.456954,1828199000.0,22802310000.0,2022-10-04T00:59:59.999Z,XRP,52,2022,10,4,0.022457,0.710425
BUSD_4687,2022-10-04 00:59:59.999000+00:00,2022-10-04T00:00:00.000Z,2022-10-04T00:15:00.000Z,2022-10-04T00:40:00.000Z,1.0002,1.000815,0.999495,1.000401,6356159000.0,21050670000.0,2022-10-04T00:59:59.999Z,BUSD,4687,2022,10,4,0.020732,0.731158
UST_7129,2022-04-18 00:59:59.999000+00:00,2022-04-18T00:00:00.000Z,2022-04-18T00:56:00.000Z,2022-04-18T00:03:00.000Z,1.001566,1.001987,1.00128,1.001743,239194500.0,17475290000.0,2022-04-18T00:59:59.999Z,UST,7129,2022,4,18,0.017211,0.748369
ADA_2010,2022-10-04 00:59:59.999000+00:00,2022-10-04T00:00:00.000Z,2022-10-04T00:17:00.000Z,2022-10-04T00:59:00.000Z,0.428003,0.428976,0.426254,0.426254,440816400.0,14600540000.0,2022-10-04T00:59:59.999Z,ADA,2010,2022,10,4,0.01438,0.762748
SOL_5426,2022-10-04 00:59:59.999000+00:00,2022-10-04T00:00:00.000Z,2022-10-04T00:22:00.000Z,2022-10-04T00:59:00.000Z,32.956772,33.091174,32.909861,32.909861,623097900.0,11685840000.0,2022-10-04T00:59:59.999Z,SOL,5426,2022,10,4,0.011509,0.774257


## 5 coins were first logged in 2022

In [50]:
# Symbols whose cumulative market-cap share is still within 91%.
within_share = df2_last['cum_perc'] <= .91
df_cutoff = df2_last[within_share]

# First recorded row for each of those symbols.
df2_cutoff = df2_first[df2_first['symbol'].isin(df_cutoff['symbol'])]

# Symbols whose first recorded row falls in the latest first-entry year.
# NOTE(review): this is the first row present in this dataset, which may differ
# from the coin's actual creation date — TODO confirm.
df2_cutoff[df2_cutoff['year'] == df2_cutoff['year'].max()]

Unnamed: 0_level_0,Unnamed: 1_level_0,timeopen,timehigh,timelow,open,high,low,close,volume,marketcap,timestamp,symbol,id,year,month,day
symbol_id,time_close,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
LUNC_4172,2022-04-07 00:59:59.999000+00:00,2022-04-07T00:00:00.000Z,2022-04-07T00:34:00.000Z,2022-04-07T00:59:00.000Z,107.903636,109.393207,107.077443,107.077443,3443712000.0,37673250000.0,2022-04-07T00:59:59.999Z,LUNC,4172,2022,4,7
XCN_18679,2022-03-16 00:59:59.999000+00:00,2022-03-16T00:00:00.000Z,2022-03-16T00:05:00.000Z,2022-03-16T00:59:00.000Z,0.052573,0.052708,0.050194,0.050194,2050086.0,0.0,2022-03-16T00:59:59.999Z,XCN,18679,2022,3,16
stETH_8085,2022-04-07 00:59:59.999000+00:00,2022-04-07T00:00:00.000Z,2022-04-07T00:38:00.000Z,2022-04-07T00:58:00.000Z,3162.58977,3197.201203,3143.240955,3148.767582,348529.2,6342142000.0,2022-04-07T00:59:59.999Z,stETH,8085,2022,4,7
WTRX_18579,2022-03-06 00:59:59.999000+00:00,2022-03-06T00:00:00.000Z,2022-03-06T00:41:00.000Z,2022-03-06T00:11:00.000Z,0.060396,0.060432,0.060379,0.060418,1211478.0,6146203000.0,2022-03-06T00:59:59.999Z,WTRX,18579,2022,3,6
APE_18876,2022-03-18 00:59:59.999000+00:00,2022-03-18T00:00:00.000Z,2022-03-18T00:46:00.000Z,2022-03-18T00:00:00.000Z,8.530408,9.974312,8.530408,9.635246,2755795000.0,2673781000.0,2022-03-18T00:59:59.999Z,APE,18876,2022,3,18


In [55]:
# Number of symbols inside the 91% cumulative market-cap cutoff.
df_cutoff.shape[0]

45

In [56]:
# Missing values per column in the last recorded row of each cutoff symbol.
# This checks only the last row per symbol; earlier rows are not verified here.
df_cutoff.isna().sum()

timeopen       0
timehigh       0
timelow        0
open           0
high           0
low            0
close          0
volume         0
marketcap      0
timestamp      0
symbol         0
id             0
year           0
month          0
day            0
market_perc    0
cum_perc       0
dtype: int64

### The most common first-entry years are 2019 and 2020 (11 and 10 coins); the earliest is 2013, with 4 coins

In [57]:
# Number of cutoff symbols per year of their first recorded row.
df2_cutoff.groupby('year')['symbol'].count()

year
2013     4
2014     2
2015     2
2016     1
2017     5
2018     4
2019    11
2020    10
2021     1
2022     5
Name: symbol, dtype: int64