# Crypto ML Model scaling the entire dataset

In [1]:
import json
import requests
import pandas as pd
import hvplot.pandas
import panel as pn
import holoviews as hv
import numpy as np
from pathlib import Path
%matplotlib inline

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [38]:
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression

In [4]:

# Coin Metrics API endpoint url
url = 'https://community-api.coinmetrics.io/v4/timeseries/asset-metrics?'

# Dates to get historical data for the last 3 years
start_date = pd.Timestamp("2021-01-01").isoformat()
end_date = pd.Timestamp("2021-10-05").isoformat()

# Number of records to retrieve
page_size = 10000

# Define the list of crypto assets to analyze - Unable to loop through CSV file
#crypto_asset = ['1inch', 'aave', 'ada', 'alpha', 'ant', 'bal', 'bat', 'bch']

crypto_asset = ['1inch', 'aave', 'ada', 'alpha', 'ant', 'bal', 'bat', 'bch', 'bnb', 'bnb_eth', 'bsv', 'btc', 'btg', 'busd', 'comp', 'cro', 'crv', 'cvc', 'dai', 'dash', 'dcr', 'dgb', 'dgx', 'doge', 'dot', 'drgn', 'elf', 'eng', 'eos_eth', 'etc', 'eth', 'ethos', 'ftt', 'fun', 'fxc', 'gas', 'gno', 'gnt', 'gusd', 'hbtc', 'hedg', 'ht', 'husd', 'knc', 'lend', 'leo_eth', 'link', 'loom', 'lsk', 'ltc', 'maid', 'mana', 'mco', 'mkr', 'neo', 'nxm', 'omg', 'pax', 'paxg', 'pay', 'poly', 'powr', 'ppt', 'qash', 'ren', 'renbtc', 'rep', 'sai', 'snt', 'snx', 'srm', 'sushi', 'swrv', 'trx_eth', 'tusd', 'uma', 'uni', 'usdc', 'usdk', 'usdt', 'usdt_eth', 'usdt_omni', 'usdt_trx', 'vtc', 'wbtc', 'weth', 'wnxm', 'wtc', 'xaut', 'xlm', 'xrp', 'xtz', 'xvg', 'yfi', 'zec', 'zrx'
               ]

def get_data(metric):
    
    # Save the JSON response as lists
    symbol = []
    time = []
    value = []

    # Loop through the selected tokens
    for token in crypto_asset:
        
        # Build URL with the corresponding token, metric, and dates
        get_url = f"{url}&assets={token}&metrics={metric}&start_time={start_date}&end_time={end_date}&page_size={page_size}"

        # Get the response from Coin Metrics API is JSON format
        response = requests.get(get_url).json()

        # Iterate through the response to add the relevant items to its corresponding list
        for item in response['data']:

            symbol.append(item['asset'])
            time.append(item['time'])
            value.append(item[metric])
            
    # Build the dataframe with the data
    data_df = pd.DataFrame(
    {'Token': symbol,
     'Date': time,
     metric: value
    })
    
    # Format the date for easy reading 
    data_df['Date'] = pd.to_datetime(data_df['Date'], format="%Y/%m/%d").dt.date
    
    # Set Date as the index
    data_df.index = data_df['Date']
    
    # Change the value type to 'float' to allow graphing
    data_df[metric] = data_df[metric].astype(float)
    
    # Define columns
    data_df = data_df.drop(columns='Date')
    data_df.columns = ['Token', metric]
    
    # Format the DataFrame to allow xgraphing
    #updated_data_df = data_df.pivot_table('Value', ['Date'], 'Token')
    
    #return updated_data_df
    
    return data_df
    

In [5]:
# Apply get_data and parse the data for all metrics
active_addresses = get_data('AdrActCnt')
transaction_count = get_data('TxCnt')
supply = get_data('SplyCur')
mc = get_data('CapMrktCurUSD')
mc_real = get_data('CapRealUSD')
velocity = get_data('VelCur1yr')

In [6]:
# Reset indexes and prepare to concatenate DataFrames
active_addresses = active_addresses.reset_index()
transaction_count = transaction_count.reset_index()
supply = supply.reset_index()
mc = mc.reset_index()
mc_real = mc_real.reset_index()
velocity = velocity.reset_index()

## Preprocessing

In [1]:
# Create a new DataFrame with all fundamental data 
df = pd.concat([active_addresses, transaction_count, supply, mc, mc_real, velocity], join='outer', axis=1)

# Remove duplicate Token column
df = df.loc[:, ~df.columns.duplicated()]

# Rename columns
df = df.rename(columns={'AdrActCnt': 'Active Addresses', 'TxCnt': 'Transaction Count', 'SplyCur': 'Current Supply', 'CapMrktCurUSD': 'Market Cap', 'CapRealUSD': 'Real Market Cap', 'VelCur1yr': 'Velocity'})

# Set Date as index
df = df.set_index("Date")

# Replace NaN with zeros
df = df.fillna(0)

df

NameError: name 'pd' is not defined

In [29]:
# Calculate the 7-day rolling average of active addresses and market cap
df['active_address_df_7d'] = df['Active Addresses'].rolling(window=7).mean()
df['mc_df_7d'] = df['Market Cap'].rolling(window=7).mean()
df = df.dropna()

Unnamed: 0_level_0,Token,Active Addresses,Transaction Count,Current Supply,Market Cap,Real Market Cap,Velocity,active_address_df_7d,active_addresses_df_7d_chg,mc_df_7d
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-01-20,1inch,1227.0,2045.0,1.500000e+09,2.779459e+09,1.957961e+08,1.254138,1667.142857,-0.073074,2.571527e+09
2021-01-21,1inch,1081.0,1731.0,1.500000e+09,2.376322e+09,1.725417e+08,1.268189,1633.857143,-0.019966,2.629394e+09
2021-01-22,1inch,1153.0,1871.0,1.500000e+09,2.637670e+09,1.942441e+08,1.284729,1576.000000,-0.035411,2.718587e+09
2021-01-23,1inch,1496.0,2477.0,1.500000e+09,2.898370e+09,2.100808e+08,1.300911,1513.285714,-0.039793,2.781162e+09
2021-01-24,1inch,1844.0,3270.0,1.500000e+09,3.665437e+09,2.552889e+08,1.321245,1409.000000,-0.068913,2.863324e+09
...,...,...,...,...,...,...,...,...,...,...
2021-10-01,zrx,466.0,684.0,1.000000e+09,0.000000e+00,0.000000e+00,7.968792,471.857143,-0.000605,0.000000e+00
2021-10-02,zrx,530.0,582.0,1.000000e+09,0.000000e+00,0.000000e+00,7.964344,485.428571,0.028762,0.000000e+00
2021-10-03,zrx,438.0,525.0,1.000000e+09,0.000000e+00,0.000000e+00,7.963604,440.285714,-0.092996,0.000000e+00
2021-10-04,zrx,420.0,452.0,1.000000e+09,0.000000e+00,0.000000e+00,7.964101,440.714286,0.000973,0.000000e+00


In [30]:
# Calculate the pct change in active addresses and market cap
df['active_addresses_df_7d_chg'] = df['active_address_df_7d'].pct_change()
df['mc_df_7d_chg'] = df['mc_df_7d'].pct_change()
df = df.dropna()
df

Unnamed: 0_level_0,Token,Active Addresses,Transaction Count,Current Supply,Market Cap,Real Market Cap,Velocity,active_address_df_7d,active_addresses_df_7d_chg,mc_df_7d,mc_df_7d_chg
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-01-21,1inch,1081.0,1731.0,1.500000e+09,2.376322e+09,1.725417e+08,1.268189,1633.857143,-0.019966,2.629394e+09,0.022503
2021-01-22,1inch,1153.0,1871.0,1.500000e+09,2.637670e+09,1.942441e+08,1.284729,1576.000000,-0.035411,2.718587e+09,0.033922
2021-01-23,1inch,1496.0,2477.0,1.500000e+09,2.898370e+09,2.100808e+08,1.300911,1513.285714,-0.039793,2.781162e+09,0.023018
2021-01-24,1inch,1844.0,3270.0,1.500000e+09,3.665437e+09,2.552889e+08,1.321245,1409.000000,-0.068913,2.863324e+09,0.029542
2021-01-25,1inch,1917.0,3448.0,1.500000e+09,3.595392e+09,2.528589e+08,1.353092,1430.428571,0.015208,2.980442e+09,0.040903
...,...,...,...,...,...,...,...,...,...,...,...
2021-08-20,zrx,588.0,726.0,1.000000e+09,0.000000e+00,0.000000e+00,8.080216,560.714286,-0.000255,5.686746e+08,-0.197160
2021-08-21,zrx,514.0,652.0,1.000000e+09,0.000000e+00,0.000000e+00,7.932619,548.571429,-0.021656,4.284161e+08,-0.246641
2021-08-22,zrx,608.0,793.0,1.000000e+09,0.000000e+00,0.000000e+00,7.899200,568.000000,0.035417,2.851319e+08,-0.334451
2021-08-23,zrx,570.0,815.0,1.000000e+09,0.000000e+00,0.000000e+00,7.891725,573.428571,0.009557,1.462200e+08,-0.487185


In [50]:
# Use LineraRegression for all tokens
model = LinearRegression()

X = np.array(df['active_addresses_df_7d_chg'].loc[df['Token'] == 'btc'].tolist()).reshape(-1,1)
y = df['mc_df_7d_chg'].loc[df['Token'] == 'btc']

reg = model.fit(X, y)

reg.coef_

array([26.07066684])

In [64]:
# Determine each token's regression coefficient (slope)
coef = {}

for asset in crypto_asset:
    model = LinearRegression()
    
    try:
        X = np.array(df['active_addresses_df_7d_chg'].loc[df['Token'] == asset].tolist()).reshape(-1,1)
        y = df['mc_df_7d_chg'].loc[df['Token'] == asset]

        reg = model.fit(X, y)

        coef[asset] = reg.coef_
        
    except:
        pass

coef = pd.DataFrame(coef)
coef = coef.T
coef = coef.rename(columns={0: 'Coef'})
coef[coef['Coef'] > 0].sort_values(by='Coef', ascending=False)

Unnamed: 0,Coef
btc,26.070667
alpha,0.740344
btg,0.520723
eng,0.499882
dgb,0.426303
elf,0.196261
cro,0.161312
dgx,0.15483
dcr,0.151942
1inch,0.14544


# Using StandardScaler

In [None]:
# Scaling the columns using StandardScaler
df_scaled_std = StandardScaler().fit_transform(df[['Active Addresses', 'Transaction Count', 'Current Supply', 'Market Cap', 'Real Market Cap', 'Velocity']])

# Creating a DataFrame with the scaled data
df_scaled_std = pd.DataFrame(df_scaled_std, columns=['Active Addresses', 'Transaction Count', 'Current Supply', 'Market Cap', 'Real Market Cap', 'Velocity'])

df_scaled_std

Unnamed: 0,Active Addresses,Transaction Count,Current Supply,Market Cap,Real Market Cap,Velocity
0,-0.253292,-0.206561,-0.243918,-0.174987,-0.233065,-0.685311
1,-0.257085,-0.209478,-0.243918,-0.176886,-0.233273,-0.684595
2,-0.259745,-0.210622,-0.243918,-0.175468,-0.233080,-0.683730
3,-0.261456,-0.211964,-0.243918,-0.176920,-0.233272,-0.682847
4,-0.256055,-0.209433,-0.243918,-0.173319,-0.232665,-0.681784
...,...,...,...,...,...,...
24721,-0.266516,-0.215309,-0.265465,-0.193196,-0.235795,-0.445001
24722,-0.266041,-0.215530,-0.265465,-0.193196,-0.235795,-0.445148
24723,-0.266723,-0.215654,-0.265465,-0.193196,-0.235795,-0.445173
24724,-0.266856,-0.215812,-0.265465,-0.193196,-0.235795,-0.445156


In [9]:
# Finding the best number for k using the Elbow method

# Create a list with the k values to try
k = list(range(1, 11))

# Create an empty list to store the inertia values
inertia = []

# Create a for loop to compute the inertia for k
for i in k:
    model = KMeans(n_clusters=i, random_state=0)
    model.fit(df_scaled_std)
    inertia.append(model.inertia_)

# Create a dict with the data to plot the elbow
elbow_data = {
    'k': k,
    'inertia': inertia
}

# Create a DataFrame with the elbow data
df_elbow = pd.DataFrame(elbow_data)

In [10]:
# Plot the elbow curve
df_elbow.hvplot.line(x='k', y='inertia', title='Elbow Curve', xticks=k)

In [11]:
# Applying the KMeans algo

# Initilize the KMeans model using 5 clusters
model = KMeans(n_clusters=3)

# Fit the model
model.fit(df_scaled_std)

# Predict clusters
token_segments = model.predict(df_scaled_std)

In [12]:
# Create a new column in the scaled DF with the predicted clusters
df_scaled_std['Token Segment'] = token_segments

In [13]:
# Adding the Token column to the scaled df
df = df.reset_index()
df_scaled_std['Token'] = df['Token']

# Adding the Date column to the scaled df
df_scaled_std['Date'] = df['Date']
df_scaled_std

Unnamed: 0,Active Addresses,Transaction Count,Current Supply,Market Cap,Real Market Cap,Velocity,Token Segment,Token,Date
0,-0.253292,-0.206561,-0.243918,-0.174987,-0.233065,-0.685311,0,1inch,2021-01-01
1,-0.257085,-0.209478,-0.243918,-0.176886,-0.233273,-0.684595,0,1inch,2021-01-02
2,-0.259745,-0.210622,-0.243918,-0.175468,-0.233080,-0.683730,0,1inch,2021-01-03
3,-0.261456,-0.211964,-0.243918,-0.176920,-0.233272,-0.682847,0,1inch,2021-01-04
4,-0.256055,-0.209433,-0.243918,-0.173319,-0.232665,-0.681784,0,1inch,2021-01-05
...,...,...,...,...,...,...,...,...,...
24721,-0.266516,-0.215309,-0.265465,-0.193196,-0.235795,-0.445001,0,zrx,2021-10-01
24722,-0.266041,-0.215530,-0.265465,-0.193196,-0.235795,-0.445148,0,zrx,2021-10-02
24723,-0.266723,-0.215654,-0.265465,-0.193196,-0.235795,-0.445173,0,zrx,2021-10-03
24724,-0.266856,-0.215812,-0.265465,-0.193196,-0.235795,-0.445156,0,zrx,2021-10-04


In [14]:
# Plot the scatter plot
df_scaled_std.hvplot.scatter(x='Active Addresses', y='Transaction Count', by='Token Segment')

### KMeans using MinMaxScaler

In [15]:
# Initialize the scaler
scaler = MinMaxScaler()

df_scaled_mm = scaler.fit_transform(df[['active_address_df_7d', '', 'Current Supply', 'Market Cap', 'Real Market Cap', 'Velocity']])

# Creating a DataFrame with the scaled data
df_scaled_mm = pd.DataFrame(df_scaled_mm, columns=['Active Addresses', 'Transaction Count', 'Current Supply', 'Market Cap', 'Real Market Cap', 'Velocity'])
df_scaled_mm

Unnamed: 0,Active Addresses,Transaction Count,Current Supply,Market Cap,Real Market Cap,Velocity
0,0.001647,0.000575,0.011413,0.001456,0.000269,0.002783
1,0.001273,0.000411,0.011413,0.001304,0.000249,0.002886
2,0.001010,0.000347,0.011413,0.001417,0.000268,0.003011
3,0.000841,0.000271,0.011413,0.001301,0.000249,0.003138
4,0.001374,0.000414,0.011413,0.001589,0.000308,0.003291
...,...,...,...,...,...,...
24721,0.000341,0.000083,0.007608,0.000000,0.000000,0.037355
24722,0.000388,0.000071,0.007608,0.000000,0.000000,0.037334
24723,0.000321,0.000064,0.007608,0.000000,0.000000,0.037330
24724,0.000307,0.000055,0.007608,0.000000,0.000000,0.037333


In [16]:
# Finding the best number for k using the Elbow method

# Create a list with the k values to try
k = list(range(1, 11))

# Create an empty list to store the inertia values
inertia = []

# Create a for loop to compute the inertia for k
for i in k:
    model = KMeans(n_clusters=i, random_state=0)
    model.fit(df_scaled_mm)
    inertia.append(model.inertia_)

# Create a dict with the data to plot the elbow
elbow_data = {
    'k': k,
    'inertia': inertia
}

# Create a DataFrame with the elbow data
df_elbow = pd.DataFrame(elbow_data)

In [17]:
# Plot the elbow curve
df_elbow.hvplot.line(x='k', y='inertia', title='Elbow Curve', xticks=k)

In [18]:
# Applying the KMeans algo

# Initilize the KMeans model using 5 clusters
model = KMeans(n_clusters=4)

# Fit the model
model.fit(df_scaled_mm)

# Predict clusters
token_segments = model.predict(df_scaled_mm)

In [19]:
# Create a new column in the scaled DF with the predicted clusters
df_scaled_mm['Token Segment'] = token_segments

In [48]:
# Adding the Token column to the scaled df
df_scaled_mm['Token'] = df['Token']

# Adding the Date column to the scaled df
df_scaled_mm['Date'] = df['Date']
df_scaled_mm

Unnamed: 0,Active Addresses,Transaction Count,Current Supply,Market Cap,Real Market Cap,Velocity,Token Segment,Token,Date
0,0.001647,0.000575,0.011413,0.001456,0.000269,0.002783,0,1inch,2021-01-01
1,0.001273,0.000411,0.011413,0.001304,0.000249,0.002886,0,1inch,2021-01-02
2,0.001010,0.000347,0.011413,0.001417,0.000268,0.003011,0,1inch,2021-01-03
3,0.000841,0.000271,0.011413,0.001301,0.000249,0.003138,0,1inch,2021-01-04
4,0.001374,0.000414,0.011413,0.001589,0.000308,0.003291,0,1inch,2021-01-05
...,...,...,...,...,...,...,...,...,...
24721,0.000341,0.000083,0.007608,0.000000,0.000000,0.037355,0,zrx,2021-10-01
24722,0.000388,0.000071,0.007608,0.000000,0.000000,0.037334,0,zrx,2021-10-02
24723,0.000321,0.000064,0.007608,0.000000,0.000000,0.037330,0,zrx,2021-10-03
24724,0.000307,0.000055,0.007608,0.000000,0.000000,0.037333,0,zrx,2021-10-04


In [None]:
df_scaled_mm.corr

In [22]:
# Plot the scatter plot
df_scaled_mm.hvplot.scatter(x='Market Cap', y='Active Addresses', c='Token', by='Token Segment')

In [45]:
cluster_0 = df_scaled_mm[df_scaled_mm['Token Segment'] == 0]

In [46]:
cluster_0.Token.unique()

array(['1inch', 'aave', 'alpha', 'ant', 'bal', 'bat', 'bch', 'bsv', 'btg',
       'busd', 'comp', 'crv', 'cvc', 'dash', 'dcr', 'dgb', 'dgx', 'dot',
       'drgn', 'elf', 'eng', 'etc', 'eth', 'ethos', 'ftt', 'fun', 'gno',
       'gnt', 'gusd', 'hbtc', 'hedg', 'ht', 'husd', 'knc', 'lend',
       'leo_eth', 'link', 'loom', 'lsk', 'ltc', 'maid', 'mana', 'mco',
       'mkr', 'neo', 'nxm', 'omg', 'pax', 'paxg', 'pay', 'poly', 'powr',
       'ppt', 'qash', 'ren', 'renbtc', 'rep', 'snt', 'snx', 'srm',
       'sushi', 'swrv', 'uma', 'uni', 'usdk', 'usdt', 'usdt_omni',
       'usdt_trx', 'vtc', 'wbtc', 'wnxm', 'wtc', 'xaut', 'xtz', 'xvg',
       'yfi', 'zrx'], dtype=object)

In [44]:
cluster_3.hvplot.scatter(x='Market Cap', y='Active Addresses', c='Token', by='Token Segment')