# Clustering Crypto

In [122]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import os

### Fetching Cryptocurrency Data

In [123]:

# CryptoCompare coin-list endpoint; kept for reference only, since the data
# below is read from a local CSV file rather than fetched from this URL.
url = "https://min-api.cryptocompare.com/data/all/coinlist"

# Load the coin data, using the first CSV column as the row index
crypto_df = pd.read_csv(
    "crypto_data.csv",
    index_col=[0],
)
crypto_df.head(n=10)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42.0
365,365Coin,X11,True,PoW/PoS,,2300000000.0
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000.0
611,SixEleven,SHA-256,True,PoW,,611000.0
808,808,SHA-256,True,PoW/PoS,0.0,0.0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159000000.0
2015,2015 coin,X11,True,PoW/PoS,,0.0
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000.0
ETH,Ethereum,Ethash,True,PoW,107684200.0,0.0
LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000.0


### Data Preprocessing

In [124]:
# Retain only the cryptocurrencies that are currently being traded
crypto_df = crypto_df.loc[crypto_df["IsTrading"].eq(True)]


In [125]:
# Retain only the cryptocurrencies whose algorithm is not the 'N/A' placeholder
crypto_df = crypto_df.loc[crypto_df["Algorithm"].ne("N/A")]


In [126]:
# Remove the "IsTrading" column.
# Rebind the name instead of using inplace=True: crypto_df is the result of
# boolean filtering, and mutating such a frame in place can raise
# SettingWithCopyWarning and makes the cell harder to reason about.
crypto_df = crypto_df.drop(columns=["IsTrading"])


In [127]:
# Preview the first ten rows after the filtering steps above
crypto_df.head(n=10)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42.0
365,365Coin,X11,PoW/PoS,,2300000000.0
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000.0
611,SixEleven,SHA-256,PoW,,611000.0
808,808,SHA-256,PoW/PoS,0.0,0.0
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159000000.0
2015,2015 coin,X11,PoW/PoS,,0.0
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000.0
ETH,Ethereum,Ethash,PoW,107684200.0,0.0
LTC,Litecoin,Scrypt,PoW,63039240.0,84000000.0


In [128]:
# Discard every row that has a null in any column
crypto_df = crypto_df.dropna(axis=0, how="any")


In [129]:
# Keep only the cryptocurrencies that have a positive number of mined coins
crypto_df = crypto_df.loc[crypto_df["TotalCoinsMined"].gt(0)]


In [130]:
# Dropping rows where there are 'N/A' text values.
# Indexing a DataFrame with a boolean DataFrame only replaces the matching
# cells with NaN and keeps the row, so the element-wise comparison is reduced
# to one boolean per row (True when no column holds 'N/A') before filtering.
crypto_df = crypto_df[(crypto_df != 'N/A').all(axis=1)]


In [131]:
# Set the coin names aside before the column is dropped from crypto_df
coin_name = crypto_df.loc[:, "CoinName"]


In [132]:
# Removing the cryptocurrency name since it's not going to be used by the clustering algorithm.
# Rebind the name instead of using inplace=True: crypto_df comes from boolean
# filtering, and mutating such a frame in place can raise SettingWithCopyWarning.
crypto_df = crypto_df.drop(columns="CoinName")


In [133]:
# Show a bounded preview rather than dumping the entire frame
crypto_df.head(10)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,4.199995e+01,42
404,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,X13,PoW/PoS,2.927942e+10,3.14159E+11
BTC,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethash,PoW,1.076842e+08,0
LTC,Scrypt,PoW,6.303924e+07,84000000
DASH,X11,PoW/PoS,9.031294e+06,22000000
XMR,CryptoNight-V7,PoW,1.720114e+07,0
ETC,Ethash,PoW,1.133597e+08,210000000
ZEC,Equihash,PoW,7.383056e+06,21000000


In [134]:
# One-hot encode the two text features so every column is numeric
crypto_binary_encoded = pd.get_dummies(
    data=crypto_df,
    columns=["Algorithm", "ProofType"],
)


In [135]:
# Preview the first ten rows of the one-hot encoded frame
crypto_binary_encoded.head(n=10)

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,41.99995,42.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159000000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,107684200.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LTC,63039240.0,84000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DASH,9031294.0,22000000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
XMR,17201140.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETC,113359700.0,210000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZEC,7383056.0,21000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [136]:
# Standardize every encoded feature: fit the scaler on the data, then
# transform that same data with the fitted scaler
crypto_scaled = (
    StandardScaler()
    .fit(crypto_binary_encoded)
    .transform(crypto_binary_encoded)
)
crypto_scaled

array([[-0.11710816, -0.15287029, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.09396955, -0.14500899, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [ 0.52494542,  4.48942055, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       ...,
       [-0.09561336, -0.13217936, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11694816, -0.15255997, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11710535, -0.15285551, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ]])

### Reducing Dimensions Using PCA

In [154]:
# Using PCA to reduce the scaled features to 3 principal components.
# Initialize the PCA model. random_state is fixed because PCA may select the
# randomized SVD solver for data of this size; the seed keeps the components
# reproducible across runs, matching the seeded KMeans calls in this notebook.
pca = PCA(n_components=3, random_state=0)
# Fit the model and project the scaled data onto the principal components
crypto_pca = pca.fit_transform(crypto_scaled)
crypto_pca

array([[-0.33649458,  1.05422438, -0.53995742],
       [-0.31984023,  1.05429085, -0.5401735 ],
       [ 2.30087501,  1.61282308, -0.55858689],
       ...,
       [ 0.33131873, -2.29037117,  0.36249337],
       [-0.17091651, -2.12649261,  0.49783268],
       [-0.29399686,  0.82140696, -0.21633127]])

In [160]:
# Wrap the principal-component array in a DataFrame that shares crypto_df's index
pcs_df = pd.DataFrame(
    crypto_pca,
    index=crypto_df.index,
    columns=["PC1", "PC2", "PC3"],
)
pcs_df.head(n=10)


Unnamed: 0,PC1,PC2,PC3
42,-0.336495,1.054224,-0.539957
404,-0.31984,1.054291,-0.540173
1337,2.300875,1.612823,-0.558587
BTC,-0.142183,-1.338567,0.187152
ETH,-0.153233,-2.038996,0.367611
LTC,-0.166668,-1.090592,-0.02032
DASH,-0.390902,1.209735,-0.499443
XMR,-0.146896,-2.207955,0.33875
ETC,-0.151675,-2.039102,0.367602
ZEC,-0.170915,-2.126493,0.497833


In [161]:
# Fraction of the total variance captured by each fitted principal component.
# NOTE(review): in the recorded output the three ratios sum to roughly 0.07,
# so the components retain only a small share of the variance of the scaled data.
pca.explained_variance_ratio_

array([0.02793141, 0.02140709, 0.02048791])

### Clustering Cryptocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [162]:
# Candidate cluster counts to evaluate: 1 through 10
k = [*range(1, 11)]

# Fit one K-Means model per candidate k and record its inertia
inertia = []
for i in k:
    km = KMeans(n_clusters=i, random_state=0).fit(pcs_df)
    inertia.append(km.inertia_)

# Draw the Elbow Curve (inertia against k) with hvPlot
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(
    x="k",
    y="inertia",
    xticks=k,
    title="Elbow Curve",
)


Running K-Means with `k=4`

In [168]:
# Build the K-Means model with 4 clusters and fit it on the principal components
model = KMeans(n_clusters=4, random_state=0).fit(pcs_df)
# Cluster assignment for every row of the principal-component data
predictions = model.predict(pcs_df)

# Assemble the combined frame column by column: original features first,
# then the principal components, then the coin name and the cluster label
feature_cols = ["Algorithm", "ProofType", "TotalCoinsMined", "TotalCoinSupply"]
component_cols = ["PC1", "PC2", "PC3"]
cluster_columns = {name: crypto_df[name] for name in feature_cols}
cluster_columns.update({name: pcs_df[name] for name in component_cols})
cluster_columns["CoinName"] = coin_name
cluster_columns["Class"] = model.labels_

clustered_df = pd.DataFrame(cluster_columns, index=crypto_df.index)
clustered_df.head(n=10)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,CoinName,Class
42,Scrypt,PoW/PoS,41.99995,42.0,-0.336495,1.054224,-0.539957,42 Coin,0
404,Scrypt,PoW/PoS,1055185000.0,532000000.0,-0.31984,1.054291,-0.540173,404Coin,0
1337,X13,PoW/PoS,29279420000.0,314159000000.0,2.300875,1.612823,-0.558587,EliteCoin,0
BTC,SHA-256,PoW,17927180.0,21000000.0,-0.142183,-1.338567,0.187152,Bitcoin,3
ETH,Ethash,PoW,107684200.0,0.0,-0.153233,-2.038996,0.367611,Ethereum,3
LTC,Scrypt,PoW,63039240.0,84000000.0,-0.166668,-1.090592,-0.02032,Litecoin,3
DASH,X11,PoW/PoS,9031294.0,22000000.0,-0.390902,1.209735,-0.499443,Dash,0
XMR,CryptoNight-V7,PoW,17201140.0,0.0,-0.146896,-2.207955,0.33875,Monero,3
ETC,Ethash,PoW,113359700.0,210000000.0,-0.151675,-2.039102,0.367602,Ethereum Classic,3
ZEC,Equihash,PoW,7383056.0,21000000.0,-0.170915,-2.126493,0.497833,ZCash,3


### Visualizing Results

#### 3D-Scatter with Clusters

In [180]:
# 3D scatter of the three principal components; each point is coloured and
# shaped by its cluster, and hovering shows the coin name and algorithm
fig = px.scatter_3d(
    data_frame=clustered_df,
    x="PC1", y="PC2", z="PC3",
    color="Class", symbol="Class",
    hover_name="CoinName", hover_data=["Algorithm"],
)
# Anchor the legend at the top-left corner of the figure
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()


#### Table of Tradable Cryptocurrencies

In [173]:
# Columns to show in the table of tradable cryptocurrencies
columns = [
    "CoinName",
    "Algorithm",
    "ProofType",
    "TotalCoinSupply",
    "TotalCoinsMined",
    "Class",
]
clustered_df.hvplot.table(columns=columns)


#### Scatter Plot with Tradable Cryptocurrencies

In [176]:
# Scaling data to create the scatter plot: express both coin counts in units
# of 100,000,000 coins.
# The scaled values are derived from the unscaled columns of crypto_df (the
# frame clustered_df was built from, sharing its index) instead of dividing
# clustered_df's own columns, so re-running this cell gives the same result
# rather than compounding the division.
clustered_df = clustered_df.assign(
    TotalCoinSupply=crypto_df["TotalCoinSupply"].astype(float) / 100000000,
    TotalCoinsMined=crypto_df["TotalCoinsMined"].astype(float) / 100000000,
)

In [179]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
clustered_df.hvplot(
    kind="scatter",
    x= "TotalCoinsMined",
    y= "TotalCoinSupply",
    hover_cols=["CoinName"])
