# Clustering Crypto

In [87]:
# Initial imports
import pandas as pd
import hvplot.pandas
from pathlib import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Data Preprocessing

In [88]:
# Loade the cryptocurrencies data
file_path = Path("Resources/crypto_data.csv")
crypto_df = pd.read_csv(file_path, index_col=0)
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
2015,2015 coin,X11,True,PoW/PoS,,0
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,True,PoW,107684200.0,0
LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [89]:
# Keep only cryptocurrencies that are on trading
crypto_df1 = crypto_df.loc[crypto_df["IsTrading"] == True]
crypto_df1.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [90]:
# Keep only cryptocurrencies with a working algorithm
crypto_df1 = crypto_df.loc[crypto_df["Algorithm"] != "N/A"]

In [91]:
# Remove the "IsTrading" column
crypto_df1 = crypto_df.drop(columns = "IsTrading")

In [92]:
# Remove rows with at least 1 null value
crypto_df1.dropna(inplace = True)

In [93]:
# Remove rows with cryptocurrencies without coins mined
crypto_df1.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
808,808,SHA-256,PoW/PoS,0.0,0
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000


In [94]:
# Fetch the cryptocurrencies names prior to drop them from crypto_df
CoinName = pd.DataFrame(crypto_df["CoinName"])
CoinName.head()

Unnamed: 0,CoinName
42,42 Coin
365,365Coin
404,404Coin
611,SixEleven
808,808


In [95]:
# Remove the cryptocurrency name since it's not going to be used on the clustering algorithm
crypto_df1.drop(columns = ["CoinName"], inplace = True)
crypto_df1.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
808,SHA-256,PoW/PoS,0.0,0
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000


In [98]:
# Create dummies variables for text features
X = pd.get_dummies(data = crypto_df1, columns = ["Algorithm", "ProofType"])
X[:5]

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
808,0.0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [99]:
# Standardize data
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

### Reducing Dimensions Using PCA

In [109]:
# Use PCA to reduce dimension to 3 principal components
# crypto_scaled = StandardScaler().fit_transform(crypto_df)
# print(crypto_scaled[0:5])

pca = PCA(n_components=3)

crypto_pca = pca.fit_transform(X)

# Create a DataFrame with the principal components data
crypto_df_pca = pd.DataFrame(
    data=crypto_pca, columns=["PC1", "PC2", "PC3"], index = crypto_df1.index
)

crypto_df_pca.head()

Unnamed: 0,PC1,PC2,PC3
42,-0.169403,1.256391,-0.497339
404,-0.154185,1.254741,-0.497798
808,-0.171439,0.860547,-0.244446
1337,0.320564,1.962952,-0.484088
BTC,-0.248563,-1.44389,0.173981


### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [110]:
inertia = []
k = list(range(1, 11))

# Looking for the best k
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(crypto_df_pca)
    inertia.append(km.inertia_)

# Define a DataFrame to plot the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)




Running K-Means with `k=4`

In [112]:
# Initialize the K-Means model
model = KMeans(n_clusters=4, random_state=5)
# Fit the model
model.fit(crypto_df_pca)
# Predict clusters
predictions = model.predict(crypto_df_pca)
print(predictions)
# Create a new DataFrame including predicted clusters and cryptocurrencies features
crypto_df_pca["class"] = model.labels_
crypto_df_pca.head()

[0 0 0 0 1 1 1 0 1 1 1 0 1 0 0 0 1 0 1 1 0 0 0 1 0 1 0 1 1 1 1 1 0 1 1 1 1
 0 1 0 1 0 1 0 0 1 0 0 0 1 0 1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 0 0 0 1 0
 1 0 1 0 0 1 1 1 1 0 0 1 1 0 1 0 0 0 1 1 1 1 0 0 0 0 0 1 1 1 0 0 1 1 1 0 0
 1 0 1 0 0 1 1 1 1 1 0 0 0 1 1 0 1 1 0 0 0 1 0 0 0 1 1 0 0 1 0 1 0 0 1 0 1
 0 1 0 1 0 0 0 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 0 0 1 0 1 0 1 1 0 1
 0 0 0 1 1 0 1 0 0 1 0 0 0 1 1 0 1 1 1 0 0 1 1 0 0 1 0 1 0 0 1 1 0 1 1 1 1
 1 0 1 1 0 1 0 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0
 0 1 0 1 0 0 1 0 1 0 0 1 0 1 0 0 0 1 0 1 1 1 0 0 1 0 0 0 1 1 1 0 0 0 1 1 1
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 0 1 0
 1 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 1 1 0 1 0
 0 0 0 0 1 0 1 0 1 1 1 1 0 1 0 0 1 0 1 0 1 1 0 1 0 1 1 1 0 1 0 1 0 1 0 0 1
 0 1 1 1 1 1 1 0 0 1 0 0 0 1 0 1 0 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 1 1
 1 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1 1 1 0 1 0 1 1 1 0 0 1 0 0 0 1 1 0 0 0 1 1
 0 1 1 0 0 0 0 1 1 1 1 0 

Unnamed: 0,PC1,PC2,PC3,class
42,-0.169403,1.256391,-0.497339,0
404,-0.154185,1.254741,-0.497798,0
808,-0.171439,0.860547,-0.244446,0
1337,0.320564,1.962952,-0.484088,0
BTC,-0.248563,-1.44389,0.173981,1


In [187]:
crypto_df_pca.head(25)

Unnamed: 0,PC1,PC2,PC3,class
42,-0.169403,1.256391,-0.497339,0
404,-0.154185,1.254741,-0.497798,0
808,-0.171439,0.860547,-0.244446,0
1337,0.320564,1.962952,-0.484088,0
BTC,-0.248563,-1.44389,0.173981,1
ETH,-0.32995,-2.168992,0.335163,1
LTC,-0.245876,-1.048116,-0.078932,1
DASH,-0.27076,1.57664,-0.406203,0
XMR,-0.351883,-2.207019,0.276158,1
ETC,-0.329868,-2.169001,0.335161,1


In [114]:
# include "Algorithm", "ProofType", "TotalCoinsMined", "TotalCoinSupply", "PC 1", "PC 2", "PC 3", "CoinName", "Class"
clustered_df = pd.concat([crypto_df1, crypto_df_pca], sort=False, axis = 1)
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,class
42,Scrypt,PoW/PoS,41.99995,42,-0.169403,1.256391,-0.497339,0
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.154185,1.254741,-0.497798,0
808,SHA-256,PoW/PoS,0.0,0,-0.171439,0.860547,-0.244446,0
1337,X13,PoW/PoS,29279420000.0,314159265359,0.320564,1.962952,-0.484088,0
BTC,SHA-256,PoW,17927180.0,21000000,-0.248563,-1.44389,0.173981,1


### Visualizing Results

#### 3D-Scatter with Clusters

In [150]:
# Create a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(
    clustered_df,
    x="PC1",
    y="PC2",
    z="PC3",
    color="class",
    symbol="class",
    width=800
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

#### Table of Tradable Cryptocurrencies

In [126]:
# Table with tradable cryptos
crypto_table = clustered_df
crypto_table.hvplot.table()

In [171]:
# Print the total number of tradable cryptocurrencies
print(crypto_table["Algorithm"].count())

744


#### Scatter Plot with Tradable Cryptocurrencies

In [174]:
# Scale data to create the scatter plot
mms = MinMaxScaler()
coin_scaled = mms.fit_transform(clustered_df[["TotalCoinsMined", "TotalCoinSupply"]])
print(coin_scaled[0:2])

[[5.94230127e-03 4.55364914e-16]
 [7.00182308e-03 5.76795558e-09]]


In [185]:
coin_df = pd.DataFrame(coin_scaled, columns = ["TotalCoinsMined", "TotalCoinSupply"], index = clustered_df.index)
coin_df["class"] = clustered_df["class"]
coin_df.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,class
42,0.005942,4.553649e-16,0
404,0.007002,5.767956e-09,0
808,0.005942,0.0,0
1337,0.035342,3.406122e-06,0
BTC,0.00596,2.276825e-10,1


In [186]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
coin_df.hvplot.scatter(x="TotalCoinsMined", y="TotalCoinSupply", by="class")