# Clustering Crypto

In [65]:
# Initial imports
import pandas as pd
import hvplot.pandas
import numpy as np
from pathlib import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


### Deliverable 1: Preprocessing the Data for PCA

In [66]:
# Load the crypto_data.csv dataset.
# The path is relative to the notebook's working directory.
file_path = "./Resources/crypto_data.csv"
crypto_df = pd.read_csv(file_path)
# Preview the first five rows.
crypto_df.head()


Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [67]:
# Count the non-null entries in each column of the raw data.
crypto_df.count()

Unnamed: 0         1252
CoinName           1252
Algorithm          1252
IsTrading          1252
ProofType          1252
TotalCoinsMined     744
TotalCoinSupply    1252
dtype: int64

In [68]:
# Check the data type pandas inferred for each column.
# NOTE(review): in the output shown for this cell, TotalCoinSupply is `object`
# rather than a numeric dtype.
crypto_df.dtypes

Unnamed: 0          object
CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [69]:
# Count the fully duplicated rows and report the total.
duplicate_count = crypto_df.duplicated().sum()
print(f"Duplicate entries: {duplicate_count}")

Duplicate entries: 0


In [70]:
# Keep the values of the "Unnamed: 0" column as a Series for later reference.
unnamed = crypto_df.loc[:, "Unnamed: 0"]
unnamed

0         42
1        365
2        404
3        611
4        808
        ... 
1247     XBC
1248    DVTC
1249    GIOT
1250    OPSC
1251    PUNK
Name: Unnamed: 0, Length: 1252, dtype: object

In [71]:
# Promote the "Unnamed: 0" column to the index and give the index an empty name.
crypto_df = crypto_df.set_index("Unnamed: 0").rename_axis("")
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
,,,,,,
42.0,42 Coin,Scrypt,True,PoW/PoS,41.99995,42.0
365.0,365Coin,X11,True,PoW/PoS,,2300000000.0
404.0,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000.0
611.0,SixEleven,SHA-256,True,PoW,,611000.0
808.0,808,SHA-256,True,PoW/PoS,0.0,0.0


In [72]:
# Keep only the cryptocurrencies that are being traded.
traded_crypto = crypto_df["IsTrading"].eq(True)
crypto_df = crypto_df[traded_crypto]
crypto_df.head(2)


Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
,,,,,,
42.0,42 Coin,Scrypt,True,PoW/PoS,41.999954,42.0
365.0,365Coin,X11,True,PoW/PoS,,2300000000.0


In [73]:
# Inspect how many coins use each algorithm.
# NOTE(review): this cell only counts values; it does not filter any rows.
crypto_df["Algorithm"].value_counts()

Scrypt                   394
X11                      182
SHA-256                  121
X13                       54
PoS                       42
                        ... 
VeChainThor Authority      1
Ouroboros                  1
POS 2.0                    1
Proof-of-BibleHash         1
TRC10                      1
Name: Algorithm, Length: 89, dtype: int64

In [74]:
# Remove the "IsTrading" column.
# Reassign instead of using inplace=True: crypto_df was produced by a boolean
# filter, and mutating such a frame in place can raise SettingWithCopyWarning.
crypto_df = crypto_df.drop(columns=["IsTrading"])
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
,,,,,
42.0,42 Coin,Scrypt,PoW/PoS,41.99995,42.0
365.0,365Coin,X11,PoW/PoS,,2300000000.0
404.0,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000.0
611.0,SixEleven,SHA-256,PoW,,611000.0
808.0,808,SHA-256,PoW/PoS,0.0,0.0


In [75]:
# Report how many null values each column contains.
null_counts = crypto_df.isnull().sum()
for column in null_counts.index:
    print(f"Column {column} has {null_counts[column]} null values")

Column CoinName has 0 null values
Column Algorithm has 0 null values
Column ProofType has 0 null values
Column TotalCoinsMined has 459 null values
Column TotalCoinSupply has 0 null values


In [76]:
# Remove rows that have at least one null value.
# Reassign instead of using inplace=True so the previously filtered frame is
# never mutated in place (which can raise SettingWithCopyWarning).
crypto_df = crypto_df.dropna()
crypto_df.head(2)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
,,,,,
42.0,42 Coin,Scrypt,PoW/PoS,41.99995,42.0
404.0,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000.0


In [77]:
# Re-check the per-column null counts after dropping incomplete rows.
for column, null_total in crypto_df.isnull().sum().items():
    print(f"Column {column} has {null_total} null values")

Column CoinName has 0 null values
Column Algorithm has 0 null values
Column ProofType has 0 null values
Column TotalCoinsMined has 0 null values
Column TotalCoinSupply has 0 null values


In [78]:
# Check the row counts per column after removing rows with null values.
crypto_df.count()

CoinName           685
Algorithm          685
ProofType          685
TotalCoinsMined    685
TotalCoinSupply    685
dtype: int64

In [79]:
# Keep only the rows whose TotalCoinsMined is strictly positive.
mined_coins = crypto_df["TotalCoinsMined"].gt(0)
crypto_df = crypto_df[mined_coins]
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
,,,,,
42,42 Coin,Scrypt,PoW/PoS,41.99995,42.0
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000.0
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359.0
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000.0
ETH,Ethereum,Ethash,PoW,107684200.0,0.0


In [80]:
# Check the row counts per column after keeping only coins that have been mined.
crypto_df.count()

CoinName           532
Algorithm          532
ProofType          532
TotalCoinsMined    532
TotalCoinSupply    532
dtype: int64

In [81]:
# Build a one-column DataFrame holding only the cryptocurrency names.
# It keeps crypto_df's index, so the names can be matched back to rows later.
crypto_names_df = crypto_df["CoinName"].to_frame()
crypto_names_df.head()

Unnamed: 0,CoinName
,
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum


In [82]:
# Drop the 'CoinName' column since it is not used by the clustering algorithm.
# Reassign instead of using inplace=True: crypto_df is the result of boolean
# filtering, and mutating it in place can raise SettingWithCopyWarning.
crypto_df = crypto_df.drop(columns=["CoinName"])
crypto_df.head(2)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
,,,,
42.0,Scrypt,PoW/PoS,41.99995,42.0
404.0,Scrypt,PoW/PoS,1055185000.0,532000000.0


In [83]:
# Use get_dummies() to create indicator variables for the text features.
# TotalCoinSupply is loaded from the CSV with object dtype (strings), so cast it
# to float first; otherwise the feature matrix carries a non-numeric column and
# relies on the scaler to convert it implicitly.
features_df = crypto_df.astype({"TotalCoinSupply": "float64"})
X = pd.get_dummies(features_df, columns=["Algorithm", "ProofType"])
X

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
,,,,,,,,,,,,,,,,,,,,,
42,4.199995e+01,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1.055185e+09,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,2.927942e+10,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,1.792718e+07,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,1.076842e+08,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZEPH,2.000000e+09,2000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GAP,1.493105e+07,250000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BDX,9.802226e+08,1400222610,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [84]:
# Standardize the feature matrix with StandardScaler().
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(X_scaled)

[[-0.11710817 -0.1528703  -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.09396955 -0.145009   -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [ 0.52494561  4.48942416 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 ...
 [-0.09561336 -0.13217937 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.11694817 -0.15255998 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.11710536 -0.15285552 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]]


### Deliverable 2: Reducing Data Dimensions Using PCA

In [85]:
# Use PCA to reduce the scaled features to three principal components.

# Initialize the PCA model.
pca = PCA(n_components=3)

# Fit on the scaled crypto features and project them onto the three components.
X_pca = pca.fit_transform(X_scaled)
X_pca

array([[-0.36727755,  0.90967967, -0.45116274],
       [-0.35064221,  0.90988432, -0.45160978],
       [ 2.32653259,  1.64013976, -0.57723029],
       ...,
       [ 0.35220203, -2.17622325,  0.34836076],
       [-0.13474003, -1.9023281 ,  0.32513615],
       [-0.33280132,  0.78473468, -0.23221844]])

In [86]:
# Wrap the three principal components in a DataFrame that reuses X's index.
pc_columns = ["PC 1", "PC 2", "PC 3"]
crypto_pca_df = pd.DataFrame(X_pca, index=X.index, columns=pc_columns)

crypto_pca_df.head(2)

Unnamed: 0,PC 1,PC 2,PC 3
,,,
42.0,-0.367278,0.90968,-0.451163
404.0,-0.350642,0.909884,-0.45161


In [103]:
# Fraction of the total variance captured by each principal component.
# NOTE(review): the values shown in this cell's output sum to only about 0.07,
# so the three components retain a small share of the scaled features' variance.
pca.explained_variance_ratio_

array([0.02792561, 0.02136228, 0.02048144])

### Deliverable 3: Clustering Cryptocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [87]:
# Prepare the inputs for the elbow curve used to choose K.
# Empty list that will collect one inertia value per candidate k.
inertia = list()

# Candidate cluster counts: 1 through 10.
k = [*range(1, 11)]
k

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [88]:
# Fit one K-Means model per candidate k and record its inertia.
# The list is reset here (rather than relying on the list created in another
# cell) so re-running this cell cannot leave `inertia` longer than `k`, which
# would make the DataFrame construction below fail.
inertia = []
for i in k:
    # n_init is pinned because its default differs between scikit-learn releases.
    km = KMeans(n_clusters=i, n_init=10, random_state=0)
    km.fit(crypto_pca_df)
    inertia.append(km.inertia_)

# Create the Elbow Curve using hvplot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

Running K-Means with `k=4`

In [89]:
# Initialize the K-Means model with k = 4, the value read off the elbow curve.
# n_init is pinned to 10 because its default differs between scikit-learn
# releases; fixing it keeps the clustering reproducible across versions.
model = KMeans(n_clusters=4, n_init=10, random_state=0)

# Fit the model on the principal components.
model.fit(crypto_pca_df)

# Predict the cluster label of every cryptocurrency.
predictions = model.predict(crypto_pca_df)

predictions

array([0, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 0, 2, 0, 2, 2, 0, 0, 2, 2,
       2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0,
       2, 2, 2, 2, 2, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 0, 2, 0, 0, 0, 2,
       2, 2, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 2, 0, 2, 0, 0, 2, 2, 2, 2, 0,
       0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 2, 2, 0, 0, 2, 0, 0, 2, 0, 2, 0, 2,
       0, 2, 0, 0, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0,
       2, 0, 2, 2, 0, 2, 0, 2, 0, 0, 2, 2, 0, 2, 2, 0, 0, 2, 0, 2, 0, 0,
       0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
       0, 2, 0, 2, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 0, 2, 0, 2, 0, 0, 0, 0,
       2, 0, 0, 0, 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
       0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 2, 0, 2, 0,
       0, 2, 0, 2, 2, 0, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0,
       0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 0, 2, 0, 2, 2, 2, 2, 0, 2, 0, 0, 2,
       0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 0, 2, 0,

In [90]:
# Create a new DataFrame that combines the cryptocurrency features with their
# principal components.
# Concatenate crypto_df and crypto_pca_df column-wise.
clustered_df = pd.concat(objs=[crypto_df, crypto_pca_df], axis="columns")
clustered_df.head(2)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3
,,,,,,,
42.0,Scrypt,PoW/PoS,41.99995,42.0,-0.367278,0.90968,-0.451163
404.0,Scrypt,PoW/PoS,1055185000.0,532000000.0,-0.350642,0.909884,-0.45161


In [91]:
# Add a "CoinName" column to clustered_df holding the cryptocurrency names.
coin_names = crypto_names_df["CoinName"]
clustered_df["CoinName"] = coin_names
clustered_df.head(2)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName
,,,,,,,,
42.0,Scrypt,PoW/PoS,41.99995,42.0,-0.367278,0.90968,-0.451163,42 Coin
404.0,Scrypt,PoW/PoS,1055185000.0,532000000.0,-0.350642,0.909884,-0.45161,404Coin


In [92]:
# Add a "Class" column to clustered_df holding the K-Means predictions.
clustered_df = clustered_df.assign(Class=predictions)
clustered_df.head(2)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
,,,,,,,,,
42.0,Scrypt,PoW/PoS,41.99995,42.0,-0.367278,0.90968,-0.451163,42 Coin,0.0
404.0,Scrypt,PoW/PoS,1055185000.0,532000000.0,-0.350642,0.909884,-0.45161,404Coin,0.0


In [93]:
# Print the shape of clustered_df as (rows, columns), then preview two rows.
print(clustered_df.shape)
clustered_df.head(2)

(532, 9)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
,,,,,,,,,
42.0,Scrypt,PoW/PoS,41.99995,42.0,-0.367278,0.90968,-0.451163,42 Coin,0.0
404.0,Scrypt,PoW/PoS,1055185000.0,532000000.0,-0.350642,0.909884,-0.45161,404Coin,0.0


In [108]:
# Inspect the coins assigned to class 0.
class_0 = clustered_df["Class"].eq(0)
class_0_df = clustered_df[class_0]
class_0_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
,,,,,,,,,
42,Scrypt,PoW/PoS,41.99995,42.0,-0.367278,0.90968,-0.451163,42 Coin,0.0
404,Scrypt,PoW/PoS,1055185000.0,532000000.0,-0.350642,0.909884,-0.45161,404Coin,0.0
1337,X13,PoW/PoS,29279420000.0,314159265359.0,2.326533,1.64014,-0.57723,EliteCoin,0.0
DASH,X11,PoW/PoS,9031294.0,22000000.0,-0.373728,1.128264,-0.376993,Dash,0.0
BTS,SHA-512,PoS,2741570000.0,3600570502.0,-0.194502,2.274995,-0.179139,Bitshares,0.0


In [106]:
# Inspect the coins assigned to class 1.
class_1 = clustered_df["Class"].eq(1)
class_1_df = clustered_df[class_1]
class_1_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
,,,,,,,,,
BTT,TRC10,DPoS,989988700000.0,990000000000.0,34.035509,1.711075,-1.249818,BitTorrent,1.0


In [109]:
# Inspect the coins assigned to class 2.
class_2 = clustered_df["Class"].eq(2)
class_2_df = clustered_df[class_2]
class_2_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
,,,,,,,,,
BTC,SHA-256,PoW,17927180.0,21000000.0,-0.117812,-1.207776,0.129481,Bitcoin,2.0
ETH,Ethash,PoW,107684200.0,0.0,-0.143967,-1.995649,0.352901,Ethereum,2.0
LTC,Scrypt,PoW,63039240.0,84000000.0,-0.19109,-1.145372,0.008838,Litecoin,2.0
XMR,CryptoNight-V7,PoW,17201140.0,0.0,-0.116445,-2.192023,0.356003,Monero,2.0
ETC,Ethash,PoW,113359700.0,210000000.0,-0.14241,-1.995732,0.352882,Ethereum Classic,2.0


In [110]:
# Inspect the coins assigned to class 3.
class_3 = clustered_df["Class"].eq(3)
class_3_df = clustered_df[class_3]
class_3_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
,,,,,,,,,
VET,VeChainThor Authority,Proof of Authority,55454730000.0,86712634466.0,4.23924,2.83646,9.241844,Vechain,3.0
BBP,Proof-of-BibleHash,POBh,1772092000.0,5200000000.0,-0.309159,1.492931,25.658626,BiblePay,3.0
WAVES,Leased POS,LPoS,100000000.0,100000000.0,-0.342358,4.31721,15.201353,Waves,3.0


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [94]:
# Create a 3D scatter of the principal components, marking points by cluster.
scatter_options = dict(
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="Class",
    symbol="Class",
    hover_name="CoinName",
    hover_data=["Algorithm"],
    width=800,
)
fig = px.scatter_3d(clustered_df, **scatter_options)

# Position the legend at x=0, y=1.
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

In [95]:
# Show the tradable cryptocurrencies and their assigned class as a table.
table_columns = [
    "CoinName",
    "Algorithm",
    "ProofType",
    "TotalCoinSupply",
    "TotalCoinsMined",
    "Class",
]
clustered_df.hvplot.table(columns=table_columns)

In [96]:
# Print the total number of tradable cryptocurrencies (one row per coin).
print(clustered_df.shape[0])

532


In [97]:
# Rescale the supply and mined-coin columns with MinMaxScaler for the scatter plot.
mms = MinMaxScaler()
coin_counts = clustered_df[["TotalCoinSupply", "TotalCoinsMined"]]
clustered_df_scaled = mms.fit_transform(coin_counts)
clustered_df_scaled

array([[4.20000000e-11, 0.00000000e+00],
       [5.32000000e-04, 1.06585544e-03],
       [3.14159265e-01, 2.95755135e-02],
       ...,
       [1.40022261e-03, 9.90135079e-04],
       [2.10000000e-05, 7.37028150e-06],
       [1.00000000e-06, 1.29582282e-07]])

In [98]:
# Wrap the scaled values in a DataFrame that reuses clustered_df's index.
plot_df = pd.DataFrame(
    clustered_df_scaled,
    index=clustered_df.index,
    columns=["TotalCoinSupply", "TotalCoinsMined"],
)

plot_df

Unnamed: 0,TotalCoinSupply,TotalCoinsMined
,,
42,4.200000e-11,0.000000e+00
404,5.320000e-04,1.065855e-03
1337,3.141593e-01,2.957551e-02
BTC,2.100000e-05,1.810842e-05
ETH,0.000000e+00,1.087731e-04
...,...,...
ZEPH,2.000000e-03,2.020225e-03
GAP,2.500000e-04,1.508199e-05
BDX,1.400223e-03,9.901351e-04


In [99]:
# Copy the "CoinName" column from clustered_df into the plotting DataFrame.
plot_df = plot_df.assign(CoinName=clustered_df["CoinName"])
plot_df.head(2)


Unnamed: 0,TotalCoinSupply,TotalCoinsMined,CoinName
,,,
42.0,4.2e-11,0.0,42 Coin
404.0,0.000532,0.001066,404Coin


In [100]:
# Copy the "Class" column from clustered_df into the plotting DataFrame.
plot_df = plot_df.assign(Class=clustered_df["Class"])
plot_df.head(2)

Unnamed: 0,TotalCoinSupply,TotalCoinsMined,CoinName,Class
,,,,
42.0,4.2e-11,0.0,42 Coin,0.0
404.0,0.000532,0.001066,404Coin,0.0


In [101]:
# Scatter the scaled mined-coin totals against the scaled supply, grouped by class.
scatter_kwargs = {
    "x": "TotalCoinsMined",
    "y": "TotalCoinSupply",
    "by": "Class",
    "hover_cols": ["CoinName"],
}
plot_df.hvplot.scatter(**scatter_kwargs)