# Clustering Crypto

In [172]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


### Deliverable 1: Preprocessing the Data for PCA

In [173]:
# Load the crypto_data.csv dataset.
file_path = "crypto_data.csv"
df_crypto = pd.read_csv(file_path)
df_crypto.head(10)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
6,2015,2015 coin,X11,True,PoW/PoS,,0
7,BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,True,PoW,107684200.0,0
9,LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [174]:
df_crypto = df_crypto.set_index('Unnamed: 0')

In [175]:
# Keep all the cryptocurrencies that are being traded.
df_crypto.drop(df_crypto.loc[df_crypto['IsTrading']=='False'].index, inplace=True)
df_crypto.head()

Unnamed: 0_level_0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [176]:
# Keep all the cryptocurrencies that have a working algorithm.
df_crypto.drop(df_crypto.loc[df_crypto['TotalCoinsMined'].isnull()].index, inplace=True)
df_crypto.drop(df_crypto.loc[df_crypto['TotalCoinSupply']=='0'].index, inplace=True)
df_crypto.head()

Unnamed: 0_level_0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [177]:
# Remove the "IsTrading" column.
df_crypto = df_crypto.drop(columns=["IsTrading"],axis=1)
df_crypto.head()

Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
LTC,Litecoin,Scrypt,PoW,63039240.0,84000000


In [178]:
# Remove rows that have at least 1 null value.
for column in df_crypto.columns:
    print(f"Column {column} has {df_crypto[column].isnull().sum()} null values")
df_crypto = df_crypto.dropna()

Column CoinName has 0 null values
Column Algorithm has 0 null values
Column ProofType has 0 null values
Column TotalCoinsMined has 0 null values
Column TotalCoinSupply has 0 null values


In [179]:
# Keep the rows where coins are mined.
df_crypto["TotalCoinsMined"].isnull().sum()
df_crypto["TotalCoinsMined"].dropna()
df_crypto.head()

Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
LTC,Litecoin,Scrypt,PoW,63039240.0,84000000


In [180]:
# Create a new DataFrame that holds only the cryptocurrencies names.
df_coinname = df_crypto["CoinName"]
pd.DataFrame(df_coinname).head()

Unnamed: 0_level_0,CoinName
Unnamed: 0,Unnamed: 1_level_1
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
LTC,Litecoin


In [181]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
df_crypto = df_crypto.drop(columns=["CoinName"],axis=1)
df_crypto.head()

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
LTC,Scrypt,PoW,63039240.0,84000000


In [182]:
# Use get_dummies() to create variables for text features.
df_crypto_encoded = pd.get_dummies(df_crypto, columns=["Algorithm", "ProofType"])
df_crypto_encoded.head()

Unnamed: 0_level_0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
42,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LTC,63039240.0,84000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [183]:
# Standardize the data with StandardScaler().
data_scaler = StandardScaler()
crypto_data_scaled = data_scaler.fit_transform(df_crypto_encoded)
crypto_data_scaled

array([[-0.09548029, -0.03793466, -0.03793216, ..., -0.03793216,
        -0.03793216, -0.03793216],
       [-0.06880727, -0.03793451, -0.03793216, ..., -0.03793216,
        -0.03793216, -0.03793216],
       [ 0.64464654, -0.03784474, -0.03793216, ..., -0.03793216,
        -0.03793216, -0.03793216],
       ...,
       [-0.09529585, -0.03793466, -0.03793216, ..., -0.03793216,
        -0.03793216, -0.03793216],
       [-0.09547705, -0.03793466, -0.03793216, ..., -0.03793216,
        -0.03793216, -0.03793216],
       [-0.09493704, -0.03793463, -0.03793216, ..., -0.03793216,
        -0.03793216, -0.03793216]])

### Deliverable 2: Reducing Data Dimensions Using PCA

In [184]:
# Using PCA to reduce dimension to three principal components.
# Initialize PCA model
pca = PCA(n_components=3)

# Get three principal components for the data
crypto_pca = pca.fit_transform(crypto_data_scaled)
crypto_pca

array([[-0.11609831,  1.28533363, -0.51909794],
       [-0.10085186,  1.28329362, -0.51974801],
       [ 0.49230243,  1.85465085, -0.51696972],
       ...,
       [-0.46424308, -1.86484101,  0.27631254],
       [-0.11535029,  0.94278194, -0.24142982],
       [-0.11578778,  1.28529208, -0.51911118]])

In [185]:
# Create a DataFrame with the three principal components.
df_crypto_pca = pd.DataFrame(data = crypto_pca, columns = ["PC 1", "PC 2", "PC 3"], index = df_crypto.index)
df_crypto_pca.head()


Unnamed: 0_level_0,PC 1,PC 2,PC 3
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
42,-0.116098,1.285334,-0.519098
404,-0.100852,1.283294,-0.519748
1337,0.492302,1.854651,-0.51697
BTC,-0.197199,-1.509533,0.152186
LTC,-0.288493,-0.950524,-0.041919


### Deliverable 3: Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [186]:
# Create an elbow curve to find the best value for K.
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_crypto_pca)
    inertia.append(km.inertia_)
    
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")


Running K-Means with `k=4`

In [187]:
# Initialize the K-Means model.
def get_clusters(k, data):
    # Create a copy of the DataFrame
    data = data.copy()

    # Initialize the K-Means model
    model = KMeans(n_clusters=k, random_state=0)

    # Fit the model
    model.fit(data)

    # Predict clusters
    predictions = model.predict(data)

    # Create return DataFrame with predicted clusters
    data["class"] = model.labels_

    return data

# Fit the model and predict clusters
four_clusters = get_clusters(4, df_crypto_pca)
four_clusters.head()

Unnamed: 0_level_0,PC 1,PC 2,PC 3,class
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,-0.116098,1.285334,-0.519098,3
404,-0.100852,1.283294,-0.519748,3
1337,0.492302,1.854651,-0.51697,3
BTC,-0.197199,-1.509533,0.152186,1
LTC,-0.288493,-0.950524,-0.041919,1


In [188]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
clustered_df = pd.concat([four_clusters, df_coinname, df_crypto], axis=1)

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

(696, 9)


Unnamed: 0_level_0,PC 1,PC 2,PC 3,class,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
42,-0.116098,1.285334,-0.519098,3,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,-0.100852,1.283294,-0.519748,3,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
1337,0.492302,1.854651,-0.51697,3,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,-0.197199,-1.509533,0.152186,1,Bitcoin,SHA-256,PoW,17927180.0,21000000
LTC,-0.288493,-0.950524,-0.041919,1,Litecoin,Scrypt,PoW,63039240.0,84000000
DASH,-0.098608,1.427544,-0.416499,3,Dash,X11,PoW/PoS,9031294.0,22000000
ETC,-0.319893,-2.039286,0.327299,1,Ethereum Classic,Ethash,PoW,113359700.0,210000000
ZEC,-0.464242,-1.864841,0.276312,1,ZCash,Equihash,PoW,7383056.0,21000000
BTS,0.319486,1.163961,-0.259268,3,Bitshares,SHA-512,PoS,2741570000.0,3600570502
DGB,-0.088224,-1.812708,0.241799,1,DigiByte,Multiple,PoW,11406220000.0,21000000000


### Deliverable 4: Visualizing Cryptocurrencies Results
#### 3D-Scatter with Clusters

In [189]:
# Creating a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(
    clustered_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="class",
    symbol="class",
    width=800,
    hover_name="Algorithm",
    hover_data=["Algorithm","CoinName"]
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()


In [190]:
# Create a table with tradable cryptocurrencies.
clustered_df.hvplot.table(columns=['CoinName', 'Algorithm', 'ProofType', 'TotalCoinSupply', 'TotalCoinsMined', 'class'], sortable=True, selectable=True)

In [191]:
# Print the total number of tradable cryptocurrencies.
unique_values = clustered_df['CoinName'].value_counts()
x = unique_values.sum()

print (f"There are {x} tradable cryptocurrencies")

There are 696 tradable cryptocurrencies


In [192]:
# Scaling data to create the scatter plot with tradable cryptocurrencies.
min_max_scaler = MinMaxScaler()
X = clustered_df[['TotalCoinSupply', 'TotalCoinsMined']]
cluster_data_scaled = min_max_scaler.fit_transform(X)


In [193]:
clustered_df[['TotalCoinSupply', 'TotalCoinsMined']] = cluster_data_scaled
clustered_df.head()

Unnamed: 0_level_0,PC 1,PC 2,PC 3,class,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
42,-0.116098,1.285334,-0.519098,3,42 Coin,Scrypt,PoW/PoS,0.005942,0.0
404,-0.100852,1.283294,-0.519748,3,404Coin,Scrypt,PoW/PoS,0.007002,5.767955e-09
1337,0.492302,1.854651,-0.51697,3,EliteCoin,X13,PoW/PoS,0.035342,3.406122e-06
BTC,-0.197199,-1.509533,0.152186,1,Bitcoin,SHA-256,PoW,0.00596,2.27682e-10
LTC,-0.288493,-0.950524,-0.041919,1,Litecoin,Scrypt,PoW,0.006006,9.107294e-10


In [194]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
plot_df = clustered_df[['TotalCoinSupply', 'TotalCoinsMined', "class", "CoinName"]]
plot_df.head(10)


Unnamed: 0_level_0,TotalCoinSupply,TotalCoinsMined,class,CoinName
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,0.0,0.005942,3,42 Coin
404,5.767955e-09,0.007002,3,404Coin
1337,3.406122e-06,0.035342,3,EliteCoin
BTC,2.27682e-10,0.00596,1,Bitcoin
LTC,9.107294e-10,0.006006,1,Litecoin
DASH,2.38524e-10,0.005951,3,Dash
ETC,2.276824e-09,0.006056,1,Ethereum Classic
ZEC,2.27682e-10,0.00595,1,ZCash
BTS,3.903746e-08,0.008695,3,Bitshares
DGB,2.276825e-07,0.017395,1,DigiByte


In [195]:
# Create a hvplot.scatter plot using x="TotalCoinsMined" and y="TotalCoinSupply".
plot_df.hvplot.scatter(x="TotalCoinsMined", y="TotalCoinSupply", by="class")