# Data Preprocessing

In [428]:
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np
import holoviews as hv
from holoviews import opts


In [383]:
#load data
file_path="Resources/crypto_data.csv"
crypto_df=pd.read_csv(file_path)
crypto_df.tail()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
1247,XBC,BitcoinPlus,Scrypt,True,PoS,128327.0,1000000
1248,DVTC,DivotyCoin,Scrypt,False,PoW/PoS,21491210.0,100000000
1249,GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
1250,OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000
1251,PUNK,SteamPunk,PoS,False,PoS,,40000000


In [384]:
#crypto_df.columns

#Remove all cryptocurrencies that aren’t trading.
crypto_df=crypto_df[crypto_df.IsTrading != 'True']

#Remove all cryptocurrencies that don’t have an algorithm defined.
#All algorithms are defined

#Remove the IsTrading column.
crypto_df=crypto_df.drop(["IsTrading"], axis=1)

#Remove all cryptocurrencies with at least one null value.
crypto_df = crypto_df.dropna()

#Remove all cryptocurrencies without coins mined.
crypto_df[crypto_df.TotalCoinsMined == 0.0]
crypto_df = crypto_df.reset_index(drop=True)

crypto_df.tail()


Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
739,GAP,Gapcoin,Scrypt,PoW/PoS,14931050.0,250000000
740,BDX,Beldex,CryptoNight,PoW,980222600.0,1400222610
741,ZEN,Horizen,Equihash,PoW,7296538.0,21000000
742,XBC,BitcoinPlus,Scrypt,PoS,128327.0,1000000
743,DVTC,DivotyCoin,Scrypt,PoW/PoS,21491210.0,100000000


In [385]:
#Store the names of all cryptocurrencies on a DataFramed named coins_name, and use the crypto_df.index as the index for this new DataFrame
coins_name = pd.DataFrame([crypto_df.CoinName]).transpose().set_index(crypto_df.index)
coins_name


Unnamed: 0,CoinName
0,42 Coin
1,404Coin
2,808
3,EliteCoin
4,Bitcoin
...,...
739,Gapcoin
740,Beldex
741,Horizen
742,BitcoinPlus


In [386]:
#Remove the CoinName column
crypto_df=crypto_df.drop(["CoinName"], axis=1)
#change TotalCoinSupply to int64
crypto_df["TotalCoinSupply"] = pd.to_numeric(crypto_df["TotalCoinSupply"], errors="coerce").fillna(0).astype("int64")
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,Scrypt,PoW/PoS,41.99995,42
1,404,Scrypt,PoW/PoS,1055185000.0,532000000
2,808,SHA-256,PoW/PoS,0.0,0
3,1337,X13,PoW/PoS,29279420000.0,314159265359
4,BTC,SHA-256,PoW,17927180.0,21000000


In [387]:
#Create dummies variables for all of the text features, and store the resulting data on a DataFrame named X.
X=pd.get_dummies(crypto_df, columns=['Algorithm', 'ProofType', 'Unnamed: 0'])
X.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,...,Unnamed: 0_ZEST,Unnamed: 0_ZET,Unnamed: 0_ZILLA,Unnamed: 0_ZND,Unnamed: 0_ZNE,Unnamed: 0_ZNY,Unnamed: 0_ZOI,Unnamed: 0_ZSE,Unnamed: 0_ZUR,Unnamed: 0_ZYD
0,41.99995,42,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [388]:
#Use the StandardScaler from sklearn (Links to an external site.) to standardize all of the data from the X DataFrame
X_scaled=StandardScaler().fit_transform(X)
print(X_scaled[0:5])

[[-0.10047714 -0.03668886 -0.03668644 ... -0.03668644 -0.03668644
  -0.03668644]
 [-0.07317373 -0.0366887  -0.03668644 ... -0.03668644 -0.03668644
  -0.03668644]
 [-0.10047714 -0.03668886 -0.03668644 ... -0.03668644 -0.03668644
  -0.03668644]
 [ 0.65714187 -0.03659589 -0.03668644 ... -0.03668644 -0.03668644
  -0.03668644]
 [-0.10001326 -0.03668885 -0.03668644 ... -0.03668644 -0.03668644
  -0.03668644]]


# Reducing Data Dimensions Using PCA

In [389]:
#reduce the dimensions of the X DataFrame down to three principal components
# Initialize PCA model
pca = PCA(n_components=3)
# Get two principal components for the iris data.
X_pca = pca.fit_transform(X_scaled)

In [390]:
#create a DataFrame named “pcs_df” that includes the following columns: PC 1, PC 2, and PC 3
pcs_df=pd.DataFrame(
    data= X_pca, columns=['PC 1', 'PC 2', 'PC 3']
    
)
pcs_df.head()

Unnamed: 0,PC 1,PC 2,PC 3
0,0.149778,1.217415,-0.30933
1,0.167217,1.209191,-0.308477
2,0.204538,0.851979,-0.129179
3,1.033963,2.040189,-0.329977
4,-0.589299,-1.381138,0.108864


In [391]:
#add crypto_df index to pcs_df
pcs_df.reindex(coins_name)


Unnamed: 0,PC 1,PC 2,PC 3
"(42 Coin,)",,,
"(404Coin,)",,,
"(808,)",,,
"(EliteCoin,)",,,
"(Bitcoin,)",,,
...,...,...,...
"(Gapcoin,)",,,
"(Beldex,)",,,
"(Horizen,)",,,
"(BitcoinPlus,)",,,


# Clustering Cryptocurrencies Using K-means

In [399]:
#Create an elbow curve to find the best value for K, and use the pcs_df DataFrame

inertia = []
k = list(range(1, 11))
# Calculate the inertia for the range of K values
for i in k:
   km = KMeans(n_clusters=i, random_state=0)
   km.fit(pcs_df)
   inertia.append(km.inertia_)
#create the elbow curve using hvplot
elbow_data= {'k':k, 'inertia':inertia}
df_elbow=pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x='k', y='inertia', xticks=k, title='Elbow Curve')



In [447]:
def get_clusters(k, data):  
# Create a copy of the DataFrame   
    data = pcs_df.copy()      
# Initialize the K-Means model   
    model = KMeans(n_clusters=k, random_state=0)   
# Fit the model   
    model.fit(data)  
# Predict clusters   
    predictions = model.predict(data)   
# Create return DataFrame with predicted clusters   
    data["class"] = model.labels_   
    return data


In [448]:
three_clusters=get_clusters(3, pcs_df)
three_clusters.head()

Unnamed: 0,PC 1,PC 2,PC 3,class
0,0.149778,1.217415,-0.30933,1
1,0.167217,1.209191,-0.308477,1
2,0.204538,0.851979,-0.129179,1
3,1.033963,2.040189,-0.329977,1
4,-0.589299,-1.381138,0.108864,0


In [449]:
#create new dataframe
frames=[crypto_df, three_clusters, coins_name]
clustered_df=pd.concat(frames)

clustered_df.head()

Unnamed: 0.1,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,class,CoinName
0,42,Scrypt,PoW/PoS,41.99995,42.0,,,,,
1,404,Scrypt,PoW/PoS,1055185000.0,532000000.0,,,,,
2,808,SHA-256,PoW/PoS,0.0,0.0,,,,,
3,1337,X13,PoW/PoS,29279420000.0,314159300000.0,,,,,
4,BTC,SHA-256,PoW,17927180.0,21000000.0,,,,,


# Visualizing Results

In [450]:
# Plot the 3D-scatter 
fig = px.scatter_3d(
    clustered_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    #color="class",
    #symbol="class",
    hover_name="CoinName",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [453]:
#create a data table with all the current tradable cryptocurrencies
clustered_df.hvplot.table(columns=['CoinName', 'Algorithm', 'ProofType', 'TotalCoinSupply', 'TotalCoinsMined', 'class'], width=400)


In [438]:
#Create a scatter plot
clustered_df.hvplot.scatter(x='TotalCoinsMined', y='TotalCoinSupply', hover_cols=["CoinName"])