## Cryptocurrency Unsupervised Learning

In [250]:
# Import Dependencies
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import hvplot.pandas

In [251]:
# Loading the dataset
file = "Resources/crypto_data.csv"
crypto_df = pd.read_csv(file)
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


#### Data Preprocessing Tasks

In [252]:
crypto_df.count()

Unnamed: 0         1252
CoinName           1252
Algorithm          1252
IsTrading          1252
ProofType          1252
TotalCoinsMined     744
TotalCoinSupply    1252
dtype: int64

In [253]:
# crypto_df.set_index('Unnamed: 0', inplace=True)
# crypto_df.index.name=None
# # crypto_df.reset_index(drop=True, inplace=True)
# crypto_df.head()

In [254]:
# Removing all cryptocurrencies that are not trading
crypto_df = crypto_df[crypto_df.IsTrading == True]
crypto_df.count()

Unnamed: 0         1144
CoinName           1144
Algorithm          1144
IsTrading          1144
ProofType          1144
TotalCoinsMined     685
TotalCoinSupply    1144
dtype: int64

In [255]:
# Removing all cryptocurrencies that have no defined algorithm
crypto_df["Algorithm"].isnull().count()
# There are all defined
# But the following syntax will remove rows with non-null values in Algorithm column
# crypto_df = crypto_df[crypto_df.Algorithm.notnull()]

1144

In [256]:
# Removing the IsTrading column
crypto_df.drop('IsTrading', axis=1, inplace=True)

In [257]:
# Removing all cryptocurrencies with at least one null value
crypto_df = crypto_df.dropna()

In [258]:
# Removing all cryptocurrencies without coins mined
crypto_df= crypto_df[crypto_df.TotalCoinsMined != 0]
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,PoW,107684200.0,0


In [259]:
# Creating a DataFrame for all the names of cryptocurrencies
coins_name = pd.DataFrame(crypto_df, columns = ['Unnamed: 0','CoinName'])
# pcs_df.set_index('Unnamed: 0', inplace=True)
coins_name.set_index('Unnamed: 0', inplace=True)
coins_name.index.name=None
# coins_name = crypto_df.filter(['Unnamed: 0','CoinName'], axis=1)
# coins_name.set_index('Unnamed: 0')
coins_name.head()

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum


In [260]:
# Removing the CoinName column in the crypto_df
# drop_coin = crypto_df.drop('CoinName', axis=1)
crypto_df = pd.DataFrame(crypto_df, columns = ['Unnamed: 0','Algorithm','ProofType',
                                               'TotalCoinsMined', 'TotalCoinSupply'])
crypto_df.set_index('Unnamed: 0', inplace=True)
crypto_df.index.name=None
crypto_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0


In [261]:
# Creating dummy variables for all of the text features
X = pd.get_dummies(crypto_df, columns=["Algorithm", "ProofType"])
X.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [262]:
# Standardizing Data from X dataframe
crypto_scaled = StandardScaler().fit_transform(X)
print(crypto_scaled[0:101])


[[-0.11674788 -0.15286468 -0.0433555  ... -0.0433555  -0.0433555
  -0.0433555 ]
 [-0.09358885 -0.14499604 -0.0433555  ... -0.0433555  -0.0433555
  -0.0433555 ]
 [ 0.52587231  4.4937636  -0.0433555  ... -0.0433555  -0.0433555
  -0.0433555 ]
 ...
 [ 3.92312182  2.57553368 -0.0433555  ... -0.0433555  -0.0433555
  -0.0433555 ]
 [ 0.03483681 -0.00495791 -0.0433555  ... -0.0433555  -0.0433555
  -0.0433555 ]
 [-0.0769557  -0.12093438 -0.0433555  ... -0.0433555  -0.0433555
  -0.0433555 ]]


### Reducing Data Dimensions Using PCA

In [263]:
from sklearn.decomposition import PCA

# Reducing to three principal components
pca = PCA(n_components=3)

In [264]:
# Applying Dimensionality Reduction
crypto_pca = pca.fit_transform(crypto_scaled)

In [265]:
# Transforming PCA data to a DataFrame
pcs_df = pd.DataFrame(
    data=crypto_pca, columns=["PC 1", "PC 2", "PC 3"]
)
pcs_df.set_index(X.index, inplace=True)
pcs_df.index.name=None
pcs_df.head(10)

Unnamed: 0,PC 1,PC 2,PC 3
42,-0.335767,0.982322,-0.581368
404,-0.319085,0.9823,-0.581721
1337,2.309817,1.626677,-0.713184
BTC,-0.141259,-1.339585,0.116826
ETH,-0.149832,-1.993421,0.423413
LTC,-0.167522,-1.147759,0.017639
DASH,-0.388745,1.25766,-0.524275
XMR,-0.152339,-2.167252,0.528433
ETC,-0.148272,-1.993535,0.423409
ZEC,-0.150117,-2.016246,0.380269


### Clustering Using K-means

In [266]:
# Attaining the Explained Variance Ratio
pca.explained_variance_ratio_

array([0.02737284, 0.02094369, 0.02009693])

### Elbow Curve

In [267]:
# Finding the best value for K
inertia = []
k = list(range(1, 11))

# Calculating the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)
    
# Creating Elbow Curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

### Function to Determine Best Value for K

In [268]:
def test_k_value(k, data):
    data=data.copy()
    model=KMeans(n_clusters=k, random_state=0)
    
    model.fit(data)
    predictions=model.predict(data)
    
    data["class"]=model.labels_
    
    return data

#### Testing K=4

In [269]:
four_clusters=test_k_value(4, pcs_df)
four_clusters.head()

Unnamed: 0,PC 1,PC 2,PC 3,class
42,-0.335767,0.982322,-0.581368,0
404,-0.319085,0.9823,-0.581721,0
1337,2.309817,1.626677,-0.713184,0
BTC,-0.141259,-1.339585,0.116826,2
ETH,-0.149832,-1.993421,0.423413,2


In [270]:
# Plotting the clusters on 3D plot for K=4
import plotly.express as px
fig=px.scatter_3d(
    four_clusters,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="class",
    symbol="class",
    width=800,
)

fig.update_layout(legend=dict(x=0, y=1))
fig.show()

#### Testing K=5

In [271]:
five_clusters=test_k_value(5, pcs_df)
five_clusters.head()

Unnamed: 0,PC 1,PC 2,PC 3,class
42,-0.335767,0.982322,-0.581368,0
404,-0.319085,0.9823,-0.581721,0
1337,2.309817,1.626677,-0.713184,4
BTC,-0.141259,-1.339585,0.116826,2
ETH,-0.149832,-1.993421,0.423413,2


In [272]:
# Plotting the clusters on 3D plot for K=5
fig=px.scatter_3d(
    five_clusters,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="class",
    symbol="class",
    width=800,
)

fig.update_layout(legend=dict(x=0, y=1))
fig.show()

#### Testing K=6

In [273]:
six_clusters=test_k_value(6, pcs_df)
six_clusters.head()

Unnamed: 0,PC 1,PC 2,PC 3,class
42,-0.335767,0.982322,-0.581368,1
404,-0.319085,0.9823,-0.581721,1
1337,2.309817,1.626677,-0.713184,5
BTC,-0.141259,-1.339585,0.116826,0
ETH,-0.149832,-1.993421,0.423413,0


In [274]:
# Plotting the clusters on 3D plot for K=6
fig=px.scatter_3d(
    six_clusters,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="class",
    symbol="class",
    width=800,
)

fig.update_layout(legend=dict(x=0, y=1))
fig.show()

#### Based on the above 3D plots for K=4, K=5 & K=6, it seems that K=4 is the best option.

In [275]:
# Creating DataFrame with results
df1 = pd.merge(crypto_df, four_clusters, left_index=True, right_index=True)
clustered_df = pd.merge(df1, coins_name, left_index=True, right_index=True)
clustered_df =clustered_df[['Algorithm','ProofType','TotalCoinsMined','TotalCoinSupply','PC 1',
                           'PC 2', 'PC 3','CoinName','class']]
clustered_df.rename(columns = {'class':'Class'}, inplace = True) 
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
42,Scrypt,PoW/PoS,41.99995,42,-0.335767,0.982322,-0.581368,42 Coin,0
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.319085,0.9823,-0.581721,404Coin,0
1337,X13,PoW/PoS,29279420000.0,314159265359,2.309817,1.626677,-0.713184,EliteCoin,0
BTC,SHA-256,PoW,17927180.0,21000000,-0.141259,-1.339585,0.116826,Bitcoin,2
ETH,Ethash,PoW,107684200.0,0,-0.149832,-1.993421,0.423413,Ethereum,2


### Visualizing Results

In [283]:
# 3D plot with parameters hover_name="CoinName" and hover_data="Algorithm"
fig=px.scatter_3d(
    clustered_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    hover_name="CoinName",
    hover_data=["Algorithm"],
    color="Class",
    symbol="Class",
    width=800,
)

fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [None]:
# Table with All Current Tradable Cryptocurrencies
gender = lustered_df
age = [10,16,13,12]
weight = [15,18,16,10]
height = [0.8,0.6,0.7,0.8]
table = hv.Table({'CoinName':clustered_df, 'Age':age, 'Weight':weight, 'Height':height},
                 ['Gender', 'Age'],  ['Weight', 'Height'])
table.opts(height=140)