# Clustering Crypto

In [331]:
# Initial imports
import requests
import json
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


### Fetching Cryptocurrency Data

In [332]:
# Use the following endpoint to fetch json data
# (CryptoCompare "coinlist" endpoint; the request itself is issued in a later cell.)
url = "https://min-api.cryptocompare.com/data/all/coinlist"

In [333]:
# Fetch the coin list and keep the per-coin records stored under the 'Data' key.
# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() reports HTTP errors here instead of as a confusing KeyError
# on 'Data' further down.
response = requests.get(url, timeout=30)
response.raise_for_status()
response_data = response.json()
data = response_data['Data']
# Peek at the fields available for one coin to decide which ones to keep.
data['42'].keys()


dict_keys(['Id', 'Url', 'ImageUrl', 'ContentCreatedOn', 'Name', 'Symbol', 'CoinName', 'FullName', 'Description', 'AssetTokenStatus', 'Algorithm', 'ProofType', 'SortOrder', 'Sponsored', 'Taxonomy', 'Rating', 'IsTrading', 'TotalCoinsMined', 'CirculatingSupply', 'BlockNumber', 'NetHashesPerSecond', 'BlockReward', 'BlockTime', 'AssetLaunchDate', 'AssetWhitepaperUrl', 'AssetWebsiteUrl', 'MaxSupply', 'MktCapPenalty', 'IsUsedInDefi', 'IsUsedInNft', 'PlatformType', 'AlgorithmType', 'Difficulty'])

In [334]:
# Build one row per coin containing only the fields used later in the analysis.
# dict.get() yields None for a field the API omitted, so every row keeps all seven
# positions and a missing value can never shift into the wrong column (the previous
# bare `except` appended a single 0 to a partially filled row and hid any other error).
fields = ['Symbol','CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','CirculatingSupply']
data_list = [[coin.get(field) for field in fields] for coin in data.values()]

# Preview the first few rows.
data_list[0:4]

[['42', '42 Coin', 'Scrypt', True, 'PoW/PoS', 41.9999516, 41.9999516],
 ['300', '300 token', 'N/A', True, 'N/A', 300, 0],
 ['365', '365Coin', 'X11', True, 'PoW/PoS', 0, 0],
 ['404', '404Coin', 'Scrypt', True, 'PoW/PoS', 0, 0]]

In [335]:
# Load the extracted rows into a DataFrame with one column per selected field.
# NOTE(review): the seventh field pulled from the API is 'CirculatingSupply' but it is
# labelled 'TotalCoinSupply' here — confirm this mapping is intended.
data_df = pd.DataFrame(
    data=data_list,
    columns=[
        'Symbol', 'CoinName', 'Algorithm', 'IsTrading',
        'ProofType', 'TotalCoinsMined', 'TotalCoinSupply',
    ],
)
data_df.head()

Unnamed: 0,Symbol,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.999952,41.999952
1,300,300 token,,True,,300.0,0.0
2,365,365Coin,X11,True,PoW/PoS,0.0,0.0
3,404,404Coin,Scrypt,True,PoW/PoS,0.0,0.0
4,433,433 Token,,False,,0.0,


In [336]:
# Alternative data source: load the provided CSV file instead of the API response.
# (Suggested location in the original hint: Path("Resources/crypto_data.csv").)
# This rebinds data_df, replacing the frame built from the API data.
data_df = pd.read_csv(filepath_or_buffer="crypto_data.csv")
data_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


### Data Preprocessing

In [337]:
# Restrict the frame to the columns the analysis needs.
data_df = data_df[
    [
        'CoinName',
        'Algorithm',
        'IsTrading',
        'ProofType',
        'TotalCoinsMined',
        'TotalCoinSupply',
    ]
]
data_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.0,0


In [338]:
# Keep only the coins flagged as currently trading.
data_df = data_df[data_df['IsTrading'].eq(True)]
data_df.shape

(1144, 6)

In [339]:
# Keep only cryptocurrencies with a known algorithm.
# pandas.read_csv parses the text 'N/A' as NaN by default, so comparing against the
# string alone removes nothing when the data comes from the CSV (the shape stayed at
# (1144, 6)). Check for missing values as well as the literal 'N/A' string, which is
# what the API-sourced rows contain.
has_algorithm = data_df['Algorithm'].notna() & data_df['Algorithm'].ne('N/A')
data_df = data_df[has_algorithm]
data_df.shape

(1144, 6)

In [340]:
# The "IsTrading" column is no longer needed; drop it.
data_df = data_df.drop('IsTrading', axis=1)
data_df.shape

(1144, 5)

In [341]:
# Discard every row that has a missing value in any column.
data_df = data_df.dropna()
data_df.shape

(685, 5)

In [342]:
# Keep only coins with a strictly positive number of mined coins.
data_df = data_df[data_df['TotalCoinsMined'].gt(0)]
data_df.shape

(532, 5)

In [343]:
# Drop rows where there are 'N/A' text values.
# Indexing a DataFrame with an element-wise boolean mask only replaces the matching
# cells with NaN and keeps every row. Reduce the mask to one flag per row so that
# rows containing 'N/A' are actually removed.
rows_with_na_text = data_df.isin(['N/A']).any(axis=1)
data_df = data_df[~rows_with_na_text]
data_df.shape

(532, 5)

In [344]:
# Keep the coin names in their own single-column DataFrame before the column is
# removed from data_df.
CoinName_df = data_df.loc[:, ['CoinName']]
CoinName_df


Unnamed: 0,CoinName
0,42 Coin
2,404Coin
5,EliteCoin
7,Bitcoin
8,Ethereum
...,...
1238,ZEPHYR
1242,Gapcoin
1245,Beldex
1246,Horizen


In [345]:
# 'CoinName' is not used by the clustering algorithm, so remove it from data_df.
data_df = data_df.drop(columns='CoinName')

In [346]:
# Pre-processing for one-hot encoding: keep the n most common values of a categorical
# column and treat the remaining (least common) values as 'Other'.
def dummy_with_other(df,Cvar,n):
    """Return the values of ``df[Cvar]`` with rare categories replaced by ``'Other'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the categorical column.
    Cvar : str
        Name of the categorical column.
    n : int
        Number of most frequent values to keep as they are.

    Returns
    -------
    list
        One entry per row of ``df``, in row order: the row's own value when it is
        among the ``n`` most frequent values of the column, otherwise ``'Other'``.
    """
    # Compute the set of kept values once; the previous version rebuilt it for every
    # row and used a local named `list`, which shadowed the builtin.
    top_values = set(df[Cvar].value_counts().nlargest(n).index)
    return [value if value in top_values else 'Other' for value in df[Cvar]]

In [350]:
# Add 'Algorithm2': the 10 most common algorithms, with every other value set to 'Other'.
data_df = data_df.assign(Algorithm2=dummy_with_other(data_df, 'Algorithm', 10))
data_df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,Algorithm2
0,Scrypt,PoW/PoS,4.199995e+01,42,Scrypt
2,Scrypt,PoW/PoS,1.055185e+09,532000000,Scrypt
5,X13,PoW/PoS,2.927942e+10,314159265359,X13
7,SHA-256,PoW,1.792718e+07,21000000,SHA-256
8,Ethash,PoW,1.076842e+08,0,Other
...,...,...,...,...,...
1238,SHA-256,DPoS,2.000000e+09,2000000000,SHA-256
1242,Scrypt,PoW/PoS,1.493105e+07,250000000,Scrypt
1245,CryptoNight,PoW,9.802226e+08,1400222610,CryptoNight
1246,Equihash,PoW,7.296538e+06,21000000,Equihash


In [351]:
# Inspect the ten most frequent ProofType values together with their counts.
data_df['ProofType'].value_counts().nlargest(10)

PoW               237
PoW/PoS           176
PoS                86
DPoS                9
PoC                 3
PoS/PoW             2
POBh                1
LPoS                1
Proof of Trust      1
Pos                 1
Name: ProofType, dtype: int64

In [353]:
# Add 'ProofType2': the 3 most common proof types, with every other value set to 'Other'.
data_df = data_df.assign(ProofType2=dummy_with_other(data_df, 'ProofType', 3))

In [355]:
# Feature table: drop the original categorical columns and keep their reduced versions.
data_df_t = data_df.drop(['Algorithm', 'ProofType'], axis=1)

In [357]:
# Display the feature table.
data_df_t

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm2,ProofType2
0,4.199995e+01,42,Scrypt,PoW/PoS
2,1.055185e+09,532000000,Scrypt,PoW/PoS
5,2.927942e+10,314159265359,X13,PoW/PoS
7,1.792718e+07,21000000,SHA-256,PoW
8,1.076842e+08,0,Other,PoW
...,...,...,...,...
1238,2.000000e+09,2000000000,SHA-256,Other
1242,1.493105e+07,250000000,Scrypt,PoW/PoS
1245,9.802226e+08,1400222610,CryptoNight,PoW
1246,7.296538e+06,21000000,Equihash,PoW


In [358]:
# One-hot encode the two reduced categorical columns; the first level of each is dropped.
# With `columns` given and no explicit `prefix`, get_dummies uses the column names
# as prefixes.
data_df_t = pd.get_dummies(
    data_df_t,
    columns=['Algorithm2', 'ProofType2'],
    drop_first=True,
)
data_df_t.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm2_Equihash,Algorithm2_NIST5,Algorithm2_NeoScrypt,Algorithm2_Other,Algorithm2_PoS,Algorithm2_Quark,Algorithm2_SHA-256,Algorithm2_Scrypt,Algorithm2_X11,Algorithm2_X13,ProofType2_PoS,ProofType2_PoW,ProofType2_PoW/PoS
0,41.99995,42,0,0,0,0,0,0,0,1,0,0,0,0,1
2,1055185000.0,532000000,0,0,0,0,0,0,0,1,0,0,0,0,1
5,29279420000.0,314159265359,0,0,0,0,0,0,0,0,0,1,0,0,1
7,17927180.0,21000000,0,0,0,0,0,0,1,0,0,0,0,1,0
8,107684200.0,0,0,0,0,1,0,0,0,0,0,0,0,1,0


In [359]:
# Standardize every feature column.
stad = StandardScaler().fit(data_df_t.values).transform(data_df_t.values)
# The scaled array is wrapped without an index, so the new frame is indexed from 0.
data_df_stz = pd.DataFrame(data=stad, columns=data_df_t.columns)
data_df_stz.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm2_Equihash,Algorithm2_NIST5,Algorithm2_NeoScrypt,Algorithm2_Other,Algorithm2_PoS,Algorithm2_Quark,Algorithm2_SHA-256,Algorithm2_Scrypt,Algorithm2_X11,Algorithm2_X13,ProofType2_PoS,ProofType2_PoW,ProofType2_PoW/PoS
0,-0.117108,-0.15287,-0.151911,-0.138409,-0.138409,-0.571562,-0.181686,-0.158266,-0.314918,1.38675,-0.3988,-0.181686,-0.439119,-0.89632,1.422226
1,-0.09397,-0.145009,-0.151911,-0.138409,-0.138409,-0.571562,-0.181686,-0.158266,-0.314918,1.38675,-0.3988,-0.181686,-0.439119,-0.89632,1.422226
2,0.524946,4.489424,-0.151911,-0.138409,-0.138409,-0.571562,-0.181686,-0.158266,-0.314918,-0.72111,-0.3988,5.504009,-0.439119,-0.89632,1.422226
3,-0.116715,-0.15256,-0.151911,-0.138409,-0.138409,-0.571562,-0.181686,-0.158266,3.175426,-0.72111,-0.3988,-0.181686,-0.439119,1.115673,-0.703123
4,-0.114747,-0.15287,-0.151911,-0.138409,-0.138409,1.749591,-0.181686,-0.158266,-0.314918,-0.72111,-0.3988,-0.181686,-0.439119,1.115673,-0.703123


### Reducing Dimensions Using PCA

In [360]:
# Fit a 3-component PCA on the standardized features, then project the same data.
pca = PCA(n_components=3).fit(data_df_stz)
data_pca = pca.transform(data_df_stz)


In [361]:
# Wrap the projected coordinates in a DataFrame, one column per principal component.
data_pca_df = pd.DataFrame(data_pca, columns=['PCA1', 'PCA2', 'PCA3'])
data_pca_df

Unnamed: 0,PCA1,PCA2,PCA3
0,-1.446542,1.189025,-0.735879
1,-1.437640,1.204600,-0.723976
2,-0.599278,3.966223,2.302521
3,1.123796,-0.742553,-0.722120
4,1.700560,-0.760207,-0.195960
...,...,...,...
527,-0.003900,-0.406838,-0.065270
528,-1.445342,1.191043,-0.734382
529,1.025517,-0.440074,-0.526040
530,1.941035,-1.164650,-1.007405


### Clustering Cryptocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [363]:
# Candidate cluster counts and the inertia measured for each of them.
k = list(range(1, 11))
inertia = []

# Fit one K-Means model per candidate k and record its inertia.
for i in k:
    km = KMeans(n_clusters=i, random_state=0).fit(data_pca_df)
    inertia.append(km.inertia_)

# Plot inertia against k as the Elbow Curve using hvPlot.
elbow = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow)
df_elbow.hvplot.line(x='k', y='inertia', xticks=k, title="Elbow Curve")

Running K-Means with `k=4`

In [364]:
# Fit a 4-cluster K-Means model on the principal components.
km4 = KMeans(n_clusters=4, random_state=1).fit(data_pca_df)
# Cluster label predicted for each row.
pred_y = km4.predict(data_pca_df)
# Copy of the cryptocurrency features with the predicted cluster attached as 'class'.
pred_df = data_df.assign(**{'class': pred_y})

In [388]:
# Remove the helper columns that were only needed to build the dummy variables.
# DataFrame.drop returns a new frame, so the result has to be assigned (the previous
# call discarded it and dropped nothing). errors='ignore' keeps the cell re-runnable
# once the columns are already gone.
pred_df = pred_df.drop(columns=['Algorithm2', 'ProofType2'], errors='ignore')

# Put the coin names back as the first column. The Series is aligned on the row index.
# Guarding on the column's presence keeps the cell idempotent: the previous
# "move the last column to the front" step rotated the columns again on every re-run.
if 'CoinName' not in pred_df.columns:
    pred_df.insert(0, 'CoinName', CoinName_df['CoinName'])


### Visualizing Results

#### Scatter Plot with Tradable Cryptocurrencies

In [373]:
# Rescale the two supply columns with a MinMaxScaler so they can share one scatter plot.
scaler = MinMaxScaler()
scaler.fit(pred_df[['TotalCoinsMined', 'TotalCoinSupply']])
pred_df[['TotalCoinsMined', 'TotalCoinSupply']] = scaler.transform(
    pred_df[['TotalCoinsMined', 'TotalCoinSupply']]
)

In [374]:
# Scatter plot of mined coins against coin supply, grouped by predicted cluster.
pred_df.hvplot.scatter(
    by="class",
    hover_cols=["class"],
    x="TotalCoinsMined",
    y="TotalCoinSupply",
)

#### Table of Tradable Cryptocurrencies

In [389]:
# Table with tradable cryptos
# Renders pred_df as an hvPlot table.
pred_df.hvplot.table()

In [393]:
# Report how many tradable cryptocurrencies remain in the final table.
print(pred_df.shape[0])

532
