# Clustering Crypto

In [209]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Fetching Cryptocurrency Data

In [210]:
# CryptoCompare endpoint that returns the coin list as JSON;
# the notebook reads the per-coin records from the response's 'Data' key.
url = "https://min-api.cryptocompare.com/data/all/coinlist"

In [211]:
# Fetch the coin list and build a DataFrame with one row per coin.
# The per-coin records sit under the 'Data' key of the JSON payload, keyed by
# coin symbol, so the frame is transposed to turn each symbol into a row.
# Fields used later in the notebook: CoinName, Algorithm, IsTrading, ProofType,
# TotalCoinsMined, MaxSupply.
response = requests.get(url, timeout=30)  # bound the wait so the cell cannot hang forever
response.raise_for_status()  # surface HTTP errors instead of parsing an error body
crypto = response.json()
if 'Data' not in crypto:
    # Without the 'Data' key the lookup below would fail with a bare KeyError.
    raise ValueError("Coin list response has no 'Data' key")

crypto_df = pd.DataFrame(crypto['Data']).T
crypto_df.head()


Unnamed: 0,Id,Url,ImageUrl,ContentCreatedOn,Name,Symbol,CoinName,FullName,Description,AssetTokenStatus,...,MaxSupply,MktCapPenalty,IsUsedInDefi,IsUsedInNft,PlatformType,AlgorithmType,Difficulty,BuiltOn,SmartContractAddress,DecimalPoints
42,4321,/coins/42/overview,/media/35650717/42.jpg,1427211129,42,42,42 Coin,42 Coin (42),Everything about 42 coin is 42 - apart from th...,,...,42.0,0.0,0.0,0.0,blockchain,scrypt,0.440859,,,
300,749869,/coins/300/overview,/media/27010595/300.png,1517935016,300,300,300 token,300 token (300),300 token is an ERC20 token. This Token was cr...,,...,300.0,0.0,0.0,0.0,token,,,ETH,0xaec98a708810414878c3bcdf46aad31ded4a4557,18.0
365,33639,/coins/365/overview,/media/352070/365.png,1480032918,365,365,365Coin,365Coin (365),365Coin is a Proof of Work and Proof of Stake ...,,...,-1.0,0.0,0.0,0.0,blockchain,,,,,
404,21227,/coins/404/overview,/media/35650851/404-300x300.jpg,1466100361,404,404,404Coin,404Coin (404),404 is a PoW/PoS hybrid cryptocurrency that al...,,...,-1.0,0.0,0.0,0.0,blockchain,,,,,
433,926547,/coins/433/overview,/media/34836095/433.png,1541597321,433,433,433 Token,433 Token (433),433 Token is a decentralised soccer platform t...,Finished,...,,,,,,,,,,


In [212]:
# Alternative data source: load the provided CSV file instead of calling the API.
# The lines below are intentionally left commented out.
# NOTE(review): Path is not imported in this notebook, and file_path is not the
# path passed to read_csv - reconcile both before enabling this branch.
# file_path = Path("Resources/crypto_data.csv")

# crypto_df = pd.read_csv('crypto_data.csv', index_col="CoinName")
# crypto_df = crypto_df.drop(columns=['Unnamed: 0'], axis=1)

# Display the frame currently bound to crypto_df.
crypto_df

Unnamed: 0,Id,Url,ImageUrl,ContentCreatedOn,Name,Symbol,CoinName,FullName,Description,AssetTokenStatus,...,MaxSupply,MktCapPenalty,IsUsedInDefi,IsUsedInNft,PlatformType,AlgorithmType,Difficulty,BuiltOn,SmartContractAddress,DecimalPoints
42,4321,/coins/42/overview,/media/35650717/42.jpg,1427211129,42,42,42 Coin,42 Coin (42),Everything about 42 coin is 42 - apart from th...,,...,42,0,0,0,blockchain,scrypt,0.440859,,,
300,749869,/coins/300/overview,/media/27010595/300.png,1517935016,300,300,300 token,300 token (300),300 token is an ERC20 token. This Token was cr...,,...,300,0,0,0,token,,,ETH,0xaec98a708810414878c3bcdf46aad31ded4a4557,18
365,33639,/coins/365/overview,/media/352070/365.png,1480032918,365,365,365Coin,365Coin (365),365Coin is a Proof of Work and Proof of Stake ...,,...,-1,0,0,0,blockchain,,,,,
404,21227,/coins/404/overview,/media/35650851/404-300x300.jpg,1466100361,404,404,404Coin,404Coin (404),404 is a PoW/PoS hybrid cryptocurrency that al...,,...,-1,0,0,0,blockchain,,,,,
433,926547,/coins/433/overview,/media/34836095/433.png,1541597321,433,433,433 Token,433 Token (433),433 Token is a decentralised soccer platform t...,Finished,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
COMM,4422,/coins/comm/overview,/media/19661/comm.png,1427711603,COMM,COMM,Community Coin,Community Coin (COMM),Community Coin is a fully premined coin with a...,,...,,,,,,,,,,
CRYPT,4427,/coins/crypt/overview,/media/19664/crypt.png,1427711654,CRYPT,CRYPT,CryptCoin,CryptCoin (CRYPT),CryptCoin (CRYPT) is an X11 altcoin - a third ...,,...,0,0,0,0,,,,,,
CTO,6117,/coins/cto/overview,/media/19808/mrs.png,1433498561,CTO,CTO,Crypto,Crypto (CTO),Crypto is a decentralized cryptocurrency desig...,,...,,,,,,,,,,
CTX,941224,/coins/ctx/overview,/media/37747472/ctx1.png,1620380827,CTX,CTX,Cryptex,Cryptex (CTX),"Cryptex is focused on building innovative, ope...",,...,-1,0,0,0,token,,,ETH,0x321c2fe4446c7c963dc41dd58879af648838f98d,18


### Data Preprocessing

In [213]:
# Keep only the columns needed for clustering
# (the supply figure is taken from the 'MaxSupply' column).
keep_columns = ['CoinName', 'Algorithm', 'IsTrading', 'ProofType', 'TotalCoinsMined', 'MaxSupply']
crypto_df = crypto_df.loc[:, keep_columns]

In [214]:
# Keep only the rows whose 'IsTrading' flag is True
trading_mask = crypto_df['IsTrading'].eq(True)
crypto_df = crypto_df.loc[trading_mask]

In [215]:
# Discard coins whose 'Algorithm' is the "N/A" placeholder
has_algorithm = crypto_df['Algorithm'].ne("N/A")
crypto_df = crypto_df.loc[has_algorithm]
crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.999952,42
365,365Coin,X11,True,PoW/PoS,0,-1
404,404Coin,Scrypt,True,PoW/PoS,0,-1
611,SixEleven,SHA-256,True,PoW,0,0
808,808,SHA-256,True,PoW/PoS,0,0
...,...,...,...,...,...,...
BUN,BunnyCoin,Scrypt,True,PoW,,
CHILD,ChildCoin,X11,True,PoW,,
COMM,Community Coin,Scrypt,True,PoW/PoS,,
CRYPT,CryptCoin,X11,True,PoW,0,0


In [216]:
# 'IsTrading' has already been used as a filter, so drop that column
crypto_df = crypto_df.drop("IsTrading", axis=1)

In [217]:
# Discard every row that holds a null in any column
# (dropna's defaults are axis=0, how='any')
crypto_df = crypto_df.dropna()
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,PoW/PoS,41.999952,42
365,365Coin,X11,PoW/PoS,0,-1
404,404Coin,Scrypt,PoW/PoS,0,-1
611,SixEleven,SHA-256,PoW,0,0
808,808,SHA-256,PoW/PoS,0,0
...,...,...,...,...,...
NEO,NEO,dBFT2.0,,100000000,100000000
OXYC,Oxycoin,DPoS,DPoS,1122382283.37,-1
AUR,AUREO,BEP-20 Token,,19769313.742959,21000000
BTCR,BitCurrency,Scrypt,PoS,0,-1


In [218]:
# Keep only coins with a strictly positive 'TotalCoinsMined'
mined_mask = crypto_df['TotalCoinsMined'].gt(0)
crypto_df = crypto_df.loc[mined_mask]
crypto_df.head(20)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,PoW/PoS,41.999952,42.0
NSR,NuShares,PoS,PoS,6177926685.8373,0.0
TRI,Triangles Coin,X13,PoW/PoS,191624.022943,0.0
CMTC,CometCoin,Scrypt,PoW,872830.0,0.0
CHAT,OpenChat,Scrypt,PoW/PoS,1000000000.0,-1.0
PURA,Pura,X11,PoW,188358976.839698,-1.0
ADK,Aidos Kuneen,IMesh,PoW,25000000.0,0.0
DAPS,DAPS Coin,Dagger,PoW/PoS/PoA,62319462900.0,70000000000.0
FOIN,Foin,SHA-256,,92631000.8161,100000000.0
NVL,Nevula,NEP-5,,40000000000.0,40000000000.0


In [219]:
# Blank out every cell equal to the text 'N/A', then drop the rows that now hold a null
not_placeholder = crypto_df != "N/A"
crypto_df = crypto_df.where(not_placeholder).dropna()
crypto_df


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,PoW/PoS,41.999952,42
NSR,NuShares,PoS,PoS,6177926685.8373,0
TRI,Triangles Coin,X13,PoW/PoS,191624.022943,0
CMTC,CometCoin,Scrypt,PoW,872830,0
CHAT,OpenChat,Scrypt,PoW/PoS,1000000000,-1
...,...,...,...,...,...
RDD,Reddcoin,Scrypt,PoW/PoS,30966300704.728371,-1
NMC,Namecoin,SHA-256,PoW,18157737.5,-1
NANO,Nano,Blake2b,PoW,133248290,133248290
NAV,NavCoin,X13,PoW/PoS,73506978.423581,-1


In [220]:
# Set the coin names aside as a one-column DataFrame before 'CoinName' is dropped from crypto_df
Coin_name = crypto_df.loc[:, ['CoinName']]
Coin_name


Unnamed: 0,CoinName
42,42 Coin
NSR,NuShares
TRI,Triangles Coin
CMTC,CometCoin
CHAT,OpenChat
...,...
RDD,Reddcoin
NMC,Namecoin
NANO,Nano
NAV,NavCoin


In [221]:
# The clustering algorithm does not use 'CoinName', so remove that column
crypto_df = crypto_df.drop(labels=['CoinName'], axis='columns')
crypto_df


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,MaxSupply
42,Scrypt,PoW/PoS,41.999952,42
NSR,PoS,PoS,6177926685.8373,0
TRI,X13,PoW/PoS,191624.022943,0
CMTC,Scrypt,PoW,872830,0
CHAT,Scrypt,PoW/PoS,1000000000,-1
...,...,...,...,...
RDD,Scrypt,PoW/PoS,30966300704.728371,-1
NMC,SHA-256,PoW,18157737.5,-1
NANO,Blake2b,PoW,133248290,133248290
NAV,X13,PoW/PoS,73506978.423581,-1


In [222]:
# One-hot encode the two text features; the remaining columns are passed through as they are
text_features = ['Algorithm', 'ProofType']
dummy = pd.get_dummies(crypto_df, columns=text_features)


In [223]:
# Standardize the encoded frame: fit the scaler first, then apply it to the same data
scaler = StandardScaler().fit(dummy)
dummy_scaled = scaler.transform(dummy)

### Reducing Dimensions Using PCA

In [224]:
# Use PCA to reduce the standardized features to 3 principal components.
# random_state is pinned so that, if scikit-learn selects a randomized SVD
# solver for this input, re-running the notebook still yields the same components.
pca = PCA(n_components=3, random_state=0)
pcs_transform = pca.fit_transform(dummy_scaled)


In [225]:
# Wrap the principal components in a DataFrame that shares crypto_df's index
pc_columns = ['Feature_1', 'Feature_2', 'Feature_3']
pca_df = pd.DataFrame(data=pcs_transform, index=crypto_df.index, columns=pc_columns)
pca_df

Unnamed: 0,Feature_1,Feature_2,Feature_3
42,0.222853,-1.326063,-1.343659
NSR,0.696264,-1.169297,-0.312067
TRI,0.654795,-1.974567,-1.626911
CMTC,-0.853521,0.447471,-0.364627
CHAT,0.222857,-1.326059,-1.343659
...,...,...,...
RDD,0.223001,-1.325943,-1.343666
NMC,-1.263204,1.280477,0.162597
NANO,-1.218753,1.246341,0.226203
NAV,0.654796,-1.974567,-1.626911


### Clustering Cryptocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [226]:
# Candidate cluster counts: 1 through 10
k = list(range(1, 11))

# Fit one K-Means model per candidate k on the PCA features and record its inertia
inertia = [
    KMeans(n_clusters=n_clusters, random_state=32).fit(pca_df).inertia_
    for n_clusters in k
]

# Draw the Elbow Curve with hvPlot
elbow = dict(k=k, inertia=inertia)
elbow_df = pd.DataFrame(elbow)
elbow_df.hvplot(x="k", y="inertia")

  "KMeans is known to have a memory leak on Windows "


Running K-Means with `k=4`

In [227]:
# Initialize the K-Means model with 4 clusters
K_mugga = KMeans(n_clusters=4, random_state=22)
# Fit the model and obtain each coin's cluster label in a single pass:
# fit_predict fits on pca_df and returns the training labels (the same values
# exposed as K_mugga.labels_), so no separate predict() run over the data is needed.
predictions = K_mugga.fit_predict(pca_df)
# Combine the principal components with the cryptocurrency features and
# attach the predicted cluster of every coin
combined_df = pd.concat([pca_df, crypto_df], axis=1, sort=False)
combined_df['cluster'] = predictions
combined_df

Unnamed: 0,Feature_1,Feature_2,Feature_3,Algorithm,ProofType,TotalCoinsMined,MaxSupply,cluster
42,0.222853,-1.326063,-1.343659,Scrypt,PoW/PoS,41.999952,42,2
NSR,0.696264,-1.169297,-0.312067,PoS,PoS,6177926685.8373,0,2
TRI,0.654795,-1.974567,-1.626911,X13,PoW/PoS,191624.022943,0,2
CMTC,-0.853521,0.447471,-0.364627,Scrypt,PoW,872830,0,0
CHAT,0.222857,-1.326059,-1.343659,Scrypt,PoW/PoS,1000000000,-1,2
...,...,...,...,...,...,...,...,...
RDD,0.223001,-1.325943,-1.343666,Scrypt,PoW/PoS,30966300704.728371,-1,2
NMC,-1.263204,1.280477,0.162597,SHA-256,PoW,18157737.5,-1,0
NANO,-1.218753,1.246341,0.226203,Blake2b,PoW,133248290,133248290,0
NAV,0.654796,-1.974567,-1.626911,X13,PoW/PoS,73506978.423581,-1,2


### Visualizing Results

#### Scatter Plot with Tradable Cryptocurrencies

In [228]:
# Min-max scale the two supply columns for the scatter plot, then attach the
# cluster label, coin name and algorithm of every coin (aligned on the index)
supply_columns = ['TotalCoinsMined', 'MaxSupply']
scale = MinMaxScaler()
series = scale.fit_transform(combined_df[supply_columns])
series_df = pd.DataFrame(series, columns=supply_columns, index=combined_df.index).assign(
    cluster=combined_df['cluster'],
    CoinName=Coin_name['CoinName'],
    Algorithm=combined_df['Algorithm'],
)
series_df


Unnamed: 0,TotalCoinsMined,MaxSupply,cluster,CoinName,Algorithm
42,0.000000e+00,2.047619e-12,2,42 Coin,Scrypt
NSR,6.240330e-06,4.761905e-14,2,NuShares,PoS
TRI,1.935172e-10,4.761905e-14,2,Triangles Coin,X13
CMTC,8.816040e-10,4.761905e-14,0,CometCoin,Scrypt
CHAT,1.010101e-06,0.000000e+00,2,OpenChat,Scrypt
...,...,...,...,...,...
RDD,3.127909e-05,0.000000e+00,2,Reddcoin,Scrypt
NMC,1.834111e-08,0.000000e+00,0,Namecoin,SHA-256
NANO,1.345942e-07,6.345157e-06,0,Nano,Blake2b
NAV,7.424943e-08,0.000000e+00,2,NavCoin,X13


In [229]:
# Scatter of the scaled data: x="TotalCoinsMined", y="MaxSupply",
# one series per cluster, with the coin name added to the hover tooltip
series_df.hvplot.scatter(
    x="TotalCoinsMined",
    y="MaxSupply",
    by='cluster',
    hover_cols='CoinName',
)

#### Table of Tradable Cryptocurrencies

In [230]:
# Table with tradable cryptos: add the coin names back to combined_df,
# then build an hvPlot table from the columns of interest
combined_df['CoinName'] = series_df["CoinName"]
table_columns = ["CoinName", "Algorithm", "MaxSupply", "TotalCoinsMined", "cluster", "ProofType"]
tradable_table = combined_df[table_columns].hvplot.table()
# End the cell with the table so that it is actually rendered; the assignment
# above produces no output on its own.
tradable_table



In [231]:
# Print the total number of tradable cryptocurrencies
tradable_count = len(tradable_table)
print(tradable_count)

140
