# Clustering Crypto

In [1]:
# Initial imports
import json
import pprint
from pathlib import Path

import altair as alt
import pandas as pd
import requests
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler

### Fetching Cryptocurrency Data

In [2]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"

# Bound the wait so a stalled connection cannot hang the notebook
r = requests.get(url, timeout=30)
# Stop here on an HTTP error status instead of trying to parse an error body
r.raise_for_status()

# Parse the JSON response body into a Python object
data = r.json()

In [3]:
# Build the coin table from the 'Data' entry of the json response.
# The frame built directly from that entry is transposed so that each coin
# becomes a row and each attribute becomes a column.
coins_as_columns = pd.DataFrame(data['Data'])
crypto_df = coins_as_columns.transpose()

# Preview a single row to check the layout
crypto_df.head(1)

Unnamed: 0,Id,Url,ImageUrl,ContentCreatedOn,Name,Symbol,CoinName,FullName,Description,AssetTokenStatus,...,BlockTime,AssetLaunchDate,MaxSupply,MktCapPenalty,PlatformType,BuiltOn,SmartContractAddress,DecimalPoints,Difficulty,IsUsedInDefi
42,4321,/coins/42/overview,/media/35650717/42.jpg,1427211129,42,42,42 Coin,42 Coin (42),Everything about 42 coin is 42 - apart from th...,,...,0,0000-00-00,0,0,,,,,,


In [4]:
# Alternatively, use the provided csv file:
# (disabled: while these lines stay commented out, crypto_df comes from the
#  API response loaded above)
# NOTE(review): the preprocessing below selects a 'MaxSupply' column - confirm
# the csv provides it before enabling this path.
# file_path = Path("Resources/crypto_data.csv")

# Create a DataFrame
# crypto_df = pd.read_csv(file_path)
# crypto_df.head()

### Data Preprocessing

In [5]:
# Keep only the columns used by the rest of the notebook, in this order.
# The supply column selected here is 'MaxSupply'.
wanted_columns = [
    'CoinName',
    'Algorithm',
    'IsTrading',
    'ProofType',
    'TotalCoinsMined',
    'MaxSupply',
]
crypto_df = crypto_df.loc[:, wanted_columns]

crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,0.0,0.0
300,300 token,,True,,300.0,300.0
365,365Coin,X11,True,PoW/PoS,0.0,0.0
404,404Coin,Scrypt,True,PoW/PoS,0.0,0.0
433,433 Token,,False,,,


In [6]:
# Keep only cryptocurrencies that are trading.
# A row is removed only when IsTrading compares equal to False, so rows with
# a missing IsTrading value are kept at this step.
# (crypto_df['IsTrading'].value_counts() gives the count of False values;
#  the original note recorded 1094.)
not_trading = crypto_df['IsTrading'] == False
crypto_df = crypto_df.loc[~not_trading].copy()

In [7]:
# Keep only cryptocurrencies with a working algorithm:
# rows whose Algorithm is the text 'N/A' are removed.
has_algorithm = crypto_df['Algorithm'] != 'N/A'
crypto_df = crypto_df.loc[has_algorithm].copy()

In [8]:
# Remove the "IsTrading" column (the trading filter has already been applied)
crypto_df = crypto_df.drop(columns='IsTrading')

In [9]:
# Remove every row that has a null in any column
crypto_df = crypto_df.dropna(axis=0, how='any')

In [10]:
# Remove rows with cryptocurrencies having no coins mined
# (TotalCoinsMined equal to 0)
mined_nothing = crypto_df['TotalCoinsMined'] == 0
crypto_df = crypto_df.loc[~mined_nothing].copy()

In [11]:
# Drop rows where there are 'N/A' text values:
# blank out every cell equal to 'N/A', then drop any row left with a null.
not_placeholder = crypto_df != 'N/A'
crypto_df = crypto_df.where(not_placeholder).dropna()

In [12]:
# Keep the coin names in their own one-column DataFrame (same index as
# crypto_df) before the column is dropped from crypto_df
CoinName_df = crypto_df.loc[:, ['CoinName']].copy()

In [13]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
crypto_df = crypto_df.drop(columns='CoinName')

In [14]:
# One-hot encode the text features; the first level of each feature is
# dropped (drop_first=True)
text_features = ['Algorithm', 'ProofType']
X = pd.get_dummies(crypto_df, columns=text_features, drop_first=True)

In [15]:
# Standardize data: fit the scaler on X, then transform the same X
standardizer = StandardScaler()
X_scaled = standardizer.fit(X).transform(X)

### Reducing Dimensions Using PCA

In [16]:
# Use PCA to reduce dimensions to 3 principal components.
# random_state is fixed so the components are reproducible even if
# scikit-learn's automatic solver choice selects a randomized SVD
# (the input comes from a live API, so its size can change between runs).
pca = PCA(n_components=3, random_state=0)
crypto_pca = pca.fit_transform(X_scaled)

In [17]:
# Wrap the principal components in a DataFrame that shares crypto_df's index
pc_labels = ['PC 1', 'PC 2', 'PC 3']
pca_df = pd.DataFrame(crypto_pca, index=crypto_df.index, columns=pc_labels)
pca_df.head()

Unnamed: 0,PC 1,PC 2,PC 3
NVC,-0.373536,1.232409,-1.69369
XCP,-0.476476,-1.625347,0.031088
NSR,-0.166336,1.103218,1.046176
MONA,-0.450968,-0.870212,-0.526856
TRI,-0.350827,2.057431,-2.196086


### Clustering Cryptocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [18]:
inertia = []
k = list(range(1, 11))

# Fit on the principal-component columns only. pca_df gets a 'class' column
# once clusters are assigned, and re-running this cell must not treat that
# label as a feature.
pc_columns = ['PC 1', 'PC 2', 'PC 3']

# Calculate the inertia for the range of k values
for i in k:
    # n_init is set explicitly because its default differs between
    # scikit-learn versions
    km = KMeans(n_clusters=i, n_init=10, random_state=6)
    km.fit(pca_df[pc_columns])
    inertia.append(km.inertia_)

# Create the Elbow Curve using Altair
elbow_data = {'k': k, 'inertia': inertia}
elbow_df = pd.DataFrame(elbow_data)
alt.Chart(elbow_df).mark_line().encode(x='k', y='inertia')

Running K-Means with `k=4`

In [19]:
# Cluster on the principal-component columns only, so that re-running this
# cell does not feed a previously assigned 'class' label back into the model.
pc_columns = ['PC 1', 'PC 2', 'PC 3']

# Initialize the K-Means model
# (n_init is set explicitly because its default differs between
#  scikit-learn versions)
model = KMeans(n_clusters=4, n_init=10, random_state=0)

# Fit the model and predict the cluster of every row in one step
predictions = model.fit_predict(pca_df[pc_columns])
pca_df['class'] = predictions

# Create a new DataFrame including predicted clusters and cryptocurrencies features
clustered_df = pd.concat([crypto_df, CoinName_df, pca_df], axis=1)
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,MaxSupply,CoinName,PC 1,PC 2,PC 3,class
NVC,Scrypt,PoW/PoS,3079310.0,-1,NovaCoin,-0.373536,1.232409,-1.69369,3
XCP,SHA-256,PoW,2615220.0,-1,CounterParty,-0.476476,-1.625347,0.031088,1
NSR,PoS,PoS,5849670000.0,0,NuShares,-0.166336,1.103218,1.046176,0
MONA,Scrypt,PoW,80001600.0,-1,MonaCoin,-0.450968,-0.870212,-0.526856,1
TRI,X13,PoW/PoS,166244.0,0,Triangles Coin,-0.350827,2.057431,-2.196086,3


### Visualizing Results

#### 2D Scatter of the First Two Principal Components with Clusters

In [24]:
# Scatter of the PCA data coloured by cluster.
# The chart is 2D: it plots PC 1 against PC 2 (PC 3 is not drawn).
# 'class:N' marks the cluster id as nominal so each cluster gets its own
# colour; without the type, the integer labels are treated as a quantity.
alt.Chart(clustered_df).mark_circle(size=60).encode(
    x='PC 1',
    y='PC 2',
    color='class:N',
    tooltip=['CoinName', 'Algorithm', 'TotalCoinsMined', 'MaxSupply']
).interactive()

#### Table of Tradable Cryptocurrencies

In [21]:
# Table with tradable cryptos: the clustered frame without the
# principal-component columns
pc_labels = ['PC 1', 'PC 2', 'PC 3']
clustered_df_no_pc = clustered_df.drop(pc_labels, axis='columns')
display(clustered_df_no_pc)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,MaxSupply,CoinName,class
NVC,Scrypt,PoW/PoS,3.07931e+06,-1,NovaCoin,3
XCP,SHA-256,PoW,2.61522e+06,-1,CounterParty,1
NSR,PoS,PoS,5.84967e+09,0,NuShares,0
MONA,Scrypt,PoW,8.00016e+07,-1,MonaCoin,1
TRI,X13,PoW/PoS,166244,0,Triangles Coin,3
...,...,...,...,...,...,...
BTC,SHA-256,PoW,18575475,2.1e+07,Bitcoin,1
ETH,Ethash,PoW,1.13899e+08,-1,Ethereum,1
WAVES,Leased POS,LPoS,103838704,-1,Waves,0
ADA,Ouroboros,PoS,3.17827e+10,45000000000,Cardano,0


In [22]:
# Print the total number of tradable cryptocurrencies
# (one row of clustered_df_no_pc per cryptocurrency)
tradable_count = len(clustered_df_no_pc)
print(f'The total number of tradable cryptocurrenices is: {tradable_count}')

The total number of tradable cryptocurrenices is: 97


#### Scatter Plot with Tradable Cryptocurrencies

In [25]:
# Scale data to create the scatter plot: min-max scale the two supply
# columns, then rebuild a DataFrame with the original index as a column
normalize_columns = ['TotalCoinsMined', 'MaxSupply']
scaler = MinMaxScaler()

x = crypto_df[normalize_columns].to_numpy()
x_scaled = scaler.fit(x).transform(x)

df_tradable = pd.DataFrame(x_scaled, index=crypto_df.index, columns=normalize_columns)
df_tradable = df_tradable.reset_index()


In [26]:
# Plot the scatter with x="TotalCoinsMined" and y="MaxSupply"
# (both columns are the min-max scaled values held in df_tradable)
supply_points = alt.Chart(df_tradable).mark_circle(size=60)
supply_points.encode(
    x='TotalCoinsMined',
    y='MaxSupply',
)