# Clustering Crypto

In [None]:
#%%capture
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import hvplot.pandas
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from pathlib import Path


### Fetching Cryptocurrency Data

In [None]:
# Fetch the full coin list from the CryptoCompare public endpoint as JSON.
# The response is only inspected in this cell; the cells that follow build
# their DataFrame from the CSV file instead.
url = "https://min-api.cryptocompare.com/data/all/coinlist"
# requests has no default timeout, so an unresponsive server would otherwise
# block this cell indefinitely.
response = requests.get(url, timeout=30)
print(response.status_code)
# Fail here with a clear HTTP error on a 4xx/5xx reply instead of trying to
# decode an error page as JSON.
response.raise_for_status()
crypto_content = response.json()
print(crypto_content)
print(type(crypto_content))
# Author's note: using this technique ran into an error further down,
# hence the CSV-based load in the next cell.

In [None]:
# Load the crypto dataset from disk; the first CSV column becomes the row index
file_path = Path("Resources/crypto_data.csv")
crytpo_df = pd.read_csv(filepath_or_buffer=file_path, index_col=0)
# Missing-value count per column (evaluated but not the cell's final expression)
crytpo_df.isna().sum()
# Preview the first 25 rows
crytpo_df.head(n=25)


### Data Preprocessing

In [None]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
# These columns are already in place, so nothing is dropped in this cell;
# it only inspects the frame.
# Last 10 rows (evaluated, but not the cell's final expression)
crytpo_df.tail(10)
# (rows, columns) before any filtering
crytpo_df.shape

In [None]:
# Keep only cryptocurrencies that are trading:
# flag the rows marked as not trading, collect their index labels,
# and remove those labels from the frame in place.
is_not_trading = crytpo_df['IsTrading'] == False
no_trades = crytpo_df.loc[is_not_trading].index
crytpo_df.drop(index=no_trades, inplace=True)
# (rows, columns) after the filter
crytpo_df.shape


In [None]:
# Keep only cryptocurrencies with a working algorithm
# This cell does not filter anything: it only counts the missing values in
# each column so the 'Algorithm' column can be checked.
crytpo_df.isnull().sum()
# Author's note: there are no nulls under the algorithm column.


In [None]:
# Remove the 'IsTrading' column from the frame, in place
crytpo_df.drop(columns=['IsTrading'], inplace=True)
# Preview the first 25 rows without the column
crytpo_df.head(n=25)


In [None]:
# Count the missing values in each column
crytpo_df.isna().sum()

In [None]:
# Drop, in place, every row that has a missing value in any column
crytpo_df.dropna(axis="index", how="any", inplace=True)
# Non-null count per column after the drop
crytpo_df.count()

In [None]:
# See the new shape as a (rows, columns) tuple
crytpo_df.shape

In [None]:
# Remove rows with cryptocurrencies having no coins mined.
# Only rows whose 'TotalCoinsMined' is exactly 0 are selected; any other
# value (including negative ones) is left in place.
has_zero_mined = crytpo_df['TotalCoinsMined'] == 0
no_mining = crytpo_df.loc[has_zero_mined].index
crytpo_df.drop(index=no_mining, inplace=True)
# (rows, columns) after the filter
crytpo_df.shape


In [None]:
# See the data type of each column
crytpo_df.dtypes

In [None]:
# Convert 'TotalCoinSupply' to a float column and write it back
supply_as_float = crytpo_df['TotalCoinSupply'].astype(float)
crytpo_df['TotalCoinSupply'] = supply_as_float
# Confirm the resulting column types
crytpo_df.dtypes

In [None]:
# Drop rows where any of the text columns holds the literal string "N/A".
# NOTE(review): pandas.read_csv treats "N/A" as missing by default, so with a
# default CSV load such rows have presumably already been removed by dropna().

# Index labels of the offending rows, one lookup per text column
blank_string1 = crytpo_df[crytpo_df['CoinName'] == "N/A"].index
blank_string2 = crytpo_df[crytpo_df['Algorithm'] == "N/A"].index
blank_string3 = crytpo_df[crytpo_df['ProofType'] == "N/A"].index

# Drop the union of the three label sets in a single call. Dropping them one
# after another raises KeyError when the same row matches in two columns,
# because its label is already gone by the time the second drop runs.
na_text_labels = blank_string1.union(blank_string2, sort=False).union(blank_string3, sort=False)
crytpo_df.drop(na_text_labels, inplace=True)

# Non-null count per column after the drop
crytpo_df.count()
# Author's note: there are no "N/A" text values

In [None]:
# Keep the 'CoinName' column (a single column selected from crytpo_df) under
# its own name before it is dropped from crytpo_df
CoinName_df = crytpo_df.loc[:, "CoinName"]
# Last 10 coin names
CoinName_df.tail(n=10)

In [None]:
# 'CoinName' is not a clustering feature, so remove the column in place
crytpo_df.drop(columns='CoinName', inplace=True)
# Non-null count per remaining column
crytpo_df.count()

In [None]:
# Give the row index of crytpo_df the name 'CoinType'
crytpo_df = crytpo_df.rename_axis(index='CoinType')
crytpo_df.head(5)

In [None]:
# One-hot encode the two text features; the other columns pass through unchanged
text_features = ["Algorithm", "ProofType"]
X_df = pd.get_dummies(data=crytpo_df, columns=text_features)
X_df.head(5)

In [None]:
# Standardize every feature column of X_df
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_df)
# Show the first three scaled rows
print(X_scaled[:3])


### Reducing Dimensions Using PCA

In [None]:
# Use PCA to reduce dimensions to 3 principal components.
# With the default svd_solver="auto", scikit-learn may pick the randomized SVD
# solver depending on the shape of the data; fixing random_state keeps the
# projection (and everything clustered from it) reproducible between runs,
# in line with the seeded KMeans models used in this notebook.
pca = PCA(n_components=3, random_state=1)
X_pca = pca.fit_transform(X_scaled)

In [None]:
# Wrap the PCA output in a DataFrame with one named column per component
pc_labels = [
    "principal component 1",
    "principal component 2",
    "principal component 3",
]
pcs_df = pd.DataFrame(X_pca, columns=pc_labels)
# Non-null count per component (evaluated but not the cell's final expression)
pcs_df.count()
pcs_df.head(5)

In [None]:
# Label each PCA row with the matching crytpo_df index value and use that
# label, under the name "Coin", as the row index
pcs_df = pcs_df.assign(Coin=crytpo_df.index).set_index("Coin")
pcs_df.head(5)

### Clustering Cryptocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [None]:
# Candidate cluster counts: 1 through 10
k = list(range(1, 11))

# Fit one seeded K-Means model per candidate and record its inertia
inertia = [
    KMeans(n_clusters=n_clusters, random_state=1).fit(pcs_df).inertia_
    for n_clusters in k
]

# Create the Elbow Curve using hvPlot
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")


Running K-Means with `k=4`

In [None]:
# Initialize the K-Means model
# Based on the shape of the elbow curve, 4 clusters is where the decline in inertia recedes meaningfully
model = KMeans(n_clusters=4, random_state=1)

# Cluster on the principal components only. When this cell is re-run, pcs_df
# already carries the "class" column written below; excluding it keeps the
# previous labels from being fed back into the model as a feature.
pc_features = pcs_df.drop(columns="class", errors="ignore")

# fit_predict fits the model once and returns the training labels
# (the same values as model.labels_), so no separate predict() pass is needed.
predictions = model.fit_predict(pc_features)

# Attach the cluster label to the principal components
pcs_df["class"] = predictions

# Combine the components and labels with the original features, keeping only
# index labels present in both frames
clustered_df = pd.concat([pcs_df, crytpo_df], axis="columns", join="inner")

# Add the coin names back; column assignment from a Series aligns on the index
clustered_df['CoinName'] = CoinName_df

# Put the columns in presentation order
clustered_df = clustered_df[["Algorithm", "ProofType", "TotalCoinsMined", "TotalCoinSupply",
                             "principal component 1", "principal component 2", "principal component 3",
                             "CoinName", "class"]]
clustered_df.head()

### Visualizing Results

#### 3D-Scatter with Clusters

In [None]:
# Create a 3D-Scatter with the PCA data and the clusters.
# Axis assignment: component 3 on x, component 2 on y, component 1 on z.
axis_columns = {
    "x": "principal component 3",
    "y": "principal component 2",
    "z": "principal component 1",
}
fig = px.scatter_3d(
    clustered_df,
    title="Classifying Digital Currencies",
    hover_name="CoinName",
    hover_data=["Algorithm"],
    color="class",
    symbol="class",
    width=1000,
    **axis_columns,
)
# Place the legend at layout position x=0, y=1
legend_position = {"x": 0, "y": 1}
fig.update_layout(legend=legend_position)
fig.show()

#### Table of Tradable Cryptocurrencies

In [None]:
# Sortable, selectable table of the tradable cryptos and their cluster label
table_columns = ['CoinName', 'Algorithm', 'ProofType', 'TotalCoinSupply', 'TotalCoinsMined', 'class']
clustered_df.hvplot.table(columns=table_columns, sortable=True, selectable=True)


In [None]:
# Report how many tradable cryptocurrencies remain (non-null coin names)
coin_names = clustered_df['CoinName']
totalcoins = coin_names.count()
print(f"There are {totalcoins} crytpocurrencies in circulation")

In [None]:
# Index clustered_df by coin name so it can be concatenated with MMS_df,
# which is built in the next cell
clustered_df = clustered_df.set_index("CoinName")
clustered_df.head(5)

#### Scatter Plot with Tradable Cryptocurrencies

In [None]:
# Rescale the two coin-count columns with MinMaxScaler
scaled_output = clustered_df[['TotalCoinsMined','TotalCoinSupply']]
MMS = MinMaxScaler().fit_transform(scaled_output)

# Wrap the scaled values in a DataFrame that reuses clustered_df's index
# labels, with the index named "Coin"
MMS_df = pd.DataFrame(
    MMS,
    columns=["Scaled Coins Mined", "Scaled Coins Supply"],
    index=clustered_df.index.rename("Coin"),
)
MMS_df.head(5)

In [None]:
# Place the scaled columns next to the clustered data (index labels present
# in both frames), then name the resulting index 'CoinType'
new_clustered_df = pd.concat(
    objs=[clustered_df, MMS_df], axis=1, join="inner"
).rename_axis(index='CoinType')
new_clustered_df.head(5)

In [None]:
# Sortable, selectable hvPlot table showing raw and scaled values side by side
scaled_table_columns = ['CoinType','Scaled Coins Supply', 'TotalCoinSupply', 'Scaled Coins Mined','TotalCoinsMined', 'class']
new_clustered_df.hvplot.table(columns=scaled_table_columns, sortable=True, selectable=True)

In [None]:
# Scatter of the UNSCALED data: coins mined on x, coin supply on y,
# one series per cluster
unscaled_scatter_options = dict(
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    by="class",
    hover_cols=["CoinName"],
    title="Tradeable Cryto Currencies",
)
clustered_df.hvplot.scatter(**unscaled_scatter_options)

In [None]:
# Plot the scatter with x="Scaled Coins Mined" and y="Scaled Coins Supply" (SCALED DATA)
# The index of new_clustered_df holds the coin names but is named 'CoinType',
# and the frame has no 'CoinName' column, so the hover entry has to reference
# 'CoinType' for the coin name to be available to the tooltip.
new_clustered_df.hvplot.scatter(x="Scaled Coins Mined", y="Scaled Coins Supply", by = "class",
                                xlabel = "Number of Coins Mined (Scaled)",
                                ylabel = "Number of Coins in Supply (Scaled)",
                                hover_cols=["CoinType"],
                                title = "Tradeable Cryto Currencies",
                                )

In [None]:
# Scatter plot with logarithmic axes
# A logarithmic axis cannot include zero or negative values, so the lower
# limits are taken from the smallest strictly positive value in each scaled
# column instead of a negative constant. Points whose scaled value is 0
# cannot be drawn on a log axis.
scaled_columns = ["Scaled Coins Mined", "Scaled Coins Supply"]
scaled_values = new_clustered_df[scaled_columns]
# Non-positive entries become NaN and are skipped by min()
x_floor, y_floor = scaled_values[scaled_values > 0].min()

# The index of new_clustered_df holds the coin names but is named 'CoinType',
# and the frame has no 'CoinName' column, so hover_cols references 'CoinType'.
new_clustered_df.hvplot.scatter(x="Scaled Coins Mined", y="Scaled Coins Supply", by = "class",
                                xlabel = "Number of Coins Mined (Scaled)",
                                ylabel = "Number of Coins in Supply (Scaled)",
                                hover_cols=["CoinType"],
                                title = "Tradeable Cryto Currencies",
                                logx=True, logy=True,
                                ylim=(y_floor, 1.1),
                                xlim=(x_floor, 1.1))

In [None]:
#new_clustered_df.hvplot.scatter(x="Scaled Coins Mined", y="Scaled Coins Supply", by = "class", 
                                #xlabel = "Number of Coins Mined (Scaled)", 
                                #ylabel = "Number of Coins in Supply (Scaled)",
                                #hover_cols=["CoinName"], 
                                #title = "Tradeable Cryto Currencies",
                                #logx=True, logy=True)