In [157]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
import plotly.express as px
import hvplot.pandas

In [158]:
# Read the raw cryptocurrency listing from disk and preview the first rows.
file_path = "crypto_data.csv"
crypto_df = pd.read_csv(filepath_or_buffer=file_path)
crypto_df.head(5)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [159]:
# Data preprocessing tasks.
#
# NOTE: `crypto_df_prep` is deliberately the SAME object as `crypto_df` (no copy) and is
# filtered in place. A later cell labels the PCA output with `crypto_df.index`, which only
# lines up with the PCA rows because `crypto_df` shrinks here as well.
# Consequence: this cell is not re-runnable on its own (the second `set_index` would fail);
# re-execute the loading cell first.
crypto_df_prep = crypto_df

# The unnamed first column holds the tickers, so use it as the index.
crypto_df_prep.rename(columns={'Unnamed: 0': ''}, inplace=True)
crypto_df_prep.set_index([''], inplace=True)

# 1: Remove all cryptocurrencies that aren't trading.
trading_f_indices = crypto_df_prep[crypto_df_prep['IsTrading'] == False].index
crypto_df_prep.drop(trading_f_indices, inplace=True)

# 2: Remove all cryptocurrencies that don't have an algorithm defined.
# Interpreted as dropping rows whose 'Algorithm' is 'Multiple'; rows where it is null
# are removed by the null filter in step 4.
algo_f_indices = crypto_df_prep[crypto_df_prep['Algorithm'] == 'Multiple'].index
crypto_df_prep.drop(algo_f_indices, inplace=True)

# 3: Remove the IsTrading column.
crypto_df_prep.drop(columns=['IsTrading'], inplace=True)

# 4: Remove all cryptocurrencies with at least one null value.
# `dropna()` returns a new frame unless `inplace=True`; the bare call used previously
# discarded its result, so no rows were actually removed by this step.
crypto_df_prep.dropna(inplace=True)

# 5: Remove all cryptocurrencies without coins mined.
# Null 'TotalCoinsMined' values are already gone after step 4, so only zeros remain to filter.
mined_f = crypto_df_prep['TotalCoinsMined'] == 0
mined_f_indices = crypto_df_prep[mined_f].index
crypto_df_prep.drop(mined_f_indices, inplace=True)

# 6: Store the names of all cryptocurrencies in a DataFrame named coins_name, indexed like
# the filtered frame (which is also `crypto_df.index`, because of the aliasing noted above).
# `.copy()` keeps the names intact once 'CoinName' is dropped from the source frame below.
coins_name = crypto_df_prep[['CoinName']].copy()

# 7: Remove the CoinName column from the prepped frame; it is an identifier, not a feature.
crypto_df_prep.drop(columns=['CoinName'], inplace=True)

# 8: Create dummy variables for all of the text features and store the result in X.
X = pd.get_dummies(crypto_df_prep, columns=["Algorithm", "ProofType"])

# 9: Standardize every column of X with StandardScaler (zero mean, unit variance) so that no
# single feature dominates PCA and K-means purely because of its scale.
X_scaled = StandardScaler().fit_transform(X)

# Reducing Data Dimensions Using PCA

In [160]:
# Build a 3-component PCA model, then project the standardized features onto it.
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)

In [161]:
# Wrap the PCA output in a DataFrame.
# The rows of X_pca correspond one-to-one to the rows of X (X -> X_scaled -> X_pca), so label
# them with X's own index instead of relying on `crypto_df` having been filtered in place by
# the preprocessing cell.
pcs_df = pd.DataFrame(
    data=X_pca,
    columns=["principal component 1", "principal component 2", "principal component 3"],
    index=X.index,
)
pcs_df.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3
,,,
42,-0.36696,0.845506,-0.615516
404,-0.350508,0.845367,-0.615883
1337,2.252839,1.283099,-0.695736
BTC,-0.124365,-1.276516,0.238158
ETH,-0.121186,-1.944149,0.472975


# Clustering Cryptocurrencies Using K-means

In [144]:
# 1: Elbow curve over pcs_df to choose the number of clusters K.
# Candidate K values: 1 through 10.
k = list(range(1, 11))

# Fit one K-means model per candidate K and record its inertia.
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(pcs_df).inertia_
    for n_clusters in k
]

# Plot inertia against K; the "elbow" of the line suggests the best K.
df_elbow = pd.DataFrame({"k": k, "inertia": inertia})
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [145]:
# 2: Run K-means with K=4 on the principal components.
# `fit` returns the estimator itself, so the model is created and fitted in one statement.
model = KMeans(n_clusters=4, random_state=0).fit(pcs_df)

# Cluster assignment for every row of pcs_df.
predictions = model.predict(pcs_df)

In [146]:
# 3: Create a new DataFrame named "clustered_df".
# Build the labelled PCA frame as a separate object rather than renaming/extending `pcs_df`
# in place: mutating `pcs_df` made this cell fail when re-run (three new names assigned to
# five columns) and left a string column in the frame the K-means cells fit on.
# `.iloc[:, :3]` selects just the three principal-component columns.
pcs_labeled = pcs_df.iloc[:, :3].copy()
pcs_labeled.columns = ["PC 1", "PC 2", "PC 3"]
pcs_labeled["CoinName"] = coins_name["CoinName"]  # aligned on the ticker index
pcs_labeled["Class"] = model.labels_              # one label per row, in pcs_df row order

# Join the original features with the components, coin names and cluster labels by ticker.
clustered_df = crypto_df_prep.merge(pcs_labeled, how='inner', left_index=True, right_index=True)
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
,,,,,,,,,
42,Scrypt,PoW/PoS,41.99995,42.0,-0.332873,0.865885,-0.575821,42 Coin,2.0
404,Scrypt,PoW/PoS,1055185000.0,532000000.0,-0.316351,0.865945,-0.576123,404Coin,2.0
1337,X13,PoW/PoS,29279420000.0,314159265359.0,2.285097,1.526404,-0.695151,EliteCoin,2.0
BTC,SHA-256,PoW,17927180.0,21000000.0,-0.144972,-1.293303,0.197261,Bitcoin,0.0
ETH,Ethash,PoW,107684200.0,0.0,-0.139425,-1.953134,0.457285,Ethereum,0.0


# Visualizing Results

In [155]:
# 1: Plot the clusters as a 3D scatter over the three principal components.
# Cluster labels are categories, not quantities. Plotly Express treats a numeric `color`
# column as continuous (colour bar + gradient), so cast the label to string for plotting only;
# `clustered_df` itself is left unchanged.
plot_df = clustered_df.assign(Class=clustered_df["Class"].astype(str))

fig = px.scatter_3d(
    plot_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    hover_name="CoinName",
    hover_data=["Algorithm"],
    color="Class",
    symbol="Class",
    # Keep the legend entries in a stable, sorted order instead of first-appearance order.
    category_orders={"Class": sorted(plot_df["Class"].unique())},
    width=800,
)
# Anchor the legend to the top-left corner of the figure.
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [162]:
# 2: Interactive table of the clustered cryptocurrencies.

# Columns to show, in display order.
col_names = ['CoinName', 'Algorithm', 'ProofType', 'TotalCoinSupply', 'TotalCoinsMined', 'Class']

# Make sure both coin-count columns are float64 before displaying them.
clustered_df = clustered_df.astype(
    dict.fromkeys(["TotalCoinsMined", "TotalCoinSupply"], "float64")
)

clustered_df.hvplot.table(columns=col_names, sortable=True, selectable=True)

In [163]:
# 3: Scatter plot of coins mined against total coin supply; hovering shows the coin name.
clustered_df.hvplot.scatter(
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    hover_cols=["CoinName"],
)

# TODO: change the datatype of coins mined and supply to floats retroactively
# (i.e. earlier in the notebook).