In [29]:
import os

import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering


# Data Preprocessing

In [30]:
# Loading data
# The CSV location can be overridden with the CRYPTO_DATA_PATH environment
# variable so the notebook can run on other machines. When the variable is
# not set, the original local path is used, so existing behavior is unchanged.
file_path = os.environ.get(
    "CRYPTO_DATA_PATH",
    "/Users/annashvilpe/Documents/UT - Data Bootcamp/Analysis_Projects/Cryptocurrencies/Resources/crypto_data.csv",
)
# The first CSV column is used as the row index.
crypto_df = pd.read_csv(file_path, index_col=0)
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
2015,2015 coin,X11,True,PoW/PoS,,0
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,True,PoW,107684200.0,0
LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [31]:
# Count the non-null values in each column to see which columns have missing data.
crypto_df.count()

CoinName           1252
Algorithm          1252
IsTrading          1252
ProofType          1252
TotalCoinsMined     744
TotalCoinSupply    1252
dtype: int64

In [32]:
# 1. Remove all cryptocurrencies that aren't trading.
# Build a boolean mask from the IsTrading flag, then keep only the rows where it is True.
crypto_trading = crypto_df["IsTrading"] == True
crypto_df = crypto_df[crypto_trading]

# Rows and columns remaining after the filter.
crypto_df.shape

(1144, 6)

In [33]:
# 2. Remove all cryptocurrencies that don't have an algorithm defined.
# Start by counting the missing values in each column.
crypto_df.isna().sum()


CoinName             0
Algorithm            0
IsTrading            0
ProofType            0
TotalCoinsMined    459
TotalCoinSupply      0
dtype: int64

In [34]:
# Keep only the rows that have an algorithm defined.
# pandas.read_csv converts the literal text "N/A" to NaN by default, and
# NaN != 'N/A' evaluates to True, so a plain string comparison would keep
# rows with a missing algorithm. Check for missing values as well as the
# literal marker.
crypto_algo = crypto_df['Algorithm'].notna() & (crypto_df['Algorithm'] != 'N/A')
crypto_df = crypto_df[crypto_algo]
crypto_df.shape

(1144, 6)

In [35]:
# 3. Remove the IsTrading column.
# Rebind instead of dropping in place, and ignore an already-missing column,
# so that re-running this cell does not raise a KeyError.
crypto_df = crypto_df.drop(columns=['IsTrading'], errors='ignore')
crypto_df.head()


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.0,0


In [36]:
# Re-check the non-null counts per column; TotalCoinsMined is the column to watch for gaps.
crypto_df.count()

CoinName           1144
Algorithm          1144
ProofType          1144
TotalCoinsMined     685
TotalCoinSupply    1144
dtype: int64

In [37]:
# 4. Remove all cryptocurrencies with at least one null value.
# dropna() defaults to axis=0 and how='any', i.e. a row is discarded as soon
# as any one of its fields is missing.
crypto_df = crypto_df.dropna()
crypto_df.head()


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
808,808,SHA-256,PoW/PoS,0.0,0
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000


In [38]:
# Rows and columns remaining after the rows with null values were dropped.
crypto_df.shape

(685, 5)

In [39]:
# 5.0 Remove all cryptocurrencies without coins mined.
# Only rows with a strictly positive TotalCoinsMined are kept.
has_mined_coins = crypto_df['TotalCoinsMined'].gt(0)
crypto_df = crypto_df.loc[has_mined_coins]
crypto_df.shape

(532, 5)

In [40]:
# 6.0 Store the names of all cryptocurrencies in a DataFrame named coins_name,
# using crypto_df.index as the index of the new DataFrame.
# Selecting with a list keeps the result a one-column DataFrame that shares
# crypto_df's index; .copy() detaches it from crypto_df.
coins_name = crypto_df[["CoinName"]].copy()
coins_name.head()

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum


In [41]:
# 7.0 Remove the CoinName column.
# Rebind instead of dropping in place, and ignore an already-missing column,
# so that re-running this cell does not raise a KeyError.
crypto_df = crypto_df.drop(columns=['CoinName'], errors='ignore')
crypto_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0


In [42]:
# 8.0 Create dummy variables for all of the text features, and store the resulting data in a DataFrame named X.
# Algorithm and ProofType are one-hot encoded; the remaining columns are passed through unchanged.
# NOTE(review): the rendered output lists both "ProofType_PoW/PoS" and "ProofType_PoW/PoS.1",
# which suggests ProofType values that differ only by whitespace or case - confirm and normalize if so.
X = pd.get_dummies(data=crypto_df, columns=["Algorithm", "ProofType"])
X.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
# 9.0 Standardize every column of X with scikit-learn's StandardScaler.
# This is done before PCA and K-means, as the task notes require.
X_scaled = StandardScaler().fit(X).transform(X)
# Show the first five standardized rows.
print(X_scaled[:5])

[[-0.11710817 -0.1528703  -0.0433963  -0.0433963  -0.0433963  -0.06142951
  -0.07530656 -0.0433963  -0.06142951 -0.06142951 -0.0433963  -0.0433963
  -0.19245009 -0.06142951 -0.09740465 -0.0433963  -0.11547005 -0.07530656
  -0.0433963  -0.0433963  -0.15191091 -0.0433963  -0.13118084 -0.0433963
  -0.0433963  -0.08703883 -0.0433963  -0.0433963  -0.0433963  -0.0433963
  -0.06142951 -0.0433963  -0.08703883 -0.08703883 -0.08703883 -0.0433963
  -0.13118084 -0.13840913 -0.13840913 -0.0433963  -0.06142951 -0.0433963
  -0.07530656 -0.18168574 -0.0433963  -0.0433963  -0.0433963  -0.07530656
  -0.15826614 -0.31491833 -0.0433963  -0.08703883 -0.07530656 -0.06142951
   1.38675049 -0.0433963  -0.0433963  -0.06142951 -0.0433963  -0.0433963
  -0.0433963  -0.0433963  -0.0433963  -0.0433963  -0.0433963  -0.0433963
  -0.39879994 -0.0433963  -0.18168574 -0.0433963  -0.08703883 -0.08703883
  -0.10680283 -0.0433963  -0.13118084 -0.0433963  -0.0433963  -0.0433963
  -0.0433963  -0.07530656 -0.43911856 -0.04339

# Reducing Data Dimensions Using PCA

In [44]:
# Initialize PCA model
# Three components are kept. random_state is fixed so that SVD solvers that
# rely on randomization produce the same components on every run.
pca = PCA(n_components=3, random_state=0)

In [45]:
# Fit PCA on the standardized data and project it onto the three principal
# components requested when the model was initialized (n_components=3).
X_pca = pca.fit_transform(X_scaled)

In [46]:
# Wrap the PCA output in a DataFrame with one column per component,
# indexed the same way as crypto_df.
df_X_pca = pd.DataFrame(X_pca, index=crypto_df.index, columns=["PC1", "PC2", "PC3"])
df_X_pca.head()

Unnamed: 0,PC1,PC2,PC3
42,-0.332012,0.957966,-0.412405
404,-0.315352,0.957994,-0.412489
1337,2.307715,1.568195,-0.427429
BTC,-0.144551,-1.286577,0.09596
ETH,-0.152553,-1.980808,0.315979


# Clustering Cryptocurrencies Using K-means

In [47]:
#1.0 Create an elbow curve to find the best value for K, using the df_X_pca DataFrame.

inertia = []
k = list(range(1, 11))
# Calculate the inertia for the range of K values.
# Only the principal-component columns are used: the K-means cell adds a
# "class" column to df_X_pca, and re-running this cell afterwards would
# otherwise feed those cluster labels back into K-means as a feature.
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_X_pca[["PC1", "PC2", "PC3"]])
    inertia.append(km.inertia_)

In [48]:
# Collect the K values and their inertias in a two-column DataFrame and draw
# the elbow curve with hvPlot.
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(title="Elbow Curve", x="k", y="inertia", xticks=k)

In [59]:
#2.0 Once you define the best value for K, run the K-means algorithm to predict the K
#clusters for the cryptocurrencies' data. Use df_X_pca to run the K-means algorithm.

# Select the principal-component columns explicitly. This cell adds a "class"
# column to df_X_pca, so fitting on the whole frame would use the previous
# run's labels as a feature whenever the cell is executed again.
pca_features = df_X_pca[["PC1", "PC2", "PC3"]]

# Initialize the K-means model
model = KMeans(n_clusters=4, random_state=0)

# Fit the model
model.fit(pca_features)

# Predict clusters
predictions = model.predict(pca_features)

# Add the predicted class column (the labels assigned during fitting)
df_X_pca["class"] = model.labels_
df_X_pca.head()


Unnamed: 0,PC1,PC2,PC3,class
42,-0.332012,0.957966,-0.412405,0
404,-0.315352,0.957994,-0.412489,0
1337,2.307715,1.568195,-0.427429,0
BTC,-0.144551,-1.286577,0.09596,1
ETH,-0.152553,-1.980808,0.315979,1


In [60]:
# 3.0 Build "clustered_df" by placing the original feature columns
# (Algorithm, ProofType, TotalCoinsMined, TotalCoinSupply) next to the PCA
# columns and the cluster label (PC1, PC2, PC3, class).
# Rows are aligned on the index; CoinName is attached in a separate step.

clustered_df = pd.concat(objs=[crypto_df, df_X_pca], axis="columns", sort=False)


In [61]:
# Attach the coin names saved in coins_name; values are aligned on the index.
clustered_df["CoinName"] = coins_name["CoinName"]

In [62]:
# Preview the first rows of the combined frame.
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,class,CoinName
42,Scrypt,PoW/PoS,41.99995,42,-0.332012,0.957966,-0.412405,0,42 Coin
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.315352,0.957994,-0.412489,0,404Coin
1337,X13,PoW/PoS,29279420000.0,314159265359,2.307715,1.568195,-0.427429,0,EliteCoin
BTC,SHA-256,PoW,17927180.0,21000000,-0.144551,-1.286577,0.09596,1,Bitcoin
ETH,Ethash,PoW,107684200.0,0,-0.152553,-1.980808,0.315979,1,Ethereum


# Visualizing Results

In [63]:
#1.0 Create a 3D scatter plot using Plotly Express to plot the clusters using the clustered_df DataFrame.
#You should include the following parameters on the plot: hover_name="CoinName"
#and hover_data=["Algorithm"] to show this additional info on each data point.

# Plot the 3D-scatter: one point per cryptocurrency, positioned by its three
# principal components and colored/marked by its cluster label.
fig = px.scatter_3d(
    clustered_df,
    x="PC1",
    y="PC2",
    z="PC3",
    color="class",
    symbol="class",
    hover_name="CoinName",
    # Show the algorithm in the hover tooltip, as the task above requires.
    hover_data=["Algorithm"],
    width=800,
)
# Anchor the legend at the top-left corner of the figure.
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [64]:
# 2.0 Render a data table of the tradable cryptocurrencies with hvplot.table.
# The table shows, in this order: CoinName, Algorithm, ProofType,
# TotalCoinSupply, TotalCoinsMined and the cluster label ("class").

table_columns = [
    'CoinName',
    'Algorithm',
    'ProofType',
    'TotalCoinSupply',
    'TotalCoinsMined',
    'class',
]
clustered_df[table_columns].hvplot.table()

In [65]:
# Rescale the two coin-count columns with MinMaxScaler (default settings) so
# they can be plotted on a comparable scale.
# MinMaxScaler and minmax_scale are imported in the notebook's import cell.
mm_scaler = MinMaxScaler()
plot_data = mm_scaler.fit_transform(
    clustered_df[["TotalCoinSupply", "TotalCoinsMined"]]
)
# Rebuild a DataFrame around the scaled array, keeping clustered_df's index so
# the name and cluster label below line up row by row.
plot_df = pd.DataFrame(
    plot_data, columns=["TotalCoinSupply", "TotalCoinsMined"], index=clustered_df.index
)
plot_df["CoinName"] = clustered_df["CoinName"]
plot_df["class"] = clustered_df["class"]

In [66]:
# 3.0 Scatter plot of the clustered data with hvplot.scatter:
# x is TotalCoinsMined and y is TotalCoinSupply (both taken from plot_df),
# with one series per cluster label and the coin name shown on hover.

plot_df.hvplot.scatter(
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    by="class",
    hover_cols=["CoinName"],
)