# Clustering Crypto

In [None]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


### Deliverable 1: Preprocessing the Data for PCA

In [None]:
# Load the crypto_data.csv dataset.
## Read the CSV that sits next to this notebook into a DataFrame and
## preview its first ten rows.
file = "crypto_data.csv"
crypto_df = pd.read_csv(filepath_or_buffer=file)
crypto_df.head(n=10)

In [None]:
## Use the crypto abbreviation column (read in as "Unnamed: 0") as the row
## index, and clear the index name in the same chained expression so that no
## header is shown above the index.
## Background on removing an index name:
## https://stackoverflow.com/questions/29765548/remove-index-name-in-pandas
crypto_df = crypto_df.set_index("Unnamed: 0").rename_axis(None)
crypto_df

In [None]:
# Keep all the cryptocurrencies that are being traded.
## The filter on traded cryptocurrencies compares the "IsTrading" column
## against a number, so the column is first cast to integers with astype()
## (presumably it holds True/False flags, which become 1/0).
## Source: https://stackoverflow.com/questions/17383094/how-can-i-map-true-false-to-1-0-in-a-pandas-dataframe
crypto_df = crypto_df.assign(IsTrading=crypto_df["IsTrading"].astype(int))
crypto_df.head()

In [None]:
## Build a boolean mask that is True where "IsTrading" equals 1 and use it
## to select only those rows of the DataFrame.
## We've done something similar in 4.7.6 (2021).
crypto_df = crypto_df.loc[crypto_df["IsTrading"].eq(1)]
crypto_df

In [None]:
# Keep all the cryptocurrencies that have a working algorithm.
## Per info(), we see that all 1144 trading cryptos have an algorithm.
## However, we noticed that only 685 cryptos had mined coins.

In [None]:
# Remove the "IsTrading" column.
## After the filter on traded rows it is no longer needed as a feature.
crypto_df = crypto_df.drop("IsTrading", axis="columns")
crypto_df.head()

In [None]:
# Remove rows that have at least 1 null value.
## As noted in 18.2.4 (2021), dropna() discards incomplete rows; the
## row-wise / "any null" behaviour is spelled out explicitly here.
crypto_df = crypto_df.dropna(axis="index", how="any")
crypto_df

In [None]:
# Keep the rows where coins are mined.
## Same boolean-mask approach as for the traded filter: keep only rows whose
## "TotalCoinsMined" value is strictly greater than 0.
crypto_df = crypto_df.loc[crypto_df["TotalCoinsMined"].gt(0)]
crypto_df.head()

In [None]:
# Create a new DataFrame that holds only the cryptocurrencies names.
## The "CoinName" column is a Series; to_frame() turns it into a
## single-column DataFrame that keeps the same index.
newDF = crypto_df["CoinName"].to_frame()
newDF

In [None]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
## The names were already saved separately in newDF.
crypto_df = crypto_df.drop(labels=["CoinName"], axis=1)

In [None]:
## We'll check our work using info(), which prints a summary of the
## cleaned DataFrame (columns, non-null counts, and dtypes).
## We ultimately get 532 rows with 4 columns of data.
## (Counts as recorded by the original author; they depend on the input CSV.)
crypto_df.info()

In [None]:
# Use get_dummies() to create variables for text features.
## Refactored from 17.6.1 (2021): the "Algorithm" and "ProofType" columns
## are expanded into dummy (indicator) columns; every other column is
## passed through as-is.
text_features = ["Algorithm", "ProofType"]
X = pd.get_dummies(data=crypto_df, columns=text_features)
X.head()

In [None]:
# Standardize the data with StandardScaler().
## Refactored from 17.6.4 (2021): the scaler is fitted on X and then applied
## to that same X, written as two explicit steps.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

### Deliverable 2: Reducing Data Dimensions Using PCA

In [None]:
# Using PCA to reduce dimension to three principal components.
## We'll follow 18.5.2 (2021) when reducing data dimensions with PCA.
## random_state is pinned for reproducibility: depending on the data shape,
## scikit-learn's PCA may choose a randomized SVD solver, in which case an
## unseeded run can give slightly different components each time. The seed
## matches the random_state=0 already used for the KMeans models.
pca = PCA(n_components=3, random_state=0)
Standard_pca = pca.fit_transform(X_scaled)

In [None]:
# Create a DataFrame with the three principal components.
## As in 18.5.2 (2021), the reduced data is wrapped in a new DataFrame.
## pandas.DataFrame accepts an `index` argument, so the index of crypto_df
## is passed in directly; that way each row of principal components keeps
## the crypto abbreviation it was computed from.
# Sources consulted:
## https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
## https://stackoverflow.com/questions/18176933/create-an-empty-data-frame-with-index-from-another-data-frame
## https://www.askpython.com/python-modules/pandas/get-index-of-dataframe
pc_columns = ["PC1", "PC2", "PC3"]
pcs_df = pd.DataFrame(Standard_pca, columns=pc_columns, index=crypto_df.index)
pcs_df.head()

In [None]:
## Attach the coin names to pcs_df by copying the "CoinName" column from the
## earlier newDF; the assignment aligns the values on the index.
coin_names = newDF["CoinName"]
pcs_df["CoinName"] = coin_names
pcs_df["CoinName"]

### Deliverable 3: Clustering Cryptocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [None]:
# Create an elbow curve to find the best value for K.
## We'll refer to 18.5.2 (2021) to build our code.
## KMeans needs purely numeric input, and by this point pcs_df also carries
## the text "CoinName" column, so the models are fitted on the three
## principal-component columns only.
pc_features = pcs_df[["PC1", "PC2", "PC3"]]
inertia = []
k = list(range(1, 11))
for i in k:
    # One model per candidate cluster count; the fixed seed keeps runs repeatable.
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pc_features)
    inertia.append(km.inertia_)
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

Running K-Means with `k=4`

In [None]:
# Initialize the K-Means model.
## Per 18.4.1 (2021), we see that the curve's slope changes noticeably at k=4,
## indicating that 4 is our ideal number of clusters.
## We'll prepare our K-means work with reference to 18.3.2 (2021).
model = KMeans(n_clusters=4, random_state=0)

## KMeans needs purely numeric input, and pcs_df also carries the text
## "CoinName" column, so only the principal-component columns are used.
pc_features = pcs_df[["PC1", "PC2", "PC3"]]

# Fit the model
model.fit(pc_features)

# Predict clusters
predictions = model.predict(pc_features)
print(predictions)

In [None]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
## Place the crypto_df features and the pcs_df columns side by side
## (column-wise) with concat(), per the pandas documentation:
## https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html
## Then, in the same chained expression:
##   - "CoinName" is set from the earlier newDF, and
##   - "Class" is set to the fitted model's labels, as done in 18.5.2 (2021).
clustered_df = pd.concat([crypto_df, pcs_df], axis="columns").assign(
    CoinName=newDF["CoinName"],
    Class=model.labels_,
)

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(n=10)

### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [None]:
# Creating a 3D-Scatter with the PCA data and the clusters
# YOUR CODE HERE


In [None]:
# Create a table with tradable cryptocurrencies.
# YOUR CODE HERE

In [None]:
# Print the total number of tradable cryptocurrencies.
# YOUR CODE HERE

In [None]:
# Scaling data to create the scatter plot with tradable cryptocurrencies.
# YOUR CODE HERE

In [None]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
## The two supply features are rescaled with MinMaxScaler inside this cell so
## that plot_df is actually defined before it is displayed below and the cell
## does not rely on variables from cells that are still placeholders.
## NOTE(review): assumes clustered_df still carries the "TotalCoinSupply" and
## "TotalCoinsMined" feature columns inherited from crypto_df - confirm.
supply_columns = ["TotalCoinSupply", "TotalCoinsMined"]
plot_df = pd.DataFrame(
    MinMaxScaler().fit_transform(clustered_df[supply_columns]),
    columns=supply_columns,
    index=clustered_df.index,
)

# Add the "CoinName" column from the clustered_df DataFrame to the new DataFrame.
plot_df["CoinName"] = clustered_df["CoinName"]

# Add the "Class" column from the clustered_df DataFrame to the new DataFrame.
plot_df["Class"] = clustered_df["Class"]

plot_df.head(10)

In [None]:
# Create a hvplot.scatter plot using x="TotalCoinsMined" and y="TotalCoinSupply".
# YOUR CODE HERE
