In [1]:
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the crypto market CSV into a DataFrame indexed by the `coin_id` column
market_data_path = "Resources/crypto_market_data.csv"
df_market_data = pd.read_csv(market_data_path, index_col="coin_id")

# Show the first ten rows as the cell output
df_market_data.head(10)

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,1.08388,7.60278,6.57509,7.67258,-3.25185,83.5184,37.51761
ethereum,0.22392,10.38134,4.80849,0.13169,-12.8889,186.77418,101.96023
tether,-0.21173,0.04935,0.0064,-0.04237,0.28037,-0.00542,0.01954
ripple,-0.37819,-0.60926,2.24984,0.23455,-17.55245,39.53888,-16.60193
bitcoin-cash,2.90585,17.09717,14.75334,15.74903,-13.71793,21.66042,14.49384
binancecoin,2.10423,12.85511,6.80688,0.05865,36.33486,155.61937,69.69195
chainlink,-0.23935,20.69459,9.30098,-11.21747,-43.69522,403.22917,325.13186
cardano,0.00322,13.99302,5.55476,10.10553,-22.84776,264.51418,156.09756
litecoin,-0.06341,6.60221,7.28931,1.21662,-17.2396,27.49919,-12.66408
bitcoin-cash-sv,0.9253,3.29641,-1.86656,2.88926,-24.87434,7.42562,93.73082


In [3]:
# Generate summary statistics (count, mean, std, min, quartiles, max) for each column
df_market_data.describe()

Unnamed: 0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
count,41.0,41.0,41.0,41.0,41.0,41.0,41.0
mean,-0.269686,4.497147,0.185787,1.545693,-0.094119,236.537432,347.667956
std,2.694793,6.375218,8.376939,26.344218,47.365803,435.225304,1247.842884
min,-13.52786,-6.09456,-18.1589,-34.70548,-44.82248,-0.3921,-17.56753
25%,-0.60897,0.04726,-5.02662,-10.43847,-25.90799,21.66042,0.40617
50%,-0.06341,3.29641,0.10974,-0.04237,-7.54455,83.9052,69.69195
75%,0.61209,7.60278,5.51074,4.57813,0.65726,216.17761,168.37251
max,4.84033,20.69459,24.23919,140.7957,223.06437,2227.92782,7852.0897


In [4]:
# Plot the DataFrame as a line chart to get a first look at its contents
line_plot_options = {"width": 1000, "height": 600, "rot": 90}
df_market_data.hvplot.line(**line_plot_options)

---

### Prepare the Data

In [5]:
# Standardize the CSV data with scikit-learn's `StandardScaler`:
# learn the per-column scaling from the data, then apply it to the same data
scaler = StandardScaler()
scaler.fit(df_market_data)
normalized_data = scaler.transform(df_market_data)

In [6]:
# Wrap the scaled array in a DataFrame that reuses the original column labels
# and the original `coin_id` index
df_normalized_data = pd.DataFrame(
    normalized_data,
    index=df_market_data.index,
    columns=df_market_data.columns,
)

# Keep an independent copy of the scaled DataFrame
df_normalized_data_copy = df_normalized_data.copy()

# Collect the index labels (the crypto names) into a plain list
names = df_normalized_data_copy.index.tolist()

# Print the name of the index to confirm it is still `coin_id`
print(df_normalized_data_copy.index.name)

# Display sample data
display(df_normalized_data_copy.head(10))


coin_id


Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,0.508529,0.493193,0.7722,0.23546,-0.067495,-0.355953,-0.251637
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352
tether,0.021774,-0.706337,-0.02168,-0.06103,0.008005,-0.550247,-0.282061
ripple,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,-0.458259,-0.295546
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317
binancecoin,0.891871,1.327295,0.800214,-0.057148,0.778653,-0.188232,-0.225533
chainlink,0.011397,2.572251,1.101647,-0.490495,-0.931954,0.387759,-0.018284
cardano,0.10253,1.508001,0.648885,0.328959,-0.486349,0.06508,-0.155428
litecoin,0.077497,0.334297,0.85852,-0.012646,-0.366477,-0.486266,-0.292351
bitcoin-cash-sv,0.448952,-0.190684,-0.248043,0.051634,-0.529666,-0.532961,-0.206029


---

### Find the Best Value for k Using the Original Data.

In [7]:
# Candidate cluster counts to try: the integers 1 through 11
k_values = [*range(1, 12)]

In [8]:
# Collect one inertia value per candidate k; the list stays aligned with `k_values`
inertia_values = []

# Compute the inertia for each candidate value of k
for k in k_values:
    # Fix `random_state` so the centroid initialisation, and therefore the elbow
    # curve, is the same on every run (matching the other KMeans calls in this
    # notebook); `n_init` is set explicitly to suppress the warning
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)

    # Fit the model to the scaled data
    kmeans.fit(df_normalized_data)

    # Record the inertia of the fitted model
    inertia_values.append(kmeans.inertia_)


In [9]:
# Pair each tried k with the inertia it produced.
# `k_values` is used (rather than a second hard-coded range) so the two columns
# always come from the same list of candidate k values.
elbow_data = {
    'k_values': list(k_values),  # Values of k
    'inertia_values': inertia_values  # Inertia values obtained from KMeans clustering
}

# Convert the dictionary to a DataFrame
df_elbow_curve = pd.DataFrame(elbow_data)

# `save` is a second name bound to the same DataFrame object (not a copy)
save = df_elbow_curve

# Display the DataFrame
display(df_elbow_curve)

Unnamed: 0,k_values,inertia_values
0,1,287.0
1,2,195.820218
2,3,123.190482
3,4,79.022435
4,5,65.957881
5,6,54.869951
6,7,47.574268
7,8,37.452321
8,9,32.636912
9,10,28.866775


In [10]:
# Draw the elbow curve (inertia against the number of clusters) with hvplot
# so the optimal value for k can be identified visually.
elbow_curve_options = dict(
    x='k_values',
    y='inertia_values',
    xlabel='Number of Clusters (k)',
    ylabel='Inertia',
    title='Elbow Curve',
    grid=True,
    line_color='blue',
    line_width=2,
    hover_line_color='red',
    width=1000,
    height=600,
)
elbow_curve = df_elbow_curve.hvplot.line(**elbow_curve_options)

# Show the plot
elbow_curve

#### Answer the following question: 

**Question:** What is the best value for `k`?

**Answer:** 4

---

### Cluster Cryptocurrencies with K-means Using the Original Data

In [11]:
# Candidate cluster counts to fit and compare
best_ks = [2, 3, 4, 6]

# Map each k to its fitted K-Means model
kmeans_models = {}
for k in best_ks:
    # `fit` returns the estimator itself, so the model is built (with `n_init`
    # set explicitly) and fitted to the scaled data in one expression
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(df_normalized_data)

    # Store the fitted model under its k
    kmeans_models[k] = kmeans


In [12]:
# Recompute the elbow data for the scaled data with a fixed `random_state`,
# gathering (k, inertia) pairs first and splitting them into columns afterwards
inertia_by_k = []
for k in range(1, 12):
    # Initialize the KMeans model with the current value of k
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)

    # Fit the model to the scaled data
    kmeans.fit(df_normalized_data)

    # Remember this k together with the inertia it produced
    inertia_by_k.append((k, kmeans.inertia_))

elbow_data = {
    'k_values': [k_tried for k_tried, _ in inertia_by_k],
    'inertia_values': [inertia for _, inertia in inertia_by_k],
}

# Create a DataFrame with the data to plot the Elbow curve
df_elbow_curve = pd.DataFrame(elbow_data)


In [13]:
# Build one labelled copy of the scaled data per k in `best_ks`
df_normalized_data_copies = []
for k in best_ks:
    # Predict the cluster of every coin with the model fitted for this k
    cluster_labels = kmeans_models[k].predict(df_normalized_data)

    # Print the resulting array of cluster values
    print(f"Cluster labels for k = {k}: {cluster_labels}")

    # `assign` returns a new DataFrame with the extra 'Predictions' column,
    # leaving `df_normalized_data` itself unchanged
    df_normalized_data_copies.append(
        df_normalized_data.assign(Predictions=cluster_labels)
    )

# Display sample data for the first k in `best_ks`
df_normalized_data_copies[0].head()


Cluster labels for k = 2: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 0
 1 1 1 1]
Cluster labels for k = 3: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 2
 1 1 1 1]
Cluster labels for k = 4: [0 0 2 2 0 0 0 0 0 2 2 2 2 0 2 0 2 2 0 2 2 0 2 2 2 2 2 2 0 2 2 2 3 0 2 2 1
 2 2 2 2]
Cluster labels for k = 6: [3 3 0 0 3 3 3 3 3 0 4 0 0 3 0 0 4 0 3 0 4 3 0 0 0 0 0 4 3 5 0 0 2 0 4 0 1
 4 0 0 0]


Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y,Predictions
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
bitcoin,0.508529,0.493193,0.7722,0.23546,-0.067495,-0.355953,-0.251637,1
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352,1
tether,0.021774,-0.706337,-0.02168,-0.06103,0.008005,-0.550247,-0.282061,1
ripple,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,-0.458259,-0.295546,1
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317,1


In [28]:
# Create one scatter plot per k using hvPlot with
# `x="price_change_percentage_24h"` and `y="price_change_percentage_7d"`.
# The points are grouped by the labels found using K-Means, and the crypto
# name is added through `hover_cols` to identify each data point.

# Start a fresh list of scatter plots (the PCA section appends to it later)
scatter_plots = []

# Six hex colours (red, blue, green, purple, orange, yellow).
# NOTE: this list is not passed to any of the plots below.
colors = ["#FF0000", "#0000FF", "#00FF00", "#800080", "#FFA500", "#FFFF00"]

# Create a scatter plot for each k in `best_ks`
for i, k in enumerate(best_ks):

    # Add a column holding the crypto names (the `coin_id` index values)
    df_normalized_data_copies[i]['crypto_name'] = df_market_data.index.tolist()

    # Create the scatter plot
    scatter_plot = df_normalized_data_copies[i].hvplot.scatter(
        x="price_change_percentage_24h",
        y="price_change_percentage_7d",
        by='Predictions',
        hover_cols=['crypto_name'],
        title=f'Scatter Plot for k = {k}',
        width=700,
        height=600,
    )

    # Append the scatter plot to the list
    scatter_plots.append(scatter_plot)

# Combine every plot into a two-column grid. Adding the plots with `sum`
# (seeded with the first plot) keeps this correct for any number of k values
# instead of hard-coding exactly four plots.
layout = sum(scatter_plots[1:], scatter_plots[0]).cols(2)

# Display the grid layout
display(layout)

---

### Optimize Clusters with Principal Component Analysis.

In [29]:
# Create a PCA model instance configured to keep three components (`n_components=3`).
pca = PCA(n_components=3)

In [30]:
# Fit the PCA model on the scaled data and project it onto three principal components
principal_components = pca.fit_transform(df_normalized_data)

# Label the components and reuse the `coin_id` index of the scaled data
component_columns = ['Principal Component 1', 'Principal Component 2', 'Principal Component 3']
df_principal_components = pd.DataFrame(
    principal_components,
    index=df_normalized_data.index,
    columns=component_columns,
)

# View the first five rows of the DataFrame
display(df_principal_components.head())

Unnamed: 0_level_0,Principal Component 1,Principal Component 2,Principal Component 3
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin,-0.600667,0.84276,0.461595
ethereum,-0.458261,0.458466,0.952877
tether,-0.43307,-0.168126,-0.641752
ripple,-0.471835,-0.22266,-0.479053
bitcoin-cash,-1.1578,2.041209,1.859715


In [31]:
# Read the share of variance attributed to each principal component,
# and add the shares up to get the total explained by all three
explained_variance = pca.explained_variance_ratio_
total_variance_ratio = explained_variance.sum()

# Show the per-component ratios as a one-column table, then the total
print("Explained Variance Ratio:")
display(pd.DataFrame({'Explained Variance Ratio': explained_variance}))
print("Total Explained Variance Ratio:", total_variance_ratio)

Explained Variance Ratio:


Unnamed: 0,Explained Variance Ratio
0,0.371986
1,0.347008
2,0.176038


Total Explained Variance Ratio: 0.8950316570309841


#### Answer the following question: 

**Question:** What is the total explained variance of the three principal components?

**Answer:** The three principal components together explain about 89.5% of the variance (total explained variance ratio: 0.8950316570309841).

In [32]:
# Build the DataFrame of PCA data used for clustering below.
# The crypto names are carried over by reusing the `coin_id` index of the
# scaled data, so no separate column needs to be set as the index.
pca_columns = ['Principal Component 1', 'Principal Component 2', 'Principal Component 3']
df_pca_data = pd.DataFrame(
    principal_components,
    index=df_normalized_data.index,
    columns=pca_columns,
)

# Display sample data
display(df_pca_data.head())


Unnamed: 0_level_0,Principal Component 1,Principal Component 2,Principal Component 3
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin,-0.600667,0.84276,0.461595
ethereum,-0.458261,0.458466,0.952877
tether,-0.43307,-0.168126,-0.641752
ripple,-0.471835,-0.22266,-0.479053
bitcoin-cash,-1.1578,2.041209,1.859715


---

### Find the Best Value for k Using the PCA Data

In [33]:
# Candidate cluster counts for the PCA data: the integers 1 through 11
k_values = [*range(1, 12)]

In [34]:
# Inertia of a K-Means model fitted on the PCA data, one entry per value in `k_values`
inertia_values = []

for k in k_values:
    # Build the model with `k` clusters and fit it on `df_pca_data`
    # (`fit` returns the fitted estimator)
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(df_pca_data)

    # Record the fitted model's inertia
    inertia_values += [kmeans.inertia_]


In [35]:
# Pair each tried k with the inertia obtained on the PCA data
elbow_data = dict(k_values=k_values, inertia_values=inertia_values)

# Turn the pairs into a DataFrame for plotting the Elbow curve, and show it
df_elbow_data = pd.DataFrame(elbow_data)
df_elbow_data

Unnamed: 0,k_values,inertia_values
0,1,256.874086
1,2,165.901994
2,3,93.774626
3,4,49.665497
4,5,37.839466
5,6,30.777746
6,7,21.134056
7,8,17.091637
8,9,13.68114
9,10,10.630648


In [36]:
# Plot, for each k, the inertia on the original scaled data minus the inertia on the PCA data.
# In the tables displayed above the gap is roughly 30 or less for every k and shrinks as k grows,
# so both elbow curves point to the same choice of k = 4.
inertia_gap = df_elbow_curve['inertia_values'] - df_elbow_data['inertia_values']

# Keep only `k_values` from the original elbow data and attach the gap as `difference`
diff = df_elbow_curve.drop(columns='inertia_values').assign(difference=inertia_gap)

diff_plot = diff.hvplot.line(x='k_values', y='difference')
diff_plot

In [37]:
# Draw the elbow curve for the PCA data (inertia against the number of clusters)
# so the optimal value for k can be identified visually.
elbow_plot_options = dict(
    x='k_values',
    y='inertia_values',
    xlabel='Number of Clusters (k)',
    ylabel='Inertia',
    title='Elbow Curve',
    grid=True,
    line_color='blue',
    line_width=2,
    hover_line_color='red',
    width=1000,
    height=600,
)
elbow_plot = df_elbow_data.hvplot.line(**elbow_plot_options)

elbow_plot

#### Answer the following questions: 

* **Question:** What is the best value for `k` when using the PCA data?

  * **Answer:** 4


* **Question:** Does it differ from the best k value found using the original data?

  * **Answer:** No, it does not differ: the best value is `k = 4` for both the original data and the PCA data.

### Cluster Cryptocurrencies with K-means Using the PCA Data

In [38]:
# Cluster the PCA data once per k in `best_ks`, show the labels, and append one
# scatter plot per k to `scatter_plots` (after the original-data plots).

# Drop any PCA plots appended by a previous run of this cell, so re-running it
# does not keep growing `scatter_plots`. The first len(best_ks) entries are the
# original-data plots and are left untouched.
del scatter_plots[len(best_ks):]

# Iterate over each value of k in the list of candidate k-values
for k in best_ks:
    # Initialize the K-Means model with the current value of k
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)

    # Fit the K-Means model using the PCA data
    kmeans.fit(df_pca_data)

    # Predict the clusters to group the cryptocurrencies using the PCA data
    cluster_labels = kmeans.predict(df_pca_data)

    # Print the resulting array of cluster values
    print(f"Cluster labels for k = {k}: {cluster_labels}")

    # Create a copy of the DataFrame with the PCA data
    df_clustered_pca_data = df_pca_data.copy()

    # Add a new column to the DataFrame with the predicted clusters
    df_clustered_pca_data['Cluster'] = cluster_labels

    # Display sample data
    display(df_clustered_pca_data.head())

    # Create a scatter plot using hvPlot by setting x="Principal Component 1" and y="Principal Component 2"
    scatter_plot = df_clustered_pca_data.hvplot.scatter(
        x="Principal Component 1",
        y="Principal Component 2",
        by='Cluster',
        hover_cols=['coin_id'],
        title=f'Scatter Plot of Clustered Cryptocurrencies for k = {k}',
        width=700,
        height=600,
    )

    # Append the scatter plot to the list
    scatter_plots.append(scatter_plot)


Cluster labels for k = 2: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 0
 1 1 1 1]


Unnamed: 0_level_0,Principal Component 1,Principal Component 2,Principal Component 3,Cluster
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bitcoin,-0.600667,0.84276,0.461595,1
ethereum,-0.458261,0.458466,0.952877,1
tether,-0.43307,-0.168126,-0.641752,1
ripple,-0.471835,-0.22266,-0.479053,1
bitcoin-cash,-1.1578,2.041209,1.859715,1


Cluster labels for k = 3: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 1
 0 0 0 0]


Unnamed: 0_level_0,Principal Component 1,Principal Component 2,Principal Component 3,Cluster
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bitcoin,-0.600667,0.84276,0.461595,0
ethereum,-0.458261,0.458466,0.952877,0
tether,-0.43307,-0.168126,-0.641752,0
ripple,-0.471835,-0.22266,-0.479053,0
bitcoin-cash,-1.1578,2.041209,1.859715,0


Cluster labels for k = 4: [0 0 2 2 0 0 0 0 0 2 2 2 2 0 2 0 2 2 0 2 2 0 2 2 2 2 2 2 0 2 2 2 3 0 2 2 1
 2 2 2 2]


Unnamed: 0_level_0,Principal Component 1,Principal Component 2,Principal Component 3,Cluster
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bitcoin,-0.600667,0.84276,0.461595,0
ethereum,-0.458261,0.458466,0.952877,0
tether,-0.43307,-0.168126,-0.641752,2
ripple,-0.471835,-0.22266,-0.479053,2
bitcoin-cash,-1.1578,2.041209,1.859715,0


Cluster labels for k = 6: [0 0 3 3 4 4 4 4 0 3 3 3 3 4 0 0 3 3 4 3 3 0 3 3 5 3 0 3 0 5 0 3 1 0 3 5 2
 3 0 3 3]


Unnamed: 0_level_0,Principal Component 1,Principal Component 2,Principal Component 3,Cluster
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bitcoin,-0.600667,0.84276,0.461595,0
ethereum,-0.458261,0.458466,0.952877,0
tether,-0.43307,-0.168126,-0.641752,3
ripple,-0.471835,-0.22266,-0.479053,3
bitcoin-cash,-1.1578,2.041209,1.859715,4


### Visualize and Compare the Results

In this section, you will visually analyze the cluster analysis results by contrasting the outcome with and without using the optimization techniques.

In [41]:
# This graph combines both elbow curves (PCA data and original data) with the curve of
# the difference between their inertias.
# The difference stays nearly constant at first and trends toward 0 as k grows.
# This supports choosing 4 as the optimal k for both data sets.
# Interestingly, with k = 2 most of the data set lands in one cluster both times as well.
elbow_plot*elbow_curve*diff_plot

In [40]:
# Composite plots to contrast the clusters.
# `scatter_plots` holds the original-data plots first (one per k in `best_ks`),
# followed by the PCA-data plots in the same k order, so the plot at position i
# is paired with the plot at position i + len(best_ks).
n_ks = len(best_ks)

# For each k, place the original-data plot next to the PCA-data plot.
# Deriving the offset from `best_ks` (rather than a literal 4) keeps the
# pairing correct if the list of candidate k values changes.
composite_scatter_plots = [
    (scatter_plots[i] + scatter_plots[i + n_ks]).cols(2)
    for i in range(n_ks)
]

# Display the composite scatter plots
for composite_plot in composite_scatter_plots:
    display(composite_plot)



#### Answer the following question: 

  * **Question:** After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

  * **Answer:** Reducing the number of features for clustering using K-Means has both benefits and drawbacks:

    Simplicity and Efficiency: Using fewer features simplifies the clustering process and reduces computational complexity, making it easier to interpret and faster to compute.

    Loss of Information: However, this reduction may lead to a loss of valuable information, potentially resulting in less accurate clustering outcomes.

    Impact on Cluster Quality: The quality of clusters obtained with fewer features may differ from those with the full feature set, affecting cluster distinctiveness and separation.

    Overfitting Reduction: Fewer features can mitigate overfitting risks by focusing on more relevant information and minimizing the influence of noisy or irrelevant features.

Overall, the decision to use fewer features for clustering should consider trade-offs between simplicity, computational efficiency, and the potential impact on clustering accuracy and quality.
In particular, there seems to be minimal benefit from using a `k` higher than 4, while for `k = 2` (and even `k = 3`) almost every coin still falls into a single cluster.
The scatter plots for the PCA data and the original data look like mirror images of each other, and the PCA data produces more tightly grouped clusters.