In [1]:
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the file as a single string
with open("Resources/crypto_market_data.csv", 'r') as file:
    content = file.read()

# Split the content into lines, skipping the first line
lines = content.split('\n')[1:]

# Parse the data
data = []
for line in lines:
    if line.strip():  # Skip empty lines
        parts = line.split(',')
        data.append(parts)

# Create DataFrame
df_market_data = pd.DataFrame(data[1:], columns=data[0])  # Use first row as column names

# Set the index
df_market_data.set_index(df_market_data.columns[0], inplace=True)

# Convert columns to numeric
for col in df_market_data.columns:
    df_market_data[col] = pd.to_numeric(df_market_data[col], errors='coerce')

# Display the first few rows
df_market_data.head(10)

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,1.08388,7.60278,6.57509,7.67258,-3.25185,83.5184,37.51761
ethereum,0.22392,10.38134,4.80849,0.13169,-12.8889,186.77418,101.96023
tether,-0.21173,0.04935,0.0064,-0.04237,0.28037,-0.00542,0.01954
ripple,-0.37819,-0.60926,2.24984,0.23455,-17.55245,39.53888,-16.60193
bitcoin-cash,2.90585,17.09717,14.75334,15.74903,-13.71793,21.66042,14.49384
binancecoin,2.10423,12.85511,6.80688,0.05865,36.33486,155.61937,69.69195
chainlink,-0.23935,20.69459,9.30098,-11.21747,-43.69522,403.22917,325.13186
cardano,0.00322,13.99302,5.55476,10.10553,-22.84776,264.51418,156.09756
litecoin,-0.06341,6.60221,7.28931,1.21662,-17.2396,27.49919,-12.66408
bitcoin-cash-sv,0.9253,3.29641,-1.86656,2.88926,-24.87434,7.42562,93.73082


In [3]:
# Generate summary statistics
df_market_data.describe()

Unnamed: 0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
count,82.0,82.0,82.0,82.0,82.0,82.0,82.0
mean,-0.269686,4.497147,0.185787,1.545693,-0.094119,236.537432,347.667956
std,2.678107,6.335743,8.325069,26.181094,47.072514,432.530385,1240.116229
min,-13.52786,-6.09456,-18.1589,-34.70548,-44.82248,-0.3921,-17.56753
25%,-0.60897,0.04726,-5.02662,-10.43847,-25.90799,21.66042,0.40617
50%,-0.06341,3.29641,0.10974,-0.04237,-7.54455,83.9052,69.69195
75%,0.61209,7.60278,5.51074,4.57813,0.65726,216.17761,168.37251
max,4.84033,20.69459,24.23919,140.7957,223.06437,2227.92782,7852.0897


In [4]:
# Plot your data to see what's in your DataFrame
df_market_data.hvplot.line(
    width=800,
    height=400,
    rot=90
)

---

### Prepare the Data

In [5]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file
scaler = StandardScaler()
df_market_data_scaled = scaler.fit_transform(df_market_data)

In [6]:
# Create a DataFrame with the scaled data
df_market_data_scaled = pd.DataFrame(df_market_data_scaled, columns=df_market_data.columns)

# Copy the crypto names from the original DataFrame
df_market_data_scaled['coin_id'] = df_market_data.index

# Set the coin_id column as index
df_market_data_scaled.set_index('coin_id', inplace=True)

print("NaN values in the dataset:")
print(df_market_data_scaled.isna().sum())
df_market_data_scaled_clean = df_market_data_scaled.dropna()

# Display the scaled DataFrame
df_market_data_scaled_clean.head()

NaN values in the dataset:
price_change_percentage_24h     3
price_change_percentage_7d      3
price_change_percentage_14d     3
price_change_percentage_30d     3
price_change_percentage_60d     3
price_change_percentage_200d    3
price_change_percentage_1y      3
dtype: int64


Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,0.508529,0.493193,0.7722,0.23546,-0.067495,-0.355953,-0.251637
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352
tether,0.021774,-0.706337,-0.02168,-0.06103,0.008005,-0.550247,-0.282061
ripple,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,-0.458259,-0.295546
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317


---

### Find the Best Value for k Using the Original Scaled DataFrame.

In [7]:
# Create a list with the number of k-values from 1 to 11
k_values = list(range(1, 12))

In [8]:
# Create an empty list to store the inertia values

inertia = []
# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_scaled`
# 3. Append the model.inertia_ to the inertia list
for k in k_values:
    km = KMeans(n_clusters=k, random_state=42)
    # CHANGED: Use df_market_data_scaled_clean instead of df_market_data_scaled
    km.fit(df_market_data_scaled_clean)
    inertia.append(km.inertia_)

print("Inertia values:", inertia)

Inertia values: [574.0, 397.1436361959626, 291.27149544385725, 158.0448707024195, 132.43556012000676, 115.71201137011357, 94.89982383894724, 76.83402636783377, 70.03914125386218, 59.59615047271066, 56.16294540897964]




In [9]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data = {"k": k_values, "inertia": inertia}

# Create a DataFrame with the data to plot the Elbow curve
df_elbow = pd.DataFrame(elbow_data)

In [10]:
# Plot a line chart with all the inertia values computed with
# the different values of k to visually identify the optimal value for k.
elbow_plot = df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k_values)
elbow_plot

#### Answer the following question: 

**Question:** What is the best value for `k`?

**Answer:** Based on the elbow curve plot, the best value for k appears to be 4. This is where the "elbow" of the curve is, meaning adding more clusters beyond this point would not significantly reduce inertia.

---

### Cluster Cryptocurrencies with K-means Using the Original Scaled DataFrame

In [11]:
# Initialize the K-Means model using the best value for k
best_k = 4  # This is an example, replace with the value you determine
km = KMeans(n_clusters=best_k, random_state=42)

print("NaN values in the scaled dataset:")
print(df_market_data_scaled.isna().sum())

df_market_data_scaled_clean = df_market_data_scaled.dropna()


print("Shape before removing NaN values:", df_market_data_scaled.shape)
print("Shape after removing NaN values:", df_market_data_scaled_clean.shape)

NaN values in the scaled dataset:
price_change_percentage_24h     3
price_change_percentage_7d      3
price_change_percentage_14d     3
price_change_percentage_30d     3
price_change_percentage_60d     3
price_change_percentage_200d    3
price_change_percentage_1y      3
dtype: int64
Shape before removing NaN values: (85, 7)
Shape after removing NaN values: (82, 7)


In [12]:
# Fit the K-Means model using the scaled DataFrame
km.fit(df_market_data_scaled_clean)



In [13]:
# Predict the clusters to group the cryptocurrencies using the scaled DataFrame

clusters = km.predict(df_market_data_scaled_clean)
# Print the resulting array of cluster values.
print(clusters)

[0 0 2 2 0 0 0 0 0 2 2 2 2 0 2 0 2 2 0 2 2 0 2 2 2 2 2 2 0 2 2 2 1 0 2 2 3
 2 2 2 2 0 0 2 2 0 0 0 0 0 2 2 2 2 0 2 0 2 2 0 2 2 0 2 2 2 2 2 2 0 2 2 2 1
 0 2 2 3 2 2 2 2]


In [14]:
# Create a copy of the scaled DataFrame
df_market_data_scaled_clustered = df_market_data_scaled_clean.copy()

In [15]:
# Add a new column to the copy of the scaled DataFrame with the predicted clusters

df_market_data_scaled_clustered['Cluster'] = clusters
# Display the copy of the scaled DataFrame
print(df_market_data_scaled_clustered.head())

              price_change_percentage_24h  price_change_percentage_7d  \
coin_id                                                                 
bitcoin                          0.508529                    0.493193   
ethereum                         0.185446                    0.934445   
tether                           0.021774                   -0.706337   
ripple                          -0.040764                   -0.810928   
bitcoin-cash                     1.193036                    2.000959   

              price_change_percentage_14d  price_change_percentage_30d  \
coin_id                                                                  
bitcoin                          0.772200                     0.235460   
ethereum                         0.558692                    -0.054341   
tether                          -0.021680                    -0.061030   
ripple                           0.249458                    -0.050388   
bitcoin-cash                     1.760610   

In [16]:
# Create a scatter plot using hvPlot by setting
# `x="price_change_percentage_24h"` and `y="price_change_percentage_7d"`.
# Color the graph points with the labels found using K-Means and
# add the crypto name in the `hover_cols` parameter to identify
# the cryptocurrency represented by each data point.
scatter_plot = df_market_data_scaled_clustered.hvplot.scatter(
    x="price_change_percentage_24h",
    y="price_change_percentage_7d",
    by="Cluster",
    hover_cols=["coin_id"],
    title="Cryptocurrency Clusters"
)
scatter_plot

---

### Optimize Clusters with Principal Component Analysis.

In [17]:
# Create a PCA model instance and set `n_components=3`.
pca = PCA(n_components=3)

In [18]:
# Use the PCA model with `fit_transform` to reduce the original scaled DataFrame
# down to three principal components.
pca_data = pca.fit_transform(df_market_data_scaled_clean)

# View the scaled PCA data
print(pca_data)

[[-0.60066733  0.84276006  0.46159457]
 [-0.45826071  0.45846566  0.95287678]
 [-0.43306981 -0.16812638 -0.64175193]
 [-0.47183495 -0.22266008 -0.47905316]
 [-1.15779997  2.04120919  1.85971527]
 [-0.51653377  1.38837748  0.80407131]
 [-0.45071134  0.51769912  2.84614316]
 [-0.34559977  0.72943939  1.47801284]
 [-0.64946792  0.43216514  0.60030286]
 [-0.75901394 -0.20119979 -0.21765292]
 [-0.24819846 -1.37625159 -1.46202571]
 [-0.43840762 -0.17533654 -0.6633884 ]
 [-0.69342533 -0.47381462 -0.52759693]
 [ 0.06049915  2.90940385  1.49857131]
 [-0.39335243 -0.10819197 -0.01275608]
 [-0.79617564 -0.49440875  1.08281169]
 [ 0.06407452 -1.26982514 -1.09882928]
 [-0.48901506 -0.73271912 -0.06254323]
 [-0.3062723   0.70341515  1.71422359]
 [-0.51352775 -0.14280239 -0.65656583]
 [-0.36212044 -0.98691441 -0.72875232]
 [-0.60426463  0.82739764  0.43931594]
 [-0.4132956  -0.67411527 -1.07662834]
 [-0.40748304 -0.21250655 -0.35142563]
 [ 0.60897382  0.56353212 -1.14874159]
 [-0.45021114 -0.15101945

In [19]:
# Retrieve the explained variance to determine how much information
# can be attributed to each principal component.
explained_variance = pca.explained_variance_ratio_
print("Explained Variance:", explained_variance)

Explained Variance: [0.3719856  0.34700813 0.17603793]


#### Answer the following question: 

**Question:** What is the total explained variance of the three principal components?

**Answer:** Add the values in the explained_variance array:

total_explained_variance = sum(explained_variance)
print(f"The total explained variance of the three principal components is {total_explained_variance:.4f}")

In [20]:
total_explained_variance = sum(explained_variance)
print(f"The total explained variance of the three principal components is {total_explained_variance:.4f}")

The total explained variance of the three principal components is 0.8950


In [21]:
# Create a new DataFrame with the PCA data.
df_market_data_pca = pd.DataFrame(
    pca_data,
    columns=['PC1', 'PC2', 'PC3'],
    index=df_market_data_scaled_clean.index
)


# Copy the crypto names from the original scaled DataFrame
print(df_market_data_pca.head())

# Set the coin_id column as index


# Display the scaled PCA DataFrame
print(df_market_data_pca)

                   PC1       PC2       PC3
coin_id                                   
bitcoin      -0.600667  0.842760  0.461595
ethereum     -0.458261  0.458466  0.952877
tether       -0.433070 -0.168126 -0.641752
ripple       -0.471835 -0.222660 -0.479053
bitcoin-cash -1.157800  2.041209  1.859715
                           PC1       PC2       PC3
coin_id                                           
bitcoin              -0.600667  0.842760  0.461595
ethereum             -0.458261  0.458466  0.952877
tether               -0.433070 -0.168126 -0.641752
ripple               -0.471835 -0.222660 -0.479053
bitcoin-cash         -1.157800  2.041209  1.859715
...                        ...       ...       ...
celsius-degree-token  4.792395  6.767679 -1.986985
ontology             -0.632355 -2.108117 -0.652227
ftx-token            -0.593142  0.021485  0.209911
true-usd             -0.458131 -0.135734 -0.635284
digibyte             -0.297910 -0.191126 -0.909602

[82 rows x 3 columns]


---

### Find the Best Value for k Using the Scaled PCA DataFrame

In [22]:
# Create a list with the number of k-values from 1 to 11
k_values = list(range(1, 12))

In [23]:
# Create an empty list to store the inertia values
inertia = []

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_pca`
# 3. Append the model.inertia_ to the inertia list
for k in k_values:
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(df_market_data_pca)
    inertia.append(km.inertia_)
    print(f"Completed clustering for k={k}, inertia={km.inertia_}")  # Add this line for debugging
print("Length of k_values:", len(k_values))
print("Length of inertia:", len(inertia))

if len(k_values) == len(inertia):
    elbow_data = {"k": k_values, "inertia": inertia}
    df_elbow = pd.DataFrame(elbow_data)
    print(df_elbow)
else:
    print("Error: Lengths of k_values and inertia do not match")

Completed clustering for k=1, inertia=513.7481711357851
Completed clustering for k=2, inertia=337.6237926137046
Completed clustering for k=3, inertia=231.83615326640142
Completed clustering for k=4, inertia=99.33099330359478
Completed clustering for k=5, inertia=76.7045024327737
Completed clustering for k=6, inertia=63.37829058715895
Completed clustering for k=7, inertia=53.58974177131818
Completed clustering for k=8, inertia=34.56644059927371
Completed clustering for k=9, inertia=27.33664976732814
Completed clustering for k=10, inertia=22.06716831842234
Completed clustering for k=11, inertia=16.063465858922623
Length of k_values: 11
Length of inertia: 11
     k     inertia
0    1  513.748171
1    2  337.623793
2    3  231.836153
3    4   99.330993
4    5   76.704502
5    6   63.378291
6    7   53.589742
7    8   34.566441
8    9   27.336650
9   10   22.067168
10  11   16.063466




In [24]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data = {"k": k_values, "inertia": inertia}
# Create a DataFrame with the data to plot the Elbow curve
df_elbow = pd.DataFrame(elbow_data)

print("Length of k_values:", len(k_values))
print("Length of inertia:", len(inertia))

Length of k_values: 11
Length of inertia: 11


In [25]:
# Plot a line chart with all the inertia values computed with
# the different values of k to visually identify the optimal value for k.
elbow_plot = df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve (PCA Data)", xticks=k_values)
elbow_plot

#### Answer the following questions: 

* **Question:** What is the best value for `k` when using the PCA data?

  * **Answer:**  Based on the elbow curve plot for the PCA data, the best value for k also appears to be 4. This is where the "elbow" of the curve occurs for the PCA data.


* **Question:** Does it differ from the best k value found using the original data?

  * **Answer:** No, it does not differ from the best k value found using the original data. Both the original data and the PCA data suggest an optimal k value of 4.

### Cluster Cryptocurrencies with K-means Using the Scaled PCA DataFrame

In [26]:
# Initialize the K-Means model using the best value for k
best_k = 4  # Replace this with the value you determine from the elbow curve
km = KMeans(n_clusters=best_k, random_state=42)

In [27]:
# Fit the K-Means model using the PCA data
km.fit(df_market_data_pca)



In [28]:
# Predict the clusters to group the cryptocurrencies using the scaled PCA DataFrame
clusters = km.predict(df_market_data_pca)

# Print the resulting array of cluster values.
print(clusters)

[0 0 2 2 0 0 0 0 0 2 2 2 2 0 2 0 2 2 0 2 2 0 2 2 2 2 2 2 0 2 2 2 1 0 2 2 3
 2 2 2 2 0 0 2 2 0 0 0 0 0 2 2 2 2 0 2 0 2 2 0 2 2 0 2 2 2 2 2 2 0 2 2 2 1
 0 2 2 3 2 2 2 2]


In [29]:
# Create a copy of the scaled PCA DataFrame

df_market_data_pca_clustered = df_market_data_pca.copy()
# Add a new column to the copy of the PCA DataFrame with the predicted clusters
df_market_data_pca_clustered['Cluster'] = clusters

# Display the copy of the scaled PCA DataFrame
print(df_market_data_pca_clustered.head())

                   PC1       PC2       PC3  Cluster
coin_id                                            
bitcoin      -0.600667  0.842760  0.461595        0
ethereum     -0.458261  0.458466  0.952877        0
tether       -0.433070 -0.168126 -0.641752        2
ripple       -0.471835 -0.222660 -0.479053        2
bitcoin-cash -1.157800  2.041209  1.859715        0


In [30]:
# Create a scatter plot using hvPlot by setting
# `x="PC1"` and `y="PC2"`.
# Color the graph points with the labels found using K-Means and
# add the crypto name in the `hover_cols` parameter to identify
# the cryptocurrency represented by each data point.
scatter_plot = df_market_data_pca_clustered.hvplot.scatter(
x="PC1",
y="PC2",
by="Cluster",
hover_cols=["coin_id"],
title="Cryptocurrency Clusters (PCA)"
)
scatter_plot

### Visualize and Compare the Results

In this section, you will visually analyze the cluster analysis results by contrasting the outcome with and without using the optimization techniques.

In [31]:
# Composite plot to contrast the Elbow curves
# YOUR CODE HERE!
# Rename the elbow plot for PCA data to avoid confusion
elbow_plot_pca = elbow_plot

# Create the elbow plot for the original data (if not already done)
elbow_plot_original = df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve (Original Data)", xticks=k_values)


In [32]:
# Composite plot to contrast the clusters
# YOUR CODE HERE!
composite_elbow = elbow_plot_original * elbow_plot_pca
composite_elbow

# Rename the scatter plot for PCA data to avoid confusion
scatter_plot_pca = scatter_plot

# Create the scatter plot for the original data (if not already done)
scatter_plot_original = df_market_data_scaled_clustered.hvplot.scatter(
    x="price_change_percentage_24h",
    y="price_change_percentage_7d",
    by="Cluster",
    hover_cols=["coin_id"],
    title="Cryptocurrency Clusters (Original Data)"
)

# Composite plot to contrast the clusters
composite_scatter = scatter_plot_original * scatter_plot_pca
composite_scatter

#### Answer the following question: 

  * **Question:** After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

  * **Answer:** Using fewer features for clustering has several impacts:

    It reduces the dimensionality of the data, potentially making patterns more apparent and reducing noise.
    The clusters in the PCA plot may appear more distinct or compact compared to the original data plot.
    You are able to find the most significant or insightful patterns even though nuance is removed. 
    The fact that the optimal k value remained the same (4) for both original and PCA data suggests PCA effectively captured.