# Principal Component Analysis
## Ron Briggs

In [264]:
# Import required libraries and dependencies
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [265]:
# Load the data into a Pandas DataFrame
df_heartattack = pd.read_csv(
    "Resources/heart.csv")

# Display sample data
df_heartattack.head(10)

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2,1


In [266]:
# Generate summary statistics
df_heartattack.describe()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [267]:
# Plot your data to see what's in your DataFrame
df_heartattack.hvplot.line(
    width=800,
    height=400,
    rot=90
)

---

### Normalize the Data using StandardScaler

In [268]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file
scaler = StandardScaler()

In [269]:
# Create a DataFrame with the scaled data
scaled_data = scaler.fit_transform(df_heartattack)

# Copy the heartattack_names from the original data
heartattack_names = df_heartattack.index
# Set the heartattack_names column as index
df_scaled_heartattack = pd.DataFrame(scaled_data, columns=df_heartattack.columns, index=heartattack_names)

# Display sample data
df_scaled_heartattack.head(10)

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,0.952197,0.681005,1.973123,0.763956,-0.256334,2.394438,-1.005832,0.015443,-0.696631,1.087338,-2.274579,-0.714429,-2.148873,0.914529
1,-1.915313,0.681005,1.002577,-0.092738,0.072199,-0.417635,0.898962,1.633471,-0.696631,2.122573,-2.274579,-0.714429,-0.512922,0.914529
2,-1.474158,-1.468418,0.032031,-0.092738,-0.816773,-0.417635,-1.005832,0.977514,-0.696631,0.310912,0.976352,-0.714429,-0.512922,0.914529
3,0.180175,0.681005,0.032031,-0.663867,-0.198357,-0.417635,0.898962,1.239897,-0.696631,-0.206705,0.976352,-0.714429,-0.512922,0.914529
4,0.290464,-1.468418,-0.938515,-0.663867,2.08205,-0.417635,0.898962,0.583939,1.435481,-0.379244,0.976352,-0.714429,-0.512922,0.914529
5,0.290464,0.681005,-0.938515,0.478391,-1.048678,-0.417635,0.898962,-0.072018,-0.696631,-0.551783,-0.649113,-0.714429,-2.148873,0.914529
6,0.180175,-1.468418,0.032031,0.478391,0.922521,-0.417635,-1.005832,0.146634,-0.696631,0.224643,-0.649113,-0.714429,-0.512922,0.914529
7,-1.143291,0.681005,0.032031,-0.663867,0.323431,-0.417635,0.898962,1.021244,-0.696631,-0.896862,0.976352,-0.714429,1.123029,0.914529
8,-0.26098,0.681005,1.002577,2.306004,-0.9134,2.394438,0.898962,0.540209,-0.696631,-0.465514,0.976352,-0.714429,1.123029,0.914529
9,0.290464,0.681005,1.002577,1.04952,-1.51249,-0.417635,0.898962,1.064975,-0.696631,0.483451,0.976352,-0.714429,-0.512922,0.914529


---

### Find the Best Value for k Using the Original Data.

In [270]:
# Create a list with the number of k-values from 1 to 11
inertia = []

In [271]:
# Create an empty list to store the inertia values
k = list(range(1, 11))

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_scaled_heartattack`
# 3. Append the model.inertia_ to the inertia list
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(df_scaled_heartattack)
    inertia.append(k_model.inertia_)



In [272]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data = {"k": k, "inertia": inertia}

# Create a DataFrame with the data to plot the Elbow curve
df_elbow = pd.DataFrame(elbow_data)

In [273]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)

What is the best value for `k`?

2 - this point has the sharpest bend

---

### Cluster with K-means Using the Original Data

In [274]:
# Initialize the K-Means model using the best value for k
model = KMeans(n_clusters=2, random_state=1)

In [275]:
# Fit the K-Means model using the scaled data
model.fit(df_scaled_heartattack)



In [276]:
# Predict the clusters to group the data using the scaled data
cluster_labels = model.predict(df_scaled_heartattack)

# Print the resulting array of cluster values.
print(cluster_labels)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0
 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 1]


In [277]:
# Create a copy of the DataFrame
cluster_label_predictions_df = df_scaled_heartattack.copy()

In [278]:
# Add a new column to the DataFrame with the predicted clusters
cluster_label_predictions_df['predicted cluster'] = cluster_labels

# Display sample data
cluster_label_predictions_df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output,predicted cluster
0,0.952197,0.681005,1.973123,0.763956,-0.256334,2.394438,-1.005832,0.015443,-0.696631,1.087338,-2.274579,-0.714429,-2.148873,0.914529,1
1,-1.915313,0.681005,1.002577,-0.092738,0.072199,-0.417635,0.898962,1.633471,-0.696631,2.122573,-2.274579,-0.714429,-0.512922,0.914529,1
2,-1.474158,-1.468418,0.032031,-0.092738,-0.816773,-0.417635,-1.005832,0.977514,-0.696631,0.310912,0.976352,-0.714429,-0.512922,0.914529,1
3,0.180175,0.681005,0.032031,-0.663867,-0.198357,-0.417635,0.898962,1.239897,-0.696631,-0.206705,0.976352,-0.714429,-0.512922,0.914529,1
4,0.290464,-1.468418,-0.938515,-0.663867,2.08205,-0.417635,0.898962,0.583939,1.435481,-0.379244,0.976352,-0.714429,-0.512922,0.914529,1


In [287]:
# Create a scatter plot using hvPlot by setting 
# `Age"` and `Chol"`. 
# Color the graph points with the labels found using K-Means and 
# add the cluster number in the `hover_cols` parameter to identify 
# which class is represented by each data point.

cluster_label_predictions_df.hvplot.scatter(
    x="trtbps",
    y="chol",
    c="predicted cluster",  # Color by the predicted clusters
    cmap=["orange", "red", "blue", "green"],  
    title="Heart Attack Clusters",
    xlabel="Age",
    ylabel="Chol",
    hover_cols=["predicted cluster"],  # Display predicted cluster on hover
)

---

### Optimize Clusters with Principal Component Analysis

In [280]:
# Create a PCA model instance and set `n_components=6`.
pca = PCA(n_components=6)

In [288]:
# Use the PCA model with fit_transform to reduce to six principal components.
pca_data = pca.fit_transform(df_scaled_heartattack)
df_pca = pd.DataFrame(data=pca_data, columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6'], index=df_scaled_heartattack.index)

# View the first five rows of the DataFrame.
print(df_pca.head())


        PC1       PC2       PC3       PC4       PC5       PC6
0 -0.051739  2.624022  0.990054  3.479328  0.362773  1.695836
1 -0.817441 -0.730375 -0.235928  2.767545  2.258404 -0.808414
2 -2.057599 -0.039098 -0.519839  0.081198  0.860416  0.752783
3 -1.903043 -0.596701  0.076204 -0.082624 -0.235141 -0.500292
4 -0.768371  0.412545 -2.187455 -1.988510  0.017448 -0.074219


In [290]:
# Get the loadings of the original features on each principal component
loadings = pca.components_

# Create a DataFrame to display the loadings
loadings_df = pd.DataFrame(loadings, columns=df_scaled_heartattack.columns, index=['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6'])

# Display the loadings
print("Loadings of Original Features on Principal Components:")
print(loadings_df)


Loadings of Original Features on Principal Components:
          age       sex        cp    trtbps      chol       fbs   restecg  \
PC1  0.253960  0.122038 -0.274551  0.147362  0.092197  0.052928 -0.111924   
PC2  0.444685 -0.391571  0.267563  0.443429  0.358966  0.305974 -0.213297   
PC3 -0.066910  0.559561  0.227335  0.198929 -0.180715  0.468036 -0.202971   
PC4 -0.066354  0.081211  0.378760  0.089773 -0.509299  0.158279  0.190195   
PC5 -0.306740  0.056129  0.159369  0.186880  0.321398 -0.230988 -0.397053   
PC6 -0.126418  0.062044 -0.198399 -0.181346 -0.103434  0.253369 -0.668392   

     thalachh      exng   oldpeak       slp       caa     thall    output  
PC1 -0.366443  0.335673  0.369772 -0.325461  0.261576  0.222602 -0.438295  
PC2 -0.002230 -0.205115  0.026024 -0.039484  0.093167 -0.192089  0.151141  
PC3  0.267379 -0.122270 -0.088549  0.195845  0.304621  0.241079 -0.121233  
PC4 -0.126480 -0.100436  0.369911 -0.497764 -0.192468 -0.235768  0.116420  
PC5  0.322451  0.039097  

In [289]:
# Retrieve the explained variance to determine how much information 
# can be attributed to each principal component.
explained_variance = pca.explained_variance_ratio_
print("Explained Variance Ratios:", explained_variance)

Explained Variance Ratios: [0.23581966 0.11229748 0.08800207 0.08618849 0.07300207 0.06929774]


What is the total explained variance of the three principal components?

0.435 - 3 components
0.663 - 6 components

In [None]:
# Create a new DataFrame with the PCA data
df_pca_data = pd.DataFrame(data=pca_data, columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6'], index=df_scaled_heartattack.index)

# Copy the crypto names from the original data
crypto_names = df_heartattack.index

# Display sample data
df_pca_data.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6
0,-0.051739,2.624022,0.990054,3.479328,0.362773,1.695836
1,-0.817441,-0.730375,-0.235928,2.767545,2.258404,-0.808414
2,-2.057599,-0.039098,-0.519839,0.081198,0.860416,0.752783
3,-1.903043,-0.596701,0.076204,-0.082624,-0.235141,-0.500292
4,-0.768371,0.412545,-2.187455,-1.98851,0.017448,-0.074219


---

### Find the Best Value for k Using the PCA Data

In [None]:
# Create a list with the number of k-values from 1 to 11
inertia_pca = []

In [None]:
# Create an empty list to store the inertia values
k_pca = list(range(1, 11))

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_pca_data`
# 3. Append the model.inertia_ to the inertia list
for i in k_pca:
    k_model_pca = KMeans(n_clusters=i, random_state=1)
    k_model_pca.fit(df_pca_data)  # Use the PCA-transformed data
    inertia_pca.append(k_model_pca.inertia_)





In [None]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data_pca = {"k": k_pca, "inertia": inertia_pca}

# Create a DataFrame with the data to plot the Elbow curve
df_elbow_pca = pd.DataFrame(elbow_data_pca)

In [None]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
df_elbow_pca.hvplot.line(
    x='k',
    y='inertia',
    title='Elbow Curve for K-Means (PCA Data)',
    xlabel='Number of Clusters (k)',
    ylabel='Inertia',
    grid=True,
    height=400,
    width=600,
    line_color='blue',
)

What is the best value for `k` when using the PCA data?
2

Does it differ from the best k value found using the original data?
No

### Cluster Data with K-means Using the PCA Data

In [None]:
# Initialize the K-Means model using the best value for k
model_pca = KMeans(n_clusters=2, random_state=1)

In [None]:
# Fit the K-Means model using the PCA data
model_pca.fit(df_pca_data)



In [None]:
# Predict the clusters to group the data using the PCA data
cluster_labels_pca = model_pca.predict(df_pca_data)
# Print the resulting array of cluster values.
print(cluster_labels_pca)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1
 0 1 1 0 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1
 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 0 1 1 0 0 0 1 1 0 0 1 1 1 1 1 1 1 1
 1 1 1 0 1 1 0]


In [None]:
# Create a copy of the DataFrame with the PCA data
cluster_label_predictions_pca_df = df_pca_data.copy()

# Add a new column to the DataFrame with the predicted clusters
cluster_label_predictions_pca_df['predicted cluster'] = cluster_labels_pca

# Display sample data
print(cluster_label_predictions_pca_df.head())

        PC1       PC2       PC3       PC4       PC5       PC6  \
0 -0.051739  2.624022  0.990054  3.479328  0.362773  1.695836   
1 -0.817441 -0.730375 -0.235928  2.767545  2.258404 -0.808414   
2 -2.057599 -0.039098 -0.519839  0.081198  0.860416  0.752783   
3 -1.903043 -0.596701  0.076204 -0.082624 -0.235141 -0.500292   
4 -0.768371  0.412545 -2.187455 -1.988510  0.017448 -0.074219   

   predicted cluster  
0                  0  
1                  0  
2                  0  
3                  0  
4                  0  


In [None]:
# Create a scatter plot using hvPlot by setting 
# `x="PC1"` and `y="PC2"`. 
# Color the graph points with the labels found using K-Means and 
# add the crypto name in the `hover_cols` parameter to identify 
# the data represented by each data point.

cluster_label_predictions_pca_df.hvplot.scatter(
    x="PC1",
    y="PC2",
    c="predicted cluster",  # Color by the predicted clusters
    cmap=["orange", "red", "blue", "green"],  
    title="Clusters (PCA Data)",
    xlabel="Principal Component 1 (PC1)",
    ylabel="Principal Component 2 (PC2)",
    hover_cols=["predicted cluster"],  # Display predicted cluster on hover
    width=600,
    height=400,
)

### Visualize and Compare the Results

In [None]:
# Composite plot to contrast the Elbow curves
composite_plot = df_elbow + df_elbow_pca
composite_plot

Unnamed: 0,k,inertia
0,2,7061.265143
1,4,5552.507151
2,6,5080.901659
3,8,4703.43524
4,10,4435.280816
5,12,4202.621002
6,14,4061.987335
7,16,3866.619796
8,18,3721.512956
9,20,3643.167332


In [None]:
composite_plot.hvplot.line(
    x='k',
    y='inertia',
    title='Composite Plot',
    xlabel='Number of Clusters (k)',
    ylabel='Inertia',
    grid=True,
    height=400,
    width=600,
    line_color='blue',
)

In [None]:
# Create a composite plot to contrast the clusters of the original data and the PCA data

composite_plot2 = cluster_label_predictions_df.hvplot.scatter(
    x="age",
    y="chol",
    c="predicted cluster",  # Color by the predicted clusters
    cmap=["orange", "red", "blue", "green"],  
    title="Heart Attack Clusters",
    xlabel="Age",
    ylabel="Chol",
    hover_cols=["predicted cluster"],  # Display predicted cluster on hover
) + cluster_label_predictions_pca_df.hvplot.scatter(
    x="PC1",
    y="PC2",
    c="predicted cluster",  # Color by the predicted clusters of the PCA data
    cmap=["orange", "red", "blue", "green"],
    title="Clusters (PCA Data)",
    xlabel="Principal Component 1 (PC1)",
    ylabel="Principal Component 2 (PC2)",
    hover_cols=["predicted cluster"],  # Display predicted cluster on hover
    width=600,
    height=400,
)

# Show the composite plot
composite_plot2