# PCA and K-Means on the breast cancer dataset

In [3]:
from sklearn.datasets import load_breast_cancer
# Load the dataset once and keep the full bunch around: later cells read
# `data.feature_names` and `data.target` from it.
data = load_breast_cancer()

# Convenience aliases for the feature matrix and the label vector.
X = data.data
y = data.target

In [4]:
# Build a single DataFrame holding the features plus the label, then preview it.
import pandas as pd

df = (
    pd.DataFrame(data.data, columns=data.feature_names)
    # Append the label as a "target" column.
    .assign(target=data.target)
    # Replace spaces in column names with underscores.
    .rename(columns=lambda name: name.replace(' ', '_'))
)
df.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,...,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [12]:
from sklearn.decomposition import PCA
# Initialize a PCA model that keeps the first three principal components
pca = PCA(n_components=3)

# Project the data onto the three principal components.
# Fit on the feature columns only: `df` also carries the label in its
# "target" column, and including it would leak the label into the components.
# NOTE(review): the features are on very different scales (e.g. mean_area vs.
# mean_smoothness in the preview above) and are not standardized here, so the
# large-valued columns will dominate the components -- confirm whether scaling
# before PCA is wanted.
cancer_pca = pca.fit_transform(df.drop(columns=["target"]))

In [13]:
# Wrap the projected array in a DataFrame, one labelled column per component
df_cancer_pca = pd.DataFrame(
    cancer_pca,
    columns=[f"principal component {number}" for number in (1, 2, 3)],
)
df_cancer_pca.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3
0,1160.142744,-293.917535,48.578388
1,1269.122597,15.630194,-35.394338
2,995.794082,39.156723,-1.709922
3,-407.180411,-67.380505,8.671338
4,930.341382,189.34071,1.374512


In [14]:
# Show the explained-variance ratio of each component kept by the fitted PCA
# model (one entry per component, in component order).
pca.explained_variance_ratio_

array([0.98204444, 0.01617648, 0.00155751])

In [15]:
from sklearn.cluster import KMeans

# Initialize the K-Means model (two clusters, fixed seed)
model = KMeans(n_clusters=2, random_state=0)

# Cluster on the principal-component columns only. Selecting them by name
# keeps this cell safe to re-run: the "class" column added at the bottom would
# otherwise become an input feature on a second execution.
component_columns = [
    "principal component 1",
    "principal component 2",
    "principal component 3",
]
cancer_components = df_cancer_pca[component_columns]

# Fit the model
model.fit(cancer_components)

# Predict clusters; kept in `predictions` so they can be compared with the labels
predictions = model.predict(cancer_components)

# Add the dataset's own labels (`data.target`) as the "class" column.
# Note: this is the ground-truth label, not the K-Means prediction above.
df_cancer_pca["class"] = data.target
df_cancer_pca.head(5)

Unnamed: 0,principal component 1,principal component 2,principal component 3,class
0,1160.142744,-293.917535,48.578388,0
1,1269.122597,15.630194,-35.394338,0
2,995.794082,39.156723,-1.709922,0
3,-407.180411,-67.380505,8.671338,0
4,930.341382,189.34071,1.374512,0


In [16]:
# 3D scatter of the three principal components; marker colour and marker
# symbol both follow the "class" column.
import plotly.express as px

fig = px.scatter_3d(
    data_frame=df_cancer_pca,
    # Axis assignment: component 3 on x, component 2 on y, component 1 on z.
    x="principal component 3",
    y="principal component 2",
    z="principal component 1",
    color="class",
    symbol="class",
    width=800,
).update_layout(
    # Place the legend at x=0, y=1 of the figure.
    legend={"x": 0, "y": 1}
)
fig.show()