In [None]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import matplotlib.pyplot as plt

# Load the breast-cancer dataset shipped with scikit-learn and expose its
# feature matrix as a pandas DataFrame with one named column per feature.
# `data` is kept around because later cells read the class labels from it.
data = load_breast_cancer()
df_pandas = pd.DataFrame(data=data["data"], columns=data["feature_names"])
df_pandas.head()

In [None]:
from pyspark.sql import SparkSession

# Start a Spark session for this notebook (an already-running one is reused).
spark = (
    SparkSession.builder
    .appName("PCA_Breast_Cancer")
    .getOrCreate()
)

# Hand the pandas frame over to Spark so the pyspark.ml stages can consume it.
df_spark = spark.createDataFrame(df_pandas)

# Print the schema Spark derived from the pandas frame.
df_spark.printSchema()

In [None]:
from pyspark.ml.feature import VectorAssembler

# Every column of the Spark frame is treated as a feature; pack them all into
# a single vector column named "features", which is the input format the
# pyspark.ml estimators below expect.
feature_columns = df_spark.columns
vector_assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
)

# Append the assembled "features" column to the frame.
df_vector = vector_assembler.transform(df_spark)

# Preview only the assembled vectors, without truncating them.
features_preview = df_vector.select("features")
features_preview.show(truncate=False)

In [None]:
from pyspark.ml.feature import PCA

from pyspark.ml.feature import StandardScaler

# Standardise every feature to zero mean and unit variance before PCA.
# PCA is scale-sensitive: fitted on raw values, the components are dominated
# by whichever features happen to have the largest numeric range, and the
# explained-variance figures mostly reflect units rather than structure.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
    withMean=True,
    withStd=True,
)
df_scaled = scaler.fit(df_vector).transform(df_vector)

# Fit PCA on the standardised features, keeping 3 principal components.
pca = PCA(k=3, inputCol="scaled_features", outputCol="pca_features")
pca_model = pca.fit(df_scaled)

# Project the data onto the fitted components; the result is stored in the
# "pca_features" column, which the following cells read.
df_pca = pca_model.transform(df_scaled)

# Show the first rows, including the new component column.
df_pca.show()

In [None]:
# Read the per-component explained variance from the fitted PCA model and
# print it next to a caption.
explained_variance = pca_model.explainedVariance
variance_caption = "Varianza explicada por cada componente principal:"
print(variance_caption, explained_variance)

In [None]:
# Bring only the projected vectors back to the driver as a pandas DataFrame
# with a single "pca_features" column.
pca_only = df_pca.select("pca_features")
df_pandas_pca = pca_only.toPandas()
df_pandas_pca.head()

In [None]:
import pandas as pd

# Starting from the frame whose "pca_features" column holds one Spark ML
# vector per row, expand it into one numeric column per principal component.
# The frame is built once from a list of NumPy arrays instead of calling
# .apply(pd.Series), which constructs a separate pandas Series for every row
# and relies on implicit vector-to-Series coercion.
component_rows = [vec.toArray() for vec in df_pandas_pca["pca_features"]]
df_pandas_pca_expanded = pd.DataFrame(component_rows, index=df_pandas_pca.index)

# Rename the positional columns to PCA_1, PCA_2, ... for readability.
df_pandas_pca_expanded.columns = [
    f"PCA_{position}"
    for position in range(1, df_pandas_pca_expanded.shape[1] + 1)
]

# Show the first rows.
df_pandas_pca_expanded.head()

In [None]:
# Attach the class labels from the scikit-learn dataset object as a "target"
# column next to the principal components.
# NOTE(review): labels are aligned by row position, which assumes the rows
# kept their original order through the Spark round trip — confirm.
df_pandas_pca_expanded["target"] = data["target"]
df_pandas_pca_expanded.head()

In [None]:
# Split the projected samples by class label. Both subsets are reused by the
# 3D plot in the next cell, so their names must stay as they are.
maligno = df_pandas_pca_expanded.loc[df_pandas_pca_expanded["target"].eq(0)]
benigno = df_pandas_pca_expanded.loc[df_pandas_pca_expanded["target"].eq(1)]

# 2D scatter of the first two principal components, one colour per class.
fig, ax = plt.subplots(figsize=(8, 6))
for subset, color, label in (
    (maligno, "red", "Maligno (0)"),
    (benigno, "blue", "Benigno (1)"),
):
    ax.scatter(subset["PCA_1"], subset["PCA_2"], c=color, label=label, alpha=0.5)

ax.set_xlabel("Componente Principal 1")
ax.set_ylabel("Componente Principal 2")
ax.set_title("Proyección PCA (2D) - Breast Cancer Dataset")
ax.legend()
plt.show()

In [None]:
# 3D scatter of the three principal components, one colour per class.
# Relies on the `maligno` / `benigno` subsets created in the previous cell.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(1, 1, 1, projection="3d")

component_columns = ("PCA_1", "PCA_2", "PCA_3")
for subset, color, label in (
    (maligno, "red", "Maligno (0)"),
    (benigno, "blue", "Benigno (1)"),
):
    xs, ys, zs = (subset[column] for column in component_columns)
    ax.scatter(xs, ys, zs, c=color, label=label, alpha=0.5)

ax.set(
    xlabel="PCA1",
    ylabel="PCA2",
    zlabel="PCA3",
    title="Proyección PCA (3D) - Breast Cancer Dataset",
)
ax.legend()
plt.show()