dataset : https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
import matplotlib.pyplot as plt

In [2]:
spark = SparkSession.builder \
    .appName("BreastCancerClustering") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/12 19:31:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
csv_file_path = "datasets/BCWD/data.csv"  # Adjust the file path accordingly
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

                                                                                

In [4]:
# Select relevant features and drop unnecessary columns
selected_features = ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean']
df = df.select(selected_features)

In [5]:
df = df.dropna()

In [6]:
# Assemble features into a single vector column
assembler = VectorAssembler(inputCols=selected_features, outputCol="features")
df_assembled = assembler.transform(df)

In [7]:
# Scale features to have zero mean and unit standard deviation
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")
scaler_model = scaler.fit(df_assembled)
df_scaled = scaler_model.transform(df_assembled)


                                                                                

In [8]:
# Train KMeans model
kmeans = KMeans(featuresCol='scaled_features', predictionCol='cluster', k=2, seed=42)
model = kmeans.fit(df_scaled)

24/04/12 19:32:11 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


In [9]:
predictions = model.transform(df_scaled)

In [10]:
predictions.columns

['radius_mean',
 'texture_mean',
 'perimeter_mean',
 'area_mean',
 'smoothness_mean',
 'compactness_mean',
 'concavity_mean',
 'concave points_mean',
 'symmetry_mean',
 'fractal_dimension_mean',
 'features',
 'scaled_features',
 'cluster']

In [11]:
df_scaled.columns

['radius_mean',
 'texture_mean',
 'perimeter_mean',
 'area_mean',
 'smoothness_mean',
 'compactness_mean',
 'concavity_mean',
 'concave points_mean',
 'symmetry_mean',
 'fractal_dimension_mean',
 'features',
 'scaled_features']

In [12]:
# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()
silhouette_score = evaluator.evaluate(predictions.withColumnRenamed('cluster','prediction'))
print("Silhouette Score:", silhouette_score)

Silhouette Score: 0.6486024129761664


In [14]:
from pyspark.ml.feature import PCA
from pyspark.sql.functions import col

# Perform PCA for dimensionality reduction
pca = PCA(k=2, inputCol="scaled_features", outputCol="pca_features")
model_pca = pca.fit(df_scaled)
df_pca = model_pca.transform(df_scaled)

# Add a row index column to both DataFrames
df_pca = df_pca.rdd.zipWithIndex().toDF()
predictions = predictions.rdd.zipWithIndex().toDF()

# Join the DataFrames on the row index column
merged_df = df_pca.join(predictions, df_pca["_2"] == predictions["_2"]).drop("_2")

# Convert merged DataFrame to Pandas DataFrame for visualization
merged_pd = merged_df.select("_1", "pca_features").toPandas()

# Scatter plot of clusters
plt.figure(figsize=(10, 6))
plt.scatter(merged_pd[merged_pd['_1'] == 0]['pca_features'].apply(lambda x: x[0]), 
            merged_pd[merged_pd['_1'] == 0]['pca_features'].apply(lambda x: x[1]),
            color='blue', label='Cluster 0')
plt.scatter(merged_pd[merged_pd['_1'] == 1]['pca_features'].apply(lambda x: x[0]), 
            merged_pd[merged_pd['_1'] == 1]['pca_features'].apply(lambda x: x[1]),
            color='red', label='Cluster 1')
plt.title("Breast Cancer Clustering (PCA)")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend()
plt.grid(True)
plt.show()


                                                                                

AnalysisException: [AMBIGUOUS_REFERENCE] Reference `_1` is ambiguous, could be: [`_1`, `_1`].