# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Carrera: Ing. en Sistemas Computacionales** </center>
---
### <center> **Primavera 2025** </center>
---

**Lab 13**: Clustering with k-means

**Fecha**: 11 de mayo del 2025

**Nombre del Estudiante**: Marco Albanese, Vicente Siloe

**Profesor**: Pablo Camarillo Ramirez

In [12]:
import findspark
# Run findspark before any pyspark import so the Spark installation is
# located and pyspark becomes importable in this kernel.
findspark.init()

#### Creación de la conexión con el cluster de Spark


In [13]:
import os

from pyspark.sql import SparkSession

# The default master URL appears to embed a container-generated hostname
# (TODO confirm), which would break when the cluster is recreated. Allow it to
# be overridden through the SPARK_MASTER_URL environment variable while keeping
# the original value as the fallback.
spark_master_url = os.environ.get("SPARK_MASTER_URL", "spark://2da3617855ce:7077")

spark = (
    SparkSession.builder
    .appName("MLSpark-K-means")
    .master(spark_master_url)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)
sc = spark.sparkContext

# Limit Spark SQL shuffles to 5 partitions for this session.
spark.conf.set("spark.sql.shuffle.partitions", "5")

#### Preparación de datos

In [14]:
from equipo_mcqueen.spark_utils import SparkUtils

# Columns declared as FloatType, in the order they are passed to the schema
# helper. Proline is appended afterwards because it is the only IntegerType.
float_columns = [
    "Alcohol",
    "Malic_Acid",
    "Ash",
    "Ash_Alcanity",
    "Magnesium",
    "Total_Phenols",
    "Flavanoids",
    "Nonflavanoid_Phenols",
    "Proanthocyanins",
    "Color_Intensity",
    "Hue",
    "OD280",
]

# (column name, Spark type name) pairs consumed by SparkUtils.generate_schema.
wine_data = [(column, "FloatType") for column in float_columns]
wine_data.append(("Proline", "IntegerType"))

wine_schema = SparkUtils.generate_schema(wine_data)

# Read the CSV with the explicit schema, treating the first row as a header.
wine_reader = spark.read.schema(wine_schema).option("header", "true")
wine_df = wine_reader.csv("/home/jovyan/notebooks/data/wine-clustering.csv")

#### Assemble the features into a single vector column

In [15]:
from pyspark.ml.feature import VectorAssembler

# Derive the feature columns from the schema definition instead of retyping
# the 13 names, so the assembler inputs cannot drift from the columns that
# were actually declared for wine_df.
wine_cols = [column_name for column_name, _ in wine_data]

# Pack every column into a single "features" vector column.
assembler = VectorAssembler(inputCols=wine_cols, outputCol="features")
assembled_df = assembler.transform(wine_df)

#### Initialize KMeans

In [16]:
from pyspark.ml.clustering import KMeans

# Cluster counts to compare. One unfitted KMeans estimator is created per
# value, all sharing the same seed.
k = [2, 10, 15, 20]
kmeans = [KMeans(k=n_clusters, seed=19) for n_clusters in k]


#### Training

In [17]:
# Fit each KMeans estimator on the assembled features; models[i] is the fitted
# model for k[i] clusters.
models = [estimator.fit(assembled_df) for estimator in kmeans]

                                                                                

#### Predictions

In [18]:
# Apply every fitted model to the same assembled DataFrame; predictions[i]
# holds the output of models[i].
predictions = [fitted_model.transform(assembled_df) for fitted_model in models]

#### Model Evaluation

In [None]:
from pyspark.ml.evaluation import ClusteringEvaluator

# One evaluator is reused for every prediction set; it reads the "features"
# and "prediction" columns.
evaluator = ClusteringEvaluator(predictionCol="prediction", featuresCol="features")
silhouette = list(map(evaluator.evaluate, predictions))

# k, models and silhouette are parallel lists, so walk them together and
# report the score followed by the centers of each fitted model.
for n_clusters, fitted_model, score in zip(k, models, silhouette):
    print(f"Silhouette with {n_clusters} clusters: {score}")
    print(f"Cluster Centers for {n_clusters} clusters: ")
    for cluster_center in fitted_model.clusterCenters():
        print(cluster_center)
    print("\n")

Silhouette with 2 clusters: 0.821360351333219
Cluster Centers for 2 clusters: 
[1.27028455e+01 2.54455285e+00 2.33910569e+00 2.04081301e+01
 9.68130081e+01 2.06211382e+00 1.64146342e+00 3.92682924e-01
 1.45406503e+00 4.85138211e+00 9.08617886e-01 2.40821138e+00
 5.65869919e+02]
[1.36665455e+01 1.87072727e+00 2.42781818e+00 1.74527272e+01
 1.06290909e+02 2.81618182e+00 2.89654548e+00 2.92909090e-01
 1.89690911e+00 5.52036361e+00 1.06665455e+00 3.06672727e+00
 1.15172727e+03]


Silhouette with 10 clusters: 0.6099911274077665
Cluster Centers for 10 clusters: 
[1.39289474e+01 1.78157895e+00 2.48684212e+00 1.70789474e+01
 1.05315789e+02 2.90578946e+00 3.09947371e+00 2.84736841e-01
 1.93789475e+00 6.37105263e+00 1.11000000e+00 3.00894736e+00
 1.36747368e+03]
[1.26437931e+01 2.80896552e+00 2.26724137e+00 2.07241379e+01
 9.27586207e+01 1.80034481e+00 1.35379312e+00 4.06206897e-01
 1.34310344e+00 4.53482758e+00 8.73793103e-01 2.26344827e+00
 5.29827586e+02]
[1.32000002e+01 2.87375000e+00 2.4599

In [20]:
# Stop the SparkContext now that the notebook has finished using it.
sc.stop()