# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **Code Lab 13: Clustering with k-means** </center>

---
**Alumnos**: David Abraham Naranjo Salgado, Benjamin Zarate y Angel Cortes

In [3]:
import findspark
# Locate the local Spark installation and make `pyspark` importable;
# this cell has to run before the pyspark imports in the cells below.
findspark.init()

#### Creación de la conexión con el clúster de Spark


In [4]:
import os

from pyspark.sql import SparkSession

# Master URL of the Spark cluster. The default is the address originally
# hard-coded in this notebook; its host part looks like a Docker container ID
# (TODO confirm), which would change whenever the container is recreated.
# Set the SPARK_MASTER_URL environment variable to point at a different master
# without editing the notebook. An empty value falls back to the default.
SPARK_MASTER_URL = os.environ.get("SPARK_MASTER_URL") or "spark://2c9c6f7ab23e:7077"

# Build (or reuse) the session for this lab.
spark = (
    SparkSession.builder
    .appName("MLSpark-K-means")
    .master(SPARK_MASTER_URL)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Handle to the underlying SparkContext; used at the end of the notebook.
sc = spark.sparkContext

# Use 5 partitions for Spark SQL shuffles (presumably because the dataset is
# small -- confirm against the data size).
spark.conf.set("spark.sql.shuffle.partitions", "5")

# Preparación de Datos

In [10]:
# NOTE(review): SparkUtils is imported but not used anywhere in this cell.
from team_name.spark_utils import SparkUtils
from pyspark.sql.types import StructType, StructField, FloatType

path = "/home/jovyan/notebooks/data/wine-clustering.csv"

# Feature column names; every one of them is declared below as a nullable float.
headers = [
    "Alcohol", "Malic_Acid", "Ash", "Ash_Alcanity",
    "Magnesium", "Total_Phenols", "Flavanoids", "Nonflavanoid_Phenols",
    "Proanthocyanins", "Color_Intensity", "Hue", "OD280", "Proline",
]

# Explicit schema: one nullable FloatType field per header.
fields = [StructField(column, FloatType(), True) for column in headers]
schema = StructType(fields)

# Read the CSV with the explicit schema, treating the first line as a header row.
df = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .csv(path)
)

# Preview the first rows to sanity-check the parsed columns.
df.show(5)

+-------+----------+----+------------+---------+-------------+----------+--------------------+---------------+---------------+----+-----+-------+
|Alcohol|Malic_Acid| Ash|Ash_Alcanity|Magnesium|Total_Phenols|Flavanoids|Nonflavanoid_Phenols|Proanthocyanins|Color_Intensity| Hue|OD280|Proline|
+-------+----------+----+------------+---------+-------------+----------+--------------------+---------------+---------------+----+-----+-------+
|  14.23|      1.71|2.43|        15.6|    127.0|          2.8|      3.06|                0.28|           2.29|           5.64|1.04| 3.92| 1065.0|
|   13.2|      1.78|2.14|        11.2|    100.0|         2.65|      2.76|                0.26|           1.28|           4.38|1.05|  3.4| 1050.0|
|  13.16|      2.36|2.67|        18.6|    101.0|          2.8|      3.24|                 0.3|           2.81|           5.68|1.03| 3.17| 1185.0|
|  14.37|      1.95| 2.5|        16.8|    113.0|         3.85|      3.49|                0.24|           2.18|            7.

### Assemble the features into a single vector column

In [13]:
from pyspark.ml.feature import VectorAssembler

# Pack every column listed in `headers` into a single vector column named
# "features".
# NOTE(review): the columns are assembled unscaled. In the sample rows shown
# above, Proline is around 1000 while most other columns are below 20, so it
# will likely dominate the distance computation -- consider standardising.
assembler = VectorAssembler().setInputCols(headers).setOutputCol("features")
assembled_df = assembler.transform(df)

# Initialize KMeans

In [14]:
from pyspark.ml.clustering import KMeans

# k-means estimator configured for 2 clusters; the fixed seed (19) keeps the
# centroid initialisation reproducible across runs.
kmeans = KMeans(k=2, seed=19)

# TRAINING

In [15]:
# Fit the k-means estimator on the assembled feature vectors; the resulting
# `model` exposes the learned centroids via clusterCenters().
model = kmeans.fit(assembled_df)

25/05/08 01:53:39 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/05/08 01:53:42 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/05/08 01:53:42 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


# PREDICTIONS

In [16]:
# Assign a cluster to every row of the same DataFrame the model was fitted on
# (with KMeans' default settings the assignment lands in a `prediction` column).
predictions = model.transform(assembled_df)

# EVALUATE MODEL

In [17]:
from pyspark.ml.evaluation import ClusteringEvaluator

# Score the clustering with the silhouette metric. The arguments below spell
# out ClusteringEvaluator's defaults so the columns being read are explicit.
evaluator = ClusteringEvaluator(
    featuresCol="features",
    predictionCol="prediction",
    metricName="silhouette",
)
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette score: {silhouette}")

# Print the learned centroids, one per cluster.
print("Cluster Centers: ")
for centroid in model.clusterCenters():
    print(centroid)

Silhouette score: 0.821360351333219
Cluster Centers: 
[1.27028455e+01 2.54455285e+00 2.33910569e+00 2.04081301e+01
 9.68130081e+01 2.06211382e+00 1.64146342e+00 3.92682924e-01
 1.45406503e+00 4.85138211e+00 9.08617886e-01 2.40821138e+00
 5.65869919e+02]
[1.36665455e+01 1.87072727e+00 2.42781818e+00 1.74527272e+01
 1.06290909e+02 2.81618182e+00 2.89654548e+00 2.92909090e-01
 1.89690911e+00 5.52036361e+00 1.06665455e+00 3.06672727e+00
 1.15172727e+03]


In [18]:
# Shut down through the SparkSession rather than the bare SparkContext:
# this stops the same underlying context and also clears the cached session,
# so a later getOrCreate() in this kernel starts from a clean state.
spark.stop()