# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Carrera: Ing. en Sistemas Computacionales** </center>
---
### <center> **Primavera 2025** </center>
---

**Lab 11**: Multiclass classification (Decision Trees & SVM)

**Fecha**: 11 de mayo del 2025

**Nombre del Estudiante**: Marco Albanese, Vicente Siloe

**Profesor**: Pablo Camarillo Ramirez

In [3]:
import findspark
# Must run before the `pyspark` imports in the following cells: findspark
# locates the Spark installation and exposes it to this Python process.
findspark.init()

#### Creación de la conexión con el clúster de Spark


In [4]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this lab, connecting to the master
# at the URL below and pinning the Spark UI port.
spark = (
    SparkSession.builder
    .appName("MLSpark-Decision-Trees-SVM")
    .master("spark://2da3617855ce:7077")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext; it is stopped in the last cell.
sc = spark.sparkContext

# Set the SQL shuffle partition count to 5 for this session.
spark.conf.set("spark.sql.shuffle.partitions", "5")

#### Preparación de datos


In [5]:
from equipo_mcqueen.spark_utils import SparkUtils

# (column name, Spark type name) pairs handed to the project's schema helper.
iris_data = [
    ("Id", "IntegerType"),
    ("SepalLengthCm", "DoubleType"),
    ("SepalWidthCm", "DoubleType"),
    ("PetalLengthCm", "DoubleType"),
    ("PetalWidthCm", "DoubleType"),
    ("Species", "StringType"),
]
schema = SparkUtils.generate_schema(iris_data)

# Read the CSV with the explicit schema; the first line of the file is a header.
iris_csv_path = "/home/jovyan/notebooks/data/Iris.csv"
csv_reader = spark.read.schema(schema).option("header", "true")
df = csv_reader.csv(iris_csv_path)

#### Assemble the features into a single vector column

In [6]:
from pyspark.ml.feature import VectorAssembler, StringIndexer

# Numeric measurement columns that are packed into the `features` vector.
iris_cols = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]

# Encode the string `Species` column as a numeric `label` column.
# The result is bound to a new name rather than overwriting `df`: if `df`
# already carried a `label` column (i.e. this cell had been run once and had
# replaced `df`), StringIndexer would reject the existing output column and
# the cell could not be re-run. Keeping `df` as the raw frame makes the cell
# idempotent.
label_indexer = StringIndexer(inputCol="Species", outputCol="label")
indexed_df = label_indexer.fit(df).transform(df)

# Combine the measurement columns into one vector column and keep only the
# two columns the classifiers below consume.
assembler = VectorAssembler(inputCols=iris_cols, outputCol="features")
data_with_features = assembler.transform(indexed_df).select("label", "features")

                                                                                

#### Split the data into training and test sets: 80% training data and 20% testing data

In [7]:
# Random 80/20 split of the assembled data; seed=57 fixes the random sampling.
train_df, test_df = data_with_features.randomSplit([0.8, 0.2], seed=57)

#### Show the dataset and the training set (first 20 rows of each)

In [8]:
# Preview the assembled dataset first, then the training split.
# `show()` prints only the top 20 rows of each frame.
for title, frame in (
    ("Original Dataset", data_with_features),
    ("train set", train_df),
):
    print(title)
    frame.show()

Original Dataset
+-----+-----------------+
|label|         features|
+-----+-----------------+
|  0.0|[5.1,3.5,1.4,0.2]|
|  0.0|[4.9,3.0,1.4,0.2]|
|  0.0|[4.7,3.2,1.3,0.2]|
|  0.0|[4.6,3.1,1.5,0.2]|
|  0.0|[5.0,3.6,1.4,0.2]|
|  0.0|[5.4,3.9,1.7,0.4]|
|  0.0|[4.6,3.4,1.4,0.3]|
|  0.0|[5.0,3.4,1.5,0.2]|
|  0.0|[4.4,2.9,1.4,0.2]|
|  0.0|[4.9,3.1,1.5,0.1]|
|  0.0|[5.4,3.7,1.5,0.2]|
|  0.0|[4.8,3.4,1.6,0.2]|
|  0.0|[4.8,3.0,1.4,0.1]|
|  0.0|[4.3,3.0,1.1,0.1]|
|  0.0|[5.8,4.0,1.2,0.2]|
|  0.0|[5.7,4.4,1.5,0.4]|
|  0.0|[5.4,3.9,1.3,0.4]|
|  0.0|[5.1,3.5,1.4,0.3]|
|  0.0|[5.7,3.8,1.7,0.3]|
|  0.0|[5.1,3.8,1.5,0.3]|
+-----+-----------------+
only showing top 20 rows

train set
+-----+-----------------+
|label|         features|
+-----+-----------------+
|  0.0|[4.3,3.0,1.1,0.1]|
|  0.0|[4.4,2.9,1.4,0.2]|
|  0.0|[4.4,3.0,1.3,0.2]|
|  0.0|[4.4,3.2,1.3,0.2]|
|  0.0|[4.5,2.3,1.3,0.3]|
|  0.0|[4.6,3.1,1.5,0.2]|
|  0.0|[4.6,3.2,1.4,0.2]|
|  0.0|[4.6,3.4,1.4,0.3]|
|  0.0|[4.6,3.6,1.0,0.2]|
|  0.0|[4.7

### Create a Decision Tree model

In [9]:
from pyspark.ml.classification import DecisionTreeClassifier

# Configure the Decision Tree estimator; it is fitted in the next cell.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
)

#### Training

In [10]:
# Fit the Decision Tree on the training split.
dt_model = dt.fit(train_df)

# Print the fitted tree's textual description (its nested if/else rules).
tree_description = dt_model.toDebugString
print(f"Decision Tree model summary:{tree_description}")

Decision Tree model summary:DecisionTreeClassificationModel: uid=DecisionTreeClassifier_eb8747fe4a00, depth=5, numNodes=15, numClasses=3, numFeatures=4
  If (feature 2 <= 2.45)
   Predict: 0.0
  Else (feature 2 > 2.45)
   If (feature 2 <= 4.85)
    If (feature 3 <= 1.65)
     Predict: 1.0
    Else (feature 3 > 1.65)
     If (feature 0 <= 5.95)
      Predict: 1.0
     Else (feature 0 > 5.95)
      Predict: 2.0
   Else (feature 2 > 4.85)
    If (feature 3 <= 1.75)
     If (feature 2 <= 4.95)
      Predict: 1.0
     Else (feature 2 > 4.95)
      If (feature 3 <= 1.55)
       Predict: 2.0
      Else (feature 3 > 1.55)
       Predict: 1.0
    Else (feature 3 > 1.75)
     Predict: 2.0



#### Predictions

In [11]:
# Apply the fitted Decision Tree to the held-out test split.
predictions = dt_model.transform(test_df)

# Preview each feature vector next to the class index predicted for it.
preview_columns = ["features", "prediction"]
predictions.select(*preview_columns).show()

+-----------------+----------+
|         features|prediction|
+-----------------+----------+
|[5.0,3.4,1.6,0.4]|       0.0|
|[5.0,3.5,1.3,0.3]|       0.0|
|[5.1,3.7,1.5,0.4]|       0.0|
|[5.2,3.5,1.5,0.2]|       0.0|
|[5.4,3.9,1.7,0.4]|       0.0|
|[5.7,2.6,3.5,1.0]|       1.0|
|[5.8,2.6,4.0,1.2]|       1.0|
|[5.8,2.7,3.9,1.2]|       1.0|
|[6.1,2.8,4.7,1.2]|       1.0|
|[6.1,2.9,4.7,1.4]|       1.0|
|[6.2,2.9,4.3,1.3]|       1.0|
|[6.3,3.3,4.7,1.6]|       1.0|
|[6.7,3.1,4.7,1.5]|       1.0|
|[4.9,2.5,4.5,1.7]|       1.0|
|[6.0,3.0,4.8,1.8]|       2.0|
|[6.1,3.0,4.9,1.8]|       2.0|
|[6.2,3.4,5.4,2.3]|       2.0|
|[6.3,2.5,5.0,1.9]|       2.0|
|[6.3,2.9,5.6,1.8]|       2.0|
|[6.4,3.2,5.3,2.3]|       2.0|
+-----------------+----------+
only showing top 20 rows



### Create a LinearSVC with OneVsRest

In [12]:
from pyspark.ml.classification import LinearSVC, OneVsRest

# Linear SVM used as the base classifier.
lsvc = LinearSVC(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.1,
)

# Wrap the SVM in a one-vs-rest reduction so it can be fitted on the
# three-class `label` column.
ovr = OneVsRest(
    classifier=lsvc,
    featuresCol="features",
    labelCol="label",
)

#### Training

In [13]:
# Fit the one-vs-rest LinearSVC on the same training split as the Decision Tree.
ovr_model = ovr.fit(train_df)

25/05/07 20:23:09 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/05/07 20:23:09 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


#### Predictions

In [14]:
# Score the same held-out test split with the fitted one-vs-rest model.
ovr_predictions = ovr_model.transform(test_df)

### Model testing: Decision Trees

In [15]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# A single evaluator is shared by every metric below (and by the SVM cell);
# the metric is chosen per call via the param map passed to `evaluate`.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
)


def _decision_tree_metric(metric_name):
    """Return `metric_name` computed on the Decision Tree test predictions."""
    return evaluator.evaluate(predictions, {evaluator.metricName: metric_name})


accuracy = _decision_tree_metric("accuracy")
print(f"Accuracy: {accuracy}")

# "Precision" and "Recall" here are the weighted variants of each metric.
precision = _decision_tree_metric("weightedPrecision")
print(f"Precision: {precision}")

recall = _decision_tree_metric("weightedRecall")
print(f"Recall: {recall}")

# `f1` is reused by the comparison cell at the end of the notebook.
f1 = _decision_tree_metric("f1")
print(f"F1 Score: {f1}")

Accuracy: 0.9545454545454546
Precision: 0.9595959595959596
Recall: 0.9545454545454546
F1 Score: 0.9545454545454545


### Model testing: SVM

In [17]:
# Only F1 is computed for the one-vs-rest model; it is the metric compared
# against the Decision Tree in the next cell.
ovr_f1 = evaluator.evaluate(ovr_predictions, {evaluator.metricName: "f1"})

                                                                                

### Comparison

In [18]:
# Side-by-side F1 scores of both models on the same test split.
print("Model Comparison:")
for model_name, f1_score in (
    ("Decision Tree", f1),
    ("LinearSVC OVR", ovr_f1),
):
    print(f"{model_name} F1: {f1_score}")

Model Comparison:
Decision Tree F1: 0.9545454545454545
LinearSVC OVR F1: 0.8142857142857143


In [19]:
# Stop the SparkContext obtained in the connection cell now that all results
# have been printed.
sc.stop()