In [22]:
# Bibliotecas de PySpark
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.ml.classification import LinearSVC
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

In [2]:
# Initialize Spark
spark = SparkSession.builder.appName("SVM").getOrCreate()

23/10/03 23:04:43 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
file_path = "./letter-recognition.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)
df.describe().show()

23/10/03 23:04:59 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 4:>                                                          (0 + 1) / 1]

+-------+------+----------------+----------------+----------------+----------------+------------------+------------------+-----------------+-----------------+------------------+-----------------+------------------+------------------+------------------+-----------------+------------------+------------------+
|summary|letter|            xbox|            ybox|           width|          height|             onpix|              xbar|             ybar|            x2bar|             y2bar|            xybar|            x2ybar|            xy2bar|             xedge|           xedgey|             yedge|            yedgex|
+-------+------+----------------+----------------+----------------+----------------+------------------+------------------+-----------------+-----------------+------------------+-----------------+------------------+------------------+------------------+-----------------+------------------+------------------+
|  count| 20000|           20000|           20000|           20000|      

                                                                                

In [4]:
df.printSchema()

root
 |-- letter: string (nullable = true)
 |-- xbox: integer (nullable = true)
 |-- ybox: integer (nullable = true)
 |-- width: integer (nullable = true)
 |-- height: integer (nullable = true)
 |-- onpix: integer (nullable = true)
 |-- xbar: integer (nullable = true)
 |-- ybar: integer (nullable = true)
 |-- x2bar: integer (nullable = true)
 |-- y2bar: integer (nullable = true)
 |-- xybar: integer (nullable = true)
 |-- x2ybar: integer (nullable = true)
 |-- xy2bar: integer (nullable = true)
 |-- xedge: integer (nullable = true)
 |-- xedgey: integer (nullable = true)
 |-- yedge: integer (nullable = true)
 |-- yedgex: integer (nullable = true)



In [5]:
df.show()

+------+----+----+-----+------+-----+----+----+-----+-----+-----+------+------+-----+------+-----+------+
|letter|xbox|ybox|width|height|onpix|xbar|ybar|x2bar|y2bar|xybar|x2ybar|xy2bar|xedge|xedgey|yedge|yedgex|
+------+----+----+-----+------+-----+----+----+-----+-----+-----+------+------+-----+------+-----+------+
|     T|   2|   8|    3|     5|    1|   8|  13|    0|    6|    6|    10|     8|    0|     8|    0|     8|
|     I|   5|  12|    3|     7|    2|  10|   5|    5|    4|   13|     3|     9|    2|     8|    4|    10|
|     D|   4|  11|    6|     8|    6|  10|   6|    2|    6|   10|     3|     7|    3|     7|    3|     9|
|     N|   7|  11|    6|     6|    3|   5|   9|    4|    6|    4|     4|    10|    6|    10|    2|     8|
|     G|   2|   1|    3|     1|    1|   8|   6|    6|    6|    6|     5|     9|    1|     7|    5|    10|
|     S|   4|  11|    5|     8|    3|   8|   8|    6|    9|    5|     6|     6|    0|     8|    9|     7|
|     B|   4|   2|    5|     4|    4|   8|   7

In [6]:
# Agrega una columna con un identificador único a df
df = df.withColumn("row_id", monotonically_increasing_id())

# Define los índices para la división
start_train = 0
end_train = 16000
end_test = 20000

# Filtra las filas para obtener los conjuntos de entrenamiento y prueba
letters_train = df.filter((df["row_id"] >= start_train) & (df["row_id"] <= end_train - 1))
letters_test = df.filter((df["row_id"] >= end_train) & (df["row_id"] <= end_test))

# Elimina la columna de identificación row_id
letters_train = letters_train.drop("row_id")
letters_test = letters_test.drop("row_id")

In [7]:
letters_train.count()

16000

In [8]:
letters_test.count()

4000

In [9]:
# Crea un StringIndexer para convertir las etiquetas de letras en valores numéricos
indexer = StringIndexer(inputCol="letter", outputCol="label")

# Ajusta el StringIndexer al DataFrame
indexer_model = indexer.fit(letters_train)

# Transforma el DataFrame original para agregar la columna de etiquetas numéricas
letters_train = indexer_model.transform(letters_train)
letters_test = indexer_model.transform(letters_test)

                                                                                

In [10]:
# Define un ensamblador de características para agrupar las columnas óptimas en un solo vector
assembler = VectorAssembler(inputCols=["xbox", "ybox", "onpix", "xbar", "ybar", "x2bar", "y2bar", "xybar", "x2ybar", "xy2bar", "xedge", "xedgey", "yedge", "yedgex"], outputCol="features")
# Define el modelo SVM
svm = LinearSVC(maxIter=10, regParam=0.1, featuresCol="features", labelCol="label")

# Define el modelo OneVsRest con SVM binario
ovr = OneVsRest(classifier=svm, labelCol="label")

# Aplica el ensamblador y ajusta el modelo al DataFrame de entrenamiento
pipeline = Pipeline(stages=[assembler, ovr])
model = pipeline.fit(letters_train)


23/10/03 23:05:25 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

In [11]:
letters_train.show()

+------+----+----+-----+------+-----+----+----+-----+-----+-----+------+------+-----+------+-----+------+-----+
|letter|xbox|ybox|width|height|onpix|xbar|ybar|x2bar|y2bar|xybar|x2ybar|xy2bar|xedge|xedgey|yedge|yedgex|label|
+------+----+----+-----+------+-----+----+----+-----+-----+-----+------+------+-----+------+-----+------+-----+
|     T|   2|   8|    3|     5|    1|   8|  13|    0|    6|    6|    10|     8|    0|     8|    0|     8|  1.0|
|     I|   5|  12|    3|     7|    2|  10|   5|    5|    4|   13|     3|     9|    2|     8|    4|    10| 22.0|
|     D|   4|  11|    6|     8|    6|  10|   6|    2|    6|   10|     3|     7|    3|     7|    3|     9|  4.0|
|     N|   7|  11|    6|     6|    3|   5|   9|    4|    6|    4|     4|    10|    6|    10|    2|     8| 11.0|
|     G|   2|   1|    3|     1|    1|   8|   6|    6|    6|    6|     5|     9|    1|     7|    5|    10| 16.0|
|     S|   4|  11|    5|     8|    3|   8|   8|    6|    9|    5|     6|     6|    0|     8|    9|     7

In [12]:
# Realiza predicciones en el conjunto de prueba
predictions = model.transform(letters_test)

# Calcula el accuracy para cada modelo binario

evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy"
)

accuracy = evaluator.evaluate(predictions)

print("Accuracy del modelo multiclase:", accuracy)


[Stage 838:>                                                        (0 + 1) / 1]

Accuracy del modelo multiclase: 0.6015


                                                                                

In [14]:
# Extrae las etiquetas verdaderas y las predicciones
labelsAndPredictions = predictions.select("label", "prediction")

# Crea un objeto de evaluación multiclase
metrics = MulticlassMetrics(labelsAndPredictions.rdd.map(tuple))

# Obtiene la matriz de confusión global
confusion_matrix = metrics.confusionMatrix()

print("Matriz de confusión global:")
print(confusion_matrix)

[Stage 923:>                                                        (0 + 1) / 1]

Matriz de confusión global:
DenseMatrix([[127.,   2.,  12.,   2.,   5.,   0.,   4.,   3.,   0.,   0.,   0.,
               24.,   0.,   2.,   1.,   6.,   2.,   0.,   1.,   5.,   2.,   4.,
                1.,   0.,   8.,   0.],
             [  0., 120.,   0.,  17.,   0.,   0.,   0.,   0.,   5.,   2.,  17.,
                1.,   1.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   1.,   0.,
                2.,   2.,   0.,   2.],
             [  0.,   2., 120.,   5.,   0.,   0.,   3.,   0.,   0.,   9.,   0.,
                0.,   1.,   0.,   3.,   0.,   0.,   0.,   1.,   0.,   0.,   4.,
                0.,   0.,   6.,   0.],
             [  0.,   0.,   0.,  72.,   0.,   7.,   2.,   0.,   1.,   0.,   2.,
                0.,   0.,   1.,   0.,   0.,   0.,   3.,   0.,   0.,   0.,   0.,
                0.,   6.,   0.,   0.],
             [  0.,   0.,   1.,   0., 141.,   6.,   0.,  27.,   0.,   4.,   8.,
                3.,   0.,   2.,  13.,   0.,   0.,   0.,   3.,   9.,   0.,   0.,
                

                                                                                