<a href="https://colab.research.google.com/github/EderLara/Fundamentos-Big-Data/blob/main/Uso_de_pyspark_en_un_modelo_de_clasificaci%C3%B3n.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline

# Start the Spark session used by the rest of the notebook; getOrCreate()
# returns an already-active session instead of building a second one.
spark = (
    SparkSession.builder
    .appName("SimpleClassification")
    .getOrCreate()
)

In [2]:
# Toy training set: one numeric column, one string category and a 0/1 label.
column_names = ["feature1", "categorical_feature", "label"]
sample_rows = [
    (1.0, "A", 0),
    (2.0, "B", 1),
    (3.0, "A", 0),
    (4.0, "C", 1),
    (5.0, "B", 0),
]
data = spark.createDataFrame(sample_rows, schema=column_names)

# Stages that are later combined into a single pipeline.
# 1) Replace each category string with a numeric index.
indexer = StringIndexer(
    inputCol="categorical_feature",
    outputCol="indexed_feature",
)

# 2) Pack the numeric column and the category index into one vector column.
# NOTE(review): the index reaches the model as a plain number (no one-hot
# encoding) -- confirm that is intended.
assembler = VectorAssembler(
    inputCols=["feature1", "indexed_feature"],
    outputCol="features",
)

# 3) Logistic regression reading the assembled vector and the "label" column.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
)

In [3]:
# Build, train, score and evaluate the pipeline, then shut Spark down.
# The work runs inside try/finally so the session is stopped even when
# fitting, scoring or evaluation raises; the exception still propagates.
try:
    # Chain the indexer, the assembler and the classifier into one estimator.
    pipeline = Pipeline(stages=[indexer, assembler, lr])

    # Train every stage on the example DataFrame.
    model = pipeline.fit(data)

    # Score the same rows the model was trained on and show the key columns.
    predictions = model.transform(data)
    predictions.select("label", "probability", "prediction").show()

    # "areaUnderROC" is the evaluator's default metric; it is spelled out so
    # the "AUC" printed below is unambiguous.
    # NOTE: the metric is computed on the training rows themselves, so it
    # does not measure performance on unseen data.
    evaluator = BinaryClassificationEvaluator(
        labelCol="label",
        metricName="areaUnderROC",
    )
    auc = evaluator.evaluate(predictions)
    print(f"AUC: {auc}")
finally:
    # Stop the Spark session whether or not the steps above succeeded.
    spark.stop()

+-----+--------------------+----------+
|label|         probability|prediction|
+-----+--------------------+----------+
|    0|[0.99999996942479...|       0.0|
|    1|[2.73168288742007...|       1.0|
|    0|           [1.0,0.0]|       0.0|
|    1|[3.76052346746478...|       1.0|
|    0|[0.99999999182448...|       0.0|
+-----+--------------------+----------+

AUC: 1.0
