## Ejercicio Módulo 6 con Dataset Diamonds

0. Librerías a utilizar

In [1]:
import pandas as pd 
import seaborn as sns 
import requests

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, FloatType, StringType, IntegerType, NumericType
from pyspark.sql.functions import col, sum 
from pyspark.ml.feature import StringIndexer, Imputer, OneHotEncoder, VectorAssembler
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator 
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml import PipelineModel

1. Carga de datos de diamonds desde CSV con schema

In [3]:
spark = SparkSession.builder.appName("pipeline_diamonds").getOrCreate()
spark

In [4]:
url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/refs/heads/master/diamonds.csv'
# csv_path= '/tmp/tips.csv'
csv_path= 'diamonds.csv'

with open(csv_path, 'wb') as file: 
    file.write(requests.get(url).content)

schema = StructType([
    # columnas del dataset y su tipo de dato
    StructField('carat', FloatType(), True),
    StructField('depth', FloatType(), True),
    StructField('x', FloatType(), True),
    StructField('y', FloatType(), True),
    StructField('z', FloatType(), True),
    StructField('color', StringType(), True),
    StructField('cut', StringType(), True),
    StructField('clarity', StringType(), True),
    StructField('price', IntegerType(), True),
    StructField('table', IntegerType(), True)
])
    
df = spark.read.csv(csv_path, header=True, inferSchema=False, schema=schema)
df.show(5)
df.printSchema()    


+-----+-----+----+----+----+-----+---+-------+-----+-----+
|carat|depth|   x|   y|   z|color|cut|clarity|price|table|
+-----+-----+----+----+----+-----+---+-------+-----+-----+
| 0.23| NULL|NULL|NULL|61.5|   55|326|   3.95| NULL| NULL|
| 0.21| NULL|NULL|NULL|59.8|   61|326|   3.89| NULL| NULL|
| 0.23| NULL|NULL|NULL|56.9|   65|327|   4.05| NULL| NULL|
| 0.29| NULL|NULL|NULL|62.4|   58|334|    4.2| NULL| NULL|
| 0.31| NULL|NULL|NULL|63.3|   58|335|   4.34| NULL| NULL|
+-----+-----+----+----+----+-----+---+-------+-----+-----+
only showing top 5 rows

root
 |-- carat: float (nullable = true)
 |-- depth: float (nullable = true)
 |-- x: float (nullable = true)
 |-- y: float (nullable = true)
 |-- z: float (nullable = true)
 |-- color: string (nullable = true)
 |-- cut: string (nullable = true)
 |-- clarity: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- table: integer (nullable = true)



In [5]:
# Borramos las filas donde las variables a predeicir sean NaN
df_regresion = df.dropna(subset=['price'])
df_clasificacion = df.dropna(subset=['cut'])

# Contamos nulos
df_regresion.select([sum(col(c).isNull().cast('int')).alias(c) for c in df.columns]).show()
df_clasificacion.select([sum(col(c).isNull().cast('int')).alias(c) for c in df.columns]).show()

+-----+-----+---+---+---+-----+---+-------+-----+-----+
|carat|depth|  x|  y|  z|color|cut|clarity|price|table|
+-----+-----+---+---+---+-----+---+-------+-----+-----+
|    0|  299|299|299|  0|    0|  0|      0|    0|  291|
+-----+-----+---+---+---+-----+---+-------+-----+-----+

+-----+-----+-----+-----+---+-----+---+-------+-----+-----+
|carat|depth|    x|    y|  z|color|cut|clarity|price|table|
+-----+-----+-----+-----+---+-----+---+-------+-----+-----+
|    0|53940|53940|53940|  0|    0|  0|      0|53641|53085|
+-----+-----+-----+-----+---+-----+---+-------+-----+-----+



2. Pipeline regresión price con preprocesados: 
    * Imputer, StringIndexer, OneHotEncoder, MinMaxScaler o StandardScaler, VectorAssembler

3. Pipeline clasificación multiclase sobre variable cut con preprocesados:
    * Imputer, StringIndexer, OneHotEncoder, MinMaxScaler o StandardScaler, VectorAssembler

In [27]:
numerical_cols = [field.name for field in df.schema.fields if isinstance(field.dataType, NumericType)]
categorical_cols = [field.name for field in df.schema.fields if isinstance(field.dataType, StringType) and field.name != 'cut']
label_col = 'cut'

In [28]:
# Indexer para la columna a predecir "cut"
indexer_label = StringIndexer(
    inputCol=label_col,
    outputCol='label',
    handleInvalid='keep'
)

In [29]:
# Indexers para las features de la entrada que no son la columna label a predecir
indexers_features = [
    StringIndexer(inputCol=c, outputCol=c + '_indexed', handleInvalid='keep') for c in categorical_cols
]
categorical_cols_indexed = [c + '_indexed' for c in categorical_cols]
print(categorical_cols_indexed)

['color_indexed', 'clarity_indexed']


In [30]:
# Imputer con la moda para las columnas categóricas indexadas
imputer_categorical = Imputer(
    inputCols=categorical_cols_indexed,
    outputCols=[c + '_imputed' for c in categorical_cols_indexed],
    strategy='mode'
)
categorical_cols_indexed_imputed = [c + '_imputed' for c in categorical_cols_indexed]
print(categorical_cols_indexed_imputed)

['color_indexed_imputed', 'clarity_indexed_imputed']


In [31]:
# one hot encoders para las categóricas indexadas imputadas
encoders_onehot = [
    OneHotEncoder(inputCol=c, outputCol=c + '_onehot') 
    for c in categorical_cols_indexed_imputed
]
categorical_cols_onehot = [c + '_onehot' for c in categorical_cols_indexed_imputed]
print(categorical_cols_onehot)

['color_indexed_imputed_onehot', 'clarity_indexed_imputed_onehot']


In [32]:
# Imputer con la mediana para la columnas numéricas
imputer_numerical = Imputer(
    inputCols=numerical_cols,
    outputCols=[c + '_imputed' for c in numerical_cols],
    strategy='median'
)
numerical_cols_imputed = [c + '_imputed' for c in numerical_cols]
print(numerical_cols_imputed)

['carat_imputed', 'depth_imputed', 'x_imputed', 'y_imputed', 'z_imputed', 'price_imputed', 'table_imputed']


In [33]:
# Escalar numéricas con MinMaxScaler
assembler_numerical = VectorAssembler(
    inputCols=numerical_cols_imputed,
    outputCol='numeric_features'
)
scaler = MinMaxScaler(
    inputCol='numeric_features',
    outputCol='numeric_features_scaled'
)

In [34]:
all_columns = ['numeric_features_scaled'] + categorical_cols_onehot
print(all_columns)

['numeric_features_scaled', 'color_indexed_imputed_onehot', 'clarity_indexed_imputed_onehot']


In [35]:
# Ensamblar todo: numéricas + categóricas y obtener features
assembler_all = VectorAssembler(
    inputCols=all_columns,
    outputCol='features'
)

In [36]:
classifier = RandomForestClassifier(seed=42)

In [37]:
# Particionamiento de datos
df_clasificacion_train, df_clasificacion_test = df_clasificacion.randomSplit([0.8, 0.2], seed=42)

In [38]:
pipeline = Pipeline(stages = [
    indexer_label, 
    *indexers_features, 
    imputer_categorical,
    *encoders_onehot,
    imputer_numerical,
    assembler_numerical,
    scaler,
    assembler_all,
    classifier
])

In [39]:
pipeline_model = pipeline.fit(df_clasificacion_train)
df_pred = pipeline_model.transform(df_clasificacion_test)

Py4JJavaError: An error occurred while calling o1331.fit.
: org.apache.spark.SparkException: surrogate cannot be computed. All the values in depth,x,y are Null, Nan or missingValue(NaN)
	at org.apache.spark.ml.feature.Imputer.fit(Imputer.scala:199)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)


In [19]:
evaluator_accuracy = MulticlassClassificationEvaluator(metricName='accuracy')
evaluator_f1 = MulticlassClassificationEvaluator(metricName='f1')
evaluator_precision = MulticlassClassificationEvaluator(metricName='weightedPrecision')
evaluator_recall = MulticlassClassificationEvaluator(metricName='weightedRecall')

In [20]:
print('accuracy', evaluator_accuracy.evaluate(df_pred))
print('f1', evaluator_f1.evaluate(df_pred))
print('precision', evaluator_precision.evaluate(df_pred))
print('recall', evaluator_recall.evaluate(df_pred))

NameError: name 'df_pred' is not defined

4. Gridsearch con CrossValidation sobre cualquiera de los pipelines

In [None]:
paramGrid = (
    ParamGridBuilder()
    .addGrid(classifier.numTrees, [5, 10, 15, 20, 25, 30]) 
    .addGrid(classifier.maxDepth, [3, 5, 10, 15]) 
    .build()
)

In [None]:
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid, 
    evaluator=evaluator_f1,
    numFolds=3, 
    parallelism=4,
    seed=42
)
cv_model = crossval.fit(df_train)
df_pred = cv_model.transform(df_test)

Py4JJavaError: An error occurred while calling o19366.fit.
: org.apache.spark.SparkException: surrogate cannot be computed. All the values in depth,x,y are Null, Nan or missingValue(NaN)
	at org.apache.spark.ml.feature.Imputer.fit(Imputer.scala:199)
	at jdk.internal.reflect.GeneratedMethodAccessor140.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)


In [None]:
print('accuracy', evaluator_accuracy.evaluate(df_pred))
print('f1', evaluator_f1.evaluate(df_pred))
print('precision', evaluator_precision.evaluate(df_pred))
print('recall', evaluator_recall.evaluate(df_pred))

In [None]:
best_model = cv_model.bestModel
best_rf = best_model.stages[-1] # accede a la última fase del pipeline que es el modelo classifier
print(best_rf.extractParamMap())
print(best_rf.getNumTrees)
print(best_rf.getOrDefault('maxDepth'))
print(best_rf.featureImportances)

Exportar modelo 

In [None]:
pipeline_model.write().overwrite().save('pipeline_spark')

In [None]:
# Cargar el modelo
loaded_pipeline = PipelineModel.load('pipeline_spark')