# Ejercicio Módulo 6 de Carlos Pamias Mora Finalizado

* PARTE 1 (10 %) Carga de datos de diamonds desde CSV con schema: https://raw.githubusercontent.com/mwaskom/seaborn-data/refs/heads/master/diamonds.csv

* PARTE 2 (40 %) Pipeline regresión price con preprocesados
  * Imputer, StringIndexer, OneHotEncoder, MinMaxScaler o StandardScaler, VectorAssembler

* PARTE 3 (40 %) Pipeline clasificación multiclase sobre variable cut con preprocesados
  * Imputer, StringIndexer, OneHotEncoder, MinMaxScaler o StandardScaler, VectorAssembler

* PARTE 4 (10 %) Gridsearch con CrossValidation sobre cualquiera de los pipelines

Los modelos, se puede utilizar RandomForest para los dos por ejemplo o el que se quiera. Ejemplo RandomForestRegressor para regresión y MultiLayerPerceptronClassifier para clasificación.


In [1]:
import seaborn as sns
import pandas as pd
import requests
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum 
from pyspark.sql.types import StructType, StructField, FloatType, StringType, IntegerType,NumericType, StringType
from pyspark.ml.feature import StringIndexer, Imputer, OneHotEncoder, VectorAssembler,MinMaxScaler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import MultilayerPerceptronClassifier, LogisticRegression,RandomForestClassifier
from pyspark.ml.regression import RandomForestRegressor, DecisionTreeRegressor
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

## PARTE 1 - Carga de datos de diamonds desde CSV para 

In [2]:
def star_load_data(tipo, accion):
    if accion == 'I':
        spark = SparkSession.builder.appName(f'pipeline_diamonds{tipo}').getOrCreate()
        url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/refs/heads/master/diamonds.csv'
        csv_path = 'diamonds.csv'

        with open(csv_path, 'wb') as file:
            file.write(requests.get(url).content)
            
        schema = StructType([
            StructField('carat', FloatType(), True),
            StructField('cut', StringType(), True),
            StructField('color', StringType(), True),
            StructField('clarity', StringType(), True),
            StructField('depth', FloatType(), True),
            StructField('table', FloatType(), True),
            StructField('price', IntegerType(), True),
            StructField('x', FloatType(), True),
            StructField('y', FloatType(), True),
            StructField('z', FloatType(), True),
            ])
        df = spark.read.csv(csv_path, header=True, inferSchema=False, schema=schema)
        df.show(5)
        df.printSchema()
        return df
    elif accion == 'S':
        try:
            # Detener la sesión de Spark
            spark = SparkSession.getActiveSession()
            if spark is not None:
                spark.stop()
                print("Sesión detenida.")
            else:
                print("No hay ninguna sesión activa.")
        except Exception as e:
            print(f"Error: {e}")

    else:
        print("Acción no válida. Use 'I' para inicializar o 'S' para detener.")

## PARTE 2 Pipeline regresión price con preprocesados
  * Imputer, StringIndexer, OneHotEncoder, MinMaxScaler, VectorAssembler

In [3]:
df = star_load_data('regresion', 'I')

+-----+-------+-----+-------+-----+-----+-----+----+----+----+
|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+
| 0.23|  Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|
| 0.21|Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|
| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|
| 0.29|Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|
| 0.31|   Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+
only showing top 5 rows

root
 |-- carat: float (nullable = true)
 |-- cut: string (nullable = true)
 |-- color: string (nullable = true)
 |-- clarity: string (nullable = true)
 |-- depth: float (nullable = true)
 |-- table: float (nullable = true)
 |-- price: integer (nullable = true)
 |-- x: float (nullable = true)
 |-- y: float (nullable = true)
 |-- z: float (nullable = true)



In [4]:
# Como vamos a predecir island borramos filas donde island sea nan:
df = df.dropna(subset=['price'])

# contar nulos en todas las columnas: equivalente a pandas df.isna().sum()
df.select([sum(col(c).isNull().cast('int')).alias(c) for c in df.columns]).show()

+-----+---+-----+-------+-----+-----+-----+---+---+---+
|carat|cut|color|clarity|depth|table|price|  x|  y|  z|
+-----+---+-----+-------+-----+-----+-----+---+---+---+
|    0|  0|    0|      0|    0|    0|    0|  0|  0|  0|
+-----+---+-----+-------+-----+-----+-----+---+---+---+



In [5]:
# seleccionar los nombres de las columnas a las que aplicar Preprocesados
numerical_cols = [field.name for field in df.schema.fields if isinstance(field.dataType, NumericType)and field.name != 'price']
categorical_cols = [field.name for field in df.schema.fields if isinstance(field.dataType, StringType)]

In [6]:
df = df.withColumnRenamed('price', 'label')

In [7]:
indexers_features = [
    StringIndexer(inputCol=c, outputCol=c + '_indexed', handleInvalid='keep') for c in categorical_cols
]
categorical_cols_indexed = [c + '_indexed' for c in categorical_cols]
print(categorical_cols_indexed)

['cut_indexed', 'color_indexed', 'clarity_indexed']


In [8]:
# Imputer con la moda para las columnas categóricas indexadas
imputer_categorical = Imputer(
    inputCols=categorical_cols_indexed,
    outputCols=[c + '_imputed' for c in categorical_cols_indexed],
    strategy='mode'
)
categorical_cols_indexed_imputed = [c + '_imputed' for c in categorical_cols_indexed]
print(categorical_cols_indexed_imputed)

['cut_indexed_imputed', 'color_indexed_imputed', 'clarity_indexed_imputed']


In [9]:
# one hot encoders para las categóricas indexadas imputadas
encoders_onehot = [
    OneHotEncoder(inputCol=c, outputCol=c + '_onehot') 
    for c in categorical_cols_indexed_imputed
]
categorical_cols_onehot = [c + '_onehot' for c in categorical_cols_indexed_imputed]
print(categorical_cols_onehot)

['cut_indexed_imputed_onehot', 'color_indexed_imputed_onehot', 'clarity_indexed_imputed_onehot']


In [10]:
# Imputer con la mediana para la columnas numéricas
imputer_numerical = Imputer(
    inputCols=numerical_cols,
    outputCols=[c + '_imputed' for c in numerical_cols],
    strategy='median'
)
numerical_cols_imputed = [c + '_imputed' for c in numerical_cols]
print(numerical_cols_imputed)

['carat_imputed', 'depth_imputed', 'table_imputed', 'x_imputed', 'y_imputed', 'z_imputed']


In [11]:
# (Opcional) escalar numéricas con MinMaxScaler
assembler_numerical = VectorAssembler(
    inputCols=numerical_cols_imputed,
    outputCol='numeric_features'
)
scaler = MinMaxScaler(
    inputCol='numeric_features',
    outputCol='numeric_features_scaled'
)

In [12]:
all_columns = ['numeric_features_scaled'] + categorical_cols_onehot
print(all_columns)

['numeric_features_scaled', 'cut_indexed_imputed_onehot', 'color_indexed_imputed_onehot', 'clarity_indexed_imputed_onehot']


In [13]:
# Ensamblar todo: numéricas + categóricas y obtener features
assembler_all = VectorAssembler(
    inputCols=all_columns,
    outputCol='features'
)

In [14]:
classifier = RandomForestRegressor(seed=42)

In [15]:
# particionamiento de datos
df_train, df_test = df.randomSplit([0.8, 0.2], seed=42)

In [16]:
pipeline = Pipeline(stages = [
    # 1. Indexers para columnas categóricas: 
    *indexers_features, # ponemos * porque es una lista de objetos
    # 2. Imputer para categóricas
    imputer_categorical,
    # 3. One Hot Encoders para categóricas
    *encoders_onehot, # ponemos * porque es una lista de objetos
    # 4. Imputer para numéricas
    imputer_numerical,
    # 5. Ensamblar numéricas + escalado
    assembler_numerical,
    scaler,
    # 6. Ensamblar numéricas escaladas + categóricas en una sola columna 'features'
    assembler_all,
    # 7. modelo de clasificación
    classifier
])

In [17]:
pipeline_model = pipeline.fit(df_train)
df_pred = pipeline_model.transform(df_test)

In [18]:
evaluator_r2 = RegressionEvaluator(metricName='r2')
evaluator_mae = RegressionEvaluator(metricName='mae')
evaluator_mse = RegressionEvaluator(metricName='mse')
evaluator_rmse = RegressionEvaluator(metricName='rmse')

In [19]:
print('r2', evaluator_r2.evaluate(df_pred))
print('mae', evaluator_mae.evaluate(df_pred))
print('mse', evaluator_mse.evaluate(df_pred))
print('rmse', evaluator_rmse.evaluate(df_pred))

r2 0.9070336070647529
mae 684.109865656974
mse 1512592.3852326302
rmse 1229.8749469895833


In [20]:
df = star_load_data('', 'S')

Sesión detenida.


## PARTE 3 Pipeline clasificación multiclase sobre variable cut con preprocesados
  * Imputer, StringIndexer, OneHotEncoder, StandardScaler, VectorAssembler  

In [21]:
df = star_load_data('clasification', 'I')

+-----+-------+-----+-------+-----+-----+-----+----+----+----+
|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+
| 0.23|  Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|
| 0.21|Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|
| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|
| 0.29|Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|
| 0.31|   Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+
only showing top 5 rows

root
 |-- carat: float (nullable = true)
 |-- cut: string (nullable = true)
 |-- color: string (nullable = true)
 |-- clarity: string (nullable = true)
 |-- depth: float (nullable = true)
 |-- table: float (nullable = true)
 |-- price: integer (nullable = true)
 |-- x: float (nullable = true)
 |-- y: float (nullable = true)
 |-- z: float (nullable = true)



In [22]:
# Como vamos a predecir island borramos filas donde island sea nan:
df = df.dropna(subset=['cut'])

# contar nulos en todas las columnas: equivalente a pandas df.isna().sum()
df.select([sum(col(c).isNull().cast('int')).alias(c) for c in df.columns]).show()

+-----+---+-----+-------+-----+-----+-----+---+---+---+
|carat|cut|color|clarity|depth|table|price|  x|  y|  z|
+-----+---+-----+-------+-----+-----+-----+---+---+---+
|    0|  0|    0|      0|    0|    0|    0|  0|  0|  0|
+-----+---+-----+-------+-----+-----+-----+---+---+---+



In [23]:
# seleccionar los nombres de las columnas a las que aplicar Preprocesados
numerical_cols = [field.name for field in df.schema.fields if isinstance(field.dataType, NumericType)]
categorical_cols = [field.name for field in df.schema.fields if isinstance(field.dataType, StringType) and field.name != 'cut']
label_col = 'cut'

In [24]:
# Indexer para 'cut' la columna a predecir
indexer_label = StringIndexer(
    inputCol=label_col,
    outputCol='label',
    handleInvalid='keep'
)
#num_classes = df.select("cut").distinct().count()

In [25]:
# (Opcional) escalar numéricas con StandardScaler
scaler1 = StandardScaler(
    inputCol='numeric_features',
    outputCol='numeric_features_scaled',
    withMean=True # centrar en la media
)

In [26]:
classifier1 = LogisticRegression()

In [27]:
# particionamiento de datos
df_train, df_test = df.randomSplit([0.8, 0.2], seed=42)

In [28]:
pipeline = Pipeline(stages = [
    # 1. Indexer para columna categórica 'cut'
    indexer_label,
    # 2. Indexers para columnas categóricas: 'color', 'clarity'
    *indexers_features, # ponemos * porque es una lista de objetos
    # 3. Imputer para categóricas
    imputer_categorical,
    # 4. One Hot Encoders para categóricas
    *encoders_onehot, # ponemos * porque es una lista de objetos
    # 5. Imputer para numéricas
    imputer_numerical,
    # 6. Ensamblar numéricas + escalado
    assembler_numerical,
    scaler1,
    # 7. Ensamblar numéricas escaladas + categóricas en una sola columna 'features'
    assembler_all,
    # 8. modelo de clasificación
    classifier1
])

In [29]:
pipeline_model = pipeline.fit(df_train)
df_pred = pipeline_model.transform(df_test)

In [30]:
# PASO 4: Obtener la dimensión del vector `features`
#feature_vector_size = df_pred.select("features").first()[0].size

In [31]:
#layers = [feature_vector_size, 16, 64, num_classes]  
#mlp = MultilayerPerceptronClassifier(featuresCol="features", labelCol="label", layers=layers, seed=42, maxIter=50)

#mlp_model = mlp.fit(df_pred)

In [32]:
evaluator_accuracy = MulticlassClassificationEvaluator(metricName='accuracy')
evaluator_f1 = MulticlassClassificationEvaluator(metricName='f1')
evaluator_precision = MulticlassClassificationEvaluator(metricName='weightedPrecision')
evaluator_recall = MulticlassClassificationEvaluator(metricName='weightedRecall')

In [33]:
print('accuracy', evaluator_accuracy.evaluate(df_pred))
print('f1', evaluator_f1.evaluate(df_pred))
print('precision', evaluator_precision.evaluate(df_pred))
print('recall', evaluator_recall.evaluate(df_pred))

accuracy 1.0
f1 1.0
precision 1.0
recall 1.0


In [34]:
df = star_load_data('', 'S')

Sesión detenida.


## PARTE 4 - 1 Gridsearch con CrossValidation sobre pipeline de Clasification

In [35]:
df = star_load_data('CrossValidation_clasification', 'I')
df = df.dropna(subset=['cut'])
df.select([sum(col(c).isNull().cast('int')).alias(c) for c in df.columns]).show()
numerical_cols = [field.name for field in df.schema.fields if isinstance(field.dataType, NumericType)]
categorical_cols = [field.name for field in df.schema.fields if isinstance(field.dataType, StringType) and field.name != 'cut']
label_col = 'cut'

+-----+-------+-----+-------+-----+-----+-----+----+----+----+
|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+
| 0.23|  Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|
| 0.21|Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|
| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|
| 0.29|Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|
| 0.31|   Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+
only showing top 5 rows

root
 |-- carat: float (nullable = true)
 |-- cut: string (nullable = true)
 |-- color: string (nullable = true)
 |-- clarity: string (nullable = true)
 |-- depth: float (nullable = true)
 |-- table: float (nullable = true)
 |-- price: integer (nullable = true)
 |-- x: float (nullable = true)
 |-- y: float (nullable = true)
 |-- z: float (nullable = true)

+-----+---+-----+-------+-----+-----+-----+---+--

In [36]:
classifier2 = RandomForestClassifier(seed=42)

In [37]:
df_train, df_test = df.randomSplit([0.8, 0.2], seed=42)

In [38]:
pipeline = Pipeline(stages = [
    # 1. Indexer para columna categórica 'island' StringIndexer porque es la columna a predecir
    indexer_label,
    # 2. Indexers para columnas categóricas: 'species', 'sex' 
    *indexers_features, # ponemos * porque es una lista de objetos
    # 3. Imputer para categóricas
    imputer_categorical,
    # 4. One Hot Encoders para categóricas
    *encoders_onehot, # ponemos * porque es una lista de objetos
    # 5. Imputer para numéricas
    imputer_numerical,
    # 6. Ensamblar numéricas + escalado
    assembler_numerical,
    scaler,
    # 7. Ensamblar numéricas escaladas + categóricas en una sola columna 'features'
    assembler_all,
    # 8. modelo de clasificación
    classifier2
])

In [39]:

paramGrid1 = (
    ParamGridBuilder()
    .addGrid(classifier2.numTrees, [5, 10, 15, 20, 25, 30]) 
    .addGrid(classifier2.maxDepth, [3, 5, 10, 15, 20, 25, 30]) # por defecto es 5 rango de [0, 30]
    .build()
)

In [40]:
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid1, # Parámetros para grid search hyper parameter tuning
    evaluator=evaluator_f1,
    numFolds=3, # por defecto ya 3 folds
    parallelism=4,
    seed=42
)
cv_model = crossval.fit(df_train)
df_pred = cv_model.transform(df_test)

In [41]:
print('accuracy', evaluator_accuracy.evaluate(df_pred))
print('f1', evaluator_f1.evaluate(df_pred))
print('precision', evaluator_precision.evaluate(df_pred))
print('recall', evaluator_recall.evaluate(df_pred))

accuracy 1.0
f1 1.0
precision 1.0
recall 1.0


In [42]:
best_model = cv_model.bestModel
best_rf = best_model.stages[-1] # accede a la última fase del pipeline que es el modelo classifier
print(best_rf.extractParamMap())
print(best_rf.getNumTrees)
print(best_rf.getOrDefault('maxDepth'))
print(best_rf.featureImportances)

{Param(parent='RandomForestClassifier_e12e609646c3', name='bootstrap', doc='Whether bootstrap samples are used when building trees.'): True, Param(parent='RandomForestClassifier_e12e609646c3', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.'): False, Param(parent='RandomForestClassifier_e12e609646c3', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.'): 10, Param(parent='RandomForestClassifier_e12e609646c3', name='featureSubsetStrategy', doc="The number of features to consider for splits at each tree node. Support

In [43]:
df = star_load_data('', 'S')

Sesión detenida.


## PARTE 4 -2 (10 %) Gridsearch con CrossValidation sobre pipeline de regresion

In [44]:
df = star_load_data('CrossValidation_regresion', 'I')
df = df.dropna(subset=['price'])
df.select([sum(col(c).isNull().cast('int')).alias(c) for c in df.columns]).show()
numerical_cols = [field.name for field in df.schema.fields if isinstance(field.dataType, NumericType)and field.name != 'price']
categorical_cols = [field.name for field in df.schema.fields if isinstance(field.dataType, StringType)]
df = df.withColumnRenamed('price', 'label')

+-----+-------+-----+-------+-----+-----+-----+----+----+----+
|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+
| 0.23|  Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|
| 0.21|Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|
| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|
| 0.29|Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|
| 0.31|   Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+
only showing top 5 rows

root
 |-- carat: float (nullable = true)
 |-- cut: string (nullable = true)
 |-- color: string (nullable = true)
 |-- clarity: string (nullable = true)
 |-- depth: float (nullable = true)
 |-- table: float (nullable = true)
 |-- price: integer (nullable = true)
 |-- x: float (nullable = true)
 |-- y: float (nullable = true)
 |-- z: float (nullable = true)

+-----+---+-----+-------+-----+-----+-----+---+--

In [45]:
classifier3 = DecisionTreeRegressor()

In [46]:
df_train, df_test = df.randomSplit([0.8, 0.2], seed=42)

In [47]:
pipeline = Pipeline(stages = [
    # 1. Indexers para columnas categóricas: 
    *indexers_features, # ponemos * porque es una lista de objetos
    # 2. Imputer para categóricas
    imputer_categorical,
    # 3. One Hot Encoders para categóricas
    *encoders_onehot, # ponemos * porque es una lista de objetos
    # 4. Imputer para numéricas
    imputer_numerical,
    # 5. Ensamblar numéricas + escalado
    assembler_numerical,
    scaler,
    # 6. Ensamblar numéricas escaladas + categóricas en una sola columna 'features'
    assembler_all,
    # 7. modelo de clasificación
    classifier3
])

In [48]:
paramGrid2 = (
    ParamGridBuilder()
    .addGrid(classifier3.maxDepth, [5, 10, 15, 20, 25]) # por defecto es 5
    .addGrid(classifier3.minInstancesPerNode, [1, 5, 10]) 
    .addGrid(classifier3.maxBins, [32, 64, 128])
    .build()
)

In [49]:
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid2, # Parámetros para grid search hyper parameter tuning
    evaluator=evaluator_r2,
    numFolds=3, # por defecto ya 3 folds
    seed=42
)
cv_model = crossval.fit(df_train)
df_pred = cv_model.transform(df_test)

In [50]:
print('r2', evaluator_r2.evaluate(df_pred))
print('mae', evaluator_mae.evaluate(df_pred))
print('mse', evaluator_mse.evaluate(df_pred))
print('rmse', evaluator_rmse.evaluate(df_pred))

r2 0.9623060942894921
mae 363.36749042739615
mse 613291.6739826986
rmse 783.1294107506744


In [51]:
best_model = cv_model.bestModel
best_rf = best_model.stages[-1] # accede a la última fase del pipeline que es el modelo classifier
print(best_rf)
print(best_rf.extractParamMap())
print(best_rf.getOrDefault('maxDepth'))
print(best_rf.getOrDefault('minInstancesPerNode'))
print(best_rf.getOrDefault('maxBins'))
print(best_rf.getOrDefault('minInfoGain'))
print(best_rf.featureImportances)

DecisionTreeRegressionModel: uid=DecisionTreeRegressor_e2ee001b1299, depth=20, numNodes=12495, numFeatures=23
{Param(parent='DecisionTreeRegressor_e2ee001b1299', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.'): False, Param(parent='DecisionTreeRegressor_e2ee001b1299', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.'): 10, Param(parent='DecisionTreeRegressor_e2ee001b1299', name='featuresCol', doc='features column name.'): 'features', Param(parent='DecisionTreeRegressor_e2ee001b1299', name='impurity', doc='Crite

In [52]:
df = star_load_data('', 'S')

Sesión detenida.
