In [0]:
%run ../ConfigFolder/ConfigSAS

In [0]:
# Resolve the storage location of the combined Madrid feature table.
# NOTE(review): generate_path is not defined in this notebook; presumably it
# comes from the config notebook pulled in by %run — confirm.
model_combined_Path = generate_path(
    'g-model-madrid-min-features-combined',
    'goldlayer',
)

# Read the Delta table into a Spark DataFrame and preview it.
delta_reader = spark.read.format("delta")
df_model_combined = delta_reader.load(model_combined_Path)
display(df_model_combined)

In [0]:
from pyspark.sql.functions import col, substring

# Split `period` into two string columns (Spark's substring is 1-based):
#   year  -> characters 1-4
#   month -> up to two characters starting at position 6
# NOTE(review): presumably `period` is a 6-character "YYYYMM" value, in which
# case `month` only holds the last digit of the month — confirm the format.
period_col = col("period")
df_model_combined = (
    df_model_combined
    .withColumn("year", substring(period_col, 1, 4))
    .withColumn("month", substring(period_col, 6, 2))
)

display(df_model_combined)

In [0]:
from pyspark.sql.functions import when

# Map the extracted `month` string to a quarter label ("1".."4").
# Any other month value yields a null quarter because there is no otherwise().
# NOTE(review): "2" -> quarter "4" presumably stands for December when `month`
# is only the last digit of the month — confirm against the `period` format.
month_col = col("month")
quarter_expr = (
    when(month_col == "3", "1")
    .when(month_col == "6", "2")
    .when(month_col == "9", "3")
    .when(month_col == "2", "4")
)

# Add the quarter and discard the helper columns it was derived from.
df_model_combined = (
    df_model_combined
    .withColumn("quarter", quarter_expr)
    .drop("month", "period")
)

display(df_model_combined)

In [0]:
from pyspark.sql.functions import col

# Cast every modelling column to its numeric type.
# Each entry is (target column, source column, Spark type); `isparking` is a
# new column derived from `isparkingspaceincludedinprice`, the rest are cast
# in place.
column_casts = [
    ('bathrooms', 'bathrooms', 'int'),
    ('isparking', 'isparkingspaceincludedinprice', 'int'),
    ('latitude', 'latitude', 'float'),
    ('longitude', 'longitude', 'float'),
    ('price', 'price', 'float'),
    ('rooms', 'rooms', 'int'),
    ('size', 'size', 'float'),
    ('year', 'year', 'int'),
    ('quarter', 'quarter', 'int'),
]

df_cleaned = df_model_combined
for target_name, source_name, spark_type in column_casts:
    df_cleaned = df_cleaned.withColumn(target_name, col(source_name).cast(spark_type))

# Print the schema to check that the resulting data types are as expected.
df_cleaned.printSchema()


In [0]:
# Preview the DataFrame after the numeric casts.
display(df_cleaned)

In [0]:
# Disabled write-back code kept as a bare string literal, so none of these
# Delta writes execute when the cell runs.
# NOTE(review): the snippet references df_combined, df_filtered2024 and
# df_selected_2018_renamed, none of which are defined in the visible part of
# this notebook — confirm before re-enabling.
"""
df_combined.write.format("delta").mode("overwrite").option("mergeSchema", "true").save(generate_path('g-model-madrid-min-features-combined', 'goldlayer'))

df_filtered2024.write.format("delta").mode("overwrite").option("mergeSchema", "true").save(generate_path('g-model-madrid-min-features-2024', 'goldlayer'))

df_selected_2018_renamed.write.format("delta").mode("overwrite").option("mergeSchema", "true").save(generate_path('g-model-madrid-min-features-2018', 'goldlayer'))
"""

In [0]:
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
import mlflow
import mlflow.spark

# Predictor columns used by the model.
feature_cols = [
    'bathrooms', 'isparking', 'latitude', 'longitude',
    'rooms', 'size', 'year', 'quarter',
]

# Pack the predictors into a single vector column; rows with invalid feature
# values are filtered out (handleInvalid='skip').
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol='features',
    handleInvalid='skip',
)

# Prepare the modelling frame: (re)derive the parking flag, then keep only the
# feature vector and the label.
parking_flag = col('isparkingspaceincludedinprice').cast('int')
df_cleaned = df_cleaned.withColumn('isparking', parking_flag)
df_prepared = assembler.transform(df_cleaned).select('features', 'price')

# 80% / 20% train/test split with a fixed seed.
train_data, test_data = df_prepared.randomSplit([0.8, 0.2], seed=42)

# Linear regression estimator predicting `price` from `features`.
lr = LinearRegression(
    featuresCol='features',
    labelCol='price',
    predictionCol='prediction',
)

with mlflow.start_run(run_name="regresion_lineal_v1") as run:
    # Fit on the training split and score the held-out split.
    lr_model = lr.fit(train_data)
    predictions = lr_model.transform(test_data)

    # Root mean squared error on the test split.
    evaluator = RegressionEvaluator(
        labelCol='price',
        predictionCol='prediction',
        metricName='rmse',
    )
    rmse = evaluator.evaluate(predictions)

    # Record the metric and the run parameters in MLflow.
    mlflow.log_metric("rmse", rmse)
    mlflow.log_param("seed", 42)
    mlflow.log_param("feature_cols", feature_cols)

    # Store the fitted model as a run artifact.
    mlflow.spark.log_model(lr_model, "linear-regression-model")

    print(f"RMSE: {rmse}")

In [0]:
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
import mlflow
import mlflow.spark

# Predictor columns used by the model.
feature_cols = [
    'bathrooms', 'isparking', 'latitude', 'longitude',
    'rooms', 'size', 'year', 'quarter',
]

# Pack the predictors into a single vector column; rows with invalid feature
# values are filtered out (handleInvalid='skip').
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol='features',
    handleInvalid='skip',
)

# Prepare the modelling frame: (re)derive the parking flag, then keep only the
# feature vector and the label.
parking_flag = col('isparkingspaceincludedinprice').cast('int')
df_cleaned = df_cleaned.withColumn('isparking', parking_flag)
df_prepared = assembler.transform(df_cleaned).select('features', 'price')

# 80% / 20% train/test split with a fixed seed.
train_data, test_data = df_prepared.randomSplit([0.8, 0.2], seed=42)

# Linear regression estimator predicting `price` from `features`.
lr = LinearRegression(
    featuresCol='features',
    labelCol='price',
    predictionCol='prediction',
)

with mlflow.start_run(run_name="regresion_lineal_v2") as run:
    # Fit on the training split and score the held-out split.
    lr_model = lr.fit(train_data)
    predictions = lr_model.transform(test_data)

    # One evaluator per metric: RMSE, MAE and R2, all on the test split.
    evaluator_rmse = RegressionEvaluator(labelCol='price', predictionCol='prediction', metricName='rmse')
    rmse = evaluator_rmse.evaluate(predictions)

    evaluator_mae = RegressionEvaluator(labelCol='price', predictionCol='prediction', metricName='mae')
    mae = evaluator_mae.evaluate(predictions)

    evaluator_r2 = RegressionEvaluator(labelCol='price', predictionCol='prediction', metricName='r2')
    r2 = evaluator_r2.evaluate(predictions)

    # Record the metrics and the run parameters in MLflow.
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)
    mlflow.log_param("seed", 42)
    mlflow.log_param("feature_cols", feature_cols)

    # Store the fitted model as a run artifact.
    mlflow.spark.log_model(lr_model, "linear-regression-model")

    print(f"RMSE: {rmse}")
    print(f"MAE: {mae}")
    print(f"R2: {r2}")

In [0]:
from pyspark.ml.regression import DecisionTreeRegressor

# Decision-tree regressor predicting `price` from the assembled `features`.
# Uses the train/test split produced by an earlier cell.
dt = DecisionTreeRegressor(
    featuresCol='features',
    labelCol='price',
)

# Fit on the training split, then score the held-out split.
dt_model = dt.fit(train_data)
predictions = dt_model.transform(test_data)

# Report the test-set RMSE.
evaluator = RegressionEvaluator(
    labelCol='price',
    predictionCol='prediction',
    metricName='rmse',
)
rmse = evaluator.evaluate(predictions)
print(f"RMSE: {rmse}")

In [0]:
from pyspark.ml.regression import RandomForestRegressor

# Random-forest regressor with 100 trees predicting `price` from `features`.
# Uses the train/test split produced by an earlier cell.
rf = RandomForestRegressor(
    featuresCol='features',
    labelCol='price',
    numTrees=100,
)

# Fit on the training split, then score the held-out split.
rf_model = rf.fit(train_data)
predictions = rf_model.transform(test_data)

# Report the test-set RMSE.
evaluator = RegressionEvaluator(
    labelCol='price',
    predictionCol='prediction',
    metricName='rmse',
)
rmse = evaluator.evaluate(predictions)
print(f"RMSE: {rmse}")

In [0]:
from pyspark.ml.regression import GBTRegressor

# Gradient-boosted trees regressor (100 iterations, depth 5).
# Despite the `xgb` naming, this is Spark ML's GBTRegressor, not the XGBoost
# library.
xgb = GBTRegressor(
    featuresCol='features',
    labelCol='price',
    maxIter=100,
    maxDepth=5,
)

# Fit on the training split, then score the held-out split.
xgb_model = xgb.fit(train_data)
predictions = xgb_model.transform(test_data)

# Report the test-set RMSE.
evaluator = RegressionEvaluator(
    labelCol='price',
    predictionCol='prediction',
    metricName='rmse',
)
rmse = evaluator.evaluate(predictions)
print(f"RMSE: {rmse}")

In [0]:
from pyspark.ml.regression import GBTRegressor

# Gradient-boosted trees regressor (Spark ML GBTRegressor, 100 iterations,
# depth 5) predicting `price` from `features`.
gb = GBTRegressor(
    featuresCol='features',
    labelCol='price',
    maxIter=100,
    maxDepth=5,
)

# Fit on the training split, then score the held-out split.
gb_model = gb.fit(train_data)
predictions = gb_model.transform(test_data)

# Report the test-set RMSE.
evaluator = RegressionEvaluator(
    labelCol='price',
    predictionCol='prediction',
    metricName='rmse',
)
rmse = evaluator.evaluate(predictions)
print(f"RMSE: {rmse}")

In [0]:
# Candidate regressors, as (display name, unfitted estimator) pairs.
# 'XGBoost' and 'GradientBoosting' are both Spark GBTRegressor instances with
# identical settings, so the same configuration is trained twice.
models = [
    ('DecisionTree', DecisionTreeRegressor(featuresCol='features', labelCol='price')),
    ('RandomForest', RandomForestRegressor(featuresCol='features', labelCol='price', numTrees=100)),
    ('XGBoost', GBTRegressor(featuresCol='features', labelCol='price', maxIter=100, maxDepth=5)),
    ('GradientBoosting', GBTRegressor(featuresCol='features', labelCol='price', maxIter=100, maxDepth=5)),
]

# Test-set score per model name.
results = {}

# Fit each candidate on the training split and score it on the held-out split.
# `evaluator` is not created here: it is the RMSE evaluator left over from a
# previous cell.
for name, model in models:
    model_model = model.fit(train_data)
    predictions = model_model.transform(test_data)
    rmse = evaluator.evaluate(predictions)
    results[name] = rmse

# Print one "<name>: <score>" line per model.
for name, rmse in results.items():
    print(f"{name}: {rmse}")

In [0]:
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import mlflow
import mlflow.spark

# Predictor columns used by every model.
feature_cols = [
    'bathrooms', 'isparking', 'latitude', 'longitude',
    'rooms', 'size', 'year', 'quarter',
]

# Pack the predictors into a single vector column; rows with invalid feature
# values are filtered out (handleInvalid='skip').
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol='features',
    handleInvalid='skip',
)

# Prepare the modelling frame: (re)derive the parking flag, then keep only the
# feature vector and the label.
parking_flag = col('isparkingspaceincludedinprice').cast('int')
df_cleaned = df_cleaned.withColumn('isparking', parking_flag)
df_prepared = assembler.transform(df_cleaned).select('features', 'price')

# 80% / 20% train/test split with a fixed seed.
train_data, test_data = df_prepared.randomSplit([0.8, 0.2], seed=42)

# Candidate regressors, as (run name, unfitted estimator) pairs.
# The 'XGBoost' entry is Spark ML's GBTRegressor, not the XGBoost library.
models = [
    ('DecisionTree', DecisionTreeRegressor(featuresCol='features', labelCol='price')),
    ('RandomForest', RandomForestRegressor(featuresCol='features', labelCol='price', numTrees=100)),
    ('XGBoost', GBTRegressor(featuresCol='features', labelCol='price', maxIter=100, maxDepth=5)),
]

# Test-set RMSE per model name.
results = {}

# One MLflow run per candidate: fit, score, log metric/params/model.
for name, model in models:
    with mlflow.start_run(run_name=name) as run:
        # Fit on the training split and score the held-out split.
        model_model = model.fit(train_data)
        predictions = model_model.transform(test_data)

        # Root mean squared error on the test split.
        evaluator = RegressionEvaluator(
            labelCol='price',
            predictionCol='prediction',
            metricName='rmse',
        )
        rmse = evaluator.evaluate(predictions)

        # Record the metric and the run parameters in MLflow.
        mlflow.log_metric("rmse", rmse)
        mlflow.log_param("seed", 42)
        mlflow.log_param("feature_cols", feature_cols)

        # Store the fitted model as a run artifact named "<name>-model".
        mlflow.spark.log_model(model_model, f"{name}-model")

        # Keep the score for the summary printed after the loop.
        results[name] = rmse

# Print one "<name>: <rmse>" line per model.
for name, rmse in results.items():
    print(f"{name}: {rmse}")

In [0]:
# Persist the decision-tree model.
# `model_model` cannot be used here: after the comparison loop it holds the
# model fitted last (the gradient-boosted 'XGBoost' entry), so saving it would
# store the wrong model under the "DecisionTree-model" name. Fit the decision
# tree explicitly on the same training split instead.
decision_tree_model = DecisionTreeRegressor(
    featuresCol='features',
    labelCol='price',
).fit(train_data)

# overwrite() lets the cell be re-run without failing on an existing path.
# NOTE(review): later cells load from "/dbfs/path/to/DecisionTree-model" with
# PipelineModel — confirm the intended location and model class.
decision_tree_model.write().overwrite().save("DecisionTree-model")

In [0]:
# PipelineModel is otherwise only imported in a later cell, so import it here
# to keep this cell runnable on a fresh kernel.
from pyspark.ml import PipelineModel

# Load the persisted model.
# NOTE(review): the path looks like a placeholder and differs from the relative
# "DecisionTree-model" path used when saving; PipelineModel.load also expects a
# saved pipeline rather than a bare regression model — confirm both.
model = PipelineModel.load("/dbfs/path/to/DecisionTree-model")

In [0]:
# Importar las bibliotecas necesarias
from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel
from pyspark.ml.feature import VectorAssembler

# Get (or create) the Spark session; this rebinds the notebook's `spark` name.
spark = SparkSession.builder.appName("Predicción de precios").getOrCreate()

# Load the trained model from an absolute path.
# NOTE(review): the path looks like a placeholder — confirm the real location.
model = PipelineModel.load("/dbfs/path/to/DecisionTree-model")

# Feature columns, in the order the vector is assembled.
prediction_columns = [
    "bathrooms", "isparking", "latitude", "longitude",
    "rooms", "size", "year", "quarter",
]

# Single hand-written listing used as a smoke test.
sample_row = (2, 0, 40.4168, -3.7038, 4, 120, 2022, 3)
data_prueba = spark.createDataFrame([sample_row], prediction_columns)

# Assemble the raw columns into the `features` vector column.
assembler = VectorAssembler(inputCols=prediction_columns, outputCol="features")
data_prueba_transformada = assembler.transform(data_prueba)

# Score the sample and show the result.
predicciones = model.transform(data_prueba_transformada)
display(predicciones)

In [0]:
# Load the random-forest model and the gradient-boosted ("XGBoost") model,
# in that order.
# NOTE(review): nothing in the visible notebook saves to these relative paths;
# models with these names were only logged as MLflow artifacts — confirm where
# they are expected to live.
model_rf, model_xgb = (
    PipelineModel.load(model_dir)
    for model_dir in ("RandomForest-model", "XGBoost-model")
)

# Score the assembled sample row with each model.
predicciones_rf = model_rf.transform(data_prueba_transformada)
predicciones_xgb = model_xgb.transform(data_prueba_transformada)

# Print both prediction frames.
for scored in (predicciones_rf, predicciones_xgb):
    scored.show()

In [0]:
import mlflow

# Artifact URI of the model logged in MLflow (hard-coded to one specific
# experiment and run).
MODEL_URI = "dbfs:/databricks/mlflow-tracking/460952b6c02c4be8aa1a411e99c32040/0d97c667eaea40299b3b5a4a746a8bd2/artifacts/XGBoost-model"

# Download the model artifacts into a local directory.
mlflow.artifacts.download_artifacts(
    dst_path="./local_model",
    artifact_uri=MODEL_URI,
)

In [0]:
from mlflow.models import convert_input_example_to_serving_input, validate_serving_input

# Logged model to validate, addressed by run id and artifact name.
model_uri = 'runs:/0d97c667eaea40299b3b5a4a746a8bd2/XGBoost-model'

# Define INPUT_EXAMPLE via assignment with your own input example to the model
# A valid input example is a data instance suitable for pyfunc prediction
# NOTE(review): this is still an empty placeholder; validation is only
# meaningful once a real example is filled in.
INPUT_EXAMPLE = {
    # Add your input example here
}

# Convert the input example into a serving payload with MLflow's public
# helper. The private `_enforce_schema` used previously is a schema-enforcement
# helper, not a payload converter, so it was the wrong call here.
serving_payload = convert_input_example_to_serving_input(INPUT_EXAMPLE)

# Validate the serving payload works on the model
validate_serving_input(model_uri, serving_payload)