In [0]:
%run ../ConfigFolder/ConfigSAS

In [0]:
# Build the storage path for the combined Madrid feature table.
# `generate_path` is not defined in this cell (presumably provided by the %run'd config notebook).
model_combined_Path = generate_path(
    'g-model-madrid-min-features-combined',
    'goldlayer',
)

# Read the Delta table stored at that path into a Spark DataFrame.
df_model_combined = spark.read.load(model_combined_Path, format="delta")

# Render the loaded rows for a quick visual check.
display(df_model_combined)

In [0]:
from pyspark.sql.functions import col, substring

# Slice the `period` string into two helper columns (substring positions are 1-based):
#   year  -> characters 1-4
#   month -> up to two characters starting at position 6
# NOTE(review): the format of `period` is not visible here. If it is "YYYYMM", position 6
# yields only the last digit of the month — confirm this is the intended encoding.
df_model_combined = (
    df_model_combined
    .withColumn("year", substring(col("period"), 1, 4))
    .withColumn("month", substring(col("period"), 6, 2))
)

display(df_model_combined)

In [0]:
from pyspark.sql.functions import col, when

# Translate the `month` code into a quarter label ("1".."4").
# There is no otherwise() branch, so any code not listed below produces NULL.
# NOTE(review): "2" -> quarter "4" presumably matches the trailing digit of month 12 — confirm.
month_code = col("month")
quarter_label = (
    when(month_code == "3", "1")
    .when(month_code == "6", "2")
    .when(month_code == "9", "3")
    .when(month_code == "2", "4")
)

# Attach the quarter and discard the columns it was derived from.
df_model_combined = (
    df_model_combined
    .withColumn("quarter", quarter_label)
    .drop("month", "period")
)

display(df_model_combined)

In [0]:
from pyspark.sql.functions import col

# (output column, source column, Spark SQL type), applied in this order.
# Only `isparking` introduces a new column; every other entry casts a column in place.
column_casts = [
    ('bathrooms', 'bathrooms', 'int'),
    ('isparking', 'isparkingspaceincludedinprice', 'int'),
    ('latitude', 'latitude', 'float'),
    ('longitude', 'longitude', 'float'),
    ('price', 'price', 'float'),
    ('rooms', 'rooms', 'int'),
    ('size', 'size', 'float'),
    ('year', 'year', 'int'),
    ('quarter', 'quarter', 'int'),
]

# Start from the combined frame and apply each cast in turn.
df_cleaned = df_model_combined
for target_name, source_name, spark_type in column_casts:
    df_cleaned = df_cleaned.withColumn(target_name, col(source_name).cast(spark_type))

# Print the schema to confirm the resulting data types.
df_cleaned.printSchema()


In [0]:
# Render df_cleaned to inspect its rows.
display(df_cleaned)

In [0]:
# NOTE(review): the triple-quoted string below is never executed as code; it only holds the
# text of three Delta overwrite-writes. They reference df_combined, df_filtered2024 and
# df_selected_2018_renamed, none of which are defined in the visible cells of this notebook.
"""
df_combined.write.format("delta").mode("overwrite").option("mergeSchema", "true").save(generate_path('g-model-madrid-min-features-combined', 'goldlayer'))

df_filtered2024.write.format("delta").mode("overwrite").option("mergeSchema", "true").save(generate_path('g-model-madrid-min-features-2024', 'goldlayer'))

df_selected_2018_renamed.write.format("delta").mode("overwrite").option("mergeSchema", "true").save(generate_path('g-model-madrid-min-features-2018', 'goldlayer'))
"""

In [0]:
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
import mlflow
import mlflow.spark

# Derive the integer parking flag from `isparkingspaceincludedinprice`.
# The casting cell already creates this column; repeating it here is idempotent.
df_cleaned = df_cleaned.withColumn('isparking', col('isparkingspaceincludedinprice').cast('int'))

# Columns used as model features; their order defines the layout of the feature vector.
feature_cols = ['bathrooms', 'isparking', 'latitude', 'longitude', 'rooms', 'size', 'year', 'quarter']

# Assemble the features into a single vector column.
# handleInvalid='skip' filters out rows that have an invalid (null/NaN) value in any feature.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features', handleInvalid='skip')
df_prepared = assembler.transform(df_cleaned).select('features', 'price')

# Split the data into training and test sets (80% / 20%) with a fixed seed.
train_data, test_data = df_prepared.randomSplit([0.8, 0.2], seed=42)

# Create the linear regression estimator.
lr = LinearRegression(featuresCol='features', labelCol='price', predictionCol='prediction')

# Train the model.
lr_model = lr.fit(train_data)

# Predict on the held-out test set.
predictions = lr_model.transform(test_data)

# Evaluate the model with RMSE (root mean squared error).
evaluator = RegressionEvaluator(labelCol='price', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(predictions)
print(f"RMSE: {rmse}")

# Log the metric and the model inside an explicit MLflow run so that both land in the same
# run and the run is closed when the block exits (instead of leaving an implicit run open).
# NOTE(review): only the fitted regression model is logged, so at inference time it expects
# an already-assembled `features` vector column; consider logging a Pipeline that also
# contains the assembler if raw columns must be scored.
with mlflow.start_run() as run:
    mlflow.log_metric("rmse", rmse)
    mlflow.spark.log_model(lr_model, "linear-regression-model")

# Surface the run id; it identifies the logged model as runs:/<run id>/linear-regression-model.
print(f"MLflow run id: {run.info.run_id}")

In [0]:
import mlflow

# Register the model logged by the most recent MLflow run of this session.
# The run id is looked up at runtime rather than typed in by hand; a literal "<RUN_ID>"
# placeholder is not a valid run id and makes registration fail.
# mlflow.last_active_run() returns the currently active run if there is one, otherwise the
# last run that finished in this session (requires MLflow >= 1.25).
last_run = mlflow.last_active_run()
if last_run is None:
    raise RuntimeError("No MLflow run found in this session; run the training cell first.")

model_uri = f"runs:/{last_run.info.run_id}/linear-regression-model"
mlflow.register_model(model_uri, "RealEstatePricePrediction")


In [0]:
%sh
# Start serving the registered model RealEstatePricePrediction on port 5000.
# This is a shell command, so it is run through the %sh magic; as a plain Python cell it
# would be a SyntaxError.
# NOTE(review): confirm that the installed Databricks CLI provides a `models serve`
# subcommand. The MLflow CLI alternative is
# `mlflow models serve -m models:/RealEstatePricePrediction/<version> -p 5000`.
# NOTE(review): a serving process presumably keeps this cell running until it is cancelled.
databricks models serve --model-name RealEstatePricePrediction --port 5000


In [0]:
%sh
# Send one example record to the prediction endpoint on port 5000.
# This is a shell command, so it is run through the %sh magic; as a plain Python cell it
# would be a SyntaxError.
# The host comes from ENDPOINT_HOST (default: localhost) and the URL is quoted, because an
# unquoted `<...>` placeholder would be parsed by the shell as input/output redirection.
ENDPOINT_HOST="${ENDPOINT_HOST:-localhost}"

# Values follow the training feature order:
#   bathrooms, isparking, latitude, longitude, rooms, size, year, quarter
# The model is trained on these eight features, so year and quarter are included.
# NOTE(review): the `/predictions` path and the `features` key depend on the serving
# implementation (MLflow's scoring server uses `/invocations`) — confirm against the endpoint.
curl -X POST -H "Content-Type: application/json" \
     --data '{"features": [3, 1, 40.454, -3.651, 5, 375, 2024, 1]}' \
     "http://${ENDPOINT_HOST}:5000/predictions"
