In [0]:
%run ../ConfigFolder/ConfigSAS

# Notebook Description

This notebook prepares the combined Madrid property dataset (gold layer) and builds regression models that predict `price`. It derives `year` and `quarter` from the `period` column, casts the feature columns to numeric types, and trains linear regression, decision tree, random forest, and gradient-boosted tree models, comparing them by RMSE (plus MAE and R² for the linear model). MLflow is used for experiment tracking and model management.

In [0]:
# Build the storage path of the combined feature table.
# (generate_path is defined outside this cell.)
model_combined_Path = generate_path(
    'g-model-madrid-min-features-combined',
    'goldlayer',
)

# Read the Delta table stored at that path into a Spark DataFrame.
df_model_combined = spark.read.load(model_combined_Path, format="delta")

# Render the loaded rows for a quick visual check.
display(df_model_combined)

In [0]:
from pyspark.sql.functions import col, substring

# Derive the string columns 'year' and 'month' from 'period'.
#
# The chain is wrapped in parentheses instead of using a backslash continuation:
# a comment line placed after a trailing backslash ends the statement, which
# turns the following indented `.withColumn(...)` into a syntax error.
df_model_combined = (
    df_model_combined
    # Extract the year: the first four characters of 'period'.
    .withColumn("year", substring(col("period"), 1, 4))
    # Extract the month: up to two characters starting at (1-based) position 6.
    # NOTE(review): the start position depends on the 'period' format, which is
    # not visible here — confirm it matches the month values expected downstream.
    .withColumn("month", substring(col("period"), 6, 2))
)

display(df_model_combined)

In [0]:
from pyspark.sql.functions import when

# Lookup from 'month' string to 'quarter' label. Months that are not listed get
# a null quarter because the when-chain below has no otherwise() branch.
# NOTE(review): the keys are single characters (e.g. "2" -> quarter "4"), which
# presumably relies on how 'month' was sliced from 'period' — confirm against
# the real 'period' format.
month_to_quarter = {"3": "1", "6": "2", "9": "3", "2": "4"}

# Fold the lookup into one chained when(...).when(...) expression, in order.
quarter_expr = None
for month_value, quarter_label in month_to_quarter.items():
    is_month = col("month") == month_value
    if quarter_expr is None:
        quarter_expr = when(is_month, quarter_label)
    else:
        quarter_expr = quarter_expr.when(is_month, quarter_label)

# Attach 'quarter', then discard the 'month' and 'period' columns.
df_model_combined = df_model_combined.withColumn("quarter", quarter_expr).drop("month", "period")

display(df_model_combined)

In [0]:
from pyspark.sql.functions import col

# Cast plan: (target column, source column, Spark type).
# Every entry overwrites a column in place except 'isparking', which is a new
# integer column derived from 'isparkingspaceincludedinprice'.
cast_plan = [
    ('bathrooms', 'bathrooms', 'int'),
    ('isparking', 'isparkingspaceincludedinprice', 'int'),
    ('latitude', 'latitude', 'float'),
    ('longitude', 'longitude', 'float'),
    ('price', 'price', 'float'),
    ('rooms', 'rooms', 'int'),
    ('size', 'size', 'float'),
    ('year', 'year', 'int'),
    ('quarter', 'quarter', 'int'),
]

# Apply the casts one by one, in the order listed above.
df_cleaned = df_model_combined
for target_name, source_name, spark_type in cast_plan:
    df_cleaned = df_cleaned.withColumn(target_name, col(source_name).cast(spark_type))

# Print the resulting schema so the converted types can be checked.
df_cleaned.printSchema()

In [0]:
# Render the type-converted DataFrame for visual inspection.
display(df_cleaned)

In [0]:
# Disabled code: the Delta write statements below are wrapped in a string
# literal, so nothing is written when this cell runs.
# NOTE(review): they reference df_combined, df_filtered2024 and
# df_selected_2018_renamed, none of which are defined in the visible cells —
# define them before re-enabling, or remove this cell.
"""
df_combined.write.format("delta").mode("overwrite").option("mergeSchema", "true").save(generate_path('g-model-madrid-min-features-combined', 'goldlayer'))

df_filtered2024.write.format("delta").mode("overwrite").option("mergeSchema", "true").save(generate_path('g-model-madrid-min-features-2024', 'goldlayer'))

df_selected_2018_renamed.write.format("delta").mode("overwrite").option("mergeSchema", "true").save(generate_path('g-model-madrid-min-features-2018', 'goldlayer'))
"""

In [0]:
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
import mlflow
import mlflow.spark

# Input columns for the model; 'price' is the regression label.
feature_cols = [
    'bathrooms', 'isparking', 'latitude', 'longitude',
    'rooms', 'size', 'year', 'quarter',
]

# Packs the feature columns into a single 'features' vector column.
# handleInvalid='skip' drops rows that contain invalid feature values.
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol='features',
    handleInvalid='skip',
)

# Recompute the integer parking flag, then keep only (features, price).
parking_flag = col('isparkingspaceincludedinprice').cast('int')
df_cleaned = df_cleaned.withColumn('isparking', parking_flag)
df_prepared = assembler.transform(df_cleaned).select('features', 'price')

# 80% / 20% train/test split with a fixed seed.
train_data, test_data = df_prepared.randomSplit([0.8, 0.2], seed=42)

# Linear regression estimator predicting 'price' from 'features'.
lr = LinearRegression(
    featuresCol='features',
    labelCol='price',
    predictionCol='prediction',
)

with mlflow.start_run(run_name="regresion_lineal_v1") as run:
    # Fit on the training split and score the held-out split.
    lr_model = lr.fit(train_data)
    predictions = lr_model.transform(test_data)

    # Root Mean Squared Error on the test predictions.
    evaluator = RegressionEvaluator(metricName='rmse', labelCol='price', predictionCol='prediction')
    rmse = evaluator.evaluate(predictions)

    # Record the metric, then the run parameters, in MLflow.
    mlflow.log_metric("rmse", rmse)
    for param_name, param_value in (("seed", 42), ("feature_cols", feature_cols)):
        mlflow.log_param(param_name, param_value)

    # Store the fitted Spark model as a run artifact.
    mlflow.spark.log_model(lr_model, "linear-regression-model")

    print(f"RMSE: {rmse}")

In [0]:
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
import mlflow
import mlflow.spark

# Input columns for the model; 'price' is the regression label.
feature_cols = [
    'bathrooms', 'isparking', 'latitude', 'longitude',
    'rooms', 'size', 'year', 'quarter',
]

# Packs the feature columns into a single 'features' vector column.
# handleInvalid='skip' drops rows that contain invalid feature values.
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol='features',
    handleInvalid='skip',
)

# Recompute the integer parking flag, then keep only (features, price).
parking_flag = col('isparkingspaceincludedinprice').cast('int')
df_cleaned = df_cleaned.withColumn('isparking', parking_flag)
df_prepared = assembler.transform(df_cleaned).select('features', 'price')

# 80% / 20% train/test split with a fixed seed.
train_data, test_data = df_prepared.randomSplit([0.8, 0.2], seed=42)

# Linear regression estimator predicting 'price' from 'features'.
lr = LinearRegression(
    featuresCol='features',
    labelCol='price',
    predictionCol='prediction',
)

with mlflow.start_run(run_name="regresion_lineal_v2") as run:
    # Fit on the training split and score the held-out split.
    lr_model = lr.fit(train_data)
    predictions = lr_model.transform(test_data)

    # One evaluator per metric, created in the order rmse, mae, r2.
    evaluator_rmse, evaluator_mae, evaluator_r2 = (
        RegressionEvaluator(labelCol='price', predictionCol='prediction', metricName=metric)
        for metric in ('rmse', 'mae', 'r2')
    )

    rmse = evaluator_rmse.evaluate(predictions)
    mae = evaluator_mae.evaluate(predictions)
    r2 = evaluator_r2.evaluate(predictions)

    # Record the three metrics, then the run parameters, in MLflow.
    for metric_name, metric_value in (("rmse", rmse), ("mae", mae), ("r2", r2)):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.log_param("seed", 42)
    mlflow.log_param("feature_cols", feature_cols)

    # Store the fitted Spark model as a run artifact.
    mlflow.spark.log_model(lr_model, "linear-regression-model")

    print(f"RMSE: {rmse}")
    print(f"MAE: {mae}")
    print(f"R2: {r2}")

In [0]:
from pyspark.ml.regression import DecisionTreeRegressor

# RMSE scorer comparing 'prediction' against the 'price' label.
evaluator = RegressionEvaluator(metricName='rmse', labelCol='price', predictionCol='prediction')

# Decision tree regressor with default hyperparameters.
dt = DecisionTreeRegressor(labelCol='price', featuresCol='features')

# Fit on the training split, then score the held-out split.
dt_model = dt.fit(train_data)
predictions = dt_model.transform(test_data)

# Report the test-set error.
rmse = evaluator.evaluate(predictions)
print(f"RMSE: {rmse}")

In [0]:
from pyspark.ml.regression import RandomForestRegressor

# RMSE scorer comparing 'prediction' against the 'price' label.
evaluator = RegressionEvaluator(metricName='rmse', labelCol='price', predictionCol='prediction')

# Random forest regressor built from 100 trees.
rf = RandomForestRegressor(
    numTrees=100,
    labelCol='price',
    featuresCol='features',
)

# Fit on the training split, then score the held-out split.
rf_model = rf.fit(train_data)
predictions = rf_model.transform(test_data)

# Report the test-set error.
rmse = evaluator.evaluate(predictions)
print(f"RMSE: {rmse}")

In [0]:
from pyspark.ml.regression import GBTRegressor

# Create the gradient-boosted trees model (100 boosting iterations, max tree depth 5).
# Note: this is Spark ML's GBTRegressor (imported above from pyspark.ml.regression),
# not the XGBoost library; the `xgb` / `xgb_model` names are only labels.
xgb = GBTRegressor(featuresCol='features', labelCol='price', maxIter=100, maxDepth=5)

# Train the model on the training split
xgb_model = xgb.fit(train_data)

# Make predictions on the held-out test split
predictions = xgb_model.transform(test_data)

# Evaluate the model with RMSE against the 'price' label
evaluator = RegressionEvaluator(labelCol='price', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(predictions)
print(f"RMSE: {rmse}")

In [0]:
from pyspark.ml.regression import GBTRegressor

# RMSE scorer comparing 'prediction' against the 'price' label.
evaluator = RegressionEvaluator(metricName='rmse', labelCol='price', predictionCol='prediction')

# Gradient-boosted trees regressor: 100 boosting iterations, max tree depth 5.
gb = GBTRegressor(
    maxDepth=5,
    maxIter=100,
    labelCol='price',
    featuresCol='features',
)

# Fit on the training split, then score the held-out split.
gb_model = gb.fit(train_data)
predictions = gb_model.transform(test_data)

# Report the test-set error.
rmse = evaluator.evaluate(predictions)
print(f"RMSE: {rmse}")

In [0]:
# Compare several tree-based regressors on the current train/test split by RMSE.

# Build the evaluator in this cell rather than relying on whichever earlier
# cell last assigned `evaluator`; the metric used here is then explicit.
evaluator = RegressionEvaluator(labelCol='price', predictionCol='prediction', metricName='rmse')

# Maps model label -> RMSE on the test split.
results = {}

# (label, unfitted estimator) pairs to train and evaluate.
# NOTE(review): 'XGBoost' and 'GradientBoosting' are both Spark ML GBTRegressor
# instances with identical settings, so the same configuration is trained twice;
# neither uses the XGBoost library. Consider dropping or renaming one entry.
models = [
    ('DecisionTree', DecisionTreeRegressor(featuresCol='features', labelCol='price')),
    ('RandomForest', RandomForestRegressor(featuresCol='features', labelCol='price', numTrees=100)),
    ('XGBoost', GBTRegressor(featuresCol='features', labelCol='price', maxIter=100, maxDepth=5)),
    ('GradientBoosting', GBTRegressor(featuresCol='features', labelCol='price', maxIter=100, maxDepth=5))
]

for name, model in models:
    # Fit on the training split and score the held-out split.
    model_model = model.fit(train_data)
    predictions = model_model.transform(test_data)
    rmse = evaluator.evaluate(predictions)
    results[name] = rmse

# Print the results
for name, rmse in results.items():
    print(f"{name}: {rmse}")

In [0]:
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import mlflow
import mlflow.spark

# Define the columns to use as features; 'price' is the regression label.
feature_cols = ['bathrooms', 'isparking', 'latitude', 'longitude', 'rooms', 'size', 'year', 'quarter']

# Assemble the features into a single vector; rows with invalid values are skipped.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features', handleInvalid='skip')

# Prepare the data: recompute the integer parking flag, keep only (features, price).
df_cleaned = df_cleaned.withColumn('isparking', col('isparkingspaceincludedinprice').cast('int'))
df_prepared = assembler.transform(df_cleaned).select('features', 'price')

# Split the data into training and testing sets (80% / 20%)
train_data, test_data = df_prepared.randomSplit([0.8, 0.2], seed=42)

# The evaluator's configuration does not depend on the model being scored,
# so it is built once here instead of once per loop iteration.
evaluator = RegressionEvaluator(labelCol='price', predictionCol='prediction', metricName='rmse')

# Maps model label -> RMSE on the test split.
results = {}

# (label, unfitted estimator) pairs; each one is tracked as its own MLflow run.
# The 'XGBoost' label is kept so existing run and artifact names stay stable,
# but the estimator is Spark ML's GBTRegressor, not the XGBoost library.
models = [
    ('DecisionTree', DecisionTreeRegressor(featuresCol='features', labelCol='price')),
    ('RandomForest', RandomForestRegressor(featuresCol='features', labelCol='price', numTrees=100)),
    ('XGBoost', GBTRegressor(featuresCol='features', labelCol='price', maxIter=100, maxDepth=5))
]

for name, model in models:
    with mlflow.start_run(run_name=name) as run:
        # Train the model
        model_model = model.fit(train_data)

        # Make predictions on the held-out split
        predictions = model_model.transform(test_data)

        # Evaluate the model
        rmse = evaluator.evaluate(predictions)

        # Log metrics and parameters in MLflow
        mlflow.log_metric("rmse", rmse)
        mlflow.log_param("seed", 42)
        mlflow.log_param("feature_cols", feature_cols)
        # Record the real estimator class so a run can be identified correctly
        # even when its label (e.g. 'XGBoost') does not match the algorithm.
        mlflow.log_param("model_class", type(model).__name__)

        # Log the model in MLflow
        mlflow.spark.log_model(model_model, f"{name}-model")

        # Store the results
        results[name] = rmse

# Note: after the loop, `model_model` holds only the last fitted model
# (the GBTRegressor), not the decision tree or the random forest.

# Print the results
for name, rmse in results.items():
    print(f"{name}: {rmse}")

In [0]:
# Save the trained decision tree model.
#
# After the comparison loop, `model_model` refers to the LAST model fitted
# (the GBTRegressor labelled 'XGBoost'), so saving it under "DecisionTree-model"
# would persist the wrong model. Fit the decision tree explicitly, with the same
# settings used in the comparison, so the saved artifact matches its name.
decision_tree_model = DecisionTreeRegressor(featuresCol='features', labelCol='price').fit(train_data)

# write().overwrite() lets this cell be re-run; a plain save() fails when the
# target path already exists.
decision_tree_model.write().overwrite().save("DecisionTree-model")