In [1]:
# pip install optuna

In [2]:
# pip install xgboost

In [3]:
# pip install xgboost4j-spark

In [4]:
# Evita erros nas iterações: 
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [5]:
# inicializa sessão no spark com nome importação
import findspark, pyspark
from pyspark.sql import SparkSession
findspark.init() # inicializa interface pyspark jupyter
spark = SparkSession.builder.appName("XGBoostImob").getOrCreate()
#inicializa sessão no spark com nome importação

In [6]:
# Imports Chat GPT:
import optuna
import pandas as pd
import numpy as np
from pyspark.ml.feature import VectorAssembler
#from pyspark.ml.classification import XGBoostClassifier
from xgboost.spark import SparkXGBRegressor
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import RFormula
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import RFormula
from pyspark.sql.functions import col,isnan,when,count,countDistinct
from pyspark.sql.types import DoubleType

In [7]:
# Carregando dados:
casas = spark.read.csv("Housing.csv", header=True, inferSchema=True, sep=",")
casas.show(5)

+--------+----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+
|   price|area|bedrooms|bathrooms|stories|mainroad|guestroom|basement|hotwaterheating|airconditioning|parking|prefarea|furnishingstatus|
+--------+----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+
|13300000|7420|       4|        2|      3|     yes|       no|      no|             no|            yes|      2|     yes|       furnished|
|12250000|8960|       4|        4|      4|     yes|       no|      no|             no|            yes|      3|      no|       furnished|
|12250000|9960|       3|        2|      2|     yes|       no|     yes|             no|             no|      2|     yes|  semi-furnished|
|12215000|7500|       4|        2|      2|     yes|       no|     yes|             no|            yes|      3|     yes|       furnished|
|11410000|7420|       4|        1|      2

In [8]:
# Schema:
# casas.printSchema()

In [9]:
# Descritiva:
# casas.summary().show(vertical=True)

In [10]:
# Contagemde nulos e vazios ou Nan:
# casas_nulls = casas.select([count(when(col(c).contains('None') | \
#                             col(c).contains('NULL') | \
#                             (col(c) == '' ) | \
#                             col(c).isNull() | \
#                             isnan(c), c 
#                            )).alias(c)
#                     for c in casas.columns])
# casas_nulls.show()

In [11]:
# Contagem de categorias por coluna:
# casas.agg(*(countDistinct(col(c)).alias(c) for c in casas.columns)).show(vertical=True)

In [12]:
# Instancia o RFormula: Vetorização das preditoras (independentes = features):
# Rformula = RFormula(formula="price ~ .", featuresCol="independente", labelCol="dependente")
# casasrf = Rformula.fit(casas).transform(casas)
# casasrf.select("independente","dependente").show(5, truncate=False)

In [13]:
# Seleção de atributos com UnivariateFeatureSelector
# from pyspark.ml.feature import UnivariateFeatureSelector
# selector = UnivariateFeatureSelector(featuresCol= "independente", outputCol="selecionados", 
#                                      labelCol="dependente")
# selector.setFeatureType("categorical").setLabelType("categorical").setSelectionThreshold(5)
# casasuni_modelo = selector.fit(casasrf)
# features_selecionadas = casasuni_modelo.selectedFeatures
# nomes_features_selecionadas = [casasrf.columns[i] for i in features_selecionadas]
# print("Features selecionadas:",nomes_features_selecionadas)
# casasuni = casasuni_modelo.transform(casasrf)
# casasuni.select("selecionados").show(5, truncate=False)

In [14]:
# Instancia o RFormula: Vetorização das preditoras (independentes = features):
# from pyspark.ml.feature import RFormula
rformula = RFormula(formula= "price ~ bedrooms + bathrooms + mainroad + parking + hotwaterheating", 
                    featuresCol="independente", labelCol="dependente")
casasrffeatselec = rformula.fit(casas).transform(casas)

In [15]:
# Define a métrica de avaliação:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(labelCol="dependente",predictionCol="prediction", metricName="rmse")

In [16]:
# Divide em treino e teste:
casasTreino, casasTeste = casas.randomSplit([0.7,0.3], seed=42)

In [17]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml import Pipeline

In [18]:
# Função de otimização para o Optuna
def objective(trial):
        # Hiperparâmetros a serem otimizados
        params = {
            'maxDepth': trial.suggest_int('maxDepth', 3, 10),
            'maxBins': trial.suggest_int('maxBins', 32, 128),
            'maxIter': trial.suggest_int('numTrees', 50, 300),
            'stepSize': trial.suggest_float('stepSize', 0.01, 0.3, log=True)  # Substitui learningRate por stepSize,
        }
        # Defina os hiperparâmetros no modelo GBTRegressor
        gbt = GBTRegressor(**params, featuresCol="independente", labelCol="dependente")

        # Defina o modelo no pipeline
        pipeline = Pipeline(stages=[rformula, gbt])

        # Crie um grid de hiperparâmetros para a validação cruzada
        param_grid = ParamGridBuilder().addGrid(gbt.maxDepth, 
                                                [params['maxDepth']]).addGrid(gbt.maxBins, 
                                                                              [params['maxBins']]).build()
        # Crie um validador cruzado
        cv = CrossValidator(estimator=pipeline, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)

        # Ajuste o modelo aos dados de treinamento usando a validação cruzada
        model = cv.fit(casasTreino)

        # Faça previsões no conjunto de teste
        test_predictions = model.transform(casasTeste)

        # Calcule a métrica de desempenho (RMSE)
        rmse = evaluator.evaluate(test_predictions)
        return rmse

In [19]:
# Crie um estudo Optuna para otimização de hiperparâmetros
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

[I 2023-11-02 23:59:32,213] A new study created in memory with name: no-name-1e27ca75-8c02-444e-9657-1bb1ab8baee3
[I 2023-11-03 00:00:51,442] Trial 0 finished with value: 1434307.5120519768 and parameters: {'maxDepth': 10, 'maxBins': 63, 'numTrees': 104, 'stepSize': 0.015010827136459055}. Best is trial 0 with value: 1434307.5120519768.
[I 2023-11-03 00:01:53,826] Trial 1 finished with value: 1434307.512051977 and parameters: {'maxDepth': 10, 'maxBins': 82, 'numTrees': 92, 'stepSize': 0.049582937917819855}. Best is trial 0 with value: 1434307.5120519768.
[I 2023-11-03 00:04:29,806] Trial 2 finished with value: 1434034.0147744592 and parameters: {'maxDepth': 3, 'maxBins': 44, 'numTrees': 235, 'stepSize': 0.27190192769344984}. Best is trial 2 with value: 1434034.0147744592.
[I 2023-11-03 00:07:20,024] Trial 3 finished with value: 1434307.5120519765 and parameters: {'maxDepth': 9, 'maxBins': 105, 'numTrees': 220, 'stepSize': 0.07958625435438539}. Best is trial 2 with value: 1434034.0147744

[W 2023-11-03 00:31:55,812] Trial 24 failed with value None.
[W 2023-11-03 00:31:57,862] Trial 25 failed with parameters: {'maxDepth': 3, 'maxBins': 91, 'numTrees': 293, 'stepSize': 0.011023395745164767} because of the following error: The value None could not be cast to float..
[W 2023-11-03 00:31:57,862] Trial 25 failed with value None.
[W 2023-11-03 00:31:59,895] Trial 26 failed with parameters: {'maxDepth': 3, 'maxBins': 94, 'numTrees': 295, 'stepSize': 0.011742040021247156} because of the following error: The value None could not be cast to float..
[W 2023-11-03 00:31:59,895] Trial 26 failed with value None.
[W 2023-11-03 00:32:01,945] Trial 27 failed with parameters: {'maxDepth': 3, 'maxBins': 87, 'numTrees': 298, 'stepSize': 0.010095489563754565} because of the following error: The value None could not be cast to float..
[W 2023-11-03 00:32:01,947] Trial 27 failed with value None.
[W 2023-11-03 00:32:03,993] Trial 28 failed with parameters: {'maxDepth': 3, 'maxBins': 93, 'numTre

[W 2023-11-03 00:32:57,237] Trial 54 failed with value None.
[W 2023-11-03 00:32:59,287] Trial 55 failed with parameters: {'maxDepth': 3, 'maxBins': 87, 'numTrees': 291, 'stepSize': 0.011944754802956932} because of the following error: The value None could not be cast to float..
[W 2023-11-03 00:32:59,287] Trial 55 failed with value None.
[W 2023-11-03 00:33:01,337] Trial 56 failed with parameters: {'maxDepth': 3, 'maxBins': 90, 'numTrees': 299, 'stepSize': 0.01126494212455587} because of the following error: The value None could not be cast to float..
[W 2023-11-03 00:33:01,337] Trial 56 failed with value None.
[W 2023-11-03 00:33:03,386] Trial 57 failed with parameters: {'maxDepth': 3, 'maxBins': 96, 'numTrees': 291, 'stepSize': 0.010562542635474551} because of the following error: The value None could not be cast to float..
[W 2023-11-03 00:33:03,387] Trial 57 failed with value None.
[W 2023-11-03 00:33:05,436] Trial 58 failed with parameters: {'maxDepth': 3, 'maxBins': 92, 'numTree

[W 2023-11-03 00:33:58,713] Trial 84 failed with value None.
[W 2023-11-03 00:34:00,762] Trial 85 failed with parameters: {'maxDepth': 3, 'maxBins': 93, 'numTrees': 56, 'stepSize': 0.011385258848140855} because of the following error: The value None could not be cast to float..
[W 2023-11-03 00:34:00,763] Trial 85 failed with value None.
[W 2023-11-03 00:34:02,812] Trial 86 failed with parameters: {'maxDepth': 3, 'maxBins': 98, 'numTrees': 51, 'stepSize': 0.010342221330436741} because of the following error: The value None could not be cast to float..
[W 2023-11-03 00:34:02,812] Trial 86 failed with value None.
[W 2023-11-03 00:34:04,862] Trial 87 failed with parameters: {'maxDepth': 3, 'maxBins': 90, 'numTrees': 300, 'stepSize': 0.012098024164966024} because of the following error: The value None could not be cast to float..
[W 2023-11-03 00:34:04,862] Trial 87 failed with value None.
[W 2023-11-03 00:34:06,895] Trial 88 failed with parameters: {'maxDepth': 3, 'maxBins': 94, 'numTrees

In [20]:
# Obtendo os melhores hiperparâmetros:
best_params = study.best_params

In [21]:
# Defina os hiperparâmetros otimizados no modelo GBTRegressor
final_gbt = GBTRegressor(maxDepth=best_params['maxDepth'], maxBins=best_params['maxBins'], maxIter=best_params['maxIter'], stepSize=best_params['stepSize'])

KeyError: 'maxIter'

In [None]:
# Defina o modelo no pipeline
pipeline.setStages([rformula, final_gbt])

In [None]:
# Ajuste o modelo aos dados de treinamento
final_model = pipeline.fit(casasTreino)

In [None]:
# Faça previsões no conjunto de teste
test_predictions = final_model.transform(casasTeste)

In [None]:
# Avalie o modelo final (por exemplo, RMSE)
rmse = evaluator.evaluate(test_predictions)

In [None]:
# Imprima os melhores hiperparâmetros e métricas de desempenho
print("Melhores hiperparâmetros:", best_params)
print("RMSE:", rmse)