In [3]:
# Carregando libs e iniciando Session
from pyspark.sql import SparkSession, functions as func
from pyspark.sql.types import *
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler

spark = SparkSession.builder.getOrCreate()

In [5]:
# Importando dados
carros_t = spark.read.csv('../arquivos/Carros.csv', inferSchema=True, header=True, sep=';')
carros_t.show()

                                                                                

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
|    181|        6|        225|            276| 346| 2022|        1|          0|      3|          1|105|
|    143|        8|        360|            321| 357| 15

A vari'avel dependente 'e consumo (HP)

In [16]:
# Separando variaveis relevantes
carros = carros_t.select('Consumo','Cilindros','Cilindradas','HP')
carros.show(5)

+-------+---------+-----------+---+
|Consumo|Cilindros|Cilindradas| HP|
+-------+---------+-----------+---+
|     21|        6|        160|110|
|     21|        6|        160|110|
|    228|        4|        108| 93|
|    214|        6|        258|110|
|    187|        8|        360|175|
+-------+---------+-----------+---+
only showing top 5 rows



In [17]:
# Criando um vetor com as variaveis preditoras
vetor_preditoras = VectorAssembler(inputCols=[('Consumo'),('Cilindros'),('Cilindradas')], outputCol=('Preditoras'))

In [18]:
# Adicionando o vetor
carros = vetor_preditoras.transform(carros)
carros.show(5)

+-------+---------+-----------+---+-----------------+
|Consumo|Cilindros|Cilindradas| HP|       Preditoras|
+-------+---------+-----------+---+-----------------+
|     21|        6|        160|110| [21.0,6.0,160.0]|
|     21|        6|        160|110| [21.0,6.0,160.0]|
|    228|        4|        108| 93|[228.0,4.0,108.0]|
|    214|        6|        258|110|[214.0,6.0,258.0]|
|    187|        8|        360|175|[187.0,8.0,360.0]|
+-------+---------+-----------+---+-----------------+
only showing top 5 rows



In [23]:
# Divisao entre teste e treino
carros_treino, carros_teste = carros.randomSplit([0.7, 0.3])

In [27]:
# Criando o modelo de Regressao linear
regressor = LinearRegression(featuresCol = 'Preditoras', labelCol = 'HP')
modelo = regressor.fit(carros_treino)

23/02/10 07:25:15 WARN Instrumentation: [1b900761] regParam is zero, which might cause numerical instability and overfitting.


[Stage 20:>                                                         (0 + 1) / 1]                                                                                

In [46]:
# Obtendo previsoes
previsao = modelo.transform(carros_teste)
previsao.show(5)

+-------+---------+-----------+---+------------------+------------------+
|Consumo|Cilindros|Cilindradas| HP|        Preditoras|        prediction|
+-------+---------+-----------+---+------------------+------------------+
|     15|        8|        301|335|  [15.0,8.0,301.0]|199.87491675613677|
|    152|        8|       2758|180|[152.0,8.0,2758.0]| 190.6682678895501|
|    155|        8|        318|150| [155.0,8.0,318.0]|201.86067505493907|
|    158|        8|        351|264| [158.0,8.0,351.0]|201.75411670293704|
|    173|        8|       2758|180|[173.0,8.0,2758.0]|190.97778240754846|
+-------+---------+-----------+---+------------------+------------------+
only showing top 5 rows



In [47]:
# Avaliacao das previsoes
avaliador_rl = RegressionEvaluator(predictionCol='prediction', labelCol='HP', metricName='rmse')
rmse = avaliador_rl.evaluate(previsao)
print(rmse)

53.527641650915655


In [61]:
# Criando o modelo de Regressao Random Forest
regressor2 = RandomForestRegressor(featuresCol = 'Preditoras', labelCol = 'HP')
modelo2 = regressor2.fit(carros_treino)

23/02/10 07:41:49 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 22 (= number of training instances)


In [62]:
# Obtendo previsoes
previsao2 = modelo2.transform(carros_teste)
previsao2.show(5)



+-------+---------+-----------+---+------------------+-----------------+
|Consumo|Cilindros|Cilindradas| HP|        Preditoras|       prediction|
+-------+---------+-----------+---+------------------+-----------------+
|     15|        8|        301|335|  [15.0,8.0,301.0]|188.4892857142857|
|    152|        8|       2758|180|[152.0,8.0,2758.0]|          165.285|
|    155|        8|        318|150| [155.0,8.0,318.0]|        171.70625|
|    158|        8|        351|264| [158.0,8.0,351.0]|        184.20625|
|    173|        8|       2758|180|[173.0,8.0,2758.0]|          161.035|
+-------+---------+-----------+---+------------------+-----------------+
only showing top 5 rows



In [66]:
# Avaliacao das previsoes
avaliador_rf = RegressionEvaluator(predictionCol='prediction', labelCol='HP', metricName='rmse')
rmse2 = avaliador_rf.evaluate(previsao2)
print(rmse)

53.527641650915655
