##### Fernando Amaral
##### RandomForest

In [None]:
# RandomForest Example

In [1]:
# findspark.init() locates the Spark installation and puts pyspark on sys.path,
# so it has to run BEFORE pyspark is imported; calling it afterwards is either
# redundant or too late (the import would already have failed).
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Reuse the active session if there is one, otherwise start a new one named
# "randomforest" for this notebook.
spark = SparkSession.builder.appName("randomforest").getOrCreate()

In [2]:
# Load the cars dataset: semicolon-delimited, first row holds the column
# names, and column types are inferred from the values.
# NOTE(review): the preview shows Consumo 21 next to 228 and Peso 262 next to
# 2875 -- looks like decimal separators were lost in the CSV; verify the file.
opcoes_leitura = {"header": True, "inferSchema": True, "sep": ";"}
carros = spark.read.options(**opcoes_leitura).csv("Carros.csv")
carros.show(n=5)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



In [3]:
from pyspark.ml.regression import RandomForestRegressor

In [4]:
# The regressor used below reads its inputs from a single vector column, so
# RFormula packs the three predictors into "independente" and exposes HP as
# the label column "dependente".
from pyspark.ml.feature import RFormula
Rformula = RFormula(
    formula="HP ~ Consumo + Cilindros + Cilindradas",
    featuresCol="independente",
    labelCol="dependente",
)
modelo_formula = Rformula.fit(carros)
carrosrf = modelo_formula.transform(carros)
# Preview only the two generated columns, without truncating the vectors.
carrosrf.select(["independente", "dependente"]).show(n=5, truncate=False)

+-----------------+----------+
|independente     |dependente|
+-----------------+----------+
|[21.0,6.0,160.0] |110.0     |
|[21.0,6.0,160.0] |110.0     |
|[228.0,4.0,108.0]|93.0      |
|[214.0,6.0,258.0]|110.0     |
|[187.0,8.0,360.0]|175.0     |
+-----------------+----------+
only showing top 5 rows



In [5]:
# data normalization
from pyspark.ml.feature import Normalizer
normalizador = Normalizer(inputCol="independente",outputCol="independentenorm", p=1.0)
carrosnorm = normalizador.transform(carrosrf)

In [6]:
# Show the raw feature vectors, the label and the normalized vectors side by side.
colunas_preview = ["independente", "dependente", "independentenorm"]
carrosnorm.select(*colunas_preview).show(n=5, truncate=False)

+-----------------+----------+-------------------------------------------------------------+
|independente     |dependente|independentenorm                                             |
+-----------------+----------+-------------------------------------------------------------+
|[21.0,6.0,160.0] |110.0     |[0.11229946524064172,0.03208556149732621,0.8556149732620321] |
|[21.0,6.0,160.0] |110.0     |[0.11229946524064172,0.03208556149732621,0.8556149732620321] |
|[228.0,4.0,108.0]|93.0      |[0.6705882352941176,0.011764705882352941,0.3176470588235294] |
|[214.0,6.0,258.0]|110.0     |[0.4476987447698745,0.012552301255230125,0.5397489539748954] |
|[187.0,8.0,360.0]|175.0     |[0.33693693693693694,0.014414414414414415,0.6486486486486487]|
+-----------------+----------+-------------------------------------------------------------+
only showing top 5 rows



In [7]:
# split for train and test
CarrosTreino, CarrosTeste = carrosnorm.randomSplit([0.7,0.3], seed=1)
print(CarrosTreino.count())
print(CarrosTeste.count())

24
8


In [8]:
# create the model and fit it
rf = RandomForestRegressor(featuresCol="independentenorm", labelCol="dependente", maxDepth=10, 
                          numTrees=500, seed=20)
modelo = rf.fit(CarrosTreino)

In [9]:
# predict
previsao = modelo.transform(CarrosTeste)
previsao.select("dependente","prediction").show()

+----------+------------------+
|dependente|        prediction|
+----------+------------------+
|     215.0|            175.44|
|     205.0|            175.44|
|     180.0|           158.596|
|     180.0|           158.596|
|     123.0|           125.568|
|     105.0| 167.3863333333333|
|     175.0|169.75033333333332|
|     113.0|           114.878|
+----------+------------------+



In [10]:
# evaluate
from pyspark.ml.evaluation import RegressionEvaluator
avaliar = RegressionEvaluator(predictionCol="prediction", labelCol="dependente",metricName="rmse")
rmse = avaliar.evaluate(previsao)
print(rmse)

30.176117889170413
