##### Fernando Amaral
##### GeneralizedLinearRegression

In [None]:
# GeneralizedLinearRegression example

In [1]:
# findspark.init() adds the local Spark installation's pyspark to sys.path,
# so it has to run BEFORE anything from pyspark is imported.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session shared by every cell below.
spark = SparkSession.builder.appName("generalized").getOrCreate()

In [2]:
# Read the semicolon-separated file; the first row supplies the column names
# and the column types are inferred from the data.
carros = spark.read.csv(
    "Carros.csv",
    sep=";",
    header=True,
    inferSchema=True,
)
carros.show(5)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



In [3]:
# Spark ML estimators take the predictors as one vector column.
# RFormula packs the right-hand side of the formula into "independente"
# and exposes the response (HP) as the label column "dependente".
from pyspark.ml.feature import RFormula

Rformula = RFormula(
    formula="HP ~ Consumo + Cilindros + Cilindradas",
    featuresCol="independente",
    labelCol="dependente",
)
carrosrf = (
    Rformula
    .fit(carros)
    .transform(carros)
)
carrosrf.select("independente", "dependente").show(5, truncate=False)

+-----------------+----------+
|independente     |dependente|
+-----------------+----------+
|[21.0,6.0,160.0] |110.0     |
|[21.0,6.0,160.0] |110.0     |
|[228.0,4.0,108.0]|93.0      |
|[214.0,6.0,258.0]|110.0     |
|[187.0,8.0,360.0]|175.0     |
+-----------------+----------+
only showing top 5 rows



In [4]:
# Split for train and test with 70% / 30% weights (the resulting sizes are
# approximate because rows are assigned at random).
# The fixed seed keeps the split -- and therefore the fitted model and the
# metric computed from it -- the same on every run over the same data;
# without it each execution draws a different sample.
CarrosTreino, CarrosTeste = carrosrf.randomSplit([0.7, 0.3], seed=42)
print(CarrosTreino.count())
print(CarrosTeste.count())

23
9


In [5]:
# Configure a GLM with gaussian family and identity link, regularization
# parameter 0.08 and at most 1000 iterations, then fit it on the training split.
from pyspark.ml.regression import GeneralizedLinearRegression

geral = GeneralizedLinearRegression(
    featuresCol="independente",
    labelCol="dependente",
    family="gaussian",
    link="identity",
    regParam=0.08,
    maxIter=1000,
)
modelo = geral.fit(CarrosTreino)

In [6]:
# Score the test split with the fitted model and show the true label
# next to the model's "prediction" column.
previsao = modelo.transform(CarrosTeste)
(
    previsao
    .select("dependente", "prediction")
    .show()
)

+----------+------------------+
|dependente|        prediction|
+----------+------------------+
|     335.0|217.99115578655366|
|     110.0| 166.8766488949735|
|     110.0| 166.8766488949735|
|     150.0|198.71015900635447|
|     264.0| 198.0695777276623|
|     180.0|179.24363184803596|
|     123.0|132.90696356190432|
|     175.0|142.89113033874565|
|      66.0| 81.67272981806948|
+----------+------------------+



In [7]:
# Measure the error on the test split: RMSE between the "prediction" column
# and the label column "dependente".
from pyspark.ml.evaluation import RegressionEvaluator

avaliar = RegressionEvaluator(
    metricName="rmse",
    labelCol="dependente",
    predictionCol="prediction",
)
rmse = avaliar.evaluate(previsao)
print(rmse)

56.03154881683042
