##### Fernando Amaral
##### Linear Regression

In [None]:
# Linear regression example
# HP is the dependent variable (the label to predict)

In [1]:
import findspark

# findspark.init() locates the Spark installation and adds pyspark to sys.path,
# so it has to run BEFORE pyspark is imported to have any effect.
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Name the application after what this notebook does (linear regression) so it
# is identifiable in the Spark UI; getOrCreate() reuses a running session if any.
spark = SparkSession.builder.appName("linearregression").getOrCreate()

In [4]:
# Load the semicolon-delimited CSV: column names come from the header row and
# Spark infers each column's type from the data.
carros = (
    spark.read
    .options(header=True, inferSchema=True, sep=";")
    .csv("Carros.csv")
)
print(carros.count())  # total number of rows
carros.show(5)         # preview of the first rows

32
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



In [5]:
from pyspark.ml.feature import RFormula

In [6]:
# The regressor consumes the predictors as one vector column. RFormula packs the
# right-hand side of the formula into "independente" and exposes the left-hand
# side (HP) as the label column "dependente".
Rformula = RFormula(
    formula="HP ~ Consumo + Cilindros + Cilindradas",
    featuresCol="independente",
    labelCol="dependente",
)
carrosrf = Rformula.fit(carros).transform(carros)
carrosrf.select(["independente", "dependente"]).show(5, truncate=False)

+-----------------+----------+
|independente     |dependente|
+-----------------+----------+
|[21.0,6.0,160.0] |110.0     |
|[21.0,6.0,160.0] |110.0     |
|[228.0,4.0,108.0]|93.0      |
|[214.0,6.0,258.0]|110.0     |
|[187.0,8.0,360.0]|175.0     |
+-----------------+----------+
only showing top 5 rows



In [8]:
# split for train and test (weights 0.8 / 0.2)
# A fixed seed keeps the random split -- and therefore the fitted model and the
# reported RMSE -- repeatable between runs, as long as the input data and its
# partitioning are unchanged.
CarrosTreino, CarrosTeste = carrosrf.randomSplit([0.8, 0.2], seed=42)
print(CarrosTreino.count())
print(CarrosTeste.count())

26
6


In [10]:
# Configure a least-squares linear regression over the vectorized features and
# train it on the training split.
from pyspark.ml.regression import LinearRegression

reglin = LinearRegression(
    featuresCol="independente",
    labelCol="dependente",
    maxIter=1000,
    loss="squaredError",
    standardization=True,
)
modelo = reglin.fit(CarrosTreino)

In [11]:
# Score the held-out test split; transform() appends a "prediction" column.
previsao = modelo.transform(CarrosTeste)
# Show the actual label next to the predicted value.
previsao.select(["dependente", "prediction"]).show()

+----------+------------------+
|dependente|        prediction|
+----------+------------------+
|     110.0| 179.0603221930096|
|     205.0|217.47147234184533|
|     123.0| 138.1636533968134|
|     105.0|152.17352242096229|
|     123.0|135.86790133743273|
|      52.0| 71.54678097228751|
+----------+------------------+



In [12]:
# Measure the root-mean-squared error between the label and the prediction
# on the scored test split.
from pyspark.ml.evaluation import RegressionEvaluator

avaliar = RegressionEvaluator(
    metricName="rmse",
    labelCol="dependente",
    predictionCol="prediction",
)
rmse = avaliar.evaluate(previsao)
print(rmse)

36.34969705725187
