##### Fernando Amaral
##### PolynomialExpansion

In [None]:
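# Feature engineering with PolynomialExpansion
# Expands a feature vector into polynomial space up to a given degree,
# creating new features from the powers and cross-products of the originals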

In [1]:
# findspark.init() has to run BEFORE pyspark is imported: it locates the Spark
# installation and adds pyspark to sys.path. Importing pyspark first only works
# when pyspark already happens to be importable, which defeats the purpose.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Reuse the active session if one exists, otherwise start a new one.
spark = SparkSession.builder.appName("polynomialexpansion").getOrCreate()

In [2]:
from pyspark.ml.feature import PolynomialExpansion

In [3]:
# Load the semicolon-delimited cars dataset: the first row holds the column
# names and the column types are inferred from the data.
# NOTE(review): values such as Consumo=21/228 and Peso=262/2875 in the preview
# suggest decimal separators may have been lost upstream; confirm against the
# source file.
carros = (
    spark.read
    .options(header=True, inferSchema=True, sep=";")
    .csv("Carros.csv")
)
carros.show(5)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



In [5]:
# PolynomialExpansion operates on a single vector column, so the chosen
# numeric columns are first packed into one vector column named "vetor".
from pyspark.ml.feature import VectorAssembler

vecassembler = VectorAssembler(
    inputCols=["Consumo", "Cilindros", "Cilindradas"],
    outputCol="vetor",
)
carros_vetor = vecassembler.transform(carros)

# Show the source columns next to the assembled vector; the column names are
# read back from the assembler so they cannot drift from its configuration.
carros_vetor.select(
    *vecassembler.getInputCols(), vecassembler.getOutputCol()
).show()

+-------+---------+-----------+------------------+
|Consumo|Cilindros|Cilindradas|             vetor|
+-------+---------+-----------+------------------+
|     21|        6|        160|  [21.0,6.0,160.0]|
|     21|        6|        160|  [21.0,6.0,160.0]|
|    228|        4|        108| [228.0,4.0,108.0]|
|    214|        6|        258| [214.0,6.0,258.0]|
|    187|        8|        360| [187.0,8.0,360.0]|
|    181|        6|        225| [181.0,6.0,225.0]|
|    143|        8|        360| [143.0,8.0,360.0]|
|    244|        4|       1467|[244.0,4.0,1467.0]|
|    228|        4|       1408|[228.0,4.0,1408.0]|
|    192|        6|       1676|[192.0,6.0,1676.0]|
|    178|        6|       1676|[178.0,6.0,1676.0]|
|    164|        8|       2758|[164.0,8.0,2758.0]|
|    173|        8|       2758|[173.0,8.0,2758.0]|
|    152|        8|       2758|[152.0,8.0,2758.0]|
|    104|        8|        472| [104.0,8.0,472.0]|
|    104|        8|        460| [104.0,8.0,460.0]|
|    147|        8|        440|

In [6]:
# Expand the "vetor" column into degree-2 polynomial space. For the three
# inputs (x, y, z) each output row holds nine terms:
# x, x^2, y, x*y, y^2, z, x*z, y*z, z^2.
# The expanded vector is written to the "vetorpolyfeatures" column.
pe = PolynomialExpansion(
    degree=2,
    inputCol="vetor",
    outputCol="vetorpolyfeatures",
)
carros_poly = pe.transform(carros_vetor)

# truncate=False keeps the full expanded vectors visible in the output.
carros_poly.select(pe.getInputCol(), pe.getOutputCol()).show(truncate=False)

+------------------+-----------------------------------------------------------------+
|vetor             |vetorpolyfeatures                                                |
+------------------+-----------------------------------------------------------------+
|[21.0,6.0,160.0]  |[21.0,441.0,6.0,126.0,36.0,160.0,3360.0,960.0,25600.0]           |
|[21.0,6.0,160.0]  |[21.0,441.0,6.0,126.0,36.0,160.0,3360.0,960.0,25600.0]           |
|[228.0,4.0,108.0] |[228.0,51984.0,4.0,912.0,16.0,108.0,24624.0,432.0,11664.0]       |
|[214.0,6.0,258.0] |[214.0,45796.0,6.0,1284.0,36.0,258.0,55212.0,1548.0,66564.0]     |
|[187.0,8.0,360.0] |[187.0,34969.0,8.0,1496.0,64.0,360.0,67320.0,2880.0,129600.0]    |
|[181.0,6.0,225.0] |[181.0,32761.0,6.0,1086.0,36.0,225.0,40725.0,1350.0,50625.0]     |
|[143.0,8.0,360.0] |[143.0,20449.0,8.0,1144.0,64.0,360.0,51480.0,2880.0,129600.0]    |
|[244.0,4.0,1467.0]|[244.0,59536.0,4.0,976.0,16.0,1467.0,357948.0,5868.0,2152089.0]  |
|[228.0,4.0,1408.0]|[228.0,51984.0,4.0,912.