##### Fernando Amaral
##### PCA

In [None]:
# Principal Component Analysis (PCA)
# Projects the original features onto a smaller set of synthetic components;
# the resulting components have no direct "business meaning".

In [1]:
# findspark.init() has to run BEFORE pyspark is imported: its job is to locate
# the Spark installation and put Spark's Python libraries on sys.path. Calling
# it after the pyspark imports (as before) means the imports either already
# worked without it or fail before it gets a chance to help.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# getOrCreate() returns the already-active session when there is one, so this
# cell can be re-run safely; otherwise it starts a new session named "PCA".
spark = SparkSession.builder.appName("PCA").getOrCreate()

In [2]:
from pyspark.ml.feature import PCA

In [3]:
# Load the semicolon-delimited cars dataset. The first row supplies the column
# names and Spark infers each column's type from the data.
# NOTE(review): several values in the preview look like they lost their decimal
# separator (e.g. Peso 262 next to 2875) - confirm the CSV's number format.
carros = (
    spark.read
    .options(header=True, inferSchema=True, sep=";")
    .csv("Carros.csv")
)
# Preview the first 5 rows to sanity-check the parsed schema.
carros.show(5)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



In [4]:
from pyspark.ml.feature import VectorAssembler

# Pack the ten input columns (every column of the dataset except HP) into a
# single vector column, "caracteristicas", which the PCA step reads as input.
vectas = VectorAssembler(
    inputCols=[
        "Consumo", "Cilindros", "Cilindradas", "RelEixoTraseiro", "Peso",
        "Tempo", "TipoMotor", "Transmissao", "Marchas", "Carburadors",
    ],
    outputCol="caracteristicas",
)

# This cell overwrites `carros` with its own output, so on a second run the
# frame already contains "caracteristicas" and VectorAssembler refuses to
# overwrite an existing output column. Dropping it first makes the cell
# re-runnable; DataFrame.drop is a no-op when the column is absent, so the
# first run behaves exactly as before.
carros = vectas.transform(carros.drop("caracteristicas"))

# Show the assembled vectors in full (no column truncation).
carros.select("caracteristicas").show(truncate=False)

+-----------------------------------------------------+
|caracteristicas                                      |
+-----------------------------------------------------+
|[21.0,6.0,160.0,39.0,262.0,1646.0,0.0,1.0,4.0,4.0]   |
|[21.0,6.0,160.0,39.0,2875.0,1702.0,0.0,1.0,4.0,4.0]  |
|[228.0,4.0,108.0,385.0,232.0,1861.0,1.0,1.0,4.0,1.0] |
|[214.0,6.0,258.0,308.0,3215.0,1944.0,1.0,0.0,3.0,1.0]|
|[187.0,8.0,360.0,315.0,344.0,1702.0,0.0,0.0,3.0,2.0] |
|[181.0,6.0,225.0,276.0,346.0,2022.0,1.0,0.0,3.0,1.0] |
|[143.0,8.0,360.0,321.0,357.0,1584.0,0.0,0.0,3.0,4.0] |
|[244.0,4.0,1467.0,369.0,319.0,20.0,1.0,0.0,4.0,2.0]  |
|[228.0,4.0,1408.0,392.0,315.0,229.0,1.0,0.0,4.0,2.0] |
|[192.0,6.0,1676.0,392.0,344.0,183.0,1.0,0.0,4.0,4.0] |
|[178.0,6.0,1676.0,392.0,344.0,189.0,1.0,0.0,4.0,4.0] |
|[164.0,8.0,2758.0,307.0,407.0,174.0,0.0,0.0,3.0,3.0] |
|[173.0,8.0,2758.0,307.0,373.0,176.0,0.0,0.0,3.0,3.0] |
|[152.0,8.0,2758.0,307.0,378.0,18.0,0.0,0.0,3.0,3.0]  |
|[104.0,8.0,472.0,293.0,525.0,1798.0,0.0,0.0,3.0

In [5]:
# Configure PCA to reduce the assembled feature vector to 3 principal
# components, written to the "caracteristicaspca" column.
# NOTE(review): the features are fed in unscaled; columns with large magnitudes
# (e.g. Peso, Tempo, Cilindradas in the output above) will likely dominate the
# components. Consider standardizing first - confirm whether that is intended.
pca = PCA(
    inputCol="caracteristicas",
    outputCol="caracteristicaspca",
    k=3,
)

# Learn the principal components from the assembled data.
modelo = pca.fit(carros)

In [7]:
# Apply the fitted PCA model: every row gains a "caracteristicaspca" column
# holding its 3-component projection.
resultado = modelo.transform(carros)

# Display only the projected vectors, untruncated.
(
    resultado
    .select("caracteristicaspca")
    .show(truncate=False)
)

+-----------------------------------------------------------+
|caracteristicaspca                                         |
+-----------------------------------------------------------+
|[618.7707206779613,-937.712394997354,1231.963352994551]    |
|[3112.9887675342197,-161.05746385491523,1191.8619913054383]|
|[640.4959007710695,-1120.718886511042,1320.0756315189049]  |
|[3466.0956877556673,-149.69421418298353,1401.204178036853] |
|[661.4577445758732,-812.4592128844115,1395.2949328316356]  |
|[769.2343671787738,-1120.4160559477316,1518.7436632279525] |
|[644.8369503533214,-727.9539376169618,1313.6815210979353]  |
|[9.10188066170976,1061.295403667789,1045.171050021569]     |
|[67.13360966508404,878.4793682045013,1143.9379120496164]   |
|[31.390504477140627,1095.369449828574,1306.012486190633]   |
|[32.8916592220896,1091.1521230845226,1310.0881577350908]   |
|[-118.372737516754,1832.771927405815,2088.6955393326043]   |
|[-150.18148405358025,1820.8730926512776,2091.1033550766124]|
|[-184.0