##### Fernando Amaral
##### UnivariateFeatureSelector

In [None]:
# feature selection

In [1]:
# findspark.init() must run BEFORE the first pyspark import: it is the call
# that makes the local Spark installation importable. Importing pyspark first
# (as this cell did originally) only works when pyspark already happens to be
# on sys.path, which defeats the purpose of findspark.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Reuse the active session if one exists, otherwise start a new one.
spark = SparkSession.builder.appName("univariatefeatureselector").getOrCreate()

In [2]:
from pyspark.ml.feature import UnivariateFeatureSelector

In [3]:
# Carros.csv is semicolon-delimited with a header row; let Spark infer the
# column types instead of reading everything as strings.
carros = (
    spark.read
    .options(header=True, inferSchema=True, sep=";")
    .csv("Carros.csv")
)
carros.show(n=5)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



In [4]:
from pyspark.ml.feature import RFormula

# "HP ~ ." uses HP as the label ("dependente") and packs every remaining
# column into a single feature vector ("independente").
Rformula = (
    RFormula()
    .setFormula("HP ~ .")
    .setFeaturesCol("independente")
    .setLabelCol("dependente")
)
carrosrf = (
    Rformula
    .fit(carros)
    .transform(carros)
)
carrosrf.select("independente", "dependente").show(truncate=False)

+-----------------------------------------------------+----------+
|independente                                         |dependente|
+-----------------------------------------------------+----------+
|[21.0,6.0,160.0,39.0,262.0,1646.0,0.0,1.0,4.0,4.0]   |110.0     |
|[21.0,6.0,160.0,39.0,2875.0,1702.0,0.0,1.0,4.0,4.0]  |110.0     |
|[228.0,4.0,108.0,385.0,232.0,1861.0,1.0,1.0,4.0,1.0] |93.0      |
|[214.0,6.0,258.0,308.0,3215.0,1944.0,1.0,0.0,3.0,1.0]|110.0     |
|[187.0,8.0,360.0,315.0,344.0,1702.0,0.0,0.0,3.0,2.0] |175.0     |
|[181.0,6.0,225.0,276.0,346.0,2022.0,1.0,0.0,3.0,1.0] |105.0     |
|[143.0,8.0,360.0,321.0,357.0,1584.0,0.0,0.0,3.0,4.0] |245.0     |
|[244.0,4.0,1467.0,369.0,319.0,20.0,1.0,0.0,4.0,2.0]  |62.0      |
|[228.0,4.0,1408.0,392.0,315.0,229.0,1.0,0.0,4.0,2.0] |95.0      |
|[192.0,6.0,1676.0,392.0,344.0,183.0,1.0,0.0,4.0,4.0] |123.0     |
|[178.0,6.0,1676.0,392.0,344.0,189.0,1.0,0.0,4.0,4.0] |123.0     |
|[164.0,8.0,2758.0,307.0,407.0,174.0,0.0,0.0,3.0,3.0] |180.0  

In [7]:
# The statistical test used for scoring is picked from the declared types of
# the features and of the label:
#   featureType="categorical", labelType="categorical"  => chi-square
#   featureType="continuous",  labelType="categorical"  => ANOVA
#   featureType="continuous",  labelType="continuous"   => F statistic
# Here both sides are declared continuous. The selection mode is left at its
# default, so the threshold of 5 keeps 5 features (see the 5-element output).
selector = (
    UnivariateFeatureSelector(
        featuresCol="independente",
        outputCol="selecionados",
        labelCol="dependente",
    )
    .setFeatureType("continuous")
    .setLabelType("continuous")
    .setSelectionThreshold(5)
)
carrosuni = (
    selector
    .fit(carrosrf)
    .transform(carrosrf)
)
carrosuni.select("selecionados").show(5, truncate=False)

+-----------------------+
|selecionados           |
+-----------------------+
|[21.0,6.0,0.0,1.0,4.0] |
|[21.0,6.0,0.0,1.0,4.0] |
|[228.0,4.0,1.0,1.0,1.0]|
|[214.0,6.0,1.0,0.0,1.0]|
|[187.0,8.0,0.0,0.0,2.0]|
+-----------------------+
only showing top 5 rows

