# Creación de Nuevas Variables - Feature Extraction

In [1]:
# Manual Spark bootstrap, left disabled. Later cells use `sqlContext` without
# defining it, so it is presumably injected by the PySpark shell/kernel --
# TODO confirm. Uncomment these lines when running where it is not predefined.
#from pyspark import SparkContext
#sc = SparkContext()
#from pyspark.sql import SQLContext
#sqlContext=SQLContext(sc)

In [2]:
# Read the bd5 CSV (first row is the header, column types are inferred) through
# the spark-csv data source, then expose it to SQL queries under the name "bd5".
bd5 = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .load(
        "file:/home/cloudera/Documents/Ficheros de trabajo/bd5.csv",
        inferSchema=True,
    )
)
sqlContext.registerDataFrameAsTable(bd5, "bd5")

## Variables Dummy

In [3]:
# One indicator column per Horario level (1, 2, 3). Each new column holds the
# boolean result of the equality test, not a 0/1 number.
bd5 = (
    bd5
    .withColumn('Horario1', bd5['Horario'] == 1)
    .withColumn('Horario2', bd5['Horario'] == 2)
    .withColumn('Horario3', bd5['Horario'] == 3)
)

## Variables Discretizadas Binarias

In [4]:
from pyspark.ml.feature import Binarizer

# Binary discretisation of DepDelay: values strictly above the 15.0 threshold
# become 1.0 in SalidaBin, everything else becomes 0.0.
binarizer = Binarizer(
    inputCol='DepDelay',
    outputCol='SalidaBin',
    threshold=15.0,
)

# Preview the first transformed row; bd5 itself is not modified.
binarizer.transform(bd5).head()


Row(Year=2016, Month=12, DayofMonth=9, DayOfWeek=5, CRSDepTime=815, UniqueCarrier='WN', TailNum='N8322X', ArrDelay=-16.0, DepDelay=0.0, Origin='LAS', Dest='ATL', Distance=1747.0, Cancelled=0.0, Diverted=0.0, CarrierDelay=0.0, WeatherDelay=0.0, NASDelay=0.0, SecurityDelay=0.0, LateAircraftDelay=0.0, LogD=3.2422929049829303, Retraso=0, RetrasoNeto=-16.0, Horario=2, Horario1=False, Horario2=True, Horario3=False, SalidaBin=0.0)

In [5]:
# Show the source column next to its binarized counterpart (first 20 rows).
(
    binarizer
    .transform(bd5)
    .select('DepDelay', 'SalidaBin')
    .show()
)

+--------+---------+
|DepDelay|SalidaBin|
+--------+---------+
|     0.0|      0.0|
|     0.0|      0.0|
|    -2.0|      0.0|
|   130.0|      1.0|
|    -8.0|      0.0|
|     2.0|      0.0|
|    11.0|      0.0|
|    -2.0|      0.0|
|    -3.0|      0.0|
|    -3.0|      0.0|
|     2.0|      0.0|
|    -4.0|      0.0|
|     1.0|      0.0|
|     6.0|      0.0|
|    -4.0|      0.0|
|    18.0|      1.0|
|     0.0|      0.0|
|     0.0|      0.0|
|     7.0|      0.0|
|    -3.0|      0.0|
+--------+---------+
only showing top 20 rows



## Variables Discretizadas en Buckets

In [6]:
from pyspark.ml.feature import Bucketizer

# Three DepDelay buckets written to SalidaCat:
#   0.0 -> below 0.0, 1.0 -> from 0.0 up to (not including) 15.0, 2.0 -> 15.0 and above.
# The infinite outer splits ensure every finite value falls in some bucket.
bucketizer = Bucketizer(
    inputCol='DepDelay',
    outputCol='SalidaCat',
    splits=[float("-inf"), 0.0, 15.0, float("inf")],
)

# Show the source column next to its bucket index (first 20 rows).
(
    bucketizer
    .transform(bd5)
    .select('DepDelay', 'SalidaCat')
    .show()
)


+--------+---------+
|DepDelay|SalidaCat|
+--------+---------+
|     0.0|      1.0|
|     0.0|      1.0|
|    -2.0|      0.0|
|   130.0|      2.0|
|    -8.0|      0.0|
|     2.0|      1.0|
|    11.0|      1.0|
|    -2.0|      0.0|
|    -3.0|      0.0|
|    -3.0|      0.0|
|     2.0|      1.0|
|    -4.0|      0.0|
|     1.0|      1.0|
|     6.0|      1.0|
|    -4.0|      0.0|
|    18.0|      2.0|
|     0.0|      1.0|
|     0.0|      1.0|
|     7.0|      1.0|
|    -3.0|      0.0|
+--------+---------+
only showing top 20 rows



Versiones más recientes de PySpark incluyen otras transformaciones, por ejemplo `QuantileDiscretizer`.

## Expansión polinómica de Variables 
(términos cuadráticos, productos, etc.) 

In [7]:
from pyspark.ml.feature import PolynomialExpansion, VectorAssembler

# Pack the two numeric inputs into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=['DepDelay', 'Distance'], outputCol='features')

# Degree-2 expansion of 'features' into 'Polyn'. For inputs (x, y) the output
# vector is ordered as [x, x^2, y, x*y, y^2].
px = PolynomialExpansion(inputCol="features", outputCol="Polyn", degree=2)

# Assemble first, then expand; bd6 carries both 'features' and 'Polyn'.
bd6 = px.transform(assembler.transform(bd5))

bd6.select('DepDelay', 'Distance', 'Polyn').head(5)

[Row(DepDelay=0.0, Distance=1747.0, Polyn=DenseVector([0.0, 0.0, 1747.0, 0.0, 3052009.0])),
 Row(DepDelay=0.0, Distance=1747.0, Polyn=DenseVector([0.0, 0.0, 1747.0, 0.0, 3052009.0])),
 Row(DepDelay=-2.0, Distance=1747.0, Polyn=DenseVector([-2.0, 4.0, 1747.0, -3494.0, 3052009.0])),
 Row(DepDelay=130.0, Distance=628.0, Polyn=DenseVector([130.0, 16900.0, 628.0, 81640.0, 394384.0])),
 Row(DepDelay=-8.0, Distance=628.0, Polyn=DenseVector([-8.0, 64.0, 628.0, -5024.0, 394384.0]))]

## Estandarización de las variables

In [8]:
from pyspark.ml.feature import StandardScaler

# Standardise the assembled 'features' vector (not 'Polyn'): centre each
# component on its mean and scale it to unit standard deviation.
scaler = StandardScaler(
    inputCol="features",
    outputCol="stdfeatures",
    withMean=True,
    withStd=True,
)

# Fitting computes the per-component statistics on bd6; transforming applies them.
scalerModel = scaler.fit(bd6)
bd6std = scalerModel.transform(bd6)

# Show raw and standardised vectors side by side (first 20 rows).
bd6std.select('features', 'stdfeatures').show()

+-------------+--------------------+
|     features|         stdfeatures|
+-------------+--------------------+
| [0.0,1747.0]|[-0.3455994145866...|
| [0.0,1747.0]|[-0.3455994145866...|
|[-2.0,1747.0]|[-0.3857378410949...|
|[130.0,628.0]|[2.26339830845110...|
| [-8.0,628.0]|[-0.5061531206197...|
|  [2.0,628.0]|[-0.3054609880783...|
| [11.0,628.0]|[-0.1248380687911...|
|[-2.0,1199.0]|[-0.3857378410949...|
|[-3.0,1199.0]|[-0.4058070543490...|
|[-3.0,1747.0]|[-0.4058070543490...|
| [2.0,1747.0]|[-0.3054609880783...|
|[-4.0,1946.0]|[-0.4258762676032...|
| [1.0,1946.0]|[-0.3255302013325...|
| [6.0,1587.0]|[-0.2251841350618...|
|[-4.0,1587.0]|[-0.4258762676032...|
|[18.0,1199.0]|[0.01564642398779...|
| [0.0,1199.0]|[-0.3455994145866...|
|  [0.0,628.0]|[-0.3455994145866...|
|  [7.0,628.0]|[-0.2051149218077...|
| [-3.0,628.0]|[-0.4058070543490...|
+-------------+--------------------+
only showing top 20 rows



## Transformación manual

In [9]:
# Build the quadratic and interaction terms by hand with column arithmetic,
# as an alternative to PolynomialExpansion.
bd7 = (
    bd6
    .withColumn('DepDelay2', bd6['DepDelay'] ** 2)
    .withColumn('DepD_Distance', bd6['DepDelay'] * bd6['Distance'])
)