# Reducción de la dimensionalidad

In [1]:
# Manual Spark bootstrap, left commented out.
# NOTE(review): the following cell uses `sqlContext` without defining it, so the
# kernel presumably predefines it (e.g. a notebook launched through pyspark) --
# confirm, and uncomment these lines if it does not.
#from pyspark import SparkContext
#sc = SparkContext()
#from pyspark.sql import SQLContext
#sqlContext=SQLContext(sc)

In [2]:
# Load the CSV through the spark-csv reader: the first row is the header and
# column types are inferred from the data.
bd5 = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .load(
        "file:/home/cloudera/Documents/Ficheros de trabajo/bd5.csv",
        inferSchema=True,
    )
)
# Make the DataFrame queryable from SQL under the table name "bd5".
sqlContext.registerDataFrameAsTable(bd5, "bd5")

## Reducción de dimensionalidad: PCA

In [3]:
# Display the (column name, Spark type) pairs of the loaded DataFrame.
bd5.dtypes

[('Year', 'int'),
 ('Month', 'int'),
 ('DayofMonth', 'int'),
 ('DayOfWeek', 'int'),
 ('CRSDepTime', 'int'),
 ('UniqueCarrier', 'string'),
 ('TailNum', 'string'),
 ('ArrDelay', 'double'),
 ('DepDelay', 'double'),
 ('Origin', 'string'),
 ('Dest', 'string'),
 ('Distance', 'double'),
 ('Cancelled', 'double'),
 ('Diverted', 'double'),
 ('CarrierDelay', 'double'),
 ('WeatherDelay', 'double'),
 ('NASDelay', 'double'),
 ('SecurityDelay', 'double'),
 ('LateAircraftDelay', 'double'),
 ('LogD', 'double'),
 ('Retraso', 'int'),
 ('RetrasoNeto', 'double'),
 ('Horario', 'int')]

In [4]:
from pyspark.ml.feature import VectorAssembler

# Pack the six numeric input columns into a single vector column, 'features',
# which is the input format the ML feature transformers below consume.
a1 = VectorAssembler(
    outputCol='features',
    inputCols=[
        'DepDelay', 'Distance', 'DayOfWeek',
        'CRSDepTime', 'Horario', 'LogD',
    ],
)

# bd6 = bd5 plus the assembled 'features' column.
bd6 = a1.transform(bd5)

In [5]:
# Echo the DataFrame schema to check that the 'features' vector column was appended.
bd6

DataFrame[Year: int, Month: int, DayofMonth: int, DayOfWeek: int, CRSDepTime: int, UniqueCarrier: string, TailNum: string, ArrDelay: double, DepDelay: double, Origin: string, Dest: string, Distance: double, Cancelled: double, Diverted: double, CarrierDelay: double, WeatherDelay: double, NASDelay: double, SecurityDelay: double, LateAircraftDelay: double, LogD: double, Retraso: int, RetrasoNeto: double, Horario: int, features: vector]

## PCA sin estandarización

In [6]:
from pyspark.ml.feature import PCA

# PCA estimator keeping 2 components, applied to the raw (unstandardized)
# 'features' vector; the projection is written to 'pca_features'.
pca = PCA(
    k=2,
    inputCol='features',
    outputCol='pca_features',
)

In [7]:
# Fit the principal components on bd6, then project every row onto them.
model = pca.fit(bd6)
bd6pca = model.transform(bd6)

# Show the input vectors side by side with their 2-component projection.
bd6pca.select('features', 'pca_features').show()

+--------------------+--------------------+
|            features|        pca_features|
+--------------------+--------------------+
|[0.0,1747.0,5.0,8...|[1257.18531329463...|
|[0.0,1747.0,5.0,1...|[1080.33126488641...|
|[-2.0,1747.0,5.0,...|[1174.93608766874...|
|[130.0,628.0,5.0,...|[-337.05691583953...|
|[-8.0,628.0,5.0,2...|[-291.25162109352...|
|[2.0,628.0,5.0,73...|[272.172514924741...|
|[11.0,628.0,5.0,1...|[-91.855232354303...|
|[-2.0,1199.0,6.0,...|[835.838571711756...|
|[-3.0,1199.0,6.0,...|[338.183492718131...|
|[-3.0,1747.0,6.0,...|[934.336620967114...|
|[2.0,1747.0,6.0,9...|[1216.04839251910...|
|[-4.0,1946.0,6.0,...|[1399.51720648796...|
|[1.0,1946.0,6.0,1...|[1113.65099329332...|
|[6.0,1587.0,6.0,1...|[788.460308609670...|
|[-4.0,1587.0,6.0,...|[1150.43512521245...|
|[18.0,1199.0,6.0,...|[381.282585719361...|
|[0.0,1199.0,6.0,1...|[628.129794834484...|
|[0.0,628.0,6.0,17...|[-143.22105980336...|
|[7.0,628.0,6.0,83...|[228.966272404802...|
|[-3.0,628.0,6.0,6...|[309.20882

In [8]:
# Extract the first and second principal components as scalar columns.

from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType, FloatType

# UDFs that pick element 0 / element 1 out of a vector column.
# The result type is DoubleType because a Python float is a 64-bit double;
# declaring FloatType would narrow every component to 32 bits and lose
# precision in the extracted values and in any statistics computed from them.
p1 = udf(lambda v: float(v[0]), DoubleType())
p2 = udf(lambda v: float(v[1]), DoubleType())

# Add the two components of 'pca_features' as columns 'pca1' and 'pca2'.
bd6pca = (
    bd6pca
    .withColumn('pca1', p1('pca_features'))
    .withColumn('pca2', p2('pca_features'))
)


In [9]:
# Summary statistics (count, mean, stddev, min, max) of the two unstandardized components.
bd6pca.select('pca1','pca2').describe().show()

+-------+-----------------+-------------------+
|summary|             pca1|               pca2|
+-------+-----------------+-------------------+
|  count|            30466|              30466|
|   mean|300.9325448883614|-1601.4748862329209|
| stddev|557.2501211351524|  495.7211976411703|
|    min|       -731.57623|         -2935.9211|
|    max|         1751.152|         -487.57773|
+-------+-----------------+-------------------+



## PCA con estandarización

In [10]:
from pyspark.ml.feature import StandardScaler

# Standardize the 'features' vector with both centering (withMean) and
# scaling by the standard deviation (withStd) enabled.
scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="features",
    outputCol="scaledFeatures",
)
# Fit the scaler on bd6, then add the 'scaledFeatures' column.
scalerModel = scaler.fit(bd6)
bd6std = scalerModel.transform(bd6)

# Compare the raw vectors with their standardized counterparts.
bd6std.select('features', 'scaledFeatures').show()

+--------------------+--------------------+
|            features|      scaledFeatures|
+--------------------+--------------------+
|[0.0,1747.0,5.0,8...|[-0.3455994145866...|
|[0.0,1747.0,5.0,1...|[-0.3455994145866...|
|[-2.0,1747.0,5.0,...|[-0.3857378410949...|
|[130.0,628.0,5.0,...|[2.26339830845110...|
|[-8.0,628.0,5.0,2...|[-0.5061531206197...|
|[2.0,628.0,5.0,73...|[-0.3054609880783...|
|[11.0,628.0,5.0,1...|[-0.1248380687911...|
|[-2.0,1199.0,6.0,...|[-0.3857378410949...|
|[-3.0,1199.0,6.0,...|[-0.4058070543490...|
|[-3.0,1747.0,6.0,...|[-0.4058070543490...|
|[2.0,1747.0,6.0,9...|[-0.3054609880783...|
|[-4.0,1946.0,6.0,...|[-0.4258762676032...|
|[1.0,1946.0,6.0,1...|[-0.3255302013325...|
|[6.0,1587.0,6.0,1...|[-0.2251841350618...|
|[-4.0,1587.0,6.0,...|[-0.4258762676032...|
|[18.0,1199.0,6.0,...|[0.01564642398779...|
|[0.0,1199.0,6.0,1...|[-0.3455994145866...|
|[0.0,628.0,6.0,17...|[-0.3455994145866...|
|[7.0,628.0,6.0,83...|[-0.2051149218077...|
|[-3.0,628.0,6.0,6...|[-0.405807

In [11]:
from pyspark.ml.feature import PCA

# PCA estimator keeping 2 components, this time applied to the standardized
# 'scaledFeatures' vector; the projection is written to 'pca_scaledfeatures'.
pca2 = PCA(
    k=2,
    inputCol='scaledFeatures',
    outputCol='pca_scaledfeatures',
)

In [12]:
# Fit the principal components on the standardized data and project every row.
model2 = pca2.fit(bd6std)
bd6pca2 = model2.transform(bd6std)

# Show the 2-component projection of the standardized features.
bd6pca2.select('pca_scaledfeatures').show()


+--------------------+
|  pca_scaledfeatures|
+--------------------+
|[2.21788821413829...|
|[1.75275837699323...|
|[2.11195267378450...|
|[-0.8409143667932...|
|[-1.2935834988431...|
|[0.13200405313116...|
|[-1.0554173009605...|
|[1.51778797343985...|
|[0.16715497337274...|
|[1.55728112741434...|
|[2.15423621000769...|
|[2.51024629339945...|
|[1.66294972378514...|
|[1.25831983271054...|
|[2.21267273398649...|
|[0.19636735996422...|
|[1.00929269095489...|
|[-1.1154366564473...|
|[-0.1700103932075...|
|[0.18076993873460...|
+--------------------+
only showing top 20 rows



In [13]:
# Split the standardized projection into scalar columns 'pca1' and 'pca2',
# reusing the p1 / p2 element-extraction UDFs.
bd6pca2 = (
    bd6pca2
    .withColumn('pca1', p1('pca_scaledfeatures'))
    .withColumn('pca2', p2('pca_scaledfeatures'))
)
# Summary statistics of both components, rendered as a pandas table.
bd6pca2.select('pca1', 'pca2').describe().toPandas()

Unnamed: 0,summary,pca1,pca2
0,count,30466.0,30466.0
1,mean,1.4992589980211242e-10,-2.3862638117704308e-11
2,stddev,1.424751251563491,1.2476208654224734
3,min,-3.9466622,-9.316214
4,max,3.2699375,2.831249


Nota: Las componentes obtenidas también se pueden estandarizar de nuevo.

In [14]:
# Correlate the original variables with the two principal components obtained
# from the standardized features.
# A 10% sample without replacement keeps the pandas conversion small; the
# explicit seed makes the sample (and therefore the correlation table)
# repeatable across runs on the same data, instead of changing on every execution.
pdf6 = (
    bd6pca2
    .sample(False, 0.1, seed=42)
    .select('DepDelay', 'Distance', 'DayOfWeek',
            'CRSDepTime', 'Horario', 'LogD', 'pca1', 'pca2')
    .toPandas()
)

# Pairwise correlation matrix of the sampled columns.
pdf6.corr()

Unnamed: 0,DepDelay,Distance,DayOfWeek,CRSDepTime,Horario,LogD,pca1,pca2
DepDelay,1.0,-0.012365,0.086079,0.147063,0.139037,-0.015312,-0.106376,-0.30248
Distance,-0.012365,1.0,0.006931,-0.070598,-0.057209,0.957791,0.925434,-0.355429
DayOfWeek,0.086079,0.006931,1.0,0.008737,0.008176,0.004593,-0.010329,-0.045821
CRSDepTime,0.147063,-0.070598,0.008737,1.0,0.629858,-0.083569,-0.387876,-0.798788
Horario,0.139037,-0.057209,0.008176,0.629858,1.0,-0.054103,-0.36535,-0.808941
LogD,-0.015312,0.957791,0.004593,-0.083569,-0.054103,1.0,0.927564,-0.349886
pca1,-0.106376,0.925434,-0.010329,-0.387876,-0.36535,0.927564,1.0,-0.00672
pca2,-0.30248,-0.355429,-0.045821,-0.798788,-0.808941,-0.349886,-0.00672,1.0
