# Pre-Proceso (2)

Creación de nuevas variables

Definir SparkContext y SQLContext

In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext

# Entry points used by every later cell.
# getOrCreate() returns the SparkContext that is already running in this
# kernel (if any) instead of raising "Cannot run multiple SparkContexts at
# once" when the cell is executed a second time.
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [2]:
# Read the on-time performance CSV through the spark-csv data source.
# The first row supplies the column names and the column types are inferred.
bd = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .load("On_Time_On_Time_Performance_2016_12.csv", inferSchema=True)
)

In [3]:
# Narrow the frame to the 19 columns listed below (order preserved).
bd = bd.select(
    # calendar / schedule
    'Year', 'Month', 'DayofMonth', 'DayOfWeek', 'CRSDepTime',
    # carrier / aircraft
    'UniqueCarrier', 'TailNum',
    # delays
    'ArrDelay', 'DepDelay',
    # route
    'Origin', 'Dest', 'Distance',
    # status flags
    'Cancelled', 'Diverted',
    # delay causes
    'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'
)

In [4]:
# Keep only flights whose origin AND destination are both among these
# eight airports.
aeropuertos = ['ATL', 'ORD', 'DEN', 'LAX', 'DFW', 'SFO', 'PHX', 'LAS']
origen_valido = bd.Origin.isin(*aeropuertos)
destino_valido = bd.Dest.isin(*aeropuertos)
bd2 = bd.filter(origen_valido & destino_valido)

# Make the filtered frame queryable from Spark SQL under the name "bd2".
sqlContext.registerDataFrameAsTable(bd2, "bd2")

In [5]:
# Keep only rows where both the Cancelled and the Diverted flags equal 0.
no_cancelado = bd2.Cancelled == 0
no_desviado = bd2.Diverted == 0
bd3 = bd2.filter(no_cancelado & no_desviado)

In [6]:
# Replace nulls with 0 in the five delay-cause columns only; nulls in any
# other column are left untouched.
# Alternatives that were considered and not used: filling every column with 0
# (na.fill(0)) or dropping every row that contains a null (na.drop()).
columnas_causa = [
    'CarrierDelay',
    'WeatherDelay',
    'NASDelay',
    'SecurityDelay',
    'LateAircraftDelay',
]
bd4 = bd3.na.fill(dict.fromkeys(columnas_causa, 0))

## Transformación y Creación de nuevas variables

In [7]:
from pyspark.sql.functions import log

# LogD: base-10 logarithm of the Distance column.
distancia = bd4['Distance']
bd4 = bd4.withColumn('LogD', log(10.0, distancia))

In [8]:
# Retraso: the boolean test "ArrDelay >= 15" cast to an integer flag
# (1 when the test holds, 0 when it does not).
llegada_tardia = bd4['ArrDelay'] >= 15
bd4 = bd4.withColumn('Retraso', llegada_tardia.cast('int'))

In [9]:
# RetrasoNeto: arrival delay minus departure delay.
diferencia_retraso = bd4['ArrDelay'] - bd4['DepDelay']
bd4 = bd4.withColumn('RetrasoNeto', diferencia_retraso)

In [10]:
# Show the (column name, type) pairs of bd4, which now includes the derived
# columns LogD, Retraso and RetrasoNeto.
bd4.dtypes

[('Year', 'int'),
 ('Month', 'int'),
 ('DayofMonth', 'int'),
 ('DayOfWeek', 'int'),
 ('CRSDepTime', 'int'),
 ('UniqueCarrier', 'string'),
 ('TailNum', 'string'),
 ('ArrDelay', 'double'),
 ('DepDelay', 'double'),
 ('Origin', 'string'),
 ('Dest', 'string'),
 ('Distance', 'double'),
 ('Cancelled', 'double'),
 ('Diverted', 'double'),
 ('CarrierDelay', 'double'),
 ('WeatherDelay', 'double'),
 ('NASDelay', 'double'),
 ('SecurityDelay', 'double'),
 ('LateAircraftDelay', 'double'),
 ('LogD', 'double'),
 ('Retraso', 'int'),
 ('RetrasoNeto', 'double')]

In [11]:
# Print count / mean / stddev / min / max for the arrival delay and the
# three derived columns.
resumen = bd4.describe('ArrDelay', 'LogD', 'Retraso', 'RetrasoNeto')
resumen.show()

+-------+------------------+-------------------+------------------+------------------+
|summary|          ArrDelay|               LogD|           Retraso|       RetrasoNeto|
+-------+------------------+-------------------+------------------+------------------+
|  count|             30466|              30466|             30466|             30466|
|   mean|13.300695857677411|  2.886007454172114|0.2757500164117377|-3.919680955819602|
| stddev| 53.50946738163752|0.28179267734674995| 0.446898758463185|15.560721063721338|
|    min|             -53.0| 2.3729120029701067|                 0|             -48.0|
|    max|            2028.0|  3.330210784571528|                 1|             229.0|
+-------+------------------+-------------------+------------------+------------------+



In [12]:
# Same summary statistics, collected to the driver as a pandas DataFrame so
# the notebook renders it as a table.
columnas_resumen = ['ArrDelay', 'LogD', 'Retraso', 'RetrasoNeto']
bd4.describe(*columnas_resumen).toPandas()

Unnamed: 0,summary,ArrDelay,LogD,Retraso,RetrasoNeto
0,count,30466.0,30466.0,30466.0,30466.0
1,mean,13.300695857677413,2.886007454172114,0.2757500164117377,-3.919680955819602
2,stddev,53.50946738163752,0.2817926773467499,0.446898758463185,15.560721063721338
3,min,-53.0,2.3729120029701067,0.0,-48.0
4,max,2028.0,3.330210784571528,1.0,229.0


In [13]:
# Make bd4 queryable from Spark SQL under the name "bd4".
sqlContext.registerDataFrameAsTable(bd4, "bd4")

# Horario: band derived from CRSDepTime (presumably an HHMM integer - confirm).
#   1 -> CRSDepTime <= 800
#   2 -> 800  < CRSDepTime <= 1200
#   3 -> 1200 < CRSDepTime <= 1600
#   4 -> 1600 < CRSDepTime <= 2100
# Anything that matches none of the branches (values above 2100, and NULLs)
# falls through to the else and is also labelled 1.
consulta_horario = "select *, case \
               when CRSDepTime <= 800 then 1 \
               when 800 < CRSDepTime and CRSDepTime <= 1200 then 2 \
               when 1200 < CRSDepTime and CRSDepTime <= 1600 then 3 \
               when 1600 < CRSDepTime and CRSDepTime <= 2100 then 4 \
               else 1 end as Horario \
               from bd4"
bd5 = sqlContext.sql(consulta_horario)

In [15]:
# Summary statistics of the new Horario column.
horario = bd5.select('Horario')
horario.describe().show()

+-------+------------------+
|summary|           Horario|
+-------+------------------+
|  count|             30466|
|   mean| 2.539322523468785|
| stddev|1.1369016091531152|
|    min|                 1|
|    max|                 4|
+-------+------------------+



In [16]:
# Number of flights in each Horario band (row order is not sorted).
conteo_por_horario = bd5.select('Horario').groupBy('Horario').count()
conteo_por_horario.show()

+-------+-----+
|Horario|count|
+-------+-----+
|      1| 7519|
|      3| 7180|
|      4| 8385|
|      2| 7382|
+-------+-----+



## Exportar la base de datos a HDFS

In [21]:
# Write bd5 as CSV with a header row, replacing any existing output at
# the 'bd5.csv' path.
(
    bd5.write
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .mode('overwrite')
    .save('bd5.csv')
)
