In [1]:
# Installing required packages
!pip install pyspark
!pip install findspark



In [2]:
import findspark
findspark.init()
import datetime

In [3]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, BooleanType

In [4]:
# Criar o contexto do spark
sc = SparkContext()

# Instancia o criador de sessao do spark
spark = (SparkSession.builder
                     .master("local[7]")
                     .appName("Semana 3 - Desafio Transformação"))

In [5]:
schema_airports = StructType([
    StructField("faa",  StringType(),  True),
    StructField("name", StringType(),  True),
    StructField("lat",  FloatType(),   True),
    StructField("lon",  FloatType(),   True),
    StructField("alt",  IntegerType(), True),
    StructField("tz",   IntegerType(), True),
    StructField("dst",  StringType(),  True)
])

schema_planes = StructType([
    StructField("tailnum",      StringType(),  True),
    StructField("year",         IntegerType(), True),
    StructField("type",         StringType(),  True),
    StructField("manufacturer", StringType(),  True),
    StructField("model",        StringType(),  True),
    StructField("engines",      IntegerType(), True),
    StructField("seats",        IntegerType(), True),
    StructField("speed",        IntegerType(), True),
    StructField("engine",       StringType(),  True)
])

schema_flights = StructType([
    StructField("year",      IntegerType(), True),
    StructField("month",     IntegerType(), True),
    StructField("day",       IntegerType(), True),
    StructField("dep_time",  StringType(),  True),
    StructField("dep_delay", IntegerType(), True),
    StructField("arr_time",  StringType(),  True),
    StructField("arr_delay", IntegerType(), True),
    StructField("carrier",   StringType(),  True),
    StructField("tailnum",   StringType(),  True),
    StructField("flight",    StringType(),  True),
    StructField("origin",    StringType(),  True),
    StructField("dest",      StringType(),  True),
    StructField("air_time",  IntegerType(), True),
    StructField("distance",  IntegerType(), True),
    StructField("hour",      IntegerType(), True),
    StructField("minute",    IntegerType(), True),
])

In [6]:
df_airports = (spark.getOrCreate().read
                  .format("csv")
                  .option("header", "true")
                  .schema(schema_airports)
                  .load("C:/Users/amarti40/OneDrive - Capgemini/Desktop/ACELERAÇÃO PYSPARK/Data/airports.csv"))

df_planes = (spark.getOrCreate().read
                  .format("csv")
                  .option("header", "true")
                  .schema(schema_planes)
                  .load("C:/Users/amarti40/OneDrive - Capgemini/Desktop/ACELERAÇÃO PYSPARK/Data/planes.csv"))

df_flights = (spark.getOrCreate().read
                  .format("csv")
                  .option("header", "true")
                  .schema(schema_flights)
                  .load("C:/Users/amarti40/OneDrive - Capgemini/Desktop/ACELERAÇÃO PYSPARK/Data/flights.csv"))

df_airports.show(5)
df_planes.show(5)
df_flights.show(5)

+---+--------------------+---------+---------+----+---+---+
|faa|                name|      lat|      lon| alt| tz|dst|
+---+--------------------+---------+---------+----+---+---+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|
|06N|     Randall Airport| 41.43191|-74.39156| 523| -5|  A|
|09J|Jekyll Island Air...|31.074472|-81.42778|  11| -4|  A|
+---+--------------------+---------+---------+----+---+---+
only showing top 5 rows

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      

In [7]:
# Criacao das visões temporarias
df_airports.createOrReplaceTempView('airports')
df_planes.createOrReplaceTempView('planes')
df_flights.createOrReplaceTempView('flights')

# Functions

In [8]:
def CriaVw(df):
    return df.createOrReplaceTempView("Data")

def Consolida_SQL(_col):
    return spark.getOrCreate().sql(f"Select {_col}, count(*) from Data Group By {_col} order by 1").show()
    
def Consolida(_col, df):
    CriaVw(df)
    return Consolida_SQL(_col)

# Flights

## Pergunta 1

In [9]:
df_flights_Final = df_flights.withColumn('hour',  (
                                                    F.when(F.col('hour').isNull(), 0)
                                                    .otherwise(F.col('hour'))
                                                    )
                                          )

df_flights_Final = df_flights_Final.withColumn('minute',  (
                                                            F.when(F.col('minute').isNull(), 0)
                                                            .otherwise(F.col('minute'))
                                                          )
                                              )

df_flights_Final.groupBy(F.col("hour")).count().distinct().orderBy(F.col("hour")).show(100)
df_flights_Final.groupBy(F.col("minute")).count().distinct().orderBy(F.col("minute")).show(100)

+----+-----+
|hour|count|
+----+-----+
|   0|  137|
|   1|   17|
|   2|    4|
|   5|  431|
|   6|  899|
|   7|  709|
|   8|  659|
|   9|  456|
|  10|  803|
|  11|  723|
|  12|  539|
|  13|  653|
|  14|  540|
|  15|  468|
|  16|  388|
|  17|  394|
|  18|  570|
|  19|  409|
|  20|  354|
|  21|  281|
|  22|  314|
|  23|  251|
|  24|    1|
+----+-----+

+------+-----+
|minute|count|
+------+-----+
|     0|  226|
|     1|  184|
|     2|  173|
|     3|  161|
|     4|  157|
|     5|  167|
|     6|  157|
|     7|  153|
|     8|  153|
|     9|  183|
|    10|  160|
|    11|  161|
|    12|  161|
|    13|  150|
|    14|  136|
|    15|  162|
|    16|  152|
|    17|  155|
|    18|  130|
|    19|  155|
|    20|  163|
|    21|  139|
|    22|  156|
|    23|  142|
|    24|  177|
|    25|  188|
|    26|  154|
|    27|  172|
|    28|  159|
|    29|  180|
|    30|  155|
|    31|  153|
|    32|  149|
|    33|  173|
|    34|  125|
|    35|  154|
|    36|  133|
|    37|  135|
|    38|  150|
|    39|  155|
|  

## Pergunta 2

In [10]:
df_flights_Final = df_flights_Final.withColumn('hour',  (
                                                    F.when(F.col('hour') > 23, 0)
                                                    .otherwise(F.col('hour'))
                                                    )
                                          )

df_flights_Final.groupBy(F.col("hour")).count().distinct().orderBy(F.col("hour")).show(100)

+----+-----+
|hour|count|
+----+-----+
|   0|  138|
|   1|   17|
|   2|    4|
|   5|  431|
|   6|  899|
|   7|  709|
|   8|  659|
|   9|  456|
|  10|  803|
|  11|  723|
|  12|  539|
|  13|  653|
|  14|  540|
|  15|  468|
|  16|  388|
|  17|  394|
|  18|  570|
|  19|  409|
|  20|  354|
|  21|  281|
|  22|  314|
|  23|  251|
+----+-----+



## Pergunta 3

In [11]:
df_flights_Final = df_flights_Final.withColumn('dep_datetime',  (
                                                    F.when((F.col('Year').isNotNull()) 
                                                            & (F.col('month').isNotNull()) 
                                                            & (F.col('day').isNotNull()) 
                                                            & (F.col('hour').isNotNull()) 
                                                            & (F.col('minute').isNotNull())
                                                           ,F.to_timestamp(F.concat(F.col('Year'),F.lit("-"),F.col('month'),F.lit("-"),F.col('day'),F.lit(" "),F.col('hour'),F.lit(":"),F.col('minute'),F.lit(":00"))))
                                                            .otherwise(None)
                                                    )
                                          )

df_flights_Final.groupBy(F.col("year"),F.col("month"),F.col("day"),F.col("hour"),F.col("minute"),F.col("dep_datetime")).count().distinct().orderBy(F.col("dep_datetime")).show()
df_flights_Final.printSchema()


# F.lit(f"{F.col('year')}-{F.col('month'):02d}-{F.col('day'):02d} {F.col('hour'):02d}:{F.col('minute'):02d}:00".strftime("...."))

+----+-----+---+----+------+-------------------+-----+
|year|month|day|hour|minute|       dep_datetime|count|
+----+-----+---+----+------+-------------------+-----+
|2014|    1|  1|   0|     0|2014-01-01 00:00:00|    1|
|2014|    1|  1|   5|    50|2014-01-01 05:50:00|    1|
|2014|    1|  1|   6|     0|2014-01-01 06:00:00|    1|
|2014|    1|  1|   6|    17|2014-01-01 06:17:00|    1|
|2014|    1|  1|   6|    19|2014-01-01 06:19:00|    1|
|2014|    1|  1|   6|    22|2014-01-01 06:22:00|    1|
|2014|    1|  1|   6|    39|2014-01-01 06:39:00|    1|
|2014|    1|  1|   7|     1|2014-01-01 07:01:00|    1|
|2014|    1|  1|   7|    13|2014-01-01 07:13:00|    1|
|2014|    1|  1|   8|     2|2014-01-01 08:02:00|    1|
|2014|    1|  1|   8|    17|2014-01-01 08:17:00|    1|
|2014|    1|  1|   8|    19|2014-01-01 08:19:00|    1|
|2014|    1|  1|   8|    25|2014-01-01 08:25:00|    1|
|2014|    1|  1|   8|    33|2014-01-01 08:33:00|    1|
|2014|    1|  1|  11|    23|2014-01-01 11:23:00|    1|
|2014|    

## Pergunta 4

In [12]:
df_flights_Final = df_flights_Final.withColumn('dep_time',  (
                                                             F.when((F.col('hour').isNotNull()) 
                                                                    & (F.col('minute').isNotNull())
                                                                    & ((F.col('dep_time').isNull())
                                                                       |(F.col('dep_time') == 'NA'))
                                                                    ,F.concat(F.lpad(F.col('hour'),2,'0'),F.lpad(F.col('minute'),2,'0')))
                                                             .otherwise(F.col('dep_time'))
                                                         )
                                                )


df_flights_Final.groupBy(F.col('hour'), F.col('minute'), F.col('dep_time')).count().distinct().orderBy("dep_time").show()

+----+------+--------+-----+
|hour|minute|dep_time|count|
+----+------+--------+-----+
|   0|     0|    0000|   48|
|   0|     1|       1|    6|
|  10|     0|    1000|    8|
|  10|     1|    1001|   16|
|  10|     2|    1002|    5|
|  10|     3|    1003|   12|
|  10|     4|    1004|    9|
|  10|     5|    1005|    6|
|  10|     6|    1006|   17|
|  10|     7|    1007|   10|
|  10|     8|    1008|    9|
|  10|     9|    1009|   18|
|   1|     1|     101|    1|
|  10|    10|    1010|   12|
|  10|    11|    1011|   10|
|  10|    12|    1012|   18|
|  10|    13|    1013|   12|
|  10|    14|    1014|   13|
|  10|    15|    1015|   17|
|  10|    16|    1016|   15|
+----+------+--------+-----+
only showing top 20 rows



## Pergunta 5

In [13]:
df_flights_Final = df_flights_Final.withColumn('dep_delay',  (
                                                            F.when(F.col('dep_delay').isNull(), 0)
                                                            .otherwise(F.col('dep_delay'))
                                                          )
                                              )

df_flights_Final.groupBy(F.col("dep_delay")).count().distinct().orderBy(F.col("dep_delay")).show()

+---------+-----+
|dep_delay|count|
+---------+-----+
|      -19|    1|
|      -18|    3|
|      -17|    4|
|      -16|    5|
|      -15|   10|
|      -14|   14|
|      -13|   37|
|      -12|   63|
|      -11|   80|
|      -10|  196|
|       -9|  208|
|       -8|  319|
|       -7|  471|
|       -6|  625|
|       -5|  742|
|       -4|  838|
|       -3|  852|
|       -2|  760|
|       -1|  666|
|        0|  646|
+---------+-----+
only showing top 20 rows



## Pergunta 6

In [14]:
df_flights_Final = df_flights_Final.withColumn('arr_delay',  (
                                                            F.when(F.col('arr_delay').isNull(), 0)
                                                            .otherwise(F.col('arr_delay'))
                                                          )
                                              )

df_flights_Final.groupBy(F.col("arr_delay")).count().distinct().orderBy(F.col("arr_delay")).show(55)

+---------+-----+
|arr_delay|count|
+---------+-----+
|      -58|    1|
|      -53|    1|
|      -50|    1|
|      -48|    1|
|      -47|    2|
|      -46|    1|
|      -45|    2|
|      -44|    2|
|      -43|    2|
|      -42|    6|
|      -41|    4|
|      -40|    7|
|      -39|    3|
|      -38|   14|
|      -37|    7|
|      -36|   15|
|      -35|   14|
|      -34|   17|
|      -33|   14|
|      -32|   17|
|      -31|   34|
|      -30|   27|
|      -29|   35|
|      -28|   43|
|      -27|   60|
|      -26|   66|
|      -25|   49|
|      -24|   82|
|      -23|   83|
|      -22|  101|
|      -21|  109|
|      -20|  127|
|      -19|  141|
|      -18|  163|
|      -17|  188|
|      -16|  179|
|      -15|  202|
|      -14|  257|
|      -13|  270|
|      -12|  285|
|      -11|  288|
|      -10|  254|
|       -9|  300|
|       -8|  316|
|       -7|  330|
|       -6|  323|
|       -5|  331|
|       -4|  314|
|       -3|  295|
|       -2|  279|
|       -1|  286|
|        0|  327|
|        1

## Pergunta 7

In [15]:
df_flights_Final = df_flights_Final.drop("year", "month", "day", "hour", "minute")
df_flights_Final.printSchema()

root
 |-- dep_time: string (nullable = true)
 |-- dep_delay: integer (nullable = true)
 |-- arr_time: string (nullable = true)
 |-- arr_delay: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- tailnum: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- air_time: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- dep_datetime: timestamp (nullable = true)



## Pergunta 8

In [16]:
df_flights_Final = df_flights_Final.withColumn('air_time_projected', (
                                                                F.when((F.col("distance").isNotNull()) , (F.col("distance")*0.1+20).cast('int'))
                                                                .otherwise(None)
                                                             )
                                            )

df_flights_Final.groupBy("air_time_projected").count().distinct().orderBy("air_time_projected").show()
df_flights_Final.printSchema()

+------------------+-----+
|air_time_projected|count|
+------------------+-----+
|                29|    5|
|                30|   41|
|                31|   54|
|                32|  301|
|                42|  105|
|                44|   10|
|                54|    7|
|                64|   13|
|                67|   93|
|                74|  121|
|                75|  305|
|                76|  164|
|                80|  190|
|                83|  171|
|                87|  695|
|                88|  302|
|                89|  213|
|                94|   47|
|                96|  156|
|                98|   23|
+------------------+-----+
only showing top 20 rows

root
 |-- dep_time: string (nullable = true)
 |-- dep_delay: integer (nullable = true)
 |-- arr_time: string (nullable = true)
 |-- arr_delay: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- tailnum: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- origin: string (nullable = true)
 |-

## Pergunta 9

In [17]:
# criando df auxiliar com media dos coos com mesma origem e destino
df_aux_avg = (df_flights_Final.select(F.col('origin'), F.col('dest'), F.col('air_time'))
                             .groupBy(F.col('origin'), F.col('dest'))
                             .agg(F.avg(F.col("air_time")).cast('int').alias("air_time_expected"))
                             .orderBy(F.col('origin'), F.col('dest'))
                             .withColumnRenamed("origin","origin2")
                             .withColumnRenamed("dest","dest2")
             )

df_flights_Final = (df_flights_Final.join(df_aux_avg, 
                                         (df_flights_Final.origin ==  df_aux_avg.origin2)
                                          & (df_flights_Final.dest ==  df_aux_avg.dest2)
                                        ,"left")
                                     .drop("origin2", "dest2")
                   )

df_flights_Final.groupBy("air_time_expected").count().distinct().orderBy("air_time_expected").show()
df_flights_Final.printSchema()

+-----------------+-----+
|air_time_expected|count|
+-----------------+-----+
|               22|    5|
|               27|   41|
|               29|   54|
|               32|  157|
|               34|  144|
|               37|  105|
|               49|    7|
|               53|   10|
|               59|   13|
|               68|   93|
|               74|    8|
|               81|  121|
|               82|  361|
|               85|  461|
|               88|  225|
|               98|  124|
|               99|  213|
|              101|  482|
|              103|  213|
|              105|  179|
+-----------------+-----+
only showing top 20 rows

root
 |-- dep_time: string (nullable = true)
 |-- dep_delay: integer (nullable = true)
 |-- arr_time: string (nullable = true)
 |-- arr_delay: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- tailnum: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- dest: string (nullable

## Pergunta 10

In [18]:
df_flights_Final = df_flights_Final.withColumn('air_time', (
                                                                F.when((F.col("air_time").isNull()) 
                                                                       & (F.col("air_time_projected") >= F.col("air_time_expected")) 
                                                                ,F.col("air_time_projected"))
                                                                .otherwise(F.col("air_time_expected"))
                                                             )
                                            )

df_flights_Final.groupBy("air_time", F.col("air_time_projected"), F.col("air_time_expected")).count().distinct().orderBy("air_time").show()

+--------+------------------+-----------------+-----+
|air_time|air_time_projected|air_time_expected|count|
+--------+------------------+-----------------+-----+
|      22|                29|               22|    5|
|      27|                30|               27|   39|
|      29|                31|               29|   54|
|      30|                30|               27|    2|
|      32|                32|               32|  157|
|      34|                32|               34|  144|
|      37|                42|               37|  105|
|      49|                54|               49|    7|
|      53|                44|               53|   10|
|      59|                64|               59|   13|
|      68|                67|               68|   93|
|      74|                76|               74|    8|
|      81|                74|               81|  121|
|      82|                80|               82|  190|
|      82|                83|               82|  171|
|      85|                76

# Faltou transformar campos dep_time e air_time em horas

## Pergunta 11

In [19]:
df_flights_Final = df_flights_Final.withColumn('arr_time', (
                                                                F.when(((F.col("arr_time").isNull())
                                                                        | (F.col("arr_time")=="NA"))
                                                                       &(F.col("dep_time").isNotNull())
                                                                       &(F.col("air_time").isNotNull())
                                                                , (F.col("dep_time")+F.col("air_time")).cast('int'))
                                                                .otherwise(F.col("arr_time").cast('int'))
                                                             )
                                            )

df_flights_Final.groupBy("arr_time").count().distinct().orderBy(F.col("arr_time").desc()).show()

+--------+-----+
|arr_time|count|
+--------+-----+
|    2400|    5|
|    2376|    1|
|    2359|    6|
|    2358|    4|
|    2357|    5|
|    2356|    7|
|    2355|   10|
|    2354|    5|
|    2353|    4|
|    2352|    7|
|    2351|    8|
|    2350|    7|
|    2349|    4|
|    2348|    6|
|    2347|   10|
|    2346|    6|
|    2345|    3|
|    2344|    5|
|    2343|    3|
|    2342|    9|
+--------+-----+
only showing top 20 rows



## Pergunta 12

In [20]:
df_flights_Final = df_flights_Final.withColumn('haul_duration', (
                                                                F.when((F.col("air_time").isNotNull())
                                                                       &(F.col("air_time").between(20, 180)), "SHORT-HAUL")
                                                                .when((F.col("air_time").isNotNull())
                                                                       &(F.col("air_time").between(181, 360)), "MEDIUM-HAUL")
                                                                .when((F.col("air_time").isNotNull())
                                                                       &(F.col("air_time")>360), "LONG-HAUL")
                                                                .otherwise(None)
                                                             )
                                            )

df_flights_Final.groupBy("air_time", "haul_duration").count().distinct().orderBy(F.col("air_time").desc()).show()

+--------+-------------+-----+
|air_time|haul_duration|count|
+--------+-------------+-----+
|     357|  MEDIUM-HAUL|   12|
|     347|  MEDIUM-HAUL|   70|
|     343|  MEDIUM-HAUL|   86|
|     341|  MEDIUM-HAUL|    7|
|     339|  MEDIUM-HAUL|   63|
|     334|  MEDIUM-HAUL|   28|
|     329|  MEDIUM-HAUL|   57|
|     317|  MEDIUM-HAUL|   29|
|     308|  MEDIUM-HAUL|   19|
|     298|  MEDIUM-HAUL|   31|
|     293|  MEDIUM-HAUL|   11|
|     291|  MEDIUM-HAUL|   33|
|     286|  MEDIUM-HAUL|  142|
|     283|  MEDIUM-HAUL|    9|
|     281|  MEDIUM-HAUL|  148|
|     279|  MEDIUM-HAUL|   72|
|     277|  MEDIUM-HAUL|  122|
|     276|  MEDIUM-HAUL|    5|
|     269|  MEDIUM-HAUL|   51|
|     268|  MEDIUM-HAUL|   47|
+--------+-------------+-----+
only showing top 20 rows



## Pergunta 13

In [21]:
df_flights_Final = df_flights_Final.withColumn('dep_season', (                    
                                                                F.when((F.col("dep_datetime").isNotNull())
                                                                       &(F.col("dep_datetime").between(F.concat(F.lit(F.year(F.col('dep_datetime'))), F.lit("-12-21 21:48:00")), F.concat(F.lit(F.year(F.col('dep_datetime'))), F.lit("-12-31 23:59:00"))))
                                                                       |(F.col("dep_datetime").between(F.concat(F.lit(F.year(F.col('dep_datetime'))), F.lit("-01-01 00:00:00")), F.concat(F.lit(F.year(F.col('dep_datetime'))), F.lit("-03-20 15:33:00"))))
                                                                       , "WINTER")
                                                                .when((F.col("dep_datetime").isNotNull())
                                                                       &(F.col("dep_datetime").between(F.concat(F.lit(F.year(F.col('dep_datetime'))), F.lit("-03-20 15:34:00")), F.concat(F.lit(F.year(F.col('dep_datetime'))), F.lit("-06-21 10:14:00"))))
                                                                       , "SPRING")
                                                                .when((F.col("dep_datetime").isNotNull())
                                                                       &(F.col("dep_datetime").between(F.concat(F.lit(F.year(F.col('dep_datetime'))), F.lit("-06-21 10:15:00")), F.concat(F.lit(F.year(F.col('dep_datetime'))), F.lit("-09-23 02:04:00"))))
                                                                       , "SUMMER")
                                                                .when((F.col("dep_datetime").isNotNull())
                                                                       &(F.col("dep_datetime").between(F.concat(F.lit(F.year(F.col('dep_datetime'))), F.lit("-09-23 02:05:00")), F.concat(F.lit(F.year(F.col('dep_datetime'))), F.lit("-12-21 21:47:00"))))
                                                                       , "FALL")
                                                                                                                                
                                                                .otherwise(None)
                                                             )
                                            )

(df_flights_Final.groupBy('dep_season')
                .agg(F.min('dep_datetime'),
                     F.max('dep_datetime'),
                     F.count('dep_datetime')).orderBy('dep_season').show())

# df_flights_Final.select("dep_datetime", F.year("dep_datetime"), F.month("dep_datetime"), F.dayofmonth("dep_datetime"), F.hour("dep_datetime") , F.minute("dep_datetime"), "dep_season").write.options(header='True', delimiter=',').csv("C:/Users/amarti40/OneDrive - Capgemini/Desktop/ACELERAÇÃO PYSPARK/Semana 3 - Desafio Transformação/Tranformacao_flights_teste.csv")


+----------+-------------------+-------------------+-------------------+
|dep_season|  min(dep_datetime)|  max(dep_datetime)|count(dep_datetime)|
+----------+-------------------+-------------------+-------------------+
|      FALL|2014-09-23 05:13:00|2014-12-21 18:34:00|               2373|
|    SPRING|2014-03-20 15:38:00|2014-06-21 10:05:00|               2560|
|    SUMMER|2014-06-21 11:02:00|2014-09-22 23:26:00|               2918|
|    WINTER|2014-01-01 00:00:00|2014-12-31 22:57:00|               2149|
+----------+-------------------+-------------------+-------------------+



## Pergunta 14

In [22]:
df_flights_Final = df_flights_Final.withColumn('dep_delay_category', (
                                                                F.when((F.col("dep_delay").isNotNull())
                                                                       &(F.col("dep_delay")<0), "ANTECIPATED")
                                                                .when((F.col("dep_delay").isNotNull())
                                                                       &(F.col("dep_delay")==0), "INTIME")
                                                                .when((F.col("dep_delay").isNotNull())
                                                                       &((F.col("dep_delay")>0) & (F.col("dep_delay")<60)), "MINOR")
                                                                .when((F.col("dep_delay").isNotNull())
                                                                       &(F.col("dep_delay")>=60), "MAJOR")
                                                                .otherwise(None)
                                                             )
                                            )

df_flights_Final.groupBy("dep_delay", "dep_delay_category").count().distinct().orderBy(F.col("dep_delay").desc()).show()
(df_flights_Final.groupBy('dep_delay_category').agg(F.count('dep_delay_category')).orderBy('dep_delay_category').show())


+---------+------------------+-----+
|dep_delay|dep_delay_category|count|
+---------+------------------+-----+
|      886|             MAJOR|    1|
|      739|             MAJOR|    1|
|      667|             MAJOR|    1|
|      385|             MAJOR|    1|
|      370|             MAJOR|    1|
|      352|             MAJOR|    1|
|      328|             MAJOR|    1|
|      310|             MAJOR|    1|
|      302|             MAJOR|    1|
|      274|             MAJOR|    2|
|      273|             MAJOR|    1|
|      271|             MAJOR|    1|
|      266|             MAJOR|    1|
|      249|             MAJOR|    1|
|      229|             MAJOR|    1|
|      226|             MAJOR|    2|
|      223|             MAJOR|    1|
|      222|             MAJOR|    2|
|      214|             MAJOR|    1|
|      213|             MAJOR|    1|
+---------+------------------+-----+
only showing top 20 rows

+------------------+-------------------------+
|dep_delay_category|count(dep_delay_cat

In [23]:
df_flights_Final.write.parquet("C:/Users/amarti40/OneDrive - Capgemini/Desktop/ACELERAÇÃO PYSPARK/Semana 3 - Desafio Transformação/Outputs_Flights_Transformacao.parquet")