In [118]:
# Installing required packages
!pip install pyspark
!pip install findspark



In [119]:
import findspark
findspark.init()
import datetime
import pandas as pd

In [120]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, BooleanType

In [121]:
# Criar o contexto do spark
sc = SparkContext()

# Instancia o criador de sessao do spark
spark = (SparkSession.builder
                     .master("local[5]")
                     .appName("Semana 4 - Desafio Relatórios"))

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-shell, master=local[*]) created by __init__ at C:\Users\amarti40\AppData\Local\Temp/ipykernel_14196/1377807283.py:2 

In [None]:
df_airports=spark.getOrCreate().read.parquet("C:/Users/amarti40/OneDrive - Capgemini/Desktop/ACELERAÇÃO PYSPARK/Semana 3 - Desafio Transformação/Outputs_Airports_Transformacao.parquet")
df_planes=spark.getOrCreate().read.parquet("C:/Users/amarti40/OneDrive - Capgemini/Desktop/ACELERAÇÃO PYSPARK/Semana 3 - Desafio Transformação/Outputs_Planes_Transformacao.parquet")
df_flights=spark.getOrCreate().read.parquet("C:/Users/amarti40/OneDrive - Capgemini/Desktop/ACELERAÇÃO PYSPARK/Semana 3 - Desafio Transformação/Outputs_Flights_Transformacao.parquet")

In [None]:
print(df_airports.count())
print(df_planes.count())
print(df_flights.count())

In [None]:
df_airports_origin = df_airports.select([F.col(c).alias(c+"_air_origin") for c in df_airports.columns])
df_airports_dest = df_airports.select([F.col(c).alias(c+"_air_dest") for c in df_airports.columns])

df_planes = df_planes.select([F.col(c).alias(c+"_pl") for c in df_planes.columns])
df_flights = df_flights.select([F.col(c).alias(c+"_fl") for c in df_flights.columns])

df_airports_origin.printSchema()
df_airports_dest.printSchema()
df_planes.printSchema()
df_flights.printSchema()

# Perguntas para negócio

## Pergunta 1

In [None]:
 df_final = (df_flights.join(df_planes, df_planes.tailnum_pl == df_flights.tailnum_fl
                             ,"left" 
                            )                                             
                       .join(df_airports_origin, df_airports_origin.faa_air_origin == df_flights.origin_fl
                             ,"left"
                            )
                       .join(df_airports_dest, df_airports_dest.faa_air_dest == df_flights.dest_fl
                             ,"left"
                            )          
            )

df_final.printSchema()

print(df_final.count())


In [None]:
df_final.createOrReplaceTempView('db_final')
df_airports.createOrReplaceTempView('db_airports')
df_planes.createOrReplaceTempView('db_planes')
df_flights.createOrReplaceTempView('db_flights')

## Pergunta 2

In [42]:
spark.getOrCreate().sql(f"""
SELECT  region
        ,Count(Faa) as Numero_Aeroportos
FROM db_airports
group by region
""").show()

+-------------+-----------------+
|       region|Numero_Aeroportos|
+-------------+-----------------+
|       ALASKA|              243|
|     OFFSHORE|               22|
|MAINLAND-EAST|              696|
|MAINLAND-WEST|              436|
+-------------+-----------------+



In [43]:
(df_airports.groupBy("region")
            .agg(F.count('Faa').alias('Count_Name')).distinct()
            .select('region', 'Count_Name').show())

+-------------+----------+
|       region|Count_Name|
+-------------+----------+
|       ALASKA|       243|
|     OFFSHORE|        22|
|MAINLAND-EAST|       696|
|MAINLAND-WEST|       436|
+-------------+----------+



## Pergunta 3

In [31]:
spark.getOrCreate().sql(f"""
SELECT MAX(ABS(alt_air_origin - alt_air_dest)) AS Dif_Altitude_Ori_Dest
FROM db_final
""").show(5)

+---------------------+
|Dif_Altitude_Ori_Dest|
+---------------------+
|                 6169|
+---------------------+



In [67]:
(df_final.select(F.max(F.abs(F.col("alt_air_origin")-F.col("alt_air_dest"))).alias("Dif_Altitude_Ori_Dest"))
         .show())

+---------------------+
|Dif_Altitude_Ori_Dest|
+---------------------+
|                 6169|
+---------------------+



## Pergunta 4

In [43]:
spark.getOrCreate().sql(f"""
SELECT Round(AVG(arr_delay_fl),2) AS Atraso_Medio_Arrive
FROM db_final
WHERE arr_delay_fl > 0 
""").show()

+------------+
|Atraso_Medio|
+------------+
|       24.65|
+------------+



In [66]:
(df_final.filter(F.col("arr_delay_fl") > 0)
         .select(F.round(F.avg(F.col("arr_delay_fl")),2).alias("Atraso_Medio"))
         .show())

+------------+
|Atraso_Medio|
+------------+
|       24.65|
+------------+



## Pergunta 5

In [49]:
spark.getOrCreate().sql(f"""
SELECT  region_air_dest AS Regiao_Destino,
        Round(AVG(arr_delay_fl),2) AS Atraso_Medio_Arrive
FROM db_final
WHERE arr_delay_fl > 0 
GROUP BY region_air_dest
""").show()

+--------------+-------------------+
|Regiao_Destino|Atraso_Medio_Arrive|
+--------------+-------------------+
|        ALASKA|              20.86|
|      OFFSHORE|              25.46|
| MAINLAND-EAST|              28.53|
| MAINLAND-WEST|              23.79|
+--------------+-------------------+



In [54]:
(df_final.filter(F.col("arr_delay_fl") > 0)
         .groupBy("region_air_dest")
         .agg(F.round(F.avg(F.col("arr_delay_fl")),2).alias("Atraso_Medio")).distinct()
         .select('region_air_dest', 'Atraso_Medio').show())

+---------------+------------+
|region_air_dest|Atraso_Medio|
+---------------+------------+
|         ALASKA|       20.86|
|       OFFSHORE|       25.46|
|  MAINLAND-EAST|       28.53|
|  MAINLAND-WEST|       23.79|
+---------------+------------+



## Pergunta 6

In [47]:
spark.getOrCreate().sql(f"""
SELECT  YEAR(dep_datetime_fl) AS Year,
        Round(SUM(arr_delay_fl),2) AS Atraso_Acumulado_Arrive
FROM db_final
WHERE arr_delay_fl > 0 
GROUP BY YEAR(dep_datetime_fl)
""").show()

+----+-----------------------+
|Year|Atraso_Acumulado_Arrive|
+----+-----------------------+
|2014|                  91820|
+----+-----------------------+



In [65]:
(df_final.filter(F.col("arr_delay_fl") > 0)
         .groupBy(F.year("dep_datetime_fl").alias("Year"))
         .agg(F.round(F.sum(F.col("arr_delay_fl")),2).alias("Atraso_Acumulado_Arrive"))
         .show())

+----+-----------------------+
|Year|Atraso_Acumulado_Arrive|
+----+-----------------------+
|2014|                  91820|
+----+-----------------------+



## Pergunta 7

In [50]:
spark.getOrCreate().sql(f"""
SELECT  YEAR(dep_datetime_fl) AS Year,
        region_air_dest AS Regiao_Destino,
        Round(SUM(arr_delay_fl),2) AS Atraso_Acumulado_Arrive
FROM db_final
WHERE arr_delay_fl > 0 
GROUP BY YEAR(dep_datetime_fl),
         region_air_dest
""").show()

+----+--------------+-----------------------+
|Year|Regiao_Destino|Atraso_Acumulado_Arrive|
+----+--------------+-----------------------+
|2014|        ALASKA|                   5737|
|2014| MAINLAND-EAST|                  22938|
|2014|      OFFSHORE|                   2903|
|2014| MAINLAND-WEST|                  60242|
+----+--------------+-----------------------+



In [69]:
(df_final.filter(F.col("arr_delay_fl") > 0)
         .groupBy(F.year("dep_datetime_fl").alias("Year"), F.col("region_air_dest").alias("Regiao_Destino"))
         .agg(F.round(F.sum(F.col("arr_delay_fl")),2).alias("Atraso_Acumulado_Arrive"))
         .show())

+----+--------------+-----------------------+
|Year|Regiao_Destino|Atraso_Acumulado_Arrive|
+----+--------------+-----------------------+
|2014|        ALASKA|                   5737|
|2014| MAINLAND-EAST|                  22938|
|2014|      OFFSHORE|                   2903|
|2014| MAINLAND-WEST|                  60242|
+----+--------------+-----------------------+



## Pergunta 8

In [56]:
spark.getOrCreate().sql(f"""
SELECT  Round(AVG(air_time_fl),2) AS Tempo_Medio_Voo
FROM db_final
""").show()

+---------------+
|Tempo_Medio_Voo|
+---------------+
|         152.32|
+---------------+



In [70]:
(df_final.agg(F.round(F.avg(F.col("air_time_fl")),2).alias("Tempo_Medio_Voo"))
         .show())

+---------------+
|Tempo_Medio_Voo|
+---------------+
|         152.32|
+---------------+



## Pergunta 9

In [57]:
spark.getOrCreate().sql(f"""
SELECT  region_air_dest AS Regiao_Destino,
        Round(AVG(air_time_fl),2) AS Tempo_Medio_Voo
FROM db_final
GROUP BY region_air_dest
""").show()

+--------------+---------------+
|Regiao_Destino|Tempo_Medio_Voo|
+--------------+---------------+
|        ALASKA|          174.4|
|      OFFSHORE|         340.31|
| MAINLAND-EAST|         236.59|
| MAINLAND-WEST|         114.83|
+--------------+---------------+



In [71]:
(df_final.groupBy(F.col("region_air_dest").alias("Regiao_Destino"))
         .agg(F.round(F.avg(F.col("air_time_fl")),2).alias("Tempo_Medio_Voo"))
         .show())

+--------------+---------------+
|Regiao_Destino|Tempo_Medio_Voo|
+--------------+---------------+
|        ALASKA|          174.4|
|      OFFSHORE|         340.31|
| MAINLAND-EAST|         236.59|
| MAINLAND-WEST|         114.83|
+--------------+---------------+



## Pergunta 10

In [65]:
spark.getOrCreate().sql(f"""
SELECT  origin_fl AS Origem,
        dest_fl AS Destino,
        Round(AVG(air_time_fl),2) AS Tempo_Medio_Voo
FROM db_final
GROUP BY origin_fl,
         dest_fl
ORDER BY 1, 2
""").show(150)

+------+-------+---------------+
|Origem|Destino|Tempo_Medio_Voo|
+------+-------+---------------+
|   PDX|    ABQ|          136.0|
|   PDX|    ANC|          202.0|
|   PDX|    ATL|          247.0|
|   PDX|    AUS|          204.0|
|   PDX|    BOI|           49.0|
|   PDX|    BOS|          291.0|
|   PDX|    BUR|          108.0|
|   PDX|    BWI|          276.0|
|   PDX|    CLT|          257.0|
|   PDX|    DCA|          269.0|
|   PDX|    DEN|          123.0|
|   PDX|    DFW|          191.0|
|   PDX|    DTW|          221.0|
|   PDX|    EUG|          27.15|
|   PDX|    EWR|          281.0|
|   PDX|    FAI|          205.0|
|   PDX|    HNL|          329.0|
|   PDX|    HOU|          226.0|
|   PDX|    IAD|          267.0|
|   PDX|    IAH|          213.0|
|   PDX|    JFK|          286.0|
|   PDX|    KOA|          357.0|
|   PDX|    LAS|          105.0|
|   PDX|    LAX|          114.0|
|   PDX|    LGB|          116.0|
|   PDX|    LIH|          341.0|
|   PDX|    LMT|           53.0|
|   PDX|  

In [73]:
(df_final.groupBy(F.col("origin_fl").alias("Origem"), F.col("dest_fl").alias("Destino"))
         .agg(F.round(F.avg(F.col("air_time_fl")),2).alias("Tempo_Medio_Voo"))
         .orderBy("Origem", "Destino")
         .show(150))

+------+-------+---------------+
|Origem|Destino|Tempo_Medio_Voo|
+------+-------+---------------+
|   PDX|    ABQ|          136.0|
|   PDX|    ANC|          202.0|
|   PDX|    ATL|          247.0|
|   PDX|    AUS|          204.0|
|   PDX|    BOI|           49.0|
|   PDX|    BOS|          291.0|
|   PDX|    BUR|          108.0|
|   PDX|    BWI|          276.0|
|   PDX|    CLT|          257.0|
|   PDX|    DCA|          269.0|
|   PDX|    DEN|          123.0|
|   PDX|    DFW|          191.0|
|   PDX|    DTW|          221.0|
|   PDX|    EUG|          27.15|
|   PDX|    EWR|          281.0|
|   PDX|    FAI|          205.0|
|   PDX|    HNL|          329.0|
|   PDX|    HOU|          226.0|
|   PDX|    IAD|          267.0|
|   PDX|    IAH|          213.0|
|   PDX|    JFK|          286.0|
|   PDX|    KOA|          357.0|
|   PDX|    LAS|          105.0|
|   PDX|    LAX|          114.0|
|   PDX|    LGB|          116.0|
|   PDX|    LIH|          341.0|
|   PDX|    LMT|           53.0|
|   PDX|  

## Pergunta 11

In [67]:
spark.getOrCreate().sql(f"""
SELECT  YEAR(dep_datetime_fl) AS Year,
        Round(SUM(air_time_fl),2) AS Tempo_Acumulado_Voo
FROM db_final
GROUP BY YEAR(dep_datetime_fl)
ORDER BY 1, 2
""").show()

+----+-------------------+
|Year|Tempo_Acumulado_Voo|
+----+-------------------+
|2014|            1523228|
+----+-------------------+



In [74]:
(df_final.groupBy(F.year("dep_datetime_fl").alias("Year"))
         .agg(F.round(F.sum(F.col("air_time_fl")),2).alias("Tempo_Acumulado_Voo"))
         .show())

+----+-------------------+
|Year|Tempo_Acumulado_Voo|
+----+-------------------+
|2014|            1523228|
+----+-------------------+



## Pergunta 12

In [68]:
spark.getOrCreate().sql(f"""
SELECT  region_air_dest AS Regiao_Destino,
        Round(SUM(air_time_fl),2) AS Tempo_Acumulado_Voo
FROM db_final
GROUP BY region_air_dest
ORDER BY 1, 2
""").show()

+--------------+-------------------+
|Regiao_Destino|Tempo_Acumulado_Voo|
+--------------+-------------------+
|        ALASKA|             120162|
| MAINLAND-EAST|             507240|
| MAINLAND-WEST|             785905|
|      OFFSHORE|             109921|
+--------------+-------------------+



In [75]:
(df_final.groupBy(F.col("region_air_dest").alias("Regiao_Destino"))
         .agg(F.round(F.sum(F.col("air_time_fl")),2).alias("Tempo_Acumulado_Voo"))
         .show())

+--------------+-------------------+
|Regiao_Destino|Tempo_Acumulado_Voo|
+--------------+-------------------+
|        ALASKA|             120162|
|      OFFSHORE|             109921|
| MAINLAND-EAST|             507240|
| MAINLAND-WEST|             785905|
+--------------+-------------------+



## Pergunta 13

In [72]:
spark.getOrCreate().sql(f"""
SELECT  Round(AVG(distance_fl),2) AS Distancia_Media_Voo
FROM db_final
ORDER BY 1
""").show()

+-------------------+
|Distancia_Media_Voo|
+-------------------+
|            1208.15|
+-------------------+



In [76]:
(df_final.agg(F.round(F.avg(F.col("distance_fl")),2).alias("Distancia_Media_Voo"))
         .show())

+-------------------+
|Distancia_Media_Voo|
+-------------------+
|            1208.15|
+-------------------+



## Pergunta 14

In [73]:
spark.getOrCreate().sql(f"""
SELECT  region_air_dest AS Regiao_Destino,
        Round(AVG(distance_fl),2) AS Distancia_Media_Voo
FROM db_final
GROUP BY region_air_dest
ORDER BY 1, 2
""").show()

+--------------+-------------------+
|Regiao_Destino|Distancia_Media_Voo|
+--------------+-------------------+
|        ALASKA|            1317.35|
| MAINLAND-EAST|             2042.4|
| MAINLAND-WEST|             867.92|
|      OFFSHORE|            2646.75|
+--------------+-------------------+



In [77]:
(df_final.groupBy(F.col("region_air_dest").alias("Regiao_Destino"))
         .agg(F.round(F.avg(F.col("distance_fl")),2).alias("Distancia_Media_Voo"))
         .show())

+--------------+-------------------+
|Regiao_Destino|Distancia_Media_Voo|
+--------------+-------------------+
|        ALASKA|            1317.35|
|      OFFSHORE|            2646.75|
| MAINLAND-EAST|             2042.4|
| MAINLAND-WEST|             867.92|
+--------------+-------------------+



## Pergunta 15

In [77]:
spark.getOrCreate().sql(f"""
SELECT  origin_fl AS Origem,
        dest_fl AS Destino,
        Round(AVG(distance_fl),2) AS Distancia_Media_Voo
FROM db_final
GROUP BY origin_fl,
         dest_fl
ORDER BY 1, 2
""").show(150)

+------+-------+-------------------+
|Origem|Destino|Distancia_Media_Voo|
+------+-------+-------------------+
|   PDX|    ABQ|             1111.0|
|   PDX|    ANC|             1542.0|
|   PDX|    ATL|             2172.0|
|   PDX|    AUS|             1715.0|
|   PDX|    BOI|              344.0|
|   PDX|    BOS|             2537.0|
|   PDX|    BUR|              817.0|
|   PDX|    BWI|             2358.0|
|   PDX|    CLT|             2282.0|
|   PDX|    DCA|             2350.0|
|   PDX|    DEN|              991.0|
|   PDX|    DFW|             1616.0|
|   PDX|    DTW|             1953.0|
|   PDX|    EUG|              106.0|
|   PDX|    EWR|             2434.0|
|   PDX|    FAI|             1640.0|
|   PDX|    HNL|             2603.0|
|   PDX|    HOU|             1843.0|
|   PDX|    IAD|             2327.0|
|   PDX|    IAH|             1825.0|
|   PDX|    JFK|             2454.0|
|   PDX|    KOA|             2607.0|
|   PDX|    LAS|              763.0|
|   PDX|    LAX|              834.0|
|

In [78]:
(df_final.groupBy(F.col("origin_fl").alias("Origem"), F.col("dest_fl").alias("Destino"))
         .agg(F.round(F.avg(F.col("distance_fl")),2).alias("Distancia_Media_Voo"))
         .orderBy("Origem", "Destino")
         .show(150))

+------+-------+-------------------+
|Origem|Destino|Distancia_Media_Voo|
+------+-------+-------------------+
|   PDX|    ABQ|             1111.0|
|   PDX|    ANC|             1542.0|
|   PDX|    ATL|             2172.0|
|   PDX|    AUS|             1715.0|
|   PDX|    BOI|              344.0|
|   PDX|    BOS|             2537.0|
|   PDX|    BUR|              817.0|
|   PDX|    BWI|             2358.0|
|   PDX|    CLT|             2282.0|
|   PDX|    DCA|             2350.0|
|   PDX|    DEN|              991.0|
|   PDX|    DFW|             1616.0|
|   PDX|    DTW|             1953.0|
|   PDX|    EUG|              106.0|
|   PDX|    EWR|             2434.0|
|   PDX|    FAI|             1640.0|
|   PDX|    HNL|             2603.0|
|   PDX|    HOU|             1843.0|
|   PDX|    IAD|             2327.0|
|   PDX|    IAH|             1825.0|
|   PDX|    JFK|             2454.0|
|   PDX|    KOA|             2607.0|
|   PDX|    LAS|              763.0|
|   PDX|    LAX|              834.0|
|

## Pergunta 16

In [81]:
spark.getOrCreate().sql(f"""
SELECT  YEAR(dep_datetime_fl) AS Year,
        Round(SUM(distance_fl),2) AS Distancia_Acumulada_Voo
FROM db_final
GROUP BY YEAR(dep_datetime_fl)
ORDER BY 1, 2
""").show()

+----+-----------------------+
|Year|Distancia_Acumulada_Voo|
+----+-----------------------+
|2014|               12081516|
+----+-----------------------+



In [79]:
(df_final.groupBy(F.year("dep_datetime_fl").alias("Year"))
         .agg(F.round(F.sum(F.col("distance_fl")),2).alias("Distancia_Acumulada_Voo"))
         .show())

+----+-----------------------+
|Year|Distancia_Acumulada_Voo|
+----+-----------------------+
|2014|               12081516|
+----+-----------------------+



## Pergunta 17

In [82]:
spark.getOrCreate().sql(f"""
SELECT  region_air_dest AS Regiao_Destino,
        Round(SUM(distance_fl),2) AS Distancia_Acumulada_Voo
FROM db_final
GROUP BY region_air_dest
ORDER BY 1, 2
""").show(150)

+--------------+-----------------------+
|Regiao_Destino|Distancia_Acumulada_Voo|
+--------------+-----------------------+
|        ALASKA|                 907653|
| MAINLAND-EAST|                4378902|
| MAINLAND-WEST|                5940061|
|      OFFSHORE|                 854900|
+--------------+-----------------------+



In [80]:
(df_final.groupBy(F.col("region_air_dest").alias("Regiao_Destino"))
         .agg(F.round(F.sum(F.col("distance_fl")),2).alias("Distancia_Acumulada_Voo"))
         .show())

+--------------+-----------------------+
|Regiao_Destino|Distancia_Acumulada_Voo|
+--------------+-----------------------+
|        ALASKA|                 907653|
|      OFFSHORE|                 854900|
| MAINLAND-EAST|                4378902|
| MAINLAND-WEST|                5940061|
+--------------+-----------------------+



## Pergunta 18
(Pergunta sem resposta exata)

In [84]:
spark.getOrCreate().sql(f"""
SELECT  origin_fl AS Origem,
        dest_fl AS Destino,
        Round(AVG(seats_pl),2) AS Num_Passageiros_Medio_Voo
FROM db_final
GROUP BY origin_fl,
         dest_fl
ORDER BY 1, 2
""").show(150)

+------+-------+-------------------------+
|Origem|Destino|Num_Passageiros_Medio_Voo|
+------+-------+-------------------------+
|   PDX|    ABQ|                   144.85|
|   PDX|    ANC|                   165.43|
|   PDX|    ATL|                   219.03|
|   PDX|    AUS|                    140.0|
|   PDX|    BOI|                     80.0|
|   PDX|    BOS|                   179.27|
|   PDX|    BUR|                    79.65|
|   PDX|    BWI|                    141.8|
|   PDX|    CLT|                   208.14|
|   PDX|    DCA|                    149.0|
|   PDX|    DEN|                   149.14|
|   PDX|    DFW|                   159.63|
|   PDX|    DTW|                   202.64|
|   PDX|    EUG|                     32.0|
|   PDX|    EWR|                    189.1|
|   PDX|    FAI|                   168.25|
|   PDX|    HNL|                   300.61|
|   PDX|    HOU|                    141.5|
|   PDX|    IAD|                   187.39|
|   PDX|    IAH|                   182.64|
|   PDX|   

In [81]:
(df_final.groupBy(F.col("origin_fl").alias("Origem"), F.col("dest_fl").alias("Destino"))
         .agg(F.round(F.avg(F.col("seats_pl")),2).alias("Num_Passageiros_Medio_Voo"))
         .orderBy("Origem", "Destino")
         .show(150))

+------+-------+-------------------------+
|Origem|Destino|Num_Passageiros_Medio_Voo|
+------+-------+-------------------------+
|   PDX|    ABQ|                   144.85|
|   PDX|    ANC|                   165.43|
|   PDX|    ATL|                   219.03|
|   PDX|    AUS|                    140.0|
|   PDX|    BOI|                     80.0|
|   PDX|    BOS|                   179.27|
|   PDX|    BUR|                    79.65|
|   PDX|    BWI|                    141.8|
|   PDX|    CLT|                   208.14|
|   PDX|    DCA|                    149.0|
|   PDX|    DEN|                   149.14|
|   PDX|    DFW|                   159.63|
|   PDX|    DTW|                   202.64|
|   PDX|    EUG|                     32.0|
|   PDX|    EWR|                    189.1|
|   PDX|    FAI|                   168.25|
|   PDX|    HNL|                   300.61|
|   PDX|    HOU|                    141.5|
|   PDX|    IAD|                   187.39|
|   PDX|    IAH|                   182.64|
|   PDX|   

## Pergunta 19
(Pergunta sem resposta exata)

In [86]:
spark.getOrCreate().sql(f"""
SELECT  YEAR(dep_datetime_fl) AS Year,
        Round(SUM(seats_pl),2) AS Num_Passageiros_Acumulado_Voo
FROM db_final
GROUP BY YEAR(dep_datetime_fl)
ORDER BY 1, 2
""").show()

+----+-----------------------------+
|Year|Num_Passageiros_Acumulado_Voo|
+----+-----------------------------+
|2014|                      1509544|
+----+-----------------------------+



In [82]:
(df_final.groupBy(F.year("dep_datetime_fl").alias("Year"))
         .agg(F.round(F.sum(F.col("seats_pl")),2).alias("Num_Passageiros_Acumulado_Voo"))
         .show())

+----+-----------------------------+
|Year|Num_Passageiros_Acumulado_Voo|
+----+-----------------------------+
|2014|                      1509544|
+----+-----------------------------+



## Pergunta 20
(Pode ter mais de uma resposta)

In [90]:
spark.getOrCreate().sql(f"""

WITH MAX_VIAGENS
AS
(SELECT  dest_fl AS Destino,
        region_air_dest AS Regiao_Destino,
        Count(dest_fl) as Qtd_Viagens
FROM db_final
GROUP BY dest_fl
         ,region_air_dest)


SELECT  dest_fl AS Destino,
        region_air_dest AS Regiao_Destino,
        Count(tailnum_fl) as Qtd_Viagens
FROM db_final
GROUP BY dest_fl
         ,region_air_dest
HAVING Qtd_Viagens = (SELECT MAX(Qtd_Viagens) FROM MAX_VIAGENS)
ORDER BY 3 DESC
""").show()

+-------+--------------+-----------+
|Destino|Regiao_Destino|Qtd_Viagens|
+-------+--------------+-----------+
|    SFO| MAINLAND-WEST|        787|
+-------+--------------+-----------+



In [95]:
(df_final.groupBy('dest_fl', 'region_air_dest')
.agg(F.count('tailnum_fl').alias('Max_seats'))
.agg(F.max(F.struct(F.col('Max_seats'),
     F.col('dest_fl').alias("Destino"),
     F.col('region_air_dest').alias("Regiao_Destino"))).alias('Max_Dest'))
.select("Max_Dest.Destino", "Max_Dest.Regiao_Destino", "Max_Dest.Max_seats")
.show())

+-------+--------------+---------+
|Destino|Regiao_Destino|Max_seats|
+-------+--------------+---------+
|    SFO| MAINLAND-WEST|      787|
+-------+--------------+---------+



## Pergunta 21

In [97]:
spark.getOrCreate().sql(f"""

WITH MAX_PASSAGEIROS
AS
(SELECT  dest_fl AS Destino,
        region_air_dest AS Regiao_Destino,
        SUM(seats_pl) as Qtd_Passageiros
FROM db_final
GROUP BY dest_fl
         ,region_air_dest)


SELECT  dest_fl AS Destino,
        region_air_dest AS Regiao_Destino,
        SUM(seats_pl) as Qtd_Passageiros
FROM db_final
GROUP BY dest_fl
         ,region_air_dest
HAVING Qtd_Passageiros = (SELECT MAX(Qtd_Passageiros) FROM MAX_PASSAGEIROS)
ORDER BY 3 DESC
""").show()

+-------+--------------+---------------+
|Destino|Regiao_Destino|Qtd_Passageiros|
+-------+--------------+---------------+
|    SFO| MAINLAND-WEST|         119635|
+-------+--------------+---------------+



In [98]:
(df_final.groupBy('dest_fl', 'region_air_dest')
.agg(F.sum('seats_pl').alias('Max_seats'))
.agg(F.max(F.struct(F.col('Max_seats'),
     F.col('dest_fl').alias("Destino"),
     F.col('region_air_dest').alias("Regiao_Destino"))).alias('Max_Dest'))
.select("Max_Dest.Destino", "Max_Dest.Regiao_Destino", "Max_Dest.Max_seats")
.show())

+-------+--------------+---------+
|Destino|Regiao_Destino|Max_seats|
+-------+--------------+---------+
|    SFO| MAINLAND-WEST|   119635|
+-------+--------------+---------+



## Pergunta 22

In [108]:
spark.getOrCreate().sql(f"""

SELECT  Distinct dest_fl AS Destino,
        region_air_dest AS Regiao_Destino,
        distance_fl
FROM db_final
WHERE origin_fl = 'PDX'
AND distance_fl = (SELECT MAX(distance_fl) from FROM db_final WHERE origin_fl = 'PDX')
""").show()

+-------+--------------+-----------+
|Destino|Regiao_Destino|distance_fl|
+-------+--------------+-----------+
|    LIH|      OFFSHORE|       2631|
+-------+--------------+-----------+



In [100]:
(df_final.filter(F.col("origin_fl") == "PDX")
         .groupBy('dest_fl', 'region_air_dest')
         .agg(F.max('distance_fl').alias('Max_distance'))
         .agg(F.max(F.struct(F.col('Max_distance'),
              F.col('dest_fl').alias("Destino"),
              F.col('region_air_dest').alias("Regiao_Destino"))).alias('Max_Dest'))
         .select("Max_Dest.Destino", "Max_Dest.Regiao_Destino", "Max_Dest.Max_distance")
         .show())

+-------+--------------+------------+
|Destino|Regiao_Destino|Max_distance|
+-------+--------------+------------+
|    LIH|      OFFSHORE|        2631|
+-------+--------------+------------+



## Pergunta 23
(Pode haver mais de uma resposta)

In [119]:
spark.getOrCreate().sql(f"""
WITH MAX_QTD_VOOS
AS
(SELECT  YEAR(dep_datetime_fl) AS Year,
        MONTH(dep_datetime_fl) AS Month,
        dest_fl AS Destino,
        region_air_dest AS Regiao_Destino,
        Count(*) AS Qtd_Voos
FROM db_final
GROUP BY YEAR(dep_datetime_fl),
         MONTH(dep_datetime_fl),
         dest_fl,
         region_air_dest)

SELECT  YEAR(dep_datetime_fl) AS Year,
        MONTH(dep_datetime_fl) AS Month,
        dest_fl AS Destino,
        region_air_dest AS Regiao_Destino,
        Count(*) AS Qtd_Voos
FROM db_final
GROUP BY YEAR(dep_datetime_fl),
         MONTH(dep_datetime_fl),
         dest_fl,
         region_air_dest
HAVING Qtd_Voos = (SELECT MAX(Qtd_Voos) FROM MAX_QTD_VOOS)
ORDER BY 3, 4, 2, 1 DESC

""").show()

+----+-----+-------+--------------+--------+
|Year|Month|Destino|Regiao_Destino|Qtd_Voos|
+----+-----+-------+--------------+--------+
|2014|    5|    LAX| MAINLAND-WEST|      77|
+----+-----+-------+--------------+--------+



In [102]:
(df_final.groupBy(F.year(F.col("dep_datetime_fl")).alias("Year"), F.month(F.col("dep_datetime_fl")).alias("Month"),'dest_fl', 'region_air_dest')
         .agg(F.count('tailnum_fl').alias('Qtd_Voos'))
         .agg(F.max(F.struct(F.col('Qtd_Voos'),
              F.col('dest_fl').alias("Destino"),
              F.col('region_air_dest').alias("Regiao_Destino"),
              F.col('Year'),
              F.col('Month'))).alias('Max_Voos'))
         .select("Max_Voos.Year", "Max_Voos.Month", "Max_Voos.Destino", "Max_Voos.Regiao_Destino", "Max_Voos.Qtd_Voos")
         .show())

+----+-----+-------+--------------+--------+
|Year|Month|Destino|Regiao_Destino|Qtd_Voos|
+----+-----+-------+--------------+--------+
|2014|    5|    LAX| MAINLAND-WEST|      77|
+----+-----+-------+--------------+--------+



## Pergunta 24

In [126]:
spark.getOrCreate().sql(f"""
WITH MAX_QTD_MODELOS
AS
(SELECT manufacturer_pl AS Fabricante,
        model_pl AS Modelo_Aviao,
        Count(*) AS Qtd_Voos
FROM db_final
WHERE model_pl IS NOT NULL
GROUP BY manufacturer_pl,
         model_pl)

SELECT  manufacturer_pl AS Fabricante,
        model_pl AS Modelo_Aviao,
        Count(*) AS Qtd_Voos
FROM db_final
WHERE model_pl IS NOT NULL
GROUP BY manufacturer_pl,
         model_pl
HAVING Qtd_Voos = (SELECT MAX(Qtd_Voos) FROM MAX_QTD_MODELOS)
ORDER BY 3 DESC

""").show()

+----------+------------+--------+
|Fabricante|Modelo_Aviao|Qtd_Voos|
+----------+------------+--------+
|    BOEING|     737-890|    1463|
+----------+------------+--------+



In [104]:
(df_final.groupBy('manufacturer_pl', 'model_pl')
         .agg(F.count('tailnum_fl').alias('Qtd_Voos'))
         .agg(F.max(F.struct(F.col('Qtd_Voos'),
              F.col('manufacturer_pl').alias("Fabricante"),
              F.col('model_pl').alias("Modelo"),
                            )).alias('Max_Voos'))
         .select("Max_Voos.Fabricante", "Max_Voos.Modelo", "Max_Voos.Qtd_Voos")
         .show())

+----------+-------+--------+
|Fabricante| Modelo|Qtd_Voos|
+----------+-------+--------+
|    BOEING|737-890|    1463|
+----------+-------+--------+



## Pergunta 25

In [133]:
spark.getOrCreate().sql(f"""
WITH MAX_QTD_MODELOS
AS
(SELECT dest_fl AS Destino,
        region_air_dest AS Regiao_Destino,
        manufacturer_pl AS Fabricante,
        model_pl AS Modelo_Aviao,
        Count(*) AS Qtd_Voos
FROM db_final
WHERE model_pl IS NOT NULL
GROUP BY dest_fl,
         region_air_dest, 
         manufacturer_pl,
         model_pl)

SELECT  dest_fl AS Destino,
        region_air_dest AS Regiao_Destino,
        manufacturer_pl AS Fabricante,
        model_pl AS Modelo_Aviao,
        Count(*) AS Qtd_Voos
FROM db_final a
WHERE model_pl IS NOT NULL
GROUP BY dest_fl,
         region_air_dest,
         manufacturer_pl,
         model_pl
HAVING Qtd_Voos = (SELECT MAX(Qtd_Voos) FROM MAX_QTD_MODELOS b WHERE a.dest_fl = b.Destino)
ORDER BY 1, 2, 5  DESC

""").show(150)

+-------+--------------+----------+------------+--------+
|Destino|Regiao_Destino|Fabricante|Modelo_Aviao|Qtd_Voos|
+-------+--------------+----------+------------+--------+
|    ABQ| MAINLAND-WEST|    BOEING|     737-7H4|      26|
|    ANC|        ALASKA|    BOEING|     737-890|     138|
|    ATL| MAINLAND-EAST|    BOEING|   737-932ER|      80|
|    AUS| MAINLAND-WEST|    BOEING|   737-990ER|      10|
|    BLI| MAINLAND-WEST|    BOEING|     737-890|       3|
|    BNA| MAINLAND-EAST|    BOEING|     737-8H4|       6|
|    BOI| MAINLAND-WEST|BOMBARDIER| CL-600-2C10|       7|
|    BOS| MAINLAND-EAST|    AIRBUS|    A320-232|      55|
|    BUR| MAINLAND-WEST|BOMBARDIER| CL-600-2C10|      70|
|    BWI| MAINLAND-EAST|    BOEING|     737-7H4|      16|
|    CLE| MAINLAND-EAST|    AIRBUS|    A320-214|       2|
|    CLT| MAINLAND-EAST|    AIRBUS|    A321-231|      33|
|    COS| MAINLAND-WEST|BOMBARDIER| CL-600-2C10|      26|
|    CVG| MAINLAND-EAST|    BOEING|     757-232|       2|
|    CVG| MAIN

In [128]:
(df_final.groupBy(F.col('dest_fl').alias("Destino")
                  ,F.col('region_air_dest').alias("Regiao_Destino")
                  ,F.col('manufacturer_pl').alias("Fabricante")
                  ,F.col('model_pl').alias("Modelo"))
         .agg(F.count('tailnum_fl').alias('Qtd_Voos'))
         .agg(F.max(F.struct(F.col('Qtd_Voos'),
                    F.col("Destino"),
                    F.col("Regiao_Destino"),              
                    F.col("Fabricante"),
                    F.col("Modelo"))).alias('Max_Voos'))
         .select("Max_Voos.Destino", "Max_Voos.Regiao_Destino", "Max_Voos.Fabricante", "Max_Voos.Modelo", "Max_Voos.Qtd_Voos")
         .orderBy("Max_Voos.Destino", "Max_Voos.Regiao_Destino", "Max_Voos.Qtd_Voos")
         .show())

+-------+--------------+----------+------+--------+
|Destino|Regiao_Destino|Fabricante|Modelo|Qtd_Voos|
+-------+--------------+----------+------+--------+
|    DFW| MAINLAND-WEST|      null|  null|     248|
+-------+--------------+----------+------+--------+



## Pergunta 26

In [137]:
spark.getOrCreate().sql(f"""
SELECT  haul_duration_fl AS Haul_Duration,
        ROUND(AVG(engines_pl),2) AS Qtd_Media_Motores
FROM db_final a
GROUP BY haul_duration_fl
ORDER BY 1, 2 DESC

""").show()

+-------------+-----------------+
|Haul_Duration|Qtd_Media_Motores|
+-------------+-----------------+
|  MEDIUM-HAUL|              2.0|
|   SHORT-HAUL|              2.0|
+-------------+-----------------+



## Pergunta 27

In [141]:
spark.getOrCreate().sql(f"""
WITH MAX_QTD_VOOS_EST
AS
(SELECT  dep_season_fl AS Estacao_Ano,
        COUNT(*) AS Qtd_Voos
FROM db_final
GROUP BY dep_season_fl)
         
SELECT  dep_season_fl AS Estacao_Ano,
        COUNT(*) AS Qtd_Voos
FROM db_final
GROUP BY dep_season_fl
HAVING Qtd_Voos = (SELECT MAX(Qtd_Voos) FROM MAX_QTD_VOOS_EST)
ORDER BY 1, 2 DESC

""").show()

+-----------+--------+
|Estacao_Ano|Qtd_Voos|
+-----------+--------+
|     SUMMER|    2918|
+-----------+--------+



## Pergunta 28

In [144]:
spark.getOrCreate().sql(f"""
WITH MAX_QTD_VOOS_EST
AS
(SELECT dest_fl AS Destino,
        region_air_dest AS Regiao_Destino,
        dep_season_fl AS Estacao_Ano,
        Count(*) AS Qtd_Voos
FROM db_final
GROUP BY dest_fl,
         region_air_dest, 
         dep_season_fl)

SELECT  dest_fl AS Destino,
        region_air_dest AS Regiao_Destino,
        dep_season_fl AS Estacao_Ano,
        Count(*) AS Qtd_Voos
FROM db_final a
GROUP BY dest_fl,
         region_air_dest,
         dep_season_fl
HAVING Qtd_Voos = (SELECT MAX(Qtd_Voos) FROM MAX_QTD_VOOS_EST b WHERE a.dest_fl = b.Destino)
ORDER BY 1, 2, 4  DESC

""").show(150)

+-------+--------------+-----------+--------+
|Destino|Regiao_Destino|Estacao_Ano|Qtd_Voos|
+-------+--------------+-----------+--------+
|    ABQ| MAINLAND-WEST|       FALL|      19|
|    ANC|        ALASKA|     SUMMER|     145|
|    ATL| MAINLAND-EAST|     SUMMER|      86|
|    AUS| MAINLAND-WEST|     SPRING|       9|
|    BLI| MAINLAND-WEST|     SUMMER|       4|
|    BNA| MAINLAND-EAST|     SUMMER|       5|
|    BOI| MAINLAND-WEST|     SPRING|       7|
|    BOS| MAINLAND-EAST|     SUMMER|      50|
|    BUR| MAINLAND-WEST|     WINTER|      44|
|    BWI| MAINLAND-EAST|     SUMMER|      14|
|    CLE| MAINLAND-EAST|     SUMMER|       2|
|    CLT| MAINLAND-EAST|     SPRING|      28|
|    COS| MAINLAND-WEST|       FALL|       8|
|    CVG| MAINLAND-EAST|     SPRING|       3|
|    DCA| MAINLAND-EAST|     SUMMER|      20|
|    DEN| MAINLAND-WEST|     SUMMER|     172|
|    DFW| MAINLAND-WEST|     SUMMER|     122|
|    DTW| MAINLAND-EAST|     SUMMER|      43|
|    EUG| MAINLAND-WEST|     SPRIN

## Pergunta 29

In [147]:
spark.getOrCreate().sql(f"""
WITH MAX_QTD_VOOS_DELAY
AS
(SELECT  dep_delay_category_fl AS Categoria_Atraso,
        COUNT(*) AS Qtd_Voos
FROM db_final
GROUP BY dep_delay_category_fl)
         
SELECT  dep_delay_category_fl AS Categoria_Atraso,
        COUNT(*) AS Qtd_Voos
FROM db_final
GROUP BY dep_delay_category_fl
HAVING Qtd_Voos = (SELECT MAX(Qtd_Voos) FROM MAX_QTD_VOOS_DELAY)
ORDER BY 1, 2 DESC

""").show()

+----------------+--------+
|Categoria_Atraso|Qtd_Voos|
+----------------+--------+
|     ANTECIPATED|    5894|
+----------------+--------+



## Pergunta 30

In [150]:
spark.getOrCreate().sql(f"""
WITH MAX_QTD_VOOS_DELAY
AS
(SELECT origin_fl AS Origem,
        dest_fl AS Destino,
        dep_delay_category_fl AS Categoria_Atraso,
        COUNT(*) AS Qtd_Voos
FROM db_final
GROUP BY origin_fl,
         dest_fl,
         dep_delay_category_fl)
         
SELECT  origin_fl AS Origem,
        dest_fl AS Destino,
        dep_delay_category_fl AS Categoria_Atraso,
        COUNT(*) AS Qtd_Voos
FROM db_final a
GROUP BY origin_fl,
         dest_fl,
         dep_delay_category_fl
HAVING Qtd_Voos = (SELECT MAX(Qtd_Voos) FROM MAX_QTD_VOOS_DELAY b WHERE a.origin_fl = b.Origem AND a.dest_fl = b.Destino)
ORDER BY 1, 2

""").show(150)

+------+-------+----------------+--------+
|Origem|Destino|Categoria_Atraso|Qtd_Voos|
+------+-------+----------------+--------+
|   PDX|    ABQ|           MINOR|       7|
|   PDX|    ABQ|     ANTECIPATED|       7|
|   PDX|    ANC|     ANTECIPATED|      46|
|   PDX|    ATL|     ANTECIPATED|      75|
|   PDX|    AUS|     ANTECIPATED|       2|
|   PDX|    BOI|     ANTECIPATED|       6|
|   PDX|    BOS|     ANTECIPATED|      21|
|   PDX|    BUR|     ANTECIPATED|      60|
|   PDX|    BWI|     ANTECIPATED|       2|
|   PDX|    CLT|     ANTECIPATED|      13|
|   PDX|    DCA|     ANTECIPATED|      16|
|   PDX|    DEN|     ANTECIPATED|     135|
|   PDX|    DFW|     ANTECIPATED|      68|
|   PDX|    DTW|     ANTECIPATED|      16|
|   PDX|    EUG|     ANTECIPATED|      29|
|   PDX|    EWR|     ANTECIPATED|      11|
|   PDX|    FAI|           MINOR|       4|
|   PDX|    FAI|     ANTECIPATED|       4|
|   PDX|    HNL|     ANTECIPATED|      48|
|   PDX|    HOU|          INTIME|       2|
|   PDX|   

In [20]:
df_final.groupBy('dest_fl', 'name_air_dest').count().distinct().orderBy('count',ascending=False).show()

+-------+--------------------+-----+
|dest_fl|       name_air_dest|count|
+-------+--------------------+-----+
|    SFO|  San Francisco Intl|  787|
|    LAX|    Los Angeles Intl|  615|
|    DEN|         Denver Intl|  586|
|    PHX|Phoenix Sky Harbo...|  530|
|    LAS|      Mc Carran Intl|  520|
|    ANC|Ted Stevens Ancho...|  449|
|    ORD|  Chicago Ohare Intl|  439|
|    SLC| Salt Lake City Intl|  396|
|    DFW|Dallas Fort Worth...|  371|
|    SJC|Norman Y Mineta S...|  369|
|    OAK|Metropolitan Oakl...|  334|
|    SMF|     Sacramento Intl|  283|
|    SAN|      San Diego Intl|  271|
|    ATL|Hartsfield Jackso...|  258|
|    MSP|Minneapolis St Pa...|  238|
|    IAH|George Bush Inter...|  226|
|    SNA|John Wayne Arpt O...|  198|
|    LGB|          Long Beach|  175|
|    JFK| John F Kennedy Intl|  173|
|    PDX|       Portland Intl|  157|
+-------+--------------------+-----+
only showing top 20 rows



In [26]:
(df_final.groupBy('dest_fl')
.agg(F.count('seats_pl').alias('most_popular_dest'))
.agg(F.max(F.struct(F.col('most_popular_dest'),
F.col('dest_fl'))).alias('most_popular'))
.select('most_popular.dest_fl', 'most_popular.most_popular_dest').show())

+-------+-----------------+
|dest_fl|most_popular_dest|
+-------+-----------------+
|    SFO|              772|
+-------+-----------------+

