In [1]:
# Installing required packages
!pip install pyspark
!pip install findspark

import findspark
findspark.init()



In [3]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, BooleanType

In [4]:
def CriaVw(df):
    """Register ``df`` as the temporary view named "Data".

    Returns whatever ``df.createOrReplaceTempView`` returns.
    """
    view_name = "Data"
    outcome = df.createOrReplaceTempView(view_name)
    return outcome

def Consolida_SQL(_col):
    """Show the number of rows per value of ``_col`` in the "Data" view.

    The session is obtained from the notebook-level ``spark`` builder, and the
    rows are ordered by the first selected column. Returns whatever ``show()``
    returns.
    """
    session = spark.getOrCreate()
    query = f"Select {_col}, count(*) from Data Group By {_col} order by 1"
    counts = session.sql(query)
    return counts.show()
    
def Consolida(_col, df):
    """Expose ``df`` as the "Data" view, then show the row count per ``_col`` value."""
    CriaVw(df)
    summary = Consolida_SQL(_col)
    return summary

In [5]:
# Create the Spark context, reusing the one already running so this cell can be
# re-executed without raising "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

# Session builder (not a session): the rest of the notebook obtains the actual
# session by calling `spark.getOrCreate()`.
# NOTE(review): a SparkContext already exists when the builder is first
# resolved, so the master/appName configured here may not be applied to it --
# confirm.
spark = (SparkSession.builder
                     .master("local[7]")
                     .appName("Semana 3 - Desafio Transformação"))

In [6]:
# Explicit schema for airports.csv: (column name, Spark type) pairs, all nullable.
schema_airports = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ("faa", StringType()),
        ("name", StringType()),
        ("lat", FloatType()),
        ("lon", FloatType()),
        ("alt", IntegerType()),
        ("tz", IntegerType()),
        ("dst", StringType()),
    )
])

# Explicit schema for planes.csv: (column name, Spark type) pairs, all nullable.
schema_planes = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ("tailnum", StringType()),
        ("year", IntegerType()),
        ("type", StringType()),
        ("manufacturer", StringType()),
        ("model", StringType()),
        ("engines", IntegerType()),
        ("seats", IntegerType()),
        ("speed", IntegerType()),
        ("engine", StringType()),
    )
])

# Explicit schema for flights.csv: (column name, Spark type) pairs, all nullable.
schema_flights = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ("year", IntegerType()),
        ("month", IntegerType()),
        ("day", IntegerType()),
        ("dep_time", StringType()),
        ("dep_delay", IntegerType()),
        ("arr_time", StringType()),
        ("arr_delay", IntegerType()),
        ("carrier", StringType()),
        ("tailnum", StringType()),
        ("flight", StringType()),
        ("origin", StringType()),
        ("dest", StringType()),
        ("air_time", IntegerType()),
        ("distance", IntegerType()),
        ("hour", IntegerType()),
        ("minute", IntegerType()),
    )
])

In [7]:
# Directory holding the raw CSV inputs; change it here to point at another copy
# of the data instead of editing every reader call.
DATA_DIR = "C:/Users/amarti40/OneDrive - Capgemini/Desktop/ACELERAÇÃO PYSPARK/Data"


def _read_csv(file_name, schema):
    """Read ``DATA_DIR/file_name`` as a CSV with a header row, applying ``schema``."""
    return (spark.getOrCreate().read
                 .format("csv")
                 .option("header", "true")
                 .schema(schema)
                 .load(f"{DATA_DIR}/{file_name}"))


df_airports = _read_csv("airports.csv", schema_airports)
df_planes = _read_csv("planes.csv", schema_planes)
df_flights = _read_csv("flights.csv", schema_flights)

# Preview the first rows of each dataset.
df_airports.show(5)
df_planes.show(5)
df_flights.show(5)

+---+--------------------+---------+---------+----+---+---+
|faa|                name|      lat|      lon| alt| tz|dst|
+---+--------------------+---------+---------+----+---+---+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|
|06N|     Randall Airport| 41.43191|-74.39156| 523| -5|  A|
|09J|Jekyll Island Air...|31.074472|-81.42778|  11| -4|  A|
+---+--------------------+---------+---------+----+---+---+
only showing top 5 rows

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      

In [8]:
# Register one temporary view per dataset so each can be queried with Spark SQL.
for _view_name, _frame in (("airports", df_airports),
                           ("planes", df_planes),
                           ("flights", df_flights)):
    _frame.createOrReplaceTempView(_view_name)

# Airport


## Pergunta 1

In [9]:
# Clamp negative altitudes to 0; every other value (including null) is kept as is.
df_airports_Final = df_airports.withColumn(
    'alt',
    F.when(F.col('alt') < 0, 0).otherwise(F.col('alt')),
)

# Row count per altitude before and after the fix. groupBy().count() already
# yields exactly one row per altitude, so no extra distinct() is needed.
df_airports.groupBy("alt").count().orderBy("alt").show()
df_airports_Final.groupBy("alt").count().orderBy("alt").show()

+---+-----+
|alt|count|
+---+-----+
|-54|    1|
|-42|    1|
|  0|   49|
|  1|    2|
|  2|    1|
|  3|    2|
|  4|    3|
|  5|    3|
|  6|    3|
|  7|    7|
|  8|    8|
|  9|    7|
| 10|   11|
| 11|    6|
| 12|    8|
| 13|   12|
| 14|   11|
| 15|   11|
| 16|    3|
| 17|    7|
+---+-----+
only showing top 20 rows

+---+-----+
|alt|count|
+---+-----+
|  0|   51|
|  1|    2|
|  2|    1|
|  3|    2|
|  4|    3|
|  5|    3|
|  6|    3|
|  7|    7|
|  8|    8|
|  9|    7|
| 10|   11|
| 11|    6|
| 12|    8|
| 13|   12|
| 14|   11|
| 15|   11|
| 16|    3|
| 17|    7|
| 18|   10|
| 19|    6|
+---+-----+
only showing top 20 rows



## Pergunta 2

In [10]:
# Force dst to 'A' for every airport whose tz is between -7 and -5 (inclusive);
# all other rows keep their current dst value.
df_airports_Final = df_airports_Final.withColumn(
    'dst',
    F.when(F.col('tz').between(-7, -5), 'A').otherwise(F.col('dst')),
)

# Counts per (tz, dst) before and after, then the resulting dst totals.
# groupBy().count() already returns one row per key, so distinct() is not needed.
df_airports.groupBy("tz", "dst").count().orderBy("tz", "dst").show()
df_airports_Final.groupBy("tz", "dst").count().orderBy("tz", "dst").show()
df_airports_Final.groupBy("dst").count().orderBy("dst").show()

+---+---+-----+
| tz|dst|count|
+---+---+-----+
|-11|  A|    2|
|-10|  A|   16|
|-10|  N|    9|
|-10|  U|    1|
| -9|  A|  221|
| -9|  U|    1|
| -8|  A|  147|
| -8|  U|    5|
| -7|  A|  142|
| -7|  N|   13|
| -7|  U|    6|
| -6|  A|  299|
| -6|  U|   13|
| -5|  A|  431|
| -5|  N|    1|
| -5|  U|   18|
| -4|  A|   65|
| -4|  U|    1|
|  5|  A|    2|
|  6|  A|    1|
+---+---+-----+
only showing top 20 rows

+---+---+-----+
| tz|dst|count|
+---+---+-----+
|-11|  A|    2|
|-10|  A|   16|
|-10|  N|    9|
|-10|  U|    1|
| -9|  A|  221|
| -9|  U|    1|
| -8|  A|  147|
| -8|  U|    5|
| -7|  A|  161|
| -6|  A|  312|
| -5|  A|  450|
| -4|  A|   65|
| -4|  U|    1|
|  5|  A|    2|
|  6|  A|    1|
|  7|  A|    1|
|  8|  A|    2|
+---+---+-----+

+---+-----+
|dst|count|
+---+-----+
|  A| 1380|
|  N|    9|
|  U|    8|
+---+-----+



## Pergunta 3

In [11]:
# Replace the remaining 'U' dst values with 'A'; other values are left untouched.
df_airports_Final = df_airports_Final.withColumn(
    'dst',
    F.when(F.col('dst') == 'U', 'A').otherwise(F.col('dst')),
)

# dst distribution in the raw data versus the transformed data.
# groupBy().count() already returns one row per key, so distinct() is not needed.
df_airports.groupBy("dst").count().orderBy("dst").show()
df_airports_Final.groupBy("dst").count().orderBy("dst").show()

+---+-----+
|dst|count|
+---+-----+
|  A| 1329|
|  N|   23|
|  U|   45|
+---+-----+

+---+-----+
|dst|count|
+---+-----+
|  A| 1388|
|  N|    9|
+---+-----+



## Pergunta 4

In [12]:
# [-124, -50] USA (presumably the mainland longitude range -- confirm).
_lon = F.col('lon')
_lat = F.col('lat')

# Branches are evaluated in order, so the OFFSHORE test takes precedence over
# the longitude bands below it.
_region = (
    F.when((_lon > -50) | (_lat < 24), 'OFFSHORE')
     .when(_lon.between(-124, -95), 'MAINLAND-WEST')
     .when((_lon > -95) & (_lon <= -50), 'MAINLAND-EAST')
     .when(_lon < -124, 'ALASKA')
     .otherwise('NaN')
)

df_airports_Final = df_airports_Final.withColumn('region', _region)

df_airports_Final.createOrReplaceTempView('airports_transformada')

# Coordinate extremes and row count for the whole dataset ('Total') followed by
# the same figures for each region.
_region_summary_sql = """
SELECT  'Total' as region,
        max(a.lon) as Max_lon,
        min(a.lon) as Min_lon,
        max(a.lat) as Max_lat,
        min(a.lat) as Min_lat,
        Count(*) as Quantidade
FROM airports_transformada a
UNION ALL
SELECT  a.region,
        max(a.lon) as Max_lon,
        min(a.lon) as Min_lon,
        max(a.lat) as Max_lat,
        min(a.lat) as Min_lat,
        Count(*) as Quantidade
FROM airports_transformada a
Group By a.region
"""

spark.getOrCreate().sql(_region_summary_sql).show()



+-------------+-----------+-----------+---------+---------+----------+
|       region|    Max_lon|    Min_lon|  Max_lat|  Min_lat|Quantidade|
+-------------+-----------+-----------+---------+---------+----------+
|        Total|  174.11362|   -176.646|72.270836|19.721375|      1397|
|       ALASKA|-124.057915|   -176.646|71.285446| 40.97811|       243|
|     OFFSHORE|  174.11362|   -159.785|72.270836|19.721375|        22|
|MAINLAND-EAST| -67.012695|  -94.93472|48.728443| 24.55611|       696|
|MAINLAND-WEST| -95.109406|-123.936554| 48.97972|25.906834|       436|
+-------------+-----------+-----------+---------+---------+----------+



## Pergunta 5

In [13]:
# Classify each airport by case-sensitive fragments found in its name. Branches
# are evaluated in order, so a name matching several groups gets the first one.
_name = F.col('name')

df_airports_Final = df_airports_Final.withColumn(
    'type',
    F.when(_name.contains(' Arpt')
           | _name.contains(' Airpor')      # also covers truncated "Airport" spellings
           | _name.contains(' Airport')
           | _name.contains(' Tradeport')
           | _name.contains(' Heliport'),
           'AP')
     .when(_name.contains(' Aerodrome'), 'AD')
     .when(_name.contains(' Airpark')
           | _name.contains(' Aero Park'),
           'AK')
     .when(_name.contains(' Air Station')
           | _name.contains(' Station'),
           'AS')
     .when(_name.contains(' Fld')
           | _name.contains(' Field'),
           'FL')
     .otherwise('NaN'),
)

# Show the number of airports per type (this also re-registers the "Data" view).
Consolida("type", df_airports_Final)

+----+--------+
|type|count(1)|
+----+--------+
|  AD|       1|
|  AK|      12|
|  AP|     624|
|  AS|      19|
|  FL|      78|
| NaN|     663|
+----+--------+



In [14]:
# Case-insensitive variant of the 'type' classification: the name is upper-cased
# once and matched against upper-case LIKE patterns without a leading space.
# NOTE(review): df_airports_Final2 is not referenced again in the visible part of
# the notebook, so the exported frame keeps the case-sensitive classification --
# confirm which one is intended.
_name_upper = F.upper(F.col('name'))

df_airports_Final2 = df_airports_Final.withColumn(
    'type',
    F.when(_name_upper.like("%AIRPORT%")
           | _name_upper.like("%TRADEPORT%")
           | _name_upper.like("%HELIPORT%")
           | _name_upper.like("%AIRPOR%")
           | _name_upper.like("%ARPT%"),
           'AP')
     .when(_name_upper.like("%AERODROME%"), 'AD')
     .when(_name_upper.like("%AIRPARK%")
           | _name_upper.like("%AERO PARK%"),
           'AK')
     .when(_name_upper.like("%STATION%")
           | _name_upper.like("%AIR STATION%"),
           'AS')
     .when(_name_upper.like("%FIELD%")
           | _name_upper.like("%FLD%"),
           'FL')
     .otherwise('NaN'),
)

# groupBy().count() already returns one row per type, so distinct() is not needed.
df_airports_Final2.groupBy("type").count().orderBy("type").show()

+----+-----+
|type|count|
+----+-----+
|  AD|    1|
|  AK|   12|
|  AP|  624|
|  AS|   19|
|  FL|   88|
| NaN|  653|
+----+-----+



## Pergunta 6

In [15]:
# Name fragments that mark an airport as military (matched case-sensitively).
# NOTE(review): short fragments such as " As", " Ns" or " Nas" also match any
# word that merely starts with them -- confirm this is acceptable.
_MILITARY_MARKERS = (
    " Base", " Aaf", " AFs", " Ahp", " Afb", " LRRS", " Lrrs", " Arb",
    " Naf", " NAS", " Nas", " Jrb", " Ns", " As", " Cgas", " Angb",
)

# OR together one `contains` test per marker, in the order listed above.
_is_military = F.col('name').contains(_MILITARY_MARKERS[0])
for _marker in _MILITARY_MARKERS[1:]:
    _is_military = _is_military | F.col('name').contains(_marker)

# True when any marker is found; False otherwise (including a null name).
df_airports_Final = df_airports_Final.withColumn(
    'military',
    F.when(_is_military, True).otherwise(False),
)

# groupBy().count() already returns one row per flag value, so distinct() is not needed.
df_airports_Final.groupBy("military").count().orderBy("military").show()

+--------+-----+
|military|count|
+--------+-----+
|   false| 1237|
|    true|  160|
+--------+-----+



## Pergunta 7

In [16]:
# Derive the administration level from case-sensitive fragments of the name.
# Branches are evaluated in order: I, then N, then R, then M.
_name = F.col('name')

df_airports_Final = df_airports_Final.withColumn(
    'administration',
    F.when(_name.contains("International")
           | _name.contains(" Intl")
           | _name.contains("Intercontinental"),
           "I")
     .when(_name.contains("National")
           | _name.contains(" Natl"),
           "N")
     .when(_name.contains("Regional")
           | _name.contains("Reigonal")   # presumably a misspelling present in the data -- confirm
           | _name.contains(" Rgnl")
           | _name.contains("County")
           | _name.contains(" Metro")
           | _name.contains("Metropolitan"),
           "R")
     .when(_name.contains("Municipal")
           | _name.contains(" Muni")
           | _name.contains("City"),
           "M")
     .otherwise("NaN"),
)

# groupBy().count() already returns one row per value, so distinct() is not needed.
df_airports_Final.groupBy("administration").count().orderBy("administration").show()

+--------------+-----+
|administration|count|
+--------------+-----+
|             I|  164|
|             M|  180|
|             N|    5|
|           NaN|  761|
|             R|  287|
+--------------+-----+



In [17]:
# Persist the transformed airports dataset as Parquet. The default save mode
# fails when the target already exists, which breaks every re-run of the
# notebook, so the previous output is overwritten explicitly.
(df_airports_Final.write
                  .mode("overwrite")
                  .parquet("C:/Users/amarti40/OneDrive - Capgemini/Desktop/ACELERAÇÃO PYSPARK/Semana 3 - Desafio Transformação/Outputs_Airports_Transformacao.parquet"))