In [1]:
# Installing required packages
!pip install pyspark
!pip install findspark



In [2]:
import findspark
findspark.init()
import datetime
import pandas as pd

In [3]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, BooleanType

In [4]:
# Criar o contexto do spark
sc = SparkContext()

# Instancia o criador de sessao do spark
spark = (SparkSession.builder
                     .master("local[5]")
                     .appName("Semana 4 - Desafio Relatórios"))

In [5]:
df_airports=spark.getOrCreate().read.parquet("C:/Users/amarti40/OneDrive - Capgemini/Desktop/ACELERAÇÃO PYSPARK/Semana 2 - Desafio Qualidade/Outputs_airports.parquet")
df_planes=spark.getOrCreate().read.parquet("C:/Users/amarti40/OneDrive - Capgemini/Desktop/ACELERAÇÃO PYSPARK/Semana 2 - Desafio Qualidade/Outputs_planes.parquet")
df_flights=spark.getOrCreate().read.parquet("C:/Users/amarti40/OneDrive - Capgemini/Desktop/ACELERAÇÃO PYSPARK/Semana 2 - Desafio Qualidade/Outputs_Flights.parquet")

In [6]:
print(df_airports.count())
print(df_planes.count())
print(df_flights.count())

1397
2628
10000


In [7]:
df_airports_origin = df_airports.select([F.col(c).alias(c+"_airports_origin") for c in df_airports.columns])
df_airports_dest = df_airports.select([F.col(c).alias(c+"_airports_dest") for c in df_airports.columns])

df_planes = df_planes.select([F.col(c).alias(c+"_planes") for c in df_planes.columns])
df_flights = df_flights.select([F.col(c).alias(c+"_flights") for c in df_flights.columns])

df_airports_origin.printSchema()
df_airports_dest.printSchema()
df_planes.printSchema()
df_flights.printSchema()

root
 |-- faa_airports_origin: string (nullable = true)
 |-- name_airports_origin: string (nullable = true)
 |-- lat_airports_origin: float (nullable = true)
 |-- lon_airports_origin: float (nullable = true)
 |-- alt_airports_origin: integer (nullable = true)
 |-- tz_airports_origin: integer (nullable = true)
 |-- dst_airports_origin: string (nullable = true)
 |-- qa_faa_airports_origin: string (nullable = true)
 |-- qa_name_airports_origin: string (nullable = true)
 |-- qa_lat_airports_origin: string (nullable = true)
 |-- qa_lon_airports_origin: string (nullable = true)
 |-- qa_alt_airports_origin: string (nullable = true)
 |-- qa_tz_airports_origin: string (nullable = true)
 |-- qa_dst_airports_origin: string (nullable = true)

root
 |-- faa_airports_dest: string (nullable = true)
 |-- name_airports_dest: string (nullable = true)
 |-- lat_airports_dest: float (nullable = true)
 |-- lon_airports_dest: float (nullable = true)
 |-- alt_airports_dest: integer (nullable = true)
 |-- tz_a

# Perguntas para qualidade

## Pergunta 1

In [23]:
 df_final = (df_flights.join(df_planes, df_planes.tailnum_planes == df_flights.tailnum_flights
                             ,"left" ).select(df_flights.tailnum_flights
                                             ,df_flights.origin_flights
                                             ,df_flights.dest_flights
                                             ,df_flights.qa_year_month_day_flights
                                             ,df_flights.qa_hour_minute_flights
                                             ,df_flights.qa_dep_arr_time_flights
                                             ,df_flights.qa_dep_arr_delay_flights
                                             ,df_flights.qa_carrier_flights
                                             ,df_flights.qa_tailnum_flights
                                             ,df_flights.qa_flight_flights
                                             ,df_flights.qa_origin_dest_flights
                                             ,df_flights.qa_air_time_flights
                                             ,df_flights.qa_distance_flights
                                             ,df_flights.qa_distance_airtime_flights
                                             ,df_planes.qa_tailnum_planes
                                             ,df_planes.qa_year_planes
                                             ,df_planes.qa_type_planes
                                             ,df_planes.qa_manufacturer_planes
                                             ,df_planes.qa_model_planes
                                             ,df_planes.qa_engines_planes
                                             ,df_planes.qa_seats_planes
                                             ,df_planes.qa_speed_planes
                                             ,df_planes.qa_engine_planes
                                             )
                       .join(df_airports_origin, df_airports_origin.faa_airports_origin == df_flights.origin_flights
                             ,"left").drop("faa_airports_origin"
                                             ,"name_airports_origin"
                                             ,"lat_airports_origin"
                                             ,"lon_airports_origin"
                                             ,"alt_airports_origin"
                                             ,"tz_airports_origin"
                                             ,"dst_airports_origin"
                                          )
                       .join(df_airports_dest, df_airports_dest.faa_airports_dest == df_flights.dest_flights
                             ,"left").drop("faa_airports_dest"
                                             ,"name_airports_dest"
                                             ,"lat_airports_dest"
                                             ,"lon_airports_dest"
                                             ,"alt_airports_dest"
                                             ,"tz_airports_dest"
                                             ,"dst_airports_dest"
                                             ,"tailnum_flights"
                                             ,"origin_flights"
                                             ,"dest_flights"
                                          )          
            )

df_final.printSchema()

print(df_final.count())

df_final.show()

root
 |-- qa_year_month_day_flights: string (nullable = true)
 |-- qa_hour_minute_flights: string (nullable = true)
 |-- qa_dep_arr_time_flights: string (nullable = true)
 |-- qa_dep_arr_delay_flights: string (nullable = true)
 |-- qa_carrier_flights: string (nullable = true)
 |-- qa_tailnum_flights: string (nullable = true)
 |-- qa_flight_flights: string (nullable = true)
 |-- qa_origin_dest_flights: string (nullable = true)
 |-- qa_air_time_flights: string (nullable = true)
 |-- qa_distance_flights: string (nullable = true)
 |-- qa_distance_airtime_flights: string (nullable = true)
 |-- qa_tailnum_planes: string (nullable = true)
 |-- qa_year_planes: string (nullable = true)
 |-- qa_type_planes: string (nullable = true)
 |-- qa_manufacturer_planes: string (nullable = true)
 |-- qa_model_planes: string (nullable = true)
 |-- qa_engines_planes: string (nullable = true)
 |-- qa_seats_planes: string (nullable = true)
 |-- qa_speed_planes: string (nullable = true)
 |-- qa_engine_planes: s

## Pergunta 2

In [9]:
columns = ['Var', 'F', 'I', 'M', 'S', 'T']
Vals = [("Col1", 0, 0, 0, 0, 0)]

df_transposed = spark.getOrCreate().createDataFrame(Vals, columns)
df_transposed = df_transposed.filter(F.col("Var") != "Col1")

names = df_final.schema.names

for c in names:
    df_final = df_final.withColumn(c, (F.when((F.col(c).startswith("M")) ,"M")
                                                                .when((F.col(c).startswith("F")) ,"F")
                                                                .when((F.col(c).startswith("I")) ,"I")
                                                                .when((F.col(c).startswith("S")) ,"S")
                                                                .when((F.col(c).startswith("T")) ,"T")
                                                                .otherwise(F.col(c))
                                      )
                                  )
    df = df_final.filter((F.col(c).isNotNull()) & (F.col(c).isin("M","F","I","S","T"))).select(F.col(c)).groupBy(F.col(c).alias("Issues_")).agg(F.count(F.col(c)).cast('int').alias(c)).orderBy(F.col(c))
    df = (df.withColumn("F", (F.when((F.col("Issues_").startswith("F")), F.col(c)+0).otherwise(0)))
            .withColumn("I", (F.when((F.col("Issues_").startswith("I")), F.col(c)+0).otherwise(0)))
            .withColumn("M", (F.when((F.col("Issues_").startswith("M")), F.col(c)+0).otherwise(0)))
            .withColumn("S", (F.when((F.col("Issues_").startswith("S")), F.col(c)+0).otherwise(0)))
            .withColumn("T", (F.when((F.col("Issues_").startswith("T")), F.col(c)+0).otherwise(0)))
            .withColumn("Var", (F.when((F.col("Issues_").isNotNull()), c)))
         )
    
    df = df.select("Var", "F", "I", "M", "S", "T")
    
    df_transposed = df_transposed.union(df)

In [22]:
df_transposed.show()

+--------------------+----+---+----+---+----+
|                 Var|   F|  I|   M|  S|   T|
+--------------------+----+---+----+---+----+
|qa_hour_minute_fl...|   0|  1|   0|  0|   0|
|qa_hour_minute_fl...|   0|  0|  48|  0|   0|
|qa_dep_arr_time_f...|   0|  0|  55|  0|   0|
|qa_dep_arr_time_f...| 241|  0|   0|  0|   0|
|qa_dep_arr_delay_...|   0|  0|  75|  0|   0|
|  qa_tailnum_flights|   0|  0|   0| 14|   0|
|  qa_tailnum_flights| 989|  0|   0|  0|   0|
|   qa_flight_flights|6158|  0|   0|  0|   0|
| qa_air_time_flights|   0|  0|  75|  0|   0|
|qa_distance_airti...|   0|  0|  75|  0|   0|
|qa_distance_airti...|   0|  0|   0|  0|9925|
|   qa_tailnum_planes| 552|  0|   0|  0|   0|
|      qa_year_planes|   0|  8|   0|  0|   0|
|      qa_year_planes|   0|  0|  94|  0|   0|
|     qa_model_planes|   9|  0|   0|  0|   0|
|     qa_speed_planes|   0|  0|9443|  0|   0|
+--------------------+----+---+----+---+----+



## Pergunta 3

In [19]:
(df_transposed.groupBy("Var")
              .agg(F.max('M').alias('Max_M'))
              .agg(F.max(F.struct(F.col('Max_M'), F.col('Var'))).alias('Max_Var'))
              .select('Max_Var.Var', 'Max_Var.Max_M').show())

+---------------+-----+
|            Var|Max_M|
+---------------+-----+
|qa_speed_planes| 9443|
+---------------+-----+



## Pergunta 4

In [20]:
(df_transposed.groupBy("Var")
              .agg(F.max('F').alias('Max_F'))
              .agg(F.max(F.struct(F.col('Max_F'), F.col('Var'))).alias('Max_Var'))
              .select('Max_Var.Var', 'Max_Var.Max_F').show())

+-----------------+-----+
|              Var|Max_F|
+-----------------+-----+
|qa_flight_flights| 6158|
+-----------------+-----+



## Pergunta 5

In [21]:
(df_transposed.groupBy("Var")
              .agg(F.max('I').alias('Max_I'))
              .agg(F.max(F.struct(F.col('Max_I'), F.col('Var'))).alias('Max_Var'))
              .select('Max_Var.Var', 'Max_Var.Max_I').show())

+--------------+-----+
|           Var|Max_I|
+--------------+-----+
|qa_year_planes|    8|
+--------------+-----+

