# PROJETO ACELERAÇÃO PySpark
Autor: Andre Da Silva Martins
Data: 10/03/2022


# SEMANA 2 - DESAFIO QUALIDADE


## Dia 1 - Pergutas 1 [Airports] - 5 [Airports]

### Instalando Bibliotecas PySpark


In [2]:
!pip install pyspark
!pip install findspark



### Importando Bibliotecas

In [3]:
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, lit, length, col, substring, trim, instr, concat_ws, concat, lpad
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, FloatType

### Criando Sessão Spark

In [4]:
sc = SparkContext()

spark = (SparkSession
            .builder
            .appName("Semana 2 - Desafio Qualidade (DataFrame)")
            .config("spark.some.config.option", "some-value")
            .getOrCreate())

#Inicializando Sessão
spark

### Importando Dataset Airports

In [8]:
schema_airports = StructType([
    StructField("faa",  StringType(),  True),
    StructField("name", StringType(),  True),
    StructField("lat",  FloatType(),   True),
    StructField("lon",  FloatType(),   True),
    StructField("alt",  IntegerType(), True),
    StructField("tz",   IntegerType(), True),
    StructField("dst",  StringType(),  True)
])

df = (spark.read.format("csv") 
                  .option("header", True) 
                  .schema(schema_airports) 
                  .load("C:/Users/amarti40/OneDrive - Capgemini/Desktop/ACELERAÇÃO PYSPARK/Data/Airports.csv")
                 )

In [9]:
df.show()
df.printSchema()

+---+--------------------+---------+-----------+----+---+---+
|faa|                name|      lat|        lon| alt| tz|dst|
+---+--------------------+---------+-----------+----+---+---+
|04G|   Lansdowne Airport|41.130474|  -80.61958|1044| -5|  A|
|06A|Moton Field Munic...| 32.46057|  -85.68003| 264| -5|  A|
|06C| Schaumburg Regional| 41.98934|  -88.10124| 801| -6|  A|
|06N|     Randall Airport| 41.43191|  -74.39156| 523| -5|  A|
|09J|Jekyll Island Air...|31.074472|  -81.42778|  11| -4|  A|
|0A9|Elizabethton Muni...|36.371223| -82.173416|1593| -4|  A|
|0G6|Williams County A...|41.467304| -84.506775| 730| -5|  A|
|0G7|Finger Lakes Regi...|42.883564| -76.781235| 492| -5|  A|
|0P2|Shoestring Aviati...|39.794823| -76.647194|1000| -5|  U|
|0S9|Jefferson County ...| 48.05381|-122.810646| 108| -8|  A|
|0W3|Harford County Ai...|39.566837|   -76.2024| 409| -5|  A|
|10C|  Galt Field Airport| 42.40289| -88.375114| 875| -6|  U|
|17G|Port Bucyrus-Craw...|40.781555|  -82.97481|1003| -5|  A|
|19A|Jac

### Inseriondo Linhas com erros 

In [6]:
columns = ['faa', 'name', 'lat', 'lon', 'alt', 'tz', 'dst' ]
Vals = [("TESTES","TESTE",123.45,-0123.45,12345,-12,"J"), ("TE","TESTE",123.45,-0123.45,12345,-12,"K"), (None,"TESTE",123.45,-0123.45,12345,-12,"K"), ("TESTE",None,123.45,-0123.45,12345,-12,"K"), ("TESTE","TESTE",None,-0123.45,12345,-12,"K"), ("TESTE","TESTE",-181.00,-0123.45,12345,-12,"K"), ("TESTE","TESTE",181.00,-0123.45,12345,-12,"K"), ("TESTE","TESTE",181.00,None,12345,-12,"K"), ("TESTE","TESTE",181.00,181.00,12345,-12,"K"), ("TESTE","TESTE",181.00,-181.00,12345,-12,"K"), ("TESTE","TESTE",181.00,-181.00,None,-12,"K")]
newRow = spark.createDataFrame(Vals, columns)

df_teste = df.union(newRow)

### Pergunta 1 - Airports

#### Tabela Teste

In [7]:
df2 = (df_teste.withColumn("qa_faa",
            when((df_teste.faa.isNull()), lit("M"))
            .when((length(df_teste.faa) < 3) | (length(df_teste.faa) > 5), lit("F")) \
        .otherwise(lit(None)))
      )

df2.filter(df2.qa_faa.isNotNull()).show()

+------+-----+------+-------+-----+---+---+------+
|   faa| name|   lat|    lon|  alt| tz|dst|qa_faa|
+------+-----+------+-------+-----+---+---+------+
|TESTES|TESTE|123.45|-123.45|12345|-12|  J|     F|
|    TE|TESTE|123.45|-123.45|12345|-12|  K|     F|
|  null|TESTE|123.45|-123.45|12345|-12|  K|     M|
+------+-----+------+-------+-----+---+---+------+



#### Sumarizando Casos

In [8]:
df2.createOrReplaceTempView("Teste")

spark.sql("select qa_faa, Count(*) from Teste Group By qa_faa").show()

+------+--------+
|qa_faa|count(1)|
+------+--------+
|  null|    1405|
|     F|       2|
|     M|       1|
+------+--------+



#### Tabela Oficial

In [10]:
dfo2 = (df.withColumn("qa_faa",
            when((df.faa.isNull()), lit("M"))
            .when((length(df.faa) < 3) | (length(df.faa) > 5), lit("F")) \
        .otherwise(lit(None)))
      )

dfo2.filter(dfo2.qa_faa.isNotNull()).show()

+---+----+---+---+---+---+---+------+
|faa|name|lat|lon|alt| tz|dst|qa_faa|
+---+----+---+---+---+---+---+------+
+---+----+---+---+---+---+---+------+



### Perguntas 2 - Airports

#### Tabela Teste

In [10]:
df3 = (df_teste.withColumn("qa_name",
            when((df_teste.name.isNull()), lit("M"))
        .otherwise(lit(None)))
      )

df3.filter(df3.qa_name.isNotNull()).show()

+-----+----+------+-------+-----+---+---+-------+
|  faa|name|   lat|    lon|  alt| tz|dst|qa_name|
+-----+----+------+-------+-----+---+---+-------+
|TESTE|null|123.45|-123.45|12345|-12|  K|      M|
+-----+----+------+-------+-----+---+---+-------+



#### Sumarizando Casos

In [11]:
df3.createOrReplaceTempView("Teste")

spark.sql("select qa_name, Count(*) from Teste Group By qa_name").show()

+-------+--------+
|qa_name|count(1)|
+-------+--------+
|   null|    1407|
|      M|       1|
+-------+--------+



#### Tabela Oficial

In [11]:
dfo3 = (dfo2.withColumn("qa_name",
            when((dfo2.name.isNull()), lit("M"))
        .otherwise(lit(None)))
      )

dfo3.filter(dfo3.qa_name.isNotNull()).show()

+---+----+---+---+---+---+---+------+-------+
|faa|name|lat|lon|alt| tz|dst|qa_faa|qa_name|
+---+----+---+---+---+---+---+------+-------+
+---+----+---+---+---+---+---+------+-------+



### Perguntas 3 - Airports

#### Tabela Teste

In [13]:
df4 = (df_teste.withColumn("qa_lat",
            when((df_teste.lat.isNull()), lit("M"))
            .when((df_teste.lat < -180) | (df_teste.lat > 180), lit("I"))
            .when((df_teste.lat.cast("int").isNull()), lit("A"))
        .otherwise(lit(None)))
      )

df4.filter(df4.qa_lat.isNotNull()).show()



+-----+-----+------+-------+-----+---+---+------+
|  faa| name|   lat|    lon|  alt| tz|dst|qa_lat|
+-----+-----+------+-------+-----+---+---+------+
|TESTE|TESTE|  null|-123.45|12345|-12|  K|     M|
|TESTE|TESTE|-181.0|-123.45|12345|-12|  K|     I|
|TESTE|TESTE| 181.0|-123.45|12345|-12|  K|     I|
|TESTE|TESTE| 181.0|   null|12345|-12|  K|     I|
|TESTE|TESTE| 181.0|  181.0|12345|-12|  K|     I|
|TESTE|TESTE| 181.0| -181.0|12345|-12|  K|     I|
|TESTE|TESTE| 181.0| -181.0| null|-12|  K|     I|
+-----+-----+------+-------+-----+---+---+------+



#### Sumarizando Casos

In [14]:
df4.createOrReplaceTempView("Teste")

spark.sql("select qa_lat, Count(*) from Teste Group By qa_lat").show()

+------+--------+
|qa_lat|count(1)|
+------+--------+
|  null|    1401|
|     M|       1|
|     I|       6|
+------+--------+



#### Tabela Oficial

In [12]:
dfo4 = (dfo3.withColumn("qa_lat",
            when((dfo3.lat.isNull()), lit("M"))
            .when((dfo3.lat < -180) | (dfo3.lat > 180), lit("I"))
            .when((dfo3.lat.cast("int").isNull()), lit("A"))
        .otherwise(lit(None)))
      )

dfo4.filter(dfo4.qa_lat.isNotNull()).show()

+---+----+---+---+---+---+---+------+-------+------+
|faa|name|lat|lon|alt| tz|dst|qa_faa|qa_name|qa_lat|
+---+----+---+---+---+---+---+------+-------+------+
+---+----+---+---+---+---+---+------+-------+------+



### Perguntas 4 - Airports

#### Tabela Teste

In [16]:
df5 = (df_teste.withColumn("qa_lon",
            when((df_teste.lon.isNull()), lit("M"))
            .when((df_teste.lon < -180) | (df_teste.lon > 180), lit("I"))
            .when((df_teste.lon.cast("int").isNull()), lit("A"))
        .otherwise(lit(None)))
      )

df5.filter(df5.qa_lon.isNotNull()).show()

+-----+-----+-----+------+-----+---+---+------+
|  faa| name|  lat|   lon|  alt| tz|dst|qa_lon|
+-----+-----+-----+------+-----+---+---+------+
|TESTE|TESTE|181.0|  null|12345|-12|  K|     M|
|TESTE|TESTE|181.0| 181.0|12345|-12|  K|     I|
|TESTE|TESTE|181.0|-181.0|12345|-12|  K|     I|
|TESTE|TESTE|181.0|-181.0| null|-12|  K|     I|
+-----+-----+-----+------+-----+---+---+------+



#### Sumarizando Casos

In [17]:
df5.createOrReplaceTempView("Teste")

spark.sql("select qa_lon, Count(*) from Teste Group By qa_lon").show()

+------+--------+
|qa_lon|count(1)|
+------+--------+
|  null|    1404|
|     M|       1|
|     I|       3|
+------+--------+



#### Tabela Oficial

In [13]:
dfo5 = (dfo4.withColumn("qa_lon",
            when((dfo4.lon.isNull()), lit("M"))
            .when((dfo4.lon < -180) | (dfo4.lon > 180), lit("I"))
            .when((dfo4.lon.cast("int").isNull()), lit("A"))
        .otherwise(lit(None)))
      )

dfo5.filter(dfo5.qa_lon.isNotNull()).show()

+---+----+---+---+---+---+---+------+-------+------+------+
|faa|name|lat|lon|alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|
+---+----+---+---+---+---+---+------+-------+------+------+
+---+----+---+---+---+---+---+------+-------+------+------+



### Perguntas 5 - Airports

#### Tabela Teste

In [19]:
df6 = (df_teste.withColumn("qa_alt",
            when((df_teste.alt.isNull()), lit("M"))
            .when((df_teste.alt < 0), lit("I"))
            .when((df_teste.alt.cast("int").isNull()), lit("A"))
        .otherwise(lit(None)))
      )

df6.filter(df6.qa_alt.isNotNull()).show()

+-----+-------------+---------+-----------+----+---+---+------+
|  faa|         name|      lat|        lon| alt| tz|dst|qa_alt|
+-----+-------------+---------+-----------+----+---+---+------+
|  IPL|  Imperial Co|32.834219|-115.578744| -54| -8|  A|     I|
|  NJK|El Centro Naf|32.829222|-115.671667| -42| -8|  A|     I|
|TESTE|        TESTE|    181.0|     -181.0|null|-12|  K|     M|
+-----+-------------+---------+-----------+----+---+---+------+



#### Sumarizando Casos

In [20]:
df6.createOrReplaceTempView("Teste")

spark.sql("select qa_alt, Count(*) from Teste Group By qa_alt").show()

+------+--------+
|qa_alt|count(1)|
+------+--------+
|  null|    1405|
|     I|       2|
|     M|       1|
+------+--------+



#### Tabela Oficial

In [14]:
dfo6 = (dfo5.withColumn("qa_alt",
            when((dfo5.alt.isNull()), lit("M"))
            .when((dfo5.alt < 0), lit("I"))
            .when((dfo5.alt.cast("int").isNull()), lit("A"))
        .otherwise(lit(None)))
      )

dfo6.filter(dfo6.qa_alt.isNotNull()).show()

+---+-------------+---------+----------+---+---+---+------+-------+------+------+------+
|faa|         name|      lat|       lon|alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|
+---+-------------+---------+----------+---+---+---+------+-------+------+------+------+
|IPL|  Imperial Co| 32.83422|-115.57874|-54| -8|  A|  null|   null|  null|  null|     I|
|NJK|El Centro Naf|32.829224|-115.67167|-42| -8|  A|  null|   null|  null|  null|     I|
+---+-------------+---------+----------+---+---+---+------+-------+------+------+------+



## Dia 2 - Pergutas 6 [Airports] - 3 [Planes]

### Pergunta 6 - Airports

#### Tabela Teste

In [22]:
df7 = (df_teste.withColumn("qa_tz",
            when((df_teste.tz.isNull()), lit("M"))
            .when((~df_teste.tz.between(-11, 14)), lit("I"))
            .when((df_teste.tz.rlike("([A-Z])") ==True), lit("A"))
        .otherwise(lit(None)))
      )

df7.filter(df7.qa_tz.isNotNull()).show()

+------+-----+------+-------+-----+---+---+-----+
|   faa| name|   lat|    lon|  alt| tz|dst|qa_tz|
+------+-----+------+-------+-----+---+---+-----+
|TESTES|TESTE|123.45|-123.45|12345|-12|  J|    I|
|    TE|TESTE|123.45|-123.45|12345|-12|  K|    I|
|  null|TESTE|123.45|-123.45|12345|-12|  K|    I|
| TESTE| null|123.45|-123.45|12345|-12|  K|    I|
| TESTE|TESTE|  null|-123.45|12345|-12|  K|    I|
| TESTE|TESTE|-181.0|-123.45|12345|-12|  K|    I|
| TESTE|TESTE| 181.0|-123.45|12345|-12|  K|    I|
| TESTE|TESTE| 181.0|   null|12345|-12|  K|    I|
| TESTE|TESTE| 181.0|  181.0|12345|-12|  K|    I|
| TESTE|TESTE| 181.0| -181.0|12345|-12|  K|    I|
| TESTE|TESTE| 181.0| -181.0| null|-12|  K|    I|
+------+-----+------+-------+-----+---+---+-----+



#### Sumarizando Casos

In [23]:
df7.createOrReplaceTempView("Teste")

spark.sql("select qa_tz, Count(*) from Teste Group By qa_tz").show()

+-----+--------+
|qa_tz|count(1)|
+-----+--------+
| null|    1397|
|    I|      11|
+-----+--------+



#### Tabela Oficial

In [15]:
dfo7 = (dfo6.withColumn("qa_tz",
            when((dfo6.tz.isNull()), lit("M"))
            .when((~dfo6.tz.between(-11, 14)), lit("I"))
            .when((dfo6.tz.rlike("([A-Z])") ==True), lit("A"))
        .otherwise(lit(None)))
      )

dfo7.filter(dfo7.qa_tz.isNotNull()).show()

+---+----+---+---+---+---+---+------+-------+------+------+------+-----+
|faa|name|lat|lon|alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|qa_tz|
+---+----+---+---+---+---+---+------+-------+------+------+------+-----+
+---+----+---+---+---+---+---+------+-------+------+------+------+-----+



### Pergunta 7 - Airports

#### Tabela Teste

In [25]:
df8 = (df_teste.withColumn("qa_dst",
            when((df_teste.dst.isNull()), lit("M"))
            .when((~df_teste.dst.isin("E","A","S","O","Z","N","U")), lit("C"))
            .when((df_teste.dst.rlike("([/d])") ==True), lit("N"))
        .otherwise(lit(None)))
      )

df8.filter(df8.qa_dst.isNotNull()).show()

+------+-----+------+-------+-----+---+---+------+
|   faa| name|   lat|    lon|  alt| tz|dst|qa_dst|
+------+-----+------+-------+-----+---+---+------+
|TESTES|TESTE|123.45|-123.45|12345|-12|  J|     C|
|    TE|TESTE|123.45|-123.45|12345|-12|  K|     C|
|  null|TESTE|123.45|-123.45|12345|-12|  K|     C|
| TESTE| null|123.45|-123.45|12345|-12|  K|     C|
| TESTE|TESTE|  null|-123.45|12345|-12|  K|     C|
| TESTE|TESTE|-181.0|-123.45|12345|-12|  K|     C|
| TESTE|TESTE| 181.0|-123.45|12345|-12|  K|     C|
| TESTE|TESTE| 181.0|   null|12345|-12|  K|     C|
| TESTE|TESTE| 181.0|  181.0|12345|-12|  K|     C|
| TESTE|TESTE| 181.0| -181.0|12345|-12|  K|     C|
| TESTE|TESTE| 181.0| -181.0| null|-12|  K|     C|
+------+-----+------+-------+-----+---+---+------+



#### Sumarizando Casos

In [26]:
df8.createOrReplaceTempView("Teste")

spark.sql("select qa_dst, Count(*) from Teste Group By qa_dst").show()

+------+--------+
|qa_dst|count(1)|
+------+--------+
|  null|    1397|
|     C|      11|
+------+--------+



#### Tabela Oficial

In [16]:
dfo8 = (dfo7.withColumn("qa_dst",
            when((dfo7.dst.isNull()), lit("M"))
            .when((~dfo7.dst.isin("E","A","S","O","Z","N","U")), lit("C"))
            .when((dfo7.dst.rlike("([/d])") ==True), lit("N"))
        .otherwise(lit(None)))
      )

dfo8.filter(dfo8.qa_dst.isNotNull()).show()

+---+----+---+---+---+---+---+------+-------+------+------+------+-----+------+
|faa|name|lat|lon|alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|qa_tz|qa_dst|
+---+----+---+---+---+---+---+------+-------+------+------+------+-----+------+
+---+----+---+---+---+---+---+------+-------+------+------+------+-----+------+



#### FINALIZANDO DATASET AIRPORTS TO PARQUET FILE

In [17]:
dfo8.write.options(header='True', delimiter=',').csv("C:/Users/amarti40/OneDrive - Capgemini/Desktop/ACELERAÇÃO PYSPARK/Semana 2 - Desafio Qualidade/Outputs_airports.csv")

In [18]:
dfo8.write.parquet("C:/Users/amarti40/OneDrive - Capgemini/Desktop/ACELERAÇÃO PYSPARK/Semana 2 - Desafio Qualidade/Outputs_airports.parquet")

### Importando Dataset Airports

In [14]:
schema = (StructType()
              .add("tailnum",StringType(),True) 
              .add("year",IntegerType(),True) 
              .add("type",StringType(),True) 
              .add("manufacturer",StringType(),True) 
              .add("model",StringType(),True) 
              .add("engines",IntegerType(),True) 
              .add("seats",IntegerType(),True) 
              .add("speed",IntegerType(),True) 
              .add("engine",StringType(),True) 
         )

df_planes = (spark.read.format("csv") 
                  .option("header", True) 
                  .schema(schema) 
                  .load("C:/Users/amarti40/OneDrive - Capgemini/Desktop/ACELERAÇÃO PYSPARK/Data/planes.csv")
                 )

In [20]:
df_planes.show()
df_planes.printSchema()

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|
| N105UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|
| N107US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|
| N108UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|
| N109UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|
| N110UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null

### Inseriondo Linhas com erros 

In [32]:
Vals = [(None, None, None, None, None, None, None, None, None), ("T12345AZ", 1949, "Teste", "Teste", "Teste", 5, 1, 151, "Teste")]

newRow = spark.createDataFrame(Vals, ['tailnum', 'year', 'type', 'manufacturer', 'model', 'engines', 'seats', 'speed', 'engine'])

df_planes_teste = df_planes.union(newRow)

### Pergunta 1 - Planes

#### Tabela Teste

In [33]:
df_plane2 = (df_planes_teste.withColumn("qa_tailnum",
            when((df_planes_teste.tailnum.isNull()), lit("M"))
            .when(~(length(df_planes_teste.tailnum).isin(5,6)), lit("S"))
            .when((df_planes_teste.tailnum.rlike("^N([0-9]{1,4})([A-Z]{1,2}$)")) == False, lit("F"))
            .when((substring(df_planes_teste.tailnum,1,1) != "N"), lit("FN"))
            .when((instr(df_planes_teste.tailnum,"O")>0) | (instr(df_planes.tailnum,"I")>0) , lit("FE"))
            .when((substring(df_planes_teste.tailnum,2,1)==0) , lit("FE"))
        .otherwise(lit(None)))
      )

df_plane2.filter(df_plane2.qa_tailnum.isNotNull()).show()


+-------+----+--------------------+------------+-------+-------+-----+-----+---------+----------+
|tailnum|year|                type|manufacturer|  model|engines|seats|speed|   engine|qa_tailnum|
+-------+----+--------------------+------------+-------+-------+-----+-----+---------+----------+
| N11206|2000|Fixed wing multi ...|      BOEING|737-824|      2|  149| null|Turbo-fan|         F|
| N12114|1995|Fixed wing multi ...|      BOEING|757-224|      2|  178| null|Turbo-jet|         F|
| N12216|1998|Fixed wing multi ...|      BOEING|737-824|      2|  149| null|Turbo-fan|         F|
| N12218|1998|Fixed wing multi ...|      BOEING|737-824|      2|  149| null|Turbo-fan|         F|
| N12221|1998|Fixed wing multi ...|      BOEING|737-824|      2|  149| null|Turbo-fan|         F|
| N12225|1998|Fixed wing multi ...|      BOEING|737-824|      2|  149| null|Turbo-fan|         F|
| N12238|1999|Fixed wing multi ...|      BOEING|737-824|      2|  149| null|Turbo-fan|         F|
| N13248|1999|Fixed 

#### Sumarizando Casos

In [34]:
df_plane2.createOrReplaceTempView("Teste")

spark.sql("select qa_tailnum, Count(*) from Teste Group By qa_tailnum").show()

+----------+--------+
|qa_tailnum|count(1)|
+----------+--------+
|         F|     298|
|      null|    2330|
|         M|       1|
|         S|       1|
+----------+--------+



#### Tabela Oficial

In [15]:
dfo_plane2 = (df_planes.withColumn("qa_tailnum",
            when((df_planes.tailnum.isNull()), lit("M"))
            .when(~(length(df_planes.tailnum).isin(5,6)), lit("S"))
            .when((df_planes.tailnum.rlike("^N([0-9]{1,4})([A-Z]{1,2}$)")) == False, lit("F"))
            .when((substring(df_planes.tailnum,1,1) != "N"), lit("FN"))
            .when((instr(df_planes.tailnum,"O")>0) | (instr(df_planes.tailnum,"I")>0) , lit("FE"))
            .when((substring(df_planes.tailnum,2,1)==0) , lit("FE"))
        .otherwise(lit(None)))
      )

dfo_plane2.groupBy(col("qa_tailnum")).count().distinct().orderBy(col("qa_tailnum")).show(100)

+----------+-----+
|qa_tailnum|count|
+----------+-----+
|      null| 2330|
|         F|  298|
+----------+-----+



### Pergunta 2 - Planes

#### Tabela Teste

In [36]:
df_plane3 = (df_planes_teste.withColumn("qa_year",
            when((df_planes_teste.year.isNull()), lit("M"))
            .when((df_planes_teste.year <= 1950), lit("I"))
        .otherwise(lit(None)))
      )

df_plane3.filter(df_plane3.qa_year.isNotNull()).show()

+-------+----+--------------------+----------------+-------------+-------+-----+-----+-------------+-------+
|tailnum|year|                type|    manufacturer|        model|engines|seats|speed|       engine|qa_year|
+-------+----+--------------------+----------------+-------------+-------+-----+-----+-------------+-------+
| N174US|null|Fixed wing multi ...|AIRBUS INDUSTRIE|     A321-211|      2|  199| null|    Turbo-jet|      M|
| N177US|null|Fixed wing multi ...|AIRBUS INDUSTRIE|     A321-211|      2|  199| null|    Turbo-jet|      M|
| N181UW|null|Fixed wing multi ...|AIRBUS INDUSTRIE|     A321-211|      2|  199| null|    Turbo-jet|      M|
| N194UW|null|Fixed wing multi ...|          AIRBUS|     A321-211|      2|  199| null|    Turbo-fan|      M|
| N235SW|   0|Fixed wing multi ...|         EMBRAER|    EMB-120ER|      2|   32| null|   Turbo-prop|      I|
| N271LV|null|Fixed wing multi ...|          BOEING|      737-705|      2|  149| null|    Turbo-fan|      M|
| N298WN|null|Fixed

#### Sumarizando Casos

In [37]:
df_plane3.createOrReplaceTempView("Teste")

spark.sql("select qa_year, Count(*) from Teste Group By qa_year").show()

+-------+--------+
|qa_year|count(1)|
+-------+--------+
|   null|    2567|
|      M|      61|
|      I|       2|
+-------+--------+



#### Tabela Oficial

In [18]:
dfo_plane3 = (dfo_plane2.withColumn("qa_year",
            when((dfo_plane2.year.isNull()), lit("M"))
            .when((dfo_plane2.year <= 1950), lit("I"))
        .otherwise(lit(None)))
      )

dfo_plane3.filter(dfo_plane3.qa_year.isNotNull()).show()

+-------+----+--------------------+----------------+-------------+-------+-----+-----+-------------+----------+-------+
|tailnum|year|                type|    manufacturer|        model|engines|seats|speed|       engine|qa_tailnum|qa_year|
+-------+----+--------------------+----------------+-------------+-------+-----+-----+-------------+----------+-------+
| N174US|null|Fixed wing multi ...|AIRBUS INDUSTRIE|     A321-211|      2|  199| null|    Turbo-jet|      null|      M|
| N177US|null|Fixed wing multi ...|AIRBUS INDUSTRIE|     A321-211|      2|  199| null|    Turbo-jet|      null|      M|
| N181UW|null|Fixed wing multi ...|AIRBUS INDUSTRIE|     A321-211|      2|  199| null|    Turbo-jet|      null|      M|
| N194UW|null|Fixed wing multi ...|          AIRBUS|     A321-211|      2|  199| null|    Turbo-fan|      null|      M|
| N235SW|   0|Fixed wing multi ...|         EMBRAER|    EMB-120ER|      2|   32| null|   Turbo-prop|      null|      I|
| N271LV|null|Fixed wing multi ...|     

### Pergunta 3 - Planes

#### Tabela Teste

In [39]:
Types = ["Fixed wing multi engine","Fixed wing single engine","Rotorcraft"]

df_plane4 = (df_planes_teste.withColumn("qa_type",
            when((df_planes_teste.type.isNull()), lit("M"))
            .when(~(df_planes_teste.type.isin(Types)), lit("C"))
        .otherwise(lit(None)))
      )

df_plane4.filter(df_plane4.qa_type.isNotNull()).show()

+--------+----+-----+------------+-----+-------+-----+-----+------+-------+
| tailnum|year| type|manufacturer|model|engines|seats|speed|engine|qa_type|
+--------+----+-----+------------+-----+-------+-----+-----+------+-------+
|    null|null| null|        null| null|   null| null| null|  null|      M|
|T12345AZ|1949|Teste|       Teste|Teste|      5|    1|  151| Teste|      C|
+--------+----+-----+------------+-----+-------+-----+-----+------+-------+



#### Sumarizando Casos

In [40]:
df_plane4.createOrReplaceTempView("Teste")

spark.sql("select qa_type, Count(*) from Teste Group By qa_type").show()

+-------+--------+
|qa_type|count(1)|
+-------+--------+
|   null|    2628|
|      M|       1|
|      C|       1|
+-------+--------+



#### Tabela Oficial

In [19]:
Types = ["Fixed wing multi engine","Fixed wing single engine","Rotorcraft"]

dfo_plane4 = (dfo_plane3.withColumn("qa_type",
            when((dfo_plane3.type.isNull()), lit("M"))
            .when(~(dfo_plane3.type.isin(Types)), lit("C"))
        .otherwise(lit(None)))
      )

dfo_plane4.filter(dfo_plane4.qa_type.isNotNull()).show()

+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+
|tailnum|year|type|manufacturer|model|engines|seats|speed|engine|qa_tailnum|qa_year|qa_type|
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+



## Dia 3 - Pergutas 4 [Planes] - 9 [Planes]

### Pergunta 4 - Planes

#### Tabela Teste

##### Proposta 1

In [42]:
df_plane5 = (df_planes_teste.withColumn("qa_manufacturer",
                when((df_planes_teste.manufacturer.isNull()), "M")
                .when((instr(df_planes_teste.manufacturer,"AIRBUS")==0)
                    &(instr(df_planes_teste.manufacturer,"BOEING")==0)
                    &(instr(df_planes_teste.manufacturer,"BOMBARDIER")==0)
                    &(instr(df_planes_teste.manufacturer,"CESSNA")==0)
                    &(instr(df_planes_teste.manufacturer,"EMBRAER")==0)
                    &(instr(df_planes_teste.manufacturer,"SIKORSKY")==0)
                    &(instr(df_planes_teste.manufacturer,"CANADAIR")==0)
                    &(instr(df_planes_teste.manufacturer,"PIPER")==0)
                    &(instr(df_planes_teste.manufacturer,"MCDONNELL DOUGLAS")==0)
                    &(instr(df_planes_teste.manufacturer,"CIRRUS")==0)
                    &(instr(df_planes_teste.manufacturer,"BELL")==0)
                    &(instr(df_planes_teste.manufacturer,"KILDALL GARY")==0)
                    &(instr(df_planes_teste.manufacturer,"LAMBERT RICHARD")==0)
                    &(instr(df_planes_teste.manufacturer,"BARKER JACK")==0)
                    &(instr(df_planes_teste.manufacturer,"ROBINSON HELICOPTER")==0)
                    &(instr(df_planes_teste.manufacturer,"GULFSTREAM")==0)
                    # &(instr(df_planes_teste.manufacturer,"MARZ BARRY")==0)
                    &(df_planes_teste.manufacturer.like("%MARZ BARRY%")==0)
                    , "C")
                .otherwise(lit(None)))
            )

df_plane5.filter(df_plane5.qa_manufacturer.isNotNull()).show()

+--------+----+-----+------------+-----+-------+-----+-----+------+---------------+
| tailnum|year| type|manufacturer|model|engines|seats|speed|engine|qa_manufacturer|
+--------+----+-----+------------+-----+-------+-----+-----+------+---------------+
|    null|null| null|        null| null|   null| null| null|  null|              M|
|T12345AZ|1949|Teste|       Teste|Teste|      5|    1|  151| Teste|              C|
+--------+----+-----+------------+-----+-------+-----+-----+------+---------------+



##### Proposta 2

In [43]:
Manufacturer = ["AIRBUS","BOEING","BOMBARDIER","CESSNA","EMBRAER","SIKORSKY","CANADAIR","PIPER","MCDONNELL DOUGLAS","CIRRUS","BELL","KILDALL GARY","LAMBERT RICHARD","BARKER JACK","ROBINSON HELICOPTER","GULFSTREAM","MARZ BARRY"]

df_plane5 = (df_planes_teste.withColumn("qa_manufacturer",
            when((df_planes_teste.manufacturer.isNull()), "M")
            .when(~(df_planes_teste.manufacturer.isin(Manufacturer)), "C")
        .otherwise(lit(None)))
      )

df_plane5.filter(df_plane5.qa_manufacturer.isNotNull()).show()

+-------+----+--------------------+----------------+-----------+-------+-----+-----+---------+---------------+
|tailnum|year|                type|    manufacturer|      model|engines|seats|speed|   engine|qa_manufacturer|
+-------+----+--------------------+----------------+-----------+-------+-----+-----+---------+---------------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|   A320-214|      2|  182| null|Turbo-fan|              C|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|   A320-214|      2|  182| null|Turbo-fan|              C|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|   A320-214|      2|  182| null|Turbo-fan|              C|
| N105UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|   A320-214|      2|  182| null|Turbo-fan|              C|
| N107US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|   A320-214|      2|  182| null|Turbo-fan|              C|
| N108UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|   A320-214|      2|  182| null|Turbo-fan|              C|
|

#### Sumarizando Casos

In [44]:
df_plane5.createOrReplaceTempView("Teste")

spark.sql("select qa_manufacturer, Count(*) from Teste Group By qa_manufacturer").show()

+---------------+--------+
|qa_manufacturer|count(1)|
+---------------+--------+
|           null|    2007|
|              C|     622|
|              M|       1|
+---------------+--------+



#### Tabela Oficial

In [20]:
Manufacturer = ["AIRBUS","BOEING","BOMBARDIER","CESSNA","EMBRAER","SIKORSKY","CANADAIR","PIPER","MCDONNELL DOUGLAS","CIRRUS","BELL","KILDALL GARY","LAMBERT RICHARD","BARKER JACK","ROBINSON HELICOPTER","GULFSTREAM","MARZ BARRY"]

dfo_plane5 = (dfo_plane4.withColumn("qa_manufacturer",
            when((dfo_plane4.manufacturer.isNull()), "M")
            .when(~(dfo_plane4.manufacturer.isin(Manufacturer)), "C")
        .otherwise(lit(None)))
      )

dfo_plane5.filter(dfo_plane5.qa_manufacturer.isNotNull()).show()

+-------+----+--------------------+----------------+-----------+-------+-----+-----+---------+----------+-------+-------+---------------+
|tailnum|year|                type|    manufacturer|      model|engines|seats|speed|   engine|qa_tailnum|qa_year|qa_type|qa_manufacturer|
+-------+----+--------------------+----------------+-----------+-------+-----+-----+---------+----------+-------+-------+---------------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|   A320-214|      2|  182| null|Turbo-fan|      null|   null|   null|              C|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|   A320-214|      2|  182| null|Turbo-fan|      null|   null|   null|              C|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|   A320-214|      2|  182| null|Turbo-fan|      null|   null|   null|              C|
| N105UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|   A320-214|      2|  182| null|Turbo-fan|      null|   null|   null|              C|
| N107US|1999|Fixed wing multi ...

### Pergunta 5 - Planes

#### Tabela Teste

In [46]:
df_plane6 = (df_planes_teste.withColumn("qa_model",
                when((df_planes_teste.model.isNull()), "M")
                .when((df_planes_teste.manufacturer == "AIRBUS") & (substring(df_planes_teste.model,1,1) != "A") , "F")
                .when((df_planes_teste.manufacturer == "BOEING") & (substring(df_planes_teste.model,1,1) != "7") , "F")
                .when(((df_planes_teste.manufacturer == "BOMBARDIER") | (df_planes_teste.manufacturer == "CANADAIR")) & (substring(df_planes_teste.model,1,2) != "CL") , "F")
                .when((df_planes_teste.manufacturer == "MCDONNELL DOUGLAS") & ((substring(df_planes_teste.model,1,2) != "MD") & (substring(df_planes_teste.model,1,2) != "DC")) , "F")
            .otherwise(lit(None)))
      )

df_plane6.filter(df_plane6.qa_model.isNotNull()).show()

+-------+----+--------------------+------------+--------+-------+-----+-----+---------+--------+
|tailnum|year|                type|manufacturer|   model|engines|seats|speed|   engine|qa_model|
+-------+----+--------------------+------------+--------+-------+-----+-----+---------+--------+
| N923DN|null|Fixed wing multi ...|      BOEING|MD-90-30|      2|  142| null|Turbo-fan|       F|
| N924DN|null|Fixed wing multi ...|      BOEING|MD-90-30|      2|  142| null|Turbo-fan|       F|
| N925DN|null|Fixed wing multi ...|      BOEING|MD-90-30|      2|  142| null|Turbo-fan|       F|
| N926DH|null|Fixed wing multi ...|      BOEING|MD-90-30|      2|  142| null|Turbo-fan|       F|
| N927DN|1999|Fixed wing multi ...|      BOEING|MD-90-30|      2|  142| null|Turbo-fan|       F|
| N928DN|null|Fixed wing multi ...|      BOEING|MD-90-30|      2|  142| null|Turbo-fan|       F|
| N945DN|1998|Fixed wing multi ...|      BOEING|MD-90-30|      2|  142| null|Turbo-fan|       F|
| N952DN|1998|Fixed wing multi

#### Sumarizando Casos

In [47]:
df_plane6.createOrReplaceTempView("Teste")

spark.sql("select qa_model, Count(*) from Teste Group By qa_model").show()

+--------+--------+
|qa_model|count(1)|
+--------+--------+
|       F|      15|
|    null|    2614|
|       M|       1|
+--------+--------+



#### Tabela Oficial

In [21]:
dfo_plane6 = (dfo_plane5.withColumn("qa_model",
                when((dfo_plane5.model.isNull()), "M")
                .when((dfo_plane5.manufacturer == "AIRBUS") & (substring(dfo_plane5.model,1,1) != "A") , "F")
                .when((dfo_plane5.manufacturer == "BOEING") & (substring(dfo_plane5.model,1,1) != "7") , "F")
                .when(((dfo_plane5.manufacturer == "BOMBARDIER") | (dfo_plane5.manufacturer == "CANADAIR")) & (substring(dfo_plane5.model,1,2) != "CL") , "F")
                .when((dfo_plane5.manufacturer == "MCDONNELL DOUGLAS") & ((substring(dfo_plane5.model,1,2) != "MD") & (substring(dfo_plane5.model,1,2) != "DC")) , "F")
            .otherwise(lit(None)))
      )

dfo_plane6.filter(dfo_plane6.qa_model.isNotNull()).show()

+-------+----+--------------------+------------+--------+-------+-----+-----+---------+----------+-------+-------+---------------+--------+
|tailnum|year|                type|manufacturer|   model|engines|seats|speed|   engine|qa_tailnum|qa_year|qa_type|qa_manufacturer|qa_model|
+-------+----+--------------------+------------+--------+-------+-----+-----+---------+----------+-------+-------+---------------+--------+
| N923DN|null|Fixed wing multi ...|      BOEING|MD-90-30|      2|  142| null|Turbo-fan|      null|      M|   null|           null|       F|
| N924DN|null|Fixed wing multi ...|      BOEING|MD-90-30|      2|  142| null|Turbo-fan|      null|      M|   null|           null|       F|
| N925DN|null|Fixed wing multi ...|      BOEING|MD-90-30|      2|  142| null|Turbo-fan|      null|      M|   null|           null|       F|
| N926DH|null|Fixed wing multi ...|      BOEING|MD-90-30|      2|  142| null|Turbo-fan|      null|      M|   null|           null|       F|
| N927DN|1999|Fixed 

### Pergunta 6 - Planes

#### Tabela Teste

In [49]:
df_plane7 = (df_planes_teste.withColumn("qa_engines",
                when((df_planes_teste.engines.isNull()), "M")
                .when(~(df_planes_teste.engines.between(1, 4)), "I")
                .when(~(df_planes_teste.engines.rlike("([0-9])")), "A")
            .otherwise(lit(None)))
            )

df_plane7.filter(df_plane7.qa_engines.isNotNull()).show()

+--------+----+-----+------------+-----+-------+-----+-----+------+----------+
| tailnum|year| type|manufacturer|model|engines|seats|speed|engine|qa_engines|
+--------+----+-----+------------+-----+-------+-----+-----+------+----------+
|    null|null| null|        null| null|   null| null| null|  null|         M|
|T12345AZ|1949|Teste|       Teste|Teste|      5|    1|  151| Teste|         I|
+--------+----+-----+------------+-----+-------+-----+-----+------+----------+



#### Sumarizando Casos

In [50]:
df_plane7.createOrReplaceTempView("Teste")

spark.sql("select DISTINCT engines from Teste").show()

spark.sql("select qa_engines, Count(*) from Teste Group By qa_engines").show()

+-------+
|engines|
+-------+
|      1|
|      3|
|      2|
|      4|
|   null|
|      5|
+-------+

+----------+--------+
|qa_engines|count(1)|
+----------+--------+
|      null|    2628|
|         M|       1|
|         I|       1|
+----------+--------+



#### Tabela Oficial

In [22]:
dfo_plane7 = (dfo_plane6.withColumn("qa_engines",
                when((dfo_plane6.engines.isNull()), "M")
                .when(~(dfo_plane6.engines.between(1, 4)), "I")
                .when(~(dfo_plane6.engines.rlike("([0-9])")), "A")
            .otherwise(lit(None)))
            )

dfo_plane7.filter(dfo_plane7.qa_engines.isNotNull()).show()

+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+
|tailnum|year|type|manufacturer|model|engines|seats|speed|engine|qa_tailnum|qa_year|qa_type|qa_manufacturer|qa_model|qa_engines|
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+



### Pergunta 7 - Planes

#### Tabela Teste

In [52]:
df_plane8 = (df_planes_teste.withColumn("qa_seats",
                when((df_planes_teste.seats.isNull()), "M")
                .when(~(df_planes_teste.seats.between(2, 500)), "I")
                .when((df_planes_teste.seats.cast("int").isNull()), "A")
            .otherwise(lit(None)))
            )

df_plane8.filter(df_plane8.qa_seats.isNotNull()).show()

+--------+----+-----+------------+-----+-------+-----+-----+------+--------+
| tailnum|year| type|manufacturer|model|engines|seats|speed|engine|qa_seats|
+--------+----+-----+------------+-----+-------+-----+-----+------+--------+
|    null|null| null|        null| null|   null| null| null|  null|       M|
|T12345AZ|1949|Teste|       Teste|Teste|      5|    1|  151| Teste|       I|
+--------+----+-----+------------+-----+-------+-----+-----+------+--------+



#### Sumarizando Casos

In [53]:
df_plane8.createOrReplaceTempView("Teste")

spark.sql("select DISTINCT seats from Teste order by 1").show()

spark.sql("select qa_seats, Count(*) from Teste Group By qa_seats").show()

+-----+
|seats|
+-----+
| null|
|    1|
|    2|
|    4|
|    5|
|    6|
|    7|
|    8|
|   14|
|   20|
|   32|
|   55|
|   80|
|   95|
|  100|
|  128|
|  140|
|  142|
|  145|
|  147|
+-----+
only showing top 20 rows

+--------+--------+
|qa_seats|count(1)|
+--------+--------+
|    null|    2628|
|       M|       1|
|       I|       1|
+--------+--------+



#### Tabela Oficial

In [23]:
dfo_plane8 = (dfo_plane7.withColumn("qa_seats",
                when((dfo_plane7.seats.isNull()), "M")
                .when(~(dfo_plane7.seats.between(2, 500)), "I")
                .when((dfo_plane7.seats.cast("int").isNull()), "A")
            .otherwise(lit(None)))
            )

dfo_plane8.filter(dfo_plane8.qa_seats.isNotNull()).show()

+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+--------+
|tailnum|year|type|manufacturer|model|engines|seats|speed|engine|qa_tailnum|qa_year|qa_type|qa_manufacturer|qa_model|qa_engines|qa_seats|
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+--------+
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+--------+



### Pergunta 8 - Planes

#### Tabela Teste

In [55]:
df_plane9 = (df_planes_teste.withColumn("qa_speed",
                when((df_planes_teste.speed.isNull()), "M")
                .when(~(df_planes_teste.speed.between(50, 150)), "I")
                .when((df_planes_teste.speed.rlike("([0-9])")), "A")
            .otherwise(None))
            )

df_plane9.filter(df_plane9.qa_speed.isNotNull()).show()

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+--------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|qa_speed|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+--------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|       M|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|       M|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|       M|
| N105UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|       M|
| N107US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|       M|
| N108UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|       M|
| N109UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|

#### Sumarizando Casos

In [56]:
df_plane9.createOrReplaceTempView("Teste")

spark.sql("select speed, Count(*) from Teste Group by speed Order by 1 ").show() 

spark.sql("select qa_speed, Count(*) from Teste Group By qa_speed").show()

+-----+--------+
|speed|count(1)|
+-----+--------+
| null|    2623|
|   90|       2|
|  107|       1|
|  108|       1|
|  112|       1|
|  126|       1|
|  151|       1|
+-----+--------+

+--------+--------+
|qa_speed|count(1)|
+--------+--------+
|       M|    2623|
|       A|       6|
|       I|       1|
+--------+--------+



#### Tabela Oficial

In [24]:
dfo_plane9 = (dfo_plane8.withColumn("qa_speed",
                when((dfo_plane8.speed.isNull()), "M")
                .when(~(dfo_plane8.speed.between(50, 150)), "I")
                .when((dfo_plane8.speed.rlike("([0-9])")), "A")
            .otherwise(None))
            )

dfo_plane9.filter(dfo_plane9.qa_speed.isNotNull()).show()

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+----------+-------+-------+---------------+--------+----------+--------+--------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|qa_tailnum|qa_year|qa_type|qa_manufacturer|qa_model|qa_engines|qa_seats|qa_speed|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+----------+-------+-------+---------------+--------+----------+--------+--------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      null|   null|   null|              C|    null|      null|    null|       M|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      null|   null|   null|              C|    null|      null|    null|       M|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      null|   null|   null|              C| 

### Pergunta 9 - Planes

#### Tabela Teste

In [16]:
Engine = ["Turbo-fan", "Turbo-jet", "Turbo-prop", "Turbo-shaft", "4 Cycle"]

df_plane10 = (df_planes_teste.withColumn("qa_engine",
                when((df_planes_teste.engine.isNull()), "M")
                .when(~(df_planes_teste.engine.isin(Engine)), "C")
            .otherwise(None))
            )

df_plane10.filter(df_plane10.qa_engine.isNotNull()).show()

NameError: name 'df_planes_teste' is not defined

#### Sumarizando Casos

In [59]:
df_plane10.createOrReplaceTempView("Teste")

spark.sql("select engine, Count(*) from Teste Group by engine Order by 1 ").show() 

spark.sql("select qa_engine, Count(*) from Teste Group By qa_engine").show()

+-------------+--------+
|       engine|count(1)|
+-------------+--------+
|         null|       1|
|      4 Cycle|       1|
|Reciprocating|      10|
|        Teste|       1|
|    Turbo-fan|    2127|
|    Turbo-jet|     450|
|   Turbo-prop|      37|
|  Turbo-shaft|       3|
+-------------+--------+

+---------+--------+
|qa_engine|count(1)|
+---------+--------+
|     null|    2618|
|        C|      11|
|        M|       1|
+---------+--------+



#### Tabela Oficial

In [25]:
Engine = ["Turbo-fan", "Turbo-jet", "Turbo-prop", "Turbo-shaft", "4 Cycle"]

dfo_plane10 = (dfo_plane9.withColumn("qa_engine",
                when((dfo_plane9.engine.isNull()), "M")
                .when(~(dfo_plane9.engine.isin(Engine)), "C")
            .otherwise(None))
            )

dfo_plane10.filter(dfo_plane10.qa_engine.isNotNull()).show()

dfo_plane10.groupBy(col("qa_engine")).count().distinct().orderBy(col("qa_engine")).show(100)

+-------+----+--------------------+------------------+-------------+-------+-----+-----+-------------+----------+-------+-------+---------------+--------+----------+--------+--------+---------+
|tailnum|year|                type|      manufacturer|        model|engines|seats|speed|       engine|qa_tailnum|qa_year|qa_type|qa_manufacturer|qa_model|qa_engines|qa_seats|qa_speed|qa_engine|
+-------+----+--------------------+------------------+-------------+-------+-----+-----+-------------+----------+-------+-------+---------------+--------+----------+--------+--------+---------+
| N201AA|1959|Fixed wing single...|            CESSNA|          150|      1|    2|   90|Reciprocating|      null|   null|   null|           null|    null|      null|    null|       A|        C|
| N202AA|1980|Fixed wing multi ...|            CESSNA|         421C|      2|    8|   90|Reciprocating|      null|   null|   null|           null|    null|      null|    null|       A|        C|
| N425AA|1968|Fixed wing singl

#### FINALIZANDO DATASET PLANES TO PARQUET FILE

In [30]:
dfo_plane10.write.options(header='True', delimiter=',').csv("C:/Users/amarti40/OneDrive - Capgemini/Desktop/ACELERAÇÃO PYSPARK/Semana 2 - Desafio Qualidade/Outputs_planes.csv")
dfo_plane10.write.parquet("C:/Users/amarti40/OneDrive - Capgemini/Desktop/ACELERAÇÃO PYSPARK/Semana 2 - Desafio Qualidade/Outputs_planes.parquet")

## Dia 4 - Pergutas 1 [Flights] - 6 [Flights]

### Importando Dataset Airports

#### Meteodo Importação 1

In [63]:

df_flights_NoFormat =  (spark.read.options(header='True', delimiter=',')
                          .csv("C:/Users/amarti40/OneDrive - Capgemini/Desktop/ACELERAÇÃO PYSPARK/Semana 2 - Desafio Qualidade/Data/flights.csv"))


In [26]:
schema = (StructType()
              .add("year",IntegerType(),True)
              .add("month",IntegerType(),True)
              .add("day",IntegerType(),True)
              .add("hour",IntegerType(),True)
              .add("minute",IntegerType(),True)
              .add("dep_time",StringType(),True)
              .add("arr_time",StringType(),True)
              .add("dep_delay",IntegerType(),True)
              .add("arr_delay",IntegerType(),True)
              .add("carrier",StringType(),True) 
              .add("tailnum",StringType(),True) 
              .add("flight",StringType(),True) 
              .add("origin",StringType(),True)
              .add("dest",StringType(),True)
              .add("air_time",IntegerType(),True)
              .add("distance",IntegerType(),True)
         )

df_flights=spark.createDataFrame([],schema)
                 
df_flights = df_flights.union(df_flights_NoFormat[['year', 'month', 'day','hour', 'minute','dep_time','arr_time', 'dep_delay', 'arr_delay', 'carrier', 'tailnum', 'flight',
'origin','dest', 'air_time', 'distance']])

df_flights.show()
df_flights.printSchema()

NameError: name 'df_flights_NoFormat' is not defined

#### Meteodo Importação 2

In [27]:
schema = (StructType()
            .add("year",IntegerType(),True)
            .add("month",IntegerType(),True)
            .add("day",IntegerType(),True)
            .add("dep_time",StringType(),True)
            .add("dep_delay",IntegerType(),True)  
            .add("arr_time",StringType(),True)
            .add("arr_delay",IntegerType(),True)
            .add("carrier",StringType(),True) 
            .add("tailnum",StringType(),True) 
            .add("flight",StringType(),True) 
            .add("origin",StringType(),True)
            .add("dest",StringType(),True)
            .add("air_time",IntegerType(),True)
            .add("distance",IntegerType(),True)
            .add("hour",IntegerType(),True)
            .add("minute",IntegerType(),True)
         )
df_flights = (spark.read.format("csv") 
                  .option("header", True) 
                  .schema(schema) 
                  .load("C:/Users/amarti40/OneDrive - Capgemini/Desktop/ACELERAÇÃO PYSPARK/Data/flights.csv")
                 )
                 
df_flights=df_flights['year', 'month', 'day','hour', 'minute','dep_time','arr_time', 'dep_delay', 'arr_delay', 'carrier', 'tailnum', 'flight',
'origin','dest', 'air_time', 'distance']

df_flights.show()
df_flights.printSchema()

+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+
|year|month|day|hour|minute|dep_time|arr_time|dep_delay|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|
+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+
|2014|   12|  8|   6|    58|     658|     935|       -7|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|
|2014|    1| 22|  10|    40|    1040|    1505|        5|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|
|2014|    3|  9|  14|    43|    1443|    1652|       -2|        2|     VX| N847VA|   755|   SEA| SFO|     111|     679|
|2014|    4|  9|  17|     5|    1705|    1839|       45|       34|     WN| N360SW|   344|   PDX| SJC|      83|     569|
|2014|    3|  9|   7|    54|     754|    1015|       -1|        1|     AS| N612AS|   522|   SEA| BUR|     127|     937|
|2014|    1| 15|  10|    37|    1037|   

### Inserindo linhas com erros

In [83]:
Columns = ['year', 'month', 'day','hour', 'minute','dep_time','arr_time', 'dep_delay', 'arr_delay', 'carrier', 'tailnum', 'flight',
'origin','dest', 'air_time', 'distance']
Vals = ([(None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)
        ,(1949, 13, 32, 25, 60, 40, 5, 0, 0, "ASS", "N559ASS", 1, "SEAA", "HNLL", 19, 49)
        ,(1949, 0, 0, -1, -1, 2540, -505, 0, 0, "A", "I559", 85188, "SE", "HN", 501, 3001)])

newRow = spark.createDataFrame(Vals, Columns)

df_flights_Testes = df_flights.union(newRow)

df_flights_Testes.filter((df_flights_Testes.year == 1949) | (df_flights_Testes.year.isNull())).show(truncate=False) 

+----+-----+----+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+
|year|month|day |hour|minute|dep_time|arr_time|dep_delay|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|
+----+-----+----+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+
|null|null |null|null|null  |null    |null    |null     |null     |null   |null   |null  |null  |null|null    |null    |
|1949|13   |32  |25  |60    |40      |5       |0        |0        |ASS    |N559ASS|1     |SEAA  |HNLL|19      |49      |
|1949|0    |0   |-1  |-1    |2540    |-505    |0        |0        |A      |I559   |85188 |SE    |HN  |501     |3001    |
+----+-----+----+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+



### Pergunta 1 - Flights

#### Tabela Testes

In [156]:
df_flights_Testes = (df_flights_Testes.withColumn("qa_year_month_day",
                when((df_flights_Testes.year.isNull()),lit("MY"))
                .when((df_flights_Testes.year < 1950), lit("IY"))
                .otherwise(None))
            )

df_flights_Testes = (df_flights_Testes.withColumn("qa_year_month_day",
                when((df_flights_Testes.month.isNull()), concat_ws("_", df_flights_Testes.qa_year_month_day, lit("MM")))       
                .when(~(df_flights_Testes.month.between(1,12)), concat_ws("_", df_flights_Testes.qa_year_month_day, lit("IM")))                
                .otherwise(df_flights_Testes.qa_year_month_day))
            )

df_flights_Testes = (df_flights_Testes.withColumn("qa_year_month_day",
                when((df_flights_Testes.day.isNull()), concat_ws("_", df_flights_Testes.qa_year_month_day, lit("MD")))
                .when(((~(df_flights_Testes.day.between(1,31))&(df_flights_Testes.month != 2)))
                      |(((df_flights_Testes.day > 29) & (df_flights_Testes.month == 2)))
                      , concat_ws("_", df_flights_Testes.qa_year_month_day, lit("ID")))
                .otherwise(df_flights_Testes.qa_year_month_day))
            )


df_flights_Testes.filter(df_flights_Testes.qa_year_month_day.isNotNull()).show()


+----+-----+----+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+
|year|month| day|hour|minute|dep_time|arr_time|dep_delay|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|qa_year_month_day|
+----+-----+----+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+
|null| null|null|null|  null|    null|    null|     null|     null|   null|   null|  null|  null|null|    null|    null|         MY_MM_MD|
|1949|   13|  32|  25|    60|      40|       5|        0|        0|    ASS|N559ASS|     1|  SEAA|HNLL|      19|      49|         IY_IM_ID|
|1949|    0|   0|  -1|    -1|    2540|    -505|        0|        0|      A|   I559| 85188|    SE|  HN|     501|    3001|         IY_IM_ID|
+----+-----+----+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+



#### Sumarizando Casos

In [157]:
df_flights_Testes.createOrReplaceTempView("Teste")

spark.sql("select qa_year_month_day, Count(*) from Teste Group By qa_year_month_day  Order by 1").show()


+-----------------+--------+
|qa_year_month_day|count(1)|
+-----------------+--------+
|             null|   10000|
|         IY_IM_ID|       2|
|         MY_MM_MD|       1|
+-----------------+--------+



#### Tabela Oficial

In [28]:
dfo_flights = (df_flights.withColumn("qa_year_month_day",
                when((df_flights.year.isNull()),lit("MY"))
                .when((df_flights.year < 1950), lit("IY"))
                .otherwise(None))
            )

dfo_flights = (dfo_flights.withColumn("qa_year_month_day",
                when((dfo_flights.month.isNull()), concat_ws("_", dfo_flights.qa_year_month_day, lit("MM")))       
                .when(~(dfo_flights.month.between(1,12)), concat_ws("_", dfo_flights.qa_year_month_day, lit("IM")))                
                .otherwise(dfo_flights.qa_year_month_day))
            )

dfo_flights = (dfo_flights.withColumn("qa_year_month_day",
                when((dfo_flights.day.isNull()), concat_ws("_", dfo_flights.qa_year_month_day, lit("MD")))
                .when(((~(dfo_flights.day.between(1,31))&(dfo_flights.month != 2)))
                      |(((dfo_flights.day > 29) & (dfo_flights.month == 2)))
                      , concat_ws("_", dfo_flights.qa_year_month_day, lit("ID")))
                .otherwise(dfo_flights.qa_year_month_day))
            )


dfo_flights.filter(dfo_flights.qa_year_month_day.isNotNull()).show()


+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+
|year|month|day|hour|minute|dep_time|arr_time|dep_delay|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|qa_year_month_day|
+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+
+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+



### Pergunta 2 - Flights

#### Tabela Testes

In [179]:
df_flights_Testes = (df_flights_Testes.withColumn("qa_hour_minute",
                when((df_flights_Testes.hour.isNull()), lit("MH"))       
                .when(~(df_flights_Testes.hour.between(0,24)), lit("IH"))                
                .otherwise(None))
            )

df_flights_Testes = (df_flights_Testes.withColumn("qa_hour_minute",
                when((df_flights_Testes.minute.isNull()), concat_ws("_", df_flights_Testes.qa_hour_minute, lit("MM")))       
                .when(~(df_flights_Testes.minute.between(0,59)), concat_ws("_", df_flights_Testes.qa_hour_minute, lit("IM")))
                .otherwise(df_flights_Testes.qa_hour_minute))
            )


df_flights_Testes.filter(df_flights_Testes.qa_hour_minute.isNotNull()).show()

+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+
|year|month|day|hour|minute|dep_time|arr_time|dep_delay|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|qa_year_month_day|qa_hour_minute|
+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+
|2014|    3|  4|null|  null|      NA|      NA|     null|     null|     UA|     NA|   156|   SEA| DEN|    null|    1024|             null|         MH_MM|
|2014|    2| 12|null|  null|      NA|      NA|     null|     null|     AS| N527AS|     2|   SEA| DCA|    null|    2329|             null|         MH_MM|
|2014|    7|  1|null|  null|      NA|      NA|     null|     null|     WN| N8323C|  2485|   SEA| MDW|    null|    1733|             null|         MH_MM|
|2014|    4| 30|null|  null|      NA|      NA|     null|     null|     AS| N526AS|

#### Sumarizando Casos

In [172]:
df_flights_Testes.createOrReplaceTempView("Teste")

spark.sql("select qa_hour_minute, Count(*) from Teste Group By qa_hour_minute  Order by 1").show()

+--------------+--------+
|qa_hour_minute|count(1)|
+--------------+--------+
|          null|    9952|
|         MH_MM|      48|
+--------------+--------+



#### Tabela Oficial

In [29]:
dfo_flights = (dfo_flights.withColumn("qa_hour_minute",
                when((dfo_flights.hour.isNull()), lit("MH"))       
                .when(~(dfo_flights.hour.between(0,23)), lit("IH"))                
                .otherwise(None))
            )

dfo_flights = (dfo_flights.withColumn("qa_hour_minute",
                when((dfo_flights.minute.isNull()), concat_ws("_", dfo_flights.qa_hour_minute, lit("MM")))       
                .when(~(dfo_flights.minute.between(0,59)), concat_ws("_", dfo_flights.qa_hour_minute, lit("IM")))
                .otherwise(dfo_flights.qa_hour_minute))
            )


dfo_flights.filter(dfo_flights.qa_hour_minute.isNotNull()).show()

+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+
|year|month|day|hour|minute|dep_time|arr_time|dep_delay|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|qa_year_month_day|qa_hour_minute|
+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+
|2014|    3|  4|null|  null|      NA|      NA|     null|     null|     UA|     NA|   156|   SEA| DEN|    null|    1024|             null|         MH_MM|
|2014|    2| 12|null|  null|      NA|      NA|     null|     null|     AS| N527AS|     2|   SEA| DCA|    null|    2329|             null|         MH_MM|
|2014|    7|  1|null|  null|      NA|      NA|     null|     null|     WN| N8323C|  2485|   SEA| MDW|    null|    1733|             null|         MH_MM|
|2014|    4| 30|null|  null|      NA|      NA|     null|     null|     AS| N526AS|

### Pergunta 3 - Flights

#### Tabela Testes

In [211]:
df_flights_Testes = (df_flights_Testes.withColumn("qa_dep_arr_time",
                    when(((df_flights_Testes.dep_time.isNull()) | (df_flights_Testes.dep_time == 'NA')), lit("MD"))       
                    .when(~((df_flights_Testes.dep_time.isNull()) | (df_flights_Testes.dep_time == 'NA'))
                        & (~(length(df_flights_Testes.dep_time).between(3,4))) 
                        | ((~(substring(lpad(df_flights_Testes.dep_time, 4,'0'),1,2).between(0,23)))
                        | (~(substring(lpad(df_flights_Testes.dep_time, 4,'0'),3,2).between(0,59))))
                    , lit("FD"))
                    .otherwise(None))
            )

df_flights_Testes = (df_flights_Testes.withColumn("qa_dep_arr_time",
                    when(((df_flights_Testes.arr_time.isNull()) | (df_flights_Testes.arr_time == 'NA')), concat_ws("_", df_flights_Testes.qa_dep_arr_time, lit("MA")))       
                    .when(~((df_flights_Testes.arr_time.isNull()) | (df_flights_Testes.arr_time == 'NA'))
                        & (~(length(df_flights_Testes.arr_time).between(3,4))) 
                        | ((~(substring(lpad(df_flights_Testes.arr_time, 4,'0'),1,2).between(0,23)))
                        | (~(substring(lpad(df_flights_Testes.arr_time, 4,'0'),3,2).between(0,59))))
                    , concat_ws("_", df_flights_Testes.qa_dep_arr_time, lit("FA")))
                    .otherwise(df_flights_Testes.qa_dep_arr_time))
            )


df_flights_Testes.filter(df_flights_Testes.qa_dep_arr_time.isNotNull()).show()

+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+
|year|month|day|hour|minute|dep_time|arr_time|dep_delay|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|qa_year_month_day|qa_hour_minute|qa_dep_arr_time|
+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+
|2014|    6|  2|  22|    22|    2222|      55|        7|       15|     AS| N402AS|    99|   SEA| ANC|     190|    1448|             null|          null|             FA|
|2014|    7|  5|  22|    24|    2224|      48|       -1|      -20|     AS| N459AS|   143|   PDX| ANC|     185|    1542|             null|          null|             FA|
|2014|   12| 17|  22|    34|    2234|      11|      223|      212|     UA| N39450|  1596|   PDX| SFO|      76|     550|             null|          null|   

#### Sumarizando Casos

In [212]:
df_flights_Testes.createOrReplaceTempView("Teste")

spark.sql("select qa_dep_arr_time, Count(*) from Teste Group By qa_dep_arr_time  Order by 1").show()

+---------------+--------+
|qa_dep_arr_time|count(1)|
+---------------+--------+
|           null|    9704|
|             FA|     151|
|             FD|      90|
|          FD_FA|       2|
|             MA|       7|
|          MD_MA|      49|
+---------------+--------+



#### Tabela Oficial

In [30]:
dfo_flights = (dfo_flights.withColumn("qa_dep_arr_time",
                    when(((dfo_flights.dep_time.isNull()) | (dfo_flights.dep_time == 'NA')), lit("MD"))       
                    .when(~((dfo_flights.dep_time.isNull()) | (dfo_flights.dep_time == 'NA'))
                        & (~(length(dfo_flights.dep_time).between(3,4))) 
                        | ((~(substring(lpad(dfo_flights.dep_time, 4,'0'),1,2).between(0,23)))
                        | (~(substring(lpad(dfo_flights.dep_time, 4,'0'),3,2).between(0,59))))
                    , lit("FD"))
                    .otherwise(None))
            )

dfo_flights = (dfo_flights.withColumn("qa_dep_arr_time",
                    when(((dfo_flights.arr_time.isNull()) | (dfo_flights.arr_time == 'NA')), concat_ws("_", dfo_flights.qa_dep_arr_time, lit("MA")))       
                    .when(~((dfo_flights.arr_time.isNull()) | (dfo_flights.arr_time == 'NA'))
                        & (~(length(dfo_flights.arr_time).between(3,4))) 
                        | ((~(substring(lpad(dfo_flights.arr_time, 4,'0'),1,2).between(0,23)))
                        | (~(substring(lpad(dfo_flights.arr_time, 4,'0'),3,2).between(0,59))))
                    , concat_ws("_", dfo_flights.qa_dep_arr_time, lit("FA")))
                    .otherwise(dfo_flights.qa_dep_arr_time))
            )


dfo_flights.filter(dfo_flights.qa_dep_arr_time.isNotNull()).show()

+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+
|year|month|day|hour|minute|dep_time|arr_time|dep_delay|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|qa_year_month_day|qa_hour_minute|qa_dep_arr_time|
+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+
|2014|    6|  2|  22|    22|    2222|      55|        7|       15|     AS| N402AS|    99|   SEA| ANC|     190|    1448|             null|          null|             FA|
|2014|    7|  5|  22|    24|    2224|      48|       -1|      -20|     AS| N459AS|   143|   PDX| ANC|     185|    1542|             null|          null|             FA|
|2014|   12| 17|  22|    34|    2234|      11|      223|      212|     UA| N39450|  1596|   PDX| SFO|      76|     550|             null|          null|   

### Pergunta 4 - Flights

#### Tabela Testes

In [216]:
df_flights_Testes = (df_flights_Testes.withColumn("qa_dep_arr_delay",
                    when((df_flights_Testes.dep_delay.isNull()), lit("MD"))       
                    .otherwise(None))
            )

df_flights_Testes = (df_flights_Testes.withColumn("qa_dep_arr_delay",
                    when((df_flights_Testes.arr_delay.isNull()), concat_ws("_", df_flights_Testes.qa_dep_arr_delay, lit("MA")))       
                    .otherwise(df_flights_Testes.qa_dep_arr_delay))
            )


df_flights_Testes.filter(df_flights_Testes.qa_dep_arr_delay.isNotNull()).show()

+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+
|year|month|day|hour|minute|dep_time|arr_time|dep_delay|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|qa_year_month_day|qa_hour_minute|qa_dep_arr_time|qa_dep_arr_delay|
+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+
|2014|    4|  6|  13|    29|    1329|    2159|        4|     null|     DL| N130DL|  1929|   SEA| ATL|    null|    2182|             null|          null|           null|              MA|
|2014|    3|  4|null|  null|      NA|      NA|     null|     null|     UA|     NA|   156|   SEA| DEN|    null|    1024|             null|         MH_MM|          MD_MA|           MD_MA|
|2014|    2| 12|null|  null|      NA|      NA|     null|     null|    

#### Sumarizando Casos

In [217]:
df_flights_Testes.createOrReplaceTempView("Teste")

spark.sql("select qa_dep_arr_delay, Count(*) from Teste Group By qa_dep_arr_delay  Order by 1").show()

+----------------+--------+
|qa_dep_arr_delay|count(1)|
+----------------+--------+
|            null|    9927|
|              MA|      27|
|           MD_MA|      49|
+----------------+--------+



#### Tabela Oficial

In [31]:
dfo_flights = (dfo_flights.withColumn("qa_dep_arr_delay",
                    when((dfo_flights.dep_delay.isNull()), lit("MD"))       
                    .otherwise(None))
            )

dfo_flights = (dfo_flights.withColumn("qa_dep_arr_delay",
                    when((dfo_flights.arr_delay.isNull()), concat_ws("_", dfo_flights.qa_dep_arr_delay, lit("MA")))       
                    .otherwise(dfo_flights.qa_dep_arr_delay))
            )


dfo_flights.filter(dfo_flights.qa_dep_arr_delay.isNotNull()).show()

+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+
|year|month|day|hour|minute|dep_time|arr_time|dep_delay|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|qa_year_month_day|qa_hour_minute|qa_dep_arr_time|qa_dep_arr_delay|
+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+
|2014|    4|  6|  13|    29|    1329|    2159|        4|     null|     DL| N130DL|  1929|   SEA| ATL|    null|    2182|             null|          null|           null|              MA|
|2014|    3|  4|null|  null|      NA|      NA|     null|     null|     UA|     NA|   156|   SEA| DEN|    null|    1024|             null|         MH_MM|          MD_MA|           MD_MA|
|2014|    2| 12|null|  null|      NA|      NA|     null|     null|    

### Pergunta 5 - Flights

#### Tabela Testes

In [221]:
df_flights_Testes = (df_flights_Testes.withColumn("qa_carrier",
                    when((df_flights_Testes.carrier.isNull()), lit("M"))
                    .when(length(df_flights_Testes.carrier) != 2, lit("F"))                                                  
                    .otherwise(None))
            )


df_flights_Testes.filter(df_flights_Testes.qa_carrier.isNotNull()).show()

+----+-----+----+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+
|year|month| day|hour|minute|dep_time|arr_time|dep_delay|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|qa_year_month_day|qa_hour_minute|qa_dep_arr_time|qa_dep_arr_delay|qa_carrier|
+----+-----+----+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+
|null| null|null|null|  null|    null|    null|     null|     null|   null|   null|  null|  null|null|    null|    null|         MY_MM_MD|         MH_MM|          MD_MA|           MD_MA|         M|
|1949|   13|  32|  25|    60|      40|       5|        0|        0|    ASS|N559ASS|     1|  SEAA|HNLL|      19|      49|         IY_IM_ID|         IH_IM|          FD_FA|            null|         F|
|1949|    

#### Sumarizando Casos

In [222]:
df_flights_Testes.createOrReplaceTempView("Teste")

spark.sql("select qa_carrier, Count(*) from Teste Group By qa_carrier  Order by 1").show()

+----------+--------+
|qa_carrier|count(1)|
+----------+--------+
|      null|   10000|
|         F|       2|
|         M|       1|
+----------+--------+



#### Tabela Oficial

In [32]:
dfo_flights = (dfo_flights.withColumn("qa_carrier",
                    when((dfo_flights.carrier.isNull()), lit("M"))
                    .when(length(dfo_flights.carrier) != 2, lit("F"))                                                  
                    .otherwise(None))
            )


dfo_flights.filter(dfo_flights.qa_carrier.isNotNull()).show()

+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+
|year|month|day|hour|minute|dep_time|arr_time|dep_delay|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|qa_year_month_day|qa_hour_minute|qa_dep_arr_time|qa_dep_arr_delay|qa_carrier|
+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+
+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+



### Pergunta 6 - Flights

#### Tabela Testes

In [225]:
df_flights_Testes = (df_flights_Testes.withColumn("qa_tailnum",
            when((df_flights_Testes.tailnum.isNull()), lit("M"))
            .when(~(length(df_flights_Testes.tailnum).isin(5,6)), lit("S"))
            .when((df_flights_Testes.tailnum.rlike("^N([0-9]{1,4})([A-Z]{1,2}$)")) == False, lit("F"))
            .when((substring(df_flights_Testes.tailnum,1,1) != "N"), lit("FN"))
            .when((instr(df_flights_Testes.tailnum,"O")>0) | (instr(df_flights_Testes.tailnum,"I")>0) , lit("FE"))
            .when((substring(df_flights_Testes.tailnum,2,1)==0) , lit("FE"))
        .otherwise(lit(None)))
      )

df_flights_Testes.filter(df_flights_Testes.qa_tailnum.isNotNull()).show()

+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+----------+
|year|month|day|hour|minute|dep_time|arr_time|dep_delay|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|qa_year_month_day|qa_hour_minute|qa_dep_arr_time|qa_dep_arr_delay|qa_carrier|qa_tailnum|
+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+----------+
|2014|    1| 29|  20|     9|    2009|    2159|        3|        9|     UA| N27205|  1458|   PDX| SFO|      90|     550|             null|          null|           null|            null|      null|         F|
|2014|   10| 20|  13|    28|    1328|    1949|       -1|        4|     UA| N68805|  1212|   SEA| IAH|     228|    1874|             null|          null|           null|

#### Sumarizando Casos

In [226]:
df_flights_Testes.createOrReplaceTempView("Teste")

spark.sql("select qa_tailnum, Count(*) from Teste Group By qa_tailnum  Order by 1").show()

+----------+--------+
|qa_tailnum|count(1)|
+----------+--------+
|      null|    8997|
|         F|     989|
|         M|       1|
|         S|      16|
+----------+--------+



#### Tabela Oficial

In [33]:
dfo_flights = (dfo_flights.withColumn("qa_tailnum",
            when((dfo_flights.tailnum.isNull()), lit("M"))
            .when(~(length(dfo_flights.tailnum).isin(5,6)), lit("S"))
            .when((dfo_flights.tailnum.rlike("^N([0-9]{1,4})([A-Z]{1,2}$)")) == False, lit("F"))
            .when((substring(dfo_flights.tailnum,1,1) != "N"), lit("FN"))
            .when((instr(dfo_flights.tailnum,"O")>0) | (instr(dfo_flights.tailnum,"I")>0) , lit("FE"))
            .when((substring(dfo_flights.tailnum,2,1)==0) , lit("FE"))
        .otherwise(lit(None)))
      )

dfo_flights.groupBy(col("qa_tailnum")).count().distinct().orderBy(col("qa_tailnum")).show(100)

+----------+-----+
|qa_tailnum|count|
+----------+-----+
|      null| 8997|
|         F|  989|
|         S|   14|
+----------+-----+



### Pergunta 7 - Flights

#### Tabela Testes

In [234]:
df_flights_Testes = (df_flights_Testes.withColumn("qa_flight",
                    when((df_flights_Testes.flight.isNull()), lit("M"))
                    .when((length(df_flights_Testes.flight) != 4) | (df_flights_Testes.flight.cast("int").isNull() == True), lit("F"))                                                  
                    .otherwise(None))
            )


df_flights_Testes.filter(df_flights_Testes.qa_flight.isNotNull()).show()

+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+----------+---------+
|year|month|day|hour|minute|dep_time|arr_time|dep_delay|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|qa_year_month_day|qa_hour_minute|qa_dep_arr_time|qa_dep_arr_delay|qa_carrier|qa_tailnum|qa_flight|
+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+----------+---------+
|2014|    1| 22|  10|    40|    1040|    1505|        5|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|             null|          null|           null|            null|      null|      null|        F|
|2014|    3|  9|  14|    43|    1443|    1652|       -2|        2|     VX| N847VA|   755|   SEA| SFO|     111|     679|         

#### Sumarizando Casos

In [252]:
df_flights_Testes.createOrReplaceTempView("Teste")

spark.sql("select flight*1, Count(*) from Teste Group By flight  Order by flight*1").show()

+------------+--------+
|(flight * 1)|count(1)|
+------------+--------+
|        null|       1|
|         1.0|       1|
|         2.0|      21|
|         4.0|      26|
|         8.0|      26|
|        10.0|       2|
|        12.0|      22|
|        14.0|      19|
|        16.0|      17|
|        18.0|      17|
|        20.0|       5|
|        21.0|      17|
|        22.0|      13|
|        24.0|      19|
|        25.0|      35|
|        26.0|      18|
|        28.0|      21|
|        29.0|      21|
|        30.0|      22|
|        32.0|      27|
+------------+--------+
only showing top 20 rows



#### Tabela Oficial

In [34]:
dfo_flights = (dfo_flights.withColumn("qa_flight",
                    when((dfo_flights.flight.isNull()), lit("M"))
                    .when((length(dfo_flights.flight) != 4) | (dfo_flights.flight.cast("int").isNull() == True), lit("F"))                                                  
                    .otherwise(None))
            )


dfo_flights.filter(dfo_flights.qa_flight.isNotNull()).show()

+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+----------+---------+
|year|month|day|hour|minute|dep_time|arr_time|dep_delay|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|qa_year_month_day|qa_hour_minute|qa_dep_arr_time|qa_dep_arr_delay|qa_carrier|qa_tailnum|qa_flight|
+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+----------+---------+
|2014|    1| 22|  10|    40|    1040|    1505|        5|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|             null|          null|           null|            null|      null|      null|        F|
|2014|    3|  9|  14|    43|    1443|    1652|       -2|        2|     VX| N847VA|   755|   SEA| SFO|     111|     679|         

### Pergunta 8 - Flights

#### Tabela Testes

In [285]:
df_flights_Testes = (df_flights_Testes.withColumn("qa_origin_dest",
                        when((df_flights_Testes.origin.isNull()), lit("MO"))
                        .when((length(df_flights_Testes.origin) != 3)
                              | (df_flights_Testes.origin.rlike("([A-Z])") == False), lit("FO")) 
                        .otherwise(None))
                    )

df_flights_Testes = (df_flights_Testes.withColumn("qa_origin_dest",
                        when((df_flights_Testes.dest.isNull()) , concat_ws("_", df_flights_Testes.qa_origin_dest,lit("MD")))
                        .when((length(df_flights_Testes.dest) != 3)
                              | (df_flights_Testes.dest.rlike("([A-Z])") == False) , concat_ws("_", df_flights_Testes.qa_origin_dest,lit("FD"))) 
                        .otherwise(df_flights_Testes.qa_origin_dest))
                    )


df_flights_Testes.filter(df_flights_Testes.qa_origin_dest.isNotNull()).show()

+----+-----+----+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+----------+---------+--------------+
|year|month| day|hour|minute|dep_time|arr_time|dep_delay|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|qa_year_month_day|qa_hour_minute|qa_dep_arr_time|qa_dep_arr_delay|qa_carrier|qa_tailnum|qa_flight|qa_origin_dest|
+----+-----+----+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+----------+---------+--------------+
|null| null|null|null|  null|    null|    null|     null|     null|   null|   null|  null|  null|null|    null|    null|         MY_MM_MD|         MH_MM|          MD_MA|           MD_MA|         M|         M|        M|         MO_MD|
|1949|   13|  32|  25|    60|      40|       5|        0|       

#### Sumarizando Casos

In [286]:
df_flights_Testes.createOrReplaceTempView("Teste")

spark.sql("select qa_origin_dest, Count(*) from Teste Group By qa_origin_dest  Order by 1").show()

+--------------+--------+
|qa_origin_dest|count(1)|
+--------------+--------+
|          null|   10000|
|         FO_FD|       2|
|         MO_MD|       1|
+--------------+--------+



#### Tabela Oficial

In [35]:
dfo_flights = (dfo_flights.withColumn("qa_origin_dest",
                        when((dfo_flights.origin.isNull()), lit("MO"))
                        .when((length(dfo_flights.origin) != 3)
                              | (dfo_flights.origin.rlike("([A-Z])") == False), lit("FO")) 
                        .otherwise(None))
                    )

dfo_flights = (dfo_flights.withColumn("qa_origin_dest",
                        when((dfo_flights.dest.isNull()) , concat_ws("_", dfo_flights.qa_origin_dest,lit("MD")))
                        .when((length(dfo_flights.dest) != 3)
                              | (dfo_flights.dest.rlike("([A-Z])") == False) , concat_ws("_", dfo_flights.qa_origin_dest,lit("FD"))) 
                        .otherwise(dfo_flights.qa_origin_dest))
                    )

dfo_flights.filter(dfo_flights.qa_origin_dest.isNotNull()).show()

+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+----------+---------+--------------+
|year|month|day|hour|minute|dep_time|arr_time|dep_delay|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|qa_year_month_day|qa_hour_minute|qa_dep_arr_time|qa_dep_arr_delay|qa_carrier|qa_tailnum|qa_flight|qa_origin_dest|
+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+----------+---------+--------------+
+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+----------+---------+--------------+



### Pergunta 9 - Flights

#### Tabela Testes

In [304]:
df_flights_Testes = (df_flights_Testes.withColumn("qa_air_time",
                    when((df_flights_Testes.air_time.isNull()), lit("M"))
                    .when(~(df_flights_Testes.air_time.between(20,500)) , lit("I"))                                                  
                    .otherwise(None))
            )

df_flights_Testes.filter(df_flights_Testes.qa_air_time.isNotNull()).show()

#df_flights_Testes.groupBy("air_time").count().distinct().orderBy("air_time").show(1000)

+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+----------+---------+--------------+-----------+-----------+
|year|month|day|hour|minute|dep_time|arr_time|dep_delay|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|qa_year_month_day|qa_hour_minute|qa_dep_arr_time|qa_dep_arr_delay|qa_carrier|qa_tailnum|qa_flight|qa_origin_dest|qa_distance|qa_air_time|
+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+----------+---------+--------------+-----------+-----------+
|2014|    4|  6|  13|    29|    1329|    2159|        4|     null|     DL| N130DL|  1929|   SEA| ATL|    null|    2182|             null|          null|           null|              MA|      null|      null|     null|          nu

#### Sumarizando Casos

In [305]:
df_flights_Testes.groupBy("qa_air_time").count().distinct().show()

+-----------+-----+
|qa_air_time|count|
+-----------+-----+
|       null| 9925|
|          M|   76|
|          I|    2|
+-----------+-----+



#### Tabela Oficial

In [36]:
dfo_flights = (dfo_flights.withColumn("qa_air_time",
                    when((dfo_flights.air_time.isNull()), lit("M"))
                    .when(~(dfo_flights.air_time.between(20,500)) , lit("I"))                                                  
                    .otherwise(None))
            )

dfo_flights.filter(dfo_flights.qa_air_time.isNotNull()).show()

dfo_flights.groupBy("qa_air_time").count().distinct().show()


+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+----------+---------+--------------+-----------+
|year|month|day|hour|minute|dep_time|arr_time|dep_delay|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|qa_year_month_day|qa_hour_minute|qa_dep_arr_time|qa_dep_arr_delay|qa_carrier|qa_tailnum|qa_flight|qa_origin_dest|qa_air_time|
+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+----------+---------+--------------+-----------+
|2014|    4|  6|  13|    29|    1329|    2159|        4|     null|     DL| N130DL|  1929|   SEA| ATL|    null|    2182|             null|          null|           null|              MA|      null|      null|     null|          null|          M|
|2014|    3|  4|null

### Pergunta 10 - Flights

#### Tabela Testes

In [297]:
df_flights_Testes = (df_flights_Testes.withColumn("qa_distance",
                    when((df_flights_Testes.distance.isNull()), lit("M"))
                    .when(~(df_flights_Testes.distance.between(50,3000)) , lit("I"))                                                  
                    .otherwise(None))
            )

df_flights_Testes.filter(df_flights_Testes.qa_distance.isNotNull()).show()

# df_flights_Testes.groupBy("distance").count().distinct().orderBy("distance").show(1000)

+----+-----+----+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+----------+---------+--------------+-----------+
|year|month| day|hour|minute|dep_time|arr_time|dep_delay|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|qa_year_month_day|qa_hour_minute|qa_dep_arr_time|qa_dep_arr_delay|qa_carrier|qa_tailnum|qa_flight|qa_origin_dest|qa_distance|
+----+-----+----+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+----------+---------+--------------+-----------+
|null| null|null|null|  null|    null|    null|     null|     null|   null|   null|  null|  null|null|    null|    null|         MY_MM_MD|         MH_MM|          MD_MA|           MD_MA|         M|         M|        M|         MO_MD|          M|
|1949|   13|  32

#### Sumarizando Casos

In [299]:
df_flights_Testes.groupBy("qa_distance").count().distinct().show()

+-----------+-----+
|qa_distance|count|
+-----------+-----+
|       null|10000|
|          M|    1|
|          I|    2|
+-----------+-----+



#### Tabela Oficial

In [37]:
dfo_flights = (dfo_flights.withColumn("qa_distance",
                    when((dfo_flights.distance.isNull()), lit("M"))
                    .when(~(dfo_flights.distance.between(50,3000)) , lit("I"))                                                  
                    .otherwise(None))
            )

dfo_flights.filter(dfo_flights.qa_distance.isNotNull()).show()

+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+----------+---------+--------------+-----------+-----------+
|year|month|day|hour|minute|dep_time|arr_time|dep_delay|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|qa_year_month_day|qa_hour_minute|qa_dep_arr_time|qa_dep_arr_delay|qa_carrier|qa_tailnum|qa_flight|qa_origin_dest|qa_air_time|qa_distance|
+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+----------+---------+--------------+-----------+-----------+
+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+----------+---------+------------

### Pergunta 11 - Flights

#### Tabela Testes

In [316]:
df_flights_Testes = (df_flights_Testes.withColumn("qa_distance_airtime",
                    when((df_flights_Testes.distance.isNull()) 
                         | (df_flights_Testes.air_time.isNull()), lit("M"))
                    .when(df_flights_Testes.air_time >= (df_flights_Testes.distance*0.1+30) , lit("TL"))
                    .when(df_flights_Testes.air_time <= (df_flights_Testes.distance*0.1+10) , lit("TS"))                              
                    .otherwise(lit("TR")))
            )

df_flights_Testes.filter(df_flights_Testes.qa_distance_airtime.isNotNull()).show()

# df_flights_Testes.groupBy("distance").count().distinct().orderBy("distance").show(1000)

+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+----------+---------+--------------+-----------+-----------+-------------------+
|year|month|day|hour|minute|dep_time|arr_time|dep_delay|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|qa_year_month_day|qa_hour_minute|qa_dep_arr_time|qa_dep_arr_delay|qa_carrier|qa_tailnum|qa_flight|qa_origin_dest|qa_distance|qa_air_time|qa_distance_airtime|
+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+----------+---------+--------------+-----------+-----------+-------------------+
|2014|   12|  8|   6|    58|     658|     935|       -7|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|             null|          null|           null| 

#### Sumarizando Casos

In [317]:
df_flights_Testes.groupBy("qa_distance_airtime").count().distinct().orderBy("qa_distance_airtime").show()

+-------------------+-----+
|qa_distance_airtime|count|
+-------------------+-----+
|                  M|   76|
|                 TL| 5028|
|                 TR| 4832|
|                 TS|   67|
+-------------------+-----+



#### Tabela Oficial

In [38]:
dfo_flights = (dfo_flights.withColumn("qa_distance_airtime",
                    when((dfo_flights.distance.isNull()) 
                         | (dfo_flights.air_time.isNull()), lit("M"))
                    .when(dfo_flights.air_time >= (dfo_flights.distance*0.1+30) , lit("TL"))
                    .when(dfo_flights.air_time <= (dfo_flights.distance*0.1+10) , lit("TS"))                              
                    .otherwise(lit("TR")))
            )

dfo_flights.filter(dfo_flights.qa_distance_airtime.isNotNull()).show()
dfo_flights.groupBy("qa_distance_airtime").count().distinct().orderBy("qa_distance_airtime").show()

+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+----------+---------+--------------+-----------+-----------+-------------------+
|year|month|day|hour|minute|dep_time|arr_time|dep_delay|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|qa_year_month_day|qa_hour_minute|qa_dep_arr_time|qa_dep_arr_delay|qa_carrier|qa_tailnum|qa_flight|qa_origin_dest|qa_air_time|qa_distance|qa_distance_airtime|
+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+-----------------+--------------+---------------+----------------+----------+----------+---------+--------------+-----------+-----------+-------------------+
|2014|   12|  8|   6|    58|     658|     935|       -7|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|             null|          null|           null| 

#### FINALIZANDO DATASET FLIGHTS TO PARQUET FILE

In [39]:
dfo_flights.write.options(header='True', delimiter=',').csv("C:/Users/amarti40/OneDrive - Capgemini/Desktop/ACELERAÇÃO PYSPARK/Semana 2 - Desafio Qualidade/Outputs_Flights.csv")
dfo_flights.write.parquet("C:/Users/amarti40/OneDrive - Capgemini/Desktop/ACELERAÇÃO PYSPARK/Semana 2 - Desafio Qualidade/Outputs_Flights.parquet")

In [None]:
spark.stop()