### Qualidade
Antes de realizarmos transformações nos dados, é importante estabelecermos processos que apontem
erros de qualidade nos dados que estamos trabalhado, dessa forma é possível ter clareza das
inconsistências comuns e assim criar formas de melhorar a qualidade dos dados. Pensando nisso, a
primeira atividade planejada é criarmos uma coluna adicional reportando o tipo de inconsistência que
encontramos nos datasets.

In [1]:
# Installing required packages
!pip install pyspark
!pip install findspark



### Dicionário
* <b>faa (string):</b> Identificador do aeroporto determinado pela Federal Aviation Administration. Formato: 3-
5 caracteres alfanuméricos.<br>
* <b> name (string)</b>: Nome do aeroporto.<br>
* <b> lat (float):</b> Latitude do aeroporto. Intervalo de valores [−180, 180]. <br>
* <b> lon (float):</b> Longitude do aeroporto Intervalo de valores . [−180, 180] <br>
* <b> alt (int):</b> Altitude do aeroporto. Unidade de medida em pés. Intervalo de valores [0,+∞) . <br>
* <b> tz (float):</b> Fuso horário baseado no deslocamento de horas a partir de UTC/GMT. Intervalo de valores [−11,+14] 
. Pode ser fuso fracionário [1] <br>
* <b> dst (category):</b> Horário de verão. Descrição dos possíveis valores [2]: <br>
*  E (Europe)
*  A (US/Canada)
*  S (South America) 
*  O (Australia) 
*  Z (New Zealand)
*  N (None) 
*  U (Unknown) 
*  faa

In [2]:
# Locate spark
import findspark
findspark.init()

In [3]:
# Importando pacotes
import re
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F  # Importando todas as funções 
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

In [4]:
# Expressoes regulares comuns
# Boas práticas (variáveis constantes em maiusculo)
REGEX_ALPHA    = r'[a-zA-Z]+'
REGEX_INTEGER  = r'[0-9]+'
REGEX_FLOAT    = r'[0-9]+\.[0-9]+'
REGEX_ALPHANUM = r'[0-9a-zA-Z]+'
REGEX_EMPTY_STR= r'[\t ]+$'
REGEX_SPECIAL  = r'[!@#$%&*\(\)_]+'
REGEX_NNUMBER  = r'^N[1-9][0-9]{2,3}([ABCDEFGHJKLMNPRSTUVXWYZ]{1,2})'
REGEX_NNUMBER_INVALID = r'(N0.*$)|(.*[IO].*)'
REGEX_TIME_FMT = r'^(([0-1]?[0-9])|(2[0-3]))([0-5][0-9])$'

In [5]:
# Funcoes auxiliares 
def split_csv(line):
    return tuple(map(lambda x: x.replace('"',''), line.split(",")))

def check_empty_column(col):
    return (F.col(col).isNull() | (F.col(col) =='') | F.col(col).rlike(REGEX_EMPTY_STR))

In [6]:
# Criar o contexto do spark
sc = SparkContext()

# Instancia o criador de sessao do spark
spark = (SparkSession.builder
                     .master("local[7]")
                     .appName("Aceleração PySpark - Capgemini"))

In [27]:
schema_airports = StructType([
    StructField("faa",  StringType(),  True),
    StructField("name", StringType(),  True),
    StructField("lat",  FloatType(),   True),
    StructField("lon",  FloatType(),   True),
    StructField("alt",  IntegerType(), True),
    StructField("tz",   IntegerType(), True),
    StructField("dst",  StringType(),  True)
])

schema_planes = StructType([
    StructField("tailnum",      StringType(),  True),
    StructField("year",         IntegerType(), True),
    StructField("type",         StringType(),  True),
    StructField("manufacturer", StringType(),  True),
    StructField("model",        StringType(),  True),
    StructField("engines",      IntegerType(), True),
    StructField("seats",        IntegerType(), True),
    StructField("speed",        IntegerType(), True),
    StructField("engine",       StringType(),  True)
])

schema_flights = StructType([
    StructField("year",      IntegerType(), True),
    StructField("month",     IntegerType(), True),
    StructField("day",       IntegerType(), True),
    StructField("dep_time",  StringType(),  True),
    StructField("dep_delay", IntegerType(), True),
    StructField("arr_time",  StringType(),  True),
    StructField("arr_delay", IntegerType(), True),
    StructField("carrier",   StringType(),  True),
    StructField("tailnum",   StringType(),  True),
    StructField("flight",    StringType(),  True),
    StructField("origin",    StringType(),  True),
    StructField("dest",      StringType(),  True),
    StructField("air_time",  IntegerType(), True),
    StructField("distance",  IntegerType(), True),
    StructField("hour",      IntegerType(), True),
    StructField("minute",    IntegerType(), True),
])

In [31]:
# Lendo Datasets 
df_airports = (spark.getOrCreate().read
                   .format("csv")
                   .option("header","True")
                   .schema(schema_airports)
                   .load("../../pyspark-capgemini/data/airports.csv"))


df_planes = (spark.getOrCreate().read
                   .format("csv")
                   .option("header","True")
                   .schema(schema_planes)
                   .load("../../pyspark-capgemini/data/planes.csv"))


df_flights = (spark.getOrCreate().read
                   .format("csv")
                   .option("header","True")
                   .schema(schema_flights)
                   .load("../../pyspark-capgemini/data/flights.csv"))

In [17]:
df_airports.printSchema()

root
 |-- faa: string (nullable = true)
 |-- name: string (nullable = true)
 |-- lat: float (nullable = true)
 |-- lon: float (nullable = true)
 |-- alt: integer (nullable = true)
 |-- tz: integer (nullable = true)
 |-- dst: string (nullable = true)



In [18]:
# Criacao das visões temporarias
df_airports.createOrReplaceTempView('airports')

In [21]:
# Leitura dos dados como RDD
rdd_airports = sc.textFile("../../pyspark-capgemini/data/airports.csv")

In [22]:
# armazena a primeira linha (colunas) do arquivo como referencia.
header_airports = rdd_airports.first()

# Remove a primeira linha (colunas) do rdd
rdd_airports = rdd_airports.filter(lambda line: line != header_airports).map(split_csv)

### Criando função para adicionar linhas e testar o código. 

In [23]:
def add_test_rows_for_airports(data,_format): #convenção para usar _ para diferenciar de nomes reservados
        # Adiciona linhas para testar regras de negócio
        values = [
        #faa    name  lat       lon               alt       tz    dst
        (None , None, None    , None            , None    , None, None),
        (''   , ''  , ''      , ''              , ''      , ''  , ''  ),
        ('   ', None, '12O.12', '-80.Aa6195833' , '1044Aa', '-2x', '34'),
        ('AAA', None, '12O.12', '-80.Aa6195833' , '1044Aa', '-2x', '34'),
        ('222', None, None    , '-80.Aa6195833' , '-100'  , '-14', 'K'),
        ('__!', None, None    , '-80.Aa6195833' , '-100'  , '-14', 'K')
    ]
        if _format =="df":
            return spark.getOrCreate().createDataFrame(values,data.columns).union(data)
        elif _format == "rdd":
            return sc.parallelize(tuple(values)).union(data)

In [24]:
# Adicionando linhas ao Dataframe
df_airports  = add_test_rows_for_airports(df_airports,  "df")
rdd_airports = add_test_rows_for_airports(rdd_airports, "rdd")

In [29]:
rdd_airports.take(10)

[(None, None, None, None, None, None, None),
 ('', '', '', '', '', '', ''),
 ('   ', None, '12O.12', '-80.Aa6195833', '1044Aa', '-2x', '34'),
 ('AAA', None, '12O.12', '-80.Aa6195833', '1044Aa', '-2x', '34'),
 ('222', None, None, '-80.Aa6195833', '-100', '-14', 'K'),
 ('__!', None, None, '-80.Aa6195833', '-100', '-14', 'K'),
 ('04G', 'Lansdowne Airport', '41.1304722', '-80.6195833', '1044', '-5', 'A'),
 ('06A',
  'Moton Field Municipal Airport',
  '32.4605722',
  '-85.6800278',
  '264',
  '-5',
  'A'),
 ('06C', 'Schaumburg Regional', '41.9893408', '-88.1012428', '801', '-6', 'A'),
 ('06N', 'Randall Airport', '41.431912', '-74.3915611', '523', '-5', 'A')]

# Dataset Airports

### <b>Perguntas
Considere o dataset airports.csv para realizar as seguintes tarefas:</b>
### Pergunta 1<br>


In [10]:
# Usando dataframe
df_airports = df_airports.withColumn('qa_faa',(
        F.when(check_empty_column('faa'),'M')
         .when(
             (~F.length(F.col('faa')).between(3,5)  |
             (~F.col('faa').rlike(REGEX_ALPHANUM))),"F")))


df_airports.show(10)

+---+--------------------+---------+-----------+----+---+---+------+
|faa|                name|      lat|        lon| alt| tz|dst|qa_faa|
+---+--------------------+---------+-----------+----+---+---+------+
|04G|   Lansdowne Airport|41.130474|  -80.61958|1044| -5|  A|  null|
|06A|Moton Field Munic...| 32.46057|  -85.68003| 264| -5|  A|  null|
|06C| Schaumburg Regional| 41.98934|  -88.10124| 801| -6|  A|  null|
|06N|     Randall Airport| 41.43191|  -74.39156| 523| -5|  A|  null|
|09J|Jekyll Island Air...|31.074472|  -81.42778|  11| -4|  A|  null|
|0A9|Elizabethton Muni...|36.371223| -82.173416|1593| -4|  A|  null|
|0G6|Williams County A...|41.467304| -84.506775| 730| -5|  A|  null|
|0G7|Finger Lakes Regi...|42.883564| -76.781235| 492| -5|  A|  null|
|0P2|Shoestring Aviati...|39.794823| -76.647194|1000| -5|  U|  null|
|0S9|Jefferson County ...| 48.05381|-122.810646| 108| -8|  A|  null|
+---+--------------------+---------+-----------+----+---+---+------+
only showing top 10 rows



In [26]:
# Usando SQL
spark.getOrCreate().sql(f"""
SELECT *, CASE 
            WHEN faa RLIKE '{REGEX_EMPTY_STR}' OR faa IS NULL OR faa LIKE ''           THEN 'M'
            WHEN NOT (faa RLIKE'{REGEX_ALPHANUM}') OR NOT (LENGTH(faa) BETWEEN 3 AND 5) THEN 'F'
          END AS qa_faa
FROM airports
""").show(5)

+---+--------------------+---------+---------+----+---+---+------+
|faa|                name|      lat|      lon| alt| tz|dst|qa_faa|
+---+--------------------+---------+---------+----+---+---+------+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|  null|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|  null|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|  null|
|06N|     Randall Airport| 41.43191|-74.39156| 523| -5|  A|  null|
|09J|Jekyll Island Air...|31.074472|-81.42778|  11| -4|  A|  null|
+---+--------------------+---------+---------+----+---+---+------+
only showing top 5 rows



In [14]:
# Usando RDD

# Usando rdd
def qa_faa(row):
    faa, value = row[0], None

    if (faa is None) or (faa == '') or re.findall(REGEX_EMPTY_STR, faa):
        value = "M"
    elif (not (3 <= len(faa) <= 5)) or (not faa.isalnum()):
        value = "F"

    return row + (value,)

rdd_airports.map(qa_faa).take(5)

[(None, None, None, None, None, None, None, 'M'),
 ('', '', '', '', '', '', '', 'M'),
 ('   ', None, '12O.12', '-80.Aa6195833', '1044Aa', '-2x', '34', 'M'),
 ('AAA', None, '12O.12', '-80.Aa6195833', '1044Aa', '-2x', '34', None),
 ('222', None, None, '-80.Aa6195833', '-100', '-14', 'K', None)]

### Pergunta 2 


In [11]:
# Usando dataframe
df_airports = df_airports.withColumn('qa_name', F.when(check_empty_column('name'), 'M'))
df_airports.show(5)

+---+--------------------+---------+---------+----+---+---+------+-------+
|faa|                name|      lat|      lon| alt| tz|dst|qa_faa|qa_name|
+---+--------------------+---------+---------+----+---+---+------+-------+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|  null|   null|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|  null|   null|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|  null|   null|
|06N|     Randall Airport| 41.43191|-74.39156| 523| -5|  A|  null|   null|
|09J|Jekyll Island Air...|31.074472|-81.42778|  11| -4|  A|  null|   null|
+---+--------------------+---------+---------+----+---+---+------+-------+
only showing top 5 rows



In [63]:
# Usando SQL
spark.getOrCreate().sql(f"""
SELECT *, CASE WHEN name IS NULL OR faa RLIKE '{REGEX_EMPTY_STR}' OR faa LIKE '' THEN 'M' END AS qa_name
FROM airports
""").show(5)

+---+--------------------+---------+---------+----+---+---+-------+
|faa|                name|      lat|      lon| alt| tz|dst|qa_name|
+---+--------------------+---------+---------+----+---+---+-------+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|   null|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|   null|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|   null|
|06N|     Randall Airport| 41.43191|-74.39156| 523| -5|  A|   null|
|09J|Jekyll Island Air...|31.074472|-81.42778|  11| -4|  A|   null|
+---+--------------------+---------+---------+----+---+---+-------+
only showing top 5 rows



In [64]:
# usando rdd 
def qa_name(row):
    name, value = row[1],None
    
    if (name is None) or re.findall(REGEX_EMPTY_STR,name):
        value = "M"
        
    return row +(value,)

rdd_airports.map(qa_name).take(5)

[(None, None, None, None, None, None, None, 'M'),
 ('', '', '', '', '', '', '', None),
 ('   ', None, '12O.12', '-80.Aa6195833', '1044Aa', '-2x', '34', 'M'),
 ('AAA', None, '12O.12', '-80.Aa6195833', '1044Aa', '-2x', '34', 'M'),
 ('222', None, None, '-80.Aa6195833', '-100', '-14', 'K', 'M')]

### Pergunta 3 

In [12]:
# Usando Dataframe
df_airports = df_airports.withColumn('qa_lat',(
      F.when(check_empty_column('lat'), 'M')
       .when(F.col('lat').rlike(REGEX_ALPHA),'A')
       .when(F.col('lat').between(-180,180),'I')))

df_airports.show(5)

+---+--------------------+---------+---------+----+---+---+------+-------+------+
|faa|                name|      lat|      lon| alt| tz|dst|qa_faa|qa_name|qa_lat|
+---+--------------------+---------+---------+----+---+---+------+-------+------+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|  null|   null|     I|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|  null|   null|     I|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|  null|   null|     I|
|06N|     Randall Airport| 41.43191|-74.39156| 523| -5|  A|  null|   null|     I|
|09J|Jekyll Island Air...|31.074472|-81.42778|  11| -4|  A|  null|   null|     I|
+---+--------------------+---------+---------+----+---+---+------+-------+------+
only showing top 5 rows



In [67]:
# Usando SQL
spark.getOrCreate().sql(f"""
SELECT *, CASE
            WHEN lat RLIKE '{REGEX_EMPTY_STR}' OR lat IS NULL OR lat LIKE '' THEN 'M'
            WHEN lat RLIKE '{REGEX_ALPHA}'  THEN 'A'
            WHEN lat <= -180 OR lat >= 180  THEN 'I'
          END AS qa_lat
FROM airports
""").show(5)

+---+--------------------+---------+---------+----+---+---+------+
|faa|                name|      lat|      lon| alt| tz|dst|qa_lat|
+---+--------------------+---------+---------+----+---+---+------+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|  null|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|  null|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|  null|
|06N|     Randall Airport| 41.43191|-74.39156| 523| -5|  A|  null|
|09J|Jekyll Island Air...|31.074472|-81.42778|  11| -4|  A|  null|
+---+--------------------+---------+---------+----+---+---+------+
only showing top 5 rows



In [70]:
# Usando rdd  - revisar 
def qa_lat(row):
    lat, value = row[2], None

    if not lat or re.findall(REGEX_EMPTY_STR, lat):
        value = "M"
    elif re.findall(REGEX_ALPHA, lat): # prioritade: alfanum > casting
        value = "A"
    elif not (-180 <= float(lat) <= 180):
        value = "I"

    return row + (value,)

rdd_airports.map(qa_lat).take(5)

[(None, None, None, None, None, None, None, 'M'),
 ('', '', '', '', '', '', '', 'M'),
 ('   ', None, '12O.12', '-80.Aa6195833', '1044Aa', '-2x', '34', 'A'),
 ('AAA', None, '12O.12', '-80.Aa6195833', '1044Aa', '-2x', '34', 'A'),
 ('222', None, None, '-80.Aa6195833', '-100', '-14', 'K', 'M')]

### Pergunta 4 


In [13]:
# Usando dataframe
df_airports = df_airports.withColumn("qa_lon", (
    F.when(check_empty_column("lon"), "M")
     .when(~F.col('lon').rlike(REGEX_FLOAT),"A")
     .when(~F.col('lon').between(-180,180),"I")))

df_airports.show(5)

+---+--------------------+---------+---------+----+---+---+------+-------+------+------+
|faa|                name|      lat|      lon| alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|
+---+--------------------+---------+---------+----+---+---+------+-------+------+------+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|  null|   null|     I|  null|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|  null|   null|     I|  null|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|  null|   null|     I|  null|
|06N|     Randall Airport| 41.43191|-74.39156| 523| -5|  A|  null|   null|     I|  null|
|09J|Jekyll Island Air...|31.074472|-81.42778|  11| -4|  A|  null|   null|     I|  null|
+---+--------------------+---------+---------+----+---+---+------+-------+------+------+
only showing top 5 rows



In [75]:
# Usando SQL
spark.getOrCreate().sql(f"""
SELECT *, CASE
            WHEN lon RLIKE '{REGEX_EMPTY_STR}' OR lon IS NULL OR lon LIKE '' THEN 'M'
            WHEN lon RLIKE '{REGEX_ALPHA}'  THEN 'A'
            WHEN lon <= -180 OR lon >= 180  THEN 'I'
          END AS qa_lat
FROM airports
""").show(5)

+---+--------------------+---------+---------+----+---+---+------+
|faa|                name|      lat|      lon| alt| tz|dst|qa_lat|
+---+--------------------+---------+---------+----+---+---+------+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|  null|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|  null|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|  null|
|06N|     Randall Airport| 41.43191|-74.39156| 523| -5|  A|  null|
|09J|Jekyll Island Air...|31.074472|-81.42778|  11| -4|  A|  null|
+---+--------------------+---------+---------+----+---+---+------+
only showing top 5 rows



In [77]:
# Usando rdd
def qa_lon(row):
    lon,value = row[3],None
    
    if not lon or re.findall(REGEX_EMPTY_STR,lon):
        value = "M"
        
    elif re.findall(REGEX_ALPHANUM,lon):
        value="A"
    elif  not (-180 <= float(lon) <= 180):
        value ="I"
    return row + (value,)

rdd_airports.map(qa_lon).take(5)

[(None, None, None, None, None, None, None, 'M'),
 ('', '', '', '', '', '', '', 'M'),
 ('   ', None, '12O.12', '-80.Aa6195833', '1044Aa', '-2x', '34', 'A'),
 ('AAA', None, '12O.12', '-80.Aa6195833', '1044Aa', '-2x', '34', 'A'),
 ('222', None, None, '-80.Aa6195833', '-100', '-14', 'K', 'A')]

###  Pergunta 5 

In [14]:
# Usando dataframe
df_airports = df_airports.withColumn('qa_alt', (
            F.when(check_empty_column('alt'), 'M')
             .when(~F.col('alt').rlike(REGEX_INTEGER), 'A')             
             .when(F.col('alt') < 0, 'I')))

df_airports.show(5)

+---+--------------------+---------+---------+----+---+---+------+-------+------+------+------+
|faa|                name|      lat|      lon| alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|
+---+--------------------+---------+---------+----+---+---+------+-------+------+------+------+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|  null|   null|     I|  null|  null|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|  null|   null|     I|  null|  null|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|  null|   null|     I|  null|  null|
|06N|     Randall Airport| 41.43191|-74.39156| 523| -5|  A|  null|   null|     I|  null|  null|
|09J|Jekyll Island Air...|31.074472|-81.42778|  11| -4|  A|  null|   null|     I|  null|  null|
+---+--------------------+---------+---------+----+---+---+------+-------+------+------+------+
only showing top 5 rows



In [79]:
# Usando SQL
spark.getOrCreate().sql(f"""
SELECT *, CASE
            WHEN alt RLIKE '{REGEX_EMPTY_STR}' OR alt IS NULL OR alt LIKE '' THEN 'M'
            WHEN alt RLIKE '{REGEX_ALPHANUM}' THEN 'A'
            WHEN alt < 0 THEN 'I'
          END AS qa_alt
FROM airports
""").show(5)

+---+--------------------+---------+---------+----+---+---+------+
|faa|                name|      lat|      lon| alt| tz|dst|qa_alt|
+---+--------------------+---------+---------+----+---+---+------+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|     A|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|     A|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|     A|
|06N|     Randall Airport| 41.43191|-74.39156| 523| -5|  A|     A|
|09J|Jekyll Island Air...|31.074472|-81.42778|  11| -4|  A|     A|
+---+--------------------+---------+---------+----+---+---+------+
only showing top 5 rows



In [80]:
# Usando rdd
def qa_alt(row):
    alt, value = row[4], None

    if not alt or re.findall(REGEX_EMPTY_STR, alt):
        value = "M"
    elif re.findall(REGEX_ALPHANUM, alt):
        value = "A"
    elif int(alt) < 0:
        value = "I"

    return row + (value,)

rdd_airports.map(qa_alt).take(5)

[(None, None, None, None, None, None, None, 'M'),
 ('', '', '', '', '', '', '', 'M'),
 ('   ', None, '12O.12', '-80.Aa6195833', '1044Aa', '-2x', '34', 'A'),
 ('AAA', None, '12O.12', '-80.Aa6195833', '1044Aa', '-2x', '34', 'A'),
 ('222', None, None, '-80.Aa6195833', '-100', '-14', 'K', 'A')]

### Pergunta 6.<br>

In [15]:
# Usando dataframe
df_airports = df_airports.withColumn('qa_tz', (
            F.when(check_empty_column('tz'), 'M')
             .when(~F.col('tz').rlike(REGEX_INTEGER + '$'), 'A')         
             .when(~F.col('tz').between(-11, 14), 'I')))

df_airports.show(5)

+---+--------------------+---------+---------+----+---+---+------+-------+------+------+------+-----+
|faa|                name|      lat|      lon| alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|qa_tz|
+---+--------------------+---------+---------+----+---+---+------+-------+------+------+------+-----+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|  null|   null|     I|  null|  null| null|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|  null|   null|     I|  null|  null| null|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|  null|   null|     I|  null|  null| null|
|06N|     Randall Airport| 41.43191|-74.39156| 523| -5|  A|  null|   null|     I|  null|  null| null|
|09J|Jekyll Island Air...|31.074472|-81.42778|  11| -4|  A|  null|   null|     I|  null|  null| null|
+---+--------------------+---------+---------+----+---+---+------+-------+------+------+------+-----+
only showing top 5 rows



In [82]:
# Usando SQL
spark.getOrCreate().sql(f"""
SELECT *, CASE
            WHEN tz RLIKE '{REGEX_EMPTY_STR}' OR tz IS NULL OR tz LIKE '' THEN 'M'
            WHEN tz RLIKE '{REGEX_INTEGER}' THEN 'A'
            WHEN tz < -11 OR tz > 14 THEN 'I'
          END AS qa_tz
FROM airports
""").show(5)

+---+--------------------+---------+---------+----+---+---+-----+
|faa|                name|      lat|      lon| alt| tz|dst|qa_tz|
+---+--------------------+---------+---------+----+---+---+-----+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|    A|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|    A|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|    A|
|06N|     Randall Airport| 41.43191|-74.39156| 523| -5|  A|    A|
|09J|Jekyll Island Air...|31.074472|-81.42778|  11| -4|  A|    A|
+---+--------------------+---------+---------+----+---+---+-----+
only showing top 5 rows



In [83]:
# Usando rdd
def qa_tz(row):
    tz, value = row[5], None

    if not tz or re.findall(REGEX_EMPTY_STR, tz):
        value = "M"
    elif not re.findall(REGEX_INTEGER + '$', tz):
        value = "A"
    elif -11 <= int(tz) <= 14:
        value = "I"

    return row + (value,)

rdd_airports.map(qa_tz).take(5)

[(None, None, None, None, None, None, None, 'M'),
 ('', '', '', '', '', '', '', 'M'),
 ('   ', None, '12O.12', '-80.Aa6195833', '1044Aa', '-2x', '34', 'A'),
 ('AAA', None, '12O.12', '-80.Aa6195833', '1044Aa', '-2x', '34', 'A'),
 ('222', None, None, '-80.Aa6195833', '-100', '-14', 'K', None)]

### Pergunta 7<br>


In [16]:
DST_CATEGORIES = ['E', 'A', 'S', 'O', 'Z', 'N', 'U']

In [17]:
# Usando dataframe
df_airports = df_airports.withColumn('qa_dst', (
            F.when(check_empty_column('dst'), 'M')
             .when(F.col('dst').rlike(REGEX_INTEGER), 'N')
             .when(~F.col('dst').isin(DST_CATEGORIES), 'C')))

df_airports.show(5)

+---+--------------------+---------+---------+----+---+---+------+-------+------+------+------+-----+------+
|faa|                name|      lat|      lon| alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|qa_tz|qa_dst|
+---+--------------------+---------+---------+----+---+---+------+-------+------+------+------+-----+------+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|  null|   null|     I|  null|  null| null|  null|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|  null|   null|     I|  null|  null| null|  null|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|  null|   null|     I|  null|  null| null|  null|
|06N|     Randall Airport| 41.43191|-74.39156| 523| -5|  A|  null|   null|     I|  null|  null| null|  null|
|09J|Jekyll Island Air...|31.074472|-81.42778|  11| -4|  A|  null|   null|     I|  null|  null| null|  null|
+---+--------------------+---------+---------+----+---+---+------+-------+------+------+------+-----+------+
only showing top 5 

In [86]:
# Usando SQL
spark.getOrCreate().sql(f"""
SELECT *, CASE
            WHEN dst RLIKE '{REGEX_EMPTY_STR}' OR dst IS NULL OR dst LIKE '' THEN 'M'
            WHEN dst RLIKE '{REGEX_INTEGER}' THEN 'N'
            WHEN dst NOT IN {tuple(DST_CATEGORIES)} THEN 'C'
          END AS qa_dst
FROM airports
""").show(5)

+---+--------------------+---------+---------+----+---+---+------+
|faa|                name|      lat|      lon| alt| tz|dst|qa_dst|
+---+--------------------+---------+---------+----+---+---+------+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|  null|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|  null|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|  null|
|06N|     Randall Airport| 41.43191|-74.39156| 523| -5|  A|  null|
|09J|Jekyll Island Air...|31.074472|-81.42778|  11| -4|  A|  null|
+---+--------------------+---------+---------+----+---+---+------+
only showing top 5 rows



In [87]:
# Usando rdd
def qa_dst(row):
    dst, value = row[6], None

    if not dst or re.findall(REGEX_EMPTY_STR, dst):
        value = "M"
    elif re.findall(REGEX_INTEGER, dst):
        value = "N"
    elif dst not in DST_CATEGORIES:
        value = "C"

    return row + (value,)

rdd_airports.map(qa_dst).take(5)

[(None, None, None, None, None, None, None, 'M'),
 ('', '', '', '', '', '', '', 'M'),
 ('   ', None, '12O.12', '-80.Aa6195833', '1044Aa', '-2x', '34', 'N'),
 ('AAA', None, '12O.12', '-80.Aa6195833', '1044Aa', '-2x', '34', 'N'),
 ('222', None, None, '-80.Aa6195833', '-100', '-14', 'K', 'C')]

# Dataset Planes
  

###  Pergunta 1 

In [32]:
df_planes = df_planes.withColumn('qa_tailnum', (
        F.when(check_empty_column('tailnum'), 'M')
         .when(~F.length(F.col('tailnum')).between(5, 6), 'S')
         .when(~F.col('tailnum').startswith('N'), 'FN')
         .when( F.col('tailnum').rlike(REGEX_NNUMBER_INVALID), 'FE')
         .when(~F.col('tailnum').rlike(REGEX_NNUMBER), 'F')))

df_planes.show(5)

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+----------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|qa_tailnum|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+----------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      null|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      null|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      null|
| N105UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      null|
| N107US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      null|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+----------+
only showing top 5 rows



### Pergunta 2 

In [33]:
# Usando dataframe
df_planes = df_planes.withColumn('qa_year', (
        F.when(check_empty_column('year'), 'M')
         .when(F.col('year') < 1950, 'I')))

df_planes.show(5)

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+----------+-------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|qa_tailnum|qa_year|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+----------+-------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      null|   null|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      null|   null|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      null|   null|
| N105UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      null|   null|
| N107US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      null|   null|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+----------+-------+
o

### Pergunta 3

In [34]:
# Usando dataframe
PLANE_TYPES = ["Fixed wing multi engine", "Fixed wing single engine", "Rotorcraft"]
df_planes = df_planes.withColumn('qa_type', (
        F.when(check_empty_column('type'), 'M')
         .when(~F.col('type').isin(PLANE_TYPES), 'C')))

df_planes.show(5)

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+----------+-------+-------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|qa_tailnum|qa_year|qa_type|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+----------+-------+-------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      null|   null|   null|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      null|   null|   null|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      null|   null|   null|
| N105UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      null|   null|   null|
| N107US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      null|   null|   null|
+-------+----+--------------------+-------------

### Pergunta 4

In [35]:
# Usando dataframe
PLANES_MANUFACTURERS = ["AIRBUS","BOEING","BOMBARDIER","CESSNA","EMBRAER","SIKORSKY","CANADAIR","PIPER","MCDONNELL DOUGLAS","CIRRUS","BELL","KILDALL GARY","LAMBERT RICHARD","BARKER JACK","ROBINSON HELICOPTER","GULFSTREAM","MARZ BARRY"]
REGEX_MANUFACTURERS = r'|'.join(map(lambda x : f".*({x}).*", PLANES_MANUFACTURERS))

df_planes = df_planes.withColumn('qa_manufacturer', (
        F.when(check_empty_column('manufacturer'), 'M')
         .when(~F.col('manufacturer').rlike(REGEX_MANUFACTURERS), 'C')))

df_planes.show(3,vertical=True)

-RECORD 0-------------------------------
 tailnum         | N102UW               
 year            | 1998                 
 type            | Fixed wing multi ... 
 manufacturer    | AIRBUS INDUSTRIE     
 model           | A320-214             
 engines         | 2                    
 seats           | 182                  
 speed           | null                 
 engine          | Turbo-fan            
 qa_tailnum      | null                 
 qa_year         | null                 
 qa_type         | null                 
 qa_manufacturer | null                 
-RECORD 1-------------------------------
 tailnum         | N103US               
 year            | 1999                 
 type            | Fixed wing multi ... 
 manufacturer    | AIRBUS INDUSTRIE     
 model           | A320-214             
 engines         | 2                    
 seats           | 182                  
 speed           | null                 
 engine          | Turbo-fan            
 qa_tailnum     

### Pergunta 5

In [37]:
# Usando dataframe
df_planes = df_planes.withColumn('qa_model', (
        F.when(check_empty_column('model'), 'M')
         .when(
             ((F.col('manufacturer').rlike(r'.*AIRBUS.*') & ~F.col('model').startswith('A')) |
              (F.col('manufacturer').rlike(r'.*BOEING.*') & ~F.col('model').startswith('7')) |
              (F.col('manufacturer').rlike(r'.*(BOMBARDIER|CANADAIR).*') & ~F.col('model').startswith('CL')) |
              (F.col('manufacturer').rlike(r'.*MCDONNELL DOUGLAS.*') &  ~(F.col('model').startswith('MD') | F.col('model').startswith('DC')) )), 'F')))

df_planes.show(5)

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+----------+-------+-------+---------------+--------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|qa_tailnum|qa_year|qa_type|qa_manufacturer|qa_model|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+----------+-------+-------+---------------+--------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      null|   null|   null|           null|    null|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      null|   null|   null|           null|    null|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      null|   null|   null|           null|    null|
| N105UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      null|   null|   null|           null|  

### Pergunta 6

In [38]:
df_planes = df_planes.withColumn('qa_engines', (
        F.when(check_empty_column('engines'), 'M')
         .when(F.col("engines").between(1,4),'I')
         .when(F.col("engines").rlike(REGEX_ALPHA),'A')
))

df_planes.show(2,vertical=True)

-RECORD 0-------------------------------
 tailnum         | N102UW               
 year            | 1998                 
 type            | Fixed wing multi ... 
 manufacturer    | AIRBUS INDUSTRIE     
 model           | A320-214             
 engines         | 2                    
 seats           | 182                  
 speed           | null                 
 engine          | Turbo-fan            
 qa_tailnum      | null                 
 qa_year         | null                 
 qa_type         | null                 
 qa_manufacturer | null                 
 qa_model        | null                 
 qa_engines      | I                    
-RECORD 1-------------------------------
 tailnum         | N103US               
 year            | 1999                 
 type            | Fixed wing multi ... 
 manufacturer    | AIRBUS INDUSTRIE     
 model           | A320-214             
 engines         | 2                    
 seats           | 182                  
 speed          

### Pergunta 7

In [39]:
df_planes = df_planes.withColumn('qa_seats', (
        F.when(check_empty_column('seats'), 'M')
         .when(
              F.col('seats').between(2, 500), 'I')
         .when(
             F.col('seats').rlike(REGEX_ALPHANUM), 'A')))

df_planes.show(2,vertical=True)

-RECORD 0-------------------------------
 tailnum         | N102UW               
 year            | 1998                 
 type            | Fixed wing multi ... 
 manufacturer    | AIRBUS INDUSTRIE     
 model           | A320-214             
 engines         | 2                    
 seats           | 182                  
 speed           | null                 
 engine          | Turbo-fan            
 qa_tailnum      | null                 
 qa_year         | null                 
 qa_type         | null                 
 qa_manufacturer | null                 
 qa_model        | null                 
 qa_engines      | I                    
 qa_seats        | I                    
-RECORD 1-------------------------------
 tailnum         | N103US               
 year            | 1999                 
 type            | Fixed wing multi ... 
 manufacturer    | AIRBUS INDUSTRIE     
 model           | A320-214             
 engines         | 2                    
 seats          

### Pergunta 8

In [40]:
df_planes = df_planes.withColumn('qa_speed', (
        F.when(check_empty_column('speed'), 'M')
         .when(
               F.col('speed').between(50, 150), 'I')
         .when(
              F.col('speed').rlike(REGEX_ALPHANUM), 'A')
))

df_planes.show(2,vertical=True)

-RECORD 0-------------------------------
 tailnum         | N102UW               
 year            | 1998                 
 type            | Fixed wing multi ... 
 manufacturer    | AIRBUS INDUSTRIE     
 model           | A320-214             
 engines         | 2                    
 seats           | 182                  
 speed           | null                 
 engine          | Turbo-fan            
 qa_tailnum      | null                 
 qa_year         | null                 
 qa_type         | null                 
 qa_manufacturer | null                 
 qa_model        | null                 
 qa_engines      | I                    
 qa_seats        | I                    
 qa_speed        | M                    
-RECORD 1-------------------------------
 tailnum         | N103US               
 year            | 1999                 
 type            | Fixed wing multi ... 
 manufacturer    | AIRBUS INDUSTRIE     
 model           | A320-214             
 engines        

### Pergunta 9 

In [41]:
nome = ["Turbo-fan","Turbo-jet","Turbo-prop","Turbo-shaft","4 Cycle"]

df_planes = df_planes.withColumn('qa_engine', (
        F.when(
            check_empty_column('engine'), 'M')
         .when(
             ~F.col('engine').isin(nome), 'C')))

df_planes.show(2,vertical=True)

-RECORD 0-------------------------------
 tailnum         | N102UW               
 year            | 1998                 
 type            | Fixed wing multi ... 
 manufacturer    | AIRBUS INDUSTRIE     
 model           | A320-214             
 engines         | 2                    
 seats           | 182                  
 speed           | null                 
 engine          | Turbo-fan            
 qa_tailnum      | null                 
 qa_year         | null                 
 qa_type         | null                 
 qa_manufacturer | null                 
 qa_model        | null                 
 qa_engines      | I                    
 qa_seats        | I                    
 qa_speed        | M                    
 qa_engine       | null                 
-RECORD 1-------------------------------
 tailnum         | N103US               
 year            | 1999                 
 type            | Fixed wing multi ... 
 manufacturer    | AIRBUS INDUSTRIE     
 model          

# Dataset Flights

### Pergunta 1

In [45]:
df_flights = df_flights.withColumn('qa_year_month_day',(
        F.when(check_empty_column('year'),'MY')
         .when(check_empty_column('month'),'MM')
         .when(check_empty_column('day'),'MD')
         .when(
             (F.col('year') > 1950),'IY')
         .when(
             (F.col('month').between(1,12)),'IM')
         .when(
             (F.col('day').between(1,31)) &
             (F.col('day') =='2'),'ID')
         )) 

### Pergunta 2 

In [47]:
df_flights = df_flights.withColumn('qa_hour_minute' , (
        F.when(
               check_empty_column('hour'), 'MH')
         .when(
               check_empty_column('minute'), 'MM')
         .when(
               F.col('hour').between(0,24) ,"IH")
         .when(
              F.col('minute').between(0,59) ,"IM")                            
        
))

### Pergunta 3 

In [None]:
df_flights = df_flights.withColumn('qa_dep_arr_time', (
    F.when(
        (F.col('dep_time').isNull()) | 
        (F.col('dep_time') == ''), 'MD')
     .when(
          (F.col('arr_time').isNull())| 
          (F.col('arr_time') == ''), 'MA')
     .when(
          (F.col('dep_time').rlike('/^(0?[1-9]|1[0-2]):[0-5][0-9]$/')) |
          (F.col('dep_time').rlike('^[0-9][0-5][0-9]$')), 'FD')
                                
     .when(
          (F.col('arr_time').rlike('/^(0?[1-9]|1[0-2]):[0-5][0-9]$/')) |
          (F.col('arr_time').rlike('^[0-9][0-5][0-9]$')), 'FA')
))

In [None]:
df.filter(~col('dep_time').rlike('/^(0?[1-9]|1[0-2]):[0-5][0-9]$/')).show()

### Pergunta 4

In [48]:
df_flights = df_flights.withColumn('qa_dep_arr_delay', (
        F.when(
             (F.col('dep_delay').isNull()) | 
             (F.col('dep_delay') == ''), 'MD')
        .when(
             (F.col('arr_delay').isNull())| 
             (F.col('arr_delay') == ''), 'MA')
        

))

### Pergunta 5

In [49]:
df_flights = df_flights.withColumn ('qa_carrier', (
    F.when(
         (F.col('carrier').isNull()) | 
         (F.col('carrier') == ''), 'M')
     .when(
          (~F.col('carrier').rlike("[0-9a-zA-Z]{2}")),'F') 

))

### Pergunta 6

In [52]:
invalid_characters_list = ['I', 'O', 0]

df_flights = df_flights.withColumn('qa_tailnum' , (
    F.when(
         (F.col('tailnum').isNull()) | 
         (F.col('tailnum') == ''), 'M')
     .when(
         F.col('tailnum').rlike("^N([0-9]{1,4})([A-Z]{1,2}$)") == False, "F")
     .when(
         F.col('tailnum').rlike('^[0-9]*$'), 'A')
     .when(
         ~F.col('tailnum').startswith('N'), 'FN')
     .when(
         F.col('tailnum').substr(1, 1).isin(invalid_characters_list), 'FE')
    
))

### Pergunta 7

In [54]:
df_flights = df_flights.withColumn ('qa_flight',   (
    F.when(
         (F.col('flight').isNull()) | 
         (F.col('flight') == ''), 'M')
     .when(
         (~F.col('flight').rlike("[0-9]{4}")),'F')
    
))

### Pergunta 8

In [55]:
df_flights = df_flights.withColumn('qa_origin_dest', (
    F.when(
          (F.col('origin').isNull()) | 
          (F.col('origin') == ''), 'MO')
     .when(
           (F.col('dest').isNull())| 
           (F.col('dest') == ''), 'MD')
     .when(
           (F.col('origin').rlike("[0-9a-zA-Z]{3}")),'FO')
                            
     .when(
            (F.col('dest').rlike("[0-9a-zA-Z]{3}")),'FD')
    
))

### Pergunta 9

In [56]:
df_flights = df_flights.withColumn('qa_air_time', (
    F.when(
            (F.col('air_time').isNull()) | 
            (F.col('air_time') == ''), 'M')
    .when(
            (F.col('air_time').between(20,500)) ,"I")
                            
))     

In [57]:
df_flights.toPandas()

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,...,hour,minute,qa_year_month_day,qa_hour_minute,qa_dep_arr_delay,qa_carrier,qa_tailnum,qa_flight,qa_origin_dest,qa_air_time
0,2014,12,8,658,-7.0,935,-5.0,VX,N846VA,1780,...,6.0,58.0,IY,IH,,,,,FO,I
1,2014,1,22,1040,5.0,1505,5.0,AS,N559AS,851,...,10.0,40.0,IY,IH,,,,F,FO,I
2,2014,3,9,1443,-2.0,1652,2.0,VX,N847VA,755,...,14.0,43.0,IY,IH,,,,F,FO,I
3,2014,4,9,1705,45.0,1839,34.0,WN,N360SW,344,...,17.0,5.0,IY,IH,,,,F,FO,I
4,2014,3,9,754,-1.0,1015,1.0,AS,N612AS,522,...,7.0,54.0,IY,IH,,,,F,FO,I
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2014,6,23,1806,-4.0,2104,-6.0,OO,N225AG,3458,...,18.0,6.0,IY,IH,,,,,FO,I
9996,2014,8,31,2336,11.0,452,-13.0,AA,N3LEAA,1230,...,23.0,36.0,IY,IH,,,F,,FO,I
9997,2014,8,8,904,-1.0,1042,-5.0,AS,N523AS,360,...,9.0,4.0,IY,IH,,,,F,FO,I
9998,2014,8,29,1441,26.0,1820,10.0,WN,N8647A,2857,...,14.0,41.0,IY,IH,,,,,FO,I


### Pergunta 10

In [59]:
df_flights= df_flights.withColumn('qa_distance', (
    F.when(
         (F.col('distance').isNull()) | 
         (F.col('distance') == ''), 'M')
    .when(
         (F.col('distance').between(50,3000)) ,"I")
                            
))    

### Pergunta 11

In [60]:
df_flights = df_flights.withColumn('qa_distance_airtime', (
    F.when(
         (F.col('distance').isNull()) | 
         (F.col('distance') == ''), 'M')
     .when(
          (F.col('air_time').isNull()) | 
          (F.col('air_time') == ''), 'M')
     .when(
          (F.col('distance').between(50,3000)) ,"TL")
                         
     .when(
          (F.col('air_time') >= (F.col('distance') * 0.1 + 30)) ,"TL")
                                 
    .when(
          (F.col('air_time') <= (F.col('distance') * 0.1 + 10)) ,"TS")
                            
                            
))

### SALVANDO ARQUIVO  (DATAFRAME)

In [89]:
### SALVANDO ARQUIVO CSV
(df_airports.select(df_airports.colRegex("`^qa_.*`"))
            .repartition(1) # coalesce
            .write.format("parquet")
            .mode('overwrite')
            .option("header", "true")
            .save("Datasets/airports_qa.parquet"))


In [18]:
### SALVANDO ARQUIVO PARQUET
(df_airports.repartition(1) # coalesce
            .write.format("parquet")
            .mode('overwrite')
            .option("header", "true")
            .save("Datasets/airports_qa.parquet"))
            

### SALVANDO ARQUIVO (RDD)

In [90]:
(rdd_airports.map(qa_faa)
             .map(qa_name)
             .map(qa_lat)
             .map(qa_lon)
             .map(qa_alt)
             .map(qa_tz)
             .map(qa_dst)
             .saveAsTextFile("Datasets/airports_qa.txt"))