# ETL: Raw → Silver (Airline Delays)

## 1. Imports

In [18]:
import os
from datetime import datetime

import psycopg2
from psycopg2.extras import execute_batch
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim
from pyspark.sql.types import DoubleType, IntegerType

print('✓ Imports OK')

✓ Imports OK


## 2. Inicializar Spark

In [19]:
# Create the Spark session used by every later cell, or reuse one that already
# exists. The driver memory setting is requested through the builder config.
spark = (
    SparkSession.builder
    .appName('ETL Airline Delays')
    .config('spark.driver.memory', '4g')
    .getOrCreate()
)
print(f'✓ Spark {spark.version}')

✓ Spark 4.1.1


## 3. Configurar PostgreSQL

In [None]:
# Connection settings for the target PostgreSQL instance. Every value can be
# overridden with the standard libpq environment variables so real credentials
# never have to be written into the notebook; the fallbacks are the
# local-development values this notebook previously hard-coded.
DB_CONFIG = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'port': int(os.environ.get('PGPORT', '5432')),
    'database': os.environ.get('PGDATABASE', 'airline_delays'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD', 'postgres'),
}


def get_db_connection():
    """Open and return a new psycopg2 connection built from ``DB_CONFIG``.

    A fresh connection is created on every call; the caller owns it and is
    responsible for closing it.
    """
    return psycopg2.connect(**DB_CONFIG)

# Smoke test: open a connection and run a trivial query so that configuration
# problems surface here rather than in the middle of the load step.
conn = get_db_connection()
try:
    # The cursor context manager closes the cursor even if the query raises.
    with conn.cursor() as cursor:
        cursor.execute('SELECT version();')
    print("✓ PostgreSQL OK")
finally:
    # Always release the connection, including when the check fails.
    conn.close()

✓ PostgreSQL OK


## 4. EXTRACT - Carregar CSV

In [21]:
# EXTRACT: read the raw CSV with a header row and let Spark infer column types.
raw_csv_path = '../Data Layer/raw/dados_brutos.csv'
csv_reader = spark.read.options(header=True, inferSchema=True)
df_raw = csv_reader.csv(raw_csv_path)

# Report the row count, then show the inferred schema for inspection.
raw_row_count = df_raw.count()
print(f'✓ Carregado: {raw_row_count:,} registros')
df_raw.printSchema()

                                                                                

✓ Carregado: 171,666 registros
root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- carrier_name: string (nullable = true)
 |-- airport: string (nullable = true)
 |-- airport_name: string (nullable = true)
 |-- arr_flights: double (nullable = true)
 |-- arr_del15: double (nullable = true)
 |-- carrier_ct: double (nullable = true)
 |-- weather_ct: double (nullable = true)
 |-- nas_ct: double (nullable = true)
 |-- security_ct: double (nullable = true)
 |-- late_aircraft_ct: double (nullable = true)
 |-- arr_cancelled: double (nullable = true)
 |-- arr_diverted: double (nullable = true)
 |-- arr_delay: double (nullable = true)
 |-- carrier_delay: double (nullable = true)
 |-- weather_delay: double (nullable = true)
 |-- nas_delay: double (nullable = true)
 |-- security_delay: double (nullable = true)
 |-- late_aircraft_delay: double (nullable = true)



## 5. TRANSFORM - Limpar Dados

In [22]:
# Column groups that drive the per-column cleaning rules below.
text_cols = ['carrier', 'carrier_name', 'airport', 'airport_name']
numeric_cols = ['arr_flights', 'arr_del15', 'carrier_ct', 'weather_ct', 'nas_ct', 'security_ct', 'late_aircraft_ct', 'arr_cancelled', 'arr_diverted', 'arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
int_cols = ['year', 'month']


def _clean_expr(col_name):
    """Return the cleaned expression for ``col_name``, keeping the column name.

    Text columns are trimmed, measure columns are cast to double, ``year`` and
    ``month`` are cast to integer, and any other column passes through as is.
    """
    if col_name in text_cols:
        return trim(col(col_name)).alias(col_name)
    if col_name in numeric_cols:
        return col(col_name).cast(DoubleType()).alias(col_name)
    if col_name in int_cols:
        return col(col_name).cast(IntegerType()).alias(col_name)
    return col(col_name)


# Drop rows where every column is null, then apply all column rules in a single
# projection. One select keeps the original column order and avoids stacking a
# separate projection per column, which is what repeated withColumn calls do.
df = df_raw.dropna(how='all').select([_clean_expr(c) for c in df_raw.columns])

# Keep one row per (year, month, carrier, airport). The result is cached because
# it feeds several actions (the count and describe below, and the later
# collect), so they all read the same materialized de-duplication instead of
# recomputing it from the CSV each time.
df_clean = df.dropDuplicates(['year', 'month', 'carrier', 'airport']).cache()

print(f'✓ Limpo: {df_clean.count():,} registros')
df_clean.select('arr_delay', 'arr_flights').describe().show()

                                                                                

✓ Limpo: 171,666 registros




+-------+------------------+------------------+
|summary|         arr_delay|       arr_flights|
+-------+------------------+------------------+
|  count|            171426|            171426|
|   mean| 4239.487329809947|362.52846709367304|
| stddev|12618.566049584952| 992.8946622226691|
|    min|               0.0|               1.0|
|    max|          438783.0|           21977.0|
+-------+------------------+------------------+



                                                                                

## 6. LOAD - Inserir no PostgreSQL

In [23]:
# Columns are listed explicitly, in table order, so the INSERT does not depend
# on whatever column order the source CSV happens to have.
load_columns = [
    'year', 'month', 'carrier', 'carrier_name', 'airport', 'airport_name',
    'arr_flights', 'arr_del15', 'carrier_ct', 'weather_ct', 'nas_ct',
    'security_ct', 'late_aircraft_ct', 'arr_cancelled', 'arr_diverted',
    'arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay',
    'security_delay', 'late_aircraft_delay',
]

# Bring the cleaned rows to the driver; psycopg2 inserts from plain tuples.
data = df_clean.select(*load_columns).collect()
print(f'Coletado: {len(data):,} registros')

query = 'INSERT INTO silver.airline_delays ({}) VALUES ({})'.format(
    ', '.join(load_columns),
    ','.join(['%s'] * len(load_columns)),
)
batch = [tuple(row) for row in data]

conn = get_db_connection()
try:
    # `with conn` commits when the block succeeds and rolls back when it raises.
    # DROP, CREATE and the inserts therefore form one transaction: if the
    # insert fails, the previously loaded table is left untouched instead of
    # being replaced by an empty one.
    with conn:
        with conn.cursor() as cursor:
            cursor.execute('CREATE SCHEMA IF NOT EXISTS silver')
            cursor.execute('DROP TABLE IF EXISTS silver.airline_delays CASCADE')
            cursor.execute('''
CREATE TABLE silver.airline_delays (
    year INTEGER NOT NULL,
    month INTEGER NOT NULL,
    carrier VARCHAR(10) NOT NULL,
    carrier_name VARCHAR(200),
    airport VARCHAR(10) NOT NULL,
    airport_name VARCHAR(200),
    arr_flights DECIMAL(10,2),
    arr_del15 DECIMAL(10,2),
    carrier_ct DECIMAL(10,2),
    weather_ct DECIMAL(10,2),
    nas_ct DECIMAL(10,2),
    security_ct DECIMAL(10,2),
    late_aircraft_ct DECIMAL(10,2),
    arr_cancelled DECIMAL(10,2),
    arr_diverted DECIMAL(10,2),
    arr_delay DECIMAL(10,2),
    carrier_delay DECIMAL(10,2),
    weather_delay DECIMAL(10,2),
    nas_delay DECIMAL(10,2),
    security_delay DECIMAL(10,2),
    late_aircraft_delay DECIMAL(10,2),
    PRIMARY KEY (year, month, carrier, airport)
)
''')
            execute_batch(cursor, query, batch, page_size=1000)

    # Reaching this point means the transaction was committed, so the status
    # messages are only printed for work that is actually persisted.
    print('✓ Schema criado')
    print('✓ Tabela criada')
    print(f'✓ Inserido: {len(batch):,} registros')
finally:
    # `with conn` ends the transaction but does not close the connection.
    conn.close()

                                                                                

Coletado: 171,666 registros
✓ Schema criado
✓ Tabela criada
✓ Inserido: 171,666 registros


## 7. VALIDAÇÃO

In [24]:
def _fmt_text(value, width=20):
    """Truncate/pad ``value`` to ``width`` characters; NULL renders as blanks."""
    return f'{(value or "")[:width]:{width}s}'


def _fmt_number(value):
    """Format a numeric cell 6 wide with no decimals; NULL renders as 'NULL'."""
    if value is None:
        return f'{"NULL":>6s}'
    return f'{value:6.0f}'


conn = get_db_connection()
try:
    with conn.cursor() as cursor:
        # Total number of rows that reached the silver table.
        cursor.execute('SELECT COUNT(*) FROM silver.airline_delays')
        print(f'Total: {cursor.fetchone()[0]:,} registros')

        # Distinct carrier/airport names and the mean of arr_delay over the
        # rows where arr_delay is present.
        cursor.execute('SELECT COUNT(DISTINCT carrier_name), COUNT(DISTINCT airport_name), ROUND(AVG(arr_delay)::numeric,2) FROM silver.airline_delays WHERE arr_delay IS NOT NULL')
        stats = cursor.fetchone()
        print(f'Companhias: {stats[0]} | Aeroportos: {stats[1]} | Atraso médio: {stats[2]} min')

        # Sample rows. The name and measure columns are nullable in the table,
        # so NULLs are formatted explicitly instead of raising a TypeError.
        cursor.execute('SELECT year, month, carrier_name, airport_name, arr_flights, arr_delay FROM silver.airline_delays LIMIT 5')
        for year, month, carrier_name, airport_name, arr_flights, arr_delay in cursor.fetchall():
            print(f'{year}-{month:02d} | {_fmt_text(carrier_name)} | {_fmt_text(airport_name)} | {_fmt_number(arr_flights)} | {_fmt_number(arr_delay)}')
finally:
    # Release the connection even when one of the validation queries fails.
    conn.close()
print('\n✓ ETL CONCLUÍDO!')

Total: 171,666 registros
Companhias: 23 | Aeroportos: 415 | Atraso médio: 4239.49 min
2013-08 | Endeavor Air Inc.    | Scranton/Wilkes-Barr |     91 |   1143
2013-08 | Endeavor Air Inc.    | Akron, OH: Akron-Can |    118 |    925
2013-08 | Endeavor Air Inc.    | Escanaba, MI: Delta  |     53 |    567
2013-08 | Endeavor Air Inc.    | Madison, WI: Dane Co |     89 |    392
2013-08 | Endeavor Air Inc.    | Tampa, FL: Tampa Int |     28 |    200

✓ ETL CONCLUÍDO!


## 8. Finalizar Spark

In [25]:
# Stop the Spark session created at the top of the notebook; cells that use
# `spark` cannot be re-run after this without creating the session again.
spark.stop()
print('✓ Spark finalizado')

✓ Spark finalizado
