In [None]:
import os
from dotenv import load_dotenv
from pyspark.sql import SparkSession


# Como o notebook est√° em /Transformer e o na raiz
load_dotenv(os.path.join(os.path.dirname(os.getcwd()), '.env'))

# 2. CONFIGURAR CAMINHOS BASEADOS NA ESTRUTURA DE PASTAS
RAIZ_PROJETO = os.path.dirname(os.getcwd())
BASE_PATH = os.path.join(RAIZ_PROJETO, "Data Layer", "raw")

print(f"üîç Procurando arquivos em: {BASE_PATH}")

# 3. DICION√ÅRIO E VALIDA√á√ÉO
arquivos = {
    "Listings": "dados_brutos_listings.csv",
    "Calendar": "dados_brutos_calendar.csv",
    "Reviews": "dados_brutos_reviews.csv"
}

for nome, arquivo in arquivos.items():
    if os.path.exists(os.path.join(BASE_PATH, arquivo)):
        print(f" Arquivo {nome} ENCONTRADO!")
    else:
        print(f" ERRO: Arquivo {nome} N√ÉO encontrado!")

# 4. INICIAR SPARK 
spark = SparkSession.builder \
    .appName("ETL_Austin_Airbnb") \
    .config("spark.jars.packages", "org.postgresql:postgresql:42.6.0") \
    .getOrCreate()

# 5. EXTRA√á√ÉO
try:
    df_listings_raw = spark.read.csv(os.path.join(BASE_PATH, arquivos["Listings"]), header=True, inferSchema=True)
    df_calendar_raw = spark.read.csv(os.path.join(BASE_PATH, arquivos["Calendar"]), header=True, inferSchema=True)
    df_reviews_raw = spark.read.csv(os.path.join(BASE_PATH, arquivos["Reviews"]), header=True, inferSchema=True)
    print("\n Sucesso! Os DataFrames da camada RAW foram carregados.")
except Exception as e:
    print(f"\n Falha na leitura: {e}")

üîç Procurando arquivos em: /home/gandalfe/Documentos/sbd2/SBD2-Austin-Airbnb/Data Layer/raw
‚úÖ Arquivo Listings ENCONTRADO!
‚úÖ Arquivo Calendar ENCONTRADO!
‚úÖ Arquivo Reviews ENCONTRADO!

üöÄ Sucesso! Os DataFrames da camada RAW foram carregados.


In [15]:
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

# 1. Aplicando try_cast em massa para ignorar erros de deslocamento de coluna no CSV
df_listings = df_listings_raw.select(
    F.expr("try_cast(id as int)").alias("listing_id"),
    F.col("name").alias("listing_name"),
    "property_type", 
    "room_type", 
    "bed_type", 
    F.expr("try_cast(accommodates as int)").alias("accommodates"),
    F.expr("try_cast(bathrooms as double)").alias("bathrooms"),
    F.expr("try_cast(bedrooms as double)").alias("bedrooms"),
    F.expr("try_cast(beds as double)").alias("beds"),
    "neighbourhood_cleansed",
    # Limpeza de pre√ßo com try_cast
    F.expr("try_cast(regexp_replace(price, '[^0-9.]', '') as decimal(10,2))").alias("listing_price"),
    F.expr("try_cast(number_of_reviews as int)").alias("number_of_reviews"),
    F.expr("try_cast(first_review as date)").alias("first_review"),
    F.expr("try_cast(last_review as date)").alias("last_review"),
    F.expr("try_cast(host_id as int)").alias("host_id"),
    "host_name"
).fillna({
    "listing_price": 0.0, 
    "number_of_reviews": 0, 
    "accommodates": 0,
    "bedrooms": 0,
    "beds": 0,
    "bathrooms": 0
})

# 2. Garantir que listing_id n√£o seja nulo (pois √© PK no banco)
df_listings = df_listings.filter(F.col("listing_id").isNotNull())

# 3. Tratamento do Calendar (tamb√©m com try_cast por seguran√ßa)
df_calendar = df_calendar_raw.select(
    F.expr("try_cast(listing_id as int)").alias("listing_id"),
    F.expr("try_cast(date as date)").alias("calendar_date"),
    F.when(F.col("available") == "t", True).otherwise(False).alias("calendar_available")
).filter(F.col("listing_id").isNotNull() & F.col("calendar_date").isNotNull())

# 4. Join Final
df_silver = df_calendar.join(df_listings, "listing_id", "inner")

# 5. Configura√ß√£o JDBC e Grava√ß√£o no Postgres
jdbc_url = "jdbc:postgresql://localhost:5433/austin_airbnb"
db_properties = {
    "user": "postgres",
    "password": "postgres",
    "driver": "org.postgresql.Driver"
}

print(f"Enviando {df_silver.count():,} linhas para o banco...")

try:
    df_silver.write.jdbc(
        url=jdbc_url, 
        table="silver.one_big_table", 
        mode="append", # 'append' porque a tabela j√° existe via DDL
        properties=db_properties
    )
    print("SUCESSO! O banco de dados foi populado.")
except Exception as e:
    print(f"Erro na carga: {e}")

Enviando 1,033,610 linhas para o banco...


26/01/14 17:18:26 ERROR Executor: Exception in task 4.0 in stage 41.0 (TID 101)
java.sql.BatchUpdateException: Batch entry 0 INSERT INTO silver.one_big_table ("listing_id","calendar_date","calendar_available","listing_name","property_type","room_type","bed_type","accommodates","bathrooms","bedrooms","beds","neighbourhood_cleansed","listing_price","number_of_reviews","first_review","last_review","host_id","host_name") VALUES (5785387,'2016-02-07 -02'::date,'TRUE','Cute Shabby Chic Room in E. Austin','House','Private room','Real Bed',3,1.0,1.0,2.0,78702,0.00,0,NULL,NULL,6291345,'Beth') was aborted: ERROR: duplicate key value violates unique constraint "one_big_table_pkey"
  Detalhe: Key (listing_id, calendar_date)=(5785387, 2016-02-07) already exists.  Call getNextException to see other errors in the batch.
	at org.postgresql.jdbc.BatchResultHandler.handleError(BatchResultHandler.java:165)
	at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:2402)
	at org.po

Erro na carga: An error occurred while calling o438.jdbc.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 5 in stage 41.0 failed 1 times, most recent failure: Lost task 5.0 in stage 41.0 (TID 102) (192.168.1.8 executor driver): java.sql.BatchUpdateException: Batch entry 0 INSERT INTO silver.one_big_table ("listing_id","calendar_date","calendar_available","listing_name","property_type","room_type","bed_type","accommodates","bathrooms","bedrooms","beds","neighbourhood_cleansed","listing_price","number_of_reviews","first_review","last_review","host_id","host_name") VALUES (2420168,'2016-03-16 -03'::date,'FALSE',' EAST AUSTIN 3 BDRM Artist Home','House','Entire home/apt','Real Bed',5,1.5,3.0,3.0,78702,0.00,6,NULL,NULL,791464,'Rachel') was aborted: ERROR: duplicate key value violates unique constraint "one_big_table_pkey"
  Detalhe: Key (listing_id, calendar_date)=(2420168, 2016-03-16) already exists.  Call getNextException to see other errors in the batch.
	at org

In [16]:
import psycopg2

conn = psycopg2.connect(host="localhost", port="5433", database="austin_airbnb", user="postgres", password="postgres")
cur = conn.cursor()
cur.execute("SELECT count(*) FROM silver.one_big_table")
print(f"Total de registros no banco: {cur.fetchone()[0]:,}")
cur.close()
conn.close()

Total de registros no banco: 1,033,610


In [10]:
import psycopg2
conn = psycopg2.connect(host="localhost", port="5433", database="austin_airbnb", user="postgres", password="postgres")
cur = conn.cursor()
cur.execute("TRUNCATE TABLE silver.one_big_table;")
conn.commit()
cur.close()
conn.close()
print("Tabela limpa")

Tabela limpa


In [17]:
import psycopg2
from psycopg2.extras import RealDictCursor

try:
    # 1. Conectar ao banco
    conn = psycopg2.connect(
        host="localhost", 
        port="5433", 
        database="austin_airbnb", 
        user="postgres", 
        password="postgres"
    )
    
    # 2. Criar um cursor (usando RealDictCursor para ver os nomes das colunas como um dicion√°rio)
    cur = conn.cursor(cursor_factory=RealDictCursor)
    
    # 3. Executar a consulta para pegar 10 linhas
    query = "SELECT listing_id, calendar_date, listing_price, listing_name FROM silver.one_big_table LIMIT 10;"
    cur.execute(query)
    
    # 4. Recuperar os resultados
    rows = cur.fetchall()
    
    print(f"--- Exibindo {len(rows)} linhas da tabela silver.one_big_table ---\n")
    
    # 5. Iterar e imprimir de forma organizada
    for row in rows:
        print(f"ID: {row['listing_id']} | Data: {row['calendar_date']} | Pre√ßo: ${row['listing_price']} | Nome: {row['listing_name'][:30]}...")

except Exception as e:
    print(f"‚ùå Erro ao consultar o banco: {e}")

finally:
    # 6. Fechar conex√£o sempre!
    if cur: cur.close()
    if conn: conn.close()

--- Exibindo 10 linhas da tabela silver.one_big_table ---

ID: 3997642 | Data: 2016-10-20 | Pre√ßo: $0.00 | Nome: Private Treehouse Studio NW Hi...
ID: 3997642 | Data: 2016-10-21 | Pre√ßo: $0.00 | Nome: Private Treehouse Studio NW Hi...
ID: 3997642 | Data: 2016-10-22 | Pre√ßo: $0.00 | Nome: Private Treehouse Studio NW Hi...
ID: 3997642 | Data: 2016-10-23 | Pre√ßo: $0.00 | Nome: Private Treehouse Studio NW Hi...
ID: 3997642 | Data: 2016-10-24 | Pre√ßo: $0.00 | Nome: Private Treehouse Studio NW Hi...
ID: 3997642 | Data: 2016-10-25 | Pre√ßo: $0.00 | Nome: Private Treehouse Studio NW Hi...
ID: 3997642 | Data: 2016-10-26 | Pre√ßo: $0.00 | Nome: Private Treehouse Studio NW Hi...
ID: 3997642 | Data: 2016-10-27 | Pre√ßo: $0.00 | Nome: Private Treehouse Studio NW Hi...
ID: 2429409 | Data: 2016-09-27 | Pre√ßo: $0.00 | Nome: Artist Home Close to SoCo/Down...
ID: 3997642 | Data: 2016-10-28 | Pre√ßo: $0.00 | Nome: Private Treehouse Studio NW Hi...
