 ## conexão

In [None]:
import psycopg2

# Smoke test: check that the DDL worked by probing the target table.
# Connection parameters passed to psycopg2.connect() as keyword arguments.
DB_CONFIG = {
    'host': 'localhost',
    'port': '5433',
    'database': 'austin_airbnb',
    'user': 'postgres',
    'password': 'postgres'
}

# Pre-bind so the finally clause can tell whether connect() succeeded.
conn = None
try:
    conn = psycopg2.connect(**DB_CONFIG)
    # The cursor context manager closes the cursor even if execute() raises.
    with conn.cursor() as cur:
        # Check that the table exists; LIMIT 0 fetches no rows, so the
        # statement only fails when the relation cannot be queried.
        cur.execute("SELECT * FROM silver.one_big_table LIMIT 0;")
except Exception as e:
    # Notebook-level boundary: report the problem instead of raising.
    print(f"Erro: {e}")
else:
    print("Sucesso! A tabela silver.one_big_table já existe e está pronta para visualização.")
finally:
    # Always release the connection, including on the failure path
    # (previously it leaked whenever the query raised).
    if conn is not None:
        conn.close()

pyspark


In [None]:
from pyspark.sql import SparkSession

# 1. Configure the Spark session with the Postgres JDBC driver package
spark = (
    SparkSession.builder
    .appName("AustinAirbnbValidation")
    .config("spark.jars.packages", "org.postgresql:postgresql:42.6.0")
    .getOrCreate()
)

# 2. Connection settings shared by every JDBC read below
jdbc_url = "jdbc:postgresql://localhost:5433/austin_airbnb"
db_properties = dict(
    user="postgres",
    password="postgres",
    driver="org.postgresql.Driver",
)

# Helper to read SQL queries through Spark
def read_query(query):
    """Run ``query`` through the Spark JDBC reader and return the result.

    The query text is wrapped as a parenthesised subquery aliased ``tmp`` and
    passed as the ``table`` argument of ``spark.read.jdbc``, using the
    module-level ``jdbc_url`` and ``db_properties``.

    Args:
        query: Text of a single SELECT statement. A trailing ``;`` (with any
            surrounding whitespace) is removed, because a statement
            terminator inside the parentheses would make the wrapped SQL
            invalid. Queries without a trailing ``;`` are passed unchanged.

    Returns:
        Whatever ``spark.read.jdbc`` returns for the wrapped subquery.
    """
    trimmed = query.rstrip()
    if trimmed.endswith(";"):
        # Re-append a newline so a trailing "-- comment" line in the query
        # cannot swallow the closing parenthesis.
        query = trimmed.rstrip("; \t\r\n") + "\n"
    return spark.read.jdbc(url=jdbc_url, table=f"({query}) as tmp", properties=db_properties)

# 3. Query definitions: one projection of silver.one_big_table per entity
query_listings = """
SELECT DISTINCT 
    listing_id, listing_name, property_type, room_type, bed_type,
    accommodates, bathrooms, bedrooms, beds,
    neighbourhood_cleansed, listing_price as price,
    number_of_reviews, first_review, last_review
FROM silver.one_big_table
"""

query_calendar = """
SELECT DISTINCT
    listing_id, calendar_date as date, calendar_available as available
FROM silver.one_big_table
"""

query_reviews = """
SELECT DISTINCT
    review_id, listing_id, review_date as date, reviewer_id, reviewer_name
FROM silver.one_big_table
WHERE review_id IS NOT NULL
"""

# Build the DataFrames in the same order: listings, calendar, reviews
df_spark_listings = read_query(query_listings)
df_spark_calendar = read_query(query_calendar)
df_spark_reviews = read_query(query_reviews)

# 4. Volume validation (row counts computed by Spark)
for label, frame in (
    ("Listings", df_spark_listings),
    ("Calendar", df_spark_calendar),
    ("Reviews", df_spark_reviews),
):
    print(f"{label}: {frame.count():,} registros")

# 5. Show a sample of the listings
df_spark_listings.show(5)