In [1]:
# ==========================
# Importações
# ==========================
# %pip install requests  # (ative se necessário no Fabric)
import os
import time
from typing import List, Dict

import requests
from pyspark.sql import Row
from pyspark.sql import types as T

# ==========================
# 1) Parâmetros
# ==========================
# Source table; its name is also the last path segment of the REST endpoint.
SUPABASE_TABLE = "tb_kaizen_cadastro"
SUPABASE_URL = "https://jewtbymqxxubjpwnjtux.supabase.co/rest/v1/" + SUPABASE_TABLE

# Supabase credentials are read from the environment so that no secret lives in
# the notebook source. Populate the two variables below from your secret store
# (Key Vault / Fabric credentials) before running this cell.
# NOTE(review): the tokens that used to be hard-coded here are exposed in the
# file history and should be rotated; the bearer token's JWT payload declared
# the "service_role" role.
def _require_env(name: str) -> str:
    """Return the non-empty value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or blank, so the job stops
            before any request is sent without credentials.
    """
    value = os.environ.get(name, "").strip()
    if not value:
        raise RuntimeError(
            f"Missing required environment variable {name!r}; "
            "set it from your secret store before running this notebook."
        )
    return value


API_KEY = _require_env("SUPABASE_API_KEY")
BEARER_TOKEN = _require_env("SUPABASE_BEARER_TOKEN")

# Destination Lakehouse (adjust to your workspace/lakehouse).
_ONELAKE_WORKSPACE = "abfss://ws_sistemas@onelake.dfs.fabric.microsoft.com"
path_destino = "/".join(
    [_ONELAKE_WORKSPACE, "lk_systemmax.Lakehouse", "Tables", "tab_boas_praticas"]
)

# (Optional) managed (catalog) table name. Leave empty to write only to the physical path.
tabela_destino = ""  # e.g. "bronze.tb_kaizen_cadastro"

# Collection settings
HTTP_TIMEOUT = 60  # seconds
PAGE_SIZE = 1000

# ==========================
# 2) Schema explícito
# ==========================
# Columns stored as 64-bit integers; every other column is kept as a string.
_numeric_cols = {"id", "visualizacoes", "votos"}

# Column order of the target table. 'tags' is intentionally left out
# (same effect as Table.RemoveColumns).
_COLUMN_ORDER = (
    "id",
    "created_at",
    "objetivo",
    "area_aplicada",
    "data",
    "pilar",
    "elimina_desperdicio",
    "modificado_por",
    "modificado_quando",
    "contrato_user",
    "status",
    "relevancia",
    "uuidcadastro",
    "resultados",
    "geral",
    "responsavel_etapa",
    "visualizacoes",
    "votos",
    "categoria",
    "titulo",
    "descricao",
    "etapa_processo",
    "fabricou_dispositivo",
    "projeto",
    "nota_local",
    "nota_corporativo",
    "premiada",
    "reconhecida",
    "data_reconhecimento",
    "observacoes",
    "matricula_cadastrante",
)

# Explicit schema: every field is nullable, and the type is derived from the
# membership of the column name in _numeric_cols.
schema = T.StructType([
    T.StructField(
        column,
        T.LongType() if column in _numeric_cols else T.StringType(),
        True,
    )
    for column in _COLUMN_ORDER
])
cols = list(_COLUMN_ORDER)

# ==========================
# 3) GET com retry/backoff
# ==========================
def http_get_with_retry(url: str, headers: Dict[str, str], timeout: int, max_retries: int = 5) -> requests.Response:
    """Issue a GET request, retrying transient failures with exponential backoff.

    Failures raised by ``requests.get`` itself and the HTTP statuses 429, 500,
    502, 503 and 504 are retried up to ``max_retries`` times, sleeping
    ``1.5 ** attempt`` seconds before each retry. Any other error status
    (for example 401 or 404) is raised immediately instead of being retried.

    Args:
        url: Address to request.
        headers: HTTP headers sent with every attempt.
        timeout: Timeout, in seconds, passed to ``requests.get``.
        max_retries: Maximum number of retries after the first attempt; the
            counter is shared between transport failures and retryable statuses.

    Returns:
        The first response whose status is not an error.

    Raises:
        requests.HTTPError: for a non-retryable error status, or for a
            retryable status once the retries are exhausted.
        requests.RequestException: when ``requests.get`` keeps failing after
            the retries are exhausted.
    """
    backoff = 1.5
    retryable_statuses = (429, 500, 502, 503, 504)
    attempt = 0
    while True:
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException:
            # Only the request call is guarded here, so this branch handles
            # failures of the request itself, never an HTTP error status.
            attempt += 1
            if attempt > max_retries:
                raise
            time.sleep(backoff ** attempt)
            continue

        if resp.status_code in retryable_statuses:
            attempt += 1
            if attempt > max_retries:
                # All retryable statuses are >= 400, so this always raises.
                resp.raise_for_status()
            time.sleep(backoff ** attempt)
            continue

        # Deliberately outside the try block: HTTPError is a RequestException,
        # and catching it above would retry errors such as 401/404.
        resp.raise_for_status()
        return resp

# ==========================
# 4) Coleta paginada no Supabase
# ==========================
# Headers sent with every page request; the per-page "Range" header is added
# inside the loop.
base_headers = {
    "apikey": API_KEY,
    "Authorization": f"Bearer {BEARER_TOKEN}",
    "Accept": "application/json",
}

# Range-based pagination needs a stable row order, otherwise consecutive pages
# may overlap or skip rows. Sort by "id" so every page is cut from the same
# sequence.
# NOTE(review): assumes "id" is a unique column of the table - confirm.
paged_url = f"{SUPABASE_URL}?order=id.asc"

offset = 0
registros: List[Dict] = []

while True:
    # "Range" is inclusive on both ends: rows offset .. offset + PAGE_SIZE - 1.
    headers = {**base_headers, "Range": f"{offset}-{offset + PAGE_SIZE - 1}"}
    resp = http_get_with_retry(paged_url, headers, timeout=HTTP_TIMEOUT)
    batch = resp.json()
    if not isinstance(batch, list):
        # Extending the list with a dict would silently append its keys.
        raise TypeError(
            f"Expected a JSON array from {SUPABASE_TABLE}, got {type(batch).__name__}"
        )
    if not batch:
        break
    registros.extend(batch)
    # A short page is treated as the last one.
    # NOTE(review): if the server caps rows per response below PAGE_SIZE, this
    # stops after the first page - confirm the server-side limit.
    if len(batch) < PAGE_SIZE:
        break
    offset += PAGE_SIZE

print(f"Registros coletados do Supabase ({SUPABASE_TABLE}): {len(registros)}")

# ==========================
# 5) Monta DataFrame sem inferência
# ==========================
def _to_long(v):
    """Coerce a value to an ``int`` that fits a signed 64-bit column, or ``None``.

    Accepts ints, floats and numeric strings. Strings written with a decimal
    point or an exponent ("12.7", "1e3") are parsed as floats and truncated
    toward zero; all other values go straight through ``int()``.

    Returns:
        The integer value, or ``None`` when ``v`` is ``None``, cannot be
        converted (blank/non-numeric text, NaN, infinity, containers), or does
        not fit in the signed 64-bit range.
    """
    if v is None:
        return None
    try:
        if isinstance(v, str):
            text = v.strip()
            # int() rejects "1.0" and "1e3", so those forms go through float().
            if "." in text or "e" in text.lower():
                number = int(float(text))
            else:
                number = int(text)
        else:
            number = int(v)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type; ValueError: bad text or NaN;
        # OverflowError: infinity.
        return None
    # Values outside the signed 64-bit range cannot be stored as a long.
    if not -(2 ** 63) <= number < 2 ** 63:
        return None
    return number

def _coerce_record(record):
    """Build a Row with exactly the schema columns from one API record.

    Columns missing from the record become None; numeric columns go through
    _to_long, every other non-null value is converted with str().
    """
    coerced = {}
    for name in cols:
        raw = record.get(name)
        if name in _numeric_cols:
            coerced[name] = _to_long(raw)
        elif raw is None:
            coerced[name] = None
        else:
            coerced[name] = str(raw)
    return Row(**coerced)


# The explicit schema is always passed, so Spark never infers column types,
# not even for an empty result.
if registros:
    rows = [_coerce_record(rec) for rec in registros]
    df = spark.createDataFrame(rows, schema)
else:
    df = spark.createDataFrame([], schema)

df.printSchema()
df.show(10, truncate=False)

# ==========================
# 6) Escrita em Delta
# ==========================
# Write the DataFrame as Delta, replacing whatever exists at the destination.
# With no catalog table name configured, only the physical path is written.
if not tabela_destino.strip():
    path_writer = df.write.format("delta").mode("overwrite")
    path_writer.save(path_destino)
    print(f"Delta gravado no caminho: {path_destino}")
else:
    # NOTE(review): the table is dropped before it is rewritten, so a failed
    # write leaves no table behind - confirm the drop is intentional.
    spark.sql(f"DROP TABLE IF EXISTS {tabela_destino}")
    table_writer = df.write.format("delta").mode("overwrite")
    table_writer.saveAsTable(tabela_destino)
    print(f"Tabela gerenciada gravada: {tabela_destino}")

# Row count of the in-memory DataFrame (it is not read back from storage).
print(f"Linhas salvas: {df.count()}")



StatementMeta(, 4078c993-4e0e-4324-bc5c-52d410cc9191, 3, Finished, Available, Finished)

Registros coletados do Supabase (tb_kaizen_cadastro): 233
root
 |-- id: long (nullable = true)
 |-- created_at: string (nullable = true)
 |-- objetivo: string (nullable = true)
 |-- area_aplicada: string (nullable = true)
 |-- data: string (nullable = true)
 |-- pilar: string (nullable = true)
 |-- elimina_desperdicio: string (nullable = true)
 |-- modificado_por: string (nullable = true)
 |-- modificado_quando: string (nullable = true)
 |-- contrato_user: string (nullable = true)
 |-- status: string (nullable = true)
 |-- relevancia: string (nullable = true)
 |-- uuidcadastro: string (nullable = true)
 |-- resultados: string (nullable = true)
 |-- geral: string (nullable = true)
 |-- responsavel_etapa: string (nullable = true)
 |-- visualizacoes: long (nullable = true)
 |-- votos: long (nullable = true)
 |-- categoria: string (nullable = true)
 |-- titulo: string (nullable = true)
 |-- descricao: string (nullable = true)
 |-- etapa_processo: string (nullable = true)
 |-- fabricou_disp