In [None]:
from datetime import datetime, date
from pathlib import PurePosixPath
import pandas as pd
from pyspark.sql import SparkSession

# Fixed configuration: catalog, schema and volume that hold the pipeline files.
CATALOG = "workspace"
SCHEMA  = "default"
VOLUME  = "elt_volume"

# Root of the volume and the two top-level directories used by this script.
VOL_ROOT = PurePosixPath("/Volumes") / CATALOG / SCHEMA / VOLUME
LANDING  = VOL_ROOT / "landing"
RAW_ROOT = VOL_ROOT / "raw"

# Read the clock exactly once so the date and time components always describe
# the same instant; two independent reads could disagree when the run
# straddles midnight, producing a partition path with a mismatched date/time.
run_started = datetime.now()
today_str = run_started.date().isoformat()
time_str  = run_started.strftime("%H%M")
# Partition layout: <raw>/YYYY/MM/DD/vendas_mock_HHMM
RAW_PART  = RAW_ROOT / today_str.replace("-", "/") / f"vendas_mock_{time_str}"

# Source CSV in landing, its destination in the raw partition, and the final
# single-file Parquet output placed next to it.
LANDING_CSV = LANDING / "vendas_mock.csv"
RAW_CSV     = RAW_PART / "vendas_mock.csv"
PARQUET_OUT = RAW_PART / f"vendas_mock_{time_str}.parquet"

# Obtain the Spark session (reuses the existing one when available).
spark = SparkSession.builder.getOrCreate()

# Make sure the volume and every working directory exist before touching files.
spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.{VOLUME}")
for directory in (LANDING, RAW_ROOT, RAW_PART):
    directory_path = str(directory)
    dbutils.fs.mkdirs(directory_path)
    print(f"DEBUG: Criado diretório {directory_path}")

# Validate that the CSV is present in Landing.
# dbutils.fs.ls raises when the path does not exist instead of returning an
# empty listing, so a plain truthiness check never reaches the "not found"
# branch. The call is wrapped and the not-found failure is translated into an
# empty listing; the exception class varies by runtime, hence the message check.
landing_listing = []
missing_cause = None
try:
    landing_listing = dbutils.fs.ls(str(LANDING_CSV))
except Exception as err:
    # Any failure other than "path not found" propagates unchanged.
    if "java.io.FileNotFoundException" not in str(err):
        raise
    missing_cause = err

if not landing_listing:
    print(f"DEBUG: Arquivo não encontrado em {LANDING_CSV}")
    # Chain the underlying error (when there is one) so the root cause stays visible.
    raise FileNotFoundError(f"Arquivo não encontrado em {LANDING_CSV}") from missing_cause

print(f"DEBUG: Arquivo encontrado em {LANDING_CSV}")

# Move the CSV from Landing into the Raw partition.
landing_src = str(LANDING_CSV)
raw_dst = str(RAW_CSV)
# Remove whatever already sits at the destination before moving the file in.
dbutils.fs.rm(raw_dst, recurse=True)
dbutils.fs.mv(landing_src, raw_dst)
print(f"DEBUG: Arquivo movido de {landing_src} para {raw_dst}")

# Convert the CSV into a single Parquet file.
# The CSV is read with a header row and inferred column types.
df = (spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .csv(f"dbfs:{RAW_CSV}"))
print(f"DEBUG: Arquivo CSV lido de {RAW_CSV}, total de linhas: {df.count()}")

# The write goes to a temporary directory first; coalesce(1) reduces the
# DataFrame to one partition so a single .parquet file is expected there.
tmp_dir = str(RAW_PART / "_tmp_parquet")
df.coalesce(1).write.mode("overwrite").parquet(f"dbfs:{tmp_dir}")
print(f"DEBUG: Arquivo Parquet temporário salvo em {tmp_dir}")

# Pick the first .parquet entry from the temporary directory. Fail with an
# explicit error (instead of a bare IndexError) when none is present.
part_file = next(
    (f.path for f in dbutils.fs.ls(f"dbfs:{tmp_dir}") if f.path.endswith(".parquet")),
    None,
)
if part_file is None:
    raise FileNotFoundError(f"Nenhum arquivo .parquet encontrado em {tmp_dir}")

# Replace any previous output, promote the part file to its final name and
# drop the temporary directory.
dbutils.fs.rm(f"dbfs:{PARQUET_OUT}", recurse=True)
dbutils.fs.mv(part_file, f"dbfs:{PARQUET_OUT}")
dbutils.fs.rm(f"dbfs:{tmp_dir}", recurse=True)
print(f"DEBUG: Arquivo Parquet final salvo em {PARQUET_OUT}")

# Store the output locations as task values for the next tasks.
task_key = "Execute_process_file_to_parquet"  # Actual task name in the Workflow
# Insertion order matches the order in which the values are published.
published_paths = {
    "raw_csv": RAW_CSV,
    "raw_dir": RAW_PART,
    "parquet": PARQUET_OUT,
}
try:
    for value_name, location in published_paths.items():
        dbutils.jobs.taskValues.set(key=value_name, value=str(location))
    print(f"DEBUG: Task values successfully set - raw_csv: {RAW_CSV}")
except Exception as err:
    # Log the failure for the run output, then let it propagate.
    print(f"DEBUG: Failed to set task values: {err}")
    raise

# Debug: read back each stored task value and print it.
# 'Not set' is passed as debugValue for every lookup.
print("DEBUG: Task values set:")
for value_key in ("raw_csv", "raw_dir", "parquet"):
    stored_value = dbutils.jobs.taskValues.get(
        taskKey=task_key, key=value_key, debugValue="Not set"
    )
    print(f"  {value_key}: {stored_value}")