In [16]:
import os
from datetime import datetime
from io import BytesIO

import boto3
from dotenv import load_dotenv
from pyspark.sql import SparkSession

In [17]:
# ================================================================================
# STEP 1: LOAD ENVIRONMENT VARIABLES
# ================================================================================
# Load the key/value pairs from the container's .env file into the process
# environment so they can be read with os.getenv below.
load_dotenv("/home/jovyan/.env")

# MinIO connection settings; each constant mirrors one environment variable.
MINIO_ENDPOINT     = os.getenv("MINIO_ENDPOINT")
MINIO_ACCESS_KEY   = os.getenv("MINIO_ROOT_USER")
MINIO_SECRET_KEY   = os.getenv("MINIO_ROOT_PASSWORD")
BUCKET_BRONZE      = os.getenv("MINIO_BUCKET_BRONZE")

# Fail fast when a setting is absent or empty. os.getenv returns None for an
# unset variable, and that None would otherwise be handed to the S3 client and
# the upload call later in the notebook, surfacing as a confusing error far
# from its cause.
_missing_env_vars = [
    env_name
    for env_name, env_value in (
        ("MINIO_ENDPOINT", MINIO_ENDPOINT),
        ("MINIO_ROOT_USER", MINIO_ACCESS_KEY),
        ("MINIO_ROOT_PASSWORD", MINIO_SECRET_KEY),
        ("MINIO_BUCKET_BRONZE", BUCKET_BRONZE),
    )
    if not env_value
]
if _missing_env_vars:
    raise EnvironmentError(
        f"Faltan variables de entorno requeridas: {', '.join(_missing_env_vars)}"
    )

In [18]:
# ================================================================================
# STEP 2: INITIALISE SPARK AND READ THE LOCAL CSV
# ================================================================================
# Local path of the raw CSV inside the container.
RUTA_LOCAL_CSV = "/home/jovyan/datos/csv/pacientes_crudo.csv"

# Reuse the active Spark session if one exists, otherwise create it.
spark = (
    SparkSession.builder
    .appName("Subida archivo crudo a MinIO")
    .getOrCreate()
)

# Read the CSV, treating its first line as the column header.
df_spark = spark.read.csv(RUTA_LOCAL_CSV, header=True)

25/05/09 15:02:32 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
25/05/09 15:02:32 INFO InMemoryFileIndex: It took 7 ms to list leaf files for 1 paths.
25/05/09 15:02:32 INFO InMemoryFileIndex: It took 6 ms to list leaf files for 1 paths.
25/05/09 15:02:32 INFO FileSourceStrategy: Pushed Filters: 
25/05/09 15:02:32 INFO FileSourceStrategy: Post-Scan Filters: (length(trim(value#74, None)) > 0)
25/05/09 15:02:32 INFO MemoryStore: Block broadcast_10 stored as values in memory (estimated size 201.7 KiB, free 434.0 MiB)
25/05/09 15:02:32 INFO MemoryStore: Block broadcast_10_piece0 stored as bytes in memory (estimated size 35.0 KiB, free 433.9 MiB)
25/05/09 15:02:32 INFO BlockManagerInfo: Added broadcast_10_piece0 in memory on 41a730c8830f:35003 (size: 35.0 KiB, free: 434.3 MiB)
25/05/09 15:02:32 INFO SparkContext: Created broadcast 10 from csv at NativeMethodAccessorImpl.java:0
25/05/09 15:02:32 INFO FileSourceScanExec: Planning scan wi

In [19]:
# ================================================================================
# STEP 3: CONVERT TO PANDAS AND UPLOAD TO MINIO
# ================================================================================
# Collect the Spark DataFrame into a single in-memory pandas DataFrame.
# NOTE(review): the uploaded object is a re-serialised copy of the data
# (Spark read -> pandas -> to_csv), not the original file bytes — confirm this
# is acceptable for a "raw" bronze upload.
df_pandas = df_spark.toPandas()

# Destination object key in MinIO, suffixed with a local-time timestamp.
# The format has minute resolution, so two runs within the same minute produce
# the same key.
timestamp = datetime.now().strftime("%Y%m%d%H%M")
ruta_objeto_s3 = f"bronze/pacientes_crudo_{timestamp}.csv"

# boto3 S3 client pointed at the MinIO endpoint, using the credentials loaded
# from the environment.
s3 = boto3.client(
    "s3",
    endpoint_url=MINIO_ENDPOINT,
    aws_access_key_id=MINIO_ACCESS_KEY,
    aws_secret_access_key=MINIO_SECRET_KEY
)

# Serialise the DataFrame as CSV (without the index column) into an in-memory
# buffer, then rewind so the upload reads from the beginning.
buffer = BytesIO()
df_pandas.to_csv(buffer, index=False)
buffer.seek(0)

s3.upload_fileobj(buffer, BUCKET_BRONZE, ruta_objeto_s3)

print(f"✅ Archivo subido a s3://{BUCKET_BRONZE}/{ruta_objeto_s3}")

25/05/09 15:02:54 INFO FileSourceStrategy: Pushed Filters: 
25/05/09 15:02:54 INFO FileSourceStrategy: Post-Scan Filters: 
25/05/09 15:02:54 INFO MemoryStore: Block broadcast_13 stored as values in memory (estimated size 201.6 KiB, free 433.5 MiB)
25/05/09 15:02:54 INFO MemoryStore: Block broadcast_13_piece0 stored as bytes in memory (estimated size 35.0 KiB, free 433.5 MiB)
25/05/09 15:02:54 INFO BlockManagerInfo: Added broadcast_13_piece0 in memory on 41a730c8830f:35003 (size: 35.0 KiB, free: 434.3 MiB)
25/05/09 15:02:54 INFO SparkContext: Created broadcast 13 from toPandas at /tmp/ipykernel_3329/4205248700.py:4
25/05/09 15:02:54 INFO FileSourceScanExec: Planning scan with bin packing, max size: 4194304 bytes, open cost is considered as scanning 4194304 bytes.
25/05/09 15:02:54 INFO SparkContext: Starting job: toPandas at /tmp/ipykernel_3329/4205248700.py:4
25/05/09 15:02:54 INFO DAGScheduler: Got job 7 (toPandas at /tmp/ipykernel_3329/4205248700.py:4) with 10 output partitions
25/05

✅ Archivo subido a s3://dev-bronze/bronze/pacientes_crudo_202505091502.csv
