In [40]:
"""
================================================================================
Nombre del Script: guardar_csv_bronze.py
Autor: Félix Cárdenas
Fecha de Creación: 2025-05-08
Última Modificación: 2025-05-08
Versión: 1.0.0

Descripción:
Este script forma parte de la capa BRONZE del proyecto BigData_Project.
Lee un archivo CSV local con Spark, lo convierte a Pandas y lo guarda en el
bucket `dev-bronze` de MinIO con una estructura basada en origen y nombre del archivo.

Dependencias:
- Python >= 3.8
- Librerías: pandas, boto3, dotenv, pyspark
"""



In [41]:
# ================================================================================
# PASO 1: IMPORTACIÓN DE LIBRERÍAS
# ================================================================================
import os
import logging
from io import BytesIO
from datetime import datetime
from dotenv import load_dotenv
import boto3
import pandas as pd
from pyspark.sql import SparkSession
from pathlib import Path

In [42]:
# ================================================================================
# STEP 2: LOAD ENVIRONMENT VARIABLES AND BUILD THE DESTINATION KEY
# ================================================================================

# Load the variables defined in the .env file BEFORE reading them.
# os.getenv() only sees what is already in the process environment, so reading
# first would yield None on a fresh kernel (Restart & Run All).
load_dotenv("/home/jovyan/.env")

# MinIO parameters
MINIO_ENDPOINT     = os.getenv("MINIO_ENDPOINT")
MINIO_ACCESS_KEY   = os.getenv("MINIO_ROOT_USER")
MINIO_SECRET_KEY   = os.getenv("MINIO_ROOT_PASSWORD")
BUCKET_BRONZE      = os.getenv("MINIO_BUCKET_BRONZE")

# Fail fast with a clear message if any setting is missing or empty, instead of
# letting a None endpoint/bucket reach the S3 client in a later cell.
_missing_settings = [
    env_name
    for env_name, env_value in (
        ("MINIO_ENDPOINT", MINIO_ENDPOINT),
        ("MINIO_ROOT_USER", MINIO_ACCESS_KEY),
        ("MINIO_ROOT_PASSWORD", MINIO_SECRET_KEY),
        ("MINIO_BUCKET_BRONZE", BUCKET_BRONZE),
    )
    if not env_value
]
if _missing_settings:
    raise EnvironmentError(
        "Missing required environment variables: " + ", ".join(_missing_settings)
    )

# Local path of the file to upload
ruta_csv_local     = "/home/jovyan/datos/csv/pacientes_crudo.csv"

# ===> Base file name (without extension)
nombre_archivo_base = Path(ruta_csv_local).stem

# ===> Domain taken from the file name: whatever precedes the first underscore
dominio = nombre_archivo_base.split("_")[0].lower()

# ===> MinIO folder: always LOCAL_{dominio.upper()}
carpeta_destino = f"LOCAL_{dominio.upper()}"

# ===> Current timestamp (local time, minute resolution)
timestamp = datetime.now().strftime("%Y%m%d%H%M")

# ===> Final file name
nombre_archivo = f"{nombre_archivo_base}_{timestamp}.csv"

# ===> Final object key in MinIO
key_minio = f"{carpeta_destino}/{nombre_archivo}"

In [None]:
# ================================================================================
# STEP 3: READ WITH SPARK AND CONVERT TO PANDAS
# ================================================================================
# Obtain a Spark session named for this job (an existing session is reused).
spark = SparkSession.builder.appName("Guardar CSV en MinIO").getOrCreate()

# Read the local CSV, taking the first row as the column names.
df_spark = spark.read.csv(ruta_csv_local, header=True)

# Convert the Spark DataFrame into a pandas DataFrame for the upload step.
df_pandas = df_spark.toPandas()


25/05/09 15:27:24 INFO InMemoryFileIndex: It took 7 ms to list leaf files for 1 paths.
25/05/09 15:27:24 INFO InMemoryFileIndex: It took 7 ms to list leaf files for 1 paths.
25/05/09 15:27:24 INFO FileSourceStrategy: Pushed Filters: 
25/05/09 15:27:24 INFO FileSourceStrategy: Post-Scan Filters: (length(trim(value#165, None)) > 0)
25/05/09 15:27:24 INFO MemoryStore: Block broadcast_22 stored as values in memory (estimated size 201.6 KiB, free 434.0 MiB)
25/05/09 15:27:24 INFO MemoryStore: Block broadcast_22_piece0 stored as bytes in memory (estimated size 35.0 KiB, free 433.9 MiB)
25/05/09 15:27:24 INFO BlockManagerInfo: Added broadcast_22_piece0 in memory on 41a730c8830f:35003 (size: 35.0 KiB, free: 434.3 MiB)
25/05/09 15:27:24 INFO SparkContext: Created broadcast 22 from csv at NativeMethodAccessorImpl.java:0
25/05/09 15:27:24 INFO FileSourceScanExec: Planning scan with bin packing, max size: 4194304 bytes, open cost is considered as scanning 4194304 bytes.
25/05/09 15:27:24 INFO Spar

In [45]:
# ================================================================================
# STEP 4: SAVE TO MINIO WITH BOTO3
# ================================================================================
# S3 client configured with the MinIO endpoint and credentials read earlier.
s3 = boto3.client(
    service_name="s3",
    endpoint_url=MINIO_ENDPOINT,
    aws_access_key_id=MINIO_ACCESS_KEY,
    aws_secret_access_key=MINIO_SECRET_KEY,
)

# Serialize the DataFrame as CSV (without the index) into an in-memory buffer,
# then rewind it so the upload starts reading from the first byte.
buffer = BytesIO()
df_pandas.to_csv(path_or_buf=buffer, index=False)
buffer.seek(0)

# Upload the buffer to the bronze bucket under the key built in the config cell.
s3.upload_fileobj(Fileobj=buffer, Bucket=BUCKET_BRONZE, Key=key_minio)