In [1]:
"""
================================================================================
Nombre del Script: refinar_pacientes_silver.py
Autor: Félix Cárdenas
Fecha de Creación: 2025-05-08
Última Modificación: 2025-05-08
Versión: 1.0.0

Descripción:
Este script forma parte de la capa SILVER del proyecto BigData_Project.
Se encarga de leer los datos crudos de pacientes desde el bucket dev-bronze,
aplicar reglas estrictas de calidad y limpieza con Spark y luego guardar
el resultado como archivo Parquet en el bucket dev-silver.

Dependencias:
- Python >= 3.8
- Librerías: pyspark, pandas, boto3, python-dotenv
"""



In [2]:
# ================================================================================
# PASO 1: IMPORTACIÓN DE LIBRERÍAS
# ================================================================================
import os
from io import BytesIO
from datetime import datetime
from dotenv import load_dotenv
import boto3
import pandas as pd
from pathlib import Path
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col, to_date


In [3]:
# ================================================================================
# STEP 2: LOAD ENVIRONMENT VARIABLES
# ================================================================================
# Load the variables declared in the .env file into the process environment.
load_dotenv("/home/jovyan/.env")

MINIO_ENDPOINT     = os.getenv("MINIO_ENDPOINT")
MINIO_ACCESS_KEY   = os.getenv("MINIO_ROOT_USER")
MINIO_SECRET_KEY   = os.getenv("MINIO_ROOT_PASSWORD")
BUCKET_BRONZE      = os.getenv("MINIO_BUCKET_BRONZE")
BUCKET_SILVER      = os.getenv("MINIO_BUCKET_SILVER")

# Fail fast when any setting is absent or empty. os.getenv returns None for an
# unset variable, and a None endpoint makes boto3 fall back to its default AWS
# endpoint (sending the MinIO credentials there), while a None bucket name only
# fails later with a much less obvious validation error.
_variables_requeridas = {
    "MINIO_ENDPOINT": MINIO_ENDPOINT,
    "MINIO_ROOT_USER": MINIO_ACCESS_KEY,
    "MINIO_ROOT_PASSWORD": MINIO_SECRET_KEY,
    "MINIO_BUCKET_BRONZE": BUCKET_BRONZE,
    "MINIO_BUCKET_SILVER": BUCKET_SILVER,
}
_variables_faltantes = [
    nombre_variable
    for nombre_variable, valor in _variables_requeridas.items()
    if not valor
]
if _variables_faltantes:
    raise EnvironmentError(
        "Faltan variables de entorno requeridas: " + ", ".join(_variables_faltantes)
    )


In [4]:
# ================================================================================
# STEP 3: CREATE THE SPARKSESSION
# ================================================================================
# Settings applied to the session builder: the S3A connector is pointed at the
# MinIO endpoint with path-style access, and two local jars are registered.
# NOTE(review): the captured run log reports both jar paths as
# "does not exist, skipping" - confirm where the jars actually live.
_opciones_spark = {
    "spark.hadoop.fs.s3a.endpoint": MINIO_ENDPOINT,
    "spark.hadoop.fs.s3a.access.key": MINIO_ACCESS_KEY,
    "spark.hadoop.fs.s3a.secret.key": MINIO_SECRET_KEY,
    "spark.hadoop.fs.s3a.path.style.access": True,
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.jars": "/home/jovyan/jars/hadoop-aws-3.3.2.jar,/home/jovyan/jars/aws-java-sdk-bundle-1.11.1026.jar",
}

_constructor_spark = SparkSession.builder.appName("Transformación SILVER")
for _clave, _valor in _opciones_spark.items():
    _constructor_spark = _constructor_spark.config(_clave, _valor)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = _constructor_spark.getOrCreate()


25/05/10 19:22:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/10 19:22:59 WARN DependencyUtils: Local jar /home/jovyan/jars/hadoop-aws-3.3.2.jar does not exist, skipping.
25/05/10 19:22:59 WARN DependencyUtils: Local jar /home/jovyan/jars/aws-java-sdk-bundle-1.11.1026.jar does not exist, skipping.
25/05/10 19:23:00 INFO SparkContext: Running Spark version 3.5.5
25/05/10 19:23:00 INFO SparkContext: OS info Linux, 5.15.167.4-microsoft-standard-WSL2, amd64
25/05/10 19:23:00 INFO SparkContext: Java version 11.0.27
25/05/10 19:23:00 INFO ResourceUtils: No custom resources configured for spark.driver.
25/05/10 19:23:00 INFO SparkContext: Submitted application: Transformación SILVER
25/05/10 19:23:00 INFO ResourceProfile: Default ResourceProfile created, executor resources: Map(cores -> name: cores, amount: 1, script: , vendor: , memory -> name: memory, amount: 1024, script: , vendor: , offHeap -> name: off

In [5]:
# ================================================================================
# STEP 4: READ TODAY'S FILE FROM MINIO (bronze) with Boto3 + Spark
# ================================================================================

# Initialise the S3 client against the MinIO endpoint
s3 = boto3.client(
    "s3",
    endpoint_url=MINIO_ENDPOINT,
    aws_access_key_id=MINIO_ACCESS_KEY,
    aws_secret_access_key=MINIO_SECRET_KEY,
)

# Date stamp (YYYYMMDD) that today's object keys are expected to contain
today = datetime.now().strftime("%Y%m%d")
prefix = "LOCAL_PACIENTES/"

# A single list_objects_v2 call returns at most 1000 keys, so walk every page
# to avoid silently missing today's file in a large prefix.
objetos = [
    obj
    for pagina in s3.get_paginator("list_objects_v2").paginate(
        Bucket=BUCKET_BRONZE, Prefix=prefix
    )
    for obj in pagina.get("Contents", [])
]

# Keep only today's CSV objects
candidatos = [
    obj
    for obj in objetos
    if today in obj["Key"] and obj["Key"].endswith(".csv")
]

if not candidatos:
    raise FileNotFoundError(f"No se encontró archivo CSV con fecha {today} en {BUCKET_BRONZE}/{prefix}")

# Pick the MOST RECENT upload of the day. Listings come back in key order, so
# taking the first match would select the lexicographically smallest key
# rather than the latest file. The key is used as a deterministic tie-breaker.
archivo_bronze = max(
    candidatos, key=lambda obj: (obj["LastModified"], obj["Key"])
)["Key"]

# Download the object to a local temporary file
ruta_local_tmp = f"/tmp/{Path(archivo_bronze).name}"
with open(ruta_local_tmp, "wb") as f:
    s3.download_fileobj(BUCKET_BRONZE, archivo_bronze, f)

# Spark evaluates lazily: the temporary file must remain on disk until an
# action (e.g. toPandas) actually reads the rows.
df_spark = spark.read.option("header", True).csv(ruta_local_tmp)


25/05/10 19:23:01 INFO SharedState: Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir.
25/05/10 19:23:01 INFO SharedState: Warehouse path is 'file:/home/jovyan/spark-warehouse'.
25/05/10 19:23:01 INFO InMemoryFileIndex: It took 29 ms to list leaf files for 1 paths.
25/05/10 19:23:01 INFO InMemoryFileIndex: It took 1 ms to list leaf files for 1 paths.
25/05/10 19:23:03 INFO FileSourceStrategy: Pushed Filters: 
25/05/10 19:23:03 INFO FileSourceStrategy: Post-Scan Filters: (length(trim(value#0, None)) > 0)
25/05/10 19:23:03 INFO CodeGenerator: Code generated in 122.908267 ms
25/05/10 19:23:03 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 201.5 KiB, free 434.2 MiB)
25/05/10 19:23:03 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 34.9 KiB, free 434.2 MiB)
25/05/10 19:23:03 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on 5abaae7821e4:37509 (size: 34.9 KiB, free: 434.4 MiB)


In [6]:
# ================================================================================
# STEP 5: TRANSFORMATION / DATA QUALITY WITH SPARK
# ================================================================================

df_limpio = (
    df_spark
    # Row-level quality rules: non-empty name, age strictly between 0 and 120,
    # non-empty health-insurance value ("obra_social").
    .where(
        (col("nombre").isNotNull() & (col("nombre") != ""))
        & ((col("edad") > 0) & (col("edad") < 120))
        & (col("obra_social").isNotNull() & (col("obra_social") != ""))
    )
    # Keep only the part before the first space of fecha_turno and parse it
    # as a yyyy-MM-dd date.
    .withColumn(
        "fecha_turno",
        to_date(split(col("fecha_turno"), " ").getItem(0), "yyyy-MM-dd"),
    )
    # Discard rows whose date is null after parsing, then exact duplicates.
    .na.drop(subset=["fecha_turno"])
    .dropDuplicates()
)



In [None]:
# ================================================================================
# STEP 6: CONVERT TO PANDAS AND UPLOAD TO MinIO (SILVER)
# ================================================================================

# Collect the cleaned Spark DataFrame into a pandas DataFrame on the driver
df_pandas = df_limpio.toPandas()

# Current timestamp (minute resolution) used to name the output file
timestamp = datetime.now().strftime("%Y%m%d%H%M")

# Object key of the refined file inside the silver bucket
nombre_archivo = f"pacientes_refinados_{timestamp}.parquet"
carpeta_silver = "LOCAL_PACIENTES"
ruta_silver = f"{carpeta_silver}/{nombre_archivo}"

# Serialise to Parquet in memory, then rewind so the upload reads from byte 0
buffer = BytesIO()
df_pandas.to_parquet(buffer, index=False)
buffer.seek(0)

# Upload to MinIO
s3.upload_fileobj(buffer, BUCKET_SILVER, ruta_silver)

# Confirm where the file landed. The notebook's recorded output shows this
# message, but no statement in the cell was producing it.
print(f"✅ Archivo refinado guardado en: s3://{BUCKET_SILVER}/{ruta_silver}")


25/05/10 19:23:04 INFO FileSourceStrategy: Pushed Filters: IsNotNull(edad),IsNotNull(nombre),Not(EqualTo(nombre,)),IsNotNull(obra_social),Not(EqualTo(obra_social,))
25/05/10 19:23:04 INFO FileSourceStrategy: Post-Scan Filters: isnotnull(edad#19),isnotnull(nombre#18),NOT (nombre#18 = ),(cast(edad#19 as int) > 0),(cast(edad#19 as int) < 120),isnotnull(obra_social#20),NOT (obra_social#20 = ),atleastnnonnulls(1, cast(gettimestamp(split(fecha_turno#21,  , -1)[0], yyyy-MM-dd, TimestampType, Some(GMT), false) as date))
25/05/10 19:23:04 INFO BlockManagerInfo: Removed broadcast_2_piece0 on 5abaae7821e4:37509 in memory (size: 34.9 KiB, free: 434.4 MiB)
25/05/10 19:23:04 INFO BlockManagerInfo: Removed broadcast_0_piece0 on 5abaae7821e4:37509 in memory (size: 34.9 KiB, free: 434.4 MiB)
25/05/10 19:23:04 INFO BlockManagerInfo: Removed broadcast_1_piece0 on 5abaae7821e4:37509 in memory (size: 6.4 KiB, free: 434.4 MiB)
25/05/10 19:23:04 INFO CodeGenerator: Code generated in 73.659923 ms
25/05/10 19:

✅ Archivo refinado guardado en: s3://dev-silver/LOCAL_PACIENTES/pacientes_refinados_202505101923.parquet
