In [6]:
%iam_role arn:aws:iam::116948010203:role/Poc-alertas-glue

%region us-east-1

%%tags
{
    "creado-por": "santiago.castro@externos-cl.cencosud.com",
    "apl": "apl1214",
    "unidad-negocio": "ccom",
    "bandera": "cencommerce",
    "plataforma": "eks",
    "version-so": "1.25",
    "Name": "CO-ALERT-POC",
    "ambiente": "staging",
    "cuenta": "116948010203",
    "pais": "co",
    "ceco": "CVO1007301",
    "Terraform": "no",
    "environment": "staging",
    "aplicacion": "co alert poc",
    "ApplicationName": "CO-ALERT-POC",
    "propietario": "humbertolares@cencosud.cl",
    "proyecto": "CENCO-PIM",
    "epm": "opex",
    "Owner": "Humberto Lares",
    "tribe": "Digital Retail Backbone"
}

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
It looks like there is a newer version of the kernel available. The latest version is 1.0.9 and you have 1.0.8 installed.
Please run `pip install --upgrade aws-glue-sessions` to upgrade your kernel
Current iam_role is None
iam_role has been set to arn:aws:iam::116948010203:role/Poc-alertas-glue.
Previous region: us-east-1
Setting new region to: us-east-1
Region is set to: us-east-1
Tag {'creado-por': 'santiago.castro@externos-cl.cencosud.com', 'apl': 'apl1214', 'unidad-negocio': 'ccom', 'bandera': 'cencommerce', 'plataforma': 'eks', 'version-so': '1.25', 'Name': 'CO-ALERT-POC', 'ambiente': 'staging', 'cuenta': '116948010203', 'pais': 'co', 'ceco': 'CVO1007301', 'Terraform':

In [11]:
%idle_timeout 15
%glue_version 5.0
%worker_type G.1X
%number_of_workers 4

Current idle_timeout is None minutes.
idle_timeout has been set to 15 minutes.
Setting Glue version to: 5.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 4


In [9]:
import sys
import logging
from datetime import datetime
from pyspark.context import SparkContext
from awsglue.context import GlueContext
import pyspark.sql.functions as F

# --- Logging setup for the Glue job (CloudWatch-compatible) ---
# The root logger is used, so the INFO threshold applies to every record that
# propagates up to it.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Attach a stdout handler only when the root logger has none yet; re-running
# this cell therefore does not stack duplicate handlers.
if len(logger.handlers) == 0:
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

# --- Custom log helper that keeps a uniform message format ---
# Level names accepted by log(); "WARN" is kept as an alias of "WARNING".
_LOG_LEVELS = {
    "DEBUG": logging.DEBUG,
    "INFO": logging.INFO,
    "WARN": logging.WARNING,
    "WARNING": logging.WARNING,
    "ERROR": logging.ERROR,
    "CRITICAL": logging.CRITICAL,
}


def log(msg, level="INFO"):
    """Emit ``msg`` through the module-level ``logger`` at the given level.

    Args:
        msg: Message to log.
        level: Level name, matched case-insensitively and ignoring surrounding
            whitespace ("DEBUG", "INFO", "WARNING"/"WARN", "ERROR",
            "CRITICAL"). Any unrecognised value falls back to DEBUG.
    """
    # Normalise first so that "warning" or " Error " are not silently
    # downgraded to DEBUG (which an INFO-level logger would discard).
    level_name = str(level).strip().upper()
    logger.log(_LOG_LEVELS.get(level_name, logging.DEBUG), msg)

# --- S3 source and destination locations ---
BUCKET_LANDING = "s3://landing-data-poc-cl2025/"
BUCKET_STAGE = "s3://stage-data-poc-cl2025/"

# Table prefixes inside each bucket (the trailing slash is part of the prefix).
table_id = "BQ_CO_SM_SALES_01/"
table_id2 = "BQ_CO_SM_SALES_02/"

# Full paths: TABLE is read from the landing bucket, TABLE2 is written to the
# stage bucket.
TABLE = BUCKET_LANDING + table_id
TABLE2 = BUCKET_STAGE + table_id2

# --- Initialise the GlueContext ---
# getOrCreate() returns the already-running SparkContext when there is one.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session

# --- Read the source table from S3 (Parquet) ---
log(f"Leyendo datos desde: {TABLE}")
df = spark.read.parquet(TABLE)
# NOTE: count() is a Spark action, so a job runs here just to report the size.
log(f"Filas leídas: {df.count()}")

# --- Transformation ---
# Project the five sales columns under their target names (same column order
# as the source selection), then keep only rows with UNITS strictly positive.
log("Transformando columnas y filtrando UNITS > 0...")
source_to_target = [
    ("ean_venta", "EAN"),
    ("sku_venta", "ITEM_ID"),
    ("tienda_venta", "LOCAL_ID"),
    ("fecha_venta", "DATE"),
    ("unidades_venta", "UNITS"),
]
projected = df.select(
    *[F.col(source).alias(target) for source, target in source_to_target]
)
df_transformed = projected.filter(F.col("UNITS") > 0)

# NOTE: a second action; the transformation is evaluated again for the write.
log(f"Filas luego de la transformación: {df_transformed.count()}")

# --- Write to the S3 destination ---
# Save mode "overwrite" is used for the target path.
log(f"Escribiendo datos transformados en: {TABLE2}")
writer = df_transformed.write.mode("overwrite")
writer.parquet(TABLE2)

log("Proceso completado correctamente.")



