In [1]:
%pip install -e .

Obtaining file:///home/sagemaker-user/athena_bridge
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: athena_bridge
  Building editable for athena_bridge (pyproject.toml) ... [?25ldone
[?25h  Created wheel for athena_bridge: filename=athena_bridge-0.0.1-0.editable-py3-none-any.whl size=6993 sha256=077e21472b861678dd463365e0107b30ef21fab049355af0ff10008643d7bffd
  Stored in directory: /tmp/pip-ephem-wheel-cache-xx8le3op/wheels/32/99/bc/4c7ada3e84e2673f4d4776e044d89dca78028a63859a1ae19e
Successfully built athena_bridge
Installing collected packages: athena_bridge
  Attempting uninstall: athena_bridge
    Found existing installation: athena_bridge 0.0.1
    Uninstalling athena_bridge-0.0.1:
      Successfully uninstalled athena_bridge-0.0.1
Successful

In [2]:
python = True # True to run with Python (athena_bridge), False to run with PySpark

In [3]:
# Backend selection: both branches bind the same names (F, T, Window) so that the
# cells below are written once and run against either backend.
if python:
  import athena_bridge.functions as F
  import athena_bridge.data_types as T
  from athena_bridge.window import Window as W
  # Rebind Window to an instance of the athena_bridge class; later cells call
  # Window.orderBy(...) on this name.
  # NOTE(review): presumably required because orderBy is an instance method in athena_bridge — confirm.
  Window = W()
  from athena_bridge.dataproc_athena_bridge import DataprocAthenaBridge
else:
  import pyspark.sql.functions as F
  import pyspark.sql.types as T
  from pyspark.sql.window import Window
  from dataproc_sdk.dataproc_sdk_datiopysparksession.datiopysparksession import DatioPysparkSession


In [None]:
if python:
  # The "datatemp" database does not exist by default. Any existing Athena database can be used
  # as the temporary database, or a new one can be created for this purpose. To use datatemp,
  # create it by running the following statement in Athena:
  # CREATE DATABASE datatemp
  # NOTE(review): the value assigned below is 'temp_db', not datatemp — confirm which one is intended.

  base_datos_temporal = 'temp_db'
  directorio_temporal =  's3://sagemaker-studio-565871520366-q86ddi3b9u/temporal'

  # When creating the reader, pass a database that is available in our Athena sandbox workgroup.
  # Temporary tables are created in this database for the sandbox files that are read
  # directly as files, and also when the df.cache() method is used.
  # The temporary tables and temporary files are removed at the end by calling the reader's
  # exit method.
  dataproc = DataprocAthenaBridge(base_datos_temporal, directorio_temporal)
else:
  dataproc = DatioPysparkSession().get_or_create()

In [None]:
# Load the sandbox Parquet dataset through whichever session was created above.
df = (
    dataproc.read()
    .parquet("s3://ada-eu-south-2-sbx-live-es-finn-data/data/sandboxes/finn/data/Finance_A_A/portal_contable/disenyo_criterios_contables/otros/inventario_delta/")
)

In [None]:
# Location of the CSV files to load.
ruta_lectura = 's3://sagemaker-studio-565871520366-q86ddi3b9u/datos_poblacion/'

# Reader options, applied one at a time in the order listed.
opciones_csv = (
    ('inferSchema', 'false'),
    ('header', 'true'),
    ('sep', ';'),
    ('encoding', 'latin1'),  # force the encoding
)

lector_csv = dataproc.read()
for clave, valor in opciones_csv:
    lector_csv = lector_csv.option(clave, valor)

df_csv = lector_csv.csv(ruta_lectura)

In [None]:
# Quick look at the start of the CSV frame to check that the read worked.
df_csv.head()

In [None]:
# Add a constant 'segundos' column; it is the input for the from_unixtime calls that follow.
df2 = df_csv.withColumn(
    'segundos',
    F.lit(1428476400),
)

In [None]:
# Exercise from_unixtime twice: once with a column name and no format,
# once with a Column object and an explicit "yyyy-MM-dd" format.
df2 = (
    df2
    .withColumn('from_unix', F.from_unixtime('segundos'))
    .withColumn('from_unix_formato', F.from_unixtime(F.col('segundos'), "yyyy-MM-dd"))
)

In [None]:
# Exercise unix_timestamp twice: once with no arguments,
# once on the formatted string column with a matching "yyyy-MM-dd" format.
df2 = (
    df2
    .withColumn('unix_timestamp', F.unix_timestamp())
    .withColumn('unix_timestamp_2', F.unix_timestamp('from_unix_formato', "yyyy-MM-dd"))
)

In [None]:
# Third unix_timestamp variant: the input is the result of to_date and the
# format argument is passed explicitly as None.
df3 = df2.withColumn(
    'unix_timestamp_3',
    F.unix_timestamp(F.to_date('from_unix_formato'), None),
)

In [None]:
# Inspect the start of df3 after adding the timestamp columns.
df3.head()

In [None]:
# Add a 'repeat' column built with F.repeat on account_branch_id and a count of 2.
df3 = df3.withColumn(
    'repeat',
    F.repeat('account_branch_id', 2),
)

In [None]:
# Add an 'rlike' column: the result of matching part_closing_date against '^2025-05.'.
# The trailing '.' is unescaped, so it stands for any single character after "2025-05".
df3 = df3.withColumn(
    'rlike',
    F.col('part_closing_date').rlike('^2025-05.'),
)

In [None]:
# Replace every digit of part_closing_date with '*'.
# The pattern is written as a raw string: '\d' inside a plain literal is an invalid
# escape sequence, which Python only tolerates with a DeprecationWarning/SyntaxWarning.
# The runtime value of the pattern is unchanged.
df3 = df3.withColumn('regexp_replace',
    F.regexp_replace(F.col("part_closing_date"), r'\d', '*'))

In [None]:
# Inspect the start of df3 after adding the repeat / rlike / regexp_replace columns.
df3.head()

In [None]:
# Build the 'ventas' column in two steps: first a constant 10 for every row,
# then 100 where account_branch_id is '0997' and the existing value otherwise.
df3 = (
    df3
    .withColumn('ventas', F.lit(10))
    .withColumn(
        'ventas',
        F.when(F.col('account_branch_id') == '0997', F.lit(100)).otherwise(F.col('ventas')),
    )
)

In [None]:
# Window with no partitioning, ordered by ventas in ascending order.
ventana = Window.orderBy(F.col("ventas").asc())

# Keep the branch id and ventas, and add the ntile(4) bucket (quartile) computed over the window.
columnas_cuartiles = [
    F.col("account_branch_id"),
    F.col("ventas"),
    F.ntile(4).over(ventana).alias("ntile"),
]
df_cuartiles = df3.select(*columnas_cuartiles)

In [None]:
# Show the ntile bucket for branch '0997', the branch whose ventas was set to 100.
(
    df_cuartiles
    .filter(F.col('account_branch_id') == '0997')
    .head()
)

In [None]:
# NOTE(review): this cell cannot run on a fresh kernel as written — neither `wr`
# (presumably `import awswrangler as wr`) nor `lector_athena` is defined in any cell
# shown in this notebook. It also rebinds `df`, which earlier held the Parquet read result.
import uuid
# Random 8-hex-character suffix so that each run produces its own table name.
table_name = f"temp_{uuid.uuid4().hex[:8]}"
# Store the metadata of the Parquet dataset at `path` as table `table_name` in the given database.
# NOTE(review): the return value is presumably metadata rather than a DataFrame — confirm before reusing `df`.
df = wr.s3.store_parquet_metadata(
                path="s3://ada-eu-south-2-sbx-live-es-finn-data/data/sandboxes/finn/data/Finance_A_A/portal_contable/disenyo_criterios_contables/otros/inventario_delta/",
                database=lector_athena._database_tmp,
                table=table_name,
                dataset = True,
                mode="overwrite"
            )

In [None]:
# Display the generated temporary table name.
table_name