In [None]:
# Editable install of the project located in the parent directory
# (presumably the athena_bridge package imported in the next cell — confirm).
%pip install -e ..

In [1]:
# ================================
# Quickstart: Athena Bridge + Spark (+ optional AWS Wrangler)
# ================================

# Backend switch (it is read again further down, when the session is created):
#   True  -> run with athena_bridge (Python)
#   False -> run with PySpark (EMR cluster or Glue Interactive Session)
python = True

# --- Main imports (Spark-like API exposed by athena_bridge)
# Both branches bind the same aliases (F, T, Window), so the rest of the notebook is
# written once against those names whichever backend is selected.
if python:
    import athena_bridge.functions as F
    import athena_bridge.data_types as T
    from athena_bridge.window import Window as W
    # Here Window is bound to an *instance* of the athena_bridge class, whereas the
    # PySpark branch binds the class itself; later code calls Window.orderBy(...) on
    # whichever of the two is in scope.
    Window = W()
    # Session factory; only available (and only needed) on the athena_bridge path.
    from athena_bridge.spark_athena_bridge import get_spark
else:
    import pyspark.sql.functions as F
    import pyspark.sql.types as T
    from pyspark.sql.window import Window
    # Session builder; only available (and only needed) on the PySpark path.
    from pyspark.sql import SparkSession

# --- Glue/Athena/S3 integration
import awswrangler as wr

# ================================
# 1) Configuration parameters
# ================================
# --- Settings handed to the athena_bridge session ---
# Temporary database in Glue/Athena.
BASE_DATOS_TMP = "__YOUR_ATHENA_DATABASE_TEMP_FOR_ATHENA_BRIDGE__"
# S3 staging path.
DIRECTORIO_TMP_S3 = "s3://__YOUR_S3_PATH_TEMP_FOR_ATHENA_BRIDGE__"
# Athena workgroup to run under.
WORKGROUP = "__YOUR_ATHENA_WORKGROUP__"

# --- Input locations (example paths, adjust as needed) ---
# Athena reads CSV data by directory, not by file: give the prefix that contains the
# files, never a path ending in the CSV file name. Every object under that prefix must
# be a CSV with the same structure, so to read one single CSV file, place it alone in
# its own directory.
S3_IN_CSV = "s3://__YOUR_S3_PATH_TO_READ_CSV_/"
# JSON Lines input (optional).
S3_IN_JSONL = "s3://__YOUR_S3_PATH_TO_READ_JSON_/"
# Existing Parquet input (optional).
S3_IN_PARQUET = "s3://__YOUR_S3_PATH_TO_READ_PARQUET_/"

# --- Output ---
# Prefix where the Parquet dataset is written.
S3_OUT_PARQUET = "s3://__YOUR_S3_PATH_TO_SAVE_PARQUET__/"
# Table name in Glue/Athena.
GLUE_TABLE = "example_table"

# ================================
# 2) Spark session
# ================================
# athena_bridge backend: the session receives the temporary database, the S3 staging
# prefix (DIRECTORIO_TMP_S3) and the workgroup — ideally a workgroup with
# "Enforce workgroup configuration" enabled.
# PySpark backend: reuse the active session or create one.
# Only the selected branch of the conditional expression is evaluated.
spark = (
    get_spark(database_tmp=BASE_DATOS_TMP, path_tmp=DIRECTORIO_TMP_S3, workgroup=WORKGROUP)
    if python
    else SparkSession.builder.getOrCreate()
)

# ================================
# 3) Read common formats
# ================================

# 3a) CSV: the first row is the header and fields are separated by ';'.
#     Configure the reader first, then load every file under the input prefix.
csv_reader = spark.read.format("csv").option("header", True).option("sep", ";")
df_csv = csv_reader.load(S3_IN_CSV)

# 3b) JSON Lines (one JSON object per line). Omit if you don't have JSONL.
# df_jsonl = (
#     spark.read
#          .format("json")
#          .option("multiLine", False)   # JSON Lines
#          .load(S3_IN_JSONL)
# )

# 3c) Existing Parquet (if you have another Parquet folder)
# df_parquet_in = spark.read.format("parquet").load(S3_IN_PARQUET)

# ================================
# 4) Write as Parquet (snappy by default)
# ================================
# Persist the freshly read CSV data as a Parquet dataset.
# Caution: "overwrite" replaces whatever already exists under the target prefix.
parquet_writer = df_csv.write.format("parquet").mode("overwrite")
parquet_writer.save(S3_OUT_PARQUET)

# ================================
# 5) Publish metadata to Glue (so Athena can query it)
# ================================
# Register the Parquet output in the Glue catalog.
# Important: the prefix passed as `path` must contain ONLY .parquet objects.
wr.s3.store_parquet_metadata(
    database=BASE_DATOS_TMP,
    table=GLUE_TABLE,
    path=S3_OUT_PARQUET,
    mode="overwrite",   # create the table, or update it if it already exists
    dataset=True,       # the prefix is a dataset, with or without Hive-style partitions
)

# ================================
# 6) Read the table from Glue/Athena
# ================================
# Load the table that was just registered, addressed as <database>.<table>, and
# (7a) attach a constant column to it in the same expression.
qualified_table_name = f"{BASE_DATOS_TMP}.{GLUE_TABLE}"

# ================================
# 7) DataFrame operations
# ================================
df_table = spark.read.table(qualified_table_name).withColumn("total_amount", F.lit(1000))

# 7b) Simple filter on the new column
df_filtered = df_table.filter(F.col("total_amount") >= 1000)

# 7c) Example aggregation: count the rows that passed the filter
df_agg = df_filtered.agg(
    F.count("total_amount").alias("row_count")
)

# 7d) Window (didactic example): number the rows following a descending order
#     on the column added in 7a.
win = Window.orderBy(F.col("total_amount").desc())
row_number_over_win = F.row_number().over(win)
df_rank = df_filtered.withColumn("rank_by_amount", row_number_over_win)

# (Optional) Preview both results, each under its own label.
# Note: what head() returns depends on the backend. In PySpark, head() without an
# argument returns only the first Row; the athena_bridge run recorded in this cell's
# output printed a small table of rows instead.
for label, frame in (
    ("Filtered + ranking:", df_rank),
    ("Row count after filter:", df_agg),
):
    print(label)
    print(frame.head())

# ================================
# 8) Clean shutdown
# ================================
# Ends the session. In the athena_bridge run recorded in this cell's output, this step
# also reported dropping the session's temporary table and cleaning up temporaries.
spark.stop()

2025-11-10 12:40:41,000	INFO worker.py:1852 -- Started a local Ray instance.


Filtered + ranking:
  grupo_quinquenal_de_edad              islas   sexo               periodo  \
0         Todas las edades  07 Balears, Illes  Total    1 de julio de 2002   
1         Todas las edades  07 Balears, Illes  Total  1 de octubre de 2002   
2         Todas las edades  07 Balears, Illes  Total    1 de enero de 2003   
3         Todas las edades  07 Balears, Illes  Total    1 de enero de 2002   
4         Todas las edades  07 Balears, Illes  Total    1 de abril de 2002   

     total  total_amount rank_by_amount  
0  866.087          1000              1  
1       ..          1000              2  
2  883.410          1000              3  
3  845.130          1000              4  
4       ..          1000              5  
Row count after filter:
   row_count
0      67830
✔️ Eliminada tabla temporal: temp_db.temp_def85a12
✅ Sesión finalizada y temporales eliminados.
