# Spark - Escritura

In [1]:
# pip install turfpy

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
import json
from turfpy.measurement import boolean_point_in_polygon
from geojson import Point, Feature
import subprocess
import uuid
from datetime import datetime
import pytz
from pyspark.sql.functions import sum as _sum, max as _max, concat_ws, col, udf, to_timestamp, year, month, dayofmonth, hour, minute, second, to_date, date_format, row_number, desc
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

## Spark session

In [3]:
spark = SparkSession.builder.appName("DataProcessing").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# Parquet

In [4]:
df = spark.read.parquet("/datalake/raw/stagging")

In [5]:
df.show()
df.count()

+--------+----------+-------------------+-----------+------------+-----------------+--------------------+--------------------+--------------------+
|latitude| longitude|               date|customer_id| employee_id|quantity_products|            order_id|             commune|        neighborhood|
+--------+----------+-------------------+-----------+------------+-----------------+--------------------+--------------------+--------------------+
|6.252831|-75.565694|17/06/2024 17:42:59|       1516| Employee_11|               48|ed8e3710-f82d-4ed...|Comuna 10 - La Ca...|         Villa Nueva|
|6.226848|-75.550744|17/06/2024 17:43:16|       1368|Employee_175|               50|074a7666-b263-499...|Comuna 9 - Buenos...|       Bomboná No. 2|
|6.263214|-75.565651|17/06/2024 17:43:01|       1419|Employee_117|               49|5add2bcd-514d-409...|Comuna 10 - La Ca...|               Prado|
|6.288678|-75.556225|17/06/2024 17:43:17|       1319|Employee_135|               47|929fe8c9-fdcf-4df...| Comuna

107

## hadoop

In [6]:
command = "hadoop fs -ls /datalake/raw/stagging | awk '{print $NF}'"
file_names = subprocess.check_output(command, shell=True).decode().split('\n')

command = "hadoop fs -ls /checkpoints/commits | awk '{print $NF}'"
commits_names = subprocess.check_output(command, shell=True).decode().split('\n')

command = "hadoop fs -ls /checkpoints/offsets | awk '{print $NF}'"
offsets_names = subprocess.check_output(command, shell=True).decode().split('\n')

command = "hadoop fs -ls /datalake/raw/stagging/_spark_metadata | awk '{print $NF}'"
metadata_names = subprocess.check_output(command, shell=True).decode().split('\n')

## Identificador de cada tienda
Se crea un store ID para cada tienda teniendo en cuenta: "latitude", "longitude", "commune" y "neighborhood"

In [7]:
df1 = df.withColumn("store_id", concat_ws("_", "latitude", "longitude", "commune", "neighborhood"))
df1.show()

+--------+----------+-------------------+-----------+------------+-----------------+--------------------+--------------------+--------------------+--------------------+
|latitude| longitude|               date|customer_id| employee_id|quantity_products|            order_id|             commune|        neighborhood|            store_id|
+--------+----------+-------------------+-----------+------------+-----------------+--------------------+--------------------+--------------------+--------------------+
|6.252831|-75.565694|17/06/2024 17:42:59|       1516| Employee_11|               48|ed8e3710-f82d-4ed...|Comuna 10 - La Ca...|         Villa Nueva|6.252831_-75.5656...|
|6.226848|-75.550744|17/06/2024 17:43:16|       1368|Employee_175|               50|074a7666-b263-499...|Comuna 9 - Buenos...|       Bomboná No. 2|6.226848_-75.5507...|
|6.263214|-75.565651|17/06/2024 17:43:01|       1419|Employee_117|               49|5add2bcd-514d-409...|Comuna 10 - La Ca...|               Prado|6.263214

## Acumulado de ventas por día y tienda

In [8]:
# Convertir la columna 'date' a tipo timestamp
df1 = df1.withColumn("timestamp", to_timestamp(col("date"), "dd/MM/yyyy HH:mm:ss"))
df1.show()

+--------+----------+-------------------+-----------+------------+-----------------+--------------------+--------------------+--------------------+--------------------+-------------------+
|latitude| longitude|               date|customer_id| employee_id|quantity_products|            order_id|             commune|        neighborhood|            store_id|          timestamp|
+--------+----------+-------------------+-----------+------------+-----------------+--------------------+--------------------+--------------------+--------------------+-------------------+
|6.252831|-75.565694|17/06/2024 17:42:59|       1516| Employee_11|               48|ed8e3710-f82d-4ed...|Comuna 10 - La Ca...|         Villa Nueva|6.252831_-75.5656...|2024-06-17 17:42:59|
|6.226848|-75.550744|17/06/2024 17:43:16|       1368|Employee_175|               50|074a7666-b263-499...|Comuna 9 - Buenos...|       Bomboná No. 2|6.226848_-75.5507...|2024-06-17 17:43:16|
|6.263214|-75.565651|17/06/2024 17:43:01|       1419|Em

In [9]:
# Extraer solo la parte de la fecha
df1 = df1.withColumn("date_only", date_format(col("timestamp"), "yyyy-MM-dd"))
df1.select("date_only").show()

+----------+
| date_only|
+----------+
|2024-06-17|
|2024-06-17|
|2024-06-17|
|2024-06-17|
|2024-06-17|
|2024-06-17|
|2024-06-17|
|2024-06-17|
|2024-06-17|
|2024-06-17|
|2024-06-17|
|2024-06-17|
|2024-06-17|
|2024-06-17|
|2024-06-17|
|2024-06-17|
|2024-06-17|
|2024-06-17|
|2024-06-17|
|2024-06-17|
+----------+
only showing top 20 rows



In [10]:
# Agrupar los datos por fecha y vendedor para calcular el total de ventas por vendedor
df_seller_aggregated = df1.groupBy("date_only", "employee_id") \
    .agg(_sum("quantity_products").alias("total_sales"))
df_seller_aggregated.show()

+----------+------------+-----------+
| date_only| employee_id|total_sales|
+----------+------------+-----------+
|2024-06-17| Employee_41|         85|
|2024-06-17| Employee_97|        100|
|2024-06-17| Employee_89|         50|
|2024-06-17|Employee_145|         49|
|2024-06-17| Employee_81|         37|
|2024-06-17| Employee_95|        136|
|2024-06-17|Employee_137|         46|
|2024-06-17|  Employee_9|        100|
|2024-06-17|Employee_111|         50|
|2024-06-17|Employee_105|         83|
|2024-06-17|  Employee_7|         88|
|2024-06-17|Employee_149|         50|
|2024-06-17|Employee_157|         50|
|2024-06-17|Employee_163|         50|
|2024-06-17| Employee_77|         50|
|2024-06-17| Employee_91|         50|
|2024-06-17|  Employee_3|         88|
|2024-06-17| Employee_85|         50|
|2024-06-17| Employee_65|        165|
|2024-06-17| Employee_33|         48|
+----------+------------+-----------+
only showing top 20 rows



## Ranking de vendedores diarios

In [11]:
# Crear una ventana para particionar por fecha y ordenar por total de ventas de mayor a menor
window_spec = Window.partitionBy("date_only").orderBy(col("total_sales").desc())
# Agregar un número de fila para identificar al vendedor con más ventas cada día
df_seller_ranked_day = df_seller_aggregated.withColumn("rank", row_number().over(window_spec))
df_seller_ranked_day.show()

+----------+------------+-----------+----+
| date_only| employee_id|total_sales|rank|
+----------+------------+-----------+----+
|2024-06-17|Employee_169|        185|   1|
|2024-06-17| Employee_65|        165|   2|
|2024-06-17| Employee_63|        150|   3|
|2024-06-17| Employee_11|        140|   4|
|2024-06-17| Employee_23|        139|   5|
|2024-06-17| Employee_95|        136|   6|
|2024-06-17| Employee_17|        122|   7|
|2024-06-17| Employee_13|        111|   8|
|2024-06-17| Employee_97|        100|   9|
|2024-06-17|  Employee_9|        100|  10|
|2024-06-17|Employee_193|        100|  11|
|2024-06-17|Employee_179|        100|  12|
|2024-06-17| Employee_75|        100|  13|
|2024-06-17| Employee_83|        100|  14|
|2024-06-17|Employee_141|        100|  15|
|2024-06-17|Employee_175|         98|  16|
|2024-06-17| Employee_67|         93|  17|
|2024-06-17| Employee_55|         91|  18|
|2024-06-17| Employee_43|         90|  19|
|2024-06-17|Employee_199|         89|  20|
+----------

## Mayor vendedor del día

In [12]:
# Filtrar para obtener solo el vendedor con más ventas cada día
df_top_seller = df_seller_ranked_day.filter(col("rank") == 1).drop("rank")
df_top_seller.show()

+----------+------------+-----------+
| date_only| employee_id|total_sales|
+----------+------------+-----------+
|2024-06-17|Employee_169|        185|
+----------+------------+-----------+



# Acumulado diario de ventas por tienda

In [13]:
# Agrupar los datos por fecha y tienda para calcular el total de ventas en esa tienda
df_store_aggregated = df1.groupBy("date_only", "latitude", "longitude", "store_id") \
    .agg(_sum("quantity_products").alias("total_store"))
df_store_aggregated.show()

+----------+--------+----------+--------------------+-----------+
| date_only|latitude| longitude|            store_id|total_store|
+----------+--------+----------+--------------------+-----------+
|2024-06-17|6.231016|-75.531541|6.231016_-75.5315...|         50|
|2024-06-17|6.221932|-75.528966|6.221932_-75.5289...|         85|
|2024-06-17|6.255342|-75.555875|6.255342_-75.5558...|         50|
|2024-06-17|6.274861|-75.533979|6.274861_-75.5339...|         50|
|2024-06-17| 6.31383|-75.556608|6.31383_-75.55660...|         37|
|2024-06-17|6.250895|-75.564412|6.250895_-75.5644...|        100|
|2024-06-17| 6.23482| -75.54546|6.23482_-75.54546...|         50|
|2024-06-17|6.263214|-75.565651|6.263214_-75.5656...|         87|
|2024-06-17| 6.22135|-75.548236|6.22135_-75.54823...|         82|
|2024-06-17|6.268129|-75.532389|6.268129_-75.5323...|         50|
|2024-06-17|6.269597|-75.531581|6.269597_-75.5315...|        122|
|2024-06-17|6.271556| -75.54339|6.271556_-75.5433...|         45|
|2024-06-1

In [14]:
# Crear una ventana para particionar por fecha y ordenar por total de ventas de mayor a menor en cada tienda
window_spec2 = Window.partitionBy("date_only").orderBy(col("total_store").desc())
# Agregar un número de fila para identificar la tienda con más ventas cada día
df_store_ranked_day = df_store_aggregated.withColumn("rank", row_number().over(window_spec2))
df_store_ranked_day.show()

+----------+--------+----------+--------------------+-----------+----+
| date_only|latitude| longitude|            store_id|total_store|rank|
+----------+--------+----------+--------------------+-----------+----+
|2024-06-17| 6.23618|-75.557692|6.23618_-75.55769...|        185|   1|
|2024-06-17|6.226864| -75.52891|6.226864_-75.5289...|        165|   2|
|2024-06-17|6.302827|-75.560455|6.302827_-75.5604...|        150|   3|
|2024-06-17|6.252831|-75.565694|6.252831_-75.5656...|        140|   4|
|2024-06-17|6.237547|-75.536549|6.237547_-75.5365...|        139|   5|
|2024-06-17|6.251305|-75.539935|6.251305_-75.5399...|        136|   6|
|2024-06-17|6.269597|-75.531581|6.269597_-75.5315...|        122|   7|
|2024-06-17|6.299064|-75.545337|6.299064_-75.5453...|        111|   8|
|2024-06-17|6.250895|-75.564412|6.250895_-75.5644...|        100|   9|
|2024-06-17|6.240547|-75.537934|6.240547_-75.5379...|        100|  10|
|2024-06-17|6.241721|-75.544554|6.241721_-75.5445...|        100|  11|
|2024-

## Guardar resultados

In [15]:
df_seller_ranked_day.write.parquet("/datalake/gold/df_seller_ranked_day")
df_store_ranked_day.write.parquet("/datalake/gold/df_store_ranked_day")

                                                                                

In [16]:
df3 = spark.read.parquet("/datalake/gold/df_seller_ranked_day")
df3.show()

+----------+------------+-----------+----+
| date_only| employee_id|total_sales|rank|
+----------+------------+-----------+----+
|2024-06-17|Employee_169|        185|   1|
|2024-06-17| Employee_65|        165|   2|
|2024-06-17| Employee_63|        150|   3|
|2024-06-17| Employee_11|        140|   4|
|2024-06-17| Employee_23|        139|   5|
|2024-06-17| Employee_95|        136|   6|
|2024-06-17| Employee_17|        122|   7|
|2024-06-17| Employee_13|        111|   8|
|2024-06-17| Employee_97|        100|   9|
|2024-06-17|  Employee_9|        100|  10|
|2024-06-17|Employee_193|        100|  11|
|2024-06-17|Employee_179|        100|  12|
|2024-06-17| Employee_75|        100|  13|
|2024-06-17| Employee_83|        100|  14|
|2024-06-17|Employee_141|        100|  15|
|2024-06-17|Employee_175|         98|  16|
|2024-06-17| Employee_67|         93|  17|
|2024-06-17| Employee_55|         91|  18|
|2024-06-17| Employee_43|         90|  19|
|2024-06-17|Employee_199|         89|  20|
+----------

In [17]:
df4 = spark.read.parquet("/datalake/gold/df_store_ranked_day")
df4.show()

+----------+--------+----------+--------------------+-----------+----+
| date_only|latitude| longitude|            store_id|total_store|rank|
+----------+--------+----------+--------------------+-----------+----+
|2024-06-17| 6.23618|-75.557692|6.23618_-75.55769...|        185|   1|
|2024-06-17|6.226864| -75.52891|6.226864_-75.5289...|        165|   2|
|2024-06-17|6.302827|-75.560455|6.302827_-75.5604...|        150|   3|
|2024-06-17|6.252831|-75.565694|6.252831_-75.5656...|        140|   4|
|2024-06-17|6.237547|-75.536549|6.237547_-75.5365...|        139|   5|
|2024-06-17|6.251305|-75.539935|6.251305_-75.5399...|        136|   6|
|2024-06-17|6.269597|-75.531581|6.269597_-75.5315...|        122|   7|
|2024-06-17|6.299064|-75.545337|6.299064_-75.5453...|        111|   8|
|2024-06-17|6.250895|-75.564412|6.250895_-75.5644...|        100|   9|
|2024-06-17|6.240547|-75.537934|6.240547_-75.5379...|        100|  10|
|2024-06-17|6.241721|-75.544554|6.241721_-75.5445...|        100|  11|
|2024-

In [18]:
# df = df.withColumn("result", calcular_comuna_udf(df["latitude"], df["longitude"]))
# df = df.select("*", "result.*").drop("result")

# df = df.withColumn("date", to_timestamp(df["date"], "dd/MM/yyyy HH:mm:ss"))
# df = df.withColumn("day", dayofmonth(df["date"]))\
#        .withColumn("month", month(df["date"]))\
#        .withColumn("year", year(df["date"]))\
#        .withColumn("hour", hour(df["date"]))\
#        .withColumn("minute", minute(df["date"]))\
#        .withColumn("second", second(df["date"]))

# date_now = datetime.now(pytz.timezone('America/Bogota')).strftime("%d%m%Y_%H%M%S")
# path_write = f"/datalake/silver/stagging/{date_now}"


# df.write.parquet(path_write)

# for name in file_names:
#     if ".parquet" in name:
#         command = f"hadoop fs -mv {name} /datalake/raw/ingested"
#         subprocess.run(command, shell=True, check=True)
        
# for commit in commits_names:
#     if (commit != "items" and commit != ""):
#         command = f"hadoop fs -rm -r {commit}"
#         subprocess.run(command, shell=True, check=True)
        
# for offset in offsets_names:
#     if (offset != "items" and offset != ""):
#         command = f"hadoop fs -rm -r {offset}"
#         subprocess.run(command, shell=True, check=True)
        
# for meta in metadata_names:
#     if (meta != "items" and meta != ""):
#         command = f"hadoop fs -rm -r {meta}"
#         subprocess.run(command, shell=True, check=True)