In [19]:
from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession under the application name "DataProcessing".
session_builder = SparkSession.builder.appName("DataProcessing")
spark = session_builder.getOrCreate()

In [31]:
# Load the parquet data found under the raw staging directory.
raw_staging_path = "/datalake/raw/stagging"
df = spark.read.parquet(raw_staging_path)

In [32]:
# Print the first five rows, then display the total number of rows.
df.show(n=5)
df.count()

+--------+----------+-------------------+-----------+-----------+-----------------+--------------------+
|latitude| longitude|               date|customer_id|employee_id|quantity_products|            order_id|
+--------+----------+-------------------+-----------+-----------+-----------------+--------------------+
|6.210883|-75.489165|12/06/2024 09:06:31|       1158|       9028|               34|939e177e-6fff-4a5...|
|6.239704|-75.701308|12/06/2024 09:06:55|       1023|       9036|               39|93499ff5-1295-474...|
|6.291255|-75.553679|12/06/2024 09:06:34|       1776|       9005|               12|c3ba6a6f-fe1c-4dd...|
|6.240932|-75.651972|12/06/2024 09:06:58|       1582|       9006|               29|4214fd98-544e-4e2...|
|6.285523|-75.567747|12/06/2024 09:06:36|       1593|       9012|               42|c53b558a-749e-487...|
+--------+----------+-------------------+-----------+-----------+-----------------+--------------------+
only showing top 5 rows



55

In [33]:
import json
from turfpy.measurement import boolean_point_in_polygon
from geojson import Point, Feature

import subprocess
import uuid
from datetime import datetime
import pytz

from pyspark.sql.functions import udf, to_timestamp, year, month, dayofmonth, hour, minute, second
from pyspark.sql.types import StructType, StructField, IntegerType, StringType


# Load the GeoJSON feature collection used for the point-in-polygon lookups.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale (GeoJSON documents are UTF-8).
with open('barrios.json', 'r', encoding='utf-8') as f:
    barrios = json.load(f)

def get_polygon_properties(geojson_data, point_feature):
    """Return the ``properties`` of the first feature containing the point.

    Features are checked in the order they appear in
    ``geojson_data['features']`` using ``boolean_point_in_polygon``.

    Args:
        geojson_data: Mapping with a ``'features'`` sequence.
        point_feature: Point passed as first argument to the containment test.

    Returns:
        The ``'properties'`` entry of the first matching feature, or ``None``
        when no feature matches.
    """
    matching_properties = (
        candidate['properties']
        for candidate in geojson_data['features']
        if boolean_point_in_polygon(point_feature, candidate)
    )
    return next(matching_properties, None)

def calcular_comuna(lat, lng):
    """Look up the neighborhood and commune that contain a coordinate.

    Args:
        lat: Latitude of the point.
        lng: Longitude of the point.

    Returns:
        A ``(NOMBRE, IDENTIFICACION)`` tuple read from the properties of the
        matching feature in ``barrios``, or ``(None, None)`` when a coordinate
        is missing or no polygon contains the point.
    """
    # A null coordinate cannot be located; emit nulls instead of failing the task.
    if lat is None or lng is None:
        return None, None

    # The point is built as (lng, lat), the coordinate order GeoJSON uses.
    point_feature = Feature(geometry=Point((lng, lat)))
    properties = get_polygon_properties(barrios, point_feature)

    # get_polygon_properties returns None for a point outside every polygon;
    # subscripting that None would raise TypeError and abort the Spark job.
    if properties is None:
        return None, None
    return properties["NOMBRE"], properties["IDENTIFICACION"]


# Both fields are nullable because the lookup yields nulls for unmatched points.
schema = StructType([
    StructField("neighborhood", StringType(), True),
    StructField("commune", StringType(), True)
])

calcular_comuna_udf = udf(calcular_comuna, schema)


def parse_hdfs_ls(listing):
    """Extract the path column from ``hadoop fs -ls`` text output.

    Args:
        listing: The command's standard output as a string.

    Returns:
        A list with the last whitespace-separated token of every entry line.
        Blank lines and the ``Found N items`` header are skipped, so the
        result holds paths only.
    """
    paths = []
    for line in listing.splitlines():
        line = line.strip()
        if not line:
            continue
        if line.startswith("Found ") and line.endswith(" items"):
            continue
        paths.append(line.split()[-1])
    return paths


def list_hdfs_paths(directory):
    """List the entries of an HDFS directory with ``hadoop fs -ls``.

    The listing is best-effort: if the command cannot be started or exits
    with a non-zero status, a warning is printed and whatever output was
    produced (possibly nothing) is parsed.

    Args:
        directory: HDFS directory to list.

    Returns:
        The list of paths reported for ``directory``.
    """
    try:
        # An argument list (no shell, no awk pipeline) keeps hadoop's own exit
        # status visible instead of replacing it with the pipeline's.
        completed = subprocess.run(
            ["hadoop", "fs", "-ls", directory],
            stdout=subprocess.PIPE,
        )
    except OSError as error:
        print(f"WARNING: could not run 'hadoop fs -ls {directory}': {error}")
        return []
    if completed.returncode != 0:
        print(
            f"WARNING: 'hadoop fs -ls {directory}' exited with status "
            f"{completed.returncode}"
        )
    return parse_hdfs_ls(completed.stdout.decode())


# Snapshot the directory contents before processing; the cleanup step later
# in this cell moves or removes exactly these entries.
file_names = list_hdfs_paths("/datalake/raw/stagging")
commits_names = list_hdfs_paths("/checkpoints/commits")
offsets_names = list_hdfs_paths("/checkpoints/offsets")
metadata_names = list_hdfs_paths("/datalake/raw/stagging/_spark_metadata")


# Load the raw staging parquet data.
df = spark.read.parquet("/datalake/raw/stagging")

# Compute the (neighborhood, commune) struct from the coordinates, then
# flatten the struct's fields into top-level columns.
located = calcular_comuna_udf(df["latitude"], df["longitude"])
df = df.withColumn("result", located)
df = df.select("*", "result.*").drop("result")

# Parse the textual date, then derive one column per date/time component.
df = df.withColumn("date", to_timestamp(df["date"], "dd/MM/yyyy HH:mm:ss"))
for part_name, extract_part in (
    ("day", dayofmonth),
    ("month", month),
    ("year", year),
    ("hour", hour),
    ("minute", minute),
    ("second", second),
):
    df = df.withColumn(part_name, extract_part(df["date"]))

# The output folder is named after the current time in America/Bogota.
bogota_now = datetime.now(pytz.timezone('America/Bogota'))
date_now = bogota_now.strftime("%d%m%Y_%H%M%S")
path_write = f"/datalake/silver/stagging/{date_now}"

df.write.parquet(path_write)

def run_hdfs(*args):
    """Run ``hadoop fs`` with the given arguments.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters from being re-parsed as separate words or commands.
    subprocess.run(["hadoop", "fs", *args], check=True)


INGESTED_DIR = "/datalake/raw/ingested"

# Move the parquet files that were listed before processing out of staging.
processed_files = [name for name in file_names if ".parquet" in name]
if processed_files:
    # Make sure the destination is a directory: with a missing destination,
    # `-mv` would rename the first file to "ingested" instead of moving it in.
    run_hdfs("-mkdir", "-p", INGESTED_DIR)
for name in processed_files:
    run_hdfs("-mv", name, INGESTED_DIR)

# Remove the checkpoint commit/offset entries and the `_spark_metadata`
# entries that were listed before processing. "items" and "" are listing
# artifacts, not paths, so they are skipped.
for stale_path in [*commits_names, *offsets_names, *metadata_names]:
    if stale_path not in ("items", ""):
        run_hdfs("-rm", "-r", stale_path)

                                                                                

Deleted /checkpoints/commits/0
Deleted /checkpoints/commits/1
Deleted /checkpoints/commits/2
Deleted /checkpoints/commits/3
Deleted /checkpoints/commits/4
Deleted /checkpoints/commits/5
Deleted /checkpoints/offsets/0
Deleted /checkpoints/offsets/1
Deleted /checkpoints/offsets/2
Deleted /checkpoints/offsets/3
Deleted /checkpoints/offsets/4
Deleted /checkpoints/offsets/5
Deleted /datalake/raw/stagging/_spark_metadata/0
Deleted /datalake/raw/stagging/_spark_metadata/1
Deleted /datalake/raw/stagging/_spark_metadata/2
Deleted /datalake/raw/stagging/_spark_metadata/3
Deleted /datalake/raw/stagging/_spark_metadata/4
Deleted /datalake/raw/stagging/_spark_metadata/5


In [None]:
!pip install turfpy

In [44]:
import subprocess

# List the output directories under the silver staging area.
command = "hadoop fs -ls /datalake/silver/stagging | awk '{print $NF}'"
dir_names = subprocess.check_output(command, shell=True).decode().split('\n')
print(dir_names)

# Keep real paths only: the listing also yields an "items" token (from its
# header line) and a trailing empty string. The per-directory file listing that
# used to run here was dropped: its result was never used, it launched one
# hadoop process per directory, and it overwrote the global `file_names`.
array_silver = [dir_name for dir_name in dir_names if dir_name not in ("items", "")]
print(array_silver)

# Read every silver output directory as a single DataFrame.
df = spark.read.parquet(*array_silver)
df.show(5)
df.count()

['items', '/datalake/silver/stagging/12062024_090050', '/datalake/silver/stagging/12062024_090853', '']
['/datalake/silver/stagging/12062024_090050', '/datalake/silver/stagging/12062024_090853']
+--------+----------+-------------------+-----------+-----------+-----------------+--------------------+--------------------+----------------+---+-----+----+----+------+------+
|latitude| longitude|               date|customer_id|employee_id|quantity_products|            order_id|        neighborhood|         commune|day|month|year|hour|minute|second|
+--------+----------+-------------------+-----------+-----------+-----------------+--------------------+--------------------+----------------+---+-----+----+----+------+------+
|6.210883|-75.489165|2024-06-12 09:06:31|       1158|       9028|               34|939e177e-6fff-4a5...|CORREGIMIENTO DE ...|CORREGIMIENTO 90| 12|    6|2024|   9|     6|    31|
|6.239704|-75.701308|2024-06-12 09:06:55|       1023|       9036|               39|93499ff5-1295-

137

In [16]:
# Inspect the entries currently under the commits checkpoint directory.
command = "hadoop fs -ls /checkpoints/commits | awk '{print $NF}'"
commits_names = (
    subprocess.check_output(command, shell=True)
    .decode()
    .split('\n')
)
print(commits_names)

['items', '/checkpoints/commits/0', '/checkpoints/commits/1', '/checkpoints/commits/2', '/checkpoints/commits/3', '/checkpoints/commits/4', '/checkpoints/commits/5', '']


In [17]:
# Inspect the entries currently under the offsets checkpoint directory.
command = "hadoop fs -ls /checkpoints/offsets | awk '{print $NF}'"
offsets_listing = subprocess.check_output(command, shell=True)
offsets_names = offsets_listing.decode().split('\n')
print(offsets_names)

['items', '/checkpoints/offsets/0', '/checkpoints/offsets/1', '/checkpoints/offsets/2', '/checkpoints/offsets/3', '/checkpoints/offsets/4', '/checkpoints/offsets/5', '']


In [18]:
# Inspect the entries under the staging directory's _spark_metadata folder.
command = "hadoop fs -ls /datalake/raw/stagging/_spark_metadata | awk '{print $NF}'"
metadata_listing = subprocess.check_output(command, shell=True).decode()
metadata_names = metadata_listing.split('\n')
print(metadata_names)

['items', '/datalake/raw/stagging/_spark_metadata/0', '/datalake/raw/stagging/_spark_metadata/1', '/datalake/raw/stagging/_spark_metadata/2', '/datalake/raw/stagging/_spark_metadata/3', '/datalake/raw/stagging/_spark_metadata/4', '/datalake/raw/stagging/_spark_metadata/5', '']


In [24]:
# Inspect the output directories under the silver staging area.
# NOTE(review): this rebinds `metadata_names` to the silver listing, replacing
# the _spark_metadata listing that the same name holds elsewhere in the notebook.
command = "hadoop fs -ls /datalake/silver/stagging/ | awk '{print $NF}'"
metadata_names = (
    subprocess.check_output(command, shell=True)
    .decode()
    .split('\n')
)
print(metadata_names)

['items', '/datalake/silver/stagging/12062024_083824', '']


In [None]:
# Load the parquet files under the ingested directory and display the row count.
ingested_path = "/datalake/raw/ingested"
df = spark.read.parquet(ingested_path)
df.count()