In [8]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session, registered under the application
# name "DataProcessing".
spark = (
    SparkSession.builder
    .appName("DataProcessing")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [9]:
# Read the raw staging directory as Parquet.
# NOTE(review): the recorded run of this cell failed with "Unable to infer
# schema for Parquet" - presumably no readable Parquet files were present
# under this path at that time; confirm before relying on this cell.
raw_staging_path = "/datalake/raw/stagging"
df = spark.read.parquet(raw_staging_path)

                                                                                

AnalysisException: Unable to infer schema for Parquet. It must be specified manually.

In [36]:
# Preview the frame using show()'s default limits, spelled out explicitly:
# the first 20 rows, with long string values truncated.
df.show(n=20, truncate=True)


+--------+----------+-------------------+-----------+-----------+-----------------+--------------------+
|latitude| longitude|               date|customer_id|employee_id|quantity_products|            order_id|
+--------+----------+-------------------+-----------+-----------+-----------------+--------------------+
|6.198157|-75.588906|10/06/2024 13:31:01|       1325|       9024|               31|b75c2ccc-f951-499...|
|6.249815|-75.505385|10/06/2024 13:31:23|       1219|       9026|               41|f17ee183-3358-4c9...|
|6.244296|-75.532076|10/06/2024 13:31:02|       1363|       9018|               26|fd377924-2728-41f...|
|6.183511|-75.637635|10/06/2024 13:31:24|       1640|       9022|               23|6a6bb2b6-2a9a-483...|
|6.281509|-75.503671|10/06/2024 13:31:05|       1857|       9011|               24|cf1aa61f-156a-494...|
|6.292742|-75.589281|10/06/2024 13:31:26|       1094|       9027|               40|db334b7d-3aeb-4a7...|
|6.274129| -75.53582|10/06/2024 13:31:06|       1874|  

In [7]:
import json
from turfpy.measurement import boolean_point_in_polygon
from geojson import Point, Feature

import subprocess
import uuid
from datetime import datetime
import pytz

from pyspark.sql.functions import udf, to_timestamp, year, month, dayofmonth, hour, minute, second
from pyspark.sql.types import StructType, StructField, IntegerType, StringType


# Load the GeoJSON feature collection that calcular_comuna searches.
# The encoding is pinned so decoding does not depend on the locale of the
# machine running the notebook (open() otherwise uses the platform default).
# NOTE(review): assumes barrios.json is UTF-8 encoded, as JSON normally is - confirm.
with open('barrios.json', 'r', encoding='utf-8') as f:
    barrios = json.load(f)

def get_polygon_properties(geojson_data, point_feature):
    """Return the ``properties`` of the first feature containing the point.

    Features are tested in the order they appear in
    ``geojson_data['features']`` with ``boolean_point_in_polygon``; the search
    stops at the first hit.

    Args:
        geojson_data: Mapping with a ``'features'`` sequence.
        point_feature: The point to test against each feature.

    Returns:
        The matching feature's ``'properties'`` value, or ``None`` when no
        feature contains the point.
    """
    matching_properties = (
        candidate['properties']
        for candidate in geojson_data['features']
        if boolean_point_in_polygon(point_feature, candidate)
    )
    # The generator is lazy, so evaluation stops at the first match.
    return next(matching_properties, None)

def calcular_comuna(lat, lng):
    """Look up the polygon in ``barrios`` that contains a coordinate.

    Args:
        lat: Latitude of the point.
        lng: Longitude of the point.

    Returns:
        A ``(NOMBRE, IDENTIFICACION)`` tuple read from the matching feature's
        properties, or ``None`` when a coordinate is missing or no polygon
        contains the point.
    """
    # Null coordinates reach a UDF as None; there is nothing to look up.
    if lat is None or lng is None:
        return None

    # GeoJSON positions are ordered (longitude, latitude).
    point_feature = Feature(geometry=Point((lng, lat)))
    properties = get_polygon_properties(barrios, point_feature)

    # get_polygon_properties returns None for a point outside every polygon.
    # Subscripting that None would raise TypeError and, when this function
    # runs as a Spark UDF, fail the whole job because of a single row.
    if properties is None:
        return None

    return properties["NOMBRE"], properties["IDENTIFICACION"]


# Return type of the UDF. calcular_comuna returns a
# (NOMBRE, IDENTIFICACION) tuple, which maps positionally onto
# ("neighborhood", "commune").
# The fields are declared nullable because the values are produced by Python
# code from an external GeoJSON file, so Spark has no guarantee that they are
# never null.
schema = StructType([
    StructField("neighborhood", StringType(), True),
    StructField("commune", StringType(), True)
])

# Wrap the Python lookup so it can be applied to DataFrame columns.
calcular_comuna_udf = udf(calcular_comuna, schema)


def _list_hdfs_entries(directory):
    """Return the entries that ``hadoop fs -ls`` reports for ``directory``.

    The last whitespace-separated field of every output line is taken as the
    entry path. ``hadoop fs -ls`` prints a "Found N items" header first, so
    that field is the token "items" for the header line, and splitting the
    output on newlines leaves a trailing empty string. Both are dropped here
    so callers only receive real entries.
    """
    command = f"hadoop fs -ls {directory} | awk '{{print $NF}}'"
    listing = subprocess.check_output(command, shell=True).decode()
    return [entry for entry in listing.split('\n') if entry and entry != "items"]


# Snapshot the directory contents before the batch is read, so that only
# entries that existed at this point are moved afterwards.
file_names = _list_hdfs_entries("/datalake/raw/stagging")
commits_names = _list_hdfs_entries("/checkpoints/commits")
offsets_names = _list_hdfs_entries("/checkpoints/offsets")

# Read the staged raw batch.
df = spark.read.parquet("/datalake/raw/stagging")

# Attach the struct returned by the lookup UDF, then flatten its fields into
# top-level columns and discard the temporary struct column.
df = (
    df.withColumn("result", calcular_comuna_udf("latitude", "longitude"))
      .select("*", "result.*")
      .drop("result")
)

# Parse the text timestamp (day/month/year, 24-hour clock) in place.
df = df.withColumn("date", to_timestamp("date", "dd/MM/yyyy HH:mm:ss"))

# Derive one column per calendar/time component of the parsed timestamp.
for part_name, extract_part in (
    ("day", dayofmonth),
    ("month", month),
    ("year", year),
    ("hour", hour),
    ("minute", minute),
    ("second", second),
):
    df = df.withColumn(part_name, extract_part("date"))

# Earlier variant that wrote each run to its own timestamped directory:
# date_now = datetime.now(pytz.timezone('America/Bogota')).strftime("%d%m%Y_%H%M%S")
# path_write = f"/datalake/silver/stagging/{date_now}"
path_write = "/datalake/silver/stagging/"

# The target path is now fixed, so the writer's default save mode (fail if the
# path already exists) would make every run after the first one raise.
# Appending lets each processed batch accumulate under the same path.
# NOTE(review): if a run fails after this write but before the raw files are
# moved, re-running will append the same rows again - confirm this is acceptable.
df.write.mode("append").parquet(path_write)

def _hdfs_move(source, destination):
    """Move ``source`` to ``destination`` with ``hadoop fs -mv``.

    Raises:
        subprocess.CalledProcessError: if the command exits non-zero.
    """
    # Arguments are passed as a list so a path is never interpreted by a shell.
    subprocess.run(["hadoop", "fs", "-mv", source, destination], check=True)


# Archive the raw Parquet files that were listed before the batch was read.
# NOTE(review): a recorded listing elsewhere in this notebook shows a
# timestamped sub-directory directly under the staging path; if that is the
# current layout, no listed entry contains ".parquet" and nothing is moved
# here - confirm.
for name in file_names:
    if ".parquet" in name:
        _hdfs_move(name, "/datalake/raw/ingested")

# Archive the checkpoint entries. The listings contain the "items" token from
# the "Found N items" header and a trailing empty string; the empty string
# used to pass the `!= "items"` test and produce `hadoop fs -mv  <dest>`,
# which fails. Both are skipped now.
# NOTE(review): commits and offsets are moved into the same directory; if the
# two listings share entry names the second move will collide - confirm, and
# consider separate target directories.
for commit in commits_names:
    if commit and commit != "items":
        _hdfs_move(commit, "/checkpoints/moved")

for offset in offsets_names:
    if offset and offset != "items":
        _hdfs_move(offset, "/checkpoints/moved")


NameError: name 'spark' is not defined

In [6]:
!pip install turfpy

Collecting turfpy
  Downloading turfpy-0.0.7.tar.gz (37 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting geojson
  Downloading geojson-3.1.0-py3-none-any.whl (15 kB)
Collecting shapely
  Downloading shapely-2.0.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m634.0 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hBuilding wheels for collected packages: turfpy
  Building wheel for turfpy (setup.py) ... [?25ldone
[?25h  Created wheel for turfpy: filename=turfpy-0.0.7-py3-none-any.whl size=39101 sha256=0f89ab1b6c0891a246f4645575a072faf76497ec62eca39d97c868e3aa642070
  Stored in directory: /root/.cache/pip/wheels/f7/f8/07/965ea3fe9ce3d94e9ee6815425f294500adc31eb4c14037c61
Successfully built turfpy
Installing collected packages: shapely, geojson, turfpy
Successfully installed geojson-3.1.0 shapely-2.0.4 turfpy-0.0.7
[0m--- Logging error ---
Traceback (most recent

In [6]:
import subprocess

# List the entries directly under raw staging, then print every Parquet file
# found one level below each of them.
command = "hadoop fs -ls /datalake/raw/stagging | awk '{print $NF}'"
dir_names = subprocess.check_output(command, shell=True).decode().split('\n')
print(dir_names)
for dir_name in dir_names:
    # The listing includes the "items" token from the "Found N items" header
    # and a trailing empty string. An empty name would run `hadoop fs -ls`
    # with no path at all, so both are skipped.
    if not dir_name or dir_name == "items":
        continue
    command = f"hadoop fs -ls {dir_name}" + "| awk '{print $NF}'"
    file_names = subprocess.check_output(command, shell=True).decode().split('\n')
    for name in file_names:
        if ".parquet" in name:
            print(name)

['items', '/datalake/raw/stagging/10062024_141812', '']
/datalake/raw/stagging/10062024_141812/part-00000-054d0c3b-4a08-4ae5-881b-3b4487d5d5ef-c000.snappy.parquet
/datalake/raw/stagging/10062024_141812/part-00000-1c320fb2-5168-4839-8b23-f75d57e7db11-c000.snappy.parquet
/datalake/raw/stagging/10062024_141812/part-00000-402f848e-be34-4202-8748-d3ef8e92a55f-c000.snappy.parquet
/datalake/raw/stagging/10062024_141812/part-00000-70c9fd08-1f5c-440d-8544-a540877cc622-c000.snappy.parquet
/datalake/raw/stagging/10062024_141812/part-00000-e6c5b953-1817-4dca-93c1-a85dd97d163c-c000.snappy.parquet
/datalake/raw/stagging/10062024_141812/part-00000-e9fd3204-ad1c-43f8-8843-3382876ee586-c000.snappy.parquet


In [None]:
# Manually move the "pruebis" entry from raw staging into the ingested area.
# check=True raises CalledProcessError if the command exits non-zero.
command = "hadoop fs -mv /datalake/raw/stagging/pruebis /datalake/raw/ingested"
subprocess.run(args=command, check=True, shell=True)

['items', '/checkpoints/commits/0', '/checkpoints/commits/1', '/checkpoints/commits/2', '/checkpoints/commits/3', '/checkpoints/commits/4', '/checkpoints/commits/5', '']


In [39]:
# Read the ingested area as Parquet and display its row count.
ingested_path = "/datalake/raw/ingested"
df = spark.read.parquet(ingested_path)
df.count()

41