In [1]:
!pip install turfpy
!pip install geojson

[0m--- Logging error ---
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/pip/_internal/utils/logging.py", line 177, in emit
    self.console.print(renderable, overflow="ignore", crop=False, style=style)
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/rich/console.py", line 1752, in print
    extend(render(renderable, render_options))
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/rich/console.py", line 1390, in render
    for render_output in iter_render:
  File "/usr/local/lib/python3.8/dist-packages/pip/_internal/utils/logging.py", line 134, in __rich_console__
    for line in lines:
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/rich/segment.py", line 245, in split_lines
    for segment in segments:
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/rich/console.py", line 1368, in render
    renderable = rich_cast(renderable)
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/rich/protocol.py", lin

In [2]:
import json
from turfpy.measurement import boolean_point_in_polygon
from geojson import Point, Feature

import subprocess
from datetime import datetime
import pytz

from pyspark.sql.functions import udf, to_timestamp, year, month, dayofmonth, hour, minute, second
from pyspark.sql.types import StructType, StructField, StringType

from pyspark.sql import SparkSession

# Obtain the Spark session for this job, registered under the
# application name "DataProcessing".
spark = (
    SparkSession.builder
    .appName("DataProcessing")
    .getOrCreate()
)


# Load the polygon collection that calcular_comuna searches for each point.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale default (which may be ASCII in a container).
with open('barrios.json', 'r', encoding='utf-8') as f:
    barrios = json.load(f)

def get_polygon_properties(geojson_data, point_feature):
    """Return the ``properties`` of the first feature containing the point.

    Features are tested in the order they appear in
    ``geojson_data['features']``. ``None`` is returned when no feature
    contains ``point_feature``.
    """
    # Lazy generator: evaluation stops at the first containing feature.
    containing = (
        candidate['properties']
        for candidate in geojson_data['features']
        if boolean_point_in_polygon(point_feature, candidate)
    )
    return next(containing, None)

def calcular_comuna(lat, lng):
    """Look up the polygon in ``barrios`` that contains a coordinate pair.

    Args:
        lat: Latitude of the point.
        lng: Longitude of the point.

    Returns:
        A ``(NOMBRE, IDENTIFICACION)`` tuple read from the matching
        polygon's properties, or ``None`` when either coordinate is missing
        or no polygon contains the point.
    """
    # A row with a null coordinate cannot be located; return a null result
    # instead of failing while building the point.
    if lat is None or lng is None:
        return None

    # GeoJSON positions are ordered (longitude, latitude).
    point_feature = Feature(geometry=Point((lng, lat)))
    properties = get_polygon_properties(barrios, point_feature)

    # get_polygon_properties returns None for a point outside every polygon;
    # subscripting that None used to raise TypeError and abort the whole job.
    if properties is None:
        return None
    return properties["NOMBRE"], properties["IDENTIFICACION"]


# Return type of the UDF: a struct with two string fields, both declared
# non-nullable, one per element of the tuple produced by calcular_comuna.
schema = (
    StructType()
    .add("neighborhood", StringType(), False)
    .add("commune", StringType(), False)
)

# Wrap the Python lookup so it can be applied to DataFrame columns.
calcular_comuna_udf = udf(f=calcular_comuna, returnType=schema)


# Snapshot four HDFS directories before any data is read. Each listing is
# piped through awk so that only the last whitespace-separated field of every
# output line is kept; the decoded text is then split on newlines.
_listed_dirs = (
    "/datalake/raw/stagging",
    "/checkpoints/commits",
    "/checkpoints/offsets",
    "/datalake/raw/stagging/_spark_metadata",
)

_listings = []
for _hdfs_dir in _listed_dirs:
    command = f"hadoop fs -ls {_hdfs_dir} | awk '{{print $NF}}'"
    _raw_listing = subprocess.check_output(command, shell=True)
    _listings.append(_raw_listing.decode().split('\n'))

# Same order as _listed_dirs above.
file_names, commits_names, offsets_names, metadata_names = _listings


# Read everything currently sitting in the raw staging directory.
raw_events = spark.read.parquet("/datalake/raw/stagging")

# Run the polygon lookup on each row, then flatten the returned struct into
# top-level "neighborhood" / "commune" columns and drop the temporary struct.
located_events = (
    raw_events
    .withColumn(
        "result",
        calcular_comuna_udf(raw_events["latitude"], raw_events["longitude"]),
    )
    .select("*", "result.*")
    .drop("result")
)

# Parse the "date" column using the dd/MM/yyyy HH:mm:ss pattern.
dated_events = located_events.withColumn(
    "date",
    to_timestamp(located_events["date"], "dd/MM/yyyy HH:mm:ss"),
)

# Break the parsed timestamp into its individual calendar/clock components.
event_ts = dated_events["date"]
df = (
    dated_events
    .withColumn("day", dayofmonth(event_ts))
    .withColumn("month", month(event_ts))
    .withColumn("year", year(event_ts))
    .withColumn("hour", hour(event_ts))
    .withColumn("minute", minute(event_ts))
    .withColumn("second", second(event_ts))
)

# The output folder is named after the current time in America/Bogota.
bogota_now = datetime.now(pytz.timezone('America/Bogota'))
date_now = bogota_now.strftime("%d%m%Y_%H%M%S")
path_write = f"/datalake/silver/stagging/{date_now}"

df.write.parquet(path_write)

# Every HDFS command below is run as an argument list (no shell), so a listed
# path that contains shell metacharacters is handed to hadoop verbatim instead
# of being interpreted by /bin/sh.

# Move the parquet files that were listed in staging into the ingested area.
for name in file_names:
    if ".parquet" in name:
        subprocess.run(
            ["hadoop", "fs", "-mv", name, "/datalake/raw/ingested"],
            check=True,
        )

# Delete every listed entry of the checkpoint commits, checkpoint offsets and
# staging _spark_metadata directories, in that order. The listings also hold
# "items" (presumably the last word of hadoop's "Found N items" header) and
# an empty string left by the trailing newline; neither is a path.
for listing in (commits_names, offsets_names, metadata_names):
    for entry in listing:
        if entry not in ("items", ""):
            subprocess.run(["hadoop", "fs", "-rm", "-r", entry], check=True)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
                                                                                

Deleted /checkpoints/commits/0
Deleted /checkpoints/commits/1
Deleted /checkpoints/commits/2
Deleted /checkpoints/offsets/0
Deleted /checkpoints/offsets/1
Deleted /checkpoints/offsets/2
Deleted /datalake/raw/stagging/_spark_metadata/0
Deleted /datalake/raw/stagging/_spark_metadata/1
Deleted /datalake/raw/stagging/_spark_metadata/2
