Per iniciar una sessió de Spark i configurar un builder amb Delta Lake

In [31]:
#!pip install pyspark
#!pip install delta-spark

import pyspark
from delta import *

#!wget -O "HR_comma_sep.csv" "https://mydisk.cs.upc.edu/s/3o33yciBHADiFCD/download/HR_comma_sep.csv"

# Session builder for the "Shops_Deltalake" app, with the Delta Lake SQL
# extension and the Delta catalog registered as the Spark session catalog.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("Shops_Deltalake")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

# Let the delta-spark helper finish configuring the builder, then create the
# session (or reuse one that already exists).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

## Comandes com si estiguessis en una sessió de Spark

Llegir els arxius i guardar-los en format Delta Lake, que ens garanteix que es compleixen les propietats ACID i dona avantatges com el versionat de les dades i el control de l'esquema.




Carregar i llegir els 3 arxius que tenim

In [32]:
# Parquet file: load the shops snapshot into a Spark DataFrame and preview it.
shops_path = "./datalake/shops_data/2024-04-17_shops_data.parquet"
shops = spark.read.parquet(shops_path)
shops.show()


+-------------------+------------------+--------------------+-------------------------+---------------------------+----------------------+--------------------+---------------------+------------------------+------------------------+-----------------------+------------------------+---------------------------+--------------------+------------------+----------------+-------------------+--------------------+-------------------+---------------+--------------------+------------------+
|attributes.objectid|attributes.osm_id2|attributes.abandoned|attributes.addr_housename|attributes.addr_housenumber|attributes.addr_street|attributes.addr_city|attributes.addr_state|attributes.addr_postcode|attributes.addr_province|attributes.addr_country|attributes.addr_district|attributes.addr_subdistrict|attributes.addr_unit|attributes.amenity|attributes.brand|attributes.building|     attributes.name|attributes.operator|attributes.shop|          geometry.x|        geometry.y|
+-------------------+-------------

In [33]:
# Rename the columns we need: the original names contain special characters
# (a dot), which prevents us from accessing them directly.
# Maps each original column name to its new, dot-free name.
column_renames = {
    "geometry.x": "geometry_x",
    "geometry.y": "geometry_y",
    "attributes.shop": "shop",
    "attributes.name": "name",
    "attributes.osm_id2": "index",
    "attributes.addr_postcode": "postcode",
}

# One rename per mapping entry instead of six copy-pasted stanzas.
for old_column_name, new_column_name in column_renames.items():
    shops = shops.withColumnRenamed(old_column_name, new_column_name)

# Check the resulting schema
shops.printSchema()


root
 |-- attributes.objectid: long (nullable = true)
 |-- index: string (nullable = true)
 |-- attributes.abandoned: integer (nullable = true)
 |-- attributes.addr_housename: string (nullable = true)
 |-- attributes.addr_housenumber: string (nullable = true)
 |-- attributes.addr_street: string (nullable = true)
 |-- attributes.addr_city: string (nullable = true)
 |-- attributes.addr_state: string (nullable = true)
 |-- postcode: string (nullable = true)
 |-- attributes.addr_province: integer (nullable = true)
 |-- attributes.addr_country: string (nullable = true)
 |-- attributes.addr_district: integer (nullable = true)
 |-- attributes.addr_subdistrict: integer (nullable = true)
 |-- attributes.addr_unit: string (nullable = true)
 |-- attributes.amenity: string (nullable = true)
 |-- attributes.brand: string (nullable = true)
 |-- attributes.building: string (nullable = true)
 |-- name: string (nullable = true)
 |-- attributes.operator: string (nullable = true)
 |-- shop: string (nulla

In [35]:
from pyspark.sql.functions import col
from pyspark.ml.feature import Imputer

# Keep only the selected columns.
selected_columns = ['geometry_x', 'geometry_y', "shop", "name", "index", "postcode"]
shops_selected = shops.select(selected_columns)

# Drop rows that have a missing value in any column other than postcode;
# rows whose only missing value is postcode are kept on purpose.
has_all_required = (
    col("geometry_x").isNotNull()
    & col("geometry_y").isNotNull()
    & col("shop").isNotNull()
    & col("name").isNotNull()
    & col("index").isNotNull()
)
shops_selected = shops_selected.filter(has_all_required)

shops_selected.printSchema()

root
 |-- geometry_x: double (nullable = true)
 |-- geometry_y: double (nullable = true)
 |-- shop: string (nullable = true)
 |-- name: string (nullable = true)
 |-- index: string (nullable = true)
 |-- postcode: string (nullable = true)



In [None]:
# Impute missing postcode values from the geometry_x and geometry_y coordinates
from uszipcode import SearchEngine

# Create the SearchEngine object used for the coordinate lookups below.
# NOTE(review): the `simple_zipcode` keyword looks like it belongs to older
# uszipcode releases — confirm the installed version still accepts it.
search = SearchEngine(simple_zipcode=True)

# Get the ZIP code for a pair of coordinates
def get_zipcode(lat, lon):
    """Return the ZIP code of the single closest match to the coordinates.

    Uses the module-level `search` object and asks it for one result.

    Args:
        lat: Latitude passed to `search.by_coordinates` as `lat`.
        lon: Longitude passed to `search.by_coordinates` as `lng`.

    Returns:
        The `zipcode` attribute of the first result, or None when the
        lookup returns no match.
    """
    result = search.by_coordinates(lat=lat, lng=lon, returns=1)
    # An empty result would make result[0] raise IndexError and abort a
    # row-wise apply; report "no ZIP code found" as None instead.
    if not result:
        return None
    return result[0].zipcode

# Apply the function to the data, one row at a time.
# NOTE(review): `df` is not defined in any cell visible above, and this looks
# like a pandas-style row-wise apply. The Spark DataFrame built earlier is
# `shops_selected`, whose coordinate columns are geometry_x / geometry_y rather
# than LATITUDE_X / LONGITUDE_X — confirm the intended frame and column names.
df["ZipCode"] = df.apply(lambda row: get_zipcode(row["LATITUDE_X"], row["LONGITUDE_X"]), axis=1)
