# Exploitation Zone - Model Predictiu

- Exploitation zone del model predictiu
- Preparation pipeline per a la taula d'entrenament del model --> cada zipcode és un individu
    - Sales: 5 categories més comunes per zipcode, nombre de vendes per zipcode, profit mitjà per zipcode, mitjana d'unitats per comanda per zipcode
    - Shops: 5 shops més comuns per zipcode
    - Income: mitjana d'income per zipcode

In [None]:
#!pip install pyspark
#!pip install delta-spark

import pyspark
from delta import *

#!wget -O "HR_comma_sep.csv" "https://mydisk.cs.upc.edu/s/3o33yciBHADiFCD/download/HR_comma_sep.csv"

# Build a Spark session with the Delta Lake SQL extension and catalog enabled.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("Shops_Deltalake")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Let the delta helper finish configuring the builder, then reuse a running
# session if one exists or start a new one.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
# Load the three raw datasets from the local data lake.
# Plain Parquet files are assumed for now; the authors plan to rework this
# step with DuckDB later.
read_parquet = spark.read.parquet
shops = read_parquet("./datalake/shops_data/2024-04-17_shops_data.parquet")
income = read_parquet("./datalake/income_data/2024-04-22_IRSIncomeByZipCode_NoStateTotalsNoSmallZips.parquet")
sales = read_parquet("./datalake/sales_data/2024-04-22_SuperstoreSalesTraining.parquet")

In [None]:
############
## INCOME ##
############
from pyspark.sql.functions import col

# Characters to strip from column names. This is the set Spark rejects in
# Parquet/Delta column names (" ,;{}()\n\t="), including the comma.
invalid_chars = [' ', ',', ';', '{', '}', '(', ')', '\n', '\t', '=']


def clean_column_name(column_name):
    """Return *column_name* with every invalid character replaced by "_".

    Each character listed in the module-level ``invalid_chars`` is mapped to
    a single underscore; all other characters are kept unchanged, so the
    result always has the same length as the input.

    Args:
        column_name: The raw column name as a string.

    Returns:
        The sanitized column name.
    """
    # Build the table at call time so the function always honours the current
    # contents of ``invalid_chars``; translate() then does one pass over the name.
    return column_name.translate(str.maketrans(dict.fromkeys(invalid_chars, "_")))

# Re-select every income column under its sanitized name.
income_aliases = [col(name).alias(clean_column_name(name)) for name in income.columns]
cleaned_income = income.select(income_aliases)

# Keep only the columns the predictive model needs.
income_selected = cleaned_income.select("ZIPCODE", "Total_income_amount")

In [None]:
###########
## SALES ##
###########
from pyspark.sql.functions import col

# NOTE(review): this list and function duplicate the definitions in the
# INCOME cell; consider defining them once and reusing them.
# Characters to strip from column names. This is the set Spark rejects in
# Parquet/Delta column names (" ,;{}()\n\t="), including the comma.
invalid_chars = [' ', ',', ';', '{', '}', '(', ')', '\n', '\t', '=']


def clean_column_name(column_name):
    """Return *column_name* with every invalid character replaced by "_".

    Each character listed in the module-level ``invalid_chars`` is mapped to
    a single underscore; all other characters are kept unchanged, so the
    result always has the same length as the input.

    Args:
        column_name: The raw column name as a string.

    Returns:
        The sanitized column name.
    """
    # Build the table at call time so the function always honours the current
    # contents of ``invalid_chars``; translate() then does one pass over the name.
    return column_name.translate(str.maketrans(dict.fromkeys(invalid_chars, "_")))

# Re-select every sales column under its sanitized name (invalid characters
# become "_", so e.g. "Postal Code" is "Postal_Code" from here on).
sales_aliases = [col(name).alias(clean_column_name(name)) for name in sales.columns]
cleaned_sales = sales.select(sales_aliases)

# Keep only orders from the USA.
sales_usa = cleaned_sales.filter(col("Country_/_Region") == "United States of America")

# Drop rows that are identical in every column except "row", so that "row"
# alone does not make two otherwise equal records look different.
# The loop variable is deliberately not called `col`, to avoid shadowing the
# pyspark function of the same name.
dedup_columns = [name for name in sales_usa.columns if name != "row"]
sales_usa = sales_usa.dropDuplicates(subset=dedup_columns)

# Customer names are not needed downstream.
sales_usa = sales_usa.drop("Customer_Name")

# No imputation of missing values is performed: per the authors, no columns
# with missing values remain once the data is filtered to the USA.
# Rows lacking a postal code or a subregion are dropped. This also removes any
# row that is null in every column, so no separate how="all" pass is needed.
sales_usa = sales_usa.dropna(subset=["Postal_Code", "SubRegion"])

# Keep only the columns the predictive model needs. The names must be the
# sanitized ones: the versions with spaces no longer exist after the renaming
# at the top of this cell.
sales_selected = sales_usa.select("Postal_Code", "Category", "Sales", "Order_Quantity")

In [None]:
###########
## SHOPS ##
###########
# Rename the columns whose names contain special characters (here, dots),
# because they cannot be accessed under their original names.
column_renames = {
    "geometry.x": "geometry_x",
    "geometry.y": "geometry_y",
    "attributes.shop": "shop",
    "attributes.name": "name",
    "attributes.osm_id2": "index",
    "attributes.addr_postcode": "postcode",
}
for old_column_name, new_column_name in column_renames.items():
    shops = shops.withColumnRenamed(old_column_name, new_column_name)

# Keep only the renamed columns.
selected_columns = ['geometry_x', 'geometry_y', "shop", "name", "index", "postcode"]
shops_selected = shops.select(selected_columns)

# Drop rows with a missing value in any column except postcode; rows whose
# only gap is the postcode are deliberately kept.
shops_selected = shops_selected.filter(
    col("geometry_x").isNotNull()
    & col("geometry_y").isNotNull()
    & col("shop").isNotNull()
    & col("name").isNotNull()
    & col("index").isNotNull()
)

# Keep only the columns the predictive model needs. Select from the filtered
# DataFrame (not from the raw `shops`) so the null filter above is preserved.
shops_selected = shops_selected.select("shop", "postcode")

In [None]:
from pyspark.sql.functions import col, avg, count, desc, row_number
from pyspark.sql.window import Window

# Build the feature table from the three cleaned DataFrames prepared in the
# cells above: income_selected, shops_selected and sales_selected.
# NOTE(review): the join key comes from three different sources (ZIPCODE,
# postcode, Postal_Code) -- confirm they share the same type and format
# (e.g. leading zeros, string vs. number) before trusting the joins.

# Average income per zip code.
income_avg = (
    income_selected
    .withColumnRenamed("ZIPCODE", "zipcode")
    .groupBy("zipcode")
    .agg(avg("Total_income_amount").alias("avg_income"))
)

# Five most frequent shop types per zip code. Shops without a postcode cannot
# be assigned to a zip code, so they are left out of the ranking.
shop_counts = (
    shops_selected
    .withColumnRenamed("postcode", "zipcode")
    .dropna(subset=["zipcode"])
    .groupBy("zipcode", "shop")
    .agg(count("*").alias("shop_count"))
)
window = Window.partitionBy("zipcode").orderBy(desc("shop_count"))
top_shops = (
    shop_counts
    .withColumn("rn", row_number().over(window))
    .filter(col("rn") <= 5)
)

# Sales metrics per zip code and category.
# NOTE(review): no average profit is computed because sales_selected carries
# no profit column; add it to the sales selection to restore that metric.
sales_metrics = (
    sales_selected
    .withColumnRenamed("Postal_Code", "zipcode")
    .withColumnRenamed("Category", "category")
    .groupBy("zipcode", "category")
    .agg(
        count("*").alias("count"),
        avg("Sales").alias("avg_sales"),
        avg("Order_Quantity").alias("avg_num_units"),
    )
)

# Combine all metrics into a single table. The sales data has no shop column,
# so the shop ranking is joined on zip code only: each (zipcode, category) row
# is repeated once per top shop of that zip code.
result_table = (
    sales_metrics
    .join(income_avg, "zipcode", "left")
    .join(top_shops, "zipcode", "left")
)

# Show the result.
result_table.show()

# Shut down the Spark session.
spark.stop()


In [None]:
# Shut down the Spark session.
spark.stop()