# Exploitation Zone - Model Predictiu

- exploitation zone del model predictiu
- preparation pipeline per taula d'entrenament del model --> cada zipcode és un indiv
    - Sales: 5 categories més comunes per zipcode, count vendes per zipcode, profit mitja per zipcode, mitjana num unitat per comanda per zipcode
    - Shops: 5shops més comunes per zipcode
    - Income: mitjana income per zipcode

In [None]:
#!pip install pyspark
#!pip install delta-spark

import pyspark
from delta import *

#!wget -O "HR_comma_sep.csv" "https://mydisk.cs.upc.edu/s/3o33yciBHADiFCD/download/HR_comma_sep.csv"

builder = pyspark.sql.SparkSession.builder.appName("Shops_Deltalake") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
#Arxiu Parquet (de moment suposem aixo despres arreglem amb duckdb)
shops = spark.read.parquet("./datalake/shops_data/2024-04-24_shops_data.parquet")
income = spark.read.parquet("./datalake/income_data/2024-04-24_IRSIncomeByZipCode_NoStateTotalsNoSmallZips.parquet")
sales = spark.read.parquet("./datalake/sales_data/2024-04-24_SuperstoreSalesTraining.parquet")

In [10]:
############
## INCOME ##
############
from pyspark.sql.functions import col

# Lista de todos los caracteres inválidos que quieres reemplazar o eliminar
invalid_chars = [' ', ';', '{', '}', '(', ')', '\n', '\t', '=']

# Función para limpiar los nombres de las columnas reemplazando los caracteres no válidos
def clean_column_name(column_name):
    for invalid_char in invalid_chars:
        column_name = column_name.replace(invalid_char, "_")  # Reemplaza por subrayado o cualquier otro caracter válido que prefieras
    return column_name

# Aplicar la función de limpieza a cada columna
cleaned_income = income.select([col(c).alias(clean_column_name(c)) for c in income.columns])

# seleccionem files que ens interessen per MODEL PREDICTIU
income_selected = cleaned_income.select("ZIPCODE", "Total_income_amount")

In [11]:
###########
## SALES ##
###########
from pyspark.sql.functions import col

# Lista de todos los caracteres inválidos que quieres reemplazar o eliminar
invalid_chars = [' ', ';', '{', '}', '(', ')', '\n', '\t', '=']

# Función para limpiar los nombres de las columnas reemplazando los caracteres no válidos
def clean_column_name(column_name):
    for invalid_char in invalid_chars:
        column_name = column_name.replace(invalid_char, "_")  # Reemplaza por subrayado o cualquier otro caracter válido que prefieras
    return column_name

# Aplicar la función de limpieza a cada columna
cleaned_sales = sales.select([col(c).alias(clean_column_name(c)) for c in sales.columns])

# filtrar EEUU
sales_usa = cleaned_sales.filter(col("Country_/_Region") == "United States of America")

# eliminar row
sales_usa = sales_usa.dropDuplicates(subset=[col for col in sales_usa.columns if col != "row"])

# eliminar customer_name
sales_usa = sales_usa.drop("Customer_Name")

# no fa falta fer imputació de missings perque quan filtrem per USA no ens queden columnes amb missings

# eliminar missings a postal_code
sales_usa = sales_usa.dropna(subset=["Postal_Code"])

# eliminar missings a subregions
sales_usa = sales_usa.dropna(subset=["SubRegion"])

# eliminar files que missing a totes les columnes
sales_usa = sales_usa.dropna(how="all")


# seleccionem files que ens interessen per MODEL PREDICTIU
sales_selected = sales_usa.select("Postal_Code", "Category", "Sales", "Order_Quantity")

In [12]:
###########
## SHOPS ##
###########
from pyspark.sql.functions import col

# canviem nom columnes perque no ens deixa accedir-hi si tenen caracters especials
# geometry.x
new_column_name = "geometry_x"
old_column_name = "geometry.x"
shops = shops.withColumnRenamed(old_column_name, new_column_name)

# geometry.y
new_column_name = "geometry_y"
old_column_name = "geometry.y"
shops = shops.withColumnRenamed(old_column_name, new_column_name)

# attributes.shop
new_column_name = "shop"
old_column_name = "attributes.shop"
shops = shops.withColumnRenamed(old_column_name, new_column_name)

# attributes.name
new_column_name = "name"
old_column_name = "attributes.name"
shops = shops.withColumnRenamed(old_column_name, new_column_name)

# attributes.osm_id2
new_column_name = "index"
old_column_name = "attributes.osm_id2"
shops = shops.withColumnRenamed(old_column_name, new_column_name)

# attributes.addr_postcode
new_column_name = "postcode"
old_column_name = "attributes.addr_postcode"
shops = shops.withColumnRenamed(old_column_name, new_column_name)

# ens quedem només amb columnes seleccionades
selected_columns = ['geometry_x', 'geometry_y', "shop", "name", "index", "postcode"]
shops_selected = shops.select(selected_columns)

# eliminar files que tinguin missings --> excepte les que tenen missings a postcode!!
shops_selected = shops_selected.filter(~(col("geometry_x").isNull() |
                                col("geometry_y").isNull() |
                                col("shop").isNull() |
                                col("name").isNull() |
                                col("index").isNull()))

# seleccionem files que ens interessen per MODEL PREDICTIU
shops_selected = shops.select("shop", "postcode")

In [14]:
# creem dataframe buit amb les columnes que volem que tingui
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType

# definim estructura
schema = StructType([
    StructField("zipcode", StringType(), True),
    StructField("avg_income_per_zipcode", DoubleType(), True),
    StructField("shop1", StringType(), True),
    StructField("shop2", StringType(), True),
    StructField("shop3", StringType(), True),
    StructField("shop4", StringType(), True),
    StructField("shop5", StringType(), True),
    StructField("sales_cat1", StringType(), True),
    StructField("sales_cat2", StringType(), True),
    StructField("sales_cat3", StringType(), True),
    StructField("sales_cat4", StringType(), True),
    StructField("sales_cat5", StringType(), True),
    StructField("sales_per_zipcode", IntegerType(), True),
    StructField("avg_profit_per_zipcode", DoubleType(), True),
    StructField("avg_order_quantity_per_zipcode", DoubleType(), True)
])

# crear dataframe buit
df_model = spark.createDataFrame([], schema)

# df_model.show()


In [16]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Definir una ventana sobre la cual aplicar row_number()
window = Window.partitionBy("zipcode").orderBy(F.desc("count"))

# Calcular la cantidad de cada tipo de tienda para cada código postal
top_shops = (
    shops_selected.groupBy("zipcode", "shop")
         .count()
         .withColumn("row_num", F.row_number().over(window))
         .filter(F.col("row_num") <= 5)
         .select("zipcode", "shop", "count")
         .orderBy("zipcode", "row_num")
)

top_shops = top_shops.drop("count")

# Mostrar los resultados
#top_shops.show()

shops_cleaned = top_shops.dropna()
#shops_cleaned.show()

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `zipcode` cannot be resolved. Did you mean one of the following? [`postcode`, `shop`].;
'Aggregate ['zipcode, shop#905], ['zipcode, shop#905, count(1) AS count#1133L]
+- Project [shop#905, postcode#974]
   +- Project [attributes.objectid#558L, index#951, attributes.abandoned#560, attributes.addr_housename#561, attributes.addr_housenumber#562, attributes.addr_street#563, attributes.addr_city#564, attributes.addr_state#565, attributes.addr_postcode#566 AS postcode#974, attributes.addr_province#567, attributes.addr_country#568, attributes.addr_district#569, attributes.addr_subdistrict#570, attributes.addr_unit#571, attributes.amenity#572, attributes.brand#573, attributes.building#574, name#928, attributes.operator#576, shop#905, geometry_x#858, geometry_y#882]
      +- Project [attributes.objectid#558L, attributes.osm_id2#559 AS index#951, attributes.abandoned#560, attributes.addr_housename#561, attributes.addr_housenumber#562, attributes.addr_street#563, attributes.addr_city#564, attributes.addr_state#565, attributes.addr_postcode#566, attributes.addr_province#567, attributes.addr_country#568, attributes.addr_district#569, attributes.addr_subdistrict#570, attributes.addr_unit#571, attributes.amenity#572, attributes.brand#573, attributes.building#574, name#928, attributes.operator#576, shop#905, geometry_x#858, geometry_y#882]
         +- Project [attributes.objectid#558L, attributes.osm_id2#559, attributes.abandoned#560, attributes.addr_housename#561, attributes.addr_housenumber#562, attributes.addr_street#563, attributes.addr_city#564, attributes.addr_state#565, attributes.addr_postcode#566, attributes.addr_province#567, attributes.addr_country#568, attributes.addr_district#569, attributes.addr_subdistrict#570, attributes.addr_unit#571, attributes.amenity#572, attributes.brand#573, attributes.building#574, attributes.name#575 AS name#928, attributes.operator#576, shop#905, geometry_x#858, geometry_y#882]
            +- Project [attributes.objectid#558L, attributes.osm_id2#559, attributes.abandoned#560, attributes.addr_housename#561, attributes.addr_housenumber#562, attributes.addr_street#563, attributes.addr_city#564, attributes.addr_state#565, attributes.addr_postcode#566, attributes.addr_province#567, attributes.addr_country#568, attributes.addr_district#569, attributes.addr_subdistrict#570, attributes.addr_unit#571, attributes.amenity#572, attributes.brand#573, attributes.building#574, attributes.name#575, attributes.operator#576, attributes.shop#577 AS shop#905, geometry_x#858, geometry_y#882]
               +- Project [attributes.objectid#558L, attributes.osm_id2#559, attributes.abandoned#560, attributes.addr_housename#561, attributes.addr_housenumber#562, attributes.addr_street#563, attributes.addr_city#564, attributes.addr_state#565, attributes.addr_postcode#566, attributes.addr_province#567, attributes.addr_country#568, attributes.addr_district#569, attributes.addr_subdistrict#570, attributes.addr_unit#571, attributes.amenity#572, attributes.brand#573, attributes.building#574, attributes.name#575, attributes.operator#576, attributes.shop#577, geometry_x#858, geometry.y#579 AS geometry_y#882]
                  +- Project [attributes.objectid#558L, attributes.osm_id2#559, attributes.abandoned#560, attributes.addr_housename#561, attributes.addr_housenumber#562, attributes.addr_street#563, attributes.addr_city#564, attributes.addr_state#565, attributes.addr_postcode#566, attributes.addr_province#567, attributes.addr_country#568, attributes.addr_district#569, attributes.addr_subdistrict#570, attributes.addr_unit#571, attributes.amenity#572, attributes.brand#573, attributes.building#574, attributes.name#575, attributes.operator#576, attributes.shop#577, geometry.x#578 AS geometry_x#858, geometry.y#579]
                     +- Relation [attributes.objectid#558L,attributes.osm_id2#559,attributes.abandoned#560,attributes.addr_housename#561,attributes.addr_housenumber#562,attributes.addr_street#563,attributes.addr_city#564,attributes.addr_state#565,attributes.addr_postcode#566,attributes.addr_province#567,attributes.addr_country#568,attributes.addr_district#569,attributes.addr_subdistrict#570,attributes.addr_unit#571,attributes.amenity#572,attributes.brand#573,attributes.building#574,attributes.name#575,attributes.operator#576,attributes.shop#577,geometry.x#578,geometry.y#579] parquet


In [None]:
# taula a partir de income
df_model = income_selected

# renombrem columnes
df_model = df_model.withColumnRenamed("ZIPCODE", "zipcode")
df_model = df_model.withColumnRenamed("Total_income_amount", "avg_income_per_zipcode")

# renombrem columnes de zipcode a sales i shops per poder fer join
shops_selected = shops_selected.withColumnRenamed("postcode", "zipcode")
sales_selected = sales_selected.withColumnRenamed("Postal_Code", "zipcode")

df_model.show()

+-------+----------------------+
|zipcode|avg_income_per_zipcode|
+-------+----------------------+
|  35004|                258024|
|  35005|                129390|
|  35006|                 58585|
|  35007|                651350|
|  35010|                382106|
|  35014|                 67885|
|  35016|                333226|
|  35019|                 35392|
|  35020|                262475|
|  35022|                521539|
|  35023|                480458|
|  35031|                112152|
|  35033|                 67437|
|  35034|                 52030|
|  35035|                 31542|
|  35040|                359868|
|  35042|                 96503|
|  35043|                363943|
|  35044|                124406|
|  35045|                236772|
+-------+----------------------+
only showing top 20 rows



In [None]:
joined_data = df_model.join(top_shops, (df_model["zipcode"] == top_shops["zipcode"]), "inner")
selected_columns = [df_model[col] for col in df_model.columns] + [top_shops[col] for col in top_shops.columns if col not in ["zipcode"]]
joined_data = joined_data.select(selected_columns)
joined_data.show()

+-------+----------------------+-------------+
|zipcode|avg_income_per_zipcode|         shop|
+-------+----------------------+-------------+
|  96701|               1369267|      charity|
|  96701|               1369267|      bicycle|
|  96701|               1369267|  supermarket|
|  96706|               1657633|  supermarket|
|  96706|               1657633|      chemist|
|  96708|                205389|variety_store|
|  96708|                205389|      hammock|
|  96712|                209977|         gift|
|  96712|                209977|       tattoo|
|  96712|                209977|      jewelry|
|  96712|                209977|       sports|
|  96712|                209977|      clothes|
|  96713|                 30221|          art|
|  96720|               1091080|      clothes|
|  96720|               1091080|      outdoor|
|  96720|               1091080|          art|
|  96720|               1091080|  health_food|
|  96720|               1091080|  supermarket|
|  96722|    

In [None]:
#Aquesta

Número de filas: 117
Número de columnas: 3


## cosetes aux 1

In [7]:
# Obtener el número de filas
num_rows = df_model.count()

# Obtener el número de columnas
num_columns = len(df_model.columns)

# Mostrar el número de filas y columnas
print("Número de filas:", num_rows)
print("Número de columnas:", num_columns)

Py4JJavaError: An error occurred while calling o212.count.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 5 in stage 4.0 failed 1 times, most recent failure: Lost task 5.0 in stage 4.0 (TID 9) (Paula executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:203)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:174)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:67)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1570)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.base/sun.nio.ch.NioSocketImpl.timedAccept(NioSocketImpl.java:701)
	at java.base/sun.nio.ch.NioSocketImpl.accept(NioSocketImpl.java:745)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:695)
	at java.base/java.net.ServerSocket.platformImplAccept(ServerSocket.java:660)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:636)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:582)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:541)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:190)
	... 34 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:203)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:174)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:67)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1570)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.base/sun.nio.ch.NioSocketImpl.timedAccept(NioSocketImpl.java:701)
	at java.base/sun.nio.ch.NioSocketImpl.accept(NioSocketImpl.java:745)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:695)
	at java.base/java.net.ServerSocket.platformImplAccept(ServerSocket.java:660)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:636)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:582)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:541)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:190)
	... 34 more


In [None]:
# taula a partir de income
df_model = income_selected

# renombrem columnes
df_model = df_model.withColumnRenamed("ZIPCODE", "zipcode")
df_model = df_model.withColumnRenamed("Total_income_amount", "avg_income_per_zipcode")

# renombrem columnes de zipcode a sales i shops per poder fer join
shops_selected = shops_selected.withColumnRenamed("postcode", "zipcode")
sales_selected = sales_selected.withColumnRenamed("Postal_Code", "zipcode")

# modificar taula shops per que tingui les columnes que volem --> shop1, shop2, shop3, shop4, shop5


df_model = df_model.join(shops_selected, "zipcode", "left")


# modificar taula sales perque tingui les columnes que volem --> sales_cat1, sales_cat2, sales_cat3, sales_cat4, sales_cat5, sales_per_zipcode, avg_profit_per_zipcode, avg_order_quantity_per_zipcode


df_model = df_model.join(sales_selected, "zipcode", "left")



# Mostrar el DataFrame resultante
df_model.show()

+-------+----------------------+----+
|zipcode|avg_income_per_zipcode|shop|
+-------+----------------------+----+
|  35004|                258024|NULL|
|  35005|                129390|NULL|
|  35006|                 58585|NULL|
|  35007|                651350|NULL|
|  35010|                382106|NULL|
|  35014|                 67885|NULL|
|  35016|                333226|NULL|
|  35019|                 35392|NULL|
|  35020|                262475|NULL|
|  35022|                521539|NULL|
|  35023|                480458|NULL|
|  35031|                112152|NULL|
|  35033|                 67437|NULL|
|  35034|                 52030|NULL|
|  35035|                 31542|NULL|
|  35040|                359868|NULL|
|  35042|                 96503|NULL|
|  35043|                363943|NULL|
|  35044|                124406|NULL|
|  35045|                236772|NULL|
+-------+----------------------+----+
only showing top 20 rows



In [None]:
# introduim valors de zipcode
df_model = df_model.withColumn("zipcode", income_selected["ZIPCODE"])

# unim income amb df_model
df_model = df_model.join(income_selected, df_model.zipcode == income_selected.zipcode, how="left")


In [None]:
from pyspark.sql.functions import col, avg, count, desc, row_number
from pyspark.sql.window import Window

# Calcular la media de ingresos por código postal
income_avg = income_data.groupBy("zipcode").agg(avg("income").alias("avg_income"))

# Determinar las tiendas más comunes por código postal
window = Window.partitionBy("zipcode").orderBy(desc("count"))
top_shops = sales_data.groupBy("zipcode", "shop").agg(count("*").alias("count")).\
    withColumn("rn", row_number().over(window)).filter(col("rn") <= 5)

# Calcular métricas de ventas por código postal y tienda
sales_metrics = sales_data.groupBy("zipcode", "shop", "category").\
    agg(count("*").alias("count"), avg("sales").alias("avg_sales"), avg("profit").alias("avg_profit"),
        avg("num_units").alias("avg_num_units"))

# Combinar todas las métricas en una sola tabla
result_table = sales_metrics.join(income_avg, "zipcode", "left").\
    join(top_shops, ["zipcode", "shop"], "left")

# Mostrar el resultado
result_table.show()

# Finalizar la sesión de Spark
spark.stop()


In [None]:
# finalitzar sessió de Spark
spark.stop()