# Deploying Predictive Systems

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext

# Cerrar SparkContext anterior si está activo
if SparkContext._active_spark_context:
    SparkContext._active_spark_context.stop()

# SparkSession apuntando al cluster
spark = (
    SparkSession.builder.appName("DeployMyModel")
    .master("spark://agile:7077")  # Asegúrate de que 'agile' es el nombre correcto del master en tu Docker Compose
    .config("spark.driver.bindAddress", "0.0.0.0")
    .getOrCreate()
)

sc = spark.sparkContext
sc.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/30 15:56:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
print(spark.version)
print(spark.sparkContext.master)
print("Aplicación:", spark.sparkContext.appName)
print(spark.sparkContext._jsc.sc().getExecutorMemoryStatus().keys())

3.2.0
spark://agile:7077
Aplicación: DeployMyModel
Set(agile:37701)


In [None]:
!cp "/home/jovyan/.cache/kagglehub/datasets/romanniki/food-delivery-cost-and-profitability/versions/1/food_orders_new_delhi (1).csv" ./data/food_orders.csv

mv: cannot stat '/home/jovyan/.cache/kagglehub/datasets/romanniki/food-delivery-cost-and-profitability/versions/1/food_orders_new_delhi (1).csv': No such file or directory


In [4]:
input_path = "./data/food_orders.csv"
df = spark.read.csv(input_path, header=True, inferSchema=True)
df.show(5)

                                                                                

+--------+-----------+-------------+-------------------+----------------------+-----------+------------+----------------+--------------------+--------------+----------------------+-------------------+
|Order ID|Customer ID|Restaurant ID|Order Date and Time|Delivery Date and Time|Order Value|Delivery Fee|  Payment Method|Discounts and Offers|Commission Fee|Payment Processing Fee|Refunds/Chargebacks|
+--------+-----------+-------------+-------------------+----------------------+-----------+------------+----------------+--------------------+--------------+----------------------+-------------------+
|       1|      C8270|        R2924|2024-02-01 01:11:52|   2024-02-01 02:39:52|       1914|           0|     Credit Card|           5% on App|           150|                    47|                  0|
|       2|      C1860|        R2054|2024-02-02 22:11:04|   2024-02-02 22:46:04|        986|          40|  Digital Wallet|                 10%|           198|                    23|                

In [5]:
from pyspark.sql.functions import dayofweek, hour, when, col

# Añadir columnas calculadas en Spark
df = df.withColumn("day_of_week", dayofweek("Order Date and Time"))  # 1=Sunday, ..., 7=Saturday
df = df.withColumn("hour_of_day", hour("Order Date and Time"))

# Es fin de semana (sábado=7 o domingo=1)
df = df.withColumn("es_fin_de_semana", when(col("day_of_week").isin([1, 7]), 1).otherwise(0))

# Es hora punta (13-15 o 20-22)
df = df.withColumn(
    "es_hora_punta",
    when((col("hour_of_day").between(13, 15)) | (col("hour_of_day").between(20, 22)), 1).otherwise(0)
)

In [6]:
df.show(5)

+--------+-----------+-------------+-------------------+----------------------+-----------+------------+----------------+--------------------+--------------+----------------------+-------------------+-----------+-----------+----------------+-------------+
|Order ID|Customer ID|Restaurant ID|Order Date and Time|Delivery Date and Time|Order Value|Delivery Fee|  Payment Method|Discounts and Offers|Commission Fee|Payment Processing Fee|Refunds/Chargebacks|day_of_week|hour_of_day|es_fin_de_semana|es_hora_punta|
+--------+-----------+-------------+-------------------+----------------------+-----------+------------+----------------+--------------------+--------------+----------------------+-------------------+-----------+-----------+----------------+-------------+
|       1|      C8270|        R2924|2024-02-01 01:11:52|   2024-02-01 02:39:52|       1914|           0|     Credit Card|           5% on App|           150|                    47|                  0|          5|          1|        

In [7]:
df = df.withColumnRenamed("Order ID", "order_id") \
       .withColumnRenamed("Customer ID", "customer_id") \
       .withColumnRenamed("Restaurant ID", "restaurant_id") \
       .withColumnRenamed("Order Date and Time", "order_date_and_time") \
       .withColumnRenamed("Delivery Date and Time", "delivery_date_and_time") \
       .withColumnRenamed("Order Value", "order_value") \
       .withColumnRenamed("Delivery Fee", "delivery_fee") \
       .withColumnRenamed("Payment Method", "payment_method") \
       .withColumnRenamed("Discounts and Offers", "discounts_and_offers") \
       .withColumnRenamed("Commission Fee", "commission_fee") \
       .withColumnRenamed("Payment Processing Fee", "payment_processing_fee") \
       .withColumnRenamed("Refunds/Chargebacks", "refunds/chargebacks")

In [8]:
from pyspark.sql.functions import dayofweek, hour, when, col

# Día de la semana y hora
df = df.withColumn("day_of_week", dayofweek("order_date_and_time"))
df = df.withColumn("hour_of_day", hour("order_date_and_time"))

# Fin de semana
df = df.withColumn("es_fin_de_semana", when(col("day_of_week").isin([1, 7]), 1).otherwise(0))

# Hora punta
df = df.withColumn("es_hora_punta", when(
    (col("hour_of_day").between(13, 15)) | (col("hour_of_day").between(20, 22)), 1
).otherwise(0))

# Tiene descuento
df = df.withColumn("has_discount", when(
    col("discounts_and_offers").isNotNull() & (col("discounts_and_offers") != "None"), 1
).otherwise(0))

# Valor del descuento
df = df.withColumn("discount_value", col("order_value") * col("has_discount").cast("int") * 0.1)

df = df.withColumn("refunded", (col("refunds/chargebacks") > 0).cast("boolean"))

In [9]:
from pyspark.ml import PipelineModel

modelo = PipelineModel.load("./models/pipeline_model.bin")
predicciones = modelo.transform(df)

predicciones.select("prediction").show(10)

                                                                                

+-----------------+
|       prediction|
+-----------------+
|79.84346673549138|
| 61.0080084099073|
|81.72973689000804|
|60.31104433039575|
|82.14852090974995|
|62.91276068214107|
|66.69234628045997|
|86.82624548541419|
|72.10926636991185|
|76.91737711333077|
+-----------------+
only showing top 10 rows



                                                                                