# Spark Streaming (DStreams) — Carritos en tiempo real


Este notebook usa la API **DStreams** para leer eventos de carritos desde un **socket** y calcular métricas por lote y por ventana.

Antes de ejecutar el notebook, inicia el servidor continuo en una terminal aparte (el notebook se conecta a `127.0.0.1:9999`):

```bash
python servidor_carritos_continuo.py
# o ajusta el intervalo de envío:
python servidor_carritos_continuo.py 0.25
```


In [1]:

import os, sys
# Point PySpark (workers and driver) at the interpreter that is running this
# notebook, so both sides use the same Python installation.
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# "local[2]" runs Spark locally with two threads. A socket receiver occupies
# one of them, so at least two are needed for batches to be processed.
sc = SparkContext(master="local[2]", appName="DStreams_Carritos")
sc.setLogLevel("WARN")

# Micro-batches of 5 seconds. Checkpointing is enabled because the windowed
# reductions defined further down use an inverse function, which Spark only
# allows with a checkpoint directory configured.
ssc = StreamingContext(sc, batchDuration=5)
ssc.checkpoint("./chk_dstreams_carritos")

# Declare a text stream read from a TCP socket (one record per line).
# NOTE(review): this only defines the input; the receiver presumably connects
# once ssc.start() is called, so the message below is printed before any
# connection actually exists.
lines = ssc.socketTextStream("127.0.0.1", 9999)
print("Conectado a servidor de carritos en 127.0.0.1:9999")




Conectado a servidor de carritos en 127.0.0.1:9999


-------------------------------------------
Time: 2025-08-30 12:47:20
-------------------------------------------



In [2]:

import re

def parse_line(line: str):
    """Turn one raw ``key=value`` event line into a normalized event record.

    Every ``word=token`` pair found in ``line`` is collected (tokens end at
    the first whitespace character). The record is then built from the
    ``event``, ``user``, ``sku``, ``value`` and ``items`` keys, using
    ``"unknown"``, ``""``, ``""``, ``0.0`` and ``0`` respectively when a key
    is absent.

    Returns:
        A dict with keys ``event``, ``user``, ``sku`` (str), ``value``
        (float) and ``items`` (int). If anything goes wrong while parsing
        (for example a non-numeric ``value`` or ``items``), a placeholder
        record whose ``event`` is ``"malformed"`` is returned instead.
    """
    try:
        fields = {}
        for key, raw in re.findall(r"(\w+)=([^\s]+)", line):
            # A repeated key keeps the value of its last occurrence.
            fields[key] = raw
        amount = float(fields.get("value", "0") or 0.0)
        count = int(fields.get("items", "0") or 0)
    except Exception:
        # Best effort: never let a bad line break the stream.
        return {"event": "malformed", "user": "", "sku": "", "value": 0.0, "items": 0}

    return {
        "event": fields.get("event", "unknown"),
        "user": fields.get("user", ""),
        "sku": fields.get("sku", ""),
        "value": amount,
        "items": count,
    }

# Parse each incoming line once and cache the result, since the parsed stream
# is reused by more than one downstream transformation.
events = lines.map(parse_line).cache()

# Number of events of each type inside the current micro-batch.
per_batch_counts = events.map(lambda e: (e["event"], 1)).reduceByKey(lambda a,b: a+b)

# Number of events of each type over a 60-second window, re-evaluated every
# 10 seconds. The inverse function (a-b) lets Spark update the window
# incrementally by subtracting the batches that slide out. With an inverse
# function, keys are never dropped on their own: an event type that has left
# the window would stay in the state and keep being reported with count 0.
# filterFunc evicts those entries so only event types actually present in the
# window are kept and printed.
window_counts = (events
                 .map(lambda e: (e["event"], 1))
                 .reduceByKeyAndWindow(lambda a,b: a+b, lambda a,b: a-b, 60, 10,
                                       filterFunc=lambda kv: kv[1] > 0))

def to_pairs(e):
    """Map an event record to a ``(label, weight)`` pair for rate counting.

    ``cart_abandoned`` events become ``("abandoned", 1)``, ``checkout_ok``
    events become ``("checkout", 1)`` and every other event becomes
    ``("other", 0)``.
    """
    kind = e["event"]
    if kind == "cart_abandoned":
        return ("abandoned", 1)
    if kind == "checkout_ok":
        return ("checkout", 1)
    return ("other", 0)

# Label every event as abandoned / checkout / other, then total each label
# over a 60-second window that is re-evaluated every 10 seconds. The inverse
# function subtracts the batches that have slid out of the window.
pairs = events.map(to_pairs)
win_pairs = pairs.reduceByKeyAndWindow(
    func=lambda total, entering: total + entering,
    invFunc=lambda total, leaving: total - leaving,
    windowDuration=60,
    slideDuration=10,
)

def compute_rate(rdd):
    """Print the cart-abandonment rate for one RDD of ``(label, count)`` pairs.

    The pairs are collected and the counts stored under ``"abandoned"`` and
    ``"checkout"`` are read (a missing label counts as 0). The rate is
    ``abandoned / (abandoned + checkout)``, or ``0.0`` when that sum is zero.
    The result is only printed; nothing is returned.
    """
    totals = dict(rdd.collect())
    abandoned = totals.get("abandoned", 0)
    checkout = totals.get("checkout", 0)

    total = abandoned + checkout
    if total:
        rate = abandoned / total
    else:
        # No abandonments or checkouts yet: avoid dividing by zero.
        rate = 0.0

    print(f"[window] abandono={abandoned} checkout={checkout} tasa_abandono={rate:.3f}")

# Register the output operations. Nothing runs until the streaming context is
# started; the order of these statements is kept as-is because it determines
# the order in which the results are emitted.
# pprint() prints the leading elements of each generated RDD to stdout.
per_batch_counts.pprint()
window_counts.pprint()
# compute_rate is invoked with each RDD produced by the windowed label totals.
win_pairs.foreachRDD(compute_rate)


In [None]:

# Start the streaming computation and let it run for at most 60 seconds.
ssc.start()
try:
    ssc.awaitTerminationOrTimeout(60)
finally:
    # Always stop the streaming context, even when waiting raises (a failed
    # batch or an interrupted cell); otherwise it would be left running in
    # the background. The SparkContext is kept alive so it can be reused, and
    # the graceful stop lets already-received data finish processing.
    ssc.stop(stopSparkContext=False, stopGraceFully=True)
print("Streaming detenido.")
