# 01 · Learning-to-Rank para Ranking de Repostajes 
 
Este notebook entrena y evalúa un **ranker** para ordenar estaciones de repostaje/carga según ahorro y sostenibilidad. 

**Incluye:** 
- Generación de dataset **sintético** tipo consultas (rutas) + candidatos. 
- Pipeline `StandardScaler + OneHotEncoder + LGBMRanker` (*fallback* a `RandomForestRegressor` si LightGBM no está disponible). 
- Métricas de ranking: **NDCG@k**, **MAP**, **P@1**. 
- Persistencia del pipeline en `../models/rank_refuel_pipe.joblib` (compatible con la API FastAPI).

In [6]:
import os, math, json 
import numpy as np, pandas as pd 
from pathlib import Path 
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import GroupKFold 
from sklearn.metrics import average_precision_score 
from sklearn.ensemble import RandomForestRegressor 
import joblib 
import lightgbm as lgb

In [9]:
# Resolve a notebook file name against the current working directory (the file
# itself does not need to exist) and keep the directory two levels above it,
# i.e. the parent of the working directory.
_preview_nb = Path("preview_work.ipynb").resolve()
DATA_DIR2 = _preview_nb.parents[1]
print("DATA_DIR2", DATA_DIR2)

DATA_DIR2 C:\Users\abela\Documents\BootCamp\data_science_group


In [None]:
# Project paths (assumes this notebook lives in data_science_group/notebooks).
# When the kernel runs from a folder named "notebooks" the project root is its
# parent; otherwise the current working directory is taken as the root.
_cwd = Path.cwd()
if _cwd.name == "notebooks":
    PROJECT_ROOT = _cwd.parent
else:
    PROJECT_ROOT = _cwd
DATA_DIR = PROJECT_ROOT.joinpath("data")
MODELS_DIR = PROJECT_ROOT.joinpath("models")
# Create the models directory (and any missing parents); no error if it exists.
MODELS_DIR.mkdir(parents=True, exist_ok=True)



In [4]:
# Sanity check of the resolved project paths and of the data directory contents.
print(f"PROJECT_ROOT {PROJECT_ROOT}")
print(f"DATA_DIR {DATA_DIR}")
print(f"MODELS_DIR {MODELS_DIR}")
print(f"os.getcwd() {os.getcwd()}")
# Listing the directory raises if DATA_DIR does not exist.
print(f"os.listdir(DATA_DIR) {os.listdir(DATA_DIR)}")

PROJECT_ROOT c:\Users\abela\Documents\BootCamp\data_science_group
DATA_DIR c:\Users\abela\Documents\BootCamp\data_science_group\data
MODELS_DIR c:\Users\abela\Documents\BootCamp\data_science_group\models
os.getcwd() c:\Users\abela\Documents\BootCamp\data_science_group\notebooks
os.listdir(DATA_DIR) ['processed', 'raw']


In [None]:
# Feature columns used by the model: NUM holds the numeric features and CAT the
# categorical ones. Their concatenation (NUM + CAT) is the model input.
NUM = [
    "delta_price",
    "detour_km",
    "wait_min",
    "liters_needed",
]
CAT = [
    "brand",
    "fuel_type",
    "vehicle_type",
]
 
def ndcg_at_k(rel, k=5):
    """Normalized Discounted Cumulative Gain at cutoff ``k``.

    Parameters
    ----------
    rel : array-like of numbers
        Relevance labels of one query's candidates, listed in the order in
        which they were ranked (best-scored first).
    k : int, default 5
        Only the first ``k`` positions contribute to the score.

    Returns
    -------
    float
        DCG@k of the given order divided by DCG@k of the ideal order, with
        gain ``2**rel - 1`` and discount ``log2(position + 1)``. ``0.0`` when
        the list is empty or the ideal DCG is not positive.
    """
    # Float dtype keeps ``2 ** rel`` well defined for any numeric labels.
    rel_all = np.asarray(rel, dtype=float)

    def _dcg(gains):
        # DCG over the first k entries of `gains`.
        top = gains[:k]
        discounts = np.log2(np.arange(2, top.size + 2))
        return float(np.sum((2.0 ** top - 1.0) / discounts))

    # The ideal ordering must be built from the FULL list and only then cut at
    # k. Sorting the already-truncated list ignores relevant items ranked below
    # k and inflates the score.
    idcg = _dcg(np.sort(rel_all)[::-1])
    if idcg <= 0:
        return 0.0
    return _dcg(rel_all) / idcg
 
def precision_at_1(rel):
    """Precision at rank 1 for a single ranked list.

    ``rel`` holds the relevance labels in ranked order. Returns ``1.0`` when
    the list is non-empty and its first label is positive, ``0.0`` otherwise.
    """
    if len(rel) == 0:
        return 0.0
    top_label = rel[0]
    return 1.0 if top_label > 0 else 0.0
 
def build_synthetic_training(df_stations: pd.DataFrame, n_queries=300, cand_per_q=6, seed=42):
    """Generate a synthetic learning-to-rank dataset of refuelling candidates.

    Every query draws one vehicle type, one area mean price and one amount of
    litres needed; every candidate of that query draws a brand, a fuel type, a
    price, a detour and a waiting time. A candidate's utility is its saving
    (price gap times litres) minus a detour/wait penalty, plus a small bonus
    when the brand is "BrandB".

    Parameters
    ----------
    df_stations : pd.DataFrame
        Source of the distinct ``brand`` and ``fuel_type`` values to sample
        from. A missing column, or one without non-null values, falls back to
        built-in defaults.
    n_queries : int, default 300
        Number of queries to generate.
    cand_per_q : int, default 6
        Number of candidates generated per query.
    seed : int, default 42
        Seed passed to ``np.random.default_rng``.

    Returns
    -------
    pd.DataFrame
        One row per candidate with the columns ``query_id``, ``brand``,
        ``fuel_type``, ``vehicle_type``, ``price_per_liter``, ``wait_min``,
        ``detour_km``, ``liters_needed``, ``price_area_mean``, ``delta_price``,
        ``utility`` and ``rel``. ``rel`` is 1 for the highest-utility candidate
        of each query and 0 for the rest.
    """
    rng = np.random.default_rng(seed)

    def _distinct_or(column, defaults):
        # Distinct non-null values of `column`; `defaults` when absent or empty.
        found = df_stations.get(column, pd.Series(defaults)).dropna().unique().tolist()
        return found if found else defaults

    brands = _distinct_or("brand", ["BrandA", "BrandB", "BrandC"])
    fuels = _distinct_or("fuel_type", ["diesel", "gasoline"])

    records = []
    for query_id in range(n_queries):
        # Query-level draws. The order of the RNG calls determines the data
        # generated for a given seed, so it must not be rearranged.
        vehicle_type = rng.choice(["car", "van", "truck"], p=[0.55, 0.35, 0.10])
        area_mean = float(np.clip(rng.normal(1.62, 0.04), 1.45, 1.90))
        liters = float(np.clip(rng.normal(42, 8), 12, 70))

        for _candidate in range(cand_per_q):
            # Candidate-level draws, again in a fixed order.
            brand = rng.choice(brands)
            fuel = rng.choice(fuels)
            price = float(np.clip(area_mean + rng.normal(0, 0.03), 1.40, 1.95))
            detour = float(np.clip(abs(rng.normal(1.0, 1.0)), 0, 10))
            wait = float(np.clip(rng.normal(3, 2), 0, 20))

            price_gap = area_mean - price
            saving = price_gap * liters
            penalty = detour * 0.18 + 0.05 * wait
            bonus = 0.05 if brand == "BrandB" else 0.0

            records.append({
                "query_id": query_id,
                "brand": brand,
                "fuel_type": fuel,
                "vehicle_type": vehicle_type,
                "price_per_liter": price,
                "wait_min": wait,
                "detour_km": detour,
                "liters_needed": liters,
                "price_area_mean": area_mean,
                "delta_price": price_gap,
                "utility": saving - penalty + bonus,
            })

    out = pd.DataFrame(records)
    # Binary relevance: only the best-utility row of each query is labelled 1.
    out["rel"] = 0
    best_rows = out.groupby("query_id")["utility"].idxmax()
    out.loc[best_rows, "rel"] = 1
    return out
 
# Load the real stations file when it exists; otherwise fall back to a minimal
# three-station table so the generator still has brands and fuel types to use.
stations_path = DATA_DIR / "stations.csv"
if not stations_path.exists():
    stations = pd.DataFrame({
        "station_id": ["S001", "S002", "S003"],
        "brand": ["BrandA", "BrandB", "BrandC"],
        "fuel_type": ["diesel", "gasoline", "diesel"],
    })
else:
    stations = pd.read_csv(stations_path)

# Synthetic training set: 300 queries with 6 candidates each.
df = build_synthetic_training(stations, n_queries=300, cand_per_q=6)
df.head()

Unnamed: 0,query_id,brand,fuel_type,vehicle_type,price_per_liter,wait_min,detour_km,liters_needed,price_area_mean,delta_price,utility,rel
0,0,Repsol,gasoline,van,1.51987,3.255681,0.30218,48.00361,1.578401,0.058531,2.592526,1
1,0,Shell,gasoline,van,1.577897,4.758796,0.146956,48.00361,1.578401,0.000504,-0.240196,0
2,0,Repsol,gasoline,van,1.580382,3.935019,2.127241,48.00361,1.578401,-0.001981,-0.674746,0
3,0,Cepsa,diesel,van,1.589463,4.756901,0.041117,48.00361,1.578401,-0.011063,-0.776287,0
4,0,Repsol,gasoline,van,1.572855,5.445083,0.31907,48.00361,1.578401,0.005546,-0.063465,0


In [5]:
# Preprocessing and model selection.
# Numeric columns are standardised and categorical columns one-hot encoded;
# handle_unknown="ignore" keeps prediction from failing on unseen categories.
pre = ColumnTransformer([
    ("num", StandardScaler(), NUM),
    ("cat", OneHotEncoder(handle_unknown="ignore"), CAT)
])

# Both model variants consume the same feature frame.
X = df[NUM+CAT]

use_lgbm = True
try:
    # Guarded import: LightGBM is treated as optional, RandomForest is the
    # fallback. Only the import and model construction are guarded, so errors
    # in the data (e.g. a missing column) are no longer mistaken for
    # "LightGBM not available".
    import lightgbm as lgb
    model = lgb.LGBMRanker(objective="lambdarank", n_estimators=400,
                           learning_rate=0.08, num_leaves=63,
                           subsample=0.9, colsample_bytree=0.9, random_state=42)
except Exception as exc:
    # Keep the best-effort fallback, but say why it was taken instead of
    # swallowing the error silently.
    use_lgbm = False
    print(f"LightGBM no disponible ({type(exc).__name__}: {exc}); se usa RandomForest como fallback.")
    model = RandomForestRegressor(n_estimators=400, random_state=42)

if use_lgbm:
    # Ranker: binary relevance target plus per-query group sizes.
    pipe = Pipeline([("pre", pre), ("ranker", model)])
    y = df["rel"].values
    # sort=False keeps the sizes in the order the queries appear in the rows.
    groups = df.groupby("query_id", sort=False).size().to_list()
else:
    # Regressor fallback: learns the continuous utility, no grouping needed.
    pipe = Pipeline([("pre", pre), ("rf", model)])
    y = df["utility"].values
    groups = None

use_lgbm, type(model)

(True, lightgbm.sklearn.LGBMRanker)

In [6]:
# GroupKFold cross-validation by query: all candidates of a query fall in the
# same fold, so a query is never split between train and test.
gkf = GroupKFold(n_splits=5)
ndcgs, maps, p1s = [], [], []
for fold, (tr, te) in enumerate(gkf.split(X, groups=df["query_id"], y=y), 1):
    Xtr, Xte = X.iloc[tr], X.iloc[te]
    ytr, yte = y[tr], y[te]
    test_rows = df.iloc[te]

    if use_lgbm:
        # Group sizes must follow the order of the training rows, so they are
        # taken in first-appearance order (sort=False) rather than sorted by id.
        grp_train = df.iloc[tr].groupby("query_id", sort=False).size().to_list()
        pipe.fit(Xtr, ytr, ranker__group=grp_train)
        rel_true = test_rows["rel"].values
    else:
        grp_train = None
        pipe.fit(Xtr, ytr)
        # Fallback regressor: relevance is 1 for the max-utility row(s) of each query.
        rel_true = (test_rows["utility"].values == test_rows.groupby("query_id")["utility"].transform("max").values).astype(int)
    scores = pipe.predict(Xte)

    # Per-query metrics
    test_queries = test_rows["query_id"].values
    ndcg_list, ap_list, p1_list = [], [], []
    for q in np.unique(test_queries):
        # q comes from np.unique(test_queries), so idx is never empty.
        idx = np.where(test_queries == q)[0]
        # Stable sort makes the order of tied scores deterministic.
        order = np.argsort(-scores[idx], kind="stable")
        rel_q = rel_true[idx][order]
        ndcg_list.append(ndcg_at_k(rel_q, k=5))
        # AP with binary labels: rel_q is already in ranked order, so any
        # strictly decreasing dummy scores reproduce that ranking.
        ap_list.append(average_precision_score(rel_q, np.linspace(1, 0, len(rel_q))))
        p1_list.append(precision_at_1(rel_q))
    ndcgs.append(np.mean(ndcg_list))
    maps.append(np.mean(ap_list))
    p1s.append(np.mean(p1_list))
    print(f"Fold {fold}: NDCG@5={ndcgs[-1]:.3f}  MAP={maps[-1]:.3f}  P@1={p1s[-1]:.3f}")

print(" Promedio 5-folds -> NDCG@5={:.3f}  MAP={:.3f}  P@1={:.3f}".format(np.mean(ndcgs), np.mean(maps), np.mean(p1s)))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000127 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1022
[LightGBM] [Info] Number of data points in the train set: 1440, number of used features: 12




Fold 1: NDCG@5=0.955  MAP=0.939  P@1=0.883
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000099 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1022
[LightGBM] [Info] Number of data points in the train set: 1440, number of used features: 12
Fold 2: NDCG@5=0.949  MAP=0.931  P@1=0.867
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000109 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1022
[LightGBM] [Info] Number of data points in the train set: 1440, number of used features: 12








Fold 3: NDCG@5=0.973  MAP=0.964  P@1=0.933
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000129 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1022
[LightGBM] [Info] Number of data points in the train set: 1440, number of used features: 12




Fold 4: NDCG@5=0.911  MAP=0.889  P@1=0.800
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000106 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1022
[LightGBM] [Info] Number of data points in the train set: 1440, number of used features: 12
Fold 5: NDCG@5=0.941  MAP=0.921  P@1=0.850
 Promedio 5-folds -> NDCG@5=0.946  MAP=0.929  P@1=0.867




In [None]:
# Final training on the full dataset and persistence of the pipeline for the API.
if use_lgbm:
    # Group sizes in row order (sort=False), as required by the ranker.
    grp_all = df.groupby("query_id", sort=False).size().to_list()
    pipe.fit(X, y, ranker__group=grp_all)
else:
    pipe.fit(X, y)

# File name aligned with the one documented in the notebook header
# (`../models/rank_refuel_pipe.joblib`), which is the path stated to be
# compatible with the API; the previous name "rank_repostaje_pipe.joblib"
# did not match it.
out_path = MODELS_DIR / "rank_refuel_pipe.joblib"
joblib.dump(pipe, out_path)
print("Modelo guardado en:", out_path)
print("Modelo usado:", "LightGBM Ranker" if use_lgbm else "RandomForest (fallback)")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000212 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1036
[LightGBM] [Info] Number of data points in the train set: 1800, number of used features: 12
Modelo guardado en: c:\Users\Abelardo\Documents\GitHub\Desafio_R3\models\rank_refuel_pipe.joblib
Modelo usado: LightGBM Ranker
