# 02 · Hábitos de Conducción Eficiente (Clasificación + Clustering)
Este notebook genera un dataset **sintético** de telemetría y construye modelos para:
- **Clasificación** (eficiente / no eficiente) con RandomForest; un registro se etiqueta como eficiente cuando su consumo es ≤ al percentil 40 de su vehículo.
- **Clustering** de estilos (KMeans) para segmentar hábitos.

Incluye métricas, importancia de variables y ejemplos de reglas de negocio.

In [1]:
import numpy as np, pandas as pd 
from pathlib import Path 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.cluster import KMeans 
from sklearn.preprocessing import StandardScaler 
 
# Synthetic telemetry dataset.
# NOTE: the order of the rng draws below (vehicle_id first, then each numeric
# column top to bottom) determines the generated values; keep it unchanged so
# the same seed keeps producing the same data.
SEED = 42
EFFICIENT_PERCENTILE = 40  # per-vehicle consumption percentile used as the "efficient" cutoff

rng = np.random.default_rng(SEED)
N = 5000
df = pd.DataFrame({
    "vehicle_id": rng.choice(["V001","V002","V003","V004"], size=N, p=[0.4,0.3,0.2,0.1]),
    "avg_speed_kmh": np.clip(rng.normal(68, 12, N), 20, 120),
    "hard_brakes_per_100km": np.clip(rng.normal(3.5, 2.0, N), 0, 15),
    "accel_events_per_100km": np.clip(rng.normal(5.0, 2.5, N), 0, 20),
    "idle_ratio": np.clip(rng.normal(0.08, 0.05, N), 0, 0.6),
    "payload_ratio": np.clip(rng.normal(0.45, 0.2, N), 0, 1.0)
})

# Fuel consumption (l/100km), computed as a deterministic function of the
# telemetry columns (no noise term is added): a quadratic penalty around
# 70 km/h plus linear contributions from idling, braking, acceleration and payload.
base = 6.5
df["cons_l_100km"] = (base
    + 0.03*(df["avg_speed_kmh"]-70)**2/100
    + 0.4*df["idle_ratio"]*10
    + 0.08*df["hard_brakes_per_100km"]
    + 0.05*df["accel_events_per_100km"]
    + 1.5*df["payload_ratio"])

# Binary label: efficient (1) when consumption <= the P40 of the row's own vehicle.
# transform() broadcasts each vehicle's threshold back onto its rows, so the label
# is computed in one vectorized comparison instead of a per-vehicle masking loop.
thr_per_row = df.groupby("vehicle_id")["cons_l_100km"].transform(
    lambda s: np.percentile(s, EFFICIENT_PERCENTILE)
)
df["efficient"] = (df["cons_l_100km"] <= thr_per_row).astype("int64")

df.head()

Unnamed: 0,vehicle_id,avg_speed_kmh,hard_brakes_per_100km,accel_events_per_100km,idle_ratio,payload_ratio,cons_l_100km,efficient
0,V003,54.812905,6.516154,2.667774,0.107071,0.345653,8.170637,0
1,V002,61.504228,7.615604,4.864613,0.009676,0.453304,8.092793,0
2,V003,71.264013,3.969181,4.053359,0.038057,0.584154,8.049141,0
3,V002,60.46016,2.223142,7.049727,0.058486,0.619027,8.220127,0
4,V001,64.659603,3.05672,7.119804,0.119343,0.513663,8.356948,0


## Clasificación: eficiente / no eficiente (RandomForest)

In [2]:
# Feature columns fed to the classifier; the target is the binary "efficient" flag.
FEATS = [
    "avg_speed_kmh",
    "hard_brakes_per_100km",
    "accel_events_per_100km",
    "idle_ratio",
    "payload_ratio",
]
X = df[FEATS].values
y = df["efficient"].values

# Hold out 25% of the rows, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    class_weight="balanced",
    random_state=42,
)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

# Hold-out metrics
print(classification_report(y_test, pred, digits=3))
print("Matriz de confusión: ", confusion_matrix(y_test, pred))

# Feature importances, highest first
importances = clf.feature_importances_
ranked = sorted(zip(FEATS, importances), key=lambda pair: pair[1], reverse=True)
for name, score in ranked:
    print(f"{name:28s} {score:.3f}")

              precision    recall  f1-score   support

           0      0.956     0.955     0.955       750
           1      0.932     0.934     0.933       500

    accuracy                          0.946      1250
   macro avg      0.944     0.944     0.944      1250
weighted avg      0.946     0.946     0.946      1250

Matriz de confusión:  [[716  34]
 [ 33 467]]
payload_ratio                0.455
idle_ratio                   0.210
hard_brakes_per_100km        0.169
accel_events_per_100km       0.116
avg_speed_kmh                0.049


## Clustering de estilos (KMeans)

In [3]:
# Standardize the feature columns before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[FEATS].values)

# Fit 4 clusters and store each row's assigned cluster label on the frame.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
kmeans.fit(X_scaled)
df["cluster"] = kmeans.labels_

# Per-cluster mean and median of the features, consumption and efficiency label.
profile_cols = FEATS + ["cons_l_100km", "efficient"]
profile = df.groupby("cluster")[profile_cols].agg(["mean", "median"])
profile

Unnamed: 0_level_0,avg_speed_kmh,avg_speed_kmh,hard_brakes_per_100km,hard_brakes_per_100km,accel_events_per_100km,accel_events_per_100km,idle_ratio,idle_ratio,payload_ratio,payload_ratio,cons_l_100km,cons_l_100km,efficient,efficient
Unnamed: 0_level_1,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
0,65.62511,65.750865,3.055565,3.088159,5.66781,5.623686,0.069578,0.06972,0.675422,0.662577,8.361857,8.329676,0.122168,0.0
1,69.998033,69.753149,5.847088,5.749488,5.617948,5.677659,0.069794,0.069579,0.405374,0.412806,8.172179,8.153823,0.298201,0.0
2,75.694761,75.518917,2.785986,2.872086,5.207574,5.15935,0.127009,0.125981,0.408131,0.418492,8.145392,8.124825,0.325246,0.0
3,60.400292,61.225145,2.605798,2.677247,4.046071,3.945946,0.060714,0.062684,0.323511,0.334866,7.699227,7.70205,0.838811,1.0


## Reglas de negocio derivadas de los perfiles (ejemplo)

In [4]:
# Each entry: (column, limit on the cluster mean, recommendation emitted when the mean exceeds it).
rule_table = [
    ("idle_ratio", 0.12, "Reducir ralentí (>12%) con apagado en esperas >2 min"),
    ("hard_brakes_per_100km", 5, "Anticipar frenadas (evitar >5 fuertes/100km)"),
    ("accel_events_per_100km", 7, "Acelerar progresivo (objetivo <7/100km)"),
    ("avg_speed_kmh", 90, "Mantener velocidad de crucero entre 70–90 km/h"),
    ("payload_ratio", 0.7, "Optimizar carga: revisar consolidación/envíos"),
]
# Message used when none of the limits above is exceeded.
fallback_msgs = ["Hábitos correctos; mantener formación"]

rules = []
for c in sorted(df["cluster"].unique()):
    members = df[df["cluster"] == c]
    triggered = [
        text
        for column, limit, text in rule_table
        if members[column].mean() > limit
    ]
    rules.append((c, triggered or list(fallback_msgs)))

# Print the recommendations cluster by cluster.
for c, msgs in rules:
    print(f"Cluster {c}:")
    for line in msgs:
        print("  -", line)

Cluster 0:
  - Hábitos correctos; mantener formación
Cluster 1:
  - Anticipar frenadas (evitar >5 fuertes/100km)
Cluster 2:
  - Reducir ralentí (>12%) con apagado en esperas >2 min
Cluster 3:
  - Hábitos correctos; mantener formación
