In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.inspection import permutation_importance
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

KeyboardInterrupt: 

In [None]:
# Location of the processed trips dataset, relative to the notebook.
DATA_PATH = "../Data/processed/trips_with_ml_risk.parquet"

# Fail fast with an explicit message when the input file is not there;
# otherwise load it into the working frame.
if os.path.exists(DATA_PATH):
    df = pd.read_parquet(DATA_PATH)
else:
    raise FileNotFoundError(f"No existe el archivo {DATA_PATH}")

print("Dataset cargado ")
print("Shape:", df.shape)
df.head()

In [None]:
# Columns the explainability steps below read from the loaded frame.
required = [
    "trip_distance", "trip_duration_min", "total_amount",
    "cost_per_km", "cost_per_min", "fare_to_total_ratio",
    "passenger_count", "risk_score",
    "anomaly_flag", "final_risk_level",
]

# Collect every absent column (in declaration order) so the error lists them all at once.
available = set(df.columns)
missing = [name for name in required if name not in available]
if len(missing) > 0:
    raise ValueError(f"Faltan columnas necesarias para explicabilidad: {missing}")

print("Columnas OK ")

In [None]:
# Numeric inputs fed to the anomaly model.
features_ml = [
    "trip_distance", "trip_duration_min", "total_amount",
    "cost_per_km", "cost_per_min", "fare_to_total_ratio",
    "passenger_count", "risk_score",
]

# Treat +/-inf as missing, then impute every gap with the column median
# (the median is computed after the infinities were turned into NaN).
X = (
    df[features_ml]
    .replace([np.inf, -np.inf], np.nan)
    .pipe(lambda frame: frame.fillna(frame.median()))
)

print("X listo ")
X.head()

In [None]:
# Standardise the feature matrix; the scaler is fitted on the same rows it transforms.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Refit an Isolation Forest on the scaled features so its scores can be explained below.
# NOTE(review): presumably these hyperparameters mirror the model that produced
# `anomaly_flag` upstream - confirm against the training notebook.
iso = IsolationForest(
    n_estimators=100,
    contamination=0.03,
    random_state=42,
    n_jobs=-1
)
iso.fit(X_scaled)

# decision_function score: the lower the value, the more anomalous the row.
score = iso.decision_function(X_scaled)
df["anomaly_score_rebuilt"] = score

print("Modelo reconstruido ")

# `anomaly_score` is not part of the validated `required` columns, so it is only
# shown when the dataset actually carries it (avoids a KeyError after fitting).
preview_cols = [
    col
    for col in ("anomaly_flag", "anomaly_score", "anomaly_score_rebuilt")
    if col in df.columns
]
df[preview_cols].head()

In [None]:
# Sanity check: summarise the rebuilt score within each existing anomaly label.
score_stats = {
    "mean_score": "mean",
    "median_score": "median",
    "min_score": "min",
    "max_score": "max",
    "n": "count",
}

check = df.groupby("anomaly_flag").agg(
    **{label: ("anomaly_score_rebuilt", func) for label, func in score_stats.items()}
)

check

In [None]:
def model_predict(X_scaled_local):
    """Return the Isolation Forest decision-function scores for an already scaled matrix."""
    return iso.decision_function(X_scaled_local)

# permutation_importance needs a supervised estimator and a target, which the
# Isolation Forest does not provide. A linear surrogate is therefore fitted to
# reproduce the rebuilt anomaly score, and the importances are measured on it.
y = df["anomaly_score_rebuilt"].values

surrogate = Ridge(alpha=1.0, random_state=42)
surrogate.fit(X_scaled, y)

# Fidelity of the surrogate: the importances below describe the Ridge model, so
# they only say something about the Isolation Forest when this R^2 is high.
surrogate_r2 = surrogate.score(X_scaled, y)
print(f"R2 del modelo surrogate: {surrogate_r2:.3f}")

perm = permutation_importance(
    surrogate, X_scaled, y,
    n_repeats=5,
    random_state=42,
    n_jobs=-1
)

# One row per feature, most important first.
imp = pd.DataFrame({
    "feature": features_ml,
    "importance_mean": perm.importances_mean,
    "importance_std": perm.importances_std
}).sort_values("importance_mean", ascending=False)

imp

In [None]:
def profile_by_group(df_local, group_col, features):
    """Summarise each feature per group with its median and interquartile bounds.

    Parameters
    ----------
    df_local : pandas.DataFrame
        Frame holding the grouping column and the feature columns.
    group_col : str
        Column whose distinct values define the groups.
    features : iterable of str
        Feature columns to profile.

    Returns
    -------
    pandas.DataFrame
        One row per group with ``group``, ``n`` (row count) and, for every
        feature, ``<feature>_median``, ``<feature>_p25`` and ``<feature>_p75``.
    """

    def _summarise(label, chunk):
        # Build the flat record for a single group.
        record = {"group": label, "n": len(chunk)}
        for col in features:
            values = chunk[col]
            record.update({
                f"{col}_median": float(values.median()),
                f"{col}_p25": float(values.quantile(0.25)),
                f"{col}_p75": float(values.quantile(0.75)),
            })
        return record

    return pd.DataFrame(
        [_summarise(label, chunk) for label, chunk in df_local.groupby(group_col)]
    )

# Per-label feature profile, one row per value of `anomaly_flag`.
profile_anom = profile_by_group(
    df_local=df,
    group_col="anomaly_flag",
    features=features_ml,
)
profile_anom

In [None]:
# Baseline for "normal" trips: per-feature median and population standard
# deviation (ddof=0) computed only on rows labelled "normal".
is_normal = df["anomaly_flag"].eq("normal")
normal = df.loc[is_normal].copy()

normal_features = normal[features_ml]
baseline_median = normal_features.median()
# A zero spread would make the z-score divide by zero, so it is marked as missing.
baseline_std = normal_features.std(ddof=0).replace(0, np.nan)

def local_explain(row, top_n=5):
    """Rank the features that deviate most from the normal baseline for one row.

    Each feature in ``features_ml`` is converted to a z-score against
    ``baseline_median`` / ``baseline_std``; features whose z-score cannot be
    computed (missing value, zero or missing spread, infinite result) count as 0.

    Parameters
    ----------
    row : pandas.Series
        A single record containing at least the ``features_ml`` entries.
    top_n : int, default 5
        Number of features to return.

    Returns
    -------
    pandas.Series
        Absolute z-scores of the ``top_n`` most deviating features, sorted in
        descending order and indexed by feature name.
    """
    # Rows produced by iterrows() on a mixed-dtype frame are object-typed;
    # coerce to numeric so the arithmetic and sorting below run on floats.
    values = pd.to_numeric(row[features_ml], errors="coerce")
    z = (values - baseline_median) / baseline_std
    z = z.replace([np.inf, -np.inf], np.nan).fillna(0.0)
    return z.abs().sort_values(ascending=False).head(top_n)

# Pick the 5 most extreme anomalies (lowest rebuilt score first).
is_anomaly = df["anomaly_flag"] == "anomalía"
top_anoms = df[is_anomaly].sort_values("anomaly_score_rebuilt").head(5)


def _format_drivers(drivers):
    """Render the ranked features as 'name (|z|=value)' joined by commas."""
    return ", ".join(f"{name} (|z|={value:.2f})" for name, value in drivers.items())


# One record per selected anomaly, with its score, risk level and main drivers.
explanations = [
    {
        "row_index": int(idx),
        "anomaly_score": float(row["anomaly_score_rebuilt"]),
        "final_risk_level": str(row["final_risk_level"]),
        "top_drivers": _format_drivers(local_explain(row)),
    }
    for idx, row in top_anoms.iterrows()
]

pd.DataFrame(explanations)

In [None]:
# Trip count, summed amount and average amount per (anomaly label, risk level),
# ordered from the largest summed amount down.
revenue_aggs = {
    "trips": pd.NamedAgg(column="total_amount", aggfunc="count"),
    "total_revenue": pd.NamedAgg(column="total_amount", aggfunc="sum"),
    "avg_amount": pd.NamedAgg(column="total_amount", aggfunc="mean"),
}

impact_by_anom = (
    df.groupby(["anomaly_flag", "final_risk_level"])
    .agg(**revenue_aggs)
    .sort_values("total_revenue", ascending=False)
)

impact_by_anom

In [None]:
# Derive monitoring thresholds from percentiles observed among the anomalies.
anom = df[df["anomaly_flag"] == "anomalía"]

# (rule name, source column, quantile) for every threshold to compute.
threshold_specs = [
    ("cost_per_km_p95_anom", "cost_per_km", 0.95),
    ("cost_per_min_p95_anom", "cost_per_min", 0.95),
    ("total_amount_p99_anom", "total_amount", 0.99),
    ("duration_p99_anom", "trip_duration_min", 0.99),
]

rules = {
    rule_name: float(anom[column].quantile(q))
    for rule_name, column, q in threshold_specs
}

# Single-row frame so the thresholds can be displayed and exported as a table.
rules_df = pd.DataFrame([rules])
rules_df

In [None]:
# Destination files for the explainability reports.
OUT_IMPORTANCE = "../reports/06_feature_importance_surrogate.csv"
OUT_PROFILE = "../reports/06_profile_normal_vs_anomaly.csv"
OUT_RULES = "../reports/06_rules_thresholds_from_anomalies.csv"
OUT_LOCAL = "../reports/06_top_anomalies_local_explanations.csv"

# Map every output path to the frame written there.
reports = {
    OUT_IMPORTANCE: imp,
    OUT_PROFILE: profile_anom,
    OUT_RULES: rules_df,
    OUT_LOCAL: pd.DataFrame(explanations),
}

for out_path, frame in reports.items():
    # to_csv does not create missing folders, so make sure the target exists
    # (a fresh checkout may not have the reports directory yet).
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    frame.to_csv(out_path, index=False)

print(" Reportes guardados:")
for out_path in reports:
    print("-", out_path)
