# Interpretabilidad Pipe1/Pipe2/Pipe3 (FP/FN/TP/TN)


In [7]:
import pandas as pd

# Peek at each pipeline's exported test predictions: column names plus the
# first two rows, so the schemas can be compared before loading them for real.
PRED_FILES = ("p1_test_preds.parquet", "p2_test_preds.parquet", "p3_test_preds.parquet")

for f in PRED_FILES:
    dfp = pd.read_parquet(f)
    print(f, "->", list(dfp.columns))
    print(dfp.iloc[:2], "\n")


p1_test_preds.parquet -> ['pipeline', 'timestamp', 'y_true', 'y_pred', 'p_attack']
     pipeline           timestamp  y_true  y_pred  p_attack
0  p1_pca_mlp 2017-07-07 03:23:00       1       1  0.962043
1  p1_pca_mlp 2017-07-07 03:23:00       1       1  0.962855 

p2_test_preds.parquet -> ['pipeline', 'timestamp', 'y_true', 'y_pred', 'p_attack', 'window_start', 'window_end', 'window_size', 'stride']
         pipeline           timestamp  y_true  y_pred  p_attack  window_start  \
0  p2_pca_gru_tcn 2017-07-07 03:23:00       1       1  0.679983             0   
1  p2_pca_gru_tcn 2017-07-07 03:23:00       1       1  0.902339             5   

   window_end  window_size  stride  
0          19           20       5  
1          24           20       5   

p3_test_preds.parquet -> ['pipeline', 'timestamp', 'y_true', 'y_pred', 'p_attack', 'window_start', 'window_end', 'window_size', 'stride']
             pipeline           timestamp  y_true  y_pred  p_attack  \
0  p3_pca_transformer 2017-07-0

In [8]:
import pandas as pd
import numpy as np

# NOTE(review): EXPORT_DIR is defined here but the reads below use bare
# filenames (resolved against the working directory) — confirm whether the
# files are meant to be loaded from this directory instead.
EXPORT_DIR = "exports"

# One prediction frame per pipeline.
p1 = pd.read_parquet("p1_test_preds.parquet")
p2 = pd.read_parquet("p2_test_preds.parquet")
p3 = pd.read_parquet("p3_test_preds.parquet")

# Report row/column counts and the column index of each frame.
for _tag, _frame in (("p1:", p1), ("p2:", p2), ("p3:", p3)):
    print(_tag, _frame.shape, _frame.columns)


p1: (424581, 5) Index(['pipeline', 'timestamp', 'y_true', 'y_pred', 'p_attack'], dtype='object')
p2: (84913, 9) Index(['pipeline', 'timestamp', 'y_true', 'y_pred', 'p_attack', 'window_start',
       'window_end', 'window_size', 'stride'],
      dtype='object')
p3: (84913, 9) Index(['pipeline', 'timestamp', 'y_true', 'y_pred', 'p_attack', 'window_start',
       'window_end', 'window_size', 'stride'],
      dtype='object')


In [9]:
from sklearn.metrics import confusion_matrix, classification_report

def standardize_cols(df):
    """Return a copy of ``df`` whose attack-probability column is named ``p_attack``.

    The first name from a fixed list of accepted aliases that is present in
    ``df.columns`` is renamed to ``p_attack``. When none of the aliases is
    present, the copy is returned with its columns unchanged. The input frame
    is never modified.
    """
    # Accepted aliases for the attack probability, in priority order.
    aliases = ("p_attack", "prob_attack", "proba_attack", "y_prob_attack", "p1", "prob1")
    found = next((name for name in aliases if name in df.columns), None)

    # If nothing matched, `found` is None and the mapping touches no column.
    return df.copy().rename(columns={found: "p_attack"})

def add_fp_fn(df):
    """Return a copy of ``df`` with 0/1 indicator columns FP, FN, TP and TN.

    Each indicator is derived by comparing ``y_true`` and ``y_pred`` against
    the labels 0 and 1:

    * FP: y_true == 0 and y_pred == 1
    * FN: y_true == 1 and y_pred == 0
    * TP: y_true == 1 and y_pred == 1
    * TN: y_true == 0 and y_pred == 0

    The input frame is left untouched.
    """
    out = df.copy()

    actual_pos, actual_neg = out["y_true"].eq(1), out["y_true"].eq(0)
    pred_pos, pred_neg = out["y_pred"].eq(1), out["y_pred"].eq(0)

    # Insertion order fixes the order in which the columns are appended.
    outcome_masks = {
        "FP": actual_neg & pred_pos,
        "FN": actual_pos & pred_neg,
        "TP": actual_pos & pred_pos,
        "TN": actual_neg & pred_neg,
    }
    for label, mask in outcome_masks.items():
        out[label] = mask.astype(int)
    return out

def quick_report(name, df):
    """Print a confusion matrix and classification report for one pipeline.

    Parameters
    ----------
    name : str
        Label printed in the section header.
    df : pandas.DataFrame
        Frame with ``y_true`` and ``y_pred`` columns holding 0/1 labels.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    y_true, y_pred = df["y_true"], df["y_pred"]

    # Pin the label order so the matrix is always 2x2, laid out as
    # [[TN, FP], [FN, TP]], even when a frame happens to contain one class only
    # (without `labels` the matrix would collapse to 1x1 in that case).
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    print(f"\n=== {name} ===")
    print("Confusion:\n", cm)
    print(classification_report(y_true, y_pred, digits=4))

# Standardize the probability column name and attach FP/FN/TP/TN indicators
# to each pipeline's predictions.
p1s, p2s, p3s = (add_fp_fn(standardize_cols(_raw)) for _raw in (p1, p2, p3))

# Print the confusion matrix and classification report per pipeline.
for _label, _frame in (("P1", p1s), ("P2", p2s), ("P3", p3s)):
    quick_report(_label, _frame)



=== P1 ===
Confusion:
 [[294124     33]
 [  1342 129082]]
              precision    recall  f1-score   support

           0     0.9955    0.9999    0.9977    294157
           1     0.9997    0.9897    0.9947    130424

    accuracy                         0.9968    424581
   macro avg     0.9976    0.9948    0.9962    424581
weighted avg     0.9968    0.9968    0.9968    424581


=== P2 ===
Confusion:
 [[55465  3361]
 [  362 25725]]
              precision    recall  f1-score   support

           0     0.9935    0.9429    0.9675     58826
           1     0.8844    0.9861    0.9325     26087

    accuracy                         0.9562     84913
   macro avg     0.9390    0.9645    0.9500     84913
weighted avg     0.9600    0.9562    0.9568     84913


=== P3 ===
Confusion:
 [[57961   865]
 [  686 25401]]
              precision    recall  f1-score   support

           0     0.9883    0.9853    0.9868     58826
           1     0.9671    0.9737    0.9704     26087

    accuracy 

In [None]:
def ensure_timestamp(df):
    """Return a copy of ``df`` with a parsed ``timestamp`` plus ``day`` and ``hour``.

    ``timestamp`` is converted with ``pd.to_datetime``; ``day`` holds the
    calendar date and ``hour`` the hour-of-day of that converted value. The
    input frame is not modified.
    """
    parsed = pd.to_datetime(df["timestamp"])
    # `assign` works on a copy: it replaces `timestamp` in place and appends
    # `day` and `hour` in that order.
    return df.assign(timestamp=parsed, day=parsed.dt.date, hour=parsed.dt.hour)

# Add the parsed timestamp plus day/hour columns to each frame.
p1t, p2t, p3t = (ensure_timestamp(_frame) for _frame in (p1s, p2s, p3s))

# Show the distinct calendar days present in each pipeline's predictions.
for _label, _frame in (("P1 days:", p1t), ("P2 days:", p2t), ("P3 days:", p3t)):
    print(_label, np.unique(_frame["day"]))


P1 days: [datetime.date(2017, 7, 7)]
P2 days: [datetime.date(2017, 7, 7)]
P3 days: [datetime.date(2017, 7, 7)]


In [12]:
def fpfn_by_day(df):
    """Aggregate FP/FN/TP/TN counts and error rates per ``day``.

    Returns one row per distinct ``day`` with the summed indicator columns,
    the group size ``n``, and

    * ``FP_rate`` = FP / (FP + TN)
    * ``FN_rate`` = FN / (FN + TP)

    A rate is NaN when its denominator is zero.
    """
    per_day = df.groupby("day")

    table = per_day[["FP", "FN", "TP", "TN"]].sum()
    table["n"] = per_day.size()  # aligned on the shared group index
    table = table.reset_index()

    # Zero denominators become NaN so the division yields NaN for them.
    actual_negatives = (table["FP"] + table["TN"]).replace(0, np.nan)
    actual_positives = (table["FN"] + table["TP"]).replace(0, np.nan)
    table["FP_rate"] = table["FP"] / actual_negatives
    table["FN_rate"] = table["FN"] / actual_positives
    return table

# Per-day confusion counts and error rates for each pipeline.
d1, d2, d3 = (fpfn_by_day(_frame) for _frame in (p1t, p2t, p3t))

for _header, _table in (("P1:\n", d1), ("\nP2:\n", d2), ("\nP3:\n", d3)):
    print(_header, _table)


P1:
           day  FP    FN      TP      TN       n   FP_rate  FN_rate
0  2017-07-07  33  1342  129082  294124  424581  0.000112  0.01029

P2:
           day    FP   FN     TP     TN      n   FP_rate   FN_rate
0  2017-07-07  3361  362  25725  55465  84913  0.057135  0.013877

P3:
           day   FP   FN     TP     TN      n   FP_rate   FN_rate
0  2017-07-07  865  686  25401  57961  84913  0.014704  0.026297


In [13]:
def fpfn_by_hour(df):
    """Aggregate FP/FN/TP/TN counts and error rates per ``hour``.

    Returns one row per distinct ``hour``, ordered by ``hour``, with the
    summed indicator columns, the group size ``n``, and

    * ``FP_rate`` = FP / (FP + TN)
    * ``FN_rate`` = FN / (FN + TP)

    A rate is NaN when its denominator is zero.
    """
    per_hour = df.groupby("hour")

    counts = per_hour[["FP", "FN", "TP", "TN"]].sum()
    counts["n"] = per_hour.size()  # aligned on the shared group index
    counts = counts.reset_index()

    # Replace zero denominators with NaN so those rates come out as NaN.
    negatives = (counts["FP"] + counts["TN"]).replace(0, np.nan)
    positives = (counts["FN"] + counts["TP"]).replace(0, np.nan)
    counts["FP_rate"] = counts["FP"] / negatives
    counts["FN_rate"] = counts["FN"] / positives

    return counts.sort_values("hour")

# Per-hour confusion counts and error rates for each pipeline.
h1, h2, h3 = (fpfn_by_hour(_frame) for _frame in (p1t, p2t, p3t))

# Only the first rows of each hourly table are shown.
for _header, _table in (
    ("P1 hour head:\n", h1),
    ("\nP2 hour head:\n", h2),
    ("\nP3 hour head:\n", h3),
):
    print(_header, _table.head())


P1 hour head:
    hour  FP  FN      TP     TN       n   FP_rate   FN_rate
0     3  23  56   24240  35987   60306  0.000639  0.002305
1     4   0  52  104110  67530  171692  0.000000  0.000499
2     5   0   0       0   1552    1552  0.000000       NaN
3     8   0   0       0      2       2  0.000000       NaN
4     9   0   1       1  49981   49983  0.000000  0.500000

P2 hour head:
    hour    FP   FN     TP     TN      n   FP_rate   FN_rate
0     3   568   12   4843   6635  12058  0.078856  0.002472
1     4  2447   23  20802  11066  34338  0.181085  0.001104
2     5     2    0      0    309    311  0.006431       NaN
3     9   115    0      1   9881   9997  0.011505  0.000000
4    10   135  239     59  13881  14314  0.009632  0.802013

P3 hour head:
    hour   FP   FN     TP     TN      n   FP_rate   FN_rate
0     3  138  176   4679   7065  12058  0.019159  0.036251
1     4  579  103  20722  12934  34338  0.042848  0.004946
2     5    3    0      0    308    311  0.009646       NaN
3  

In [15]:
def top_cases(df, kind="FP", k=20):
    """Return the ``k`` most confidently wrong rows of the requested error kind.

    * ``kind="FP"`` (y_true=0, y_pred=1): rows with the highest ``p_attack``.
    * ``kind="FN"`` (y_true=1, y_pred=0): rows with the lowest ``p_attack``.

    Only the ``timestamp``, ``y_true``, ``y_pred`` and ``p_attack`` columns are
    returned.

    Raises
    ------
    ValueError
        If ``kind`` is neither ``"FP"`` nor ``"FN"``.
    """
    if kind not in ("FP", "FN"):
        raise ValueError("kind debe ser FP o FN")

    # False positives are ranked from highest probability down, false
    # negatives from lowest probability up.
    lowest_first = kind == "FN"

    flagged = df[df[kind] == 1]
    ranked = flagged.sort_values("p_attack", ascending=lowest_first)
    return ranked.head(k)[["timestamp", "y_true", "y_pred", "p_attack"]]

# Ten most confident false positives, then ten most confident false negatives, for P3.
for _header, _kind in (("P3 - top FP:\n", "FP"), ("\nP3 - top FN:\n", "FN")):
    print(_header, top_cases(p3t, _kind, 10))


P3 - top FP:
                 timestamp  y_true  y_pred  p_attack
39771 2017-07-07 04:15:00       0       1  0.999897
12172 2017-07-07 04:00:00       0       1  0.999863
39770 2017-07-07 04:15:00       0       1  0.999819
14545 2017-07-07 04:01:00       0       1  0.999767
45180 2017-07-07 04:51:00       0       1  0.999752
8910  2017-07-07 03:58:00       0       1  0.999728
14544 2017-07-07 04:01:00       0       1  0.999720
21424 2017-07-07 04:05:00       0       1  0.999692
21422 2017-07-07 04:05:00       0       1  0.999669
21429 2017-07-07 04:05:00       0       1  0.999667

P3 - top FN:
                 timestamp  y_true  y_pred  p_attack
67709 2017-07-07 10:36:00       1       0  0.000078
84569 2017-07-07 12:44:00       1       0  0.000079
67718 2017-07-07 10:36:00       1       0  0.000081
60689 2017-07-07 10:15:00       1       0  0.000082
66855 2017-07-07 10:34:00       1       0  0.000083
66698 2017-07-07 10:32:00       1       0  0.000083
67512 2017-07-07 10:35:00       1  