In [1]:
# consolidated_pipeline.py

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import boxcox
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import silhouette_score
import time, json

sns.set(style="whitegrid")


In [2]:
# ------------------------- #
# 0. CONFIG
# ------------------------- #
TRAIN_PATH = r'X:\Dissertacao\python_projects\dataset\ISCX-Bot-2014\ISCX_csv\Testing_file.csv'
TEST_PATH  = r'X:\Dissertacao\python_projects\dataset\ISCX-Bot-2014\ISCX_csv\Training_file.csv'
RANDOM_STATE = 42
LOF_BATCH = 200_000
SIL_BATCH = 40_000

# ------------------------- #
# 1. LOAD DATA
# ------------------------- #
def load_datasets(train_path, test_path):
    df_tr = pd.read_csv(train_path, encoding="ISO-8859-1")
    df_te = pd.read_csv(test_path, encoding="ISO-8859-1")
    return df_tr, df_te

df_train, df_test = load_datasets(TRAIN_PATH, TEST_PATH)


In [3]:
# ------------------------- #
# 2. BASIC CLEANING
# ------------------------- #
for df in [df_train, df_test]:
    df["Info"].fillna("Unknown", inplace=True)
    df.dropna(subset=["Source","Destination"], inplace=True)
    df = df[df["Time"] >= 0]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Info"].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Info"].fillna("Unknown", inplace=True)


In [4]:
# ------------------------- #
# 3. SAFE CATEGORICAL ENCODING
# ------------------------- #
def build_mapping(train_col, test_col, name):
    all_values = pd.concat([train_col, test_col]).unique()
    mapping = {val: idx for idx, val in enumerate(all_values)}
    with open(f"{name}_mapping.json","w") as f: json.dump(mapping,f)
    return mapping

protocol_mapping = build_mapping(df_train["Protocol"], df_test["Protocol"], "protocol")
df_train["Protocol_enc"] = df_train["Protocol"].map(protocol_mapping).fillna(-1).astype(int)
df_test["Protocol_enc"]  = df_test["Protocol"].map(protocol_mapping).fillna(-1).astype(int)

source_mapping = build_mapping(df_train["Source"], df_test["Source"], "source")
df_train["Source_enc"] = df_train["Source"].map(source_mapping).fillna(-1).astype(int)
df_test["Source_enc"]  = df_test["Source"].map(source_mapping).fillna(-1).astype(int)

dest_mapping = build_mapping(df_train["Destination"], df_test["Destination"], "destination")
df_train["Destination_enc"] = df_train["Destination"].map(dest_mapping).fillna(-1).astype(int)
df_test["Destination_enc"]  = df_test["Destination"].map(dest_mapping).fillna(-1).astype(int)

df_train.drop(["Protocol","Source","Destination"], axis=1, inplace=True)
df_test.drop(["Protocol","Source","Destination"], axis=1, inplace=True)


In [5]:
# ------------------------- #
# 4. FEATURE ENGINEERING
# ------------------------- #
def build_features(df):
    df["Time_Diff"] = df.groupby("Source_enc")["Time"].diff()
    df["Time_Diff"] = df["Time_Diff"].fillna(df.groupby("Source_enc")["Time"].transform("median")).fillna(0.0)

    def packet_rate(series):
        if len(series)<2: return 0.0
        span = series.max() - series.min()
        return len(series)/(span+1e-6)
    df["Packet_Rate"] = df.groupby("Source_enc")["Time"].transform(packet_rate)

    df["Inter_Arrival_Time"] = df.groupby("Source_enc")["Time_Diff"].transform(lambda x: x.rolling(10,min_periods=1).mean())
    df["Inter_Arrival_Time"] = df["Inter_Arrival_Time"].clip(lower=1e-6)

    df["Burst_Rate"] = np.where(df["Inter_Arrival_Time"]>1e-6, 1/df["Inter_Arrival_Time"], 0.0)

    # variability features
    df["Length_Mean"] = df.groupby("Source_enc")["Length"].transform("mean")
    df["Length_Std"]  = df.groupby("Source_enc")["Length"].transform("std").fillna(0)
    df["Pkt_Per_Src"] = df.groupby("Source_enc")["Length"].transform("count")

    # transforms
    df["Log_IATime"] = np.log1p(df["Inter_Arrival_Time"])
    df["Log_BRate"]  = np.log1p(df["Burst_Rate"])
    df["BoxCox_Length"], _ = boxcox(df["Length"]+1e-3)
    try:
        df["BoxCox_PRate"], _ = boxcox(df["Packet_Rate"]+1e-6)
    except Exception:
        df["BoxCox_PRate"] = np.log1p(df["Packet_Rate"])
    return df

df_train = build_features(df_train)
df_test  = build_features(df_test)

In [6]:
# ------------------------- #
# 5. SCALE
# ------------------------- #
NUM_FEATURES = [
    "Time_Diff","Log_IATime","Log_BRate",
    "BoxCox_Length","BoxCox_PRate",
    "Length_Mean","Length_Std","Pkt_Per_Src"
]
scaler = MinMaxScaler().fit(df_train[NUM_FEATURES])
df_train_scaled = pd.DataFrame(scaler.transform(df_train[NUM_FEATURES]), columns=NUM_FEATURES)
df_test_scaled  = pd.DataFrame(scaler.transform(df_test[NUM_FEATURES]), columns=NUM_FEATURES)


In [7]:
# ------------------------- #
# 6. ISOLATION FOREST
# ------------------------- #
iso = IsolationForest(n_estimators=500, contamination="auto", random_state=RANDOM_STATE, verbose=0)
iso.fit(df_train_scaled)
df_train_scaled["Anomaly_IForest"] = iso.predict(df_train_scaled)
df_test_scaled["Anomaly_IForest"]  = iso.predict(df_test_scaled)


In [8]:
# ------------------------- #
# 7. LOF
# ------------------------- #
lof = LocalOutlierFactor(n_neighbors=30, contamination="auto", metric="manhattan", n_jobs=-1)
def lof_predict_batched(X, batch=LOF_BATCH, lof_model=None):
    if lof_model is None: lof_model=LocalOutlierFactor(n_neighbors=30)
    y=np.zeros(len(X),dtype=int)
    for i in range(0,len(X),batch):
        y[i:i+batch]=lof_model.fit_predict(X.iloc[i:i+batch])
    return y

df_train_scaled["Anomaly_LOF"] = lof_predict_batched(df_train_scaled)
df_test_scaled["Anomaly_LOF"]  = lof_predict_batched(df_test_scaled)




In [9]:
# ------------------------- #
# 8. METRICS & SILHOUETTE
# ------------------------- #
def anomaly_percent(df,col): return (df[col]==-1).sum()/len(df)*100
print("IForest train %:",anomaly_percent(df_train_scaled,"Anomaly_IForest"))
print("IForest test  %:",anomaly_percent(df_test_scaled,"Anomaly_IForest"))
print("LOF train %:",anomaly_percent(df_train_scaled,"Anomaly_LOF"))
print("LOF test  %:",anomaly_percent(df_test_scaled,"Anomaly_LOF"))

def batched_silhouette(df,labels_col,batch=SIL_BATCH):
    scores=[]
    for i in range(0,len(df),batch):
        b=df.iloc[i:i+batch]
        if len(b[labels_col].unique())<2: continue
        scores.append(silhouette_score(b[NUM_FEATURES], b[labels_col]))
    return np.mean(scores) if scores else np.nan

print("Silhouette IForest train:",batched_silhouette(df_train_scaled,"Anomaly_IForest"))
print("Silhouette LOF train:",batched_silhouette(df_train_scaled,"Anomaly_LOF"))
print("Silhouette IForest test:",batched_silhouette(df_test_scaled,"Anomaly_IForest"))
print("Silhouette LOF test:",batched_silhouette(df_test_scaled,"Anomaly_LOF"))


IForest train %: 11.984405944417386
IForest test  %: 44.84624962852581
LOF train %: 7.558725532616693
LOF test  %: 7.731882444795474
Silhouette IForest train: 0.4265220563175156
Silhouette LOF train: 0.04784538270065358
Silhouette IForest test: 0.355805462166546


KeyboardInterrupt: 

In [None]:
# ------------------------- #
# 9. PLOTS
# ------------------------- #
def feature_sensitivity(df,features,label_col):
    res={}
    for f in features:
        mean_norm=df.loc[df[label_col]==1,f].mean()
        mean_anom=df.loc[df[label_col]==-1,f].mean()
        res[f]=mean_anom-mean_norm
    return res

for model in ["IForest","LOF"]:
    fs_train=feature_sensitivity(df_train_scaled,NUM_FEATURES,f"Anomaly_{model}")
    fs_test =feature_sensitivity(df_test_scaled,NUM_FEATURES,f"Anomaly_{model}")
    plt.figure(figsize=(8,5)); pd.Series(fs_train).sort_values().plot(kind="barh",color="orange"); plt.title(f"{model} Train Feature Sensitivity"); plt.show()
    plt.figure(figsize=(8,5)); pd.Series(fs_test).sort_values().plot(kind="barh",color="green"); plt.title(f"{model} Test Feature Sensitivity"); plt.show()

def kde_plots(df,features,label_col,model_name,dataset):
    for f in features:
        plt.figure(figsize=(6,4))
        sns.kdeplot(df.loc[df[label_col]==1,f],label="Normal",fill=True,color="blue")
        sns.kdeplot(df.loc[df[label_col]==-1,f],label="Anomaly",fill=True,color="red")
        plt.title(f"{model_name}-{dataset}-{f}")
        plt.legend(); plt.show()

kde_plots(df_train_scaled,NUM_FEATURES,"Anomaly_IForest","IForest","Train")
kde_plots(df_train_scaled,NUM_FEATURES,"Anomaly_LOF","LOF","Train")
kde_plots(df_test_scaled,NUM_FEATURES,"Anomaly_IForest","IForest","Test")
kde_plots(df_test_scaled,NUM_FEATURES,"Anomaly_LOF","LOF","Test")
