In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:

import pandas as pd
import glob
import os

# Path to extracted CSV files
csv_path = "//content/drive/MyDrive/pcap_csv/*.csv"

# Load and merge
all_files = glob.glob(csv_path)
df_list = []

for f in all_files:
    df_temp = pd.read_csv(f)
    df_temp["filename"] = os.path.basename(f)  # keep track of source file
    df_list.append(df_temp)

df_all = pd.concat(df_list, ignore_index=True)
print("Combined dataset shape:", df_all.shape)
df_all.head()


Combined dataset shape: (7059765, 18)


Unnamed: 0,time,proto,data_len,ip_src,ip_dst,src_port,dst_port,filename,pred_label,isf_pred,hour,bytes_total,pkts_total,avg_pkt_size,byte_ratio,security_tags,severity_score,severity_level
0,1551366000.0,17.0,180.0,192.168.1.149,239.255.255.250,56760.0,1900.0,web_1page_06.csv,,,,,,,,,,
1,1551366000.0,17.0,55.0,192.168.1.149,192.168.1.1,36462.0,53.0,web_1page_06.csv,,,,,,,,,,
2,1551366000.0,17.0,326.0,192.168.1.1,192.168.1.149,53.0,36462.0,web_1page_06.csv,,,,,,,,,,
3,1551366000.0,6.0,213.0,192.168.1.149,172.217.16.227,41178.0,443.0,web_1page_06.csv,,,,,,,,,,
4,1551366000.0,6.0,1418.0,172.217.16.227,192.168.1.149,443.0,41178.0,web_1page_06.csv,,,,,,,,,,


In [3]:
mapping = pd.read_csv("/content/drive/MyDrive/pcap_mapping.csv")
print(mapping.head())

            host              file label
0  192.168.1.149  bulk_115s_01.csv  bulk
1  192.168.1.149  bulk_130s_01.csv  bulk
2  192.168.1.149  bulk_170s_01.csv  bulk
3  192.168.1.149    bulk_xs_01.csv  bulk
4  192.168.1.149    bulk_xs_02.csv  bulk


In [4]:
# Merge datasets
df_all = df_all.merge(mapping, left_on="filename", right_on="file", how="left")

print("Merged dataset shape:", df_all.shape)
print(df_all[["filename","label"]].head())
print(df_all["label"].value_counts())


Merged dataset shape: (7059765, 21)
           filename label
0  web_1page_06.csv   web
1  web_1page_06.csv   web
2  web_1page_06.csv   web
3  web_1page_06.csv   web
4  web_1page_06.csv   web
label
bulk           5867817
video           958910
web             143229
interactive      74902
idle              1812
Name: count, dtype: int64


In [5]:
df_all.head()

Unnamed: 0,time,proto,data_len,ip_src,ip_dst,src_port,dst_port,filename,pred_label,isf_pred,...,bytes_total,pkts_total,avg_pkt_size,byte_ratio,security_tags,severity_score,severity_level,host,file,label
0,1551366000.0,17.0,180.0,192.168.1.149,239.255.255.250,56760.0,1900.0,web_1page_06.csv,,,...,,,,,,,,192.168.1.149,web_1page_06.csv,web
1,1551366000.0,17.0,55.0,192.168.1.149,192.168.1.1,36462.0,53.0,web_1page_06.csv,,,...,,,,,,,,192.168.1.149,web_1page_06.csv,web
2,1551366000.0,17.0,326.0,192.168.1.1,192.168.1.149,53.0,36462.0,web_1page_06.csv,,,...,,,,,,,,192.168.1.149,web_1page_06.csv,web
3,1551366000.0,6.0,213.0,192.168.1.149,172.217.16.227,41178.0,443.0,web_1page_06.csv,,,...,,,,,,,,192.168.1.149,web_1page_06.csv,web
4,1551366000.0,6.0,1418.0,172.217.16.227,192.168.1.149,443.0,41178.0,web_1page_06.csv,,,...,,,,,,,,192.168.1.149,web_1page_06.csv,web


In [6]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7059765 entries, 0 to 7059764
Data columns (total 21 columns):
 #   Column          Dtype  
---  ------          -----  
 0   time            float64
 1   proto           float64
 2   data_len        float64
 3   ip_src          object 
 4   ip_dst          object 
 5   src_port        float64
 6   dst_port        float64
 7   filename        object 
 8   pred_label      object 
 9   isf_pred        float64
 10  hour            float64
 11  bytes_total     float64
 12  pkts_total      float64
 13  avg_pkt_size    float64
 14  byte_ratio      float64
 15  security_tags   object 
 16  severity_score  float64
 17  severity_level  object 
 18  host            object 
 19  file            object 
 20  label           object 
dtypes: float64(12), object(9)
memory usage: 1.1+ GB


In [7]:
import pandas as pd

# Convert time to datetime
df_all["time"] = pd.to_datetime(df_all["time"], unit="s")

# Aggregate to flows (5-tuple + label)
flow_features = df_all.groupby(
    ["ip_src","ip_dst","src_port","dst_port","proto","label"]
).agg(
    start_time=("time","min"),
    end_time=("time","max"),
    duration=("time", lambda x: (x.max() - x.min()).total_seconds()),
    bytes_total=("data_len","sum"),
    pkts_total=("data_len","count"),
    avg_pkt_size=("data_len","mean")
).reset_index()

print("Flow dataset shape:", flow_features.shape)
print(flow_features.head())


Flow dataset shape: (13095, 12)
           ip_src           ip_dst  src_port  dst_port  proto  label  \
0         0.0.0.0  255.255.255.255      68.0      67.0   17.0   bulk   
1    1.113.121.55     192.168.1.80     443.0   44376.0    6.0  video   
2    1.136.180.84    192.168.1.149     443.0   50452.0    6.0   idle   
3  100.24.181.241    192.168.1.149     443.0   44654.0    6.0  video   
4  100.24.181.241    192.168.1.149     443.0   44894.0    6.0  video   

                     start_time                      end_time     duration  \
0 2019-03-01 11:45:58.217596054 2019-03-01 14:20:51.915430069  9293.697834   
1 2019-04-08 07:04:26.720361948 2019-04-08 07:05:24.878137112    58.157775   
2 2019-04-08 14:19:51.288703918 2019-04-08 14:21:48.714512110   117.425808   
3 2019-03-01 09:35:57.845640898 2019-03-01 09:35:57.845640898     0.000000   
4 2019-03-01 09:47:31.415157080 2019-03-01 09:48:31.923707962    60.508550   

   bytes_total  pkts_total  avg_pkt_size  
0       1568.0         

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# =========================
# Step 0: Prepare features/target
# =========================
numeric_features = ["duration","bytes_total","pkts_total","avg_pkt_size"]
categorical_features = ["proto","src_port","dst_port"]

X = flow_features[numeric_features + categorical_features]
y = flow_features["label"]

# Encode target labels (bulk, web, video, etc.)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, stratify=y_encoded, random_state=42
)

# =========================
# Step 1: Preprocessor
# =========================
# Numeric → passthrough
# Categorical → OneHotEncode
preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)

# =========================
# Step 2: Pipeline (Preprocessor + XGBoost)
# =========================
clf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", XGBClassifier(
        n_estimators=200,
        random_state=42,
        eval_metric="mlogloss"
    ))
])

# Train
clf_pipeline.fit(X_train, y_train)

# Predictions
y_pred = clf_pipeline.predict(X_test)
y_pred_prob = clf_pipeline.predict_proba(X_test).max(axis=1)

# =========================
# Step 3: Evaluation
# =========================
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Example decode back to original labels
decoded_preds = le.inverse_transform(y_pred[:10])
print("Decoded predictions (first 10):", decoded_preds)


Classification Report:
              precision    recall  f1-score   support

        bulk       0.55      0.57      0.56        83
        idle       0.50      0.55      0.52       175
 interactive       0.67      0.42      0.52       143
       video       0.65      0.37      0.47       428
         web       0.89      0.96      0.92      3100

    accuracy                           0.85      3929
   macro avg       0.65      0.57      0.60      3929
weighted avg       0.83      0.85      0.83      3929

Confusion Matrix:
[[  47   13    0    1   22]
 [   9   96   13    1   56]
 [   1   23   60    9   50]
 [  22   18    4  159  225]
 [   7   42   12   74 2965]]
Decoded predictions (first 10): ['video' 'web' 'web' 'web' 'web' 'web' 'web' 'idle' 'web' 'web']


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# =========================
# Step 0: Prepare features/target
# =========================
numeric_features = ["duration","bytes_total","pkts_total","avg_pkt_size"]
categorical_features = ["proto","src_port","dst_port"]

X = flow_features[numeric_features + categorical_features]
y = flow_features["label"]

# Encode target labels (bulk, web, video, etc.)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, stratify=y_encoded, random_state=42
)

# =========================
# Step 1: Preprocessor
# =========================
preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)

# =========================
# Step 2: Base Classifier Pipeline
# =========================
clf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", XGBClassifier(
        n_estimators=200,
        random_state=42,
        eval_metric="mlogloss"
    ))
])

clf_pipeline.fit(X_train, y_train)

# Predictions + confidence
y_pred_class = clf_pipeline.predict(X_test)
y_pred_conf = clf_pipeline.predict_proba(X_test).max(axis=1)

# =========================
# Step 3: Anomaly Detection
# =========================
# Transform data into numeric-only (post preprocessing) for anomaly detectors
X_train_transformed = clf_pipeline.named_steps["preprocessor"].transform(X_train)
X_test_transformed = clf_pipeline.named_steps["preprocessor"].transform(X_test)

# Isolation Forest
isf = IsolationForest(contamination=0.05, random_state=42)
isf.fit(X_train_transformed)
isf_scores = -isf.decision_function(X_test_transformed)

# LOF
lof = LocalOutlierFactor(n_neighbors=20, novelty=True)
lof.fit(X_train_transformed)
lof_scores = -lof.decision_function(X_test_transformed)

# =========================
# Step 4: Fusion Dataset
# =========================
fusion_df = pd.DataFrame({
    "pred_label_enc": y_pred_class,
    "class_confidence": y_pred_conf,
    "isf_score": isf_scores,
    "lof_score": lof_scores,
})

# Weak supervision: suspicious if anomaly scores high
fusion_df["suspicious_target"] = (
    (fusion_df["isf_score"] > 0.5) | (fusion_df["lof_score"] > 0.5)
).astype(int)

print("Fusion dataset head:")
print(fusion_df.head())

# =========================
# Step 5: Meta-Classifier
# =========================
meta_model = XGBClassifier(n_estimators=200, random_state=42, eval_metric="mlogloss")
meta_model.fit(
    fusion_df.drop(columns=["suspicious_target"]),
    fusion_df["suspicious_target"]
)

# Example prediction
print("Meta prediction example:", meta_model.predict(fusion_df.drop(columns=["suspicious_target"]).iloc[[0]])[0])

# =========================
# Step 6: Evaluation (Base Classifier)
# =========================
print("\nBase Classifier Report:")
print(classification_report(y_test, y_pred_class, target_names=le.classes_))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_class))


Fusion dataset head:
   pred_label_enc  class_confidence  isf_score  lof_score  suspicious_target
0               3          0.553672  -0.006481   0.050539                  0
1               4          0.736072  -0.007197  -0.500919                  0
2               4          0.978811  -0.005530  -0.510907                  0
3               4          0.946325  -0.007197  -0.527351                  0
4               4          0.986276  -0.005530  -0.512573                  0
Meta prediction example: 0

Base Classifier Report:
              precision    recall  f1-score   support

        bulk       0.55      0.57      0.56        83
        idle       0.50      0.55      0.52       175
 interactive       0.67      0.42      0.52       143
       video       0.65      0.37      0.47       428
         web       0.89      0.96      0.92      3100

    accuracy                           0.85      3929
   macro avg       0.65      0.57      0.60      3929
weighted avg       0.83      0.

In [10]:
import gradio as gr
import numpy as np

# Function to make predictions on a single flow
def predict_flow(duration, bytes_total, pkts_total, avg_pkt_size, proto, src_port, dst_port):
    # Create dataframe for input
    input_df = pd.DataFrame([{
        "duration": duration,
        "bytes_total": bytes_total,
        "pkts_total": pkts_total,
        "avg_pkt_size": avg_pkt_size,
        "proto": proto,
        "src_port": src_port,
        "dst_port": dst_port
    }])

    # --- Base classifier ---
    pred_class = clf_pipeline.predict(input_df)[0]
    pred_class_label = le.inverse_transform([pred_class])[0]
    pred_conf = clf_pipeline.predict_proba(input_df).max()

    # Transform input for anomaly detectors
    input_transformed = clf_pipeline.named_steps["preprocessor"].transform(input_df)
    isf_score = -isf.decision_function(input_transformed)[0]
    lof_score = -lof.decision_function(input_transformed)[0]

    # --- Fusion / Meta-classifier ---
    fusion_features = pd.DataFrame([{
        "pred_label_enc": pred_class,
        "class_confidence": pred_conf,
        "isf_score": isf_score,
        "lof_score": lof_score
    }])

    suspicious = meta_model.predict(fusion_features)[0]
    suspicious_label = "Suspicious 🚨" if suspicious == 1 else "Normal ✅"

    # Return results
    return {
        "Predicted Traffic Type": pred_class_label,
        "Classifier Confidence": float(pred_conf),
        "IsolationForest Score": float(isf_score),
        "LOF Score": float(lof_score),
        "Final Decision": suspicious_label
    }

# Build Gradio interface
iface = gr.Interface(
    fn=predict_flow,
    inputs=[
        gr.Number(label="Duration (seconds)", value=60),
        gr.Number(label="Bytes Total", value=1000),
        gr.Number(label="Packets Total", value=10),
        gr.Number(label="Avg Packet Size", value=100),
        gr.Number(label="Protocol (e.g., 6=TCP, 17=UDP)", value=6),
        gr.Number(label="Source Port", value=12345),
        gr.Number(label="Destination Port", value=80),
    ],
    outputs="json",
    title="Hybrid Flow Anomaly Detector",
    description="Enter flow features to classify traffic type and detect suspicious flows."
)

iface.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://3486f4af630b1f58ca.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


