In [1]:
# Mount Google Drive at /content/drive; the scaler, encoder, embedding and model
# files read later in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install gradio
!pip install transformers

Collecting gradio
  Downloading gradio-5.13.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.7-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.6.0 (from gradio)
  Downloading gradio_client-1.6.0-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.9.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.meta

In [3]:
import gradio as gr
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import DistilBertModel
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.data import DataLoader, TensorDataset
from io import BytesIO
import base64

In [4]:
# Device configuration: prefer CUDA when PyTorch can see a GPU, else use the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Constants
# Embedding size handed to EmbeddingHandler when it is constructed.
EMBEDDING_DIM = 32
# The 20 input columns fed to the model, in the order they are embedded.
# NOTE: 'ct_src_ ltm' really contains a space; it matches the column name
# assigned in preprocess_data, so do not "fix" it here alone.
TOP_20_FEATURES = [
    'sttl',
    'ct_state_ttl',
    'Ltime',
    'Stime',
    'Dload',
    'dttl',
    'tcprtt',
    'dmeansz',
    'ackdat',
    'synack',
    'dsport',
    'Dpkts',
    'Sload',
    'ct_dst_ltm',
    'dbytes',
    'state',
    'Dintpkt',
    'ct_src_ ltm',
    'ct_dst_sport_ltm',
    'ct_src_dport_ltm',
]

# Torch 2.x workaround kept from the original notebook: import the dynamo
# submodule eagerly and tolerate the AttributeError it may raise.
try:
    import torch._dynamo
except AttributeError:
    # Nothing to work around in this environment.
    pass

# Embedding Handler Class
class EmbeddingHandler:
    """Turn tabular feature columns into concatenated embedding vectors.

    The handler keeps one callable embedding layer per feature together with
    the per-feature minimum/maximum used to convert raw values into lookup
    indices. Nothing is trained here; the state is populated by ``load``.

    Args:
        embedding_dim: Size of the 2-entry output embedding created in ``load``.
        device: Device for index tensors and loaded weights (string or
            ``torch.device``). ``None`` selects CUDA when available, else CPU,
            so the class no longer relies on a module-level ``device`` global.
    """

    def __init__(self, embedding_dim=32, device=None):
        self.embedding_dim = embedding_dim
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        self.embedding_layers = {}
        self.feature_mins = {}
        self.feature_maxs = {}
        self.output_embedding_layer = None

    def transform(self, X, features):
        """Embed the requested columns of ``X`` and concatenate the results.

        Args:
            X: Table supporting ``X[column].values`` (e.g. a pandas DataFrame).
            features: Column names to embed, in output order.

        Returns:
            A tensor made by concatenating each feature's embedding along
            dimension 1 (for ``nn.Embedding`` layers: one row per input row).

        Raises:
            ValueError: If a feature has no stored minimum/maximum.
            KeyError: If a feature has no embedding layer.
        """
        input_embeddings = []
        for column in features:
            if column not in self.feature_mins or column not in self.feature_maxs:
                raise ValueError(f"Missing scaling info for feature: {column}")
            if column not in self.embedding_layers:
                # Same exception type as the plain dict lookup, but with context.
                raise KeyError(f"Missing embedding layer for feature: {column}")

            lower = self.feature_mins[column]
            upper = self.feature_maxs[column]
            # Clamp unseen values into the known range so the shifted index
            # always falls inside the embedding table.
            feature_values = np.clip(X[column].values, lower, upper)
            feature_indices = feature_values - lower
            # Casting to long truncates any fractional part of the index.
            feature_indices_tensor = torch.tensor(
                feature_indices, dtype=torch.long, device=self.device
            )
            input_embeddings.append(self.embedding_layers[column](feature_indices_tensor))
        return torch.cat(input_embeddings, dim=1)

    def load(self, feature_path, output_path, mins_maxs_path):
        """Restore embedding layers, the output embedding and scaling factors.

        Args:
            feature_path: File holding the per-feature embedding layers.
            output_path: File holding the state dict of the output embedding.
            mins_maxs_path: File holding a mapping with ``'mins'`` and ``'maxs'``.

        SECURITY: the feature layers and scaling factors are arbitrary pickled
        Python objects, so they are loaded with ``weights_only=False``. That
        executes pickle code; only load files you created or fully trust.
        """
        # The embedding layers are stored as whole objects (they are called
        # directly in transform), which the restricted weights-only unpickler
        # refuses; be explicit so behaviour does not depend on the torch default.
        self.embedding_layers = torch.load(
            feature_path, map_location=self.device, weights_only=False
        )
        self.output_embedding_layer = nn.Embedding(2, self.embedding_dim).to(self.device)
        self.output_embedding_layer.load_state_dict(
            torch.load(output_path, map_location=self.device)
        )
        scaling_factors = torch.load(
            mins_maxs_path, map_location=self.device, weights_only=False
        )
        self.feature_mins = scaling_factors['mins']
        self.feature_maxs = scaling_factors['maxs']

# Model Class
class TabularDistilBERT(nn.Module):
    """DistilBERT encoder applied to a tabular row reshaped into a short sequence.

    Each input row is viewed as ``seq_length`` positions, every position is
    projected to ``embedding_dim``, a learned positional embedding is added,
    and the result is passed to DistilBERT through ``inputs_embeds``. The
    hidden state at position 0 is classified by a linear head.

    Args:
        input_dim: Number of values per input row; ``input_dim // seq_length``
            values are assigned to each position.
        output_dim: Number of output logits.
        embedding_dim: Width of the projected tokens and of the head's input.
        seq_length: Number of positions each row is split into.
    """

    def __init__(self, input_dim, output_dim, embedding_dim=768, seq_length=20):
        super().__init__()
        self.seq_length = seq_length
        self.embedding_dim = embedding_dim
        # Integer division: any remainder of input_dim is not represented here.
        self.features_per_position = input_dim // seq_length

        # Attribute names and creation order are part of the checkpoint
        # contract (state-dict keys), so they are kept exactly as before.
        self.distilbert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.input_projection = nn.Linear(self.features_per_position, embedding_dim)
        self.positional_embeddings = nn.Embedding(seq_length, embedding_dim)
        self.fc = nn.Linear(embedding_dim, output_dim)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Return classification logits for a batch of flattened rows.

        Args:
            x: Tensor whose first dimension is the batch; the remaining
               elements of each row are viewed as ``(seq_length, -1)``.

        Returns:
            The output of the linear head applied to the position-0 hidden state.
        """
        n_rows = x.size(0)
        # (batch, seq_length, features per position) -> projected tokens
        tokens = self.input_projection(x.view(n_rows, self.seq_length, -1))

        # Same position ids 0..seq_length-1 for every row in the batch.
        positions = (
            torch.arange(self.seq_length, device=x.device)
            .unsqueeze(0)
            .expand(n_rows, -1)
        )
        tokens = tokens + self.positional_embeddings(positions)

        hidden_states = self.distilbert(inputs_embeds=tokens).last_hidden_state
        # Use the first position as the pooled representation of the row.
        pooled = self.dropout(hidden_states[:, 0, :])
        return self.fc(pooled)

def preprocess_data(
    data_path,
    scaler_path="/content/drive/MyDrive/major project/scaler.pkl",
    encoder_path="/content/drive/MyDrive/major project/label_encoders_state.pkl",
):
    """
    Load and preprocess the network traffic dataset.

    Steps (in the order they are executed):
    1. Load the CSV and assign the 49 expected column names.
    2. Drop the unused columns 'service' and 'attack_cat'.
    3. Convert numerical columns to numeric; unparseable values become NaN.
    4. Replace every missing value in the frame with 0.
    5. Scale the numerical columns with the pickled scaler.
    6. Encode the 'state' column with the pickled encoder.

    Args:
        data_path: Path (or anything ``pd.read_csv`` accepts) of the CSV file.
        scaler_path: Pickle file of an object exposing ``transform``; defaults
            to the notebook's Drive location.
        encoder_path: Pickle file of an object exposing ``transform`` for the
            'state' column; defaults to the notebook's Drive location.

    Returns:
        The preprocessed DataFrame, or ``None`` if any step failed (the error
        is printed rather than raised, so callers must check for ``None``).

    SECURITY: ``pd.read_pickle`` executes pickle code. Only point
    ``scaler_path`` / ``encoder_path`` at files you created or fully trust.
    """
    column_names = [
        'srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes',
        'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'Sload', 'Dload',
        'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz',
        'trans_depth', 'res_bdy_len', 'Sjit', 'Djit', 'Stime', 'Ltime', 'Sintpkt',
        'Dintpkt', 'tcprtt', 'synack', 'ackdat', 'is_sm_ips_ports', 'ct_state_ttl',
        'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst',
        'ct_dst_ltm', 'ct_src_ ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
        'ct_dst_src_ltm', 'attack_cat', 'Label'
    ]
    numerical_columns = [
        'sport', 'dsport', 'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss',
        'Sload', 'Dload', 'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz',
        'dmeansz', 'trans_depth', 'res_bdy_len', 'Sjit', 'Djit', 'Stime', 'Ltime',
        'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat', 'ct_state_ttl',
        'ct_flw_http_mthd', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm',
        'ct_src_ ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm'
    ]

    try:
        # Step 1: load the data and apply the expected column names.
        # NOTE(review): read_csv treats the first row as a header, which is then
        # overwritten below. A header-less file would lose its first record --
        # confirm the expected input format.
        data = pd.read_csv(data_path)

        if len(data.columns) != len(column_names):
            raise ValueError("Column mismatch: Check if the input file has correct columns.")

        data.columns = column_names

        # Step 2: drop the columns the model does not use.
        data = data.drop(columns=['service', 'attack_cat'], errors='ignore')

        # Step 3: coerce numerical columns; invalid entries become NaN.
        for col in numerical_columns:
            data[col] = pd.to_numeric(data[col], errors='coerce')

        # Step 4: fill every missing value (all columns, not only numeric) with 0.
        data = data.fillna(0)

        # Step 5: scale numerical columns with the previously fitted scaler.
        try:
            scaler = pd.read_pickle(scaler_path)
        except FileNotFoundError as err:
            raise FileNotFoundError(f"Scaler file not found at {scaler_path}") from err
        data[numerical_columns] = scaler.transform(data[numerical_columns])

        # Step 6: encode 'state' with the previously fitted encoder.
        try:
            state_encoder = pd.read_pickle(encoder_path)
        except FileNotFoundError as err:
            raise FileNotFoundError(f"State encoder file not found at {encoder_path}") from err
        try:
            data['state'] = state_encoder.transform(data['state'])
        except KeyError as err:
            raise KeyError(
                "Missing or invalid values in the 'state' column during encoding."
            ) from err

        print("Preprocessing completed successfully.")
        return data

    except Exception as e:
        # Deliberate best-effort contract: report the problem and signal
        # failure with None instead of propagating the exception.
        print(f"Error during preprocessing: {e}")
        return None

def _confusion_matrix_html(conf_matrix):
    """Draw the confusion matrix as a heatmap and return it as an inline <img> tag."""
    fig, ax = plt.subplots(figsize=(8, 5))
    img_buffer = BytesIO()
    try:
        sns.heatmap(conf_matrix, annot=True, cmap='magma', fmt='d', ax=ax)
        ax.set_xlabel('Predicted labels')
        ax.set_ylabel('True labels')
        ax.set_title('Confusion Matrix')
        # Save from the figure object itself. The previous code called
        # plt.show() first, after which plt.savefig can end up writing an
        # empty figure (showing may close the current figure).
        fig.savefig(img_buffer, format='png')
    finally:
        # Always release the figure; this runs once per request.
        plt.close(fig)
    encoded = base64.b64encode(img_buffer.getvalue()).decode()
    return f'<img src="data:image/png;base64,{encoded}" />'


def analyze_traffic(csv_file):
    """Classify an uploaded traffic CSV and summarise the predictions.

    Args:
        csv_file: The uploaded file as delivered by the Gradio File input:
            either a path string or an object exposing the path as ``.name``.

    Returns:
        A ``(report, html)`` tuple: the text classification report and an
        HTML ``<img>`` tag containing the confusion-matrix heatmap as base64 PNG.

    Raises:
        gr.Error: If no file was supplied, preprocessing failed, or the file
            contains no rows. Gradio shows the message to the user.
    """
    if csv_file is None:
        raise gr.Error("Please upload a CSV file before submitting.")

    # Accept both a plain path and a file wrapper that carries it in `.name`.
    csv_path = getattr(csv_file, "name", csv_file)

    data = preprocess_data(csv_path)
    if data is None:
        # preprocess_data prints the underlying reason and returns None.
        raise gr.Error(
            "The uploaded file could not be preprocessed. "
            "Check that it has the expected columns; see the notebook log for details."
        )
    if len(data) == 0:
        raise gr.Error("The uploaded file contains no rows to classify.")

    X = data[TOP_20_FEATURES]
    y = data["Label"]

    handler = EmbeddingHandler(embedding_dim=EMBEDDING_DIM)
    handler.load(
        '/content/drive/MyDrive/major project/feature_embeddings',
        '/content/drive/MyDrive/major project/output_embedding',
        '/content/drive/MyDrive/major project/feature_scaling_factors')

    # Inference only: skip autograd bookkeeping and detach instead of
    # re-wrapping an existing tensor with torch.tensor (which warns).
    with torch.no_grad():
        X_embedded = handler.transform(X, TOP_20_FEATURES)
    X_test = X_embedded.detach().to(dtype=torch.float32)
    # Convert through NumPy so the result does not depend on the Series index.
    y_test = torch.tensor(y.to_numpy(), dtype=torch.long)

    test_dataset = TensorDataset(X_test, y_test)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    checkpoint = torch.load("/content/drive/MyDrive/major project/distilBERT_model_final", map_location=device)
    model = TabularDistilBERT(input_dim=checkpoint['input_dim'], output_dim=2).to(device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            outputs = model(batch_X.to(device))
            # Index of the largest logit is the predicted class.
            _, predicted = torch.max(outputs, 1)
            all_predictions.extend(predicted.cpu().tolist())
            all_labels.extend(batch_y.tolist())

    report = classification_report(all_labels, all_predictions, digits=4)
    conf_matrix = confusion_matrix(all_labels, all_predictions)

    return report, _confusion_matrix_html(conf_matrix)


# Gradio Interface
# Input widget: the uploaded CSV is passed to analyze_traffic.
_csv_upload = gr.File(label="Upload Network Traffic CSV")
# Output widgets, in the order analyze_traffic returns its two values.
_report_box = gr.Textbox(label="Classification Report")
_matrix_view = gr.HTML(label="Confusion Matrix")

iface = gr.Interface(
    fn=analyze_traffic,
    inputs=_csv_upload,
    outputs=[_report_box, _matrix_view],
    title="Network Traffic Classifier",
    description="Upload a CSV file containing network traffic data to classify normal vs. attack traffic.",
)

# Start the web app only when this code runs as the main module
# (which is the case when the cell is executed in the notebook).
if __name__ == "__main__":
    iface.launch()

#


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7f0a8a539eab43b2e0.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [None]:
#### end ########