In [1]:
import pandas as pd
import numpy as np
import kagglehub
import os
import matplotlib.pyplot as plt
from tabulate import tabulate
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mohdzia356/network-traffic-data-for-intrusion-detection")

print("Path to dataset files:", path)
print(os.listdir(path))
file_path = os.path.join(path,'network_traffic_data.csv')

Using Colab cache for faster access to the 'network-traffic-data-for-intrusion-detection' dataset.
Path to dataset files: /kaggle/input/network-traffic-data-for-intrusion-detection
['network_traffic_data.csv']


In [3]:
df = pd.read_csv(file_path)
print(tabulate(df.head(5), headers='keys', tablefmt='fancy_grid'))

╒════╤════════════╤════════════╤═══════════════╤═════════════════╤══════════════╤═══════════════════╤═══════════════╤═════════════╤═════════╕
│    │   Duration │ Protocol   │ SourceIP      │ DestinationIP   │   SourcePort │   DestinationPort │   PacketCount │   ByteCount │ Label   │
╞════╪════════════╪════════════╪═══════════════╪═════════════════╪══════════════╪═══════════════════╪═══════════════╪═════════════╪═════════╡
│  0 │    24.0777 │ TCP        │ 192.168.1.239 │ 192.168.1.234   │         8055 │                 1 │           827 │      198244 │ Attack  │
├────┼────────────┼────────────┼───────────────┼─────────────────┼──────────────┼───────────────────┼───────────────┼─────────────┼─────────┤
│  1 │    97.2524 │ ICMP       │ 192.168.1.176 │ 192.168.1.82    │        63174 │               687 │           673 │     1202973 │ Normal  │
├────┼────────────┼────────────┼───────────────┼─────────────────┼──────────────┼───────────────────┼───────────────┼─────────────┼─────────┤
│  2 │

In [4]:
protocol_by_label = df.groupby('Label')['Protocol'].value_counts().reset_index()

# Menampilkan hasilnya
print("Frekuensi Protokol per Label (Sebelum Encoding):")
print(tabulate(protocol_by_label, headers='keys', tablefmt='fancy_grid'))

Frekuensi Protokol per Label (Sebelum Encoding):
╒════╤═════════╤════════════╤═════════╕
│    │ Label   │ Protocol   │   count │
╞════╪═════════╪════════════╪═════════╡
│  0 │ Attack  │ TCP        │     374 │
├────┼─────────┼────────────┼─────────┤
│  1 │ Attack  │ UDP        │     329 │
├────┼─────────┼────────────┼─────────┤
│  2 │ Attack  │ ICMP       │     319 │
├────┼─────────┼────────────┼─────────┤
│  3 │ Normal  │ ICMP       │     339 │
├────┼─────────┼────────────┼─────────┤
│  4 │ Normal  │ UDP        │     323 │
├────┼─────────┼────────────┼─────────┤
│  5 │ Normal  │ TCP        │     316 │
╘════╧═════════╧════════════╧═════════╛


In [5]:
total_label = df['Label'].value_counts().reset_index()
total_label.columns = ['Label', 'Count']
from tabulate import tabulate
print(tabulate(total_label, headers='keys', tablefmt='fancy_grid'))


╒════╤═════════╤═════════╕
│    │ Label   │   Count │
╞════╪═════════╪═════════╡
│  0 │ Attack  │    1022 │
├────┼─────────┼─────────┤
│  1 │ Normal  │     978 │
╘════╧═════════╧═════════╛


In [6]:
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

print(tabulate(df.head(5), headers='keys', tablefmt='fancy_grid'))

╒════╤════════════╤════════════╤════════════╤═════════════════╤══════════════╤═══════════════════╤═══════════════╤══════════════════╤═════════╕
│    │   Duration │   Protocol │   SourceIP │   DestinationIP │   SourcePort │   DestinationPort │   PacketCount │        ByteCount │   Label │
╞════╪════════════╪════════════╪════════════╪═════════════════╪══════════════╪═══════════════════╪═══════════════╪══════════════════╪═════════╡
│  0 │    24.0777 │          1 │        155 │             150 │         8055 │                 1 │           827 │ 198244           │       0 │
├────┼────────────┼────────────┼────────────┼─────────────────┼──────────────┼───────────────────┼───────────────┼──────────────────┼─────────┤
│  1 │    97.2524 │          0 │         85 │             236 │        63174 │               687 │           673 │      1.20297e+06 │       1 │
├────┼────────────┼────────────┼────────────┼─────────────────┼──────────────┼───────────────────┼───────────────┼──────────────────┼───

In [7]:
datatype = df.dtypes.reset_index()
datatype.columns = ['column_name', 'column_type']
print(tabulate(datatype, headers='keys', tablefmt='fancy_grid'))

╒════╤═════════════════╤═══════════════╕
│    │ column_name     │ column_type   │
╞════╪═════════════════╪═══════════════╡
│  0 │ Duration        │ float64       │
├────┼─────────────────┼───────────────┤
│  1 │ Protocol        │ int64         │
├────┼─────────────────┼───────────────┤
│  2 │ SourceIP        │ int64         │
├────┼─────────────────┼───────────────┤
│  3 │ DestinationIP   │ int64         │
├────┼─────────────────┼───────────────┤
│  4 │ SourcePort      │ int64         │
├────┼─────────────────┼───────────────┤
│  5 │ DestinationPort │ int64         │
├────┼─────────────────┼───────────────┤
│  6 │ PacketCount     │ int64         │
├────┼─────────────────┼───────────────┤
│  7 │ ByteCount       │ int64         │
├────┼─────────────────┼───────────────┤
│  8 │ Label           │ int64         │
╘════╧═════════════════╧═══════════════╛


In [8]:
missing = df.isnull().sum().reset_index()
missing.columns = ['column_name', 'missing_count']
print(tabulate(missing, headers='keys', tablefmt='fancy_grid'))

╒════╤═════════════════╤═════════════════╕
│    │ column_name     │   missing_count │
╞════╪═════════════════╪═════════════════╡
│  0 │ Duration        │               0 │
├────┼─────────────────┼─────────────────┤
│  1 │ Protocol        │               0 │
├────┼─────────────────┼─────────────────┤
│  2 │ SourceIP        │               0 │
├────┼─────────────────┼─────────────────┤
│  3 │ DestinationIP   │               0 │
├────┼─────────────────┼─────────────────┤
│  4 │ SourcePort      │               0 │
├────┼─────────────────┼─────────────────┤
│  5 │ DestinationPort │               0 │
├────┼─────────────────┼─────────────────┤
│  6 │ PacketCount     │               0 │
├────┼─────────────────┼─────────────────┤
│  7 │ ByteCount       │               0 │
├────┼─────────────────┼─────────────────┤
│  8 │ Label           │               0 │
╘════╧═════════════════╧═════════════════╛


In [9]:
total_record = len(df)
print('Jumlah record: ', len(df))
total_unique_ip = df['SourceIP'].nunique()
print('Jumlah Unique Ip: ', total_unique_ip)
total_unique_destination = df['DestinationPort'].nunique()
print('Jumlah Unique Destination: ', total_unique_destination )
total_duplicate = df.duplicated().sum()
print("Total duplicate rows:", total_duplicate)

Jumlah record:  2000
Jumlah Unique Ip:  255
Jumlah Unique Destination:  870
Total duplicate rows: 0


In [10]:
# Label encode Protocol
le = LabelEncoder()
df['Protocol'] = le.fit_transform(df['Protocol'])

# Frequency encode IP
freq_source = df['SourceIP'].value_counts()
freq_dest = df['DestinationIP'].value_counts()
df['SourceIP'] = df['SourceIP'].map(freq_source)
df['DestinationIP'] = df['DestinationIP'].map(freq_dest)

In [11]:
X = df.drop('Label', axis=1)
y = df['Label']

# Spliting data
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)
# Reshaping the data to ensure it's 2D
x_train_reshaped = x_train.values.reshape(-1, 1)
x_test_reshaped = x_test.values.reshape(-1, 1)

# Standardizing the features using StandardScaler
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_reshaped)
x_test_scaled = scaler.transform(x_test_reshaped)

In [12]:
print('Banyak data x_train :',len(x_train))
print('Banyak data x_test  :',len(x_test))
print('Banyak data y_train :',len(y_train))
print('Banyak data y_test  :',len(y_test))

Banyak data x_train : 1600
Banyak data x_test  : 400
Banyak data y_train : 1600
Banyak data y_test  : 400


In [13]:
# Training Data
# --- Support Vector Machine ---
svm = SVC(random_state=42)
svm.fit(x_train, y_train)
y_pred_svm = svm.predict(x_test)

# Hitung metrik
hasilSupportVectorMachine = accuracy_score(y_test, y_pred_svm)
precision1 = precision_score(y_test, y_pred_svm)
recall1 = recall_score(y_test, y_pred_svm)
f1_1 = f1_score(y_test, y_pred_svm)

# --- Multinomial Naive Bayes ---
mnb = MultinomialNB()
mnb.fit(x_train, y_train)
y_pred_mnb = mnb.predict(x_test)

# Hitung metrik
hasilMultinomialNB = accuracy_score(y_test, y_pred_mnb)
precision2 = precision_score(y_test, y_pred_mnb)
recall2 = recall_score(y_test, y_pred_mnb)
f1_2 = f1_score(y_test, y_pred_mnb)

# --- Random Forest Classifier ---
rfc = RandomForestClassifier(random_state=42)
rfc.fit(x_train, y_train)
y_pred_rfc = rfc.predict(x_test)

# Hitung metrik
hasilRandomForestClassifier = accuracy_score(y_test, y_pred_rfc)
precision3 = precision_score(y_test, y_pred_rfc)
recall3 = recall_score(y_test, y_pred_rfc)
f1_3 = f1_score(y_test, y_pred_rfc)

In [14]:
model = {'Model':['Support Vector Machine',
                  'Multinomial Naive Bayes',
                  'Random Forest Classifier',
                 ],
         'AccuracyScore':[hasilSupportVectorMachine, hasilMultinomialNB, hasilRandomForestClassifier],
         'Precision':[precision1, precision2, precision3],
         'Recall':[recall1, recall2, recall3],
         'F1-Score':[f1_1, f1_2, f1_3]
         }
model_df = pd.DataFrame(model)
print(tabulate(model_df, headers='keys', tablefmt='fancy_grid'))

╒════╤══════════════════════════╤═════════════════╤═════════════╤══════════╤════════════╕
│    │ Model                    │   AccuracyScore │   Precision │   Recall │   F1-Score │
╞════╪══════════════════════════╪═════════════════╪═════════════╪══════════╪════════════╡
│  0 │ Support Vector Machine   │          0.5075 │    0.471698 │ 0.26178  │   0.3367   │
├────┼──────────────────────────┼─────────────────┼─────────────┼──────────┼────────────┤
│  1 │ Multinomial Naive Bayes  │          0.5075 │    0.485981 │ 0.544503 │   0.51358  │
├────┼──────────────────────────┼─────────────────┼─────────────┼──────────┼────────────┤
│  2 │ Random Forest Classifier │          0.5325 │    0.510638 │ 0.502618 │   0.506596 │
╘════╧══════════════════════════╧═════════════════╧═════════════╧══════════╧════════════╛


In [23]:
def predict_model(Duration, Protocol, SourceIP, DestinationIP, SourcePort, DestinationPort, PacketCount, ByteCount):
    # Buat DataFrame input
    data_input = pd.DataFrame({
        'Duration': [Duration],
        'Protocol': [Protocol],
        'SourceIP': [SourceIP],
        'DestinationIP': [DestinationIP],
        'SourcePort': [SourcePort],
        'DestinationPort': [DestinationPort],
        'PacketCount': [PacketCount],
        'ByteCount': [ByteCount]
    })

    # Preprocessing input
    data_input['Protocol'] = le.transform(data_input['Protocol'])
    data_input['SourceIP'] = data_input['SourceIP'].map(freq_source).fillna(0)
    data_input['DestinationIP'] = data_input['DestinationIP'].map(freq_dest).fillna(0)

    # Prediksi
    pred_svm = svm.predict(data_input)[0]
    pred_mnb = mnb.predict(data_input)[0]
    pred_rfc = rfc.predict(data_input)[0]

    print(f"SVM Prediction: {pred_svm}")
    print(f"MultinomialNB Prediction: {pred_mnb}")
    print(f"RandomForest Prediction: {pred_rfc}")

In [24]:
interact(predict_model,
         Duration=widgets.FloatText(value=24.0777, description='Duration:'),
         Protocol=widgets.Dropdown(
             options=[(cls, protocol_le.transform([cls])[0]) for cls in protocol_le.classes_],
             value=protocol_le.transform(['TCP'])[0],
             description='Protocol:'
         ),
         SourceIP=widgets.Text(value='192.168.1.1', description='SourceIP:'),
         DestinationIP=widgets.Text(value='10.0.0.1', description='DestinationIP:'),
         SourcePort=widgets.IntText(value=8055, description='SourcePort:'),
         DestinationPort=widgets.IntText(value=1, description='DestinationPort:'),
         PacketCount=widgets.IntText(value=827, description='PacketCount:'),
         ByteCount=widgets.IntText(value=198244, description='ByteCount:')
)

interactive(children=(FloatText(value=24.0777, description='Duration:'), Dropdown(description='Protocol:', ind…