In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


In [2]:
columns = [
'duration','protocol_type','service','flag','src_bytes','dst_bytes','land',
'wrong_fragment','urgent','hot','num_failed_logins','logged_in',
'num_compromised','root_shell','su_attempted','num_root',
'num_file_creations','num_shells','num_access_files','num_outbound_cmds',
'is_host_login','is_guest_login','count','srv_count','serror_rate',
'srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate',
'diff_srv_rate','srv_diff_host_rate','dst_host_count',
'dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate',
'dst_host_same_src_port_rate','dst_host_srv_diff_host_rate',
'dst_host_serror_rate','dst_host_srv_serror_rate',
'dst_host_rerror_rate','dst_host_srv_rerror_rate','label','difficulty'
]

df = pd.read_csv("KDDTrain+.txt", names=columns)


In [3]:
KEEP_COLS = [
    'duration',
    'protocol_type',
    'src_bytes',
    'dst_bytes',
    'count',
    'srv_count',
    'label'
]

df = df[KEEP_COLS]


In [27]:
attack_map = {
    "normal": "benign",

    # DoS
    "neptune": "dos",
    "smurf": "dos",
    "back": "dos",
    "teardrop": "dos",

    # Probe
    "satan": "probe",
    "ipsweep": "probe",
    "portsweep": "probe",

    # R2L
    "guess_passwd": "r2l",
    "ftp_write": "r2l",

    # U2R
    "buffer_overflow": "u2r",
    "rootkit": "u2r"
}


df['label'] = df['label'].map(attack_map)
df = df.dropna()


In [7]:
proto_enc = LabelEncoder()
df['protocol_type'] = proto_enc.fit_transform(df['protocol_type'])

label_enc = LabelEncoder()
df['label_enc'] = label_enc.fit_transform(df['label'])


In [6]:
df.head()

Unnamed: 0,duration,protocol_type,src_bytes,dst_bytes,count,srv_count,label
0,0,tcp,491,0,2,2,benign
1,0,udp,146,0,13,1,benign
2,0,tcp,0,0,123,6,dos
3,0,tcp,232,8153,5,5,benign
4,0,tcp,199,420,30,32,benign


In [8]:
X = df.drop(['label','label_enc'], axis=1)
y = df['label_enc']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [9]:
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=12,
    min_samples_split=5,
    class_weight='balanced',
    random_state=42
)

rf.fit(X_train, y_train)


In [10]:
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=label_enc.classes_))


              precision    recall  f1-score   support

      benign       1.00      0.97      0.98     13469
         dos       0.99      0.98      0.98      9186
       probe       0.79      0.97      0.87      2331
         r2l       1.00      0.97      0.98       199
         u2r       0.54      0.70      0.61        10

    accuracy                           0.97     25195
   macro avg       0.86      0.92      0.89     25195
weighted avg       0.98      0.97      0.97     25195



In [11]:
pickle.dump(rf, open("random_forest_model2.pkl", "wb"))
pickle.dump(label_enc, open("label_encoder2.pkl", "wb"))
pickle.dump(proto_enc, open("protocol_encoder2.pkl", "wb"))


In [13]:
rf = pickle.load(open("random_forest_model2.pkl", "rb"))
proto_enc = pickle.load(open("protocol_encoder2.pkl", "rb"))
label_enc = pickle.load(open("label_encoder2.pkl", "rb"))


In [14]:
ws = pd.read_csv("NEW.csv", encoding="latin-1")


In [17]:
ws.head()

Unnamed: 0,No.,Time,Source,Destination,protocol_type,bytes,Info
0,1,0.0,192.168.2.13,1.1.1.1,DNS,85,Standard query 0x2c5f HTTPS chrome.cloudflare-...
1,2,0.000359,192.168.2.13,1.1.1.1,DNS,85,Standard query 0x26d0 A chrome.cloudflare-dns.com
2,3,0.029324,1.1.1.1,192.168.2.13,DNS,117,Standard query response 0x26d0 A chrome.cloudf...
3,4,0.029324,1.1.1.1,192.168.2.13,DNS,158,Standard query response 0x2c5f HTTPS chrome.cl...
4,5,0.030402,192.168.2.13,172.64.41.3,TCP,66,59103 > 443 [SYN] Seq=0 Win=64240 Len=0 MSS=...


In [18]:
ws = ws.rename(columns={
    "Protocol": "protocol_type",
    "Length": "bytes"
})


In [19]:
# duration (use Time directly or normalized)
ws["duration"] = ws["Time"]

# src_bytes & dst_bytes (Wireshark does not separate them)
ws["src_bytes"] = ws["bytes"]
ws["dst_bytes"] = 0   # keep 0 to match training shape

# count = packets per source
ws["count"] = ws.groupby("Source")["Source"].transform("count")

# srv_count = packets per source-destination pair
ws["srv_count"] = ws.groupby(["Source", "Destination"])["Source"].transform("count")


In [20]:
# Handle unseen protocols safely
ws["protocol_type"] = ws["protocol_type"].astype(str)

known_protocols = set(proto_enc.classes_)
ws["protocol_type"] = ws["protocol_type"].apply(
    lambda x: x if x in known_protocols else proto_enc.classes_[0]
)

ws["protocol_type"] = proto_enc.transform(ws["protocol_type"])


In [23]:
X_ws = ws[
    ["duration", "protocol_type", "src_bytes", "dst_bytes", "count", "srv_count"]
]


In [24]:
y_pred_enc = rf.predict(X_ws)
y_pred = label_enc.inverse_transform(y_pred_enc)

ws["prediction"] = y_pred


In [26]:
ws[["Source", "Destination", "protocol_type", "prediction"]].head(100)


Unnamed: 0,Source,Destination,protocol_type,prediction
0,192.168.2.13,1.1.1.1,0,benign
1,192.168.2.13,1.1.1.1,0,benign
2,1.1.1.1,192.168.2.13,0,benign
3,1.1.1.1,192.168.2.13,0,benign
4,192.168.2.13,172.64.41.3,0,benign
...,...,...,...,...
95,172.64.41.3,192.168.2.13,0,benign
96,172.64.41.3,192.168.2.13,0,benign
97,192.168.2.13,172.64.41.3,0,benign
98,192.168.2.6,224.0.0.251,0,benign
