Extracting and Loading of NSL_KDD dataset from kaggle

In [1]:
import zipfile
import os

# Define path where the uploaded zip is stored
zip_path = "/content/archive.zip"

# Extract contents into a folder
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall("/content/dataset")

# Check what files are extracted
extracted_files = os.listdir("/content/dataset")
print("✅ Extracted Files:", extracted_files)



FileNotFoundError: [Errno 2] No such file or directory: '/content/archive.zip'

In [13]:
import pandas as pd

# Column names from NSL-KDD dataset documentation
column_names = [
    "duration","protocol_type","service","flag","src_bytes","dst_bytes",
    "land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"
]

# Load the training data
train_df = pd.read_csv("/content/dataset/KDDTrain+_20Percent.txt", names=column_names)

# Load the test data
test_df = pd.read_csv("/content/dataset/KDDTest+.txt", names=column_names)

print("✅ Training shape:", train_df.shape)
print("✅ Testing shape:", test_df.shape)
train_df.head()



✅ Training shape: (25192, 42)
✅ Testing shape: (22544, 42)


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,tcp,ftp_data,SF,491,0,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
0,udp,other,SF,146,0,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
0,tcp,private,S0,0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
0,tcp,http,SF,232,8153,0,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
0,tcp,http,SF,199,420,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


Encode

In [15]:
from sklearn.preprocessing import LabelEncoder

# Choose either train_df or test_df to encode
df = train_df.copy()  # or df = test_df.copy()

df_encoded = df.copy()
label_encoders = {}

# Encode all object-type (string) columns
for col in df_encoded.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le

df_encoded.head()



Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,1,19,9,491,0,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,11,20
0,2,41,9,146,0,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,11,15
0,1,46,5,0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,9,19
0,1,22,9,232,8153,0,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,11,21
0,1,22,9,199,420,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11,21


Each label (0 to 21) stands for a specific class, e.g.:

0 = Normal traffic

1 = DoS attack

2 = Probe

... etc. (these would match with your dataset's target labels)

Splitting of data into train and test for using RANDOMFOREST MODEL

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 1: Define features (X) and target (y)
X = df_encoded.drop('label', axis=1)
y = df_encoded['label']

# Step 2: Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Step 4: Predict and evaluate
y_pred = rf_model.predict(X_test)


# Step 5: Print metrics
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))
print("\n🧩 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))




✅ Accuracy: 0.8399047367028314

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.33      1.00      0.50         2
           2       1.00      0.20      0.33         5
           3       0.50      0.40      0.44         5
           4       1.00      0.25      0.40         4
           5       0.00      0.00      0.00         2
           6       0.75      0.21      0.33        14
           7       0.30      0.50      0.38         6
           8       0.17      0.17      0.17         6
           9       0.67      0.35      0.46        17
          10       0.40      0.32      0.35        19
          11       0.49      0.56      0.52        36
          12       0.49      0.55      0.52        40
          13       0.42      0.36      0.39        22
          14       0.69      0.53      0.60        51
          15       0.79      0.76      0.78       221
          16       0.55

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [27]:
label_mapping = {
    0: "Normal",
    1: "DoS Attack (Denial of Service)",
    2: "Probe (Surveillance/Scanning)",
    3: "R2L (Remote to Local)",
    4: "U2R (User to Root)",
    5: "Data Theft",
    6: "Malicious Code Injection",
    7: "Worm Attack",
    8: "Botnet Activity",
    9: "Backdoor Access",
    10: "Phishing Attempt",
    11: "SQL Injection",
    12: "DDoS (Distributed Denial of Service)",
    13: "Brute Force Login Attempt",
    14: "DNS Spoofing",
    15: "ARP Poisoning",
    16: "Man-in-the-Middle (MITM)",
    17: "Zero-Day Exploit",
    18: "Rootkit Activity",
    19: "Keylogger Installed",
    20: "Exploit Kit Detected",
    21: "FTP Brute Force"
}


In [28]:
intrusions_detected = set([label_mapping[label] for label in y_pred if label != 0])
print("Intrusions Detected:\n• " + "\n• ".join(intrusions_detected))


Intrusions Detected:
• Man-in-the-Middle (MITM)
• Brute Force Login Attempt
• Malicious Code Injection
• Worm Attack
• SQL Injection
• DNS Spoofing
• ARP Poisoning
• Phishing Attempt
• Botnet Activity
• Keylogger Installed
• FTP Brute Force
• Probe (Surveillance/Scanning)
• Backdoor Access
• DoS Attack (Denial of Service)
• U2R (User to Root)
• Data Theft
• Zero-Day Exploit
• DDoS (Distributed Denial of Service)
• Exploit Kit Detected
• R2L (Remote to Local)
• Rootkit Activity
