<a href="https://colab.research.google.com/github/Ananthitha-Johnson/Network-Intrusion-Detection-System-Machine-Learning/blob/main/Network_Intrusion_Detection_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd

# Field names for the NSL-KDD files (taken from the dataset documentation),
# listed in the order the fields appear in each row. The names are written as
# one whitespace-separated block and split into a list.
columns = """
    duration protocol_type service flag src_bytes dst_bytes land
    wrong_fragment urgent hot num_failed_logins logged_in num_compromised
    root_shell su_attempted num_root num_file_creations num_shells
    num_access_files num_outbound_cmds is_host_login is_guest_login
    count srv_count serror_rate srv_serror_rate rerror_rate srv_rerror_rate
    same_srv_rate diff_srv_rate srv_diff_host_rate dst_host_count
    dst_host_srv_count dst_host_same_srv_rate dst_host_diff_srv_rate
    dst_host_same_src_port_rate dst_host_srv_diff_host_rate dst_host_serror_rate
    dst_host_srv_serror_rate dst_host_rerror_rate dst_host_srv_rerror_rate
    label difficulty
""".split()

# Remote locations of the NSL-KDD training and test splits.
train_url = "https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain+.txt"
test_url = "https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTest+.txt"

# Because `names` is supplied, pandas treats every line of each file as data
# and labels the fields with `columns`. The training split is read first.
train_df, test_df = (
    pd.read_csv(url, names=columns) for url in (train_url, test_url)
)

# Preview the first rows of the training split.
train_df.head()


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label,difficulty
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [3]:
# Binary classification: collapse the per-attack `label` into a 0/1 `target`,
# where 0 means the row is labelled "normal" and 1 means any other label.
for frame in (train_df, test_df):
    frame["target"] = frame["label"].ne("normal").astype(int)

# Most frequent (label, target) combinations in the training split.
label_target_counts = train_df[["label", "target"]].value_counts()
label_target_counts.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,count
label,target,Unnamed: 2_level_1
normal,0,67343
neptune,1,41214
satan,1,3633
ipsweep,1,3599
portsweep,1,2931


In [4]:
# The model inputs are every column except the raw label, the `difficulty`
# column, and the derived binary target; the target itself becomes `y`.
non_feature_cols = ["label", "difficulty", "target"]

X_train, y_train = train_df.drop(columns=non_feature_cols), train_df["target"]
X_test, y_test = test_df.drop(columns=non_feature_cols), test_df["target"]


In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# String-valued columns that get one-hot encoded; every remaining feature
# column is treated as numeric and standardised.
cat_cols = ["protocol_type", "service", "flag"]
num_cols = [name for name in X_train.columns if name not in cat_cols]

scaler = StandardScaler()
# handle_unknown="ignore": a category that was not seen during fit is encoded
# as all zeros at transform time instead of raising an error.
encoder = OneHotEncoder(handle_unknown="ignore")

preprocess = ColumnTransformer(
    transformers=[
        ("num", scaler, num_cols),
        ("cat", encoder, cat_cols),
    ]
)


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Baseline: the shared preprocessing step followed by logistic regression.
lr_steps = [
    ("prep", preprocess),
    ("clf", LogisticRegression(max_iter=2000)),
]
model = Pipeline(steps=lr_steps)

# Fit on the training split, then label every row of the test split.
# `fit` returns the pipeline itself, so the two calls can be chained.
pred = model.fit(X_train, y_train).predict(X_test)

# Report accuracy (4 decimal places), the confusion matrix, and per-class metrics.
lr_accuracy = accuracy_score(y_test, pred)
print("Accuracy:", round(lr_accuracy, 4))
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))


Accuracy: 0.7537
[[8991  720]
 [4833 8000]]
              precision    recall  f1-score   support

           0       0.65      0.93      0.76      9711
           1       0.92      0.62      0.74     12833

    accuracy                           0.75     22544
   macro avg       0.78      0.77      0.75     22544
weighted avg       0.80      0.75      0.75     22544



In [7]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier

# Random-forest pipeline using the same preprocessing recipe as the
# logistic-regression pipeline.
#
# Pipeline does not copy its steps, so passing `preprocess` directly would put
# the very same ColumnTransformer object inside both `model` and `rf`; fitting
# `rf` would then refit the transformer that `model` relies on. `clone` builds
# an unfitted copy with identical parameters, so each pipeline owns its own
# preprocessing state.
rf = Pipeline([
    ("prep", clone(preprocess)),
    ("clf", RandomForestClassifier(n_estimators=200, random_state=42))
])

# Fit on the training split and predict the held-out test split.
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)

# Report accuracy (4 decimal places), the confusion matrix, and per-class metrics.
print("Accuracy:", round(accuracy_score(y_test, pred_rf), 4))
print(confusion_matrix(y_test, pred_rf))
print(classification_report(y_test, pred_rf))


Accuracy: 0.7677
[[9448  263]
 [4975 7858]]
              precision    recall  f1-score   support

           0       0.66      0.97      0.78      9711
           1       0.97      0.61      0.75     12833

    accuracy                           0.77     22544
   macro avg       0.81      0.79      0.77     22544
weighted avg       0.83      0.77      0.76     22544



In [8]:
# Draw a single test row (fixed seed, so the same row is picked on every run)
# and classify it with the fitted random-forest pipeline.
sample = X_test.sample(n=1, random_state=42)
(result,) = rf.predict(sample)  # exactly one prediction for the one sampled row

# A prediction of 1 is reported as an attack; anything else as normal traffic.
if result == 1:
    verdict = "ATTACK"
else:
    verdict = "NORMAL"
print("Prediction:", verdict)

Prediction: NORMAL
