# Network Intrusion Detection System using Random Forest (NSL-KDD)

This project implements a basic but effective **Network Intrusion Detection System (NIDS)** using the **NSL-KDD dataset**, a well-known benchmark in cybersecurity research. The classifier is trained using a **Random Forest** algorithm after proper preprocessing, feature selection, and evaluation.

---


1. Import Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import time

import warnings
warnings.filterwarnings('ignore')

2. Read Dataset(NSL-KDD)

In [4]:
df_0 = pd.read_csv("/kaggle/input/nslkdd/KDDTrain+.txt")
df=df_0.copy()
df.head()

Unnamed: 0,0,tcp,ftp_data,SF,491,0.1,0.2,0.3,0.4,0.5,...,0.17,0.03,0.17.1,0.00.6,0.00.7,0.00.8,0.05,0.00.9,normal,20
0,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
1,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
2,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
3,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21
4,0,tcp,private,REJ,0,0,0,0,0,0,...,0.07,0.07,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21


3. Preprocessing

3.1 Encoding

In [5]:
#features selction
features=['duration','protocol_type','service','flag','src_bytes','dst_bytes','land','wrong_fragment','urgent','hot','num_failed_logins','logged_in','num_compromised','root_shell','su_attempted','num_root','num_file_creations','num_shells','num_access_files','num_outbound_cmds','is_host_login','is_guest_login','count','srv_count','serror_rate','srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate','srv_diff_host_rate','dst_host_count','dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate','dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate','dst_host_srv_rerror_rate']
df.columns= features+['attack_type','difficulty_level']

In [6]:
df.head(5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type,difficulty_level
0,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
1,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
2,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
3,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21
4,0,tcp,private,REJ,0,0,0,0,0,0,...,0.07,0.07,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125972 entries, 0 to 125971
Data columns (total 43 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     125972 non-null  int64  
 1   protocol_type                125972 non-null  object 
 2   service                      125972 non-null  object 
 3   flag                         125972 non-null  object 
 4   src_bytes                    125972 non-null  int64  
 5   dst_bytes                    125972 non-null  int64  
 6   land                         125972 non-null  int64  
 7   wrong_fragment               125972 non-null  int64  
 8   urgent                       125972 non-null  int64  
 9   hot                          125972 non-null  int64  
 10  num_failed_logins            125972 non-null  int64  
 11  logged_in                    125972 non-null  int64  
 12  num_compromised              125972 non-null  int64  
 13 

In [8]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,125972.0,287.147,2604.526,0.0,0.0,0.0,0.0,42908.0
src_bytes,125972.0,45567.101,5870354.481,0.0,0.0,44.0,276.0,1379963888.0
dst_bytes,125972.0,19779.271,4021285.112,0.0,0.0,0.0,516.0,1309937401.0
land,125972.0,0.0,0.014,0.0,0.0,0.0,0.0,1.0
wrong_fragment,125972.0,0.023,0.254,0.0,0.0,0.0,0.0,3.0
urgent,125972.0,0.0,0.014,0.0,0.0,0.0,0.0,3.0
hot,125972.0,0.204,2.15,0.0,0.0,0.0,0.0,77.0
num_failed_logins,125972.0,0.001,0.045,0.0,0.0,0.0,0.0,5.0
logged_in,125972.0,0.396,0.489,0.0,0.0,0.0,1.0,1.0
num_compromised,125972.0,0.279,23.942,0.0,0.0,0.0,0.0,7479.0


4. Data preprocessing

In [10]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

categorial_f=['protocol_type', 'service', 'flag'] #categorical features in numerical 
df=pd.get_dummies(df, columns=categorial_f)

le=LabelEncoder() #attack label
df['label'] = le.fit_transform(df['attack_type'])

#normalize numerical features
scaler = MinMaxScaler()
num_cols = [col for col in features if col not in categorial_f]
df[num_cols] = scaler.fit_transform(df[num_cols])

#split features and label
X=df.drop(['attack_type','difficulty_level','label'],axis=1,errors='ignore')
y=df['label']

#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3,random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

X_train shape: (88180, 122)
X_test shape: (37792, 122)


6. feature selection

In [12]:
#compute coorelation of each feature with label
coorelations = X_train.corrwith(y_train).abs()
selected_features = coorelations.nlargest(20).index # first 20 features

X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]


7. Model training
   

In [14]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators = 200, max_depth = 10, min_samples_leaf = 2, random_state = 42)
model.fit(X_train_selected, y_train)

#predictions on training and test
train_preds = model.predict(X_train_selected)
test_preds = model.predict(X_test_selected)


8. Evaluation

In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate(y_true, y_pred, set_name):
    print(f"\n{set_name} Results:")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred, average='weighted'):.4f}")
    print(f"Recall: {recall_score(y_true, y_pred, average='weighted'):.4f}")
    print(f"F1-Score: {f1_score(y_true, y_pred, average='weighted'):.4f}")

evaluate(y_train, train_preds, "Training")
evaluate(y_test, test_preds, "Testing")



Training Results:
Accuracy: 0.9795
Precision: 0.9790
Recall: 0.9795
F1-Score: 0.9743

Testing Results:
Accuracy: 0.9782
Precision: 0.9772
Recall: 0.9782
F1-Score: 0.9723
