# Network Intrusion Detection System using Random Forest (NSL-KDD)

This project implements a basic but effective **Network Intrusion Detection System (NIDS)** using the **NSL-KDD dataset**, a well-known benchmark in cybersecurity research. The classifier is trained using a **Random Forest** algorithm after proper preprocessing, feature selection, and evaluation.

---


1. Import Libraries

In [49]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [50]:
!pip install pyswarms



In [51]:
!pip install geneticalgorithm



In [52]:
from geneticalgorithm import geneticalgorithm as ga
import pyswarms as ps

2. Read Dataset (NSL-KDD)

In [53]:
# Load the raw train and test splits; header=None because the files are read
# without a header row, so the columns get integer names for now.
df_train, df_test = (
    pd.read_csv(path, header=None)
    for path in ('/content/KDDTrain+.txt', '/content/KDDTest+.txt')
)

In [54]:
# Preview the first rows of the raw training frame (columns are still integer-named here).
df_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [55]:
# Names for the 43 fields of each record: 41 features followed by
# `label` and `difficulty_level`.
columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate', 'label', 'difficulty_level',
]

# Re-read each split from disk and attach the column names right away.
df_train = pd.read_csv('/content/KDDTrain+.txt', header=None)
df_train.columns = columns

df_test = pd.read_csv('/content/KDDTest+.txt', header=None)
df_test.columns = columns

In [56]:
# Drop `difficulty_level`; it is not used as a model input.
# A non-inplace drop with errors='ignore' keeps this cell idempotent: re-running it
# after the column is already gone no longer raises KeyError.
df_train = df_train.drop(columns=['difficulty_level'], errors='ignore')
df_test = df_test.drop(columns=['difficulty_level'], errors='ignore')

In [57]:
# Stack the training rows on top of the test rows (train first, so the two parts
# can later be separated again by position).
# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating the row labels of both source frames.
df = pd.concat([df_train, df_test], axis=0, ignore_index=True)

In [58]:
# Summarize dtypes and non-null counts of the combined train+test frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 148517 entries, 0 to 22543
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     148517 non-null  int64  
 1   protocol_type                148517 non-null  object 
 2   service                      148517 non-null  object 
 3   flag                         148517 non-null  object 
 4   src_bytes                    148517 non-null  int64  
 5   dst_bytes                    148517 non-null  int64  
 6   land                         148517 non-null  int64  
 7   wrong_fragment               148517 non-null  int64  
 8   urgent                       148517 non-null  int64  
 9   hot                          148517 non-null  int64  
 10  num_failed_logins            148517 non-null  int64  
 11  logged_in                    148517 non-null  int64  
 12  num_compromised              148517 non-null  int64  
 13  root_

In [59]:
# Descriptive statistics of the numeric columns, transposed so each feature is one row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,148517.0,276.779305,2460.683,0.0,0.0,0.0,0.0,57715.0
src_bytes,148517.0,40227.949299,5409612.0,0.0,0.0,44.0,278.0,1379964000.0
dst_bytes,148517.0,17088.853593,3703525.0,0.0,0.0,0.0,571.0,1309937000.0
land,148517.0,0.000215,0.01467714,0.0,0.0,0.0,0.0,1.0
wrong_fragment,148517.0,0.020523,0.2400691,0.0,0.0,0.0,0.0,3.0
urgent,148517.0,0.000202,0.01941708,0.0,0.0,0.0,0.0,3.0
hot,148517.0,0.189379,2.01316,0.0,0.0,0.0,0.0,101.0
num_failed_logins,148517.0,0.004323,0.07224823,0.0,0.0,0.0,0.0,5.0
logged_in,148517.0,0.402789,0.4904606,0.0,0.0,0.0,1.0,1.0
num_compromised,148517.0,0.255062,22.23137,0.0,0.0,0.0,0.0,7479.0


3. Preprocessing

3.1 Encoding

In [60]:
# Integer-encode the categorical columns. Each encoder is fitted on the combined
# train+test frame and stored in `label_encoder`, keyed by column name.
# NOTE: re-running this cell refits the encoders on the already-encoded integers;
# the column values stay the same, but the stored encoders no longer hold the
# original string classes.
label_encoder = {}
cat_cols = ['protocol_type', 'service', 'flag']
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoder[col] = le

# Binary classification target: 0 for 'normal', 1 for everything else.
# Only map while the column still holds the raw string labels. Without this guard
# a second run of the cell would compare integers against 'normal' and silently
# turn every label into 1.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = (df['label'] != 'normal').astype(int)

# Split the combined frame back into train/test by position: the first
# `train_len` rows are the training rows, the remainder the test rows.
train_len = df_train.shape[0]
df_all_train = df.iloc[:train_len, :].copy()
df_all_test = df.iloc[train_len:, :].copy()

X_train = df_all_train.drop('label', axis=1).values
y_train = df_all_train['label'].values
X_test = df_all_test.drop('label', axis=1).values
y_test = df_all_test['label'].values

# Min-max normalize the features; the scaler is fitted on the training rows only
# and then applied unchanged to the test rows.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



In [61]:
from sklearn.model_selection import cross_val_score

In [None]:
# GA fitness for the feature-selection layer
def fitness_func(features_mask):
    """Return the hold-out error of a small random forest restricted to a feature mask.

    Parameters
    ----------
    features_mask : array-like
        One entry per column of ``X_train``; entries are rounded to 0/1 and the
        columns marked 1 are kept.

    Returns
    -------
    float
        ``1 - accuracy`` on a stratified 20% validation split of the training
        data (lower is better). An all-zero mask returns the worst score, 1.0.

    Notes
    -----
    The score is measured on rows the forest was not fitted on. Scoring on the
    same rows used for fitting gives an error near zero for almost any mask, so
    it cannot tell useful feature subsets from useless ones.
    """
    features_mask = np.round(features_mask).astype(int)
    if np.sum(features_mask) == 0:  # Avoid empty feature sets
        return 1.0
    selected_features = X_train[:, features_mask == 1]

    # Fixed random_state: every candidate mask is judged on the same split.
    X_fit, X_val, y_fit, y_val = train_test_split(
        selected_features, y_train,
        test_size=0.2, random_state=1, stratify=y_train,
    )
    clf = RandomForestClassifier(n_estimators=20, random_state=1)
    clf.fit(X_fit, y_fit)
    preds = clf.predict(X_val)
    score = 1.0 - accuracy_score(y_val, preds)  # minimize error
    return score

# Seed NumPy's global RNG so the stochastic GA search is repeatable across runs.
# NOTE(review): assumes the GA library draws from NumPy's global RNG; confirm.
np.random.seed(42)

# One [0, 1] bound per feature. Presumably unused for variable_type='bool', but
# kept so the call stays valid if the variable type is changed.
varbound = np.array([[0, 1]] * X_train.shape[1])

algorithm_param = {
    'max_num_iteration': 20,
    'population_size': 20,
    'mutation_probability': 0.1,
    'elit_ratio': 0.05,
    'crossover_probability': 0.5,
    'parents_portion': 0.3,
    'crossover_type': 'uniform',
    'max_iteration_without_improv': 7
}

# function_timeout: the library aborts when a single fitness evaluation exceeds
# its default limit of 10 seconds, which a forest fit on the full training set
# can easily do, so the limit is raised explicitly.
model = ga(
    function=fitness_func,
    dimension=X_train.shape[1],
    variable_type='bool',
    variable_boundaries=varbound,
    function_timeout=300,
    algorithm_parameters=algorithm_param
)

model.run()

# Best mask found by the GA, as a 0/1 integer vector with one entry per feature.
ega_best_features = np.round(model.output_dict['variable']).astype(int)
ega_selected_idx = np.where(ega_best_features == 1)[0]
print("features selected after ega :", ega_selected_idx)
print(f"total :{len(ega_selected_idx)}/ {len(ega_best_features)}")

# PSO fitness for the feature-selection layer
def pso_fitness(mask):
    """Return the hold-out error of a small random forest restricted to a feature mask.

    Parameters
    ----------
    mask : numpy.ndarray
        One entry per column of ``X_train``; entries above 0.5 mark the columns
        to keep.

    Returns
    -------
    float
        ``1 - accuracy`` on a stratified 20% validation split of the training
        data (lower is better). An empty selection returns the worst score, 1.0.

    Notes
    -----
    The score is measured on rows the forest was not fitted on. Scoring on the
    same rows used for fitting gives an error near zero for almost any mask, so
    it cannot guide the search.
    """
    mask = (mask > 0.5).astype(int)
    if np.sum(mask) == 0:
        return 1.0
    selected_features = X_train[:, mask == 1]

    # Fixed random_state: every particle is judged on the same split.
    X_fit, X_val, y_fit, y_val = train_test_split(
        selected_features, y_train,
        test_size=0.2, random_state=2, stratify=y_train,
    )
    clf = RandomForestClassifier(n_estimators=20, random_state=2)
    clf.fit(X_fit, y_fit)
    preds = clf.predict(X_val)
    return 1.0 - accuracy_score(y_val, preds)

# Seed NumPy's global RNG so the PSO refinement is repeatable for a given GA result.
# NOTE(review): assumes pyswarms draws from NumPy's global RNG; confirm.
np.random.seed(42)

# Features the GA switched off. PSO only searches over these, giving each one a
# second chance to be re-admitted; GA-selected features always stay on.
rejected_idx = np.where(ega_best_features == 0)[0] #rejected features

if len(rejected_idx) > 0:
    # mask for rejected
    def pso_fitness_partial(x):
        """Score every particle in the swarm.

        `x` is iterated row by row; each row holds one particle's position, with
        one coordinate per GA-rejected feature. A coordinate above 0.5 switches
        that feature back on. Returns an array with one cost per particle.
        """
        fitness_values = []
        for particle_pos in x:
            mask = np.copy(ega_best_features)
            mask[rejected_idx] = (particle_pos > 0.5).astype(int)
            fitness_values.append(pso_fitness(mask))
        return np.array(fitness_values)

    options = {'c1': 0.5, 'c2': 0.3, 'w': 0.9}
    optimizer = ps.single.GlobalBestPSO(n_particles=10, dimensions=len(rejected_idx), options=options)
    optimal_cost, pos = optimizer.optimize(pso_fitness_partial, iters=10)
    # Combine mask: GA selection plus whatever PSO re-admitted
    final_mask = np.copy(ega_best_features)
    final_mask[rejected_idx] = (pos > 0.5).astype(int)
else:
    # The GA kept every feature, so there is nothing for PSO to refine.
    final_mask = ega_best_features

pso_selected_idx = np.where(final_mask == 1)[0]
print("features selected after pso:", pso_selected_idx)
print(f"total: {len(pso_selected_idx)} /{len(final_mask)}")

||||||||||________________________________________ 20.0% GA is running...

In [None]:
# Keep only the columns chosen by the feature-selection layer.
X_train_sel = X_train_scaled[:, final_mask == 1]
X_test_sel = X_test_scaled[:, final_mask == 1]

print("Training final Random Forest classifier...")
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train_sel, y_train)

# Training predictions are kept only as an overfitting reference. The reported
# metrics come from the test rows, which the model never saw during fitting;
# scoring on the training rows would overstate performance.
y_train_pred = rf.predict(X_train_sel)
y_test_pred = rf.predict(X_test_sel)

print("\nFinal Evaluation (test set)")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("Precision:", precision_score(y_test, y_test_pred, average='weighted', zero_division=0))
print("Recall:", recall_score(y_test, y_test_pred, average='weighted', zero_division=0))
print("F1-score:", f1_score(y_test, y_test_pred, average='weighted', zero_division=0))
# Rows are true classes, columns are predicted classes (0 = normal, 1 = attack).
print("Confusion matrix:\n", confusion_matrix(y_test, y_test_pred))
print("Training accuracy (reference only):", accuracy_score(y_train, y_train_pred))