In [None]:
%pip install pandas scikit-learn matplotlib seaborn



# **Load and Inspect the Datasets:**

In [6]:
import pandas as pd

# Load the datasets
train_df = pd.read_csv(r'C:\Users\anany\OneDrive\Documents\intership cybersecurity\NSL_KDD_Train.csv')
test_df = pd.read_csv(r'C:\Users\anany\OneDrive\Documents\intership cybersecurity\NSL_KDD_Test.csv')

# Display basic information about the datasets
print("Training Data Info:")
print(train_df.info())
print("\nTesting Data Info:")
print(test_df.info())

# Display the first few rows of the training dataset
print("\nFirst few rows of the training dataset:")
print(train_df.head())


Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125972 entries, 0 to 125971
Data columns (total 42 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   0         125972 non-null  int64  
 1   tcp       125972 non-null  object 
 2   ftp_data  125972 non-null  object 
 3   SF        125972 non-null  object 
 4   491       125972 non-null  int64  
 5   0.1       125972 non-null  int64  
 6   0.2       125972 non-null  int64  
 7   0.3       125972 non-null  int64  
 8   0.4       125972 non-null  int64  
 9   0.5       125972 non-null  int64  
 10  0.6       125972 non-null  int64  
 11  0.7       125972 non-null  int64  
 12  0.8       125972 non-null  int64  
 13  0.9       125972 non-null  int64  
 14  0.10      125972 non-null  int64  
 15  0.11      125972 non-null  int64  
 16  0.12      125972 non-null  int64  
 17  0.13      125972 non-null  int64  
 18  0.14      125972 non-null  int64  
 19  0.15      125972 non-nul

# **Data Preprocessing:**

In [9]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load data (replace with your actual paths)
train_df = pd.read_csv(r'C:\Users\anany\OneDrive\Documents\intership cybersecurity\NSL_KDD_Train.csv')
test_df = pd.read_csv(r'C:\Users\anany\OneDrive\Documents\intership cybersecurity\NSL_KDD_Test.csv')

# Add column names
column_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login',
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'target'
]
train_df.columns = column_names
test_df.columns = column_names

# Verify column names after loading
print(train_df.columns)

# Handle missing values (if any)
train_df = train_df.dropna()
test_df = test_df.dropna()

# Encode categorical variables, INCLUDING 'target'
categorical_columns = ['protocol_type', 'service', 'flag', 'target']
label_encoders = {col: LabelEncoder().fit(train_df[col]) for col in categorical_columns}

# Function to safely transform categorical variables
def transform_with_unknown(label_encoder, series):
    unique_values = set(series.unique())
    known_values = set(label_encoder.classes_)
    unknown_values = unique_values - known_values
    if unknown_values:
        # Add unknown values to the encoder's classes
        label_encoder.classes_ = np.append(label_encoder.classes_, list(unknown_values))
    return label_encoder.transform(series)

for col, le in label_encoders.items():
    train_df[col] = le.transform(train_df[col])
    test_df[col] = transform_with_unknown(le, test_df[col])

# Scale numerical features
scaler = StandardScaler()
numerical_columns = train_df.columns.difference(['target'])
train_df[numerical_columns] = scaler.fit_transform(train_df[numerical_columns])
test_df[numerical_columns] = scaler.transform(test_df[numerical_columns])

# Display the first few rows of the processed training dataset
print("Processed Training Data:")
print(train_df.head())

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'target'],
      dtype='object')
Processed Training Data:
   duration  protocol_type   service      flag  src_bytes  dst_bytes  \
0  -0.11025       2.219302  0.781420  0.751116  -0.

# **Feature and Target Separation:**

In [11]:
from sklearn.model_selection import train_test_split

# Use 'target' instead of 'attack_class'
X_train = train_df.drop('target', axis=1)  
y_train = train_df['target']
X_test = test_df.drop('target', axis=1)
y_test = test_df['target']

# Split training data for training and validation
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# **Model Training and Evaluation:**

In [12]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Train Isolation Forest model
iso_forest = IsolationForest(random_state=42)
iso_forest.fit(X_train_split)

# Predict on validation data
iso_val_preds = iso_forest.predict(X_val_split)

# Convert predictions to 0 and 1 (anomaly and normal)
iso_val_preds = [1 if pred == -1 else 0 for pred in iso_val_preds]

# Evaluate the model
iso_accuracy = accuracy_score(y_val_split, iso_val_preds)
iso_precision = precision_score(y_val_split, iso_val_preds, average='weighted')
iso_recall = recall_score(y_val_split, iso_val_preds, average='weighted')
iso_f1 = f1_score(y_val_split, iso_val_preds, average='weighted')

print(f'Isolation Forest - Accuracy: {iso_accuracy}, Precision: {iso_precision}, Recall: {iso_recall}, F1 Score: {iso_f1}')


Isolation Forest - Accuracy: 0.006231395118078984, Precision: 5.380882656693041e-05, Recall: 0.006231395118078984, F1 Score: 0.00010668630771062994


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest model
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_split, y_train_split)

# Predict on validation data
rf_val_preds = rf.predict(X_val_split)

# Evaluate the model
rf_accuracy = accuracy_score(y_val_split, rf_val_preds)
rf_precision = precision_score(y_val_split, rf_val_preds, average='weighted')
rf_recall = recall_score(y_val_split, rf_val_preds, average='weighted')
rf_f1 = f1_score(y_val_split, rf_val_preds, average='weighted')

print(f'Random Forest - Accuracy: {rf_accuracy}, Precision: {rf_precision}, Recall: {rf_recall}, F1 Score: {rf_f1}')


Random Forest - Accuracy: 0.9984917642389363, Precision: 0.998175925657884, Recall: 0.9984917642389363, F1 Score: 0.9983153306259545


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# **Ensemble Model Training and Evaluation:**

In [14]:
import numpy as np
from sklearn.linear_model import LogisticRegression

# Prepare the meta features
train_meta_features = np.column_stack((iso_forest.predict(X_train_split), rf.predict(X_train_split)))
val_meta_features = np.column_stack((iso_forest.predict(X_val_split), rf.predict(X_val_split)))

# Train the meta model
meta_model = LogisticRegression(random_state=42)
meta_model.fit(train_meta_features, y_train_split)

# Predict on validation data using the meta model
val_meta_preds = meta_model.predict(val_meta_features)

# Evaluate the ensemble model
ensemble_accuracy = accuracy_score(y_val_split, val_meta_preds)
ensemble_precision = precision_score(y_val_split, val_meta_preds, average='weighted')
ensemble_recall = recall_score(y_val_split, val_meta_preds, average='weighted')
ensemble_f1 = f1_score(y_val_split, val_meta_preds, average='weighted')

print(f'Ensemble Model - Accuracy: {ensemble_accuracy}, Precision: {ensemble_precision}, Recall: {ensemble_recall}, F1 Score: {ensemble_f1}')


Ensemble Model - Accuracy: 0.9113316134153602, Precision: 0.8640899658733974, Recall: 0.9113316134153602, F1 Score: 0.8855117829766441


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Prepare meta features for test set
test_meta_features = np.column_stack((iso_forest.predict(X_test), rf.predict(X_test)))

# Predict on test data
test_meta_preds = meta_model.predict(test_meta_features)

# Evaluate the ensemble model on test data
test_accuracy = accuracy_score(y_test, test_meta_preds)
test_precision = precision_score(y_test, test_meta_preds, average='weighted')
test_recall = recall_score(y_test, test_meta_preds, average='weighted')
test_f1 = f1_score(y_test, test_meta_preds, average='weighted')

print(f'Ensemble Model Test Set - Accuracy: {test_accuracy}, Precision: {test_precision}, Recall: {test_recall}, F1 Score: {test_f1}')


Ensemble Model Test Set - Accuracy: 0.6878992193044713, Precision: 0.5087245624144348, Recall: 0.6878992193044713, F1 Score: 0.5795289177354543


  _warn_prf(average, modifier, msg_start, len(result))
