<a href="https://colab.research.google.com/github/BinaryNavigator07/Network-Intrusion-Detection-System/blob/main/CICIDS%202017/2017_feature_extraction_with_machine_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install seaborn catboost --quiet
import os
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
np.random.seed(178)

In [None]:
import os
import pandas as pd
import kagglehub

def _list_csv_files(folder):
    """Return the names of the ``.csv`` files directly inside ``folder``, sorted.

    ``os.listdir`` returns entries in arbitrary order; sorting keeps the row
    order of the combined frame identical from run to run.
    """
    return sorted(f for f in os.listdir(folder) if f.endswith(".csv"))


def _read_csv_files(folder, file_names):
    """Read each named CSV in ``folder`` and return the frames as a list."""
    frames = []
    for file_name in file_names:
        file_path = os.path.join(folder, file_name)
        print("Loading:", file_path)
        frames.append(pd.read_csv(file_path))
    return frames


# Dataset 1
path1 = kagglehub.dataset_download("chethuhn/network-intrusion-dataset")
print("Dataset 1 folder:", path1)

csv_file_names1 = _list_csv_files(path1)
print(f"Found {len(csv_file_names1)} CSV files in dataset 1.")

dfs = _read_csv_files(path1, csv_file_names1)

# Dataset 2
path2 = kagglehub.dataset_download("solarmainframe/ids-intrusion-csv")
print("Dataset 2 folder:", path2)

csv_file_names2 = _list_csv_files(path2)
print(f"Found {len(csv_file_names2)} CSV files in dataset 2.")

dfs.extend(_read_csv_files(path2, csv_file_names2))

# NOTE(review): pd.concat aligns frames by column name. If the two downloads
# spell their headers differently, the result is the union of both schemas
# with NaN in the columns a file does not have -- confirm the headers match.
df = pd.concat(dfs, ignore_index=True)
print("Shape of combined data:", df.shape)

df.head()

In [None]:
df.columns

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# Count the distinct values in the label column. The column is addressed with
# its leading space (' Label') because header whitespace is only stripped
# later, inside data_cleaning().
df[' Label'].nunique()

### Data Cleaning

In [None]:

from itertools import combinations

def data_cleaning(df):
    """Clean ``df`` in place and return it.

    Steps, in order:

    1. Strip surrounding whitespace from the column names.
    2. Clamp negative numeric values to 0.
    3. Drop columns that hold a single distinct value.
    4. Replace +/-inf with NaN, then drop every row containing a NaN.
    5. Drop duplicate rows.
    6. Drop columns that are exact copies of an earlier column.

    A short progress report is printed after each step.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    ValueError
        If stripping whitespace makes two column names identical. The
        per-column logic below needs unique names.
    """
    df.columns = df.columns.str.strip()
    clashing = df.columns[df.columns.duplicated()].unique().tolist()
    if clashing:
        raise ValueError(
            f"Column names collide after stripping whitespace: {clashing}"
        )
    print("Dataset Shape: ",df.shape)

    # Clamp negatives through the public API and assign the result back, so the
    # change reaches ``df`` regardless of pandas' view/copy semantics.
    numeric_cols = df.select_dtypes(include=np.number).columns
    if len(numeric_cols):
        df[numeric_cols] = df[numeric_cols].clip(lower=0)

    # A column counts as zero-variance when it has exactly one distinct value
    # (NaN is counted as a value, matching Series.unique()).
    zero_variance_cols = [
        col for col in df.columns if df[col].nunique(dropna=False) == 1
    ]
    df.drop(columns=zero_variance_cols, inplace=True)
    print("Zero Variance Columns: ",zero_variance_cols, " are dropped!!")
    print("Shape after removing the zero variance columns: ",df.shape)

    df.replace([np.inf,-np.inf],np.nan,inplace=True)
    print(df.isna().any(axis=1).sum(), "rows dropped")
    df.dropna(inplace=True)
    print("Shape after Removing NaN: ",df.shape)

    df.drop_duplicates(inplace=True)
    print("Shape after dropping duplicates: ",df.shape)

    # For every pair of identical columns keep the first and drop the second.
    column_pairs = [
        (i, j) for i, j in combinations(df.columns, 2) if df[i].equals(df[j])
    ]
    # A column can be the second member of several pairs; list it only once.
    ide_cols = list(dict.fromkeys(second for _, second in column_pairs))
    df.drop(columns=ide_cols, inplace=True)
    print("Columns which have identical values: ",column_pairs," dropped!")
    print("Shape after removing identical value columns: ",df.shape)
    return df
df=data_cleaning(df)

In [None]:
df.columns

In [None]:
df['Label'].value_counts()

In [None]:
# Count plot of every class except 'BENIGN'.
# seaborn (imported as `sb`) and matplotlib are installed and imported in the
# first cell of the notebook, so nothing is installed or re-imported here.
fig, ax = plt.subplots(figsize=(12, 6))

sb.countplot(
    data=df.loc[df['Label'] != 'BENIGN'],
    y='Label',
    ax=ax,
)

# Log scale so that rare classes stay visible next to frequent ones.
ax.set_xscale('log')
ax.set_xlabel('Count (log scale)')
ax.set_title('Rows per non-BENIGN label')
fig.tight_layout()

# Save the plot
fig.savefig('img1.png')

plt.show()

### Data Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import pandas as pd

# Split the cleaned frame into the class column and the predictor columns.
y = df['Label']
X = df.drop(columns='Label')

# Rescale every predictor to [0, 1] and put the column names back on.
# NOTE(review): the scaler is fit on all rows, before the train/test split
# that happens in a later cell, so held-out rows influence the min/max used
# for scaling. Consider fitting on the training rows only.
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Build a dense one-hot encoder; the keyword for "dense output" was renamed
# between scikit-learn releases, so fall back to the old spelling if needed.
try:
    encoder = OneHotEncoder(sparse_output=False)  # New versions
except TypeError:
    encoder = OneHotEncoder(sparse=False)  # Old versions

# The encoder expects a 2-D input, hence the reshape into a single column.
label_column = y.values.reshape(-1, 1)
y_encoded = pd.DataFrame(
    encoder.fit_transform(label_column),
    columns=encoder.get_feature_names_out(['Label']),
)

print("X_scaled shape:", X_scaled.shape)
print("y_encoded shape:", y_encoded.shape)


In [None]:
X_scaled

In [None]:
y_encoded

In [None]:
# Combine the scaled features and the one-hot encoded label side by side.
# This rebinds `df`: from here on it holds the scaled features plus the
# one-hot label columns and no longer has a single 'Label' column, so earlier
# cells that read df['Label'] cannot be re-run without reloading the data.
df = pd.concat([X_scaled, y_encoded], axis=1)
df

In [None]:
# Hold out 20% of the rows for testing. An explicit random_state makes the
# split reproducible even when cells are re-run or run out of order, instead
# of depending on the current state of NumPy's global random generator.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_encoded, test_size=0.2, random_state=178
)
print(X_train.shape," ",X_test.shape)
print(y_train.shape," ",y_test.shape)

In [None]:
X_train

In [None]:
X_test

In [None]:
y_train

In [None]:
y_test

In [None]:
# Keep a random 10% of each split for better running time.
# Rows are drawn WITHOUT replacement so that no row is repeated, and the
# targets are selected by index so they always stay aligned with the features.
# NOTE: this cell rebinds X_train/X_test/y_train/y_test; re-running it shrinks
# the data again, so re-run the split cell first.
SAMPLE_FRACTION = 0.1

X_train = X_train.sample(frac=SAMPLE_FRACTION, random_state=1)
y_train = y_train.loc[X_train.index]
X_test = X_test.sample(frac=SAMPLE_FRACTION, random_state=1)
y_test = y_test.loc[X_test.index]
print (X_train.shape, y_train.shape)
print( X_test.shape, y_test.shape)

### Feature selection using Random Forest

In [None]:
# Fit a random forest and let SelectFromModel pick features based on the
# forest's feature importances.
sel = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1))
sel.fit(X_train, y_train)

# Names and number of the columns that were kept.
features = X_train.columns[sel.get_support()]
print(features)

print(len(features))

# Mean importance followed by the per-feature importances, for reference.
print(np.mean(sel.estimator_.feature_importances_))
print(sel.estimator_.feature_importances_)

# Reduce both splits to the selected columns. The `_rfe` suffix is only a
# name: the selection above is done with SelectFromModel.
X_train_rfe = sel.transform(X_train)
X_test_rfe = sel.transform(X_test)

### Random Forest

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,roc_curve,accuracy_score,f1_score,precision_score,recall_score
def run_randomForest(X_train, X_test, y_train, y_test):
    """Train a 100-tree random forest and print its scores on the test split.

    Prints accuracy, then precision, recall and F1 (each with
    ``average='weighted'``). Nothing is returned.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)

    print('Accuracy: ', accuracy_score(y_test, predictions))
    weighted_scorers = (
        ('Precision: ', precision_score),
        ('Recall: ', recall_score),
        ('F1 score: ', f1_score),
    )
    for caption, scorer in weighted_scorers:
        print(caption, scorer(y_test, predictions, average='weighted'))

In [None]:
run_randomForest(X_train_rfe, X_test_rfe, y_train, y_test)

### Decision Tree

In [None]:
def run_decisionTree(X_train, X_test, y_train, y_test):
    """Train a decision tree and print its scores on the test split.

    Prints accuracy, then precision, recall and F1 (each with
    ``average='weighted'``). Nothing is returned.
    """
    tree = DecisionTreeClassifier(random_state=0)
    tree.fit(X_train, y_train)
    predictions = tree.predict(X_test)

    report = [('Accuracy: ', accuracy_score(y_test, predictions))]
    report += [
        (caption, scorer(y_test, predictions, average='weighted'))
        for caption, scorer in (
            ('Precision: ', precision_score),
            ('Recall: ', recall_score),
            ('F1 score: ', f1_score),
        )
    ]
    for caption, score in report:
        print(caption, score)

In [None]:
run_decisionTree(X_train_rfe, X_test_rfe, y_train, y_test)

### Extreme Gradient Boosting (XGBoost)

In [None]:
def run_XGBoost(X_train, X_test, y_train, y_test):
    """Train a 100-round XGBoost classifier and print its test-split scores.

    Prints accuracy, then precision, recall and F1 (each with
    ``average='weighted'``). Nothing is returned.
    """
    booster = xgb.XGBClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    booster.fit(X_train, y_train)
    predictions = booster.predict(X_test)

    weighted = {'average': 'weighted'}
    print('Accuracy: ', accuracy_score(y_test, predictions))
    print('Precision: ', precision_score(y_test, predictions, **weighted))
    print('Recall: ', recall_score(y_test, predictions, **weighted))
    print('F1 score: ', f1_score(y_test, predictions, **weighted))

In [None]:
run_XGBoost(X_train_rfe, X_test_rfe, y_train, y_test)

### CatBoost

In [None]:
from sklearn.metrics import classification_report
def run_catBoost(X_train, X_test, y_train, y_test):
    """Train a 100-iteration CatBoost classifier and print its test-split scores.

    ``y_train`` and ``y_test`` are collapsed to one label per row by taking,
    for each row, the name of the column holding the largest value. Accuracy,
    precision, recall and F1 (the last three with ``average='weighted'``) are
    then computed on those labels and printed. Nothing is returned.
    """
    # One label per row: the column name with the row-wise maximum.
    train_labels = y_train.idxmax(axis=1)
    test_labels = y_test.idxmax(axis=1)

    model = CatBoostClassifier(iterations=100, random_state=0, verbose=0)
    model.fit(X_train, train_labels, cat_features=[], verbose=0)
    predicted_labels = model.predict(X_test)

    print('Accuracy: ', accuracy_score(test_labels, predicted_labels))
    for caption, scorer in (
        ('Precision: ', precision_score),
        ('Recall: ', recall_score),
        ('F1 score: ', f1_score),
    ):
        print(caption, scorer(test_labels, predicted_labels, average='weighted'))

In [None]:
run_catBoost(X_train_rfe, X_test_rfe, y_train, y_test)

In [None]:
# Scores (in percent) per model, in the order given by `metrics`.
# These numbers are typed in by hand, not read from the training cells above.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
random_forest = [99.80, 99.79, 99.80, 99.79]
decision_tree = [99.71, 99.72, 99.71, 99.71]
xgboost = [99.75, 99.77, 99.80, 99.78]
catboost = [99.55, 99.55, 99.55, 99.53]

models = ['Random Forest', 'Decision Tree', 'XG-Boost', 'Cat-Boost']
values = [random_forest, decision_tree, xgboost, catboost]

bar_colors = ['#4285F4', '#DB4437', '#F4B400', '#0F9D58']

# One panel per model on a 2x2 grid.
fig, axs = plt.subplots(2, 2, figsize=(10, 8))
axs = axs.flatten()

for model, scores, ax in zip(models, values, axs):
    ax.bar(metrics, scores, color=bar_colors)
    ax.set_title(f'{model} Performance', fontsize=14, fontweight='bold')
    # Start the axis at 90 so the small differences are visible, and leave
    # headroom above the tallest bar for the value labels.
    ax.set_ylim(90, max(scores) + 2)

    # Write each value just above its bar.
    for position, score in enumerate(scores):
        ax.annotate(
            f'{score:.2f}%',
            xy=(position, score),
            xytext=(0, 8),
            textcoords="offset points",
            ha='center',
            va='bottom',
            fontsize=10,
        )

    ax.set_xlabel('Metric', fontsize=12)
    ax.set_ylabel('Score', fontsize=12)

plt.tight_layout()
plt.show()