In [None]:
!pip install seaborn catboost --quiet
import tensorflow as tf
import pandas as pd
import numpy as np
import sklearn
from keras.models import Sequential, load_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler,OneHotEncoder
from sklearn.metrics import confusion_matrix
from tensorflow.keras.utils import to_categorical

In [None]:
import os
import pandas as pd
import kagglehub

# Download the 2018 CICIDS dataset; the returned value is used below as the
# local folder that holds the CSV files.
path_2018 = kagglehub.dataset_download("solarmainframe/ids-intrusion-csv")
print("Dataset folder for 2018:", path_2018)

# List all CSV files in the 2018 dataset directory.
# os.listdir() yields entries in arbitrary order, so the names are sorted to
# make the "first 5 files" subset below identical on every run and machine.
csv_files_2018 = sorted(f for f in os.listdir(path_2018) if f.endswith('.csv'))
print(f"Found {len(csv_files_2018)} CSV files in 2018 dataset.")

# Load only first 5 CSV files for faster processing
csv_files_2018_subset = csv_files_2018[:5]

dfs_2018 = []
for file_name in csv_files_2018_subset:
    file_path = os.path.join(path_2018, file_name)
    print("Loading:", file_path)
    # Append directly instead of rebinding `df` on every iteration, so `df`
    # only ever names the combined frame created below.
    dfs_2018.append(pd.read_csv(file_path, low_memory=False))

# Combine loaded CSV files into one DataFrame; ignore_index=True gives the
# result a fresh 0..n-1 index.
df = pd.concat(dfs_2018, ignore_index=True)
print("Shape of combined 2018 data:", df.shape)

# Preview the combined data
df.head()


In [None]:
# Print a structural summary of the combined frame.
df.info()

In [None]:
# Show the column labels of the combined frame.
df.columns

In [None]:
# Distinct values present in the 'Label' column.
df['Label'].unique()

In [None]:
# Number of rows for each distinct 'Label' value.
df['Label'].value_counts()

In [None]:
# Wrap the 'Label' column in its own DataFrame and show it as the cell output.
Label = pd.DataFrame(df.Label)
Label

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Module-level scaler instance; standardization() below calls fit_transform on it.
std_scaler = MinMaxScaler()
def standardization(df, col, scaler=None):
    """Rescale the selected columns of ``df`` in place and return ``df``.

    Despite the name, the default behaviour is min-max scaling, not z-score
    standardisation: when ``scaler`` is omitted the module-level
    ``std_scaler`` (a ``MinMaxScaler``) is used.  The scaler is re-fitted on
    each column in turn, so after the call it only holds the parameters of
    the last column processed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten with their rescaled values.
    col : label or iterable of labels
        Column(s) to rescale.  A single string label is accepted and treated
        as a one-element list.
    scaler : object, optional
        Any object exposing ``fit_transform(values)`` for a 2-D array of
        shape (n_rows, 1).  Defaults to the module-level ``std_scaler``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    if scaler is None:
        scaler = std_scaler
    # A bare string would otherwise be iterated character by character and
    # each character looked up as a column label.
    if isinstance(col, str):
        col = [col]
    for name in col:
        # The scaler is fed a single-feature 2-D array, one column at a time.
        column_2d = np.array(df[name]).reshape(-1, 1)
        scaled = scaler.fit_transform(column_2d)
        # Flatten the (n_rows, 1) result so a plain 1-D column is stored.
        df[name] = np.asarray(scaled).ravel()
    return df

# Rescale every column that pandas reports as numeric, using standardization().
# NOTE(review): this runs on the full frame before train_test_split is called
# further down, so the scaling parameters are computed from rows that later end
# up in the test split — consider fitting the scaler on the training rows only.
numeric_col = df.select_dtypes(include='number').columns
df = standardization(df,numeric_col)

In [9]:
# Display the frame after the numeric columns were rescaled.
df

KeyboardInterrupt: 

In [None]:
# Print the structural summary again, now for the rescaled frame.
df.info()

In [None]:
# Separate the target column from the remaining (feature) columns.
y = df['Label']
X = df.drop(columns='Label')

In [None]:
# One-hot encode the label column; sparse_output=False makes the encoder
# return a dense array rather than a sparse matrix.
encoder = OneHotEncoder(sparse_output=False)
y_encoded = encoder.fit_transform(y.values.reshape(-1, 1))
# Keep y's index on the encoded frame so it stays row-aligned with X in any
# index-based operation (e.g. the column-wise concat performed later) instead
# of relying on df having a default 0..n-1 index.
y_encoded = pd.DataFrame(y_encoded, index=y.index)
y_encoded

In [None]:
# Display the feature frame (everything except 'Label').
X

In [None]:
# Rebuild df as the feature columns followed by the encoded label columns,
# joined side by side, and show the result.
df = pd.concat((X, y_encoded), axis='columns')
df

In [None]:
# Print the structural summary of the frame that now carries the encoded label columns.
df.info()

In [None]:
# Hold out 20% of the rows for testing.  The split is seeded (same seed value
# as the sub-sampling step that follows) so that re-running the notebook
# produces the same partition and therefore comparable results.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=1
)
print(X_train.shape, " ", X_test.shape)
print(y_train.shape, " ", y_test.shape)

In [None]:
# Sub-sample 10% of each split (frac=0.1) to keep the running time manageable.
#
# Only the feature frames are sampled; the labels are then selected by the
# sampled index, so each feature row is paired with its own label explicitly
# rather than through two independent .sample() calls happening to draw the
# same positions.  This relies on X_* and y_* sharing the same unique index,
# which holds here because both descend from a frame built with
# ignore_index=True and are split together by train_test_split.
#
# NOTE(review): replace=True means a row can be drawn more than once — confirm
# that duplicates are wanted for what is described as a plain down-sampling step.
X_train = X_train.sample(frac=0.1, replace=True, random_state=1)
y_train = y_train.loc[X_train.index]
X_test = X_test.sample(frac=0.1, replace=True, random_state=1)
y_test = y_test.loc[X_test.index]
print (X_train.shape, y_train.shape)
print( X_test.shape, y_test.shape)

In [None]:
# RandomForestClassifier is used below but was not imported anywhere above,
# so it is imported here next to SelectFromModel.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Fit a random forest on the training split and let SelectFromModel decide,
# from the fitted forest, which feature columns to keep.
sel = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1))
sel.fit(X_train, y_train)

# Names of the columns flagged as selected by the boolean support mask.
features = X_train.columns[sel.get_support()]
print(features)

print(len(features))

# Per-feature importances of the fitted forest.
print(sel.estimator_.feature_importances_)

# Reduce both splits to the selected columns.  Despite the `_rfe` suffix this
# is SelectFromModel output, not recursive feature elimination.
X_train_rfe = sel.transform(X_train)
X_test_rfe = sel.transform(X_test)

In [None]:
# Make sure both selected-feature matrices are plain NumPy arrays.
X_train_rfe, X_test_rfe = np.array(X_train_rfe), np.array(X_test_rfe)
# The raw label column is converted to an array as well.
y = np.array(y)

In [None]:
# Insert a length-1 middle axis: (rows, features) -> (rows, 1, features).
X_train = X_train_rfe.reshape(X_train_rfe.shape[0], 1, X_train_rfe.shape[1])
X_test = X_test_rfe.reshape(X_test_rfe.shape[0], 1, X_test_rfe.shape[1])

In [None]:
from keras.layers import Dense, LSTM, MaxPool1D, Flatten, Dropout # importing dense layer
from keras.models import Sequential #importing Sequential layer
from keras.layers import Input
from keras.models import Model

# Stacked-LSTM classifier.  Each sample is presented as a length-1 sequence,
# matching the (rows, 1, features) arrays built from X_train_rfe.
n_selected_features = X_train_rfe.shape[1]
# One output unit per one-hot label column.  The number of label columns
# depends on which classes occur in the loaded data, so it is read from
# y_train instead of being fixed at 7; a mismatch would make training fail
# with a shape error.
n_classes = y_train.shape[1]

model = Sequential() # initializing model
model.add(LSTM(64, return_sequences=True, input_shape=(1, n_selected_features)))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=True))
# Collapse the (timesteps, units) output into a flat vector for the dense head.
model.add(Flatten())
# NOTE(review): no activation is given here, so this layer adds no
# non-linearity — confirm that is intended.
model.add(Dense(units=50))
# output layer with softmax activation
model.add(Dense(units=n_classes, activation='softmax'))

In [None]:
# Compile the model: categorical cross-entropy as the loss, Adam as the
# optimizer, and accuracy as the reported metric.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Print a summary of the model's layers.
model.summary()

In [None]:
# Train for 100 epochs in batches of 5000 samples; validation_split=0.2 asks
# Keras to hold back 20% of the training data for validation.  The returned
# object is kept in `history`.
history = model.fit(X_train, y_train, epochs=100, batch_size=5000,validation_split=0.2)

In [None]:
# Evaluate the trained model on the test split (this scores the model; it does
# not return per-row predictions).  As used in the print below, element 0 of
# the result is treated as the loss and element 1 as the accuracy.
test_results = model.evaluate(X_test, y_test, verbose=1)
print(f'Test results - Loss: {test_results[0]} - Accuracy: {test_results[1]*100}%')