In [15]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LinearRegression, SGDClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
import plotly.express as px
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
import numpy as np
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dropout, Dense

In [16]:
dataframes = []
folders = ['ctu-1-1', 'ctu-3-1', 'ctu-4-1', 'ctu-5-1', 'ctu-8-1'] 

for folder in folders:
    attack_path = os.path.join(folder, 'attack_data.csv')
    benign_path = os.path.join(folder, 'benign_data.csv')
    
    attack_data = pd.read_csv(attack_path)
    benign_data = pd.read_csv(benign_path)
    
    attack_data['label'] = 1  # 1 for attack
    benign_data['label'] = 0  # 0 for benign
    
    dataframes.append(attack_data)
    dataframes.append(benign_data)

df = pd.concat(dataframes, ignore_index=True)
pd.set_option('display.max_columns', None) 
pd.set_option('display.width', 1000)

print(df.head())

  df = pd.concat(dataframes, ignore_index=True)


             ts                 uid        id.orig_h id.orig_p        id.resp_h id.resp_p proto service  duration orig_bytes resp_bytes conn_state local_orig local_resp missed_bytes history orig_pkts orig_ip_bytes resp_pkts resp_ip_bytes  label
0  1.525880e+09  CUmrqr4svHuSXJy5z7  192.168.100.103     51524   65.127.233.163        23   tcp       -  2.999051          0          0         S0          -          -            0       S         3           180         0             0      1
1  1.525880e+09  CH98aB3s1kJeq6SFOc  192.168.100.103     56305    63.150.16.171        23   tcp       -         -          -          -         S0          -          -            0       S         1            60         0             0      1
2  1.525880e+09   C3GBTkINvXNjVGtN5  192.168.100.103     41101     111.40.23.49        23   tcp       -         -          -          -         S0          -          -            0       S         1            60         0             0      1
3  1.525880e+09   CD

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1177080 entries, 0 to 1177079
Data columns (total 21 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   ts             1177080 non-null  float64
 1   uid            1177080 non-null  object 
 2   id.orig_h      1177080 non-null  object 
 3   id.orig_p      1177080 non-null  object 
 4   id.resp_h      1177080 non-null  object 
 5   id.resp_p      1177080 non-null  object 
 6   proto          1177080 non-null  object 
 7   service        1177080 non-null  object 
 8   duration       1177080 non-null  object 
 9   orig_bytes     1177080 non-null  object 
 10  resp_bytes     1177080 non-null  object 
 11  conn_state     1177080 non-null  object 
 12  local_orig     1177080 non-null  object 
 13  local_resp     1177080 non-null  object 
 14  missed_bytes   1177080 non-null  object 
 15  history        1177080 non-null  object 
 16  orig_pkts      1177080 non-null  object 
 17  orig_ip_

In [19]:
import plotly.express as px

label_counts = df['label'].value_counts().reset_index()
label_counts.columns = ['label', 'count']  # Rename columns for clarity

fig = px.bar(
    label_counts,
    x='label',
    y='count',
    labels={'label': 'Label', 'count': 'Count'},
    color='label',  # Optional: Different colors for attack and benign
    title='Distribution of Labels'
)

fig.show()

In [3]:
for col in df.select_dtypes(include=['object']).columns:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])

In [4]:
X = df.drop(columns=['label'])
y = df['label']

In [5]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [6]:
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)

print("Random Forest")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

Random Forest
Accuracy: 0.9998513270126075
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     95470
           1       1.00      1.00      1.00    139946

    accuracy                           1.00    235416
   macro avg       1.00      1.00      1.00    235416
weighted avg       1.00      1.00      1.00    235416



In [7]:
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_test)

# Convert predictions to binary
y_pred_lr = (y_pred_lr > 0.5).astype(int)

print("Linear Regression")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))

Linear Regression
Accuracy: 0.9592126278587692
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.90      0.95     95470
           1       0.94      1.00      0.97    139946

    accuracy                           0.96    235416
   macro avg       0.97      0.95      0.96    235416
weighted avg       0.96      0.96      0.96    235416



In [8]:
model_ab = AdaBoostClassifier(random_state=42)
model_ab.fit(X_train, y_train)
y_pred_ab = model_ab.predict(X_test)

print("AdaBoost")
print("Accuracy:", accuracy_score(y_test, y_pred_ab))
print("Classification Report:\n", classification_report(y_test, y_pred_ab))



AdaBoost
Accuracy: 0.9956375097699391
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99     95470
           1       0.99      1.00      1.00    139946

    accuracy                           1.00    235416
   macro avg       1.00      0.99      1.00    235416
weighted avg       1.00      1.00      1.00    235416



In [9]:
model_gb = GradientBoostingClassifier(random_state=42)
model_gb.fit(X_train, y_train)
y_pred_gb = model_gb.predict(X_test)

print("Gradient Boosting")
print("Accuracy:", accuracy_score(y_test, y_pred_gb))
print("Classification Report:\n", classification_report(y_test, y_pred_gb))

Gradient Boosting
Accuracy: 0.9998810616100859
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     95470
           1       1.00      1.00      1.00    139946

    accuracy                           1.00    235416
   macro avg       1.00      1.00      1.00    235416
weighted avg       1.00      1.00      1.00    235416



In [10]:
model_sgd = SGDClassifier(random_state=42)
model_sgd.fit(X_train, y_train)
y_pred_sgd = model_sgd.predict(X_test)

print("SGD Classifier")
print("Accuracy:", accuracy_score(y_test, y_pred_sgd))
print("Classification Report:\n", classification_report(y_test, y_pred_sgd))

SGD Classifier
Accuracy: 0.9600621877867265
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.90      0.95     95470
           1       0.94      1.00      0.97    139946

    accuracy                           0.96    235416
   macro avg       0.97      0.95      0.96    235416
weighted avg       0.96      0.96      0.96    235416



In [None]:
model_svm = SVC(random_state=42, probability=True)  
model_svm.fit(X_train, y_train)

y_pred_svm = model_svm.predict(X_test)

print("SVM")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Classification Report:\n", classification_report(y_test, y_pred_svm))


In [11]:
model_mlp = MLPClassifier(hidden_layer_sizes=(128,), random_state=42, max_iter=300)
model_mlp.fit(X_train, y_train)
y_pred_mlp = model_mlp.predict(X_test)

print("Neural Network (MLP)")
print("Accuracy:", accuracy_score(y_test, y_pred_mlp))
print("Classification Report:\n", classification_report(y_test, y_pred_mlp))


Neural Network (MLP)
Accuracy: 0.996376626907262
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00     95470
           1       0.99      1.00      1.00    139946

    accuracy                           1.00    235416
   macro avg       1.00      1.00      1.00    235416
weighted avg       1.00      1.00      1.00    235416



In [12]:
model_dnn = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid') 
])

model_dnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_dnn.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2, verbose=1)

y_pred_dnn = (model_dnn.predict(X_test) > 0.5).astype(int)

print("Deep Neural Network (DNN)")
print("Accuracy:", accuracy_score(y_test, y_pred_dnn))
print("Classification Report:\n", classification_report(y_test, y_pred_dnn))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m11771/11771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 2ms/step - accuracy: 0.9827 - loss: 0.0670 - val_accuracy: 0.9936 - val_loss: 0.0324
Epoch 2/10
[1m11771/11771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 2ms/step - accuracy: 0.9934 - loss: 0.0337 - val_accuracy: 0.9938 - val_loss: 0.0296
Epoch 3/10
[1m11771/11771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step - accuracy: 0.9934 - loss: 0.0316 - val_accuracy: 0.9938 - val_loss: 0.0276
Epoch 4/10
[1m11771/11771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 2ms/step - accuracy: 0.9936 - loss: 0.0296 - val_accuracy: 0.9938 - val_loss: 0.0261
Epoch 5/10
[1m11771/11771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step - accuracy: 0.9937 - loss: 0.0282 - val_accuracy: 0.9940 - val_loss: 0.0245
Epoch 6/10
[1m11771/11771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 1ms/step - accuracy: 0.9935 - loss: 0.0277 - val_accuracy: 0.9939 - val_loss: 0.024

In [None]:
X_train_reshaped = np.expand_dims(X_train, axis=-1) 
X_test_reshaped = np.expand_dims(X_test, axis=-1)

model_cnn_lstm = Sequential([
    Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train_reshaped.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    LSTM(50, return_sequences=False), 
    Dropout(0.3),
    Dense(1, activation='sigmoid') 
])

model_cnn_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_cnn_lstm.fit(X_train_reshaped, y_train, epochs=10, batch_size=64, validation_split=0.2, verbose=1)

y_pred_cnn_lstm = (model_cnn_lstm.predict(X_test_reshaped) > 0.5).astype(int)

print("CNN-LSTM")
print("Accuracy:", accuracy_score(y_test, y_pred_cnn_lstm))
print("Classification Report:\n", classification_report(y_test, y_pred_cnn_lstm))


Epoch 1/10
[1m11771/11771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 6ms/step - accuracy: 0.9767 - loss: 0.0757 - val_accuracy: 0.9933 - val_loss: 0.0341
Epoch 2/10
[1m11771/11771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 6ms/step - accuracy: 0.9931 - loss: 0.0360 - val_accuracy: 0.9938 - val_loss: 0.0329
Epoch 3/10
[1m11771/11771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 7ms/step - accuracy: 0.9937 - loss: 0.0337 - val_accuracy: 0.9938 - val_loss: 0.0341
Epoch 4/10
[1m11771/11771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 9ms/step - accuracy: 0.9941 - loss: 0.0301 - val_accuracy: 0.9957 - val_loss: 0.0225
Epoch 5/10
[1m11771/11771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 8ms/step - accuracy: 0.9951 - loss: 0.0250 - val_accuracy: 0.9962 - val_loss: 0.0223
Epoch 6/10
[1m11771/11771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 7ms/step - accuracy: 0.9956 - loss: 0.0220 - val_accuracy: 0.9961 - val_loss: 0.02