In [9]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import tensorflow as tf
from tensorflow import keras

# Check TensorFlow version
print(f"TensorFlow version: {tf.__version__}")

# Load dataset in chunks to avoid MemoryError
print("Loading dataset in chunks...")
chunk_size = 100000  # Adjust based on available memory
df_list = []

for chunk in pd.read_csv("network_data.csv", chunksize=chunk_size, usecols=["protocol", "src_port", "dst_port", "bytes", "packets", "label"]):
    # Convert categorical columns (e.g., protocol)
    chunk['protocol'] = chunk['protocol'].astype('category').cat.codes
    
    # Append processed chunk to list
    df_list.append(chunk)

# Combine chunks into a single DataFrame
df = pd.concat(df_list, ignore_index=True)
print(f"Dataset loaded successfully! Shape: {df.shape}")

# Encode label column
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])
print("Label encoding completed!")

# Split into features (X) and target (y)
X = df.drop(columns=['label'])
y = df['label']

# Normalize the features
print("Scaling features...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("Feature scaling completed!")

# Train-test split
print("Splitting dataset into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
print(f"Train size: {X_train.shape}, Test size: {X_test.shape}")

# Define the deep learning model
print("Building the deep learning model...")
model = keras.Sequential([
    keras.layers.Dense(128, activation="relu", input_shape=(X_train.shape[1],)),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(32, activation="relu"),
    keras.layers.Dense(len(np.unique(y)), activation="softmax")  # Multi-class classification
])
print("Model built successfully!")

# Compile the model
print("Compiling model...")
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["sparse_categorical_accuracy"])
print("Model compiled successfully!")

# Train the model
print("Starting training...")
history = model.fit(X_train, y_train, epochs=20, batch_size=64, validation_data=(X_test, y_test))
print("Training completed!")

# Evaluate the model
print("Evaluating model on test data...")
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Save the model
print("Saving the trained model...")
model.save("siem_model.h5")
print("Model saved successfully as 'siem_model.h5'!")

# Final confirmation
print("All steps completed successfully! 🚀")


TensorFlow version: 2.18.0
Loading dataset in chunks...


ValueError: Usecols do not match columns, columns expected but not found: ['packets', 'bytes']

In [3]:
print(df['label'].value_counts())


label
0     1785725
4      347394
9      161323
2       95733
7        9531
3        8364
5        6860
10       5949
1        5508
6        5177
11       2734
13       1358
12         24
8          12
Name: count, dtype: int64


In [11]:
df_sample = pd.read_csv("network_data.csv", nrows=5)
print(df_sample.columns)


Index(['flow_id', 'timestamp', 'src_ip', 'src_port', 'dst_ip', 'dst_port',
       'protocol', 'duration', 'packets_count', 'fwd_packets_count',
       ...
       'bwd_packets_IAT_mean', 'bwd_packets_IAT_std', 'bwd_packets_IAT_max',
       'bwd_packets_IAT_min', 'bwd_packets_IAT_total', 'subflow_fwd_packets',
       'subflow_bwd_packets', 'subflow_fwd_bytes', 'subflow_bwd_bytes',
       'label'],
      dtype='object', length=122)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a simple model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Get feature importance
importances = rf.feature_importances_
feature_names = X.columns

# Sort features by importance
feature_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Display top features
print(feature_importance.head(30))  # Show top 30 most relevant features
