In [3]:
import os
import pandas as pd

# Set correct path to your folder in Google Drive
folder_path = 'cicids2017'

# List all CSV files in that folder
csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]
print("Found CSV files:", csv_files)

# Read and combine all CSV files into a single DataFrame
df_list = [pd.read_csv(os.path.join(folder_path, file), low_memory=False) for file in csv_files]
cicids_df = pd.concat(df_list, ignore_index=True)

print("Shape of combined dataset:", cicids_df.shape)
cicids_df.head()


Found CSV files: ['Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv', 'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv', 'Friday-WorkingHours-Morning.pcap_ISCX.csv', 'Monday-WorkingHours.pcap_ISCX.csv', 'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv', 'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv', 'Tuesday-WorkingHours.pcap_ISCX.csv', 'Wednesday-workingHours.pcap_ISCX.csv']
Shape of combined dataset: (2830743, 79)


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [6]:
print(cicids_df.columns.tolist())
cicids_df.columns = cicids_df.columns.str.strip()


['Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'CWE Flag Count', 'ECE Flag Count

In [7]:
# Basic info
cicids_df.info()

# Check for missing values
cicids_df.isnull().sum().sort_values(ascending=False).head(20)

# Class distribution (attack types)
print("Attack Types:\n", cicids_df['Label'].value_counts())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2830743 entries, 0 to 2830742
Data columns (total 79 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Destination Port             int64  
 1   Flow Duration                int64  
 2   Total Fwd Packets            int64  
 3   Total Backward Packets       int64  
 4   Total Length of Fwd Packets  int64  
 5   Total Length of Bwd Packets  int64  
 6   Fwd Packet Length Max        int64  
 7   Fwd Packet Length Min        int64  
 8   Fwd Packet Length Mean       float64
 9   Fwd Packet Length Std        float64
 10  Bwd Packet Length Max        int64  
 11  Bwd Packet Length Min        int64  
 12  Bwd Packet Length Mean       float64
 13  Bwd Packet Length Std        float64
 14  Flow Bytes/s                 float64
 15  Flow Packets/s               float64
 16  Flow IAT Mean                float64
 17  Flow IAT Std                 float64
 18  Flow IAT Max                 int64  
 19  

In [8]:
# Create a binary label: 'BENIGN' → 0, all attacks → 1
cicids_df['Attack'] = cicids_df['Label'].apply(lambda x: 0 if x == 'BENIGN' else 1)
cicids_df['Attack'].value_counts()


Attack
0    2273097
1     557646
Name: count, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
import numpy as np

# Drop 'Label' and other irrelevant columns if needed
X = cicids_df.drop(['Label', 'Attack'], axis=1)
y = cicids_df['Attack']

# Fill NaN and Infinite values
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(0)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [10]:
# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)

# Predictions
y_pred = rf.predict(X_test_scaled)


In [11]:
# Evaluation metrics
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Confusion Matrix:
 [[454312    308]
 [   219 111310]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    454620
           1       1.00      1.00      1.00    111529

    accuracy                           1.00    566149
   macro avg       1.00      1.00      1.00    566149
weighted avg       1.00      1.00      1.00    566149



In [12]:
import joblib

rf = joblib.load("random_forest_model.pkl")
scaler = joblib.load("scaler.save")
feature_columns = joblib.load("feature_columns.save")


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [24]:
def prepare_input_for_model(csv_path, feature_columns, scaler):
    import pandas as pd

    df = pd.read_csv(csv_path)

    # One-hot encode the categorical columns
    df_encoded = pd.get_dummies(df)

    # Match features with training data
    X_input = df_encoded.reindex(columns=feature_columns, fill_value=0)

    print("Final shape of X_input:", X_input.shape)

    if X_input.empty:
        raise ValueError("Input features DataFrame is empty after reindexing!")

    # Scale using trained scaler
    X_scaled = scaler.transform(X_input)
    return X_scaled


In [26]:
X_new = prepare_input_for_model("output.csv", feature_columns, scaler)
predictions = rf.predict(X_new)
print(predictions)


Final shape of X_input: (2539, 78)
[0 0 0 ... 0 0 0]


In [28]:
print(["Attack" if p == 1 else "Normal" for p in predictions])


['Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal',

In [27]:
import pandas as pd

df = pd.read_csv("output.csv")
print("Shape of output.csv:", df.shape)
print(df.head())


Shape of output.csv: (2539, 6)
   frame.time_epoch         ip.src           ip.dst  ip.proto  \
0      1.744448e+09   192.168.1.13       172.67.6.3       6.0   
1      1.744448e+09   192.168.1.13  142.250.192.131       6.0   
2      1.744448e+09   192.168.1.13  142.250.183.195       6.0   
3      1.744448e+09   192.168.1.13    104.22.27.181       6.0   
4      1.744448e+09  104.22.27.181     192.168.1.13       6.0   

  _ws.col.protocol  frame.len  
0              SSL         55  
1              SSL         55  
2              SSL         55  
3              SSL         55  
4              TCP         66  


In [2]:
import tensorflow
from tensorflow.keras.models import load_model

model = load_model("lstm_intrusion_model.h5")

ModuleNotFoundError: No module named 'tensorflow'