In [1]:
import os
import pandas as pd

# Directory that holds the CICIDS 2017 CSV exports.
folder_path = "./CICIDS_2017/"

# Per-file summary keyed by file name: column count, row count and header names.
file_info = {}

csv_names = [entry for entry in os.listdir(folder_path) if entry.endswith(".csv")]
for csv_name in csv_names:
    # Each file is read in full only to measure it; just the summary is kept.
    frame = pd.read_csv(os.path.join(folder_path, csv_name))
    n_rows, n_cols = frame.shape
    file_info[csv_name] = {
        "Number of Columns": n_cols,
        "Number of Rows": n_rows,
        "Columns": frame.columns.tolist(),
    }

# Report the collected summaries, one block per file.
for csv_name, summary in file_info.items():
    print(f"File: {csv_name}")
    print(f"  Number of Columns: {summary['Number of Columns']}")
    print(f"  Number of Rows: {summary['Number of Rows']}")
    print(f"  Columns: {summary['Columns']}\n")


File: Friday-WorkingHours-Morning.pcap_ISCX.csv
  Number of Columns: 79
  Number of Rows: 191033
  Columns: [' Destination Port', ' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets', 'Total Length of Fwd Packets', ' Total Length of Bwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', 'Bwd Packet Length Max', ' Bwd Packet Length Min', ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance

In [2]:
import os
import pandas as pd

# Merge every raw CSV found in `folder_path` (set in the first cell) into one
# dataset file.

# Files this notebook itself writes into the same folder. They must never be
# fed back into the merge, otherwise re-running the cell duplicates every row.
generated_files = {'dataset.csv', 'train.csv', 'val.csv', 'test.csv'}

# Sorted so the row order of the merged dataset does not depend on the
# arbitrary order in which os.listdir returns directory entries.
source_files = sorted(
    file for file in os.listdir(folder_path)
    if file.endswith('.csv')
    and file not in generated_files
    and os.path.isfile(os.path.join(folder_path, file))
)

# Fail with a clear message instead of silently writing an empty dataset.
if not source_files:
    raise FileNotFoundError(f"No source CSV files found in {folder_path}")

# Read all files first and concatenate once; calling pd.concat inside the loop
# re-copies the accumulated frame on every iteration.
frames = [pd.read_csv(os.path.join(folder_path, file)) for file in source_files]
combined_df = pd.concat(frames, ignore_index=True)

combined_df.to_csv('./CICIDS_2017/dataset.csv', index=False)


In [3]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# ---- Settings -------------------------------------------------------------
input_file = './CICIDS_2017/dataset.csv'
output_folder = './CICIDS_2017/'
train_ratio = 0.7                        # share of all rows used for training
val_test_ratio = 0.15 / (0.15 + 0.15)    # share of the held-out rows used for validation
random_seed = 42

# The destination folder is created up front if it is missing.
os.makedirs(output_folder, exist_ok=True)

# Stop early with a clear message when the merged dataset is absent.
if not os.path.exists(input_file):
    raise FileNotFoundError(f"Input file not found: {input_file}")

# ---- Load -----------------------------------------------------------------
data = pd.read_csv(input_file)
row_count, column_count = data.shape
print(f"Dataset loaded with {row_count} rows and {column_count} columns.")

# ---- Stage 1: carve off the training rows; the rest is held out -------------
holdout_share = 1 - train_ratio
train_data, remaining_data = train_test_split(
    data,
    test_size=holdout_share,
    random_state=random_seed,
    shuffle=True,
)

# ---- Stage 2: divide the held-out rows between validation and test ----------
test_share = 1 - val_test_ratio
val_data, test_data = train_test_split(
    remaining_data,
    test_size=test_share,
    random_state=random_seed,
    shuffle=True,
)

# ---- Report split sizes -----------------------------------------------------
print(f"Training set: {len(train_data)} rows")
print(f"Validation set: {len(val_data)} rows")
print(f"Test set: {len(test_data)} rows")

# ---- Persist each split next to the input -----------------------------------
train_file = os.path.join(output_folder, 'train.csv')
val_file = os.path.join(output_folder, 'val.csv')
test_file = os.path.join(output_folder, 'test.csv')

for split_frame, split_path in (
    (train_data, train_file),
    (val_data, val_file),
    (test_data, test_file),
):
    split_frame.to_csv(split_path, index=False)

print(f"Files saved to {output_folder}")


Dataset loaded with 2830743 rows and 79 columns.
Training set: 1981520 rows
Validation set: 424611 rows
Test set: 424612 rows
Files saved to ./CICIDS_2017/


In [1]:
import pandas as pd
import os

# Folder that contains the three split files.
base_folder = './CICIDS_2017/'

# Locations of the train / validation / test splits.
train_file, val_file, test_file = (
    os.path.join(base_folder, split_name)
    for split_name in ('train.csv', 'val.csv', 'test.csv')
)

# Read each split into its own DataFrame (train, then validation, then test).
train_df, val_df, test_df = (
    pd.read_csv(split_path) for split_path in (train_file, val_file, test_file)
)

# Strip everything except letters, digits and whitespace from a text label.
def clean_label(label):
    """Return `label` with every non-alphanumeric, non-whitespace character removed.

    Only string inputs are altered; any other value is handed back unchanged.
    Characters are dropped rather than replaced, so punctuation such as '-'
    or '/' disappears and whitespace on either side of a removed character is
    left as is (which can leave two consecutive spaces). Alphanumeric
    characters outside ASCII are kept, because the filter relies on
    `str.isalnum`.
    """
    if not isinstance(label, str):
        return label
    kept_chars = [ch for ch in label if ch.isalnum() or ch.isspace()]
    return ''.join(kept_chars)

# Apply the same clean-up to each split. Assigning to `.columns` and to the
# 'Label' column modifies the existing DataFrames in place.
for split_df in (train_df, val_df, test_df):
    # Header names: trim surrounding whitespace, then pass through clean_label.
    split_df.columns = split_df.columns.str.strip().map(clean_label)

    # Target values (the target column is assumed to be named 'Label').
    split_df['Label'] = split_df['Label'].map(clean_label)

    # Make sure every column name is a plain string.
    split_df.columns = split_df.columns.astype(str)

# Turn any cell value into text, trimming surrounding whitespace from strings.
def convert_to_str(value):
    """Return `value` as a string.

    Strings are returned with leading and trailing whitespace removed; every
    other value goes through `str()`, so for example `None` becomes 'None'
    and a float NaN becomes 'nan'.
    """
    return value.strip() if isinstance(value, str) else str(value)

# Apply convert_to_str to every cell of every split.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map
# and emits a FutureWarning; use the new method when this pandas version has
# it and fall back to applymap on older versions. The check is made on the
# class so a data column that happens to be called 'map' cannot be mistaken
# for the method.
elementwise_method = 'map' if hasattr(pd.DataFrame, 'map') else 'applymap'

train_df = getattr(train_df, elementwise_method)(convert_to_str)
val_df = getattr(val_df, elementwise_method)(convert_to_str)
test_df = getattr(test_df, elementwise_method)(convert_to_str)

# Overwrite the split files on disk with the cleaned versions (this always
# runs; the original, uncleaned split files are replaced).
train_df.to_csv(train_file, index=False)
val_df.to_csv(val_file, index=False)
test_df.to_csv(test_file, index=False)

# Show the distinct cleaned labels found in each split.
print("Unique labels in train data:", train_df['Label'].unique())
print("Unique labels in validation data:", val_df['Label'].unique())
print("Unique labels in test data:", test_df['Label'].unique())

print("All columns and features have been converted to string and spaces have been removed.")


  train_df = train_df.applymap(convert_to_str)
  val_df = val_df.applymap(convert_to_str)
  test_df = test_df.applymap(convert_to_str)


Unique labels in train data: ['BENIGN' 'DoS Hulk' 'PortScan' 'DDoS' 'DoS GoldenEye' 'Bot' 'FTPPatator'
 'SSHPatator' 'DoS Slowhttptest' 'DoS slowloris' 'Web Attack  Brute Force'
 'Web Attack  XSS' 'Infiltration' 'Web Attack  Sql Injection' 'Heartbleed']
Unique labels in validation data: ['BENIGN' 'DoS Hulk' 'DDoS' 'SSHPatator' 'PortScan' 'DoS GoldenEye'
 'FTPPatator' 'DoS slowloris' 'DoS Slowhttptest' 'Web Attack  XSS' 'Bot'
 'Web Attack  Brute Force' 'Web Attack  Sql Injection' 'Infiltration'
 'Heartbleed']
Unique labels in test data: ['BENIGN' 'DDoS' 'PortScan' 'DoS Hulk' 'SSHPatator' 'FTPPatator'
 'DoS GoldenEye' 'DoS slowloris' 'Web Attack  Brute Force' 'Bot'
 'DoS Slowhttptest' 'Web Attack  XSS' 'Web Attack  Sql Injection'
 'Infiltration' 'Heartbleed']
All columns and features have been converted to string and spaces have been removed.
