In [None]:
### MTF Image Generation with Debugging for CIC-IDS-2017 Dataset

# Install required libraries if not already installed
# !pip install scikit-learn pyts matplotlib pandas

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from pyts.image import MarkovTransitionField

# Parameters
input_folder = './dataset/CIC-IDS-2017/Unzip/MachineLearningCVE/'  # Folder containing the CSV files
output_folder = './output/mtf_images/'  # Root folder that receives the generated PNG images
n_bins = 8  # Number of bins for MTF quantization
max_flows_per_file = None  # Set to an integer for quick testing, None for full run

# MTF transformer (shared by every flow; quantile binning is computed per sample)
mtf = MarkovTransitionField(n_bins=n_bins, strategy='quantile')

# Validate the input location first so that a wrong path leaves no side
# effects behind (the output folder is only created once the input is usable).
# isdir() is used instead of exists() because os.listdir() below needs a
# directory; a regular file at this path would otherwise raise later.
if not os.path.isdir(input_folder):
    print(f"❌ Input folder {input_folder} not found. Please check the path.")
    # SystemExit is raised directly: the exit() helper is injected by the
    # `site` module for interactive use and is not meant for programs.
    raise SystemExit(1)

# Create output directories
os.makedirs(output_folder, exist_ok=True)

# List all CSV files (sorted, because os.listdir() returns entries in
# arbitrary order and a stable order makes runs reproducible)
csv_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.csv'))
print(f"🔎 Found {len(csv_files)} CSV files in {input_folder}")

if not csv_files:
    print("⚠️ No CSV files found! Exiting...")
    raise SystemExit(1)

# Process each CSV: load it, clean it, scale the numeric features and write
# one MTF image per flow into both a per-day and a per-attack-type folder tree.
for csv_file in csv_files:
    print(f"\n📄 Processing {csv_file}...")

    csv_path = os.path.join(input_folder, csv_file)

    # Read CSV (comma first; fall back to semicolon when everything collapsed
    # into a single column, which indicates the wrong separator was used)
    try:
        df = pd.read_csv(csv_path, sep=',', engine='python')
        if df.shape[1] <= 1:
            print("⚠️ Only 1 column detected! Trying with semicolon separator...")
            df = pd.read_csv(csv_path, sep=';', engine='python')
    except Exception as e:
        # Best effort: a broken file must not abort the remaining files.
        print(f"❌ Error reading {csv_file}: {e}")
        continue

    print(f"✅ Loaded {csv_file} with shape {df.shape}")
    print(df.head(2))
    # Column headers carry leading spaces (visible in the head() output),
    # so normalise them before looking up 'Label'.
    df.columns = df.columns.str.strip()

    # Drop missing values
    df = df.dropna()
    print(f"🔹 After dropping NA: {df.shape}")

    # Check if Label column exists
    label_col = 'Label'
    if label_col not in df.columns:
        print(f"⚠️ Label column '{label_col}' not found in {csv_file}, skipping.")
        continue

    features = df.select_dtypes(include=[np.number])

    print(f"🔹 Features shape: {features.shape}")
    print(f"🔹 Unique labels: {np.unique(df[label_col].values)}")

    if features.shape[0] == 0 or features.shape[1] == 0:
        print(f"⚠️ No numeric features found in {csv_file}, skipping.")
        continue

    # Handle infinities and NaNs. The SAME row mask is applied to the labels:
    # dropping rows from the features only would shift every later label onto
    # the wrong flow and file images under the wrong class folder.
    features = features.replace([np.inf, -np.inf], np.nan)
    valid_rows = features.notna().all(axis=1)
    features = features.loc[valid_rows]
    labels = df.loc[valid_rows, label_col].values

    if features.shape[0] == 0:
        # MinMaxScaler cannot be fitted on zero samples.
        print(f"⚠️ No finite rows left in {csv_file} after removing inf/NaN, skipping.")
        continue

    # Normalize features (each column scaled to [0, 1] within this file)
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features)

    # Create day-specific and attack-specific folders
    day_name = os.path.splitext(csv_file)[0]
    day_folder = os.path.join(output_folder, 'by_day', day_name)
    attack_folder = os.path.join(output_folder, 'by_attack_type')
    os.makedirs(day_folder, exist_ok=True)
    os.makedirs(attack_folder, exist_ok=True)

    print("🚀 Starting image generation...")

    # Generate MTF images
    saved = 0  # number of flows actually written (failed flows are not counted)
    for idx, (flow, label) in enumerate(zip(features_scaled, labels)):
        # Enforce the limit BEFORE doing any work so that exactly
        # max_flows_per_file flows are handled (None or 0 means no limit).
        if max_flows_per_file and idx >= max_flows_per_file:
            print(f"🛑 Max flow limit {max_flows_per_file} reached for {csv_file}.")
            break

        try:
            # The transformer expects (n_samples, n_timestamps); one flow's
            # feature vector is treated as a single series.
            # NOTE: pyts may warn "Some quantiles are equal" (seen in this
            # notebook's output) when a flow has many identical values.
            flow_2d = flow.reshape(1, -1)
            mtf_image = mtf.fit_transform(flow_2d)[0]

            # Save to day-specific folder
            day_label_folder = os.path.join(day_folder, str(label))
            os.makedirs(day_label_folder, exist_ok=True)
            day_filename = f"flow_{idx}.png"
            day_filepath = os.path.join(day_label_folder, day_filename)
            plt.imsave(day_filepath, mtf_image, cmap='gray')

            # Save to attack-type folder (file name is prefixed with the day
            # so flows from different CSVs cannot overwrite each other)
            attack_label_folder = os.path.join(attack_folder, str(label))
            os.makedirs(attack_label_folder, exist_ok=True)
            attack_filename = f"{day_name}_flow_{idx}.png"
            attack_filepath = os.path.join(attack_label_folder, attack_filename)
            plt.imsave(attack_filepath, mtf_image, cmap='gray')

            saved += 1
            if saved % 500 == 0:
                print(f"✅ Saved {saved} images so far for {csv_file}...")

        except Exception as e:
            # Best effort: report the failing flow and keep going.
            print(f"❌ Error processing flow {idx}: {e}")
            continue

print("\n🎉 All CSV files processed. MTF image generation complete.")


🔎 Found 8 CSV files in ./dataset/CIC-IDS-2017/Unzip/MachineLearningCVE/

📄 Processing Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv...
✅ Loaded Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv with shape (225745, 79)
    Destination Port   Flow Duration   Total Fwd Packets  \
0              54865               3                   2   
1              55054             109                   1   

    Total Backward Packets  Total Length of Fwd Packets  \
0                        0                           12   
1                        1                            6   

    Total Length of Bwd Packets   Fwd Packet Length Max  \
0                             0                       6   
1                             6                       6   

    Fwd Packet Length Min   Fwd Packet Length Mean   Fwd Packet Length Std  \
0                       6                      6.0                     0.0   
1                       6                      6.0                     0.0   

   ...  

  warn("Some quantiles are equal. The number of bins will "
