In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from pyts.image import GramianAngularField

# Parameters
input_folder = './data'
output_folder = './output/gaf_images/'
n_components = 30  # PCA reduced dimensions
gaf_size = 32       # GAF image size
# max_flows_per_file = 5000  # Change or set to None for full run

# Initialize transformers
scaler = MinMaxScaler()
pca = PCA(n_components=n_components)
gaf = GramianAngularField(method='summation', image_size=gaf_size)

# Create output directory
os.makedirs(output_folder, exist_ok=True)

# Process each CSV file
csv_files = [f for f in os.listdir(input_folder) if f.endswith('.csv')]
# for csv_file in csv_files:
#     print(f"\n📄 Processing {csv_file}...")
#     df_path = os.path.join(input_folder, csv_file)
#     df = pd.read_csv(df_path)
#     df = df.dropna()
#     print("Unique labels found:", df[' Label'].unique())
#     labels = df[' Label']
#     features = df.replace([np.inf, -np.inf], np.nan).dropna()
#     print("Unique labels found:", labels.unique())


In [2]:
for csv_file in csv_files:
    print(f"\n📄 Processing {csv_file}...")
    df_path = os.path.join(input_folder, csv_file)
    df = pd.read_csv(df_path)
    df.columns = df.columns.str.strip()

    if 'Label' not in df.columns:
        print(f"⚠️ Skipping {csv_file} — 'Label' column not found.")
        continue

    df = df.dropna()
    labels = df['Label'].values
    features = df.select_dtypes(include=[np.number])

    # Remove inf or very large values
    features = features.replace([np.inf, -np.inf], np.nan).dropna()
    features_scaled = scaler.fit_transform(features)
    features_reduced = pca.fit_transform(features_scaled)

    # Prepare output folders
    day_name = os.path.splitext(csv_file)[0]
    day_folder = os.path.join(output_folder, 'by_day', day_name)
    attack_folder = os.path.join(output_folder, 'by_attack_type')
    os.makedirs(day_folder, exist_ok=True)
    os.makedirs(attack_folder, exist_ok=True)

    print(f"Generating GAF images for up to all flows...")

    for idx, (flow, label) in enumerate(zip(features_reduced, labels)):
        try:
            flow_2d = flow.reshape(1, -1)
            gaf.image_size = min(gaf.image_size, flow_2d.shape[1])
            gaf_image = gaf.fit_transform(flow_2d)[0]


            # Save by day
            day_label_folder = os.path.join(day_folder, str(label))
            os.makedirs(day_label_folder, exist_ok=True)
            gaf_day_path = os.path.join(day_label_folder, f"flow_{idx}.png")
            plt.imsave(gaf_day_path, gaf_image, cmap='gray')

            # Save by attack type
            attack_label_folder = os.path.join(attack_folder, str(label))
            os.makedirs(attack_label_folder, exist_ok=True)
            gaf_attack_path = os.path.join(attack_label_folder, f"{day_name}_flow_{idx}.png")
            plt.imsave(gaf_attack_path, gaf_image, cmap='gray')

            # if max_flows_per_file and idx >= max_flows_per_file:
            #     break

        except Exception as e:
            print(f"⚠️ Error on flow {idx}: {e}")
            continue

print("\n✅ GAF image generation complete.")


📄 Processing Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv...
Generating GAF images for up to all flows...

📄 Processing Monday-WorkingHours.pcap_ISCX.csv...
Generating GAF images for up to all flows...

📄 Processing Friday-WorkingHours-Morning.pcap_ISCX.csv...
Generating GAF images for up to all flows...

📄 Processing Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv...
Generating GAF images for up to all flows...

📄 Processing Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv...
Generating GAF images for up to all flows...

📄 Processing Tuesday-WorkingHours.pcap_ISCX.csv...
Generating GAF images for up to all flows...

📄 Processing Wednesday-workingHours.pcap_ISCX.csv...
Generating GAF images for up to all flows...

📄 Processing Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv...
Generating GAF images for up to all flows...

✅ GAF image generation complete.
