In [None]:
!pip install scapy

Collecting scapy
  Downloading scapy-2.6.1-py3-none-any.whl.metadata (5.6 kB)
Downloading scapy-2.6.1-py3-none-any.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scapy
Successfully installed scapy-2.6.1


In [None]:
# --- Alpha (α) Component Extraction Script ---
# This script reads the 10,284 filtered .pcap files, extracts the
# first 128 packet sizes (with direction), and then trains an
# LSTM autoencoder to generate the 32-dimensional alpha (α) features.

print("--- Initializing Alpha (α) v2 Component Script ---")

# --- Step 0: Install necessary libraries ---
# Scapy is required for reading .pcap files
try:
    import scapy.all as scapy
except ImportError:
    print("Installing scapy...")
    # Use 'pip install' in a Colab cell, not subprocess
    # For this script, we'll assume it's run after !pip install scapy
    print("Please run '!pip install scapy' in a Colab cell and restart the runtime.")
    # In a notebook, run: !pip install scapy

import os
import collections
import time
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from scapy.all import rdpcap, IP

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# TensorFlow and Keras for the autoencoder
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, RepeatVector, TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping

print("All libraries imported successfully.")

# --- PART 1: Configuration & Labeling Map ---

# --- File & Path Configuration ---
# Directory containing the 10,284 .pcap files
FLOW_DIR = "/content/drive/MyDrive/1 Skripsi/Dataset/ISCX-VPN-NonVPN-2016/v2-final_flows"
# Output CSV file
OUTPUT_CSV = "/content/drive/MyDrive/1 Skripsi/alpha_component_v2.csv"
# We'll also save the trained encoder model
ENCODER_MODEL_SAVE_PATH = "/content/drive/MyDrive/1 Skripsi/alpha_encoder_v2.h5"


# --- Feature Extraction Configuration ---
N_PACKETS = 128     # First N packets to use
MAX_PACKET_SIZE = 1500.0 # For normalization (maps to -1.0 to 1.0)

# --- Autoencoder Configuration ---
LATENT_DIM = 32     # Dimensions of the final alpha vector (α)
EPOCHS = 50
BATCH_SIZE = 64
VALIDATION_SPLIT = 0.2

# --- Labeling Map (Copied from your script) ---
KEYWORD_MAP = collections.OrderedDict([
    ('facebook_chat', ('Facebook', 'Chat')),
    ('facebookchat', ('Facebook', 'Chat')),
    ('hangouts_chat', ('Hangout', 'Chat')),
    ('hangout_chat', ('Hangout', 'Chat')),
    ('gmailchat', ('Gmail', 'Chat')),
    ('icq_chat', ('ICQ', 'Chat')),
    ('icqchat', ('ICQ', 'Chat')),
    ('skype_chat', ('Skype', 'Chat')),
    ('aim_chat', ('AIM Chat', 'Chat')),
    ('aimchat', ('AIM Chat', 'Chat')),

    ('facebook_audio', ('Facebook', 'VoIP')),
    ('hangouts_audio', ('Hangout', 'VoIP')),
    ('skype_audio', ('Skype', 'VoIP')),
    ('voipbuster', ('VOIPBuster', 'VoIP')),
    ('facebook_video', ('Facebook', 'VoIP')),
    ('hangouts_video', ('Hangout', 'VoIP')),
    ('skype_video', ('Skype', 'VoIP')),

    ('skype_file', ('Skype', 'File Transfer')),
    ('ftps', ('FTP', 'File Transfer')),
    ('sftp', ('SFTP', 'File Transfer')),
    ('scp', ('SCP', 'File Transfer')),
    ('ftp', ('FTP', 'File Transfer')),

    ('email', ('Email', 'Email')),
    ('gmail', ('Gmail', 'Email')),

    ('netflix', ('Netflix', 'Streaming')),
    ('spotify', ('Spotify', 'Streaming')),
    ('vimeo', ('Vimeo', 'Streaming')),
    ('youtube', ('YouTube', 'Streaming')),

    ('bittorrent', ('BitTorrent', 'P2P')),
])

# --- List of the 6 applications we are using ---
# This is used to sanity-check the labeling
TARGET_APPS = {
    'Skype', 'Email', 'SCP', 'VOIPBuster', 'YouTube', 'BitTorrent'
}


def get_flow_labels(filename):
    """
    Parses a filename to get its labels (application, category, binary_type).
    """
    lower_filename = filename.lower()

    # 1. Determine Binary Type
    binary_type = 'VPN' if lower_filename.startswith('vpn_') else 'NonVPN'

    # 2. Determine Application and Category
    for keyword, (application, category) in KEYWORD_MAP.items():
        if keyword in lower_filename:
            # Sanity check if the app is one we expect
            if application not in TARGET_APPS:
                # This should not happen if the filtering script was correct
                print(f"Warning: Found app '{application}' not in TARGET_APPS.")

            return application, category, binary_type

    # Fallback in case no keyword matches (e.g., FTP files named just 'ftp_...pcap')
    # This logic is based on our 6 target apps
    if 'scp' in lower_filename:
        return 'SCP', 'File Transfer', binary_type
    if 'email' in lower_filename:
        return 'Email', 'Email', binary_type
    if 'youtube' in lower_filename:
        return 'YouTube', 'Streaming', binary_type
    if 'bittorrent' in lower_filename:
        return 'BitTorrent', 'P2P', binary_type

    # If we get here, it's an unclassified file
    return None, None, None

def process_pcap_file(filename, base_dir):
    """
    Reads a single .pcap file and extracts its packet sequence and labels.
    This function is designed to be run in parallel.
    """
    filepath = os.path.join(base_dir, filename)

    # 1. Get labels
    application, category, binary_type = get_flow_labels(filename)
    if application is None:
        print(f"Skipping file (label not found): {filename}")
        return None

    # 2. Initialize packet sequence
    # We use a list and then convert to numpy array
    packet_sequence = []

    try:
        packets = rdpcap(filepath)

        # Find the client IP (source IP of the first IP packet)
        client_ip = None
        for pkt in packets:
            if IP in pkt:
                client_ip = pkt[IP].src
                break

        if client_ip is None:
            # print(f"Skipping file (no IP packets found): {filename}")
            return None # Skip non-IP flows

        # 3. Extract packet sizes with direction
        for pkt in packets:
            if IP in pkt:
                packet_size = pkt[IP].len

                # Client-to-Server = positive
                if pkt[IP].src == client_ip:
                    packet_sequence.append(packet_size)
                # Server-to-Client = negative
                elif pkt[IP].dst == client_ip:
                    packet_sequence.append(-packet_size)

            if len(packet_sequence) >= N_PACKETS:
                break # We only want the first N packets

    except Exception as e:
        # This catches corrupted or unreadable files
        # print(f"Skipping file (error reading pcap): {filename}, Error: {e}")
        return None

    if len(packet_sequence) == 0:
        # print(f"Skipping file (empty sequence): {filename}")
        return None

    # 4. Pad or Truncate the sequence
    final_sequence = np.zeros(N_PACKETS)
    if len(packet_sequence) >= N_PACKETS:
        final_sequence = np.array(packet_sequence[:N_PACKETS])
    else:
        final_sequence[:len(packet_sequence)] = np.array(packet_sequence)

    # 5. Normalize the sequence
    # Divide by 1500 to scale data between -1.0 and 1.0
    # This keeps 0.0 as 0.0 (padding)
    normalized_sequence = final_sequence / MAX_PACKET_SIZE

    # Return all data as a dictionary
    return {
        'filename': filename,
        'application': application,
        'category': category,
        'binary_type': binary_type,
        'sequence': normalized_sequence
    }

def build_autoencoder(n_packets, latent_dim):
    """
    Builds the LSTM Autoencoder model.
    """
    # Input shape is (timesteps, features) -> (128, 1)
    input_shape = (n_packets, 1)

    # --- Encoder ---
    inputs = Input(shape=input_shape)
    # Using 64 units as an intermediate layer
    x = LSTM(64, activation='relu', return_sequences=True)(inputs)
    # The 'encoder_output' layer is our 32-dim latent vector
    encoder_output = LSTM(latent_dim, activation='relu', name='encoder_output')(x)

    # --- Decoder ---
    # Repeat the latent vector for each timestep
    x = RepeatVector(n_packets)(encoder_output)
    x = LSTM(64, activation='relu', return_sequences=True)(x)
    # Reconstruct the original (128, 1) shape
    decoder_output = TimeDistributed(Dense(1))(x)

    # --- Autoencoder Model ---
    autoencoder = Model(inputs=inputs, outputs=decoder_output)
    autoencoder.compile(optimizer='adam', loss='mse')

    # --- Encoder-Only Model ---
    # This is the model we'll use to generate features
    encoder = Model(inputs=inputs, outputs=encoder_output)

    return autoencoder, encoder

# --- PART 2: Main Execution ---
def main():
    print(f"\n--- PART 1: Extracting Packet Sequences ---")
    print(f"Reading from: {FLOW_DIR}")
    print(f"Using N_PACKETS = {N_PACKETS} and LATENT_DIM = {LATENT_DIM}")

    if not os.path.isdir(FLOW_DIR):
        print(f"FATAL: Source directory not found. Please check the path.")
        return

    filenames = os.listdir(FLOW_DIR)
    # Filter out non-pcap files just in case
    pcap_files = [f for f in filenames if f.endswith('.pcap') or f.endswith('.pcapng')]
    print(f"Found {len(pcap_files)} .pcap files in the directory.")

    start_time = time.time()

    # Use joblib to process files in parallel
    # n_jobs=-1 uses all available CPU cores
    print("Processing files in parallel... (This may take several minutes)")
    results = Parallel(n_jobs=-1, verbose=5)(
        delayed(process_pcap_file)(f, FLOW_DIR) for f in pcap_files
    )

    end_time = time.time()
    print(f"File processing finished in {end_time - start_time:.2f} seconds.")

    # Filter out 'None' results from skipped/empty files
    valid_results = [r for r in results if r is not None]

    if not valid_results:
        print("FATAL: No valid data was extracted. Stopping script.")
        return

    print(f"Successfully processed {len(valid_results)} files.")
    print(f"Skipped {len(pcap_files) - len(valid_results)} empty/corrupted/unlabeled files.")

    # Create a DataFrame from the results
    df = pd.DataFrame(valid_results)

    # --- PART 2: Preparing Data for Autoencoder ---
    print("\n--- PART 2: Preparing Data for Autoencoder ---")

    # Extract the sequences and reshape for LSTM: (samples, timesteps, features)
    sequences = np.array(df['sequence'].tolist())
    # Reshape from (n_samples, 128) to (n_samples, 128, 1)
    X = sequences.reshape((sequences.shape[0], sequences.shape[1], 1))

    print(f"Data shape for autoencoder: {X.shape}")

    # Split into train/test for autoencoder training
    # We use the full dataset (X) to train the autoencoder
    X_train, X_test = train_test_split(X, test_size=VALIDATION_SPLIT, random_state=42)

    print(f"Training data shape: {X_train.shape}")
    print(f"Validation data shape: {X_test.shape}")

    # --- PART 3: Building & Training Autoencoder ---
    print("\n--- PART 3: Building & Training Autoencoder ---")

    autoencoder, encoder = build_autoencoder(N_PACKETS, LATENT_DIM)
    autoencoder.summary()

    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    print("Training autoencoder...")
    history = autoencoder.fit(
        X_train, X_train, # Autoencoder learns to reconstruct itself
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_data=(X_test, X_test),
        callbacks=[early_stopping],
        verbose=1
    )

    print("Autoencoder training complete.")

    # --- PART 4: Generating Alpha (α) Features ---
    print("\n--- PART 4: Generating Alpha (α) Features ---")

    # Use the trained 'encoder' model to transform ALL sequences (not just X_train)
    # into the 32-dimensional latent space
    print(f"Generating {LATENT_DIM}-dimensional features for all {X.shape[0]} samples...")
    alpha_features = encoder.predict(X)

    print(f"Generated alpha features with shape: {alpha_features.shape}")

    # --- PART 5: Saving Final Dataset ---
    print("\n--- PART 5: Saving Final Dataset ---")

    # Create column names for the alpha features
    alpha_cols = [f'alpha_{i}' for i in range(LATENT_DIM)]

    # Create a new DataFrame for the features
    df_alpha = pd.DataFrame(alpha_features, columns=alpha_cols, index=df.index)

    # Concatenate the original labels with the new alpha features
    df_final = pd.concat([
        df[['filename', 'application', 'category', 'binary_type']],
        df_alpha
    ], axis=1)

    # Save the final CSV
    try:
        df_final.to_csv(OUTPUT_CSV, index=False)
        print(f"Successfully saved final alpha component (v2) to:")
        print(OUTPUT_CSV)
    except Exception as e:
        print(f"Error saving final CSV: {e}")

    # Save the encoder model for later use (optional)
    try:
        encoder.save(ENCODER_MODEL_SAVE_PATH)
        print(f"Successfully saved encoder model to:")
        print(ENCODER_MODEL_SAVE_PATH)
    except Exception as e:
        print(f"Error saving encoder model: {e}")

if __name__ == "__main__":
    # Ensure you have mounted your Google Drive
    if not os.path.exists("/content/drive/MyDrive"):
        print("Please mount your Google Drive first!")
        print("from google.colab import drive; drive.mount('/content/drive')")
    else:
        # Run the main process
        main()

print("\n--- Alpha (α) v2 Script Finished ---")


--- Initializing Alpha (α) v2 Component Script ---
All libraries imported successfully.

--- PART 1: Extracting Packet Sequences ---
Reading from: /content/drive/MyDrive/1 Skripsi/Dataset/ISCX-VPN-NonVPN-2016/v2-final_flows
Using N_PACKETS = 128 and LATENT_DIM = 32
Found 10284 .pcap files in the directory.
Processing files in parallel... (This may take several minutes)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1085 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 1628 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 3040 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 3708 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 5404 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 7541 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 9786 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 10284 out of 10284 | elapsed:  8.2min finished


File processing finished in 489.18 seconds.
Successfully processed 10105 files.
Skipped 179 empty/corrupted/unlabeled files.

--- PART 2: Preparing Data for Autoencoder ---
Data shape for autoencoder: (10105, 128, 1)
Training data shape: (8084, 128, 1)
Validation data shape: (2021, 128, 1)

--- PART 3: Building & Training Autoencoder ---


Training autoencoder...
Epoch 1/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 206ms/step - loss: nan - val_loss: nan
Epoch 2/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 34ms/step - loss: nan - val_loss: nan
Epoch 3/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 34ms/step - loss: nan - val_loss: nan
Epoch 4/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 34ms/step - loss: nan - val_loss: nan
Epoch 5/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 34ms/step - loss: nan - val_loss: nan
Epoch 6/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 34ms/step - loss: nan - val_loss: nan
Autoencoder training complete.

--- PART 4: Generating Alpha (α) Features ---
Generating 32-dimensional features for all 10105 samples...
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step
Generated alpha features with shape: (10105, 32)

--- PART 5: Saving Fi



Successfully saved final alpha component (v2) to:
/content/drive/MyDrive/1 Skripsi/alpha_component_v2.csv
Successfully saved encoder model to:
/content/drive/MyDrive/1 Skripsi/alpha_encoder_v2.h5

--- Alpha (α) v2 Script Finished ---


In [None]:
import pandas as pd

# Path to the generated CSV file
OUTPUT_CSV = "/content/drive/MyDrive/1 Skripsi/alpha_component_v2.csv"

# Read the CSV file into a pandas DataFrame
try:
    df_alpha_component = pd.read_csv(OUTPUT_CSV)
    # Display the head of the DataFrame
    display(df_alpha_component.head())
except FileNotFoundError:
    print(f"Error: The file {OUTPUT_CSV} was not found.")
except Exception as e:
    print(f"An error occurred while reading the CSV file: {e}")

Unnamed: 0,filename,application,category,binary_type,alpha_0,alpha_1,alpha_2,alpha_3,alpha_4,alpha_5,...,alpha_22,alpha_23,alpha_24,alpha_25,alpha_26,alpha_27,alpha_28,alpha_29,alpha_30,alpha_31
0,vpn_skype_files1b.pcap.UDP_10-8-8-130_49539_21...,Skype,File Transfer,VPN,,,,,,,...,,,,,,,,,,
1,vpn_skype_files1b.pcap.UDP_10-8-8-130_49539_21...,Skype,File Transfer,VPN,,,,,,,...,,,,,,,,,,
2,vpn_skype_files1b.pcap.UDP_10-8-8-130_49539_21...,Skype,File Transfer,VPN,,,,,,,...,,,,,,,,,,
3,vpn_skype_files1b.pcap.UDP_10-8-8-130_49539_21...,Skype,File Transfer,VPN,,,,,,,...,,,,,,,,,,
4,vpn_skype_files1b.pcap.UDP_10-8-8-130_49539_21...,Skype,File Transfer,VPN,,,,,,,...,,,,,,,,,,


In [None]:
# --- Alpha (α) Component Extraction Script ---
# This script reads the 10,284 filtered .pcap files, extracts the
# first 128 packet sizes (with direction), and then trains an
# LSTM autoencoder to generate the 32-dimensional alpha (α) features.

# --- v2-FIXED ---
# 1. Added np.clip() to the data processing to prevent exploding gradients.
# 2. Changed LSTM activations from 'relu' to 'tanh' for better stability.
# ------------------

print("--- Initializing Alpha (α) v2 Component Script (FIXED) ---")

# --- Step 0: Install necessary libraries ---
# Scapy is required for reading .pcap files
try:
    import scapy.all as scapy
except ImportError:
    print("Installing scapy...")
    # Use 'pip install' in a Colab cell, not subprocess
    # For this script, we'll assume it's run after !pip install scapy
    print("Please run '!pip install scapy' in a Colab cell and restart the runtime.")
    # In a notebook, run: !pip install scapy

import os
import collections
import time
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from scapy.all import rdpcap, IP

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# TensorFlow and Keras for the autoencoder
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, RepeatVector, TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping

print("All libraries imported successfully.")

# --- PART 1: Configuration & Labeling Map ---

# --- File & Path Configuration ---
# Directory containing the 10,284 .pcap files
FLOW_DIR = "/content/drive/MyDrive/1 Skripsi/Dataset/ISCX-VPN-NonVPN-2016/v2-final_flows"
# Output CSV file
OUTPUT_CSV = "/content/drive/MyDrive/1 Skripsi/alpha_component_v2.csv"
# We'll also save the trained encoder model
ENCODER_MODEL_SAVE_PATH = "/content/drive/MyDrive/1 Skripsi/alpha_encoder_v2.h5"


# --- Feature Extraction Configuration ---
N_PACKETS = 128     # First N packets to use
MAX_PACKET_SIZE = 1500.0 # For normalization (maps to -1.0 to 1.0)

# --- Autoencoder Configuration ---
LATENT_DIM = 32     # Dimensions of the final alpha vector (α)
EPOCHS = 50
BATCH_SIZE = 64
VALIDATION_SPLIT = 0.2

# --- Labeling Map (Copied from your script) ---
KEYWORD_MAP = collections.OrderedDict([
    ('facebook_chat', ('Facebook', 'Chat')),
    ('facebookchat', ('Facebook', 'Chat')),
    ('hangouts_chat', ('Hangout', 'Chat')),
    ('hangout_chat', ('Hangout', 'Chat')),
    ('gmailchat', ('Gmail', 'Chat')),
    ('icq_chat', ('ICQ', 'Chat')),
    ('icqchat', ('ICQ', 'Chat')),
    ('skype_chat', ('Skype', 'Chat')),
    ('aim_chat', ('AIM Chat', 'Chat')),
    ('aimchat', ('AIM Chat', 'Chat')),

    ('facebook_audio', ('Facebook', 'VoIP')),
    ('hangouts_audio', ('Hangout', 'VoIP')),
    ('skype_audio', ('Skype', 'VoIP')),
    ('voipbuster', ('VOIPBuster', 'VoIP')),
    ('facebook_video', ('Facebook', 'VoIP')),
    ('hangouts_video', ('Hangout', 'VoIP')),
    ('skype_video', ('Skype', 'VoIP')),

    ('skype_file', ('Skype', 'File Transfer')),
    ('ftps', ('FTP', 'File Transfer')),
    ('sftp', ('SFTP', 'File Transfer')),
    ('scp', ('SCP', 'File Transfer')),
    ('ftp', ('FTP', 'File Transfer')),

    ('email', ('Email', 'Email')),
    ('gmail', ('Gmail', 'Email')),

    ('netflix', ('Netflix', 'Streaming')),
    ('spotify', ('Spotify', 'Streaming')),
    ('vimeo', ('Vimeo', 'Streaming')),
    ('youtube', ('YouTube', 'Streaming')),

    ('bittorrent', ('BitTorrent', 'P2P')),
])

# --- List of the 6 applications we are using ---
# This is used to sanity-check the labeling
TARGET_APPS = {
    'Skype', 'Email', 'SCP', 'VOIPBuster', 'YouTube', 'BitTorrent'
}


def get_flow_labels(filename):
    """
    Parses a filename to get its labels (application, category, binary_type).
    """
    lower_filename = filename.lower()

    # 1. Determine Binary Type
    binary_type = 'VPN' if lower_filename.startswith('vpn_') else 'NonVPN'

    # 2. Determine Application and Category
    for keyword, (application, category) in KEYWORD_MAP.items():
        if keyword in lower_filename:
            # Sanity check if the app is one we expect
            if application not in TARGET_APPS:
                # This should not happen if the filtering script was correct
                # We can allow 'SCP' to not be in the keyword map
                if application == 'SCP':
                   return 'SCP', 'File Transfer', binary_type

                print(f"Warning: Found app '{application}' not in TARGET_APPS.")

            return application, category, binary_type

    # Fallback in case no keyword matches (e.g., FTP files named just 'ftp_...pcap')
    # This logic is based on our 6 target apps
    if 'scp' in lower_filename:
        return 'SCP', 'File Transfer', binary_type
    if 'email' in lower_filename:
        return 'Email', 'Email', binary_type
    if 'youtube' in lower_filename:
        return 'YouTube', 'Streaming', binary_type
    if 'bittorrent' in lower_filename:
        return 'BitTorrent', 'P2P', binary_type

    # If we get here, it's an unclassified file
    # Check for 'skype', 'voipbuster' which might not have hit keywords
    if 'skype' in lower_filename:
        return 'Skype', 'Unknown', binary_type # Fallback, should be caught by keywords
    if 'voipbuster' in lower_filename:
        return 'VOIPBuster', 'VoIP', binary_type

    return None, None, None

def process_pcap_file(filename, base_dir):
    """
    Reads a single .pcap file and extracts its packet sequence and labels.
    This function is designed to be run in parallel.
    """
    filepath = os.path.join(base_dir, filename)

    # 1. Get labels
    application, category, binary_type = get_flow_labels(filename)
    if application is None:
        # print(f"Skipping file (label not found): {filename}")
        return None

    # 2. Initialize packet sequence
    packet_sequence = []

    try:
        packets = rdpcap(filepath)

        client_ip = None
        for pkt in packets:
            if IP in pkt:
                client_ip = pkt[IP].src
                break

        if client_ip is None:
            return None # Skip non-IP flows

        # 3. Extract packet sizes with direction
        for pkt in packets:
            if IP in pkt:
                packet_size = pkt[IP].len

                if pkt[IP].src == client_ip:
                    packet_sequence.append(packet_size)
                elif pkt[IP].dst == client_ip:
                    packet_sequence.append(-packet_size)

            if len(packet_sequence) >= N_PACKETS:
                break

    except Exception as e:
        return None

    if len(packet_sequence) == 0:
        return None

    # 4. Pad or Truncate the sequence
    final_sequence = np.zeros(N_PACKETS)
    if len(packet_sequence) >= N_PACKETS:
        final_sequence = np.array(packet_sequence[:N_PACKETS])
    else:
        final_sequence[:len(packet_sequence)] = np.array(packet_sequence)

    # 5. Normalize the sequence
    normalized_sequence = final_sequence / MAX_PACKET_SIZE

    # --- THIS IS THE FIX ---
    # Clip values to [-1.0, 1.0] to prevent extreme values from jumbo frames
    normalized_sequence = np.clip(normalized_sequence, -1.0, 1.0) # <-- FIX 1
    # --- END FIX ---

    return {
        'filename': filename,
        'application': application,
        'category': category,
        'binary_type': binary_type,
        'sequence': normalized_sequence
    }

def build_autoencoder(n_packets, latent_dim):
    """
    Builds the LSTM Autoencoder model.
    """
    input_shape = (n_packets, 1)

    # --- Encoder ---
    inputs = Input(shape=input_shape)
    # Using 64 units as an intermediate layer
    # Changed activation to 'tanh' for stability
    x = LSTM(64, activation='tanh', return_sequences=True)(inputs) # <-- FIX 2
    # The 'encoder_output' layer is our 32-dim latent vector
    encoder_output = LSTM(latent_dim, activation='tanh', name='encoder_output')(x) # <-- FIX 3

    # --- Decoder ---
    x = RepeatVector(n_packets)(encoder_output)
    x = LSTM(64, activation='tanh', return_sequences=True)(x) # <-- FIX 4
    decoder_output = TimeDistributed(Dense(1))(x)

    autoencoder = Model(inputs=inputs, outputs=decoder_output)
    autoencoder.compile(optimizer='adam', loss='mse')

    encoder = Model(inputs=inputs, outputs=encoder_output)

    return autoencoder, encoder

# --- PART 2: Main Execution ---
def main():
    print(f"\n--- PART 1: Extracting Packet Sequences ---")
    print(f"Reading from: {FLOW_DIR}")
    print(f"Using N_PACKETS = {N_PACKETS} and LATENT_DIM = {LATENT_DIM}")

    if not os.path.isdir(FLOW_DIR):
        print(f"FATAL: Source directory not found. Please check the path.")
        return

    filenames = os.listdir(FLOW_DIR)
    pcap_files = [f for f in filenames if f.endswith('.pcap') or f.endswith('.pcapng')]
    print(f"Found {len(pcap_files)} .pcap files in the directory.")

    start_time = time.time()

    print("Processing files in parallel... (This may take several minutes)")
    results = Parallel(n_jobs=-1, verbose=5)(
        delayed(process_pcap_file)(f, FLOW_DIR) for f in pcap_files
    )

    end_time = time.time()
    print(f"File processing finished in {end_time - start_time:.2f} seconds.")

    valid_results = [r for r in results if r is not None]

    if not valid_results:
        print("FATAL: No valid data was extracted. Stopping script.")
        return

    print(f"Successfully processed {len(valid_results)} files.")
    print(f"Skipped {len(pcap_files) - len(valid_results)} empty/corrupted/unlabeled files.")

    df = pd.DataFrame(valid_results)

    # --- PART 2: Preparing Data for Autoencoder ---
    print("\n--- PART 2: Preparing Data for Autoencoder ---")

    sequences = np.array(df['sequence'].tolist())
    X = sequences.reshape((sequences.shape[0], sequences.shape[1], 1))

    # --- Check for NaNs *before* training ---
    if np.isnan(X).any():
        print("FATAL: 'NaN' values found in the processed sequences *before* training.")
        print("This indicates a problem with the process_pcap_file function.")
        return
    else:
        print("Data sanity check passed: No NaN values found in input data.")

    print(f"Data shape for autoencoder: {X.shape}")

    X_train, X_test = train_test_split(X, test_size=VALIDATION_SPLIT, random_state=42)

    print(f"Training data shape: {X_train.shape}")
    print(f"Validation data shape: {X_test.shape}")

    # --- PART 3: Building & Training Autoencoder ---
    print("\n--- PART 3: Building & Training Autoencoder ---")

    autoencoder, encoder = build_autoencoder(N_PACKETS, LATENT_DIM)
    autoencoder.summary()

    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    print("Training autoencoder...")
    history = autoencoder.fit(
        X_train, X_train,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_data=(X_test, X_test),
        callbacks=[early_stopping],
        verbose=1
    )

    print("Autoencoder training complete.")

    # --- PART 4: Generating Alpha (α) Features ---
    print("\n--- PART 4: Generating Alpha (α) Features ---")

    print(f"Generating {LATENT_DIM}-dimensional features for all {X.shape[0]} samples...")
    alpha_features = encoder.predict(X)

    print(f"Generated alpha features with shape: {alpha_features.shape}")

    # --- Check for NaNs *after* training ---
    if np.isnan(alpha_features).any():
        print("ERROR: 'NaN' values found in the *output* features.")
        print("This means the model training still failed. Check loss values above.")
    else:
        print("Feature generation sanity check passed: No NaN values found in output.")

    # --- PART 5: Saving Final Dataset ---
    print("\n--- PART 5: Saving Final Dataset ---")

    alpha_cols = [f'alpha_{i}' for i in range(LATENT_DIM)]
    df_alpha = pd.DataFrame(alpha_features, columns=alpha_cols, index=df.index)

    df_final = pd.concat([
        df[['filename', 'application', 'category', 'binary_type']],
        df_alpha
    ], axis=1)

    try:
        df_final.to_csv(OUTPUT_CSV, index=False)
        print(f"Successfully saved final alpha component (v2) to:")
        print(OUTPUT_CSV)
    except Exception as e:
        print(f"Error saving final CSV: {e}")

    try:
        encoder.save(ENCODER_MODEL_SAVE_PATH)
        print(f"Successfully saved encoder model to:")
        print(ENCODER_MODEL_SAVE_PATH)
    except Exception as e:
        print(f"Error saving encoder model: {e}")

if __name__ == "__main__":
    if not os.path.exists("/content/drive/MyDrive"):
        print("Please mount your Google Drive first!")
        print("from google.colab import drive; drive.mount('/content/drive')")
    else:
        main()

print("\n--- Alpha (α) v2 Script Finished (FIXED) ---")


--- Initializing Alpha (α) v2 Component Script (FIXED) ---
All libraries imported successfully.

--- PART 1: Extracting Packet Sequences ---
Reading from: /content/drive/MyDrive/1 Skripsi/Dataset/ISCX-VPN-NonVPN-2016/v2-final_flows
Using N_PACKETS = 128 and LATENT_DIM = 32
Found 10284 .pcap files in the directory.
Processing files in parallel... (This may take several minutes)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 786 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1021 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1708 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 3252 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 3772 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 5245 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 7590 tasks      | elapsed:  4.6min


File processing finished in 459.79 seconds.
Successfully processed 10105 files.
Skipped 179 empty/corrupted/unlabeled files.

--- PART 2: Preparing Data for Autoencoder ---
Data sanity check passed: No NaN values found in input data.
Data shape for autoencoder: (10105, 128, 1)
Training data shape: (8084, 128, 1)
Validation data shape: (2021, 128, 1)

--- PART 3: Building & Training Autoencoder ---


[Parallel(n_jobs=-1)]: Done 10284 out of 10284 | elapsed:  7.7min finished


Training autoencoder...
Epoch 1/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 40ms/step - loss: 0.0148 - val_loss: 0.0154
Epoch 2/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - loss: 0.0136 - val_loss: 0.0146
Epoch 3/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - loss: 0.0129 - val_loss: 0.0145
Epoch 4/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - loss: 0.0129 - val_loss: 0.0143
Epoch 5/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - loss: 0.0129 - val_loss: 0.0145
Epoch 6/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - loss: 0.0130 - val_loss: 0.0165
Epoch 7/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - loss: 0.0133 - val_loss: 0.0141
Epoch 8/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - loss: 0.0128 - val_loss: 0.0150
Epoch 9



Successfully saved final alpha component (v2) to:
/content/drive/MyDrive/1 Skripsi/alpha_component_v2.csv
Successfully saved encoder model to:
/content/drive/MyDrive/1 Skripsi/alpha_encoder_v2.h5

--- Alpha (α) v2 Script Finished (FIXED) ---
