# the new dataset from week of April

Key Components:
Imports: All necessary libraries are imported.
File Paths: File paths for the scaler, autoencoder, and threshold are defined.
Resource Loading: Resources are loaded from the specified paths.
Utility Functions:
ip_to_int: Converts IP addresses to integers.
preprocess_samples: Preprocesses the DataFrame by converting IPs, encoding categorical features, imputing missing values, and scaling.
predict_anomalies: Applies the preprocessing function, predicts reconstruction errors, and identifies anomalies.
Dataset Processing:
Loads a larger dataset from a CSV file.
Processes the dataset to detect anomalies.
Prints the results, including reconstruction errors and anomaly labels.
This script assumes that you have a CSV file containing the dataset with the required features and that your model and scaler were trained with a similar schema. Adjust file paths, mappings, and feature names as needed based on your specific context.

In [None]:
import numpy as np
import pandas as pd
import struct
import socket
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import load_model
from joblib import dump, load

# Define file paths
scaler_file_path = '/content/drive/MyDrive/azizah_alqahtani_project/IDS_codes/scaler.joblib'
autoencoder_file_path = '/content/drive/MyDrive/azizah_alqahtani_project/IDS_codes/autoencoder_model.keras'
threshold_file_path = '/content/drive/MyDrive/azizah_alqahtani_project/IDS_codes/threshold.npy'

# Load resources
scaler = load(scaler_file_path)
autoencoder = load_model(autoencoder_file_path)
threshold = np.load(threshold_file_path)

# Load a larger dataset
dataset_file_path = '/content/drive/MyDrive/azizah_alqahtani_project/IDS_codes/new_dataset.csv'
new_dataset_df = pd.read_csv(dataset_file_path)

# Print columns to check for discrepancies
print("Columns in the dataset:", new_dataset_df.columns)

# Categorical features to be encoded
categorical_features = ['protocol', 'flag']

# Initialize mappings dictionary
mappings = {}

# Perform label encoding for categorical features and store mappings
for feature in categorical_features:
    # Create a LabelEncoder object
    label_encoder = LabelEncoder()
    # Fit and transform the feature in new_dataset_df
    new_dataset_df[feature] = label_encoder.fit_transform(new_dataset_df[feature])
    # Store the mapping for later use
    mappings[feature] = {label: index for index, label in enumerate(label_encoder.classes_)}

# Extract mappings for protocol and flag
protocol_mapping = mappings.get('protocol', {})
flag_mapping = mappings.get('flag', {})

def ip_to_int(ip_address):
    try:
        return struct.unpack("!I", socket.inet_aton(ip_address))[0]
    except socket.error:
        return 0  # Return 0 if IP address is invalid

def preprocess_samples(samples_df, scaler):
    # Ensure all required features are present and impute missing values with zeros
    original_features = ['duration', 'source_ip', 'destination_ip', 'source_port',
                         'destination_port', 'forwarding_status', 'protocol',
                         'flag', 'tos', 'packets', 'bytes']

    # Create a copy of the DataFrame to avoid SettingWithCopyWarning
    samples_df = samples_df.copy()

    for feature in original_features:
        if feature not in samples_df:
            samples_df[feature] = 0  # Impute missing features with zeros

    # Convert IP addresses to integers
    samples_df['source_ip'] = samples_df['source_ip'].apply(ip_to_int)
    samples_df['destination_ip'] = samples_df['destination_ip'].apply(ip_to_int)

    # Encode categorical features
    samples_df['protocol'] = samples_df['protocol'].map(protocol_mapping).fillna(-1).astype(int)
    samples_df['flag'] = samples_df['flag'].map(flag_mapping).fillna(-1).astype(int)

    # Ensure that the DataFrame only contains columns that the scaler expects
    if set(original_features) - set(samples_df.columns):
        raise ValueError("Missing features in the DataFrame.")

    # Reorder columns to match the scaler's expected feature order
    samples_df = samples_df[original_features]

    # Fill missing values in numerical features
    samples_df.fillna(0, inplace=True)


    # Normalize the data using StandardScaler
    scaler = StandardScaler()
    samples_normalized = scaler.fit_transform(samples_df)

    return samples_df



def predict_anomalies(samples_df, scaler, autoencoder, threshold):
    # Preprocess the samples
    preprocessed_samples_df = preprocess_samples(samples_df, scaler)

    # Predict reconstructed data using the autoencoder
    reconstructed_data = autoencoder.predict(preprocessed_samples_df)

    # Calculate reconstruction errors
    reconstruction_errors = np.mean(np.square(preprocessed_samples_df - reconstructed_data), axis=1)

    # Determine if each reconstruction error exceeds the threshold
    anomalies = reconstruction_errors > threshold

    # Map the anomalies to the expected output
    results_df = samples_df.copy()
    results_df['reconstruction_error'] = reconstruction_errors
    results_df['anomaly'] = ['Background' if not is_anomaly else 'Anomaly' for is_anomaly in anomalies]

    return results_df

def preprocess_new_sample(new_sample, scaler, protocol_mapping, flag_mapping):
    # Create a DataFrame for the new sample
    new_sample_df = pd.DataFrame([new_sample])

    # Preprocess the new sample
    return preprocess_samples(new_sample_df, scaler)

def predict_anomaly(new_sample, scaler, autoencoder, threshold):
    # Preprocess the new sample
    new_sample_df = preprocess_new_sample(new_sample, scaler, protocol_mapping, flag_mapping)

    # Predict reconstructed data using the autoencoder
    reconstructed_data = autoencoder.predict(new_sample_df)

    # Calculate reconstruction error
    reconstruction_error = np.mean(np.square(new_sample_df.values - reconstructed_data))

    # Determine if the reconstruction error exceeds the threshold
    is_anomaly = reconstruction_error > threshold

    # Return the anomaly status and reconstruction error
    return "Anomaly" if is_anomaly else "Normal", reconstruction_error

# Call the function and capture the results
results_df = predict_anomalies(new_dataset_df, scaler, autoencoder, threshold)

# Print results
print(results_df.head())


Columns in the dataset: Index(['date_time', 'duration', 'source_ip', 'destination_ip', 'source_port',
       'destination_port', 'protocol', 'flag', 'forwarding_status', 'tos',
       'packets', 'bytes', 'label'],
      dtype='object')
          date_time  duration       source_ip  destination_ip  source_port  \
0  27/07/2016 18:29     0.668    211.58.234.3   42.219.155.56        50099   
1   28/07/2016 4:29     0.540    193.27.1.120  42.219.156.212           25   
2  27/07/2016 17:17     3.604  70.210.176.159   42.219.155.28        33990   
3   28/07/2016 0:49     0.000   42.219.156.30  42.219.150.247         7921   
4   28/07/2016 3:52     0.000   43.164.49.177   42.219.155.26           80   

   destination_port  protocol  flag  forwarding_status  tos  packets  bytes  \
0                80         5    15                  0    0       34   1988   
1             40937         5    15                  0    0        5    961   
2               443         5    19                  0    

1- Load Pre-trained Model: Load a pre-trained model that has been trained on a similar dataset.
2- Adjust Architecture: Modify or add layers as needed for the new task.
3- Retrain: Fine-tune the model on the new dataset by continuing the training process.
4- Evaluate: Assess the performance of the fine-tuned model using metrics like classification report, confusion matrix, and ROC curve.

In [None]:
import numpy as np
import pandas as pd
import struct
import socket
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import load_model
from joblib import load

# Define file paths
scaler_file_path = '/content/drive/MyDrive/azizah_alqahtani_project/IDS_codes/scaler.joblib'
autoencoder_file_path = '/content/drive/MyDrive/azizah_alqahtani_project/IDS_codes/autoencoder_model.keras'
threshold_file_path = '/content/drive/MyDrive/azizah_alqahtani_project/IDS_codes/threshold.npy'

# Load resources
scaler = load(scaler_file_path)  # Load the pre-trained scaler
autoencoder = load_model(autoencoder_file_path)
threshold = np.load(threshold_file_path)

# Load a larger dataset
dataset_file_path = '/content/drive/MyDrive/azizah_alqahtani_project/IDS_codes/new_dataset.csv'
new_dataset_df = pd.read_csv(dataset_file_path)

# Print columns to check for discrepancies
print("Columns in the dataset:", new_dataset_df.columns)

# Categorical features to be encoded
categorical_features = ['protocol', 'flag']

# Initialize mappings dictionary
mappings = {}

# Perform label encoding for categorical features and store mappings
for feature in categorical_features:
    # Create a LabelEncoder object
    label_encoder = LabelEncoder()
    # Fit and transform the feature in new_dataset_df
    new_dataset_df[feature] = label_encoder.fit_transform(new_dataset_df[feature])
    # Store the mapping for later use
    mappings[feature] = {label: index for index, label in enumerate(label_encoder.classes_)}

# Extract mappings for protocol and flag
protocol_mapping = mappings.get('protocol', {})
flag_mapping = mappings.get('flag', {})

def ip_to_int(ip_address):
    try:
        return struct.unpack("!I", socket.inet_aton(ip_address))[0]
    except socket.error:
        return 0  # Return 0 if IP address is invalid

def preprocess_samples(samples_df, scaler):
    # Define the exact column order as used during scaler fitting
    original_features = ['duration', 'source_ip', 'destination_ip', 'source_port',
                         'destination_port', 'forwarding_status', 'protocol',
                         'flag', 'tos', 'packets', 'bytes']

    # Create a copy of the DataFrame to avoid SettingWithCopyWarning
    samples_df = samples_df.copy()

    # Ensure all required features are present and impute missing values with zeros
    for feature in original_features:
        if feature not in samples_df:
            samples_df[feature] = 0  # Impute missing features with zeros

    # Convert IP addresses to integers
    samples_df['source_ip'] = samples_df['source_ip'].apply(ip_to_int)
    samples_df['destination_ip'] = samples_df['destination_ip'].apply(ip_to_int)

    # Encode categorical features
    samples_df['protocol'] = samples_df['protocol'].map(protocol_mapping).fillna(-1).astype(int)
    samples_df['flag'] = samples_df['flag'].map(flag_mapping).fillna(-1).astype(int)

    # Reorder columns to match the scaler's expected feature order
    samples_df = samples_df[original_features]

    # Fill missing values in numerical features
    samples_df.fillna(0, inplace=True)

    # Print column names before scaling
    print("Columns before scaling:", samples_df.columns)

    # Normalize the data using the pre-loaded StandardScaler
    samples_normalized = scaler.transform(samples_df)

    # Print column names after scaling
    print("Columns after scaling:", samples_df.columns)

    return samples_normalized

def predict_anomalies(samples_df, scaler, autoencoder, threshold):
    # Preprocess the samples
    preprocessed_samples_df = preprocess_samples(samples_df, scaler)

    # Predict reconstructed data using the autoencoder
    reconstructed_data = autoencoder.predict(preprocessed_samples_df)

    # Calculate reconstruction errors
    reconstruction_errors = np.mean(np.square(preprocessed_samples_df - reconstructed_data), axis=1)

    # Determine if each reconstruction error exceeds the threshold
    anomalies = reconstruction_errors > threshold

    # Map the anomalies to the expected output
    results_df = samples_df.copy()
    results_df['reconstruction_error'] = reconstruction_errors
    results_df['anomaly'] = ['Background' if not is_anomaly else 'Anomaly' for is_anomaly in anomalies]

    return results_df

def preprocess_new_sample(new_sample, scaler, protocol_mapping, flag_mapping):
    # Create a DataFrame for the new sample
    new_sample_df = pd.DataFrame([new_sample])

    # Preprocess the new sample
    return preprocess_samples(new_sample_df, scaler)

# Call the function and capture the results
results_df = predict_anomalies(new_dataset_df, scaler, autoencoder, threshold)

# Print results
print(results_df.head())


Columns in the dataset: Index(['date_time', 'duration', 'source_ip', 'destination_ip', 'source_port',
       'destination_port', 'protocol', 'flag', 'forwarding_status', 'tos',
       'packets', 'bytes', 'label'],
      dtype='object')
Columns before scaling: Index(['duration', 'source_ip', 'destination_ip', 'source_port',
       'destination_port', 'forwarding_status', 'protocol', 'flag', 'tos',
       'packets', 'bytes'],
      dtype='object')


ValueError: The feature names should match those that were passed during fit.
Feature names must be in the same order as they were in fit.


In [None]:
# Save the entire DataFrame to a CSV file
results_df.to_csv('/content/drive/MyDrive/azizah_alqahtani_project/IDS_codes/results_df.csv', index=False)
print("Results saved to /content/drive/MyDrive/azizah_alqahtani_project/IDS_codes/results_df.csv")


Results saved to /content/drive/MyDrive/azizah_alqahtani_project/IDS_codes/results_df.csv


In [None]:
# Print the entire DataFrame (use cautiously for very large datasets)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(results_df)


Output hidden; open in https://colab.research.google.com to view.

# 1. Loading the Pre-trained Model


In [101]:
import numpy as np
import pandas as pd
import struct
import socket
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.models import load_model
from joblib import load

# Define file paths
scaler_file_path = '/content/drive/MyDrive/azizah_alqahtani_project/IDS_codes/scaler.joblib'
autoencoder_file_path = '/content/drive/MyDrive/azizah_alqahtani_project/IDS_codes/autoencoder_model.keras'
threshold_file_path = '/content/drive/MyDrive/azizah_alqahtani_project/IDS_codes/threshold.npy'
dataset_file_path = '/content/drive/MyDrive/azizah_alqahtani_project/IDS_codes/new_dataset.csv'

# Load resources
scaler = load(scaler_file_path)
autoencoder = load_model(autoencoder_file_path)
threshold = np.load(threshold_file_path)

# Load the dataset
dataset_df = pd.read_csv(dataset_file_path)

# Print columns to check for discrepancies
print("Columns in the dataset:", dataset_df.columns)

# Categorical features to be encoded
categorical_features = ['protocol', 'flag']

# Initialize mappings dictionary
mappings = {}

# Perform label encoding for categorical features and store mappings
for feature in categorical_features:
    label_encoder = LabelEncoder()
    dataset_df[feature] = label_encoder.fit_transform(dataset_df[feature])
    mappings[feature] = {label: index for index, label in enumerate(label_encoder.classes_)}

# Extract mappings for protocol and flag
protocol_mapping = mappings.get('protocol', {})
flag_mapping = mappings.get('flag', {})

def ip_to_int(ip_address):
    try:
        return struct.unpack("!I", socket.inet_aton(ip_address))[0]
    except socket.error:
        return 0  # Return 0 if IP address is invalid

def preprocess_samples(samples_df, scaler):
    # Ensure all required features are present and impute missing values with zeros
    original_features = ['duration', 'source_ip', 'destination_ip', 'source_port',
                         'destination_port', 'forwarding_status', 'protocol',
                         'flag', 'tos', 'packets', 'bytes', 'label']

    # Create a copy of the DataFrame to avoid SettingWithCopyWarning
    samples_df = samples_df.copy()

    for feature in original_features:
        if feature not in samples_df:
            samples_df[feature] = 0  # Impute missing features with zeros

    # Convert IP addresses to integers
    samples_df['source_ip'] = samples_df['source_ip'].apply(ip_to_int)
    samples_df['destination_ip'] = samples_df['destination_ip'].apply(ip_to_int)

    # Encode categorical features
    samples_df['protocol'] = samples_df['protocol'].map(protocol_mapping).fillna(-1).astype(int)
    samples_df['flag'] = samples_df['flag'].map(flag_mapping).fillna(-1).astype(int)

    # Ensure that the DataFrame only contains columns that the scaler expects
    if set(original_features) - set(samples_df.columns):
        raise ValueError("Missing features in the DataFrame.")

    # Reorder columns to match the scaler's expected feature order
    samples_df = samples_df[original_features]

    # Fill missing values in numerical features
    samples_df.fillna(0, inplace=True)

    # Normalize the data using StandardScaler
    samples_normalized = scaler.transform(samples_df)  # Use the pre-fitted scaler

    return samples_normalized

def predict_anomalies(samples_df, scaler, autoencoder, threshold):
    # Preprocess the samples
    preprocessed_samples_df = preprocess_samples(samples_df, scaler)

    # Predict reconstructed data using the autoencoder
    reconstructed_data = autoencoder.predict(preprocessed_samples_df)

    # Calculate reconstruction errors
    reconstruction_errors = np.mean(np.square(preprocessed_samples_df - reconstructed_data), axis=1)

    # Determine if each reconstruction error exceeds the threshold
    anomalies = reconstruction_errors > threshold

    # Return a DataFrame with the results
    results_df = samples_df.copy()
    results_df['reconstruction_error'] = reconstruction_errors
    results_df['anomaly'] = ['Anomaly' if is_anomaly else 'Normal' for is_anomaly in anomalies]

    return results_df

def evaluate_model(y_true, y_pred):
    # Print classification report
    print("Classification Report:")
    print(classification_report(y_true, y_pred, target_names=['Normal', 'Anomaly']))

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Normal', 'Anomaly'], yticklabels=['Normal', 'Anomaly'])
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix')
    plt.show()

    # Plot ROC curve
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, marker='o', label=f'ROC curve (AUC = {auc:.2f})')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()
    plt.show()

# Prepare features and labels
X = preprocess_samples(dataset_df, scaler)
y = dataset_df['label'].apply(lambda x: 1 if x == 'Anomaly' else 0)  # Assuming 'label' is the column for true labels

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Make predictions on the validation set
results_df = predict_anomalies(pd.DataFrame(X_val, columns=dataset_df.columns), scaler, autoencoder, threshold)

# Evaluate model
evaluate_model(y_val, results_df['anomaly'].apply(lambda x: 1 if x == 'Anomaly' else 0))


Columns in the dataset: Index(['date_time', 'duration', 'source_ip', 'destination_ip', 'source_port',
       'destination_port', 'protocol', 'flag', 'forwarding_status', 'tos',
       'packets', 'bytes', 'label'],
      dtype='object')


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- label


In [102]:
import numpy as np
import pandas as pd
import struct
import socket
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.models import load_model
from joblib import load

# Define file paths
scaler_file_path = '/content/drive/MyDrive/azizah_alqahtani_project/IDS_codes/scaler.joblib'
autoencoder_file_path = '/content/drive/MyDrive/azizah_alqahtani_project/IDS_codes/autoencoder_model.keras'
threshold_file_path = '/content/drive/MyDrive/azizah_alqahtani_project/IDS_codes/threshold.npy'
dataset_file_path = '/content/drive/MyDrive/azizah_alqahtani_project/IDS_codes/new_dataset.csv'

# Load resources
scaler = load(scaler_file_path)
autoencoder = load_model(autoencoder_file_path)
threshold = np.load(threshold_file_path)

# Load the dataset
dataset_df = pd.read_csv(dataset_file_path)

# Print columns to check for discrepancies
print("Columns in the dataset:", dataset_df.columns)

# Categorical features to be encoded
categorical_features = ['protocol', 'flag']

# Initialize mappings dictionary
mappings = {}

# Perform label encoding for categorical features and store mappings
for feature in categorical_features:
    label_encoder = LabelEncoder()
    dataset_df[feature] = label_encoder.fit_transform(dataset_df[feature])
    mappings[feature] = {label: index for index, label in enumerate(label_encoder.classes_)}

# Extract mappings for protocol and flag
protocol_mapping = mappings.get('protocol', {})
flag_mapping = mappings.get('flag', {})

def ip_to_int(ip_address):
    try:
        return struct.unpack("!I", socket.inet_aton(ip_address))[0]
    except socket.error:
        return 0  # Return 0 if IP address is invalid

def preprocess_samples(samples_df, scaler):
    # Define feature columns (excluding 'label')
    feature_columns = ['duration', 'source_ip', 'destination_ip', 'source_port',
                       'destination_port', 'forwarding_status', 'protocol',
                       'flag', 'tos', 'packets', 'bytes']

    # Create a copy of the DataFrame to avoid SettingWithCopyWarning
    samples_df = samples_df.copy()

    # Convert IP addresses to integers
    samples_df['source_ip'] = samples_df['source_ip'].apply(ip_to_int)
    samples_df['destination_ip'] = samples_df['destination_ip'].apply(ip_to_int)

    # Encode categorical features
    samples_df['protocol'] = samples_df['protocol'].map(protocol_mapping).fillna(-1).astype(int)
    samples_df['flag'] = samples_df['flag'].map(flag_mapping).fillna(-1).astype(int)

    # Ensure all feature columns are present
    for feature in feature_columns:
        if feature not in samples_df:
            samples_df[feature] = 0  # Impute missing features with zeros

    # Reorder columns to match the scaler's expected feature order
    samples_df = samples_df[feature_columns]

    # Fill missing values in numerical features
    samples_df.fillna(0, inplace=True)

    # Normalize the data using StandardScaler
    samples_normalized = scaler.transform(samples_df)

    return samples_normalized

def predict_anomalies(samples_df, scaler, autoencoder, threshold):
    # Preprocess the samples
    preprocessed_samples_df = preprocess_samples(samples_df, scaler)

    # Predict reconstructed data using the autoencoder
    reconstructed_data = autoencoder.predict(preprocessed_samples_df)

    # Calculate reconstruction errors
    reconstruction_errors = np.mean(np.square(preprocessed_samples_df - reconstructed_data), axis=1)

    # Determine if each reconstruction error exceeds the threshold
    anomalies = reconstruction_errors > threshold

    # Return a DataFrame with the results
    results_df = samples_df.copy()
    results_df['reconstruction_error'] = reconstruction_errors
    results_df['anomaly'] = ['Anomaly' if is_anomaly else 'Normal' for is_anomaly in anomalies]

    return results_df

def evaluate_model(y_true, y_pred):
    # Print classification report
    print("Classification Report:")
    print(classification_report(y_true, y_pred, target_names=['Normal', 'Anomaly']))

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Normal', 'Anomaly'], yticklabels=['Normal', 'Anomaly'])
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix')
    plt.show()

    # Plot ROC curve
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, marker='o', label=f'ROC curve (AUC = {auc:.2f})')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()
    plt.show()

# Prepare features and labels
X = preprocess_samples(dataset_df, scaler)
y = dataset_df['label'].apply(lambda x: 1 if x == 'Anomaly' else 0)  # Assuming 'label' is the column for true labels

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Make predictions on the validation set
X_val_df = pd.DataFrame(X_val, columns=['duration', 'source_ip', 'destination_ip', 'source_port',
                                         'destination_port', 'forwarding_status', 'protocol',
                                         'flag', 'tos', 'packets', 'bytes'])
results_df = predict_anomalies(X_val_df, scaler, autoencoder, threshold)

# Evaluate model
evaluate_model(y_val, results_df['anomaly'].apply(lambda x: 1 if x == 'Anomaly' else 0))


Columns in the dataset: Index(['date_time', 'duration', 'source_ip', 'destination_ip', 'source_port',
       'destination_port', 'protocol', 'flag', 'forwarding_status', 'tos',
       'packets', 'bytes', 'label'],
      dtype='object')


ValueError: The feature names should match those that were passed during fit.
Feature names must be in the same order as they were in fit.


#2. Adjusting the Model Architecture
Typically, you would modify the model for the new task, like adding or modifying layers. However, in the case of an autoencoder, I don't change the architecture much but focus on retraining it.

# 3. Retraining the Model


# 4. Evaluating the Model

ValueError: The feature names should match those that were passed during fit.
Feature names must be in the same order as they were in fit.


# Next Steps:
# Save the Fine-Tuned Model:
After training, save the fine-tuned model for future use.
#Hyperparameter Tuning:
Adjust parameters like learning rate, number of epochs, or batch size to optimize performance.
# Model Comparison:
Compare the fine-tuned model's performance with a model trained from scratch to evaluate the benefits of transfer learning.